In [None]:
import ee
import geopandas as gpd
from pathlib import Path
from datetime import datetime, timedelta
from typing import List, Optional, Tuple
import urllib.request
from calendar import monthrange
import requests
from requests.exceptions import Timeout as RequestsTimeout, RequestException, HTTPError
from geobr import read_municipality
import math
import time
import json
import socket
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import shutil
from datetime import datetime


In [2]:
ee.Initialize(project="cropyieldprediction-476612")
years_to_process = ['2017', '2025']


In [None]:
class GEEDownloader:
    """Downloader for Sentinel-2 imagery with MapBiomas crop masking from Google Earth Engine."""
    
    def __init__(self, output_dir: str = "files/gee_images_30m", failed_municipalities_file: str = "files/failed_municipalities_30m.json"):
        """
        Initialize the GEE downloader.
        
        Parameters:
        -----------
        output_dir : str, default "files/gee_images"
            Directory where downloaded images will be saved.
        failed_municipalities_file : str, default "files/failed_municipalities.json"
            Path to JSON file storing list of municipalities that failed to download.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.failed_municipalities_file = Path(failed_municipalities_file)
        self.failed_municipalities_file.parent.mkdir(parents=True, exist_ok=True)
        self.mapbiomas_asset = "projects/mapbiomas-public/assets/brazil/lulc/collection10/mapbiomas_brazil_collection10_integration_v2"
        self.sentinel2_collection = "COPERNICUS/S2_SR_HARMONIZED"
    
    def check_missing_months(
        self,
        municipality_code: str,
        municipality_name: str,
        start_date: str,
        end_date: str,
        min_tiles_per_month: int = 1
    ) -> Tuple[List[str], List[str]]:
        """
        Check which months are missing for a specific municipality.
        
        Parameters:
        -----------
        municipality_code : str
            IBGE municipality code.
        municipality_name : str
            Municipality name.
        start_date : str
            Start date for the download period (YYYY-MM-DD format).
        end_date : str
            End date for the download period (YYYY-MM-DD format).
        min_tiles_per_month : int, default 1
            Minimum number of tiles expected per month. If a month has fewer tiles
            than this, it's considered incomplete/missing.
        
        Returns:
        --------
        Tuple[List[str], List[str]]
            (downloaded_months, missing_months)
            Lists of month strings in format "YYYY-MM" (e.g., "2021-01")
        """
        start = datetime.strptime(start_date, "%Y-%m-%d")
        end = datetime.strptime(end_date, "%Y-%m-%d")
        download_year = start.year
        
        # Create year-specific output directory path
        year_output_dir = self.output_dir.parent / f"{self.output_dir.name}_{download_year}"
        safe_name = "".join(c for c in municipality_name if c.isalnum() or c in (' ', '-', '_')).strip()
        mun_output_dir = year_output_dir / f"{municipality_code}_{safe_name}"
        
        # Generate expected months list
        expected_months = []
        current = start
        while current <= end:
            year = current.year
            month = current.month
            month_str = f"{year}-{month:02d}"
            expected_months.append(month_str)
            
            month_end = datetime(year, month, monthrange(year, month)[1])
            if month_end > end:
                month_end = end
            current = month_end + timedelta(days=1)
        
        # Check which months are already downloaded
        downloaded_months = []
        if mun_output_dir.exists():
            existing_files = list(mun_output_dir.glob(f"{municipality_code}_*.tif"))
            
            # Count tiles per month
            tiles_per_month = {}
            for file in existing_files:
                # Extract month from filename: {municipality_code}_{year}-{month}_tile_...
                parts = file.stem.split("_")
                if len(parts) >= 2:
                    date_part = parts[1]  # e.g., "2021-01"
                    if date_part in expected_months:
                        tiles_per_month[date_part] = tiles_per_month.get(date_part, 0) + 1
            
            # Check if each expected month has enough tiles
            for month_str in expected_months:
                tile_count = tiles_per_month.get(month_str, 0)
                if tile_count >= min_tiles_per_month:
                    downloaded_months.append(month_str)
        
        missing_months = [m for m in expected_months if m not in downloaded_months]
        
        return downloaded_months, missing_months
    
    def download_municipality(
        self,
        shapefile_path: str,
        crop_type: int = 39,
        resolution: int = 30,
        tile_size: int = 25000,
        start_date: str = "2021-01-01",
        end_date: str = "2021-06-30",
        cloud_threshold: float = 30.0,
        composite_method: str = "median",
        bands: List[str] = ["B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B11", "B12"],
        municipality_code: Optional[str] = None,
        municipality_name: Optional[str] = None,
        force_redownload: bool = False,
        timeout_minutes: int = 60,
        max_download_workers: int = 16
    ) -> Path:
        """
        Download Sentinel-2 imagery for a municipality with MapBiomas crop masking.
        
        Downloads monthly composite images divided into fixed-size tiles. Only tiles
        containing the specified crop type are downloaded. Images are masked to show
        only pixels classified as the target crop in MapBiomas.
        
        Parameters:
        -----------
        shapefile_path : str
            Path to the municipality shapefile (.shp file).
        crop_type : int, default 39
            MapBiomas classification code for the crop type (39 = soybean).
        resolution : int, default 30
            Pixel resolution in meters for the downloaded images.
        tile_size : int, default 1000
            Size of each tile in meters (e.g., 1000 = 1km x 1km tiles).
            Note: In practice, larger values (e.g., 5000) are often used for efficiency.
        start_date : str, default "2021-01-01"
            Start date for the download period (YYYY-MM-DD format).
        end_date : str, default "2021-06-30"
            End date for the download period (YYYY-MM-DD format).
        cloud_threshold : float, default 30.0
            Maximum cloud cover percentage for images to be included in composite.
        composite_method : str, default "median"
            Method to composite multiple images per month: "median", "mean", or "first".
        bands : List[str], default ["B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B11", "B12"]
            List of Sentinel-2 bands to download (excluding B1, B9, B10).
        municipality_code : Optional[str], default None
            IBGE municipality code. If None, extracted from shapefile filename.
        municipality_name : Optional[str], default None
            Municipality name. If None, uses municipality_code.
        force_redownload : bool, default False
            If True, re-downloads even if files already exist. If False, skips
            municipalities that have already been downloaded for the requested date range.
        max_download_workers : int, default 16
            Maximum number of parallel download threads. Increase for faster downloads
            if you have good bandwidth, but be mindful of GEE rate limits.
        
        Returns:
        --------
        Path
            Path to the directory containing downloaded GeoTIFF files.
            Files are named: {municipality_code}_{year}-{month}_tile_{i:02d}_{j:02d}.tif
        """
        
        gdf = gpd.read_file(shapefile_path)
        geometry = gdf.geometry.values[0]
        geo_json = geometry.__geo_interface__
        ee_geometry = ee.Geometry(geo_json)
        
        start = datetime.strptime(start_date, "%Y-%m-%d")
        download_year = start.year
        
        year_output_dir = self.output_dir.parent / f"{self.output_dir.name}_{download_year}"
        year_output_dir.mkdir(parents=True, exist_ok=True)
        
        failed_municipalities_file_year = self.failed_municipalities_file.parent / f"{self.failed_municipalities_file.stem}_{download_year}{self.failed_municipalities_file.suffix}"
        
        if not municipality_code:
            municipality_code = Path(shapefile_path).stem
        if not municipality_name:
            municipality_name = municipality_code
        
        safe_name = "".join(c for c in municipality_name if c.isalnum() or c in (' ', '-', '_')).strip()
        mun_output_dir = year_output_dir / f"{municipality_code}_{safe_name}"
        
        downloaded_months, missing_months = self.check_missing_months(
            municipality_code, municipality_name, start_date, end_date
        )
        missing_months_set = set(missing_months)
        
        if not force_redownload:
            if not missing_months:
                existing_files = list(mun_output_dir.glob("*.tif")) if mun_output_dir.exists() else []
                print(f"Skipping {municipality_code} ({municipality_name}): all months already downloaded ({len(existing_files)} files)")
                return mun_output_dir
            elif downloaded_months:
                print(f"[LOG] Found {len(downloaded_months)} already downloaded months: {', '.join(sorted(downloaded_months))}")
                print(f"[LOG] Missing {len(missing_months)} months: {', '.join(sorted(missing_months))}")
        else:
            # If forcing redownload, still show what's already there but will redownload everything
            if downloaded_months:
                print(f"[LOG] Found {len(downloaded_months)} already downloaded months (will be redownloaded): {', '.join(sorted(downloaded_months))}")
            if missing_months:
                print(f"[LOG] Missing {len(missing_months)} months (will be downloaded): {', '.join(sorted(missing_months))}")
        
        municipality_start_time = time.time()
        print(f"\n[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] Starting processing: {municipality_code} ({municipality_name})")
        
        mun_output_dir.mkdir(parents=True, exist_ok=True)
        
        TIMEOUT_SECONDS = 60 * timeout_minutes
        
        gdf_bounds = gdf.total_bounds
        min_lon, min_lat = gdf_bounds[0], gdf_bounds[1]
        max_lon, max_lat = gdf_bounds[2], gdf_bounds[3]
        
        center_lat = (min_lat + max_lat) / 2
        
        meters_per_degree_lat = 111000
        meters_per_degree_lon = 111000 * math.cos(math.radians(center_lat))
        
        tile_size_deg_lat = tile_size / meters_per_degree_lat
        tile_size_deg_lon = tile_size / meters_per_degree_lon
        
        num_tiles_lon = int((max_lon - min_lon) / tile_size_deg_lon) + 1
        num_tiles_lat = int((max_lat - min_lat) / tile_size_deg_lat) + 1
        
        start = datetime.strptime(start_date, "%Y-%m-%d")
        end = datetime.strptime(end_date, "%Y-%m-%d")
        
        current = start
        while current <= end:
            year = current.year
            month = current.month
            month_str = f"{year}-{month:02d}"
            
            if not force_redownload and month_str not in missing_months_set:
                month_end = datetime(year, month, monthrange(year, month)[1])
                if month_end > end:
                    month_end = end
                current = month_end + timedelta(days=1)
                print(f"[LOG] Skipping {year}-{month:02d} (already downloaded)...")
                continue
            
            month_end = datetime(year, month, monthrange(year, month)[1])
            if month_end > end:
                month_end = end
            
            month_start_str = current.strftime("%Y-%m-%d")
            month_end_str = month_end.strftime("%Y-%m-%d")
            
            print(f"[LOG] Processing {year}-{month:02d}...")
            
            sentinel2 = (
                ee.ImageCollection(self.sentinel2_collection)
                .filterBounds(ee_geometry)
                .filterDate(month_start_str, month_end_str)
                .filter(ee.Filter.lt("CLOUDY_PIXEL_PERCENTAGE", cloud_threshold))
            )
            
            try:
                if composite_method == "median":
                    composite = sentinel2.median()
                elif composite_method == "mean":
                    composite = sentinel2.mean()
                elif composite_method == "first":
                    composite = sentinel2.first()
                else:
                    composite = sentinel2.median()
                
                rgb = composite.select(bands).clip(ee_geometry)
                
                mapbiomas = ee.Image(self.mapbiomas_asset)
                classification = mapbiomas.select(f"classification_{year}").clip(ee_geometry)
                crop_mask = classification.eq(crop_type)
                
                rgb_masked = rgb.multiply(crop_mask).updateMask(crop_mask)
            except (ee.EEException, Exception) as e:
                error_msg = str(e)
                if "Band pattern" in error_msg and "no bands" in error_msg:
                    print(f"[LOG] Skipping {year}-{month:02d} - no bands")
                    current = month_end + timedelta(days=1)
                    continue
                elif "geometry for image clipping must not be empty" in error_msg:
                    print(f"[LOG] Skipping {year}-{month:02d} - empty geometry")
                    current = month_end + timedelta(days=1)
                    continue
                else:
                    print(f"[LOG] Error creating composite for {year}-{month:02d}: {error_msg}")
                    raise
            
            tile_tasks = []
            for i in range(num_tiles_lon):
                for j in range(num_tiles_lat):
                    tile_lon_min = min_lon + (i * tile_size_deg_lon)
                    tile_lon_max = min_lon + ((i + 1) * tile_size_deg_lon)
                    tile_lat_min = min_lat + (j * tile_size_deg_lat)
                    tile_lat_max = min_lat + ((j + 1) * tile_size_deg_lat)
                    
                    tile_geometry = ee.Geometry.Rectangle(
                        [tile_lon_min, tile_lat_min, tile_lon_max, tile_lat_max]
                    )
                    tile_geometry = tile_geometry.intersection(ee_geometry, ee.ErrorMargin(1))
                    
                    filename = f"{municipality_code}_{year}-{month:02d}_tile_{i:02d}_{j:02d}.tif"
                    filepath = mun_output_dir / filename
                    
                    tile_tasks.append((i, j, tile_geometry, filepath))
            
            print(f"[LOG] Generating download URLs for {len(tile_tasks)} tiles...")
            url_tasks = {}
            failed_lock = Lock()
            
            def generate_url(task_data):
                i, j, tile_geom, filepath = task_data
                try:
                    tile_rgb_masked = rgb_masked.clip(tile_geom)
                    url = tile_rgb_masked.getDownloadUrl({
                        "region": tile_geom,
                        "scale": resolution,
                        "crs": "EPSG:4326",
                        "format": "GEO_TIFF"
                    })
                    return (i, j, url, filepath, None)
                except Exception as e:
                    error_msg = str(e)
                    if "geometry for image clipping must not be empty" in error_msg:
                        return (i, j, None, filepath, "empty_geometry")
                    elif "Total request size" in error_msg and "must be less than" in error_msg:
                        return (i, j, None, filepath, "request_too_large")
                    return (i, j, None, filepath, error_msg)
            
            max_url_workers = 5  # Limit concurrent GEE API calls
            with ThreadPoolExecutor(max_workers=max_url_workers) as executor:
                url_futures = {executor.submit(generate_url, task): task for task in tile_tasks}
                for future in as_completed(url_futures):
                    i, j, url, filepath, error = future.result()
                    if url:
                        url_tasks[(i, j)] = (url, filepath)
                    elif error and error not in ["empty_geometry", "request_too_large"]:
                        print(f"[LOG] Error generating URL for tile {i:02d}_{j:02d}: {error}")
            
            print(f"[LOG] Downloading {len(url_tasks)} tiles in parallel (max {max_download_workers} workers)...")
            tiles_downloaded_this_month = 0
            download_lock = Lock()
            
            def download_tile(url_filepath):
                url, filepath = url_filepath
                filepath.parent.mkdir(parents=True, exist_ok=True)
                success, error_msg = self._download_single_tile(
                    url, filepath, municipality_code, municipality_name, TIMEOUT_SECONDS
                )
                if success:
                    with download_lock:
                        nonlocal tiles_downloaded_this_month
                        tiles_downloaded_this_month += 1
                    return (True, filepath, None)
                else:
                    return (False, filepath, error_msg)
            
            with ThreadPoolExecutor(max_workers=max_download_workers) as executor:
                download_futures = {executor.submit(download_tile, (url, filepath)): (url, filepath) 
                                  for url, filepath in url_tasks.values()}
                
                for future in as_completed(download_futures):
                    url, filepath = download_futures[future]
                    try:
                        success, result_filepath, error_msg = future.result()
                        if not success:
                            error_msg_str = str(error_msg) if error_msg else "unknown"
                            tile_name = result_filepath.name
                            
                            self._save_failed_tile(
                                municipality_code,
                                municipality_name,
                                tile_name,
                                error_msg_str,
                                url=url,
                                failed_file=failed_municipalities_file_year
                            )
                            
                            if "timeout" in error_msg_str.lower():
                                elapsed_time = time.time() - municipality_start_time
                                if elapsed_time > TIMEOUT_SECONDS:
                                    print(f"[LOG] TIMEOUT: Municipality timeout exceeded ({elapsed_time/60:.1f} minutes)")
                                    self._save_failed_municipality(
                                        municipality_code, 
                                        municipality_name, 
                                        "timeout",
                                        failed_file=failed_municipalities_file_year
                                    )
                                    return mun_output_dir
                            else:
                                print(f"[LOG] Download failed for tile {tile_name}: {error_msg_str}")
                    except Exception as e:
                        error_msg_str = str(e)
                        print(f"[LOG] Unexpected error downloading tile: {error_msg_str}")

                        url, filepath = download_futures[future]
                        tile_name = filepath.name if filepath else "unknown"
                        self._save_failed_tile(
                            municipality_code,
                            municipality_name,
                            tile_name,
                            error_msg_str,
                            url=url if url else None,
                            failed_file=failed_municipalities_file_year
                        )
            
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(f"[{timestamp}] Completed {year}-{month:02d}: {tiles_downloaded_this_month} tiles")
            
            current = month_end + timedelta(days=1)
        
        elapsed_time = time.time() - municipality_start_time
        elapsed_minutes = int(elapsed_time // 60)
        elapsed_seconds = int(elapsed_time % 60)
        end_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        num_files = len(list(mun_output_dir.glob("*.tif")))
        print(f"[{end_timestamp}] Completed: {municipality_code} ({municipality_name}) - {num_files} files in {elapsed_minutes}m {elapsed_seconds}s")
        
        return mun_output_dir
    
    def _save_failed_municipality(self, municipality_code: str, municipality_name: str, reason: str, url: Optional[str] = None, error_details: Optional[str] = None, failed_file: Optional[Path] = None):
        """
        Save a failed municipality to the failed municipalities list.
        
        Parameters:
        -----------
        municipality_code : str
            Municipality code.
        municipality_name : str
            Municipality name.
        reason : str
            Reason for failure (e.g., "timeout", "503_service_unavailable").
        url : Optional[str], default None
            URL that failed (e.g., the GEE download URL).
        error_details : Optional[str], default None
            Additional error details or full error message.
        failed_file : Optional[Path], default None
            Path to the failed municipalities file. If None, uses self.failed_municipalities_file.
        """
        target_file = failed_file if failed_file is not None else self.failed_municipalities_file
        target_file.parent.mkdir(parents=True, exist_ok=True)
        
        if target_file.exists():
            with open(target_file, 'r') as f:
                failed_list = json.load(f)
        else:
            failed_list = []
        
        failed_entry = {
            "code": municipality_code,
            "name": municipality_name,
            "reason": reason,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        
        # Add URL if provided
        if url:
            failed_entry["url"] = url
        
        # Add error details if provided
        if error_details:
            failed_entry["error_details"] = error_details
        
        # Check if this municipality already exists in the list
        existing_entry = None
        for entry in failed_list:
            if entry["code"] == municipality_code:
                existing_entry = entry
                break
        
        if existing_entry:
            # Update existing entry - add new URL if not already present
            if url and "urls" not in existing_entry:
                # Convert single url to urls list for backward compatibility
                if "url" in existing_entry:
                    existing_entry["urls"] = [existing_entry.pop("url")]
                else:
                    existing_entry["urls"] = []
            
            if url:
                if "urls" in existing_entry:
                    if url not in existing_entry["urls"]:
                        existing_entry["urls"].append(url)
                elif "url" in existing_entry:
                    if existing_entry["url"] != url:
                        existing_entry["urls"] = [existing_entry.pop("url"), url]
                else:
                    existing_entry["url"] = url
            
            # Update error details
            if error_details:
                if "error_details" not in existing_entry:
                    existing_entry["error_details"] = []
                elif isinstance(existing_entry["error_details"], str):
                    existing_entry["error_details"] = [existing_entry["error_details"]]
                
                if isinstance(existing_entry["error_details"], list):
                    if error_details not in existing_entry["error_details"]:
                        existing_entry["error_details"].append(error_details)
                else:
                    existing_entry["error_details"] = [str(existing_entry["error_details"]), error_details]
            
            # Update timestamp
            existing_entry["timestamp"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        else:
            # Add new entry
            failed_list.append(failed_entry)
        
        with open(target_file, 'w') as f:
            json.dump(failed_list, f, indent=2)
    
    def _save_failed_tile(
        self,
        municipality_code: str,
        municipality_name: str,
        tile_name: str,
        error_msg: str,
        url: Optional[str] = None,
        failed_file: Optional[Path] = None
    ):
        """
        Save a failed tile to the failed municipalities list.
        Adds the tile to the municipality's failed_tiles list.
        
        Parameters:
        -----------
        municipality_code : str
            Municipality code.
        municipality_name : str
            Municipality name.
        tile_name : str
            Name of the failed tile file.
        error_msg : str
            Error message describing the failure.
        url : Optional[str], default None
            URL that failed (e.g., the GEE download URL).
        failed_file : Optional[Path], default None
            Path to the failed municipalities file. If None, uses self.failed_municipalities_file.
        """
        target_file = failed_file if failed_file is not None else self.failed_municipalities_file
        target_file.parent.mkdir(parents=True, exist_ok=True)
        
        if target_file.exists():
            with open(target_file, 'r') as f:
                failed_list = json.load(f)
        else:
            failed_list = []
        
        municipality_entry = None
        for entry in failed_list:
            if entry["code"] == municipality_code:
                municipality_entry = entry
                break
        
        if not municipality_entry:
            municipality_entry = {
                "code": municipality_code,
                "name": municipality_name,
                "reason": "tile_failures",
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "failed_tiles": []
            }
            failed_list.append(municipality_entry)
        
        if "failed_tiles" not in municipality_entry:
            municipality_entry["failed_tiles"] = []
        
        tile_entry = {
            "tile_name": tile_name,
            "error": error_msg,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        
        if url:
            tile_entry["url"] = url
        
        tile_exists = False
        for existing_tile in municipality_entry["failed_tiles"]:
            if existing_tile.get("tile_name") == tile_name:
                existing_tile.update(tile_entry)
                tile_exists = True
                break
        
        if not tile_exists:
            municipality_entry["failed_tiles"].append(tile_entry)
        
        municipality_entry["timestamp"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
        with open(target_file, 'w') as f:
            json.dump(failed_list, f, indent=2)
    
    def _download_single_tile(
        self,
        url: str,
        filepath: Path,
        municipality_code: str,
        municipality_name: str,
        timeout_seconds: int,
        chunk_size: int = 65536 
    ) -> Tuple[bool, Optional[str]]:
        """
        Download a single tile file.
        
        Returns:
        --------
        Tuple[bool, Optional[str]]
            (success, error_message)
        """
        try:
            response = requests.get(url, timeout=timeout_seconds, stream=True)
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
            return True, None
        except Exception as e:
            error_msg = str(e)
            return False, error_msg
    
    def download_specific_url(self, url: str, municipality_code: str, municipality_name: str, output_filename: str, timeout_minutes: int = 60):
        """
        Download a specific URL (e.g., for retrying failed 503 errors).
        
        Parameters:
        -----------
        url : str
            The GEE download URL to retry.
        municipality_code : str
            Municipality code.
        municipality_name : str
            Municipality name.
        output_filename : str
            Filename to save the downloaded file (e.g., "4104402_2021-04_tile_00_05.tif").
        timeout_minutes : int, default 60
            Timeout in minutes for the download.
        """
        safe_name = "".join(c for c in municipality_name if c.isalnum() or c in (' ', '-', '_')).strip()
        mun_output_dir = self.output_dir / f"{municipality_code}_{safe_name}"
        mun_output_dir.mkdir(parents=True, exist_ok=True)
        
        filepath = mun_output_dir / output_filename
        TIMEOUT_SECONDS = 60 * timeout_minutes
        
        print(f"[LOG] Retrying download for {municipality_code} ({municipality_name}): {output_filename}")
        try:
            response = requests.get(url, timeout=TIMEOUT_SECONDS, stream=True)
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            print(f"[LOG] Successfully downloaded: {filepath}")
            return filepath
        except Exception as e:
            print(f"[LOG] Failed to download {url}: {str(e)}")
            raise


In [4]:
shapefile_dir = Path("files/municipal_shapefiles")
shapefiles = list(shapefile_dir.glob("**/*.shp"))

for shp_file in shapefiles:
    print(f"Using shapefile: {shp_file}")
    
    gdf = gpd.read_file(shp_file)
    mun_code = shp_file.stem
    mun_name = shp_file.parent.name.split('_', 1)[1] if '_' in shp_file.parent.name else mun_code
    
    downloader = GEEDownloader()
    for year in years_to_process:
        output_path = downloader.download_municipality(
            shapefile_path=str(shp_file),
            crop_type=39,
            resolution=30,
            tile_size=5000,
            start_date=f"{year}-01-01",
            end_date=f"{year}-06-30",
            cloud_threshold=30.0,
            composite_method="median",
            bands=["B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B11", "B12"],
            municipality_code=mun_code,
            municipality_name=mun_name,
            timeout_minutes=20
        )
    
    print(f"Download complete! Files saved to: {output_path}")
else:
    print("No shapefiles found in files/municipal_shapefiles/")


Using shapefile: files\municipal_shapefiles\4100103_Abatiá\4100103.shp

[2026-01-08 10:45:40] Starting processing: 4100103 (Abatiá)
[LOG] Processing 2017-01...
[LOG] Generating download URLs for 24 tiles...
[LOG] Error generating URL for tile 00_03: Image.select: Band pattern 'B2' was applied to an Image with no bands. See https://developers.google.com/earth-engine/guides/debugging#no-bands
[LOG] Error generating URL for tile 00_02: Image.select: Band pattern 'B2' was applied to an Image with no bands. See https://developers.google.com/earth-engine/guides/debugging#no-bands
[LOG] Error generating URL for tile 01_00: Image.select: Band pattern 'B2' was applied to an Image with no bands. See https://developers.google.com/earth-engine/guides/debugging#no-bands
[LOG] Error generating URL for tile 01_02: Image.select: Band pattern 'B2' was applied to an Image with no bands. See https://developers.google.com/earth-engine/guides/debugging#no-bands
[LOG] Error generating URL for tile 01_01: Im

KeyboardInterrupt: 

In [None]:
failed_file = Path("files/failed_municipalities_30m.json")
if failed_file.exists():
    with open(failed_file, 'r') as f:
        failed_municipalities = json.load(f)
    
    downloader = GEEDownloader()
    successfully_retried = [] 
        
    all_failed_municipalities = {}
    for year in years_to_process:
        failed_file_year = failed_file.parent / f"{failed_file.stem}_{year}{failed_file.suffix}"
        if failed_file_year.exists():
            print(f"[LOG] Reading from year-specific failed file: {failed_file_year}")
            with open(failed_file_year, 'r') as f:
                year_failed = json.load(f)
                for entry in year_failed:
                    code = entry["code"]
                    if code not in all_failed_municipalities:
                        all_failed_municipalities[code] = entry
    
    if not all_failed_municipalities:
        print(f"[LOG] Year-specific files not found, reading from generic file: {failed_file}")
        all_failed_municipalities = {entry["code"]: entry for entry in failed_municipalities}
    
    failed_municipalities = list(all_failed_municipalities.values())
    
    for failed_entry in failed_municipalities:
        mun_code = failed_entry["code"]
        mun_name = failed_entry["name"]
        reason = failed_entry.get("reason", "unknown")
        
        print(f"\n{'='*60}")
        print(f"Processing failed municipality: {mun_code} ({mun_name})")
        print(f"Original reason: {reason}")
        print(f"{'='*60}\n")
        
        safe_name = "".join(c for c in mun_name if c.isalnum() or c in (' ', '-', '_')).strip()
        mun_output_dir = downloader.output_dir / f"{mun_code}_{safe_name}"
        
        if mun_output_dir.exists():
            print(f"[LOG] Deleting existing folder: {mun_output_dir}")
            shutil.rmtree(mun_output_dir)
        
        shapefile_path = Path(f"files/municipal_shapefiles/{mun_code}_{mun_name}/{mun_code}.shp")
        if not shapefile_path.exists():
            shapefile_path = Path(f"files/municipal_shapefiles") / f"{mun_code}_{mun_name}" / f"{mun_code}.shp"
        
        if shapefile_path.exists():
            print(f"[LOG] Retrying full municipality download for {mun_code} ({mun_name})")
            for year in years_to_process:
                failed_file_year = failed_file.parent / f"{failed_file.stem}_{year}{failed_file.suffix}"
                
                try:
                    output_path = downloader.download_municipality(
                        shapefile_path=str(shapefile_path),
                        crop_type=39,
                        resolution=30,
                        tile_size=5000,
                        start_date=f"{year}-01-01",
                        end_date=f"{year}-06-30",
                        cloud_threshold=30.0,
                        composite_method="median",
                        bands=["B2", "B3", "B4", "B5", "B6", "B7", "B8", "B8A", "B11", "B12"],
                        municipality_code=mun_code,
                        municipality_name=mun_name,
                        timeout_minutes=120,
                        force_redownload=True
                    )
                    
                    expected_months = [f"{year}-01", f"{year}-02", f"{year}-03", f"{year}-04", f"{year}-05", f"{year}-06"]
                    downloaded_files = list(output_path.glob(f"{mun_code}_*.tif"))
                    downloaded_months = set()
                    for file in downloaded_files:

                        parts = file.stem.split("_")
                        if len(parts) >= 2:
                            date_part = parts[1] 
                            if date_part in expected_months:
                                downloaded_months.add(date_part)
                    
                    if len(downloaded_months) == len(expected_months):
                        print(f"[LOG] Successfully completed {year}: {output_path}")

                        successfully_retried.append(f"{mun_code}_{year}")
                    else:
                        missing_months = set(expected_months) - downloaded_months
                        print(f"[LOG] Download incomplete for {year} - missing months: {', '.join(sorted(missing_months))}")
                        print(f"[LOG] Adding municipality back to failed list for {year}.")

                        error_reason = f"incomplete_download_missing_months_{'_'.join(sorted(missing_months))}"
                        downloader._save_failed_municipality(
                            mun_code,
                            mun_name,
                            error_reason,
                            error_details=f"Missing months: {', '.join(sorted(missing_months))}",
                            failed_file=failed_file_year
                        )
                except Exception as e:
                    error_msg = str(e)
                    print(f"[LOG] Error retrying {mun_code} for {year}: {error_msg}")
                    print(f"[LOG] Adding municipality back to failed list for {year}.")

                    downloader._save_failed_municipality(
                        mun_code,
                        mun_name,
                        "retry_failed",
                        error_details=error_msg,
                        failed_file=failed_file_year
                    )
        else:
            print(f"[LOG] ERROR: Shapefile not found: {shapefile_path}")
            print(f"[LOG] Adding municipality back to failed list for all years.")
            for year in years_to_process:
                failed_file_year = failed_file.parent / f"{failed_file.stem}_{year}{failed_file.suffix}"
                downloader._save_failed_municipality(
                    mun_code,
                    mun_name,
                    "shapefile_not_found",
                    error_details=f"Shapefile not found at: {shapefile_path}",
                    failed_file=failed_file_year
                )
    
    if successfully_retried:
        retries_by_year = {}
        for retry_key in successfully_retried:
            if '_' in retry_key:
                code, year = retry_key.rsplit('_', 1)
                if year not in retries_by_year:
                    retries_by_year[year] = []
                retries_by_year[year].append(code)
        
        total_removed = 0
        for year, codes in retries_by_year.items():
            failed_file_year = failed_file.parent / f"{failed_file.stem}_{year}{failed_file.suffix}"
            if failed_file_year.exists():
                with open(failed_file_year, 'r') as f:
                    year_failed = json.load(f)
                
                original_count = len(year_failed)
                year_failed = [entry for entry in year_failed if entry["code"] not in codes]
                removed_count = original_count - len(year_failed)
                total_removed += removed_count
                

                with open(failed_file_year, 'w') as f:
                    json.dump(year_failed, f, indent=2)
                
                print(f"[LOG] Removed {removed_count} municipality/municipalities from {year} failed list: {', '.join(codes)}")
        
        print(f"\n{'='*60}")
        print(f"Successfully retried {total_removed} municipality-year combinations.")
        print(f"{'='*60}\n")
    else:
        print(f"\n{'='*60}")
        print("No municipalities were successfully retried.")
        print(f"{'='*60}\n")
else:
    print("No failed municipalities file found: files/failed_municipalities_30m.json")


[LOG] Reading from year-specific failed file: files\failed_municipalities_30m_2018.json

Processing failed municipality: 4103370 (Brasilândia do Sul)
Original reason: tile_failures

[LOG] Retrying full municipality download for 4103370 (Brasilândia do Sul)
[LOG] Found 1 already downloaded months (will be redownloaded): 2018-04
[LOG] Missing 5 months (will be downloaded): 2018-01, 2018-02, 2018-03, 2018-05, 2018-06

[2026-01-03 14:36:14] Starting processing: 4103370 (Brasilândia do Sul)
[LOG] Processing 2018-01...
[LOG] Generating download URLs for 25 tiles...
[LOG] Error generating URL for tile 00_02: Image.select: Band pattern 'B2' was applied to an Image with no bands. See https://developers.google.com/earth-engine/guides/debugging#no-bands
[LOG] Error generating URL for tile 00_00: Image.select: Band pattern 'B2' was applied to an Image with no bands. See https://developers.google.com/earth-engine/guides/debugging#no-bands
[LOG] Error generating URL for tile 00_04: Image.select: Ban