In [1]:
import os
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from tqdm import tqdm
import numpy as np

def get_file_paths(folder_path):
    """
    Recursively collect every file below ``folder_path``.

    Args:
        folder_path: Directory to walk (sub-directories are included).

    Returns:
        List of absolute file paths, in ``os.walk`` traversal order. The list
        is empty when the directory does not exist or holds no files.
    """
    return [
        os.path.abspath(os.path.join(parent_dir, filename))
        for parent_dir, _, filenames in os.walk(folder_path)
        for filename in filenames
    ]

def create_heat_index(lst_path):
    """
    Create a heat index raster on a scale of 1-25 from a Land Surface Temperature raster.
    1 = coldest, 25 = hottest.

    The function:
    1. Applies a percent clip to the LST raster (0.5% from each end). Valid
       pixels outside the clipped range are saturated into the first / last
       bin rather than being dropped.
    2. Divides the clipped range into 25 equal intervals
    3. Creates a new float32 raster with the heat index values (nodata = -9999)
    4. Preserves all geographic information (the source profile is reused)

    Args:
        lst_path: Path to the Land Surface Temperature raster

    Returns:
        Path to the created heat index raster, or None when the source raster
        contains no valid (non-nodata, non-NaN) pixels.
    """
    n_bins = 25
    heat_nodata = -9999

    # Create output path (same directory as input)
    output_dir = os.path.dirname(lst_path)
    source_name = os.path.basename(lst_path)
    base_name = source_name.replace('LST.tif', 'HeatIndex.tif')
    if base_name == source_name:
        # The name did not contain 'LST.tif'; writing to the same name would
        # overwrite the input raster, so derive a distinct output name.
        stem, ext = os.path.splitext(source_name)
        base_name = f"{stem}_HeatIndex{ext or '.tif'}"
    output_path = os.path.join(output_dir, base_name)

    with rasterio.open(lst_path) as src:
        lst_data = src.read(1)

        # Valid pixels are those that are neither NaN nor the declared nodata
        mask = ~np.isnan(lst_data)
        if src.nodata is not None:
            mask &= lst_data != src.nodata

        valid_data = lst_data[mask]
        if valid_data.size == 0:
            print(f"Warning: No valid data in {lst_path}")
            return None

        # Apply percent clip (0.5% from each end)
        p_min, p_max = np.percentile(valid_data, [0.5, 99.5])
        bin_width = (p_max - p_min) / n_bins

        # Interior bin edges. np.digitize returns 0 for values below the first
        # edge and n_bins - 1 for values at/above the last one, so adding 1
        # yields 1..n_bins and saturates out-of-range pixels into the end bins.
        interior_edges = p_min + np.arange(1, n_bins) * bin_width
        bin_numbers = np.digitize(valid_data, interior_edges) + 1

        heat_index = np.full(lst_data.shape, heat_nodata, dtype=np.float32)
        heat_index[mask] = bin_numbers

        # Same profile as the source, but with the output dtype and nodata
        profile = src.profile.copy()
        profile.update(
            dtype='float32',
            nodata=heat_nodata
        )

    # Create directory if needed (dirname is '' for a bare filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write output once the source dataset has been closed
    with rasterio.open(output_path, 'w', **profile) as dst:
        dst.write(heat_index, 1)

    return output_path
from datetime import datetime
from dateutil.relativedelta import relativedelta
def preprocessImages(data_dir: str, debug: bool, monthsAhead: int = 0):
    """
    Resample every scene's rasters onto a shared grid size and write the
    results under ``Data/preprocess_{monthsAhead}monthsahead/``.

    For each file whose path contains ``Albedo`` below
    ``<data_dir>/X/less5CloudCover`` the function collects:
      * every file in the same scene folder,
      * the matching ``LST.tif`` under ``/y/`` (scene skipped if missing),
      * the ``LST.tif`` of the folder dated ``monthsAhead`` months later
        (scene skipped if missing) and the heat index raster generated from it
        with ``create_heat_index``.

    All collected rasters are then resampled with
    ``resample_to_fixed_cell_size`` to one width/height: the smallest extent
    found in the scene (measured in the CRS of the first raster at the fixed
    cell size), rounded up to a multiple of 128. Outputs that already exist
    are left untouched.

    Args:
        data_dir: Root data directory containing the ``X`` and ``y`` trees.
        debug: When True, only scenes whose date folder contains ``2014`` are
            processed.
        monthsAhead: Offset, in months, between the input scene and the
            LST / heat index target.

    Raises:
        ValueError: if a scene's parent folder name is not a ``YYYY-MM`` date.
    """
    # Target pixel size in reference-CRS units, shared by every scene
    cell_size = 0.00030286739114101927
    x_dir = os.path.join(data_dir, 'X', 'less5CloudCover')

    # Gather all albedo files (restricted to 2014 folders only in debug mode)
    albedo_files = []
    for file_path in tqdm(get_file_paths(x_dir), desc='Gathering scenes (Preprocessing)...'):
        date = file_path.split('/')[-2]
        if debug and '2014' not in date:
            continue
        if 'Albedo' in file_path:
            albedo_files.append(file_path)

    # Process each scene
    for albedo_path in tqdm(albedo_files, desc='Preprocessing images...'):
        scene_dir = os.path.dirname(albedo_path)

        # Every regular file in the scene folder is treated as an input raster
        raster_paths = [
            os.path.join(scene_dir, name)
            for name in os.listdir(scene_dir)
            if os.path.isfile(os.path.join(scene_dir, name))
        ]

        # LST of the scene's own month
        lst_path = albedo_path.replace('/X/', '/y/').replace('Albedo.tif', 'LST.tif')
        if not os.path.exists(lst_path):
            continue
        raster_paths.append(lst_path)

        # LST of the month `monthsAhead` later, plus its heat index
        date = lst_path.split('/')[-2]
        date_object = datetime.strptime(date, "%Y-%m") + relativedelta(months=monthsAhead)
        dateAhead = date_object.strftime("%Y-%m")
        lst_path = lst_path.replace(date, dateAhead)
        if not os.path.exists(lst_path):
            continue
        create_heat_index(lst_path)
        # With monthsAhead == 0 this is the path appended above; avoid a duplicate
        if lst_path not in raster_paths:
            raster_paths.append(lst_path)
        raster_paths.append(lst_path.replace('LST.tif', 'HeatIndex.tif'))

        # Get reference CRS from the first raster
        with rasterio.open(raster_paths[0]) as src:
            reference_crs = src.crs

        # Find minimum dimensions across all rasters in the scene
        min_width = float('inf')
        min_height = float('inf')
        for path in raster_paths:
            try:
                with rasterio.open(path) as scene_src:
                    # Calculate bounds in the target CRS
                    west, south, east, north = rasterio.warp.transform_bounds(
                        scene_src.crs, reference_crs, *scene_src.bounds
                    )

                    # Calculate dimensions based on cell size (at least 1 pixel)
                    width = max(int(round((east - west) / cell_size)), 1)
                    height = max(int(round((north - south) / cell_size)), 1)

                    min_width = min(min_width, width)
                    min_height = min(min_height, height)
            except Exception as e:
                print(f"Error reading dimensions from {path}: {e}")
                continue

        # No raster could be measured: rounding inf would yield NaN sizes
        if min_width == float('inf') or min_height == float('inf'):
            print(f"Warning: no readable rasters in {scene_dir}; skipping scene")
            continue

        # Round up so both dimensions are divisible by 128
        min_width = ((min_width + 127) // 128) * 128
        min_height = ((min_height + 127) // 128) * 128

        # Process each raster with the standardized dimensions
        for src_path in raster_paths:
            dst_path = src_path.replace('Data/', f'Data/preprocess_{monthsAhead}monthsahead/')

            # Skip if already processed
            if os.path.exists(dst_path):
                continue

            resample_to_fixed_cell_size(src_path, dst_path, reference_crs, min_width, min_height, cell_size)

def resample_to_fixed_cell_size(src_path, dst_path, crs, width, height, cell_size=0.00030286739114101927):
    """
    Resample a raster to fixed dimensions and convert to float32 with -9999 nodata.

    The source bounds are transformed into ``crs`` and mapped onto a
    ``width`` x ``height`` grid; every band is reprojected with
    nearest-neighbour resampling. Errors are reported with ``print`` instead
    of being raised. If a failure happens after the output file has been
    opened for writing, the partial file is removed so that a later
    "already processed" existence check does not mistake it for a finished
    result.

    Args:
        src_path: Path to the source raster
        dst_path: Path to save the resampled raster
        crs: Target coordinate reference system
        width: Fixed width to use for all rasters
        height: Fixed height to use for all rasters
        cell_size: Target cell size in CRS units. Currently unused: the pixel
            size follows from the bounds and the requested width/height. Kept
            for interface compatibility.
    """
    nodata_value = -9999.0
    # True once dst_path may have been created/truncated by this call
    dst_started = False
    try:
        with rasterio.open(src_path) as src:
            # Store original nodata value
            original_nodata = src.nodata

            # Calculate bounds in the target CRS
            west, south, east, north = rasterio.warp.transform_bounds(
                src.crs, crs, *src.bounds
            )

            # Affine transform that stretches those bounds over the fixed grid
            dst_transform = rasterio.transform.from_bounds(
                west, south, east, north, width, height
            )

            # Update profile for the new raster
            dst_kwargs = src.profile.copy()
            dst_kwargs.update({
                'crs': crs,
                'transform': dst_transform,
                'width': width,
                'height': height,
                'dtype': 'float32',
                'nodata': nodata_value
            })

            # Create directory if it doesn't exist (dirname is '' for a bare filename)
            dst_dir = os.path.dirname(dst_path)
            if dst_dir:
                os.makedirs(dst_dir, exist_ok=True)

            dst_started = True
            with rasterio.open(dst_path, 'w', **dst_kwargs) as dst:
                for band in range(1, src.count + 1):
                    source_data = src.read(band)

                    # Destination starts as all-nodata; reproject fills covered pixels
                    dest_data = np.full((height, width), nodata_value, dtype='float32')

                    reproject(
                        source=source_data,
                        destination=dest_data,
                        src_transform=src.transform,
                        src_crs=src.crs,
                        dst_transform=dst_transform,
                        dst_crs=crs,
                        src_nodata=original_nodata,
                        dst_nodata=nodata_value,
                        resampling=Resampling.nearest,
                        num_threads=4
                    )

                    dst.write(dest_data, band)
    except Exception as e:
        print(f"Error processing {src_path}: {e}")
        # Do not leave a truncated/partial output behind
        if dst_started and os.path.exists(dst_path):
            try:
                os.remove(dst_path)
            except OSError as cleanup_error:
                print(f"Error removing partial output {dst_path}: {cleanup_error}")

In [None]:
# Build one preprocessed dataset per forecast horizon (1 and 3 months ahead)
for months_ahead in (1, 3):
    preprocessImages("./Data", False, months_ahead)

Gathering scenes (Preprocessing)...: 100%|██████████| 45667/45667 [00:00<00:00, 1871068.48it/s]
Preprocessing images...:  13%|█▎        | 986/7611 [05:23<03:49, 28.92it/s]  

In [None]:
import rasterio

def check_tif_divisibility(tif_path, divisor=128):
    """
    Report whether both dimensions of a TIF are exact multiples of ``divisor``.

    Args:
        tif_path (str): Path to the TIF file
        divisor (int): Value both dimensions are tested against

    Returns:
        dict: ``is_divisible`` (bool), the raster ``height`` and ``width``,
        and the remainders ``height_remainder`` / ``width_remainder``.
    """
    with rasterio.open(tif_path) as dataset:
        n_cols, n_rows = dataset.width, dataset.height

    row_rem = n_rows % divisor
    col_rem = n_cols % divisor

    report = {'is_divisible': not (row_rem or col_rem)}
    report['height'] = n_rows
    report['width'] = n_cols
    report['height_remainder'] = row_rem
    report['width_remainder'] = col_rem
    return report

# Spot-check a single LST scene and print whether its size is a multiple of 128
result = check_tif_divisibility("/home/ubuntu/heat-island-test/Data/y/less5CloudCover/Lubbock_TX/2021-09/LST.tif")
if not result['is_divisible']:
    print(f"TIF not divisible by 128. Dimensions: {result['height']}x{result['width']}")
    print(f"Remainders: {result['height_remainder']}x{result['width_remainder']}")
else:
    print(f"TIF is divisible by 128: {result['height']}x{result['width']}")

In [None]:
from tqdm import tqdm
import os
def list_files_in_folder(folder_path):
    """
    List the regular files directly inside ``folder_path`` (no recursion).

    Returns:
        Paths formed by joining ``folder_path`` with each file name, in
        ``os.listdir`` order; sub-directories are left out.
    """
    found = []
    for entry in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry)
        if os.path.isfile(candidate):
            found.append(candidate)
    return found

def get_file_paths(folder_path):
    """
    Walk ``folder_path`` recursively and return the absolute path of each file.

    Note: this re-defines (and shadows) the function of the same name from the
    first cell; both walk the tree and return absolute paths in the same way.
    """
    collected = []
    for current_dir, _subdirs, filenames in os.walk(folder_path):
        collected.extend(
            os.path.abspath(os.path.join(current_dir, filename))
            for filename in filenames
        )
    return collected
# Build `file_list`: one dict per scene mapping raster file name -> path,
# covering every file in the scene's X folder plus the matching 'LST.tif'
# path under /y/.
file_list = []
allAlbedoPixelFiles = []
# NOTE(review): this reads from 'Data/preprocess/...', whereas preprocessImages
# writes to 'Data/preprocess_{monthsAhead}monthsahead/...' — confirm the folder.
for filePath in get_file_paths('./Data/preprocess/X/less5CloudCover'):
    # Each path containing 'Albedo' stands for one scene
    if 'Albedo' in filePath:
        allAlbedoPixelFiles.append(filePath)
for xPath in tqdm(allAlbedoPixelFiles, desc="Packing X,y to dictionary..."):
    fileParts = xPath.split('/')
    # Last five path components; none of these names is read again in this cell
    fileName, date, city, cloudCategory, dataType = fileParts[-1], fileParts[-2], fileParts[-3], fileParts[-4], fileParts[-5]
    sceneFiles = list_files_in_folder(os.path.dirname(os.path.abspath(xPath)))
    rasterDict = {}
    for rasterPath in sceneFiles:
        rasterName = rasterPath.split('/')[-1]
        rasterDict[rasterName] = rasterPath
        # The LST path depends only on xPath, so it is recomputed and re-assigned
        # on every iteration; 'LST.tif' is first inserted right after the first
        # scene file. The path is not checked for existence here.
        lstPath = xPath.replace('/X/', '/y/').replace('Albedo.tif', 'LST.tif')
        rasterDict['LST.tif'] = lstPath
    file_list.append(rasterDict)

In [None]:
import rasterio
import json

def get_tif_ranges(data_list):
    """
    Compute the global minimum and maximum pixel value per raster type.

    Pixels equal to the -9999 nodata value and NaN pixels are excluded from
    both the minimum and the maximum. Rasters without any valid pixel are
    skipped instead of raising.

    Args:
        data_list: Iterable of dicts mapping a raster file name (for example
            ``'NDVI.tif'``) to the path of that raster. The part of the file
            name before the first ``'.'`` is used as the raster type.

    Returns:
        dict: ``{type: {'min': float, 'max': float}}``. The seven standard
        types are always present and keep ``inf`` / ``-inf`` when no valid
        pixel was seen; any other type encountered is added on first use.
    """
    nodata_value = -9999
    standard_types = ('NDWI', 'LST', 'Land_Cover', 'NDVI', 'NDBI', 'DEM', 'Albedo')
    ranges = {
        name: {'min': float('inf'), 'max': float('-inf')}
        for name in standard_types
    }

    for item in tqdm(data_list, desc="Getting Ranges"):
        for tif_type, path in item.items():
            with rasterio.open(path) as src:
                data = src.read(1)

            # Keep only real measurements: drop nodata and NaN pixels
            valid = data[data != nodata_value]
            valid = valid[~np.isnan(valid)]
            if valid.size == 0:
                continue

            tif_name = tif_type.split('.')[0]
            # Unknown raster types (e.g. 'HeatIndex') get their own entry
            entry = ranges.setdefault(
                tif_name, {'min': float('inf'), 'max': float('-inf')}
            )
            entry['min'] = min(entry['min'], float(valid.min()))
            entry['max'] = max(entry['max'], float(valid.max()))
    return ranges

# Scan every packed scene and pretty-print the per-type value ranges
ranges = get_tif_ranges(data_list=file_list)
print(json.dumps(ranges, indent=4))