Preprocessing: all of the steps below could also be done in ArcGIS or QGIS.

In [6]:
import os

import numpy as np
import rasterio
from rasterio.enums import Resampling
from rasterio.vrt import WarpedVRT

def resample_tiff_to_5m(input_tiff, output_tiff, resampling_method=Resampling.bilinear):
    """
    Resample a TIFF raster file to 5m resolution using specified resampling method.

    The X and Y pixel sizes are scaled independently, so a source raster with
    non-square pixels still produces 5m x 5m output pixels.

    Parameters:
    -----------
    input_tiff : str
        Path to the input TIFF file
    output_tiff : str
        Path to the output resampled TIFF file
    resampling_method : rasterio.enums.Resampling
        Resampling method to use (default: bilinear)
        Options: nearest, bilinear, cubic, cubicspline, lanczos, average, mode, etc.

    Returns:
    --------
    dict : Dictionary with resampling info (output path, original resolution,
        new resolution, resampling method). 'original_resolution' is the
        source pixel size along X.

    Raises:
    -------
    ValueError
        If the source raster is smaller than one 5m pixel in either direction.
    """
    target_res = 5.0

    with rasterio.open(input_tiff) as src:
        meta = src.meta.copy()

        # src.res is (x pixel size, y pixel size); do not assume they are equal
        x_res, y_res = src.res
        original_res = x_res

        # Fraction of the original pixel count kept along each axis
        x_scale = x_res / target_res
        y_scale = y_res / target_res

        # Truncate so the output never extends past the source extent
        new_width = int(src.width * x_scale)
        new_height = int(src.height * y_scale)
        if new_width < 1 or new_height < 1:
            raise ValueError(
                f"Raster {input_tiff} is too small to resample to {target_res}m "
                f"(would produce {new_width} x {new_height} pixels)"
            )

        # Same origin as the source, with each pixel enlarged to the target size
        meta.update({
            'height': new_height,
            'width': new_width,
            'transform': src.transform * src.transform.scale(1.0 / x_scale, 1.0 / y_scale)
        })

        # Use WarpedVRT so the resampling is performed by GDAL on read
        with WarpedVRT(src, resampling=resampling_method,
                       height=meta['height'],
                       width=meta['width'],
                       transform=meta['transform']) as vrt:

            # Write resampled data to output file
            with rasterio.open(output_tiff, 'w', **meta) as dst:
                dst.write(vrt.read())

    return {
        'output_path': output_tiff,
        'original_resolution': original_res,
        'new_resolution': target_res,
        'resampling_method': str(resampling_method)
    }

# Example usage:
# result = resample_tiff_to_5m('raw_data/mnsterrainbatiment_2015_1m_montreal-est/MNS(terrain+batiment)_2015_1m_Montréal-Est.tif', 'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m.tif')
# print(result)


Normalize the raster so that all values are positive. Although this isn't strictly required, it makes it easier to troubleshoot issues.

In [7]:
def normalize_raster_to_positive(input_tiff, output_tiff):
    """
    Normalize a raster so that values range from 0 to maximum value.
    Subtracts the minimum valid value from all valid pixels to shift negative
    values to positive. NoData, NaN and infinite pixels are excluded from the
    statistics and written as 0.

    Only the first band is read, so the output is always a single-band
    float32 raster without a NoData tag.

    Parameters:
    -----------
    input_tiff : str
        Path to the input TIFF file (resampled raster)
    output_tiff : str
        Path to the output normalized TIFF file

    Returns:
    --------
    dict : Dictionary with normalization info (min value, max value, output path)

    Raises:
    -------
    ValueError
        If the raster contains no valid pixels.
    """

    with rasterio.open(input_tiff) as src:
        data = src.read(1).astype(np.float64)  # Use float64 for precision
        meta = src.meta.copy()

        # NaN / inf can never be valid elevations; without this a single NaN
        # would turn the minimum (and therefore the whole output) into NaN
        valid_mask = np.isfinite(data)

        # Identify and handle NoData values
        nodata = src.nodata
        if nodata is not None:
            # A NaN nodata is already covered by the isfinite mask
            if not np.isnan(nodata):
                valid_mask &= data != nodata
            print(f"NoData value detected: {nodata}")
            print(f"Valid pixels: {np.sum(valid_mask):,} / {data.size:,}")
        elif np.any(data < -32000):
            # No declared nodata: fall back to the common sentinel range
            print(f"Warning: Detected likely NoData values (< -32000)")
            valid_mask &= data > -32000
            print(f"Valid pixels: {np.sum(valid_mask):,} / {data.size:,}")

        if not np.any(valid_mask):
            raise ValueError(f"No valid pixels found in {input_tiff}; cannot normalize")

        # Find minimum value ONLY from valid pixels
        min_value = np.min(data[valid_mask])
        max_value_orig = np.max(data[valid_mask])

        print(f"Original data range (valid pixels only): {min_value:.2f} to {max_value_orig:.2f}")

        # Normalize: subtract minimum from valid values only
        normalized_data = data.copy()
        normalized_data[valid_mask] = data[valid_mask] - min_value

        # Set NoData pixels to 0 (they'll be at the base of the model)
        normalized_data[~valid_mask] = 0

        # Get new max value after normalization
        max_value = np.max(normalized_data[valid_mask])

        print(f"Normalized data range: 0.00 to {max_value:.2f}")
        print(f"Unique values: {len(np.unique(normalized_data[valid_mask])):,}")

        # Update metadata - remove nodata since we converted them to 0, and
        # declare a single band because only band 1 is written below
        meta.update({
            'dtype': rasterio.float32,
            'count': 1,
            'nodata': None
        })

        # Write normalized data to output file
        with rasterio.open(output_tiff, 'w', **meta) as dst:
            dst.write(normalized_data.astype(rasterio.float32), 1)

    return {
        'output_path': output_tiff,
        'original_min': float(min_value),
        'original_max': float(max_value_orig),
        'normalized_max': float(max_value),
        'shift_value': float(min_value),
        'nodata_handled': True
    }

# Example usage:
# result = normalize_raster_to_positive('raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m.tif', 'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized.tif')
# print(result)


In [8]:
from rasterio.warp import calculate_default_transform, reproject

def reproject_tiff(input_tiff, output_tiff, target_crs='EPSG:32188'):
    """
    Warp a TIFF raster into another coordinate reference system.

    The output grid (transform, width, height) is the default one computed
    for the target CRS from the source extent; every band is reprojected
    with bilinear resampling.

    Parameters:
    -----------
    input_tiff : str
        Path to the input TIFF file
    output_tiff : str
        Path to the output reprojected TIFF file
    target_crs : str
        Target CRS (default: 'EPSG:32188')

    Returns:
    --------
    dict : Dictionary with reprojection info (output path, original CRS, target CRS)
    """
    with rasterio.open(input_tiff) as source:
        # Remember the CRS we started from for the summary
        source_crs = source.crs

        # Work out the destination grid covering the same extent
        dst_transform, dst_width, dst_height = calculate_default_transform(
            source_crs, target_crs, source.width, source.height, *source.bounds
        )

        # Output profile = source profile with the new georeferencing
        out_meta = source.meta.copy()
        out_meta.update(
            crs=target_crs,
            transform=dst_transform,
            width=dst_width,
            height=dst_height,
        )

        band_numbers = range(1, source.count + 1)
        with rasterio.open(output_tiff, 'w', **out_meta) as dest:
            # Warp band by band straight from the source into the new file
            for band_number in band_numbers:
                reproject(
                    source=rasterio.band(source, band_number),
                    destination=rasterio.band(dest, band_number),
                    src_transform=source.transform,
                    src_crs=source_crs,
                    dst_transform=dst_transform,
                    dst_crs=target_crs,
                    resampling=Resampling.bilinear,
                )

    summary = {
        'output_path': output_tiff,
        'original_crs': str(source_crs),
        'target_crs': target_crs,
    }
    return summary

# Example usage:
# result = reproject_tiff('raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized.tif', 'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized_reprojected.tif')
# print(result)


## Ensure Clean Raster Extents

- **No NoData holes inside AOI**: Fill internal voids with interpolation or morphological operations
- **Use Set Null or Fill if needed**: Multiple fill methods available (interpolate, fill_holes, mean, nearest)
- **Rectangular extents import better as heightmaps**: Removes fragmented data for clean heightmap import

In [9]:
import numpy as np
from scipy.ndimage import binary_fill_holes, distance_transform_edt
from scipy.interpolate import griddata

def clean_raster_extents(input_tiff, output_tiff, fill_method='interpolate', nodata_value=None, min_hole_threshold=0.001):
    """
    Ensure clean raster extents by filling NoData holes inside AOI.
    Creates rectangular extents suitable for heightmap import.

    Only the first band is read, so the output is always a single-band
    float32 raster.

    Parameters:
    -----------
    input_tiff : str
        Path to the input TIFF file
    output_tiff : str
        Path to the output cleaned TIFF file
    fill_method : str
        Method to fill NoData values:
        - 'interpolate': Linear interpolation from surrounding valid pixels;
          pixels outside the convex hull of valid data get the mean (default)
        - 'fill_holes': Copy the value of the nearest valid pixel
          (same implementation as 'nearest')
        - 'mean': Fill with mean value of valid pixels
        - 'nearest': Copy the value of the nearest valid pixel
    nodata_value : float
        NoData value to identify holes (default: uses raster's nodata value,
        or NaN when the raster declares none). NaN and infinite pixels are
        always treated as holes.
    min_hole_threshold : float
        Minimum fraction (0-1) of invalid pixels required to trigger hole filling
        Default: 0.001 (0.1% of pixels). Below this, data is copied unchanged
        and, if any holes exist, the NoData tag is kept so they stay masked.

    Returns:
    --------
    dict : Dictionary with cleaning info (method, holes_filled, output path).
        'holes_filled' is the number of invalid pixels detected; they are
        actually filled only when 'processing_applied' is True.

    Raises:
    -------
    ValueError
        If fill_method is not one of the supported methods, or the raster
        contains no valid pixels to fill from.
    """
    supported_methods = ('interpolate', 'fill_holes', 'mean', 'nearest')
    if fill_method not in supported_methods:
        # Previously an unknown method silently left every hole unfilled
        raise ValueError(
            f"Unknown fill_method '{fill_method}'; expected one of {supported_methods}"
        )

    with rasterio.open(input_tiff) as src:
        meta = src.meta.copy()
        data = src.read(1).astype(float)  # Read first band as float

        # Set nodata value
        if nodata_value is None:
            nodata_value = src.nodata if src.nodata is not None else np.nan

        # Identify valid and invalid pixels (isfinite also rejects NaN)
        valid_mask = np.isfinite(data) & (data != nodata_value)
        invalid_mask = ~valid_mask

        if not np.any(valid_mask):
            raise ValueError(f"No valid pixels found in {input_tiff}; nothing to fill from")

        holes_count = np.sum(invalid_mask)
        total_pixels = data.size
        hole_percentage = holes_count / total_pixels
        processing_applied = bool(hole_percentage >= min_hole_threshold)

        print(f"Data inspection:")
        print(f"  Total pixels: {total_pixels:,}")
        print(f"  Valid pixels: {np.sum(valid_mask):,} ({100 * np.sum(valid_mask) / total_pixels:.2f}%)")
        print(f"  Invalid/NoData pixels: {holes_count:,} ({100 * hole_percentage:.2f}%)")
        print(f"  Data range: {np.nanmin(data[valid_mask]):.2f} to {np.nanmax(data[valid_mask]):.2f}")

        processed_data = data.copy()

        # Only process if there are significant holes
        if not processing_applied:
            print(f"✓ Hole percentage ({100 * hole_percentage:.3f}%) below threshold ({100 * min_hole_threshold:.1f}%)")
            print(f"  Copying data unchanged (no processing needed)")
        else:
            print(f"  Filling {holes_count:,} holes using '{fill_method}' method...")

            if np.any(invalid_mask):
                if fill_method == 'interpolate':
                    # (row, col) coordinates of known samples and of the gaps
                    valid_points = np.column_stack(np.where(valid_mask))
                    valid_values = data[valid_mask]
                    invalid_points = np.column_stack(np.where(invalid_mask))

                    # Interpolate ONLY the invalid pixels; fill_value covers
                    # gaps lying outside the convex hull of the valid points
                    filled_values = griddata(
                        points=valid_points,
                        values=valid_values,
                        xi=invalid_points,
                        method='linear',
                        fill_value=np.nanmean(valid_values)
                    )
                    processed_data[invalid_mask] = filled_values

                elif fill_method == 'mean':
                    processed_data[invalid_mask] = np.nanmean(data[valid_mask])

                else:
                    # 'fill_holes' / 'nearest': for every invalid pixel the
                    # distance transform returns the index of the closest
                    # valid (zero-valued in the mask) pixel
                    indices = distance_transform_edt(invalid_mask, return_distances=False, return_indices=True)
                    processed_data[invalid_mask] = data[tuple(indices[:, invalid_mask])]

        # If holes were detected but left untouched, keep them tagged as
        # NoData; dropping the tag would turn the sentinel values into
        # apparent elevations downstream
        holes_remain = bool(holes_count > 0) and not processing_applied

        meta.update({
            'dtype': rasterio.float32,
            'count': 1,  # only band 1 is written
            'nodata': nodata_value if holes_remain else None
        })

        # Write cleaned data
        with rasterio.open(output_tiff, 'w', **meta) as dst:
            dst.write(processed_data.astype(rasterio.float32), 1)

    return {
        'output_path': output_tiff,
        'fill_method': fill_method if processing_applied else 'none (no holes detected)',
        'holes_filled': int(holes_count),
        'hole_percentage': float(hole_percentage * 100),
        'processing_applied': processing_applied,
        'rectangular_extent': True,
        'heightmap_ready': True
    }

# Example usage:
# result = clean_raster_extents('raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized_reprojected.tif', 'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized_reprojected_cleaned.tif', fill_method='interpolate')
# print(result)


In [10]:
def save_as_geotiff(input_tiff, output_tiff, compression='LZW'):
    """
    Re-write a raster as a 32-bit float GeoTIFF.

    All bands are read, cast to float32 and written with the GTiff driver.
    The pixel size is inspected and reported, but not modified.

    Parameters:
    -----------
    input_tiff : str
        Path to the input TIFF file
    output_tiff : str
        Path to the output GeoTIFF file
    compression : str
        Compression method: 'LZW', None, or other GDAL-supported methods
        (default: 'LZW')

    Returns:
    --------
    dict : Dictionary with file info (output path, pixel type, compression, square pixels)
    """
    with rasterio.open(input_tiff) as source:
        bands = source.read()
        profile = source.meta.copy()

        # Pixels count as square when both sizes agree to within 1e-6
        x_res, y_res = source.res
        is_square = abs(x_res - y_res) < 1e-6

        # Force the GeoTIFF driver, float32 samples and the requested compression
        profile.update(
            driver='GTiff',
            dtype=rasterio.float32,
            compress=compression or None,
        )

        with rasterio.open(output_tiff, 'w', **profile) as sink:
            sink.write(bands.astype(rasterio.float32))

    file_info = {
        'output_path': output_tiff,
        'pixel_type': '32-bit float',
        'compression': compression or 'None',
        'square_pixels': is_square,
        'x_resolution': float(x_res),
        'y_resolution': float(y_res),
    }
    return file_info

# Example usage:
# result = save_as_geotiff('raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized_reprojected_cleaned.tif', 'data/MNS(terrain+batiment)_2015_1m_Montréal-Est_preprocessed.tif', compression='LZW')
# print(result)


In [11]:
import os
from pathlib import Path

def preprocess_dem(input_dem_path, output_name=None, target_crs='EPSG:32188', output_dir='data'):
    """
    Master function that runs the complete DEM preprocessing pipeline.
    Executes all preprocessing steps in sequence on the input DEM file:
    resample to 5m, normalize to positive values, reproject, fill NoData
    holes, and save the final compressed float32 GeoTIFF.

    Intermediate rasters are written next to the input file using fixed
    names (output_dem_5m*.tif), so processing a second DEM from the same
    folder overwrites them.

    Parameters:
    -----------
    input_dem_path : str
        Path to the input DEM TIFF file
    output_name : str
        Name for the final output file (without extension).
        If None, uses input filename + '_preprocessed'
    target_crs : str
        CRS the DEM is reprojected to in step 3 (default: 'EPSG:32188')
    output_dir : str
        Folder that receives the final GeoTIFF; created if missing
        (default: 'data', relative to the current working directory)

    Returns:
    --------
    dict : Dictionary containing the final output path and preprocessing summary
    """

    # Get the raw data folder (where input file is located)
    input_path = Path(input_dem_path)
    raw_data_folder = input_path.parent

    # Get input filename without extension
    input_basename = input_path.stem

    # Set output name
    if output_name is None:
        output_name = f"{input_basename}_preprocessed"

    print(f"Starting DEM preprocessing pipeline for: {input_basename}")
    print("=" * 60)

    # Step 1: Resample to 5m resolution
    print("\n[Step 1/5] Resampling to 5m resolution...")
    resampled_path = str(raw_data_folder / 'output_dem_5m.tif')
    result1 = resample_tiff_to_5m(input_dem_path, resampled_path)
    print(f"✓ Saved: {result1['output_path']}")

    # Step 2: Normalize to positive values
    print("\n[Step 2/5] Normalizing raster to positive values...")
    normalized_path = str(raw_data_folder / 'output_dem_5m_normalized.tif')
    result2 = normalize_raster_to_positive(resampled_path, normalized_path)
    print(f"✓ Saved: {result2['output_path']}")
    print(f"  Original min: {result2['original_min']}, Normalized max: {result2['normalized_max']}")

    # Step 3: Reproject to target CRS
    print(f"\n[Step 3/5] Reprojecting to {target_crs}...")
    reprojected_path = str(raw_data_folder / 'output_dem_5m_normalized_reprojected.tif')
    result3 = reproject_tiff(normalized_path, reprojected_path, target_crs=target_crs)
    print(f"✓ Saved: {result3['output_path']}")
    print(f"  CRS: {result3['original_crs']} → {result3['target_crs']}")

    # Step 4: Clean raster extents
    print("\n[Step 4/5] Cleaning raster extents (filling NoData holes)...")
    cleaned_path = str(raw_data_folder / 'output_dem_5m_normalized_reprojected_cleaned.tif')
    result4 = clean_raster_extents(reprojected_path, cleaned_path, fill_method='interpolate')
    print(f"✓ Saved: {result4['output_path']}")
    print(f"  Holes filled: {result4['holes_filled']}")

    # Step 5: Save as final GeoTIFF
    print("\n[Step 5/5] Saving final GeoTIFF to data folder...")
    # Ensure the output folder exists
    os.makedirs(output_dir, exist_ok=True)
    # Forward slash kept on purpose so the reported path is the same on every OS
    final_output_path = f'{output_dir}/{output_name}.tif'
    result5 = save_as_geotiff(cleaned_path, final_output_path, compression='LZW')
    print(f"✓ Saved: {result5['output_path']}")
    print(f"  Pixel type: {result5['pixel_type']}")
    print(f"  Compression: {result5['compression']}")
    print(f"  Square pixels: {result5['square_pixels']}")

    print("\n" + "=" * 60)
    print("✓ DEM preprocessing completed successfully!")
    print(f"Final output: {final_output_path}")

    # Report what the steps actually did rather than repeating constants
    return {
        'input_file': input_dem_path,
        'output_file': final_output_path,
        'status': 'completed',
        'steps_executed': 5,
        'final_resolution': result1['new_resolution'],
        'target_crs': result3['target_crs']
    }

# Example usage: run the whole pipeline on the Montréal-Est DEM file and print
# the summary dictionary returned by preprocess_dem.
result = preprocess_dem('raw_data/mnsterrainbatiment_2015_1m_montreal-est/MNS(terrain+batiment)_2015_1m_Montréal-Est.tif')
print(result)


Starting DEM preprocessing pipeline for: MNS(terrain+batiment)_2015_1m_Montréal-Est

[Step 1/5] Resampling to 5m resolution...
✓ Saved: raw_data\mnsterrainbatiment_2015_1m_montreal-est\output_dem_5m.tif

[Step 2/5] Normalizing raster to positive values...
NoData value detected: -32767.0
Valid pixels: 558,979 / 1,135,745
Original data range (valid pixels only): -72.82 to 72.53
Normalized data range: 0.00 to 145.34
Unique values: 542,714
✓ Saved: raw_data\mnsterrainbatiment_2015_1m_montreal-est\output_dem_5m_normalized.tif
  Original min: -72.81660461425781, Normalized max: 145.34471893310547

[Step 3/5] Reprojecting to EPSG:32188...
✓ Saved: raw_data\mnsterrainbatiment_2015_1m_montreal-est\output_dem_5m_normalized_reprojected.tif
  CRS: EPSG:2950 → EPSG:32188

[Step 4/5] Cleaning raster extents (filling NoData holes)...
Data inspection:
  Total pixels: 1,135,745
  Valid pixels: 1,135,745 (100.00%)
  Invalid/NoData pixels: 0 (0.00%)
  Data range: 0.00 to 145.34
✓ Hole percentage (0.000%)

In [12]:
def diagnose_tiff_data(tiff_path):
    """
    Print a data-quality report for the first band of a TIFF file.

    Reports shape, dtype, CRS and resolution, counts of finite / NaN / inf
    values, the overall value range, and (for rasters larger than 100x100)
    a uniqueness and row/column variance check on the central 100x100 patch.
    Helps identify where corruption is introduced in the pipeline.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Diagnosing: {tiff_path}")
    print(f"{rule}")

    with rasterio.open(tiff_path) as src:
        data = src.read(1).astype(float)

        # Header: basic raster description
        print(f"Shape: {data.shape}")
        print(f"Data type: {src.dtypes[0]}")
        print(f"CRS: {src.crs}")
        print(f"Resolution: {src.res}")

        # Counts used by the quality section
        total_pixels = data.size
        finite_mask = np.isfinite(data)
        finite_count = np.sum(finite_mask)
        unique_values = len(np.unique(data[finite_mask]))

        print(f"\nData Quality:")
        print(f"  Total pixels: {total_pixels:,}")
        print(f"  Finite values: {finite_count:,} ({100*finite_count/total_pixels:.2f}%)")
        print(f"  NaN values: {np.sum(np.isnan(data)):,}")
        print(f"  Inf values: {np.sum(np.isinf(data)):,}")
        print(f"  Unique values: {unique_values:,}")

        # Nothing more to report when no pixel holds a finite value
        if not np.any(finite_mask):
            return

        print(f"\nValue Range:")
        print(f"  Min: {np.nanmin(data):.2f}")
        print(f"  Max: {np.nanmax(data):.2f}")
        print(f"  Mean: {np.nanmean(data):.2f}")
        print(f"  Std: {np.nanstd(data):.2f}")

        # The patch check needs more than 100 pixels in both directions
        n_rows, n_cols = data.shape
        if not (n_rows > 100 and n_cols > 100):
            return

        # Central 100x100 window
        mid_row, mid_col = n_rows // 2, n_cols // 2
        sample = data[mid_row-50:mid_row+50, mid_col-50:mid_col+50]

        # Few unique values or strongly directional variance hint at
        # data loss or striping
        sample_unique = len(np.unique(sample[np.isfinite(sample)]))
        row_means = [np.nanmean(row) for row in sample]
        col_means = [np.nanmean(col) for col in sample.T]
        row_variance = np.var(row_means)
        col_variance = np.var(col_means)

        print(f"\nCenter Sample (100x100):")
        print(f"  Unique values: {sample_unique}")
        print(f"  Row variance: {row_variance:.4f}")
        print(f"  Col variance: {col_variance:.4f}")

        if sample_unique < 10:
            print(f"  ⚠️  WARNING: Very few unique values - possible data loss!")
        if row_variance > col_variance * 10 or col_variance > row_variance * 10:
            print(f"  ⚠️  WARNING: Directional variance suggests striping pattern!")

# Run the diagnostic report over the source DEM and each intermediate
# product, in pipeline order, so the first step showing bad data stands out
print("Checking intermediate files...")
files_to_check = [
    'raw_data/mnsterrainbatiment_2015_1m_montreal-est/MNS(terrain+batiment)_2015_1m_Montréal-Est.tif',
    'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m.tif',
    'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized.tif',
    'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized_reprojected.tif',
    'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized_reprojected_cleaned.tif',
]

for f in files_to_check:
    # Report missing files instead of failing, so later files are still checked
    if not os.path.exists(f):
        print(f"\n⚠️  File not found: {f}")
        continue
    diagnose_tiff_data(f)


Checking intermediate files...

Diagnosing: raw_data/mnsterrainbatiment_2015_1m_montreal-est/MNS(terrain+batiment)_2015_1m_Montréal-Est.tif
Shape: (4326, 6568)
Data type: float32
CRS: EPSG:2950
Resolution: (1.0, 1.0)

Data Quality:
  Total pixels: 28,413,168
  Finite values: 28,413,168 (100.00%)
  NaN values: 0
  Inf values: 0
  Unique values: 8,664,046

Value Range:
  Min: -32767.00
  Max: 76.68
  Mean: -16637.80
  Std: 16394.50

Center Sample (100x100):
  Unique values: 9916
  Row variance: 3.9227
  Col variance: 5.5917

Diagnosing: raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m.tif
Shape: (865, 1313)
Data type: float32
CRS: EPSG:2950
Resolution: (5.0, 5.0)

Data Quality:
  Total pixels: 1,135,745
  Finite values: 1,135,745 (100.00%)
  NaN values: 0
  Inf values: 0
  Unique values: 542,715

Value Range:
  Min: -32767.00
  Max: 72.53
  Mean: -16627.11
  Std: 16394.67

Center Sample (100x100):
  Unique values: 9979
  Row variance: 0.6318
  Col variance: 2.3688

Diagnosi