pre processing are all steps that could be done in arcgis or qgis

In [None]:
import rasterio
from rasterio.vrt import WarpedVRT
from rasterio.enums import Resampling
import os

def resample_tiff_to_5m(input_tiff, output_tiff, resampling_method=Resampling.bilinear):
    """
    Resample a TIFF raster file to 5m resolution using specified resampling method.
    
    Parameters:
    -----------
    input_tiff : str
        Path to the input TIFF file
    output_tiff : str
        Path to the output resampled TIFF file
    resampling_method : rasterio.enums.Resampling
        Resampling method to use (default: bilinear)
        Options: nearest, bilinear, cubic, cubicspline, lanczos, average, mode, etc.
    
    Returns:
    --------
    dict : Dictionary with resampling info (output path, original resolution, new resolution)
    """
    
    with rasterio.open(input_tiff) as src:
        # Get original metadata
        meta = src.meta.copy()
        original_res = src.res[0]  # Get resolution (assuming square pixels)
        
        # Calculate scale factor (5m / original_resolution)
        scale = original_res / 5.0
        
        # Update metadata for resampling
        meta.update({
            'height': int(src.height * scale),
            'width': int(src.width * scale),
            'transform': src.transform * src.transform.scale(1.0 / scale, 1.0 / scale)
        })
        
        # Use WarpedVRT for efficient resampling
        with WarpedVRT(src, resampling=resampling_method, 
                       height=meta['height'], 
                       width=meta['width'],
                       transform=meta['transform']) as vrt:
            
            # Write resampled data to output file
            with rasterio.open(output_tiff, 'w', **meta) as dst:
                dst.write(vrt.read())
    
    return {
        'output_path': output_tiff,
        'original_resolution': original_res,
        'new_resolution': 5.0,
        'resampling_method': str(resampling_method)
    }

# Example usage:
# result = resample_tiff_to_5m('raw_data/mnsterrainbatiment_2015_1m_montreal-est/MNS(terrain+batiment)_2015_1m_Montréal-Est.tif', 'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m.tif')
# print(result)


recalculate normals so the all values are positive. although this isnt strictly required, it makes it easier to trouble shoot issues

In [None]:
def normalize_raster_to_positive(input_tiff, output_tiff):
    """
    Normalize a raster so that values range from 0 to maximum value.
    Subtracts the minimum value from all pixels to shift negative values to positive.
    
    Parameters:
    -----------
    input_tiff : str
        Path to the input TIFF file (resampled raster)
    output_tiff : str
        Path to the output normalized TIFF file
    
    Returns:
    --------
    dict : Dictionary with normalization info (min value, max value, output path)
    """
    
    with rasterio.open(input_tiff) as src:
        data = src.read()
        meta = src.meta.copy()
        
        # Find minimum value across all bands
        min_value = data.min()
        
        # Normalize: subtract minimum from all values
        normalized_data = data - min_value
        
        # Get new max value after normalization
        max_value = normalized_data.max()
        
        # Write normalized data to output file
        with rasterio.open(output_tiff, 'w', **meta) as dst:
            dst.write(normalized_data)
    
    return {
        'output_path': output_tiff,
        'original_min': float(min_value),
        'normalized_max': float(max_value),
        'shift_value': float(min_value)
    }

# Example usage:
# result = normalize_raster_to_positive('raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m.tif', 'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized.tif')
# print(result)


In [None]:
from rasterio.warp import calculate_default_transform, reproject

def reproject_tiff(input_tiff, output_tiff, target_crs='EPSG:32188'):
    """
    Reproject a TIFF raster file to a target coordinate reference system.
    
    Parameters:
    -----------
    input_tiff : str
        Path to the input TIFF file
    output_tiff : str
        Path to the output reprojected TIFF file
    target_crs : str
        Target CRS (default: 'EPSG:32188')
    
    Returns:
    --------
    dict : Dictionary with reprojection info (output path, original CRS, target CRS)
    """
    
    with rasterio.open(input_tiff) as src:
        # Store original CRS
        original_crs = src.crs
        
        # Calculate transform and dimensions for target CRS
        transform, width, height = calculate_default_transform(
            src.crs, target_crs, src.width, src.height, *src.bounds
        )
        
        # Update metadata
        meta = src.meta.copy()
        meta.update({
            'crs': target_crs,
            'transform': transform,
            'width': width,
            'height': height
        })
        
        # Write reprojected data to output file
        with rasterio.open(output_tiff, 'w', **meta) as dst:
            for i in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, i),
                    destination=rasterio.band(dst, i),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=transform,
                    dst_crs=target_crs,
                    resampling=Resampling.bilinear
                )
    
    return {
        'output_path': output_tiff,
        'original_crs': str(original_crs),
        'target_crs': target_crs
    }

# Example usage:
# result = reproject_tiff('raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized.tif', 'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized_reprojected.tif')
# print(result)


In [None]:
## Ensure Clean Raster Extents

- **No NoData holes inside AOI**: Fill internal voids with interpolation or morphological operations
- **Use Set Null or Fill if needed**: Multiple fill methods available (interpolate, fill_holes, mean, nearest)
- **Rectangular extents import better as heightmaps**: Removes fragmented data for clean heightmap import

In [None]:
import numpy as np
from scipy.ndimage import binary_fill_holes, distance_transform_edt
from scipy.interpolate import griddata

def clean_raster_extents(input_tiff, output_tiff, fill_method='interpolate', nodata_value=None):
    """
    Ensure clean raster extents by filling NoData holes inside AOI.
    Creates rectangular extents suitable for heightmap import.
    
    Parameters:
    -----------
    input_tiff : str
        Path to the input TIFF file
    output_tiff : str
        Path to the output cleaned TIFF file
    fill_method : str
        Method to fill NoData values:
        - 'interpolate': Interpolate missing values from surrounding pixels (default)
        - 'fill_holes': Fill internal holes using morphological operations
        - 'mean': Fill with mean value of valid pixels
        - 'nearest': Nearest neighbor interpolation
    nodata_value : float
        NoData value to identify holes (default: uses raster's nodata value)
    
    Returns:
    --------
    dict : Dictionary with cleaning info (method, holes_filled, output path)
    """
    
    with rasterio.open(input_tiff) as src:
        meta = src.meta.copy()
        data = src.read(1).astype(float)  # Read first band as float
        
        # Set nodata value
        if nodata_value is None:
            nodata_value = src.nodata if src.nodata is not None else np.nan
        
        # Identify valid and invalid pixels
        valid_mask = ~np.isnan(data) & (data != nodata_value)
        invalid_mask = ~valid_mask
        holes_count = np.sum(invalid_mask)
        
        # Apply fill method
        if fill_method == 'interpolate':
            # Interpolate missing values from surrounding valid pixels
            if np.any(valid_mask):
                points = np.where(valid_mask)
                values = data[valid_mask]
                
                # Create grid coordinates
                coords = np.array(np.meshgrid(np.arange(data.shape[0]), 
                                             np.arange(data.shape[1]), 
                                             indexing='ij')).T.reshape(-1, 2)
                
                # Interpolate
                filled_data = griddata(points=np.column_stack((points[0], points[1])), 
                                      values=values, 
                                      xi=coords,
                                      method='linear', 
                                      fill_value=np.nanmean(values))
                data = filled_data.reshape(data.shape)
        
        elif fill_method == 'fill_holes':
            # Fill internal holes using binary morphological operations
            data_copy = data.copy()
            data_copy[invalid_mask] = 0
            filled = binary_fill_holes(valid_mask).astype(float)
            # Use nearest neighbor for filled holes
            y_coords, x_coords = np.where(valid_mask)
            if len(y_coords) > 0:
                filled_data = distance_transform_edt(invalid_mask, return_distances=False, 
                                                     return_indices=True)
                data[invalid_mask] = data[filled_data[0, invalid_mask], 
                                         filled_data[1, invalid_mask]]
        
        elif fill_method == 'mean':
            # Fill with mean of valid pixels
            mean_value = np.nanmean(data[valid_mask]) if np.any(valid_mask) else 0
            data[invalid_mask] = mean_value
        
        elif fill_method == 'nearest':
            # Nearest neighbor fill
            if np.any(valid_mask):
                from scipy.ndimage import distance_transform_edt as dt
                indices = dt(invalid_mask, return_distances=False, return_indices=True)
                data[invalid_mask] = data[tuple(indices[:, invalid_mask])]
        
        # Update metadata (ensure rectangular extent, no missing values)
        meta.update({
            'dtype': rasterio.float32,
            'nodata': None  # Remove nodata since we filled holes
        })
        
        # Write cleaned data
        with rasterio.open(output_tiff, 'w', **meta) as dst:
            dst.write(data.astype(rasterio.float32), 1)
    
    return {
        'output_path': output_tiff,
        'fill_method': fill_method,
        'holes_filled': int(holes_count),
        'rectangular_extent': True,
        'heightmap_ready': True
    }

# Example usage:
# result = clean_raster_extents('raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized_reprojected.tif', 'raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized_reprojected_cleaned.tif', fill_method='interpolate')
# print(result)


In [None]:
def save_as_geotiff(input_tiff, output_tiff, compression='LZW'):
    """
    Save raster as GeoTIFF with specified format requirements.
    
    Parameters:
    -----------
    input_tiff : str
        Path to the input TIFF file
    output_tiff : str
        Path to the output GeoTIFF file
    compression : str
        Compression method: 'LZW', None, or other GDAL-supported methods
        (default: 'LZW')
    
    Returns:
    --------
    dict : Dictionary with file info (output path, pixel type, compression, square pixels)
    """
    
    with rasterio.open(input_tiff) as src:
        # Read data
        data = src.read()
        
        # Get metadata
        meta = src.meta.copy()
        
        # Ensure square pixels by checking resolution
        x_res, y_res = src.res
        is_square = abs(x_res - y_res) < 1e-6
        
        # Update metadata for GeoTIFF requirements
        meta.update({
            'driver': 'GTiff',
            'dtype': rasterio.float32,  # 32-bit float
            'compress': compression if compression else None
        })
        
        # Write to output file
        with rasterio.open(output_tiff, 'w', **meta) as dst:
            dst.write(data.astype(rasterio.float32))
    
    return {
        'output_path': output_tiff,
        'pixel_type': '32-bit float',
        'compression': compression if compression else 'None',
        'square_pixels': is_square,
        'x_resolution': float(x_res),
        'y_resolution': float(y_res)
    }

# Example usage:
# result = save_as_geotiff('raw_data/mnsterrainbatiment_2015_1m_montreal-est/output_dem_5m_normalized_reprojected_cleaned.tif', 'data/MNS(terrain+batiment)_2015_1m_Montréal-Est_preprocessed.tif', compression='LZW')
# print(result)


{'output_path': 'output_final.tif', 'pixel_type': '32-bit float', 'compression': 'LZW', 'square_pixels': True, 'x_resolution': 5.0, 'y_resolution': 5.0}


In [8]:
import os
from pathlib import Path

def preprocess_dem(input_dem_path, output_name=None):
    """
    Master function that runs the complete DEM preprocessing pipeline.
    Executes all preprocessing steps in sequence on the input DEM file.
    
    Parameters:
    -----------
    input_dem_path : str
        Path to the input DEM TIFF file
    output_name : str
        Name for the final output file (without extension). 
        If None, uses input filename + '_preprocessed'
    
    Returns:
    --------
    dict : Dictionary containing the final output path and preprocessing summary
    """
    
    # Get the raw data folder (where input file is located)
    input_path = Path(input_dem_path)
    raw_data_folder = input_path.parent
    
    # Get input filename without extension
    input_basename = input_path.stem
    
    # Set output name
    if output_name is None:
        output_name = f"{input_basename}_preprocessed"
    
    print(f"Starting DEM preprocessing pipeline for: {input_basename}")
    print("=" * 60)
    
    # Step 1: Resample to 5m resolution
    print("\n[Step 1/5] Resampling to 5m resolution...")
    resampled_path = str(raw_data_folder / 'output_dem_5m.tif')
    result1 = resample_tiff_to_5m(input_dem_path, resampled_path)
    print(f"✓ Saved: {result1['output_path']}")
    
    # Step 2: Normalize to positive values
    print("\n[Step 2/5] Normalizing raster to positive values...")
    normalized_path = str(raw_data_folder / 'output_dem_5m_normalized.tif')
    result2 = normalize_raster_to_positive(resampled_path, normalized_path)
    print(f"✓ Saved: {result2['output_path']}")
    print(f"  Original min: {result2['original_min']}, Normalized max: {result2['normalized_max']}")
    
    # Step 3: Reproject to target CRS
    print("\n[Step 3/5] Reprojecting to EPSG:32188...")
    reprojected_path = str(raw_data_folder / 'output_dem_5m_normalized_reprojected.tif')
    result3 = reproject_tiff(normalized_path, reprojected_path)
    print(f"✓ Saved: {result3['output_path']}")
    print(f"  CRS: {result3['original_crs']} → {result3['target_crs']}")
    
    # Step 4: Clean raster extents
    print("\n[Step 4/5] Cleaning raster extents (filling NoData holes)...")
    cleaned_path = str(raw_data_folder / 'output_dem_5m_normalized_reprojected_cleaned.tif')
    result4 = clean_raster_extents(reprojected_path, cleaned_path, fill_method='interpolate')
    print(f"✓ Saved: {result4['output_path']}")
    print(f"  Holes filled: {result4['holes_filled']}")
    
    # Step 5: Save as final GeoTIFF
    print("\n[Step 5/5] Saving final GeoTIFF to data folder...")
    # Ensure data folder exists
    os.makedirs('data', exist_ok=True)
    final_output_path = f'data/{output_name}.tif'
    result5 = save_as_geotiff(cleaned_path, final_output_path, compression='LZW')
    print(f"✓ Saved: {result5['output_path']}")
    print(f"  Pixel type: {result5['pixel_type']}")
    print(f"  Compression: {result5['compression']}")
    print(f"  Square pixels: {result5['square_pixels']}")
    
    print("\n" + "=" * 60)
    print("✓ DEM preprocessing completed successfully!")
    print(f"Final output: {final_output_path}")
    
    return {
        'input_file': input_dem_path,
        'output_file': final_output_path,
        'status': 'completed',
        'steps_executed': 5,
        'final_resolution': 5.0,
        'target_crs': 'EPSG:32188'
    }

# Example usage:
result = preprocess_dem('raw_data/mnsterrainbatiment_2015_1m_montreal-est/MNS(terrain+batiment)_2015_1m_Montréal-Est.tif')
print(result)


Starting DEM preprocessing pipeline for: MNS(terrain+batiment)_2015_1m_Montréal-Est

[Step 1/5] Resampling to 5m resolution...
✓ Saved: raw_data\mnsterrainbatiment_2015_1m_montreal-est\output_dem_5m.tif

[Step 2/5] Normalizing raster to positive values...
✓ Saved: raw_data\mnsterrainbatiment_2015_1m_montreal-est\output_dem_5m_normalized.tif
  Original min: -32767.0, Normalized max: 32839.52734375

[Step 3/5] Reprojecting to EPSG:32188...
✓ Saved: raw_data\mnsterrainbatiment_2015_1m_montreal-est\output_dem_5m_normalized_reprojected.tif
  CRS: EPSG:2950 → EPSG:32188

[Step 4/5] Cleaning raster extents (filling NoData holes)...
✓ Saved: raw_data\mnsterrainbatiment_2015_1m_montreal-est\output_dem_5m_normalized_reprojected_cleaned.tif
  Holes filled: 0

[Step 5/5] Saving final GeoTIFF to data folder...
✓ Saved: data/MNS(terrain+batiment)_2015_1m_Montréal-Est_preprocessed.tif
  Pixel type: 32-bit float
  Compression: LZW
  Square pixels: True

✓ DEM preprocessing completed successfully!
Fina