### Importing Packages

In [6]:
# Import necessary libraries
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import rasterio
import rasterio.mask
from matplotlib import colors, cm
from osgeo import gdal
import os
from rasterio.plot import show
from rasterio.plot import show_hist
from numpy import ma
# import richdem as rd
from pyspatialml import Raster
from tqdm import tqdm
import rioxarray as rxr
import xarray as xr
from shapely.ops import nearest_points

### Importing Files

In [7]:
# Read every vector layer used by the notebook from its GeoPackage file.
def _read_layer(path):
    """Load one GeoPackage into a GeoDataFrame.

    NOTE(review): `geometry='geometry'` is forwarded to `gpd.read_file` as an
    extra keyword exactly as before; confirm the installed geopandas/IO engine
    actually accepts it.
    """
    return gpd.read_file(path, geometry='geometry')


# Landslide polygons for training, testing and validation
train_data = _read_layer(r"./datasets/Train.gpkg")
test_data = _read_layer(r"./datasets/Test.gpkg")
val_data = _read_layer(r"./datasets/valtellina.gpkg")

# Auxiliary layers used for distance and land-cover features
road_network = _read_layer(r"./datasets/road_network.gpkg")
river_network = _read_layer(r"./datasets/river_network.gpkg")
fault_zones = _read_layer(r"./datasets/geological_faults.gpkg")
land_use = _read_layer(r"./datasets/land_use_land_cover.gpkg")



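The purpose of this notebook is to process the training data: sample each raster at the centroid of every training polygon, compute the distance from each polygon to the nearest road, river and geological fault, derive geometry-based features, and export the result to a CSV file.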

In [8]:
# Sample the DTM raster at the centroid of every training polygon.
# The path uses forward slashes so it resolves on Linux/macOS as well as Windows
# (a backslash path is only a directory path on Windows) and matches the
# "./datasets/..." paths used when the GeoPackages are loaded.
with rasterio.open("./datasets/dtm.tif") as dtm_src:
    # One (x, y) pair per polygon, taken from its centroid
    centroid_xy = [(pt.x, pt.y) for pt in train_data['geometry'].centroid]

    # A single sample() call walks all points; it yields one array of band
    # values per point, of which element 0 is kept.
    dtm_values = [band_values[0] for band_values in dtm_src.sample(centroid_xy)]

# Store the sampled values as a new column of the training GeoDataFrame
train_data['dtm'] = dtm_values


In [9]:
# Sample the 2020 average-precipitation raster at the centroid of every training polygon.
# The path uses forward slashes so it resolves on Linux/macOS as well as Windows
# (a backslash path is only a directory path on Windows) and matches the
# "./datasets/..." paths used when the GeoPackages are loaded.
with rasterio.open("./datasets/average_precipitation_2020.tif") as average_precipitation_src:
    # One (x, y) pair per polygon, taken from its centroid
    centroid_xy = [(pt.x, pt.y) for pt in train_data['geometry'].centroid]

    # A single sample() call walks all points; it yields one array of band
    # values per point, of which element 0 is kept.
    average_precipitation_values = [
        band_values[0] for band_values in average_precipitation_src.sample(centroid_xy)
    ]

# Store the sampled values as a new column of the training GeoDataFrame
train_data['average_precipitation'] = average_precipitation_values


In [10]:
# Sample the 2020 90th-percentile precipitation raster at the centroid of every training polygon.
# The path uses forward slashes so it resolves on Linux/macOS as well as Windows
# (a backslash path is only a directory path on Windows) and matches the
# "./datasets/..." paths used when the GeoPackages are loaded.
with rasterio.open("./datasets/90_perc_precipitation_2020.tif") as perc_precipitation_src:
    # One (x, y) pair per polygon, taken from its centroid
    centroid_xy = [(pt.x, pt.y) for pt in train_data['geometry'].centroid]

    # A single sample() call walks all points; it yields one array of band
    # values per point, of which element 0 is kept.
    perc_precipitation_values = [
        band_values[0] for band_values in perc_precipitation_src.sample(centroid_xy)
    ]

# Store the sampled values as a new column of the training GeoDataFrame
train_data['perc_precipitation'] = perc_precipitation_values


In [11]:
# Sample the eastness raster at the centroid of every training polygon
with rasterio.open("eastness.tif") as eastness_src:
    # One (x, y) pair per polygon, taken from its centroid
    centroid_xy = [
        (pt.x, pt.y)
        for pt in (geom.centroid for geom in train_data['geometry'])
    ]

    # Each sample() call yields an array of band values for the point; keep element 0
    eastness_values = [
        next(eastness_src.sample([xy]))[0]
        for xy in centroid_xy
    ]

    # Store the sampled values as a new column of the training GeoDataFrame
    train_data['eastness'] = eastness_values


In [12]:
# Sample the northness raster at the centroid of every training polygon
with rasterio.open("northerness.tif") as northness_src:
    # One (x, y) pair per polygon, taken from its centroid
    centroid_xy = [
        (pt.x, pt.y)
        for pt in (geom.centroid for geom in train_data['geometry'])
    ]

    # Each sample() call yields an array of band values for the point; keep element 0
    northness_values = [
        next(northness_src.sample([xy]))[0]
        for xy in centroid_xy
    ]

    # Store the sampled values as a new column of the training GeoDataFrame
    train_data['northness'] = northness_values


In [13]:
# Sample the hillshade raster at the centroid of every training polygon
with rasterio.open("hillshade.tif") as hillshade_src:
    # One (x, y) pair per polygon, taken from its centroid
    centroid_xy = [
        (pt.x, pt.y)
        for pt in (geom.centroid for geom in train_data['geometry'])
    ]

    # Each sample() call yields an array of band values for the point; keep element 0
    hillshade_values = [
        next(hillshade_src.sample([xy]))[0]
        for xy in centroid_xy
    ]

    # Store the sampled values as a new column of the training GeoDataFrame
    train_data['hillshade'] = hillshade_values


In [14]:
# Sample the roughness raster at the centroid of every training polygon
with rasterio.open("roughness.tif") as roughness_src:
    # One (x, y) pair per polygon, taken from its centroid
    centroid_xy = [
        (pt.x, pt.y)
        for pt in (geom.centroid for geom in train_data['geometry'])
    ]

    # Each sample() call yields an array of band values for the point; keep element 0
    roughness_values = [
        next(roughness_src.sample([xy]))[0]
        for xy in centroid_xy
    ]

    # Store the sampled values as a new column of the training GeoDataFrame
    train_data['roughness'] = roughness_values


In [15]:
# Sample the slope raster (slope_rad.tif) at the centroid of every training polygon
with rasterio.open("slope_rad.tif") as slope_rad_src:
    # One (x, y) pair per polygon, taken from its centroid
    centroid_xy = [
        (pt.x, pt.y)
        for pt in (geom.centroid for geom in train_data['geometry'])
    ]

    # Each sample() call yields an array of band values for the point; keep element 0
    slope_rad_values = [
        next(slope_rad_src.sample([xy]))[0]
        for xy in centroid_xy
    ]

    # Store the sampled values as a new column of the training GeoDataFrame
    train_data['slope_rad'] = slope_rad_values


In [16]:
# Sample the TPI raster at the centroid of every training polygon
with rasterio.open("TPI.tif") as TPI_src:
    # One (x, y) pair per polygon, taken from its centroid
    centroid_xy = [
        (pt.x, pt.y)
        for pt in (geom.centroid for geom in train_data['geometry'])
    ]

    # Each sample() call yields an array of band values for the point; keep element 0
    TPI_values = [
        next(TPI_src.sample([xy]))[0]
        for xy in centroid_xy
    ]

    # Store the sampled values as a new column of the training GeoDataFrame
    train_data['TPI'] = TPI_values


In [17]:
# Sample the TRI raster at the centroid of every training polygon
with rasterio.open("TRI.tif") as TRI_src:
    # One (x, y) pair per polygon, taken from its centroid
    centroid_xy = [
        (pt.x, pt.y)
        for pt in (geom.centroid for geom in train_data['geometry'])
    ]

    # Each sample() call yields an array of band values for the point; keep element 0
    TRI_values = [
        next(TRI_src.sample([xy]))[0]
        for xy in centroid_xy
    ]

    # Store the sampled values as a new column of the training GeoDataFrame
    train_data['TRI'] = TRI_values


In [18]:
# Sample the aspect raster (aspect_degrees.tif) at the centroid of every training polygon
with rasterio.open("aspect_degrees.tif") as aspect_degree_src:
    # One (x, y) pair per polygon, taken from its centroid
    centroid_xy = [
        (pt.x, pt.y)
        for pt in (geom.centroid for geom in train_data['geometry'])
    ]

    # Each sample() call yields an array of band values for the point; keep element 0
    aspect_degree_values = [
        next(aspect_degree_src.sample([xy]))[0]
        for xy in centroid_xy
    ]

    # Store the sampled values as a new column of the training GeoDataFrame
    train_data['aspect_degree'] = aspect_degree_values


In [19]:
from shapely.ops import nearest_points

# Distance from every training polygon to the nearest road.
# Dissolve the road network a single time: `unary_union` rebuilds the merged
# geometry on each access, so evaluating it inside the per-row function would
# repeat that work for every training polygon.
road_union = road_network.unary_union


def calculate_nearest_distance(train_geom):
    """Return the shortest distance between one geometry and the road network.

    Parameters
    ----------
    train_geom : shapely geometry
        A single training geometry (one entry of ``train_data['geometry']``).

    Returns
    -------
    float
        Distance, in the units of the layers' coordinate reference system,
        between the closest pair of points on ``train_geom`` and on the
        dissolved road network; 0.0 when the two intersect.
    """
    nearest_on_geom, nearest_on_roads = nearest_points(train_geom, road_union)
    return nearest_on_geom.distance(nearest_on_roads)


# Apply the per-geometry distance to every training polygon
train_data['distance_to_nearest_road'] = train_data['geometry'].apply(calculate_nearest_distance)





In [20]:
# Distance from every training polygon to the nearest river.
# Dissolve the river network a single time: `unary_union` rebuilds the merged
# geometry on each access, so evaluating it inside the per-row function would
# repeat that work for every training polygon.
river_union = river_network.unary_union


def calculate_nearest_distance(train_geom):
    """Return the shortest distance between one geometry and the river network.

    Parameters
    ----------
    train_geom : shapely geometry
        A single training geometry (one entry of ``train_data['geometry']``).

    Returns
    -------
    float
        Distance, in the units of the layers' coordinate reference system,
        between the closest pair of points on ``train_geom`` and on the
        dissolved river network; 0.0 when the two intersect.
    """
    nearest_on_geom, nearest_on_rivers = nearest_points(train_geom, river_union)
    return nearest_on_geom.distance(nearest_on_rivers)


# Apply the per-geometry distance to every training polygon
train_data['distance_to_nearest_river'] = train_data['geometry'].apply(calculate_nearest_distance)



In [21]:
# Distance from every training polygon to the nearest geological fault.
# Dissolve the fault layer a single time: `unary_union` rebuilds the merged
# geometry on each access, so evaluating it inside the per-row function would
# repeat that work for every training polygon.
fault_union = fault_zones.unary_union


def calculate_nearest_distance(train_geom):
    """Return the shortest distance between one geometry and the fault zones.

    Parameters
    ----------
    train_geom : shapely geometry
        A single training geometry (one entry of ``train_data['geometry']``).

    Returns
    -------
    float
        Distance, in the units of the layers' coordinate reference system,
        between the closest pair of points on ``train_geom`` and on the
        dissolved fault zones; 0.0 when the two intersect.
    """
    nearest_on_geom, nearest_on_faults = nearest_points(train_geom, fault_union)
    return nearest_on_geom.distance(nearest_on_faults)


# Apply the per-geometry distance to every training polygon
train_data['distance_to_nearest_fault_zones'] = train_data['geometry'].apply(calculate_nearest_distance)


In [22]:
# Display the training GeoDataFrame to inspect the feature columns added so far.
# NOTE(review): in the recorded output, aspect_degree is -9999.0 on rows whose
# slope_rad is 0 — this looks like a nodata sentinel that should be masked
# before modelling; confirm against the raster's nodata value.
train_data


Unnamed: 0,Target,ID,geometry,dtm,average_precipitation,perc_precipitation,eastness,northness,hillshade,roughness,slope_rad,TPI,TRI,aspect_degree,distance_to_nearest_road,distance_to_nearest_river,distance_to_nearest_fault_zones
0,1,ID_000001,"MULTIPOLYGON (((607152.916 5124458.395, 607137...",1499.408203,0.154537,0.265434,-0.816495,-0.577352,-74,4.274292,0.296125,-0.007202,1.153625,234.735474,7257.103048,42.258153,3468.288421
1,1,ID_000002,"MULTIPOLYGON (((611957.101 5131543.071, 611970...",2369.408447,0.134484,0.218065,0.992943,0.118594,-106,2.764648,0.234800,0.055908,0.910767,83.189041,2553.051622,416.844630,662.149764
2,1,ID_000003,"MULTIPOLYGON (((612895.966 5130875.565, 612880...",2103.769043,0.134484,0.218065,0.713229,0.700931,-108,9.699951,0.604573,-0.063721,2.593719,45.498238,1258.714588,0.000000,0.000000
3,1,ID_000004,"MULTIPOLYGON (((609366.882 5131249.149, 609361...",2484.286621,0.143612,0.243227,0.675394,-0.737457,-127,3.734863,0.258438,0.034912,0.963959,137.515228,3393.820938,205.885774,535.782979
4,1,ID_000005,"MULTIPOLYGON (((611814.848 5132646.039, 611808...",2624.262207,0.138958,0.243465,0.850295,-0.526306,58,8.691650,0.568662,-0.041748,2.420074,121.756203,2369.554701,82.533645,220.274205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12135,0,ID_012136,"MULTIPOLYGON (((540821.869 5097076.248, 540821...",2070.000000,0.284925,0.427346,-0.707107,-0.707107,-76,1.171875,0.041408,0.146484,0.146484,225.000000,3319.667932,0.000000,0.000000
12136,0,ID_012137,"MULTIPOLYGON (((542231.869 5096911.248, 542231...",1858.089966,0.285867,0.454944,0.987688,0.156435,-75,0.000000,0.000000,0.000000,0.000000,-9999.000000,2374.290762,1169.573999,0.000000
12137,0,ID_012138,"MULTIPOLYGON (((542636.869 5096726.248, 542641...",2043.433960,0.285867,0.454944,-0.907157,0.420793,-29,4.729248,0.346257,-0.083374,1.401123,294.884674,2381.980504,1592.957765,239.927811
12138,0,ID_012139,"MULTIPOLYGON (((541576.869 5096291.248, 541581...",2254.370117,0.287747,0.456125,0.987688,0.156435,-75,0.000000,0.000000,0.000000,0.000000,-9999.000000,3200.397566,526.553029,0.000000


In [23]:
# Check that every vector layer reports the same EPSG code as the training data;
# the cell displays True when all of them agree.
epsg_codes = [
    layer.crs.to_epsg()
    for layer in (train_data, road_network, river_network, land_use, fault_zones)
]
all(left == right for left, right in zip(epsg_codes, epsg_codes[1:]))

True

In [24]:

# Derive per-polygon geometry features.

# Centroid of each geometry
train_data['centroid'] = train_data['geometry'].centroid

# Centroid coordinates (y and x in the layer's CRS).
# NOTE(review): the coordinates in this dataset look projected (values in the
# 10^5-10^6 range), so these are northing/easting rather than degrees; the
# column names are kept so downstream consumers of the CSV keep working.
train_data['latitude'] = train_data['centroid'].y
train_data['longitude'] = train_data['centroid'].x

# Size measures, in CRS units
train_data['area'] = train_data['geometry'].area
train_data['perimeter'] = train_data['geometry'].length

# Axis-aligned bounding box of each geometry
train_data['bounding_box'] = train_data['geometry'].envelope

# Aspect ratio = bounding-box width / height. It must be built from the box
# extents (maxx - minx, maxy - miny); dividing maxx by maxy only compares
# absolute coordinates and says nothing about the polygon's shape.
bbox = train_data['geometry'].bounds
bbox_width = bbox['maxx'] - bbox['minx']
bbox_height = bbox['maxy'] - bbox['miny']
# A degenerate (zero-height) box has no defined ratio: emit NaN instead of inf
train_data['aspect_ratio'] = bbox_width / bbox_height.replace(0, np.nan)

# Convex hull of each geometry
train_data['convex_hull'] = train_data['geometry'].convex_hull


In [25]:
# Write the enriched training table (all columns, index included) to CSV
processed_train_path = r'./datasets/processed_train.csv'
train_data.to_csv(processed_train_path)