In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import MultiPolygon

# Prepare ML input data

In [2]:
# Load the spreadsheet that lists every candidate feature
workshop_features = pd.read_excel(io="workshop_feature_selection.xlsx")

In [3]:
# Show the feature table (bare last expression -> rich DataFrame display)
workshop_features

Unnamed: 0,feature,group,workshop_ml_input,tn_model_v1_abs_mean_shap,tn_model_v2_abs_mean_shap,tn_model_v3_abs_mean_shap,tn_model_v4_abs_mean_shap
0,arable_prop,arable,1,0.183095,0.240194,0.249932,0.280003
1,arable_prop_buff_100,arable,1,,,,
2,arable_prop_buff_1000,arable,1,,,,
3,arable_prop_buff_500,arable,1,,,,
4,area,area,1,0.015212,0.013442,0.017646,0.030337
...,...,...,...,...,...,...,...
77,twi_mean,twi,1,0.112151,0.058157,0.106335,0.084019
78,twi_std,twi,1,,,,
79,urban_prop,urban,1,0.066393,0.038074,0.061920,0.040733
80,water_prop,water,1,0.096020,0.117541,0.063534,0.071921


In [4]:
# Collect the names of the features flagged with workshop_ml_input == 1
is_selected = workshop_features["workshop_ml_input"].eq(1)
use_features = workshop_features.loc[is_selected, "feature"].tolist()

In [5]:
# Load the full ML input table (comma-separated, the read_csv default)
ml_input = pd.read_csv("source_data/tn_ml_input.csv")

In [6]:
# Show the raw ML input table as loaded from CSV
ml_input

Unnamed: 0,site_code,obs_year,parameter,obs_value,arable_prop,arable_prop_buff_100,arable_prop_buff_1000,arable_prop_buff_500,area,awc1_min,...,tri_max,tri_mean,tri_std,twi_min,twi_max,twi_mean,twi_std,urban_prop,water_prop,wetland_prop
0,SJA0088000,2016,TN,1.138,0.122,0.152,0.150,0.178,704297025.0,0.161,...,2.977,0.085,0.108,2.262,15.093,9.851,1.138,0.012,0.004,0.084
1,SJA0088000,2017,TN,1.175,0.122,0.152,0.150,0.178,704297025.0,0.161,...,2.977,0.085,0.108,2.262,15.093,9.851,1.138,0.012,0.004,0.084
2,SJA0106000,2016,TN,5.967,0.428,0.307,0.380,0.368,26553800.0,0.187,...,1.517,0.070,0.072,3.415,13.996,10.252,1.046,0.018,0.000,0.021
3,SJA0106000,2017,TN,4.175,0.428,0.307,0.380,0.368,26553800.0,0.187,...,1.517,0.070,0.072,3.415,13.996,10.252,1.046,0.018,0.000,0.021
4,SJA0106000,2018,TN,4.225,0.428,0.307,0.380,0.368,26553800.0,0.187,...,1.517,0.070,0.072,3.415,13.996,10.252,1.046,0.018,0.000,0.021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464,SJB3502000,2019,TN,5.347,0.579,0.460,0.498,0.471,18557575.0,0.174,...,1.723,0.076,0.095,3.264,14.062,10.334,1.200,0.023,0.003,0.025
465,SJB3502000,2020,TN,5.596,0.579,0.460,0.498,0.471,18557575.0,0.174,...,1.723,0.076,0.095,3.264,14.062,10.334,1.200,0.023,0.003,0.025
466,SJB3503000,2020,TN,8.225,0.537,0.316,0.523,0.453,112294125.0,0.174,...,2.764,0.099,0.111,2.279,15.563,10.033,1.392,0.028,0.002,0.014
467,SJB3510000,2019,TN,8.090,0.475,0.271,0.573,0.497,29300575.0,0.174,...,2.757,0.134,0.201,2.321,14.517,9.690,1.718,0.025,0.004,0.096


In [7]:
# Every column that is neither a selected feature nor one of the two
# always-kept columns (site key and target value) is marked for removal
keep_cols = set(use_features) | {"site_code", "obs_value"}
drop_cols = [name for name in ml_input.columns if name not in keep_cols]

In [8]:
# Remove the columns collected in drop_cols
ml_input = ml_input.drop(columns=drop_cols)

In [9]:
# Collapse the table to one row per site holding the median of each column
per_site = ml_input.groupby("site_code")
ml_input_median = per_site.median().reset_index()

In [10]:
# Show the per-site medians (before rows with missing values are removed)
ml_input_median

Unnamed: 0,site_code,obs_value,arable_prop,arable_prop_buff_100,arable_prop_buff_1000,arable_prop_buff_500,area,awc1_min,awc1_max,awc1_mean,...,tri_max,tri_mean,tri_std,twi_min,twi_max,twi_mean,twi_std,urban_prop,water_prop,wetland_prop
0,SJA0088000,1.1565,0.122,0.152,0.150,0.178,7.042970e+08,0.161,0.207,0.189,...,2.977,0.085,0.108,2.262,15.093,9.851,1.138,0.012,0.004,0.084
1,SJA0106000,4.6250,0.428,0.307,0.380,0.368,2.655380e+07,0.187,0.210,0.198,...,1.517,0.070,0.072,3.415,13.996,10.252,1.046,0.018,0.000,0.021
2,SJA0112000,1.2650,0.450,0.222,0.536,0.506,4.667168e+07,0.167,0.205,0.188,...,3.528,0.161,0.169,2.453,14.935,9.187,1.603,0.024,0.009,0.024
3,SJA0135000,4.5375,0.304,0.180,0.298,0.258,4.409568e+08,0.179,0.213,0.198,...,3.635,0.098,0.134,1.998,15.329,9.927,1.507,0.016,0.005,0.135
4,SJA0148000,1.9020,0.259,0.216,0.270,0.277,6.058148e+09,0.160,0.216,0.192,...,922.779,0.096,0.153,0.157,15.942,9.825,1.325,0.021,0.007,0.086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,SJB3136000,0.8780,0.001,0.000,0.001,0.000,3.332602e+07,0.177,0.202,0.191,...,1.089,0.055,0.054,4.593,14.016,9.826,0.897,0.000,0.003,0.473
238,SJB3348000,1.7625,0.000,0.000,0.000,0.000,9.406750e+05,,,,...,2.256,0.182,0.237,2.346,13.138,8.781,1.446,0.209,0.010,0.000
239,SJB3502000,5.4715,0.579,0.460,0.498,0.471,1.855758e+07,0.174,0.207,0.190,...,1.723,0.076,0.095,3.264,14.062,10.334,1.200,0.023,0.003,0.025
240,SJB3503000,8.2250,0.537,0.316,0.523,0.453,1.122941e+08,0.174,0.205,0.191,...,2.764,0.099,0.111,2.279,15.563,10.033,1.392,0.028,0.002,0.014


In [11]:
# Discard every site that has at least one missing value, then renumber rows
ml_input_median = ml_input_median.dropna(axis=0, how="any").reset_index(drop=True)

In [12]:
# Add a 1-based observation ID (row index + 1) as the first column.
# DataFrame.insert mutates the frame in place and raises ValueError when the
# column already exists, so any obs_id left over from a previous execution of
# this cell is dropped first; this keeps the cell safe to re-run.
ml_input_median = ml_input_median.drop(columns="obs_id", errors="ignore")
ml_input_median.insert(0, "obs_id", ml_input_median.index + 1)

In [13]:
# Show the cleaned per-site table, now with obs_id as the first column
ml_input_median

Unnamed: 0,obs_id,site_code,obs_value,arable_prop,arable_prop_buff_100,arable_prop_buff_1000,arable_prop_buff_500,area,awc1_min,awc1_max,...,tri_max,tri_mean,tri_std,twi_min,twi_max,twi_mean,twi_std,urban_prop,water_prop,wetland_prop
0,1,SJA0088000,1.1565,0.122,0.152,0.150,0.178,7.042970e+08,0.161,0.207,...,2.977,0.085,0.108,2.262,15.093,9.851,1.138,0.012,0.004,0.084
1,2,SJA0106000,4.6250,0.428,0.307,0.380,0.368,2.655380e+07,0.187,0.210,...,1.517,0.070,0.072,3.415,13.996,10.252,1.046,0.018,0.000,0.021
2,3,SJA0112000,1.2650,0.450,0.222,0.536,0.506,4.667168e+07,0.167,0.205,...,3.528,0.161,0.169,2.453,14.935,9.187,1.603,0.024,0.009,0.024
3,4,SJA0135000,4.5375,0.304,0.180,0.298,0.258,4.409568e+08,0.179,0.213,...,3.635,0.098,0.134,1.998,15.329,9.927,1.507,0.016,0.005,0.135
4,5,SJA0148000,1.9020,0.259,0.216,0.270,0.277,6.058148e+09,0.160,0.216,...,922.779,0.096,0.153,0.157,15.942,9.825,1.325,0.021,0.007,0.086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,235,SJB3132000,0.4780,0.003,0.000,0.009,0.000,1.318370e+07,0.177,0.202,...,0.999,0.103,0.076,4.638,12.114,8.892,0.933,0.001,0.000,0.108
235,236,SJB3136000,0.8780,0.001,0.000,0.001,0.000,3.332602e+07,0.177,0.202,...,1.089,0.055,0.054,4.593,14.016,9.826,0.897,0.000,0.003,0.473
236,237,SJB3502000,5.4715,0.579,0.460,0.498,0.471,1.855758e+07,0.174,0.207,...,1.723,0.076,0.095,3.264,14.062,10.334,1.200,0.023,0.003,0.025
237,238,SJB3503000,8.2250,0.537,0.316,0.523,0.453,1.122941e+08,0.174,0.205,...,2.764,0.099,0.111,2.279,15.563,10.033,1.392,0.028,0.002,0.014


In [14]:
# Print column names, non-null counts and dtypes to confirm no nulls remain
ml_input_median.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239 entries, 0 to 238
Data columns (total 85 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   obs_id                         239 non-null    int64  
 1   site_code                      239 non-null    object 
 2   obs_value                      239 non-null    float64
 3   arable_prop                    239 non-null    float64
 4   arable_prop_buff_100           239 non-null    float64
 5   arable_prop_buff_1000          239 non-null    float64
 6   arable_prop_buff_500           239 non-null    float64
 7   area                           239 non-null    float64
 8   awc1_min                       239 non-null    float64
 9   awc1_max                       239 non-null    float64
 10  awc1_mean                      239 non-null    float64
 11  awc1_std                       239 non-null    float64
 12  bd1_min                        239 non-null    flo

In [15]:
# Save the per-site table as a comma-separated file without the row index
ml_input_median.to_csv("agile2024_tn_ml_input_median.csv", index=False)

# Prepare spatial data

In [16]:
# Distinct site codes present in the cleaned ML input, in order of appearance
site_codes = ml_input_median["site_code"].unique().tolist()

In [17]:
# Read observation site points and drop the columns not needed downstream
sites = gpd.read_file("source_data/sites.gpkg").drop(columns=["x", "y", "wb_code"])

# Keep only the sites that are present in the cleaned ML input table
sites = sites[sites["site_code"].isin(site_codes)].reset_index(drop=True)

# Attach obs_id, obs_value and the feature columns to each site point
sites = sites.merge(ml_input_median, on="site_code", how="left")
display(sites)

# Write to GPKG. The driver is named explicitly so the output format does not
# depend on geopandas inferring it from the file extension.
sites.to_file("agile2024_tn_obs_sites.gpkg", driver="GPKG", index=False)

Unnamed: 0,site_code,geometry,obs_id,obs_value,arable_prop,arable_prop_buff_100,arable_prop_buff_1000,arable_prop_buff_500,area,awc1_min,...,tri_max,tri_mean,tri_std,twi_min,twi_max,twi_mean,twi_std,urban_prop,water_prop,wetland_prop
0,SJA8127000,POINT (696315.000 6546937.000),161,0.992,0.086,0.175,0.092,0.124,1.512569e+08,0.178,...,3.073,0.119,0.176,2.206,14.725,9.732,1.706,0.006,0.005,0.129
1,SJA9900000,POINT (669868.000 6591973.000),200,1.302,0.182,0.170,0.178,0.190,8.071414e+08,0.173,...,6.114,0.100,0.191,1.725,15.356,9.827,1.375,0.026,0.008,0.087
2,SJA3956000,POINT (636008.000 6603086.000),90,6.833,0.536,0.340,0.520,0.464,4.229881e+08,0.169,...,7.820,0.112,0.150,1.869,15.563,9.910,1.513,0.049,0.003,0.006
3,SJA1934000,POINT (700294.000 6592517.000),40,1.642,0.243,0.134,0.172,0.156,2.132077e+08,0.173,...,4.394,0.109,0.183,1.842,14.976,9.745,1.348,0.056,0.004,0.014
4,SJA7837000,POINT (520653.000 6588232.000),157,2.783,0.293,0.321,0.330,0.352,3.100426e+08,0.173,...,5.873,0.111,0.151,1.859,15.576,9.724,1.397,0.111,0.007,0.058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,SJB3503000,POINT (633230.000 6585334.000),238,8.225,0.537,0.316,0.523,0.453,1.122941e+08,0.174,...,2.764,0.099,0.111,2.279,15.563,10.033,1.392,0.028,0.002,0.014
235,SJA3731000,POINT (698754.000 6586118.000),84,1.725,0.200,0.065,0.122,0.085,6.085978e+07,0.175,...,4.166,0.124,0.223,1.842,14.565,9.663,1.395,0.072,0.003,0.009
236,SJA0813000,POINT (619933.000 6581023.000),17,3.250,0.452,0.223,0.499,0.475,4.023375e+07,0.174,...,4.090,0.207,0.272,2.373,16.631,9.019,2.097,0.033,0.007,0.014
237,SJA8884000,POINT (551221.000 6591443.000),175,3.300,0.242,0.176,0.232,0.225,1.930663e+09,0.164,...,7.045,0.105,0.145,1.464,15.329,9.728,1.468,0.025,0.010,0.106


In [18]:
# Read observation site catchments and drop the column not needed downstream
catchments = gpd.read_file("source_data/site_catchments.gpkg").drop(columns="wb_code")

# Keep only the catchments whose site is present in the cleaned ML input table
catchments = catchments[catchments["site_code"].isin(site_codes)].reset_index(drop=True)

# Wrap every geometry that is not already a MultiPolygon in a single-part
# MultiPolygon; existing MultiPolygons are kept unchanged.
# NOTE(review): presumably done so the output layer has one geometry type - confirm.
new_geoms = [
    geom if isinstance(geom, MultiPolygon) else MultiPolygon([geom])
    for geom in catchments["geometry"]
]
catchments["geometry"] = new_geoms
display(catchments)

# Write to GPKG. The driver is named explicitly so the output format does not
# depend on geopandas inferring it from the file extension.
catchments.to_file("agile2024_catchments.gpkg", driver="GPKG", index=False)

Unnamed: 0,site_code,geometry
0,SJA9316000,"MULTIPOLYGON (((585865.000 6587725.000, 585875..."
1,SJA6880000,"MULTIPOLYGON (((597930.000 6606720.000, 597930..."
2,SJA8358000,"MULTIPOLYGON (((599250.000 6568615.000, 599250..."
3,SJA9895000,"MULTIPOLYGON (((629535.000 6566175.000, 629535..."
4,SJA6180000,"MULTIPOLYGON (((578040.000 6568635.000, 578040..."
...,...,...
234,SJA0680000,"MULTIPOLYGON (((427385.000 6469925.000, 427385..."
235,SJA9031000,"MULTIPOLYGON (((642545.000 6592930.000, 642545..."
236,SJA5143000,"MULTIPOLYGON (((602205.000 6565055.000, 602205..."
237,SJA0135000,"MULTIPOLYGON (((582165.000 6565725.000, 582165..."
