In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import MultiPolygon

# Prepare ML input data

In [2]:
# Load the workshop feature-selection spreadsheet (one row per candidate feature)
features_xlsx = "workshop_feature_selection.xlsx"
workshop_features = pd.read_excel(features_xlsx)

In [3]:
# Preview the feature-selection table read from the Excel file
workshop_features

Unnamed: 0,feature,group,workshop_ml_input,tn_model_v1_abs_mean_shap,tn_model_v2_abs_mean_shap,tn_model_v3_abs_mean_shap,tn_model_v4_abs_mean_shap
0,arable_prop,arable,0,0.183095,0.240194,0.249932,0.280003
1,arable_prop_buff_100,arable,0,,,,
2,arable_prop_buff_1000,arable,0,,,,
3,arable_prop_buff_500,arable,0,,,,
4,area,area,0,0.015212,0.013442,0.017646,0.030337
...,...,...,...,...,...,...,...
77,twi_mean,twi,0,0.112151,0.058157,0.106335,0.084019
78,twi_std,twi,0,,,,
79,urban_prop,urban,0,0.066393,0.038074,0.061920,0.040733
80,water_prop,water,0,0.096020,0.117541,0.063534,0.071921


In [4]:
# Keep the names of the features flagged with workshop_ml_input == 1
is_selected = workshop_features["workshop_ml_input"] == 1
use_features = workshop_features.loc[is_selected, "feature"].to_list()

In [5]:
# Load the comma-separated ML input table containing every candidate feature
ml_input_csv = "../source_data/tn_ml_input.csv"
ml_input = pd.read_csv(ml_input_csv, sep=",")

In [6]:
# Preview the ML input table as read from CSV, before any columns are dropped
ml_input

Unnamed: 0,site_code,obs_year,parameter,obs_value,arable_prop,arable_prop_buff_100,arable_prop_buff_1000,arable_prop_buff_500,area,awc1_min,...,tri_max,tri_mean,tri_std,twi_min,twi_max,twi_mean,twi_std,urban_prop,water_prop,wetland_prop
0,SJA0088000,2016,TN,1.138,0.122,0.152,0.150,0.178,704297025.0,0.161,...,2.977,0.085,0.108,2.262,15.093,9.851,1.138,0.012,0.004,0.084
1,SJA0088000,2017,TN,1.175,0.122,0.152,0.150,0.178,704297025.0,0.161,...,2.977,0.085,0.108,2.262,15.093,9.851,1.138,0.012,0.004,0.084
2,SJA0106000,2016,TN,5.967,0.428,0.307,0.380,0.368,26553800.0,0.187,...,1.517,0.070,0.072,3.415,13.996,10.252,1.046,0.018,0.000,0.021
3,SJA0106000,2017,TN,4.175,0.428,0.307,0.380,0.368,26553800.0,0.187,...,1.517,0.070,0.072,3.415,13.996,10.252,1.046,0.018,0.000,0.021
4,SJA0106000,2018,TN,4.225,0.428,0.307,0.380,0.368,26553800.0,0.187,...,1.517,0.070,0.072,3.415,13.996,10.252,1.046,0.018,0.000,0.021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464,SJB3502000,2019,TN,5.347,0.579,0.460,0.498,0.471,18557575.0,0.174,...,1.723,0.076,0.095,3.264,14.062,10.334,1.200,0.023,0.003,0.025
465,SJB3502000,2020,TN,5.596,0.579,0.460,0.498,0.471,18557575.0,0.174,...,1.723,0.076,0.095,3.264,14.062,10.334,1.200,0.023,0.003,0.025
466,SJB3503000,2020,TN,8.225,0.537,0.316,0.523,0.453,112294125.0,0.174,...,2.764,0.099,0.111,2.279,15.563,10.033,1.392,0.028,0.002,0.014
467,SJB3510000,2019,TN,8.090,0.475,0.271,0.573,0.497,29300575.0,0.174,...,2.757,0.134,0.201,2.321,14.517,9.690,1.718,0.025,0.004,0.096


In [7]:
# Columns to drop from ML input: everything that is neither a selected
# feature nor one of the always-kept columns
keep_cols = ["site_code", "obs_value"]
drop_cols = [
    name
    for name in ml_input.columns
    if not (name in use_features or name in keep_cols)
]

In [8]:
# Drop unnecessary columns.
# errors="ignore" lets this cell be re-run on its own: ml_input is overwritten
# here, so after the first run the listed columns are already gone and a plain
# drop would raise KeyError on the second pass.
ml_input = ml_input.drop(columns=drop_cols, errors="ignore")

In [9]:
# Aggregate data: collapse the records to one row per site_code using agg_func
agg_func = "mean"
grouped_by_site = ml_input.groupby("site_code")
ml_input_agg = grouped_by_site.agg(agg_func).reset_index()

In [10]:
# Discard every row that has at least one missing value, then renumber the index
ml_input_agg = ml_input_agg.dropna(axis=0, how="any").reset_index(drop=True)

In [11]:
# Add observation ID based on index (1-based, placed as the first column).
# DataFrame.insert mutates the frame in place and raises ValueError when the
# column already exists, so remove any obs_id left over from a previous run of
# this cell first; this keeps the cell safe to re-run without a kernel restart.
ml_input_agg = ml_input_agg.drop(columns="obs_id", errors="ignore")
ml_input_agg.insert(0, "obs_id", ml_input_agg.index + 1)

In [12]:
# Preview the aggregated table, which now carries the obs_id column
ml_input_agg

Unnamed: 0,obs_id,site_code,obs_value
0,1,SJA0088000,1.1565
1,2,SJA0106000,4.9884
2,3,SJA0112000,1.2650
3,4,SJA0135000,4.5375
4,5,SJA0148000,1.9020
...,...,...,...
237,238,SJB3136000,0.8780
238,239,SJB3348000,1.7625
239,240,SJB3502000,5.4715
240,241,SJB3503000,8.2250


In [13]:
# Print the column dtypes and non-null counts of the aggregated table
ml_input_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242 entries, 0 to 241
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   obs_id     242 non-null    int64  
 1   site_code  242 non-null    object 
 2   obs_value  242 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 5.8+ KB


In [14]:
# Save the aggregated table as CSV; the file name records the aggregation used
output_csv = f"agile2024_tn_ml_input_{agg_func}.csv"
ml_input_agg.to_csv(output_csv, sep=",", index=False)

# Prepare spatial data

In [15]:
# Collect the distinct site codes present in the aggregated ML input
site_codes = ml_input_agg["site_code"].unique().tolist()

In [16]:
# Build the observation-site layer in one chain:
#   1. read the site points,
#   2. drop the columns that are not needed,
#   3. keep only the sites used in the workshop,
#   4. attach the aggregated ML input by site_code.
sites = (
    gpd.read_file("../source_data/sites.gpkg")
    .drop(columns=["x", "y", "wb_code"])
    .loc[lambda gdf: gdf["site_code"].isin(site_codes)]
    .reset_index(drop=True)
    .merge(ml_input_agg, on="site_code", how="left")
)
display(sites)

# Write to GPKG; the file name records the aggregation used
sites_gpkg = f"agile2024_tn_{agg_func}_obs_sites.gpkg"
sites.to_file(sites_gpkg, index=False)

Unnamed: 0,site_code,geometry,obs_id,obs_value
0,SJA8127000,POINT (696315.000 6546937.000),161,1.0288
1,SJA9900000,POINT (669868.000 6591973.000),200,1.3402
2,SJA3956000,POINT (636008.000 6603086.000),90,6.6156
3,SJA1934000,POINT (700294.000 6592517.000),40,1.6676
4,SJA7837000,POINT (520653.000 6588232.000),157,2.8696
...,...,...,...,...
237,SJB3503000,POINT (633230.000 6585334.000),241,8.2250
238,SJA3731000,POINT (698754.000 6586118.000),84,1.7250
239,SJA0813000,POINT (619933.000 6581023.000),17,3.2500
240,SJA8884000,POINT (551221.000 6591443.000),175,3.3000


In [17]:
# Load the site catchment polygons without the wb_code column
catchments = gpd.read_file("../source_data/site_catchments.gpkg").drop(columns="wb_code")

# Keep only the catchments whose site is used in the workshop
in_workshop = catchments["site_code"].isin(site_codes)
catchments = catchments[in_workshop].reset_index(drop=True)

# Make every geometry a MultiPolygon: anything that is not already one is
# wrapped in a single-member MultiPolygon
catchments["geometry"] = [
    shape if isinstance(shape, MultiPolygon) else MultiPolygon([shape])
    for shape in catchments["geometry"]
]
display(catchments)

# Write to GPKG
catchments.to_file("agile2024_catchments.gpkg", index=False)

Unnamed: 0,site_code,geometry
0,SJA9316000,"MULTIPOLYGON (((585865.000 6587725.000, 585875..."
1,SJA6880000,"MULTIPOLYGON (((597930.000 6606720.000, 597930..."
2,SJA8358000,"MULTIPOLYGON (((599250.000 6568615.000, 599250..."
3,SJA9895000,"MULTIPOLYGON (((629535.000 6566175.000, 629535..."
4,SJA6180000,"MULTIPOLYGON (((578040.000 6568635.000, 578040..."
...,...,...
237,SJA0680000,"MULTIPOLYGON (((427385.000 6469925.000, 427385..."
238,SJA9031000,"MULTIPOLYGON (((642545.000 6592930.000, 642545..."
239,SJA5143000,"MULTIPOLYGON (((602205.000 6565055.000, 602205..."
240,SJA0135000,"MULTIPOLYGON (((582165.000 6565725.000, 582165..."
