# Analysis of top four models
In this notebook we will delve further into the top four models selected from all the model results. We will do an in-depth analysis of the feature importance, the generalizability and the learning curve. All three sections can be run individually after the imports below have been run.

In [None]:
# imports
import numpy as np
import geopandas as gpd
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

## Feature importance

In [None]:
# English labels for the Dutch column names; these are the names that end up
# on the plot axes. The same mapping is applied to both municipalities.
english_column_names = {"gewaspercelen": "Agricultural land", "oppervlaktewater": "Surface water", "wegen":"Roads", "spoorwegen":"Railroad", "industry":"Industry", "days_since_ref":"Date"}

# Read the "polluted_points" layer of each GeoPackage, discard the
# BOORPUNT_ID and geometry columns, then switch to the English names.
gdf_zaanstad = (
    gpd.read_file("../Data/dataset_zaanstad.gpkg", layer="polluted_points")
    .drop(columns=['BOORPUNT_ID', 'geometry'])
    .rename(columns=english_column_names)
)
gdf_oosterhout = (
    gpd.read_file("../Data/dataset_oosterhout.gpkg", layer="polluted_points")
    .drop(columns=['BOORPUNT_ID', 'geometry'])
    .rename(columns=english_column_names)
)

In [None]:
# Prepare the data
def preprocess_data(data, use_bkk):
    """Turn a loaded dataset into train/validation features and labels.

    The input frame is never modified; every step works on a new frame, so
    the function can be called repeatedly on the same dataset.

    Parameters
    ----------
    data : DataFrame
        Must contain the columns 'BKK', 'TOETS_WBB', 'Date', 'X' and 'Y'.
    use_bkk : bool
        If False the 'BKK' column is dropped. If True it is kept and encoded
        ordinally (AW_2000 -> 1, Wonen -> 2, Industrie -> 3); rows whose BKK
        is 'Onbekend' and rows containing missing values are removed.

    Returns
    -------
    list
        ``[X_train, X_val, y_train, y_val]`` from an 80/20 split with
        ``random_state=42``. 'Date', 'X' and 'Y' are standardised with
        statistics computed on the training part only.
    """
    if not use_bkk:
        data = data.drop(columns=['BKK'])
    else:
        bkk_mapping = {'AW_2000': 1, 'Wonen': 2, 'Industrie': 3}
        # replace() without inplace returns a new frame, leaving the caller's
        # dataset untouched.
        data = data.replace({"BKK": bkk_mapping})
        data = data[data['BKK'] != 'Onbekend'].dropna()

    # assign() builds a new frame instead of writing into a filtered slice.
    label_encoder = LabelEncoder()
    data = data.assign(TOETS_WBB=label_encoder.fit_transform(data['TOETS_WBB']))

    X = data.drop(columns=['TOETS_WBB'])
    y = data['TOETS_WBB']

    # Split first so the scaler below never sees the validation rows.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train = X_train.copy()
    X_val = X_val.copy()

    # Columns to normalize
    columns_to_normalize = ['Date', 'X', 'Y']
    scaler = StandardScaler()
    X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_val[columns_to_normalize] = scaler.transform(X_val[columns_to_normalize])

    return [X_train, X_val, y_train, y_val]

# Build the four scenarios: each municipality once without and once with the
# BKK feature. The calls are made in the same order as listed here.
(X_train_oost, X_val_oost,
 y_train_oost, y_val_oost) = preprocess_data(gdf_oosterhout, use_bkk=False)

(X_train_oost_bkk, X_val_oost_bkk,
 y_train_oost_bkk, y_val_oost_bkk) = preprocess_data(gdf_oosterhout, use_bkk=True)

(X_train_zaan, X_val_zaan,
 y_train_zaan, y_val_zaan) = preprocess_data(gdf_zaanstad, use_bkk=False)

(X_train_zaan_bkk, X_val_zaan_bkk,
 y_train_zaan_bkk, y_val_zaan_bkk) = preprocess_data(gdf_zaanstad, use_bkk=True)

In [None]:
# Train models and get feature importances
def get_feature_importances(X_train, y_train, max_depth):
    """Fit a random forest and return its feature importances.

    The forest uses 200 trees, the given ``max_depth``, ``random_state=42``
    and all available cores (``n_jobs=-1``).

    Returns the ``feature_importances_`` attribute of the fitted model.
    """
    forest_settings = dict(n_estimators=200, max_depth=max_depth, random_state=42, n_jobs=-1)
    fitted_forest = RandomForestClassifier(**forest_settings).fit(X_train, y_train)
    return fitted_forest.feature_importances_

# One forest per scenario; the Oosterhout models use max_depth 20 and the
# Zaanstad models use max_depth 30.
feature_importances_oost = get_feature_importances(
    X_train_oost, y_train_oost, 20
)
feature_importances_oost_bkk = get_feature_importances(
    X_train_oost_bkk, y_train_oost_bkk, 20
)
feature_importances_zaan = get_feature_importances(
    X_train_zaan, y_train_zaan, 30
)
feature_importances_zaan_bkk = get_feature_importances(
    X_train_zaan_bkk, y_train_zaan_bkk, 30
)

In [None]:
# Function to plot feature importance
def plot_feature_importance(importances, features, title, filename, figsize=(8, 4)):
    """Display a horizontal bar chart of feature importances.

    Parameters
    ----------
    importances : bar lengths, plotted on the x axis.
    features : labels, plotted on the y axis.
    title, filename : accepted for the caller's convenience but not used
        here: no title is drawn and nothing is written to disk.
    figsize : figure size passed to matplotlib.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(x=importances, y=features, palette="Blues_d", ax=ax)
    ax.set_xlabel('Importance')
    ax.set_ylabel('Features')
    fig.tight_layout()
    plt.show()
    # Release the figure once it has been shown.
    plt.close(fig)

# Larger font for every figure drawn from here on
plt.rcParams.update({'font.size': 16})

# One row per scenario: importances, matching feature names, title, file name.
importance_plots = [
    (feature_importances_oost, X_train_oost.columns,
     'Oosterhout (without BKK)', 'feature_importances_oosterhout_without_bkk.png'),
    (feature_importances_oost_bkk, X_train_oost_bkk.columns,
     'Oosterhout (with BKK)', 'feature_importances_oosterhout_with_bkk.png'),
    (feature_importances_zaan, X_train_zaan.columns,
     'Zaanstad (without BKK)', 'feature_importances_zaanstad_without_bkk.png'),
    (feature_importances_zaan_bkk, X_train_zaan_bkk.columns,
     'Zaanstad (with BKK)', 'feature_importances_zaanstad_with_bkk.png'),
]

# Draw the charts in the order listed above.
for scenario_importances, scenario_features, scenario_title, scenario_filename in importance_plots:
    plot_feature_importance(
        scenario_importances,
        scenario_features,
        scenario_title,
        scenario_filename,
    )

## Generalizability
The second code block below can be used to choose which dataset should be the training dataset; the other dataset is then used as the test dataset. Additionally, the `baseline` parameter can be set to True or False. If set to True, the model will exclude the BKK variable; if set to False, it will be included.

In [None]:
# Read the "polluted_points" layer of each GeoPackage and discard the
# BOORPUNT_ID and geometry columns straight away.
gdf_zaanstad = (
    gpd.read_file("../Data/dataset_zaanstad.gpkg", layer="polluted_points")
    .drop(columns=['BOORPUNT_ID', 'geometry'])
)
gdf_oosterhout = (
    gpd.read_file("../Data/dataset_oosterhout.gpkg", layer="polluted_points")
    .drop(columns=['BOORPUNT_ID', 'geometry'])
)

In [None]:
# Choose dataset
# The frame assigned here is the one the model is trained on; the other
# municipality is used as the test set. The next cell tells the two apart
# by the number of rows in this frame.
dataset = gdf_oosterhout

# Parameter to exclude the BKK
# True  -> the BKK column is dropped from both datasets.
# False -> the BKK column is kept and encoded ordinally.
baseline = False

In [None]:
# Different param settings for different model runs
max_depths = {
    'Oosterhout_unbalance': 20,
    'Zaanstad_unbalance': 30,
}

# Generalizability cross datasets
# The chosen dataset is identified by its size: fewer than 10000 rows is
# treated as Oosterhout, anything else as Zaanstad. The name then selects the
# matching tree depth.
dataset_name = 'Oosterhout' if len(dataset) < 10000 else 'Zaanstad'
max_depth = max_depths[f'{dataset_name}_unbalance']
print(f'Dataset: {dataset_name}')

# Start from the loaded frames. Every step below returns a NEW frame, so
# gdf_oosterhout / gdf_zaanstad are never modified and this cell can be
# re-run (also with a different `baseline`) without reloading the data.
data_oosterhout = gdf_oosterhout
data_zaanstad = gdf_zaanstad

if baseline:
    # Baseline model: BKK is not available as a feature.
    data_oosterhout = data_oosterhout.drop(columns=['BKK'])
    data_zaanstad = data_zaanstad.drop(columns=['BKK'])
else:
    # Define the mapping for ordinal encoding
    bkk_mapping = {'AW_2000': 1, 'Wonen': 2, 'Industrie': 3}

    # Apply the mapping to the BKK column (non-inplace: the names above are
    # aliases of the loaded frames, an inplace replace would alter those),
    # then remove rows with BKK 'Onbekend' and rows with missing values.
    # The trailing copy() makes the result an independent frame so later
    # column assignments do not write into a filtered view.
    data_oosterhout = data_oosterhout.replace({"BKK": bkk_mapping})
    data_oosterhout = data_oosterhout[data_oosterhout['BKK'] != 'Onbekend'].dropna().copy()

    data_zaanstad = data_zaanstad.replace({"BKK": bkk_mapping})
    data_zaanstad = data_zaanstad[data_zaanstad['BKK'] != 'Onbekend'].dropna().copy()


# Encode the target variable
# One encoder is fitted on the labels of BOTH datasets, so a class gets the
# same integer code in the training set and in the test set even when one of
# the datasets lacks a class.
label_encoder = LabelEncoder()
label_encoder.fit(np.concatenate([
    data_oosterhout['TOETS_WBB'].to_numpy(),
    data_zaanstad['TOETS_WBB'].to_numpy(),
]))
data_oosterhout = data_oosterhout.assign(
    TOETS_WBB=label_encoder.transform(data_oosterhout['TOETS_WBB'])
)
data_zaanstad = data_zaanstad.assign(
    TOETS_WBB=label_encoder.transform(data_zaanstad['TOETS_WBB'])
)

# Define features and target variable
X_oosterhout = data_oosterhout.drop(columns=['TOETS_WBB'])
y_oosterhout = data_oosterhout['TOETS_WBB']
X_zaanstad = data_zaanstad.drop(columns=['TOETS_WBB'])
y_zaanstad = data_zaanstad['TOETS_WBB']

# Columns to normalize
columns_to_normalize = ['days_since_ref', 'X', 'Y']

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the TRAINING dataset only and apply that same
# transformation to the test dataset. Which dataset is the training one
# follows from dataset_name.
if dataset_name == 'Zaanstad':
    X_scaler_fit, X_scaler_apply = X_zaanstad, X_oosterhout
else:
    X_scaler_fit, X_scaler_apply = X_oosterhout, X_zaanstad

X_scaler_fit[columns_to_normalize] = scaler.fit_transform(X_scaler_fit[columns_to_normalize])
X_scaler_apply[columns_to_normalize] = scaler.transform(X_scaler_apply[columns_to_normalize])

# Random forest with the depth selected for the training dataset
rf_model = RandomForestClassifier(n_estimators=200, max_depth=max_depth, random_state=42, n_jobs=-1)

# Decide which municipality is used for fitting and which for evaluation
if dataset_name != 'Zaanstad':
    fit_features, fit_labels = X_oosterhout, y_oosterhout
    eval_features, y_true = X_zaanstad, y_zaanstad
else:
    fit_features, fit_labels = X_zaanstad, y_zaanstad
    eval_features, y_true = X_oosterhout, y_oosterhout

# Train on one dataset, predict on the other
rf_model.fit(fit_features, fit_labels)
y_pred = rf_model.predict(eval_features)

# Score the cross-dataset predictions
test_accuracy = accuracy_score(y_true, y_pred)
test_precision = precision_score(y_true, y_pred)
test_recall = recall_score(y_true, y_pred)
test_f1 = f1_score(y_true, y_pred)

# Report the scores, rounded to five decimals
print("\nTest set results:")
for metric_label, metric_value in (
    ("Accuracy: ", test_accuracy),
    ("Recall: ", test_recall),
    ("Precision: ", test_precision),
    ("F1-score: ", test_f1),
):
    print(metric_label, round(metric_value, 5))

## Learning curve

In [None]:
# English labels for the Dutch column names. The same mapping is applied to
# both municipalities.
english_column_names = {"gewaspercelen": "Agricultural land", "oppervlaktewater": "Surface water", "wegen":"Roads", "spoorwegen":"Railroad", "industry":"Industry", "days_since_ref":"Date"}

# Read the "polluted_points" layer of each GeoPackage, discard the
# BOORPUNT_ID and geometry columns, then switch to the English names.
gdf_zaanstad = (
    gpd.read_file("../Data/dataset_zaanstad.gpkg", layer="polluted_points")
    .drop(columns=['BOORPUNT_ID', 'geometry'])
    .rename(columns=english_column_names)
)
gdf_oosterhout = (
    gpd.read_file("../Data/dataset_oosterhout.gpkg", layer="polluted_points")
    .drop(columns=['BOORPUNT_ID', 'geometry'])
    .rename(columns=english_column_names)
)

In [None]:
# Function to compute learning curves for recall
def compute_recall_learning_curves(X_train, y_train, X_val, y_val, max_depth, step_size=100):
    """Validation recall as a function of the number of training rows.

    For every size ``step_size, 2 * step_size, ...`` strictly below
    ``len(X_train)`` a new random forest (200 trees, ``random_state=42``,
    the given ``max_depth``) is fitted on the first ``size`` training rows
    and its binary recall on the validation set is recorded. Because the
    upper bound is exclusive, the complete training set is never used.

    Returns
    -------
    train_sizes : numpy array of the training sizes that were evaluated.
    val_recall : list with one recall value per entry of ``train_sizes``.
    """
    def _recall_with_first_rows(n_rows):
        # Fit on the leading n_rows of the training data, score on validation.
        forest = RandomForestClassifier(n_estimators=200, max_depth=max_depth, random_state=42, n_jobs=-1)
        forest.fit(X_train[:n_rows], y_train[:n_rows])
        return recall_score(y_val, forest.predict(X_val), average='binary')

    train_sizes = np.arange(step_size, len(X_train), step_size)
    val_recall = [_recall_with_first_rows(n_rows) for n_rows in train_sizes]
    return train_sizes, val_recall

In [None]:
# Prepare the data
def preprocess_data(data, use_bkk):
    """Turn a loaded dataset into train/validation features and labels.

    The input frame is never modified; every step works on a new frame, so
    the function can be called repeatedly on the same dataset.

    Parameters
    ----------
    data : DataFrame
        Must contain the columns 'BKK', 'TOETS_WBB', 'Date', 'X' and 'Y'.
    use_bkk : bool
        If False the 'BKK' column is dropped. If True it is kept and encoded
        ordinally (AW_2000 -> 1, Wonen -> 2, Industrie -> 3); rows whose BKK
        is 'Onbekend' and rows containing missing values are removed.

    Returns
    -------
    list
        ``[X_train, X_val, y_train, y_val]`` from an 80/20 split with
        ``random_state=42``. 'Date', 'X' and 'Y' are standardised with
        statistics computed on the training part only.
    """
    if not use_bkk:
        data = data.drop(columns=['BKK'])
    else:
        bkk_mapping = {'AW_2000': 1, 'Wonen': 2, 'Industrie': 3}
        # replace() without inplace returns a new frame, leaving the caller's
        # dataset untouched.
        data = data.replace({"BKK": bkk_mapping})
        data = data[data['BKK'] != 'Onbekend'].dropna()

    # assign() builds a new frame instead of writing into a filtered slice.
    label_encoder = LabelEncoder()
    data = data.assign(TOETS_WBB=label_encoder.fit_transform(data['TOETS_WBB']))

    X = data.drop(columns=['TOETS_WBB'])
    y = data['TOETS_WBB']

    # Split first so the scaler below never sees the validation rows.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train = X_train.copy()
    X_val = X_val.copy()

    # Columns to normalize
    columns_to_normalize = ['Date', 'X', 'Y']
    scaler = StandardScaler()
    X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_val[columns_to_normalize] = scaler.transform(X_val[columns_to_normalize])

    return [X_train, X_val, y_train, y_val]

# Build the four scenarios: each municipality once without and once with the
# BKK feature. The calls are made in the same order as listed here.
(X_train_oost, X_val_oost,
 y_train_oost, y_val_oost) = preprocess_data(gdf_oosterhout, use_bkk=False)

(X_train_oost_bkk, X_val_oost_bkk,
 y_train_oost_bkk, y_val_oost_bkk) = preprocess_data(gdf_oosterhout, use_bkk=True)

(X_train_zaan, X_val_zaan,
 y_train_zaan, y_val_zaan) = preprocess_data(gdf_zaanstad, use_bkk=False)

(X_train_zaan_bkk, X_val_zaan_bkk,
 y_train_zaan_bkk, y_val_zaan_bkk) = preprocess_data(gdf_zaanstad, use_bkk=True)

In [None]:
# Learning curve per scenario; the Oosterhout models use max_depth 20 and the
# Zaanstad models use max_depth 30.
train_sizes_oost, val_recall_oost = compute_recall_learning_curves(
    X_train_oost, y_train_oost, X_val_oost, y_val_oost, 20
)
train_sizes_oost_bkk, val_recall_oost_bkk = compute_recall_learning_curves(
    X_train_oost_bkk, y_train_oost_bkk, X_val_oost_bkk, y_val_oost_bkk, 20
)
train_sizes_zaan, val_recall_zaan = compute_recall_learning_curves(
    X_train_zaan, y_train_zaan, X_val_zaan, y_val_zaan, 30
)
train_sizes_zaan_bkk, val_recall_zaan_bkk = compute_recall_learning_curves(
    X_train_zaan_bkk, y_train_zaan_bkk, X_val_zaan_bkk, y_val_zaan_bkk, 30
)

In [None]:
# Larger font for every figure drawn from here on
plt.rcParams.update({'font.size': 16})

def plot_and_save(train_sizes1, val_recall1, train_sizes2, val_recall2, title, xlabel, ylabel, labels, filename, tick_fontsize=12):
    """Display two learning curves in one figure.

    Parameters
    ----------
    train_sizes1, val_recall1 : x and y values of the first curve.
    train_sizes2, val_recall2 : x and y values of the second curve.
    title, filename : accepted for the caller's convenience but not used
        here: no title is drawn and nothing is written to disk.
    xlabel, ylabel : axis labels.
    labels : legend entries; ``labels[0]`` belongs to the first curve and
        ``labels[1]`` to the second.
    tick_fontsize : font size of the major tick labels on both axes.
    """
    fig, ax = plt.subplots(figsize=(7.5, 4))
    curves = (
        (train_sizes1, val_recall1, labels[0]),
        (train_sizes2, val_recall2, labels[1]),
    )
    for curve_sizes, curve_recall, curve_label in curves:
        ax.plot(curve_sizes, curve_recall, label=curve_label)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.tick_params(axis='both', which='major', labelsize=tick_fontsize)
    fig.tight_layout()
    plt.show()
    # Release the figure once it has been shown.
    plt.close(fig)

# Oosterhout: recall without BKK versus recall with BKK
plot_and_save(
    train_sizes1=train_sizes_oost,
    val_recall1=val_recall_oost,
    train_sizes2=train_sizes_oost_bkk,
    val_recall2=val_recall_oost_bkk,
    title='Oosterhout Recall',
    xlabel='Training Size',
    ylabel='Recall',
    labels=['Oosterhout without BKK', 'Oosterhout with BKK'],
    filename='oosterhout_recall.png',
    tick_fontsize=12,  # font size of the tick labels
)

# Zaanstad: recall without BKK versus recall with BKK
plot_and_save(
    train_sizes1=train_sizes_zaan,
    val_recall1=val_recall_zaan,
    train_sizes2=train_sizes_zaan_bkk,
    val_recall2=val_recall_zaan_bkk,
    title='Zaanstad Recall',
    xlabel='Training Size',
    ylabel='Recall',
    labels=['Zaanstad without BKK', 'Zaanstad with BKK'],
    filename='zaanstad_recall.png',
    tick_fontsize=12,  # font size of the tick labels
)
