# GridSearch Analysis
In the third cell, select which dataset the analysis should be run on and whether the analysis should be balanced (class-weighted) or unbalanced. Afterwards, run the remaining cells to show the graph corresponding to the chosen dataset.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

In [None]:
# Read the "polluted_points" layer from each municipality's GeoPackage
gdf_zaanstad = gpd.read_file("../Data/dataset_zaanstad.gpkg", layer="polluted_points")
gdf_oosterhout = gpd.read_file("../Data/dataset_oosterhout.gpkg", layer="polluted_points")

# Remove BOORPUNT_ID and the geopandas geometry column in a single step,
# so that neither is carried along as a variable in the analysis
gdf_zaanstad = gdf_zaanstad.drop(columns=['BOORPUNT_ID', 'geometry'])
gdf_oosterhout = gdf_oosterhout.drop(columns=['BOORPUNT_ID', 'geometry'])

In [None]:
# Class weighting switch: True trains the forests with class_weight='balanced',
# False trains them with the default (unweighted) setting
balanced = True

# Dataset to analyse: keep exactly one of the two assignments below active
dataset = gdf_zaanstad
# dataset = gdf_oosterhout

In [None]:
# Exclude BKK (drop() returns a new frame, so `dataset` itself is left untouched)
gdf_gridsearch = dataset.drop(columns=['BKK'])

# Encode the target variable TOETS_WBB as integer class labels
label_encoder = LabelEncoder()
gdf_gridsearch['TOETS_WBB'] = label_encoder.fit_transform(gdf_gridsearch['TOETS_WBB'])

# Define features and target variable
X = gdf_gridsearch.drop(columns=['TOETS_WBB'])
y = gdf_gridsearch['TOETS_WBB']

# Split the data into training (70%) and testing (30%) sets, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Take explicit copies before writing scaled values into the splits: depending on
# the pandas / scikit-learn versions, the frames returned by train_test_split can
# be flagged as slices of X, and assigning columns to them then raises
# SettingWithCopyWarning. Copying makes the in-place assignment unambiguous.
X_train = X_train.copy()
X_test = X_test.copy()

# Columns to normalize
columns_to_normalize = ['days_since_ref', 'X', 'Y']

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data only, then apply the same transformation
# to the testing data so no test-set statistics leak into the scaling
X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

# Set params for GridSearch (None means the trees are grown without a depth limit)
params = {
    'max_depth': [10, 20, 30, 50, None],
}

# Initialize lists to hold the metric values.
# The un-prefixed lists hold the mean 10-fold cross-validation scores on the
# training split; the test_* lists hold the scores on the held-out test split.
accuracies = []
precisions = []
recalls = []
f1s = []
test_accuracies = []
test_precisions = []
test_recalls = []
test_f1s = []

# Iterate through each candidate max_depth value
for max_depth in params['max_depth']:
    # Initialize the model with the current parameters; class_weight=None is the
    # RandomForestClassifier default, so the unbalanced case is unchanged
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced' if balanced else None,
    )

    # Perform cross-validation and calculate metrics.
    # NOTE: 'precision', 'recall' and 'f1' are the binary scorers, so this
    # presumably expects a two-class target -- confirm for new datasets.
    scores = cross_validate(model, X_train, y_train, cv=10, scoring=['accuracy', 'precision', 'recall', 'f1'], return_train_score=True)

    # Append the mean validation-fold metrics to the lists
    accuracies.append(scores['test_accuracy'].mean())
    precisions.append(scores['test_precision'].mean())
    recalls.append(scores['test_recall'].mean())
    # Fixed: this previously appended the recall score, so the F1 curve
    # silently duplicated the recall curve
    f1s.append(scores['test_f1'].mean())

    # Train the model on the entire training set
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model on the test set
    test_accuracy = accuracy_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    test_recall = recall_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)

    # Append test set metrics to the lists
    test_accuracies.append(test_accuracy)
    test_precisions.append(test_precision)
    test_recalls.append(test_recall)
    test_f1s.append(test_f1)

# Turn every metric list into a NumPy array for easier manipulation
accuracies, precisions, recalls, f1s = (
    np.array(values) for values in (accuracies, precisions, recalls, f1s)
)
test_accuracies, test_precisions, test_recalls, test_f1s = (
    np.array(values)
    for values in (test_accuracies, test_precisions, test_recalls, test_f1s)
)

# Collect everything in one dataframe for plotting: one row per max_depth value.
# max_depth is stored as text so that None becomes the label 'None'.
results_df = pd.DataFrame(dict(
    max_depth=list(map(str, params['max_depth'])),
    train_accuracy=accuracies,
    train_precision=precisions,
    train_recall=recalls,
    train_f1=f1s,
    test_accuracy=test_accuracies,
    test_precision=test_precisions,
    test_recall=test_recalls,
    test_f1=test_f1s,
))

In [None]:
# Use a larger base font for all text in the figure
plt.rcParams.update({'font.size': 16})

# One wide figure that holds a single row of four panels
plt.figure(figsize=(15, 5))

# Each entry: (panel title and y-axis label, first results_df column, second results_df column)
for position, (caption, cv_column, test_column) in enumerate(
    [
        ('Accuracy', 'train_accuracy', 'test_accuracy'),
        ('Precision', 'train_precision', 'test_precision'),
        ('Recall', 'train_recall', 'test_recall'),
        ('F1-score', 'train_f1', 'test_f1'),
    ],
    start=1,
):
    plt.subplot(1, 4, position)
    ax = plt.gca()
    # Fix the y-range so the four panels are directly comparable
    ax.set_ylim([0, 1])
    plt.plot(results_df['max_depth'], results_df[cv_column], marker='o', label='Cross Validation')
    plt.plot(results_df['max_depth'], results_df[test_column], marker='o', color='red', label='Test Validation')
    plt.title(caption)
    plt.xlabel('Parameter Max Depth')
    plt.ylabel(caption)
    plt.xticks(rotation=90)

# The legend is attached to the current axes, i.e. the last (F1-score) panel
plt.legend(fontsize=14)

# Build the output file name from the chosen dataset and balancing mode.
# NOTE: the dataset is recognised purely by its row count (> 10000 rows is
# treated as Zaanstad), so this heuristic must be revisited if the data changes.
gemeenten = 'Dataset Zaanstad' if len(dataset) > 10000 else 'Dataset Oosterhout'
balance = ' balanced' if balanced else ' not balanced'

plt.tight_layout()

# Save in the current working directory with a white background, then display
plt.savefig(f'{gemeenten}{balance}.png', facecolor='white')
plt.show()