# Model run
In cell three a selection can be made for the dataset, balanced or unbalanced and implementation of expert knowledge. Afterwards the code below will print the crossvalidation and test results.

In [1]:
# Import necessary libraries
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the dataset
gdf_zaanstad = gpd.read_file("../Data/dataset_zaanstad.gpkg", layer="polluted_points")
gdf_oosterhout = gpd.read_file("../Data/dataset_oosterhout.gpkg", layer="polluted_points")

# Exclude BOORPUNT_ID
gdf_zaanstad = gdf_zaanstad.drop(columns=['BOORPUNT_ID'])
gdf_oosterhout = gdf_oosterhout.drop(columns=['BOORPUNT_ID'])

# Exclude geopandas geometry as variable
gdf_zaanstad = gdf_zaanstad.drop(columns=['geometry'])
gdf_oosterhout = gdf_oosterhout.drop(columns=['geometry'])

In [3]:
# Dataset
dataset = gdf_zaanstad
# dataset = gdf_oosterhout

# Normalized vs not normalized
balanced = False

# Implementation
baseline = True
BKK_var = False
BKK_split = False

# Different param settings for different model runs
max_depths = {
    'Oosterhout_unbalance' : 20,
    'Oosterhout_balance' : 10,
    'Zaanstad_unbalance' : 30,
    'Zaanstad_balance' : 20,
}

if len(dataset) < 10000:
    if not balanced:
        max_depth = max_depths['Oosterhout_unbalance']
    else:
        max_depth = max_depths['Oosterhout_balance']
else:
    if not balanced:
        max_depth = max_depths['Zaanstad_unbalance']
    else:
        max_depth = max_depths['Zaanstad_balance']

In [4]:
# Model print statement
if len(dataset) > 10000:
    datasetName = 'Zaanstad'
    print(f'Dataset: {datasetName}')
else:
    datasetName = 'Oosterhout'
    print(f'Dataset: {datasetName}')
    
if balanced:
    print('Dataset is balanced')
else:
    print('Dataset is not balanced')
    
if baseline:
    print('BKK is excluded from the model')
if BKK_var:
    print('BKK is used as variable in the model')
if BKK_split:
    print('The model is split into three using the BKK')

# BASELINE MODEL (Excluding BKK)
# --------------------------------------------------------------------------------
if baseline:
    # Exclude BKK
    gdf_baseline = dataset.drop(columns=['BKK'])

    # Encode the target variable
    label_encoder = LabelEncoder()
    gdf_baseline['TOETS_WBB'] = label_encoder.fit_transform(gdf_baseline['TOETS_WBB'])
    label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

    # Define features and target variable
    X = gdf_baseline.drop(columns=['TOETS_WBB'])
    y = gdf_baseline['TOETS_WBB']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Columns to normalize
    columns_to_normalize = ['days_since_ref', 'X', 'Y']

    # Initialize the scaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform both training and testing data
    X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    # Initialize the Random Forest model
    if balanced:
        rf_model = RandomForestClassifier(n_estimators=200, max_depth=max_depth, random_state=42, n_jobs=-1, class_weight='balanced')
    else:
        rf_model = RandomForestClassifier(n_estimators=200, max_depth=max_depth, random_state=42, n_jobs=-1)

    # Perform 10-fold cross-validation
    scores = cross_validate(rf_model, X_train, y_train, cv=10, scoring=['accuracy', 'precision', 'recall', 'f1'], return_train_score=True)

    # Print the results
    print("\nCross-validation results:")
    print("Accuracy: ", round(scores['test_accuracy'].mean(), 5))
    print("Recall: ", round(scores['test_recall'].mean(), 5))
    print("Precision: ", round(scores['test_precision'].mean(), 5))
    print("F1-score: ", round(scores['test_f1'].mean(), 5))
    
    # Train the model on the entire training set
    rf_model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = rf_model.predict(X_test)

    # Evaluate the model on the test set
    test_accuracy = accuracy_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    test_recall = recall_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)

    # Print the test set results
    print("\nTest set results:")
    print("Accuracy: ", round(test_accuracy, 5))
    print("Recall: ", round(test_recall, 5))
    print("Precision: ", round(test_precision, 5))
    print("F1-score: ", round(test_recall, 5))
    
    # Safe results to excel
    # Define the path to your existing CSV file
    csv_file_path = 'results.csv'
    new_row = {
        'name': f'{datasetName}__Balanced:{balanced}__Baseline:{baseline}__VAR:{BKK_var}__SPLIT:{BKK_split}',
        'Accuracy_cross-validation': round(scores['test_accuracy'].mean(), 5),
        'AccuracyTestvalidation': round(test_accuracy, 5),
        'RecallCross-validation': round(scores['test_recall'].mean(), 5),
        'RecallTest validation': round(test_recall, 5),
        'PrecisionCross-validation': round(scores['test_precision'].mean(), 5),
        'PrecisionTest validation': round(test_precision, 5),
        'F1 scoreCross-validation': round(scores['test_f1'].mean(), 5),
        'F1 scoreTest validation': round(test_recall, 5)
    }

    df_new_row = pd.DataFrame([new_row])
    df_new_row.to_csv(csv_file_path, mode='a', index=False, header=False)
    
# Model Including BKK as independent variable
# --------------------------------------------------------------------------------
if BKK_var:
    # Define the mapping for ordinal encoding
    bkk_mapping = {'AW_2000': 1, 'Wonen': 2, 'Industrie': 3}

    # Apply the mapping to the BKK column
    gdf_bkk_variable = dataset
    gdf_bkk_variable = gdf_bkk_variable.replace({"BKK": bkk_mapping})
    gdf_bkk_variable = gdf_bkk_variable[gdf_bkk_variable['BKK'] != 'Onbekend']
    gdf_bkk_variable = gdf_bkk_variable.dropna()
        
    # Encode the target variable
    label_encoder = LabelEncoder()
    gdf_bkk_variable['TOETS_WBB'] = label_encoder.fit_transform(gdf_bkk_variable['TOETS_WBB'])

    # Define features and target variable
    X = gdf_bkk_variable.drop(columns=['TOETS_WBB'])
    y = gdf_bkk_variable['TOETS_WBB']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Columns to normalize
    columns_to_normalize = ['days_since_ref', 'X', 'Y']

    # Initialize the scaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform both training and testing data
    X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    # Initialize the Random Forest model
    if balanced:
        rf_model = RandomForestClassifier(n_estimators=200, max_depth=max_depth, random_state=42, n_jobs=-1, class_weight='balanced')
    else:
        rf_model = RandomForestClassifier(n_estimators=200, max_depth=max_depth, random_state=42, n_jobs=-1)

    # Perform 10-fold cross-validation
    scores = cross_validate(rf_model, X_train, y_train, cv=10, scoring=['accuracy', 'precision', 'recall', 'f1'], return_train_score=True)

    # Print the results
    print("\nCross-validation results:")
    print("Accuracy: ", round(scores['test_accuracy'].mean(), 5))
    print("Recall: ", round(scores['test_recall'].mean(), 5))
    print("Precision: ", round(scores['test_precision'].mean(), 5))
    print("F1-score: ", round(scores['test_f1'].mean(), 5))
    
    # Train the model on the entire training set
    rf_model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = rf_model.predict(X_test)

    # Evaluate the model on the test set
    test_accuracy = accuracy_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    test_recall = recall_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)

    # Print the test set results
    print("\nTest set results:")
    print("Accuracy: ", round(test_accuracy, 5))
    print("Recall: ", round(test_recall, 5))
    print("Precision: ", round(test_precision, 5))
    print("F1-score: ", round(test_recall, 5))
    
    # Safe results to excel
    # Define the path to your existing CSV file
    csv_file_path = 'results.csv'
    new_row = {
        'name': f'{datasetName}__Balanced:{balanced}__Baseline:{baseline}__VAR:{BKK_var}__SPLIT:{BKK_split}',
        'Accuracy_cross-validation': round(scores['test_accuracy'].mean(), 5),
        'AccuracyTestvalidation': round(test_accuracy, 5),
        'RecallCross-validation': round(scores['test_recall'].mean(), 5),
        'RecallTest validation': round(test_recall, 5),
        'PrecisionCross-validation': round(scores['test_precision'].mean(), 5),
        'PrecisionTest validation': round(test_precision, 5),
        'F1 scoreCross-validation': round(scores['test_f1'].mean(), 5),
        'F1 scoreTest validation': round(test_recall, 5)
    }

    df_new_row = pd.DataFrame([new_row])
    df_new_row.to_csv(csv_file_path, mode='a', index=False, header=False)
    
# Models based on BKK split
# --------------------------------------------------------------------------------
if BKK_split:
    gdf_bkk_aw2000 = dataset[dataset['BKK'] == 'AW_2000']
    gdf_bkk_wonen = dataset[dataset['BKK'] == 'Wonen']
    gdf_bkk_industrie = dataset[dataset['BKK'] == 'Industrie']

    datasets = {
        'AW_2000' : gdf_bkk_aw2000,
        'Wonen' : gdf_bkk_wonen,
        'Industrie' : gdf_bkk_industrie
    }
    
    for datasetName, data in datasets.items():
        print(f"\nResults for {datasetName}:")
        print("-----------------------------")
        
        # Exclude BKK
        data = data.drop(columns=['BKK'])

        # Encode the target variable
        label_encoder = LabelEncoder()
        data['TOETS_WBB'] = label_encoder.fit_transform(data['TOETS_WBB'])

        # Define features and target variable
        X = data.drop(columns=['TOETS_WBB'])
        y = data['TOETS_WBB']

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

        # Columns to normalize
        columns_to_normalize = ['days_since_ref', 'X', 'Y']

        # Initialize the scaler
        scaler = StandardScaler()

        # Fit the scaler on the training data and transform both training and testing data
        X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
        X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

        # Initialize the Random Forest model
        if balanced:
            rf_model = RandomForestClassifier(n_estimators=200, max_depth=max_depth, random_state=42, n_jobs=-1, class_weight='balanced')
        else:
            rf_model = RandomForestClassifier(n_estimators=200, max_depth=max_depth, random_state=42, n_jobs=-1)

        # Perform 10-fold cross-validation
        scores = cross_validate(rf_model, X_train, y_train, cv=10, scoring=['accuracy', 'precision', 'recall', 'f1'], return_train_score=True)

        # Print the results
        print("Cross-validation results:")
        print("Accuracy: ", round(scores['test_accuracy'].mean(), 5))
        print("Recall: ", round(scores['test_recall'].mean(), 5))
        print("Precision: ", round(scores['test_precision'].mean(), 5))
        print("F1-score: ", round(scores['test_f1'].mean(), 5))

        # Train the model on the entire training set
        rf_model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = rf_model.predict(X_test)

        # Evaluate the model on the test set
        test_accuracy = accuracy_score(y_test, y_pred)
        test_precision = precision_score(y_test, y_pred)
        test_recall = recall_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)

        # Print the test set results
        print("\nTest set results:")
        print("Accuracy: ", round(test_accuracy, 5))
        print("Recall: ", round(test_recall, 5))
        print("Precision: ", round(test_precision, 5))
        print("F1-score: ", round(test_recall, 5))
        
        # Safe results to excel
        # Define the path to your existing CSV file
        csv_file_path = 'results.csv'
        new_row = {
            'name': f'{datasetName}__Balanced:{balanced}__Baseline:{baseline}__VAR:{BKK_var}__SPLIT:{BKK_split}__Type:{datasetName}',
            'Accuracy_cross-validation': round(scores['test_accuracy'].mean(), 5),
            'AccuracyTestvalidation': round(test_accuracy, 5),
            'RecallCross-validation': round(scores['test_recall'].mean(), 5),
            'RecallTest validation': round(test_recall, 5),
            'PrecisionCross-validation': round(scores['test_precision'].mean(), 5),
            'PrecisionTest validation': round(test_precision, 5),
            'F1 scoreCross-validation': round(scores['test_f1'].mean(), 5),
            'F1 scoreTest validation': round(test_recall, 5)
        }

        df_new_row = pd.DataFrame([new_row])
        df_new_row.to_csv(csv_file_path, mode='a', index=False, header=False)

Dataset: Zaanstad
Dataset is not balanced
BKK is excluded from the model

Cross-validation results:
Accuracy:  0.81953
Recall:  0.63851
Precision:  0.7228
F1-score:  0.67793

Test set results:
Accuracy:  0.82403
Recall:  0.65817
Precision:  0.72503
F1-score:  0.65817
