# Imports

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Set random seeds

In [None]:
# Seed TensorFlow and NumPy's global generators with the same fixed value
# so that repeated runs of the notebook draw the same random numbers
tf.random.set_seed(42)
np.random.seed(42)

# Load data

In [None]:
# Read the prepared dataset (features, label, fold ids and geometry) from disk
raw_df = gpd.read_file("datasets/5_split/df_fe.gpkg")

In [None]:
# Work on a copy so that raw_df stays exactly as it was loaded
df = raw_df.copy()

# Separate features

In [None]:
# Dependent variables
# pop() also removes the label column from df, so it cannot end up among the features below
labels = df.pop('very_good_health')

# Outer CV folds
outer_fold_ids = df["outer_loop_fold_id_python"]
outer_splits = outer_fold_ids.unique().astype(int)

# Inner CV folds
# NOTE(review): the substring "fold_id_python" also matches "outer_loop_fold_id_python",
# so the outer fold column is selected here too and its ids feed into inner_splits —
# confirm this is intended
inner_fold_ids = df[[col for col in df.columns if "fold_id_python" in col]]
# A single list of fold ids pooled over every selected column
inner_splits = inner_fold_ids.stack().unique().astype(int)

# Independent variables
# Every column whose name contains "fold_id" is dropped first, then the geometry column
features = df.drop(columns = [col for col in df.columns if "fold_id" in col])
features = features.drop(columns = ["geometry"])

# Functions

## Get random hyperparameters

In [None]:
def get_random_hyperparameters():
    """Draw one random hyperparameter combination from NumPy's global generator.

    Returns:
        A tuple ``(no_of_layers, no_of_nodes, learning_rate, epochs, patience)``:
        the number of hidden layers (1 to 4), a list with one node count per
        layer (16 to 63 each), a learning rate drawn uniformly between 0.0001
        and 0.01, an epoch count (20 to 499) and a patience value (5 to 14).
    """
    # The draws are made in a fixed order so a seeded generator always
    # reproduces the same combination
    layer_count = np.random.randint(1, 5)
    nodes_per_layer = [np.random.randint(16, 64) for _ in range(layer_count)]
    step_size = np.random.uniform(0.0001, 0.01)
    epoch_count = np.random.randint(20, 500)
    patience_value = np.random.randint(5, 15)
    return layer_count, nodes_per_layer, step_size, epoch_count, patience_value
    

## Build model

In [None]:
def build_model(train_features, no_of_layers, no_of_nodes, learning_rate):
    """Assemble and compile a feed-forward regression network.

    Args:
        train_features: Training inputs; converted with ``np.array`` and used
            to adapt the normalisation layer.
        no_of_layers: Number of hidden dense layers to stack.
        no_of_nodes: Sequence giving the width of each hidden layer; indexed
            from 0 up to ``no_of_layers - 1``.
        learning_rate: Learning rate handed to the Adam optimiser.

    Returns:
        A compiled ``keras.Sequential`` model with a single output unit and
        mean squared error as its loss.
    """
    # The normalisation statistics come from the training features only
    input_scaler = keras.layers.Normalization(axis=-1)
    input_scaler.adapt(np.array(train_features))

    hidden_stack = [
        keras.layers.Dense(no_of_nodes[idx], activation="relu")
        for idx in range(no_of_layers)
    ]

    # Single output for regression value
    output_layer = keras.layers.Dense(1)

    network = keras.Sequential([input_scaler, *hidden_stack, output_layer])
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mse',
    )
    return network

## Build early stopper

In [None]:
def build_early_stopper(patience):
    """Create an early-stopping callback that watches the validation loss.

    Args:
        patience: Passed through as the callback's ``patience`` argument.

    Returns:
        A ``keras.callbacks.EarlyStopping`` instance monitoring ``val_loss``
        with ``restore_best_weights`` switched on.
    """
    stopper_settings = {
        "monitor": "val_loss",
        "patience": patience,
        "restore_best_weights": True,
    }
    return keras.callbacks.EarlyStopping(**stopper_settings)

## Get evaluation metrics

In [None]:
def get_evaluation_metrics(val_labels, predictions):
    """Score predictions against the true labels.

    Args:
        val_labels: True target values.
        predictions: Predicted values for the same samples.

    Returns:
        A tuple ``(mae, mse, r2)`` with the mean absolute error, the mean
        squared error and the R2 score, in that order.
    """
    scorers = (mean_absolute_error, mean_squared_error, r2_score)
    return tuple(scorer(val_labels, predictions) for scorer in scorers)

## Get average score across cross-validation folds

In [None]:
def get_avg_scores(cv_results):
    """Average the error metrics over a collection of fold results.

    Args:
        cv_results: Iterable of dicts that each hold ``"mae"``, ``"mse"`` and
            ``"r2"`` entries; any other keys are ignored.

    Returns:
        A tuple ``(avg_mae, avg_mse, avg_r2)`` of the per-metric means.
    """
    # Collect each result's three scores in one pass, then average per metric
    score_rows = [(fold["mae"], fold["mse"], fold["r2"]) for fold in cv_results]
    mean_mae = np.mean([row[0] for row in score_rows])
    mean_mse = np.mean([row[1] for row in score_rows])
    mean_r2 = np.mean([row[2] for row in score_rows])
    return mean_mae, mean_mse, mean_r2

## Get optimal hyperparameters

In [None]:
def get_optimal_hyperparameters(hp_combinations, cv_results):
    """Pick the hyperparameter combination with the lowest mean MSE.

    Args:
        hp_combinations: List of hyperparameter combinations; the position in
            this list is the id stored under ``"hp_combination"`` in the
            results.
        cv_results: Fold results, each a dict carrying ``"hp_combination"``
            plus the metric entries read by ``get_avg_scores``.

    Returns:
        The element of ``hp_combinations`` whose results have the smallest
        average ``"mse"``.
    """
    mean_mse_per_combination = []
    for combination_idx, _ in enumerate(hp_combinations):
        matching_results = [
            fold for fold in cv_results
            if fold["hp_combination"] == combination_idx
        ]
        # Only the MSE is used for ranking; MAE and R2 are discarded
        _, mean_mse, _ = get_avg_scores(matching_results)
        mean_mse_per_combination.append(mean_mse)

    best_idx = np.argmin(mean_mse_per_combination)
    return hp_combinations[best_idx]

# Cross-validation

## Initialise outer results list

In [None]:
# Collects one result dict per outer split; filled by the nested cross-validation loop
outer_cv_results = []

## Nested cross-validation loop

In [None]:
# Nested cross-validation.
# For every outer split: draw 10 random hyperparameter combinations, score each
# of them on the inner folds of the outer training set, refit the best
# combination and evaluate it on the outer validation fold.
for current_outer_split in outer_splits:

    # Candidates and inner-CV scores are collected afresh for every outer split
    hp_combinations = []
    cv_results = []

    # Get training and validation sets for current outer split
    is_in_outer_val_set = outer_fold_ids == current_outer_split
    is_in_outer_train_set = ~is_in_outer_val_set
    outer_train_features = features.loc[is_in_outer_train_set]
    outer_train_labels = labels.loc[is_in_outer_train_set]
    outer_val_features = features.loc[is_in_outer_val_set]
    outer_val_labels = labels.loc[is_in_outer_val_set]

    # Inner fold assignment belonging to this outer split; the column names
    # are offset by one relative to the outer split ids
    current_inner_fold_ids = inner_fold_ids[f"inner_loop_{current_outer_split + 1}_fold_id_python"]

    # Loop to test 10 hyperparameter combinations
    for i in range(10):

        # Get hyperparameters
        no_of_layers, no_of_nodes, learning_rate, epochs, patience = get_random_hyperparameters()
        current_hps = {
            "outer_loop_split": current_outer_split,
            "no_of_layers": no_of_layers,
            "no_of_nodes": no_of_nodes,
            "learning_rate": learning_rate,
            "epochs": epochs,
            "patience": patience
        }
        hp_combinations.append(current_hps)

        # Inner cross-validation for model selection
        for current_inner_split in inner_splits:
            print(f"\n --- Training model {i} on outer split {current_outer_split}, inner split {current_inner_split} ---")

            # Get training and validation sets for current inner split.
            # The masks are indexed like the full dataset; selecting through
            # the outer training frames keeps outer training rows only.
            is_in_inner_val_set = current_inner_fold_ids == current_inner_split
            is_in_inner_train_set = ~is_in_inner_val_set
            inner_train_features = outer_train_features.loc[is_in_inner_train_set]
            inner_train_labels = outer_train_labels.loc[is_in_inner_train_set]
            inner_val_features = outer_train_features.loc[is_in_inner_val_set]
            inner_val_labels = outer_train_labels.loc[is_in_inner_val_set]

            # Build model
            model = build_model(inner_train_features, no_of_layers, no_of_nodes, learning_rate)
            early_stopper = build_early_stopper(patience)

            # Fit model
            # NOTE: the inner validation fold both drives early stopping and
            # supplies the score. That treatment is the same for every
            # candidate and the scores are only used to rank candidates.
            model.fit(
                inner_train_features,
                inner_train_labels,
                epochs = epochs,
                validation_data = (inner_val_features, inner_val_labels),
                callbacks = [early_stopper],
                verbose = 0
            )

            # Get predictions using fitted model
            predictions = model.predict(inner_val_features).flatten()

            # Get accuracy scores
            mae, mse, r2 = get_evaluation_metrics(inner_val_labels, predictions)

            # Add scores for current fold to results
            cv_results.append({
                "hp_combination": i,
                "outer_split": current_outer_split,
                "inner_split": current_inner_split,
                "hps": current_hps,
                "mae": mae,
                "mse": mse,
                "r2": r2
            })

    # Get optimal hyperparameters for current outer split training set
    opt_hps = get_optimal_hyperparameters(hp_combinations, cv_results)
    opt_no_of_layers = opt_hps["no_of_layers"]
    opt_no_of_nodes = opt_hps["no_of_nodes"]
    opt_learning_rate = opt_hps["learning_rate"]
    opt_epochs = opt_hps["epochs"]
    opt_patience = opt_hps["patience"]

    # Hold out one inner fold of the outer training set to drive early stopping.
    # The outer validation fold must not be monitored here: with
    # restore_best_weights the final weights would be selected on the very
    # data the reported scores are computed on, making them optimistic.
    # The price is that the final model is fitted on fewer rows.
    early_stopping_split = inner_splits[0]
    is_in_early_stopping_set = current_inner_fold_ids == early_stopping_split
    final_train_features = outer_train_features.loc[~is_in_early_stopping_set]
    final_train_labels = outer_train_labels.loc[~is_in_early_stopping_set]
    early_stopping_features = outer_train_features.loc[is_in_early_stopping_set]
    early_stopping_labels = outer_train_labels.loc[is_in_early_stopping_set]

    # Build model
    model = build_model(final_train_features, opt_no_of_layers, opt_no_of_nodes, opt_learning_rate)
    early_stopper = build_early_stopper(opt_patience)

    # Fit model
    model.fit(
        final_train_features,
        final_train_labels,
        epochs = opt_epochs,
        validation_data = (early_stopping_features, early_stopping_labels),
        callbacks = [early_stopper],
        verbose = 1
    )

    # Get predictions using fitted model; the outer validation fold is
    # touched here for the first time
    predictions = model.predict(outer_val_features).flatten()

    # Get accuracy scores
    mae, mse, r2 = get_evaluation_metrics(outer_val_labels, predictions)

    # Add scores for current fold to results
    outer_cv_results.append({
        "outer_split": current_outer_split,
        "hps": opt_hps,
        "mae": mae,
        "mse": mse,
        "r2": r2,
        "inner_cv_results": cv_results
    })