# Imports

In [14]:
import pickle
import numpy as np
import geopandas as gpd
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import libpysal.weights as weights
import pysal.explore as esda

In [15]:
np.random.seed(42)

# Load data

In [16]:
raw_df = gpd.read_file("datasets/5_split/df_fe.gpkg")

In [17]:
df = raw_df.copy()

# Separate features

In [18]:
# Drop R fold ids
df = df.drop(columns = [col for col in df.columns if "fold_id_r" in col])

# Dependent variables
labels = df.pop('very_good_health')

# Outer CV folds
outer_fold_ids = df["outer_loop_fold_id_python"]
outer_splits = np.sort(outer_fold_ids.unique().astype(int))

# Inner CV folds
inner_fold_ids = df[[col for col in df.columns if "inner_loop" in col]]
inner_splits = np.sort(inner_fold_ids.stack().unique().astype(int))

# Independent variables
features = df.drop(columns = [col for col in df.columns if "fold_id" in col])

# Functions

In [19]:
def get_random_hyperparameters():
    kernel = np.random.choice(["bisquare", "Gaussian", "exponential"])
    criterion = np.random.choice(["AICc", "AIC", "BIC", "CV"])
    return kernel, criterion

## Get GWR inputs

In [20]:
def get_gwr_inputs(features, labels, bandwidth = False, kernel = None, criterion = None):
    x = features.geometry.x
    y = features.geometry.y
    coords = np.array(list(zip(x, y)))
    target = labels.values.reshape((-1, 1))
    predictor_cols = ["greenspace_proportion", "imd", "f_m_ratio", "mean_age"]
    predictors = np.hstack(
        [features[col].values.reshape((-1, 1)) for col in predictor_cols]
    )
    opt_bandwidth = None
    if bandwidth:
        opt_bandwidth = Sel_BW(coords, target, predictors, kernel = kernel).search(criterion = criterion)
    return coords, predictors, target, opt_bandwidth


In [21]:
def get_evaluation_metrics(val_features, val_labels, predictions):
    mae = mean_absolute_error(val_labels, predictions)
    mse = mean_squared_error(val_labels, predictions)
    r2 = r2_score(val_labels, predictions)
    w = weights.KNN.from_dataframe(val_features, k = 30)
    moran = esda.esda.Moran(val_labels.values - predictions.flatten(), w)
    return mae, mse, r2, moran.I

In [22]:
def get_avg_scores(cv_results):
    mae_scores = []
    mse_scores = []
    r2_scores = []
    moran_scores = []

    for result in cv_results:
        mae_scores.append(result["mae"])
        mse_scores.append(result["mse"])
        r2_scores.append(result["r2"])
        moran_scores.append(result["moran"])

    avg_mae = np.mean(mae_scores)
    avg_mse = np.mean(mse_scores)
    avg_r2 = np.mean(r2_scores)
    avg_moran = np.mean(moran_scores)

    return avg_mae, avg_mse, avg_r2, avg_moran

In [23]:
def get_optimal_hyperparameters(hp_combinations, cv_results):
    hp_combination_scores = []
    for i in range(len(hp_combinations)):
        current_hp_results = [result for result in cv_results if result["hp_combination"] == i]
        mae, mse, r2, moran = get_avg_scores(current_hp_results)
        hp_combination_scores.append(mse)
    optimal_combination = np.argmin(hp_combination_scores)
    optimal_hps = hp_combinations[optimal_combination]
    return optimal_hps

# Build and evaluate model

In [24]:
outer_cv_results = []

In [None]:
for current_outer_split in outer_splits:

    hp_combinations = []
    cv_results = []

    # Get training and validation sets for current outer split
    is_in_validation_set = outer_fold_ids == current_outer_split
    is_in_training_set = ~is_in_validation_set
    outer_train_features = features.loc[is_in_training_set]
    outer_train_labels = labels.loc[is_in_training_set]
    outer_val_features = features.loc[is_in_validation_set]
    outer_val_labels = labels.loc[is_in_validation_set]
    current_inner_fold_ids = inner_fold_ids.loc[is_in_training_set]

    # Loop to test 10 hyperparameter combinations
    for i in range(8):
        
        # Set random hps
        kernel, criterion = get_random_hyperparameters()
        current_hps = {
            "kernel": kernel,
            "criterion": criterion
        }
        hp_combinations.append(current_hps)

        # Inner cross-validation for model selection
        for current_inner_split in inner_splits:
            print(f"\n --- Outer split {current_outer_split} - Training model {i} on inner split {current_inner_split} ---")

            # Get training and validation sets for current inner split
            is_in_validation_set = current_inner_fold_ids[f"inner_loop_{current_outer_split + 1}_fold_id_python"] == current_inner_split
            is_in_training_set = ~is_in_validation_set
            inner_train_features = outer_train_features.loc[is_in_training_set]
            inner_train_labels = outer_train_labels.loc[is_in_training_set]
            inner_val_features = outer_train_features.loc[is_in_validation_set]
            inner_val_labels = outer_train_labels.loc[is_in_validation_set]

            # Get model inputs
            print("Getting inputs...")
            inner_train_coords, inner_train_predictors, inner_train_target, inner_bandwidth = get_gwr_inputs(inner_train_features, inner_train_labels, bandwidth = True, kernel = kernel, criterion = criterion)
            inner_val_coords, inner_val_predictors, inner_val_target, _ = get_gwr_inputs(inner_val_features, inner_val_labels)

            # Build model
            print("Building model...")
            model = GWR(
                inner_train_coords,
                inner_train_target,
                inner_train_predictors,
                bw = inner_bandwidth,
                kernel = kernel
            )

            model.fit()

            # Get predictions
            print("Getting predictions...")
            results = model.predict(
                inner_val_coords, inner_val_predictors
            )
            predictions = results.predy

            # Get accuracy scores
            print("Evaluating predictions...")
            mae, mse, r2, moran = get_evaluation_metrics(inner_val_features, inner_val_labels, predictions)

            # Add scores for current fold to results
            cv_results.append({
                "hp_combination": i,
                "inner_split": current_inner_split,
                "hps": current_hps,
                "mae": mae,
                "mse": mse,
                "r2": r2,
                "moran": moran
            })

    print(f"\n --- Outer split {current_outer_split} - Training optimised model ---")

    # Get optimal hyperparameters for current outer split training set
    opt_hps = get_optimal_hyperparameters(hp_combinations, cv_results)

    # Get model inputs
    print("Getting inputs...")
    outer_train_coords, outer_train_predictors, outer_train_target, outer_bandwidth = get_gwr_inputs(outer_train_features, outer_train_labels, bandwidth = True, kernel = opt_hps["kernel"], criterion = opt_hps["criterion"])
    outer_val_coords, outer_val_predictors, outer_val_target, _ = get_gwr_inputs(outer_val_features, outer_val_labels)
    
    # Build model
    print("Building model...")
    model = GWR(
        outer_train_coords,
        outer_train_target,
        outer_train_predictors,
        bw = outer_bandwidth,
        kernel = opt_hps["kernel"],
    )

    model.fit()

    # Get predictions
    print("Getting predictions...")
    results = model.predict(
        outer_val_coords, outer_val_predictors
    )
    predictions = results.predy

    # Get accuracy scores
    print("Evaluating predictions...")
    mae, mse, r2, moran = get_evaluation_metrics(outer_val_features, outer_val_labels, predictions)

    outer_cv_results.append({
        "outer_split": current_outer_split,
        "hps": opt_hps,
        "mae": mae,
        "mse": mse,
        "r2": r2,
        "moran": moran,
        "inner_cv_results": cv_results
    })


 --- Outer split 0 - Training model 0 on inner split 0 ---
is_in_training_set: 4227
outer_train_features: 4227
Getting inputs...


ValueError: x attribute access only provided for Point geometries

In [None]:
predictor_cols = ["greenspace_proportion", "imd", "f_m_ratio", "mean_age"]
model_results = {
    "predictors": predictor_cols,
    "results": outer_cv_results
}

In [None]:
with open("outputs/model_results/gwr.pkl", "wb") as f:
    pickle.dump(model_results, f)