# Imports

In [16]:
import numpy as np
import pandas as pd
import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras.regularizers import L1
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Set random seeds

In [17]:
tf.random.set_seed(42)
np.random.seed(42)

# Load data

In [18]:
raw_df = pd.read_csv("datasets/5_split/df_full.csv")

In [19]:
df = raw_df.copy()

# Separate features

In [20]:
# Dependent variables
labels = df.pop('very_good_health')

# CV folds
fold_ids = df.pop("fold_id_python")
folds = np.unique(fold_ids)

# Independent variables
features = df.drop(columns = ["fold_id_r"])

# Define build model function

In [21]:
def build_model(normaliser):

    model = keras.Sequential([
        normaliser,
        layers.Dense(64, activation='relu', kernel_regularizer = L1(0.0005)),
        layers.Dense(32, activation='relu', kernel_regularizer = L1(0.0005)),
        layers.Dense(1)
    ])

    model.compile(
        optimizer = Adam(learning_rate=0.001),
        loss='mse'
    )

    return model

# Build and evaluate model

In [24]:
# Initialise evaluation results array
evaluation_results = []

# Cross-validation loop
for fold in folds:
    print(f"\n --- Training on fold {fold} ---")

    # Separate data into training and validation sets
    is_in_validation_set = fold_ids == fold
    is_in_training_set = ~is_in_validation_set

    train_features = features.loc[is_in_training_set]
    train_labels = labels.loc[is_in_training_set]

    val_features = features.loc[is_in_validation_set]
    val_labels = labels.loc[is_in_validation_set]

    # Define normalisation layer inside loop to avoid data leakage
    normaliser = layers.Normalization(axis = -1)
    normaliser.adapt(np.array(train_features))
    
    # Build model
    model = build_model(normaliser)

    # Define early stopper inside loop to avoid storing state between folds
    early_stopper = keras.callbacks.EarlyStopping(
        monitor = "val_loss",
        patience = 10,
        restore_best_weights = True,
        verbose = 1
    )

    # Fit model
    model.fit(
        train_features,
        train_labels,
        epochs = 100,
        validation_split = 0.2,
        callbacks = [early_stopper]
    )

    # Get predictions using fitted model
    predictions = model.predict(val_features).flatten()

    # Get accuracy scores
    mae = mean_absolute_error(val_labels, predictions)
    mse = mean_squared_error(val_labels, predictions)
    r2 = r2_score(val_labels, predictions)

    # Add scores for current fold to results
    evaluation_results.append({
        "fold": fold,
        "MAE": mae,
        "MSE": mse,
        "R2": r2
    })


 --- Training on fold 0 ---
Epoch 1/100
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.7704 - val_loss: 0.6781
Epoch 2/100
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6014 - val_loss: 0.6099
Epoch 3/100
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.5384 - val_loss: 0.5416
Epoch 4/100
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.4795 - val_loss: 0.4767
Epoch 5/100
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.4216 - val_loss: 0.4109
Epoch 6/100
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.3635 - val_loss: 0.3470
Epoch 7/100
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.3060 - val_loss: 0.2856
Epoch 8/100
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.2509 - val_loss: 0.2290
Epo

# Print results

In [23]:
evaluation_results

[{'fold': np.int64(0),
  'MAE': 0.025596956214684472,
  'MSE': 0.001224033285741126,
  'R2': 0.5789625364846958},
 {'fold': np.int64(1),
  'MAE': 0.02392719069905429,
  'MSE': 0.000901687498589569,
  'R2': -0.015950744461724398},
 {'fold': np.int64(2),
  'MAE': 0.025326071309361122,
  'MSE': 0.0009690947817001568,
  'R2': 0.31181559296709416},
 {'fold': np.int64(3),
  'MAE': 0.026504328057579837,
  'MSE': 0.0011580454979150212,
  'R2': 0.5914122591023905},
 {'fold': np.int64(4),
  'MAE': 0.02209017274281532,
  'MSE': 0.0007651118972433621,
  'R2': 0.23899596185356908},
 {'fold': np.int64(5),
  'MAE': 0.02984912381445257,
  'MSE': 0.0014033048038687259,
  'R2': -0.04084758241516173},
 {'fold': np.int64(6),
  'MAE': 0.022541609201301726,
  'MSE': 0.0007728833372459742,
  'R2': 0.5484893147440223},
 {'fold': np.int64(7),
  'MAE': 0.023900702531414984,
  'MSE': 0.0009039778676005498,
  'R2': 0.3029635485071489},
 {'fold': np.int64(8),
  'MAE': 0.03963370636388367,
  'MSE': 0.00239012831281