From the lesson learnt before, we understood how to train a DL for age prediction

We got decent results given our limited model.

Now it's your turn:

- We assume you don't have a big computational unit
- You have full access to the scans
- You have full access to the dataset

Can you think of a way of improving the pipeline in a manageable way given your constraints?

I will try MLP on corthical thicknesses

In [1]:
# !pip install monai==1.5.0 --no-dependencies

In [2]:
# Imports
# General purpose
import os
import random
import tqdm
from typing import List
# # DL
import torch as th
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
# Data
from sklearn.preprocessing import StandardScaler
import pandas as pd
import nibabel as nib
from torch.utils.data import Dataset, DataLoader
import monai.transforms as mtr
from sklearn.model_selection import StratifiedKFold, train_test_split
# Visualize
import matplotlib.pyplot as plt
import tqdm

In [3]:
# We download instead a preprocessed version (will be explained...)
# !mkdir ../data
# !curl -L -o ../data/preprocessed-ixi-dataset-with-fs8.zip -z ../data/preprocessed-ixi-dataset-with-fs8.zip https://www.kaggle.com/api/v1/datasets/download/kingpowa/preprocessed-ixi-dataset-with-fs8
# !unzip -n ../data/preprocessed-ixi-dataset-with-fs8.zip -d ../data
# !mkdir ../data/IXI
# !mv -v ../data/T1w_Processed_IXI_with_csv/IXI/* ../data/IXI
# !rm -rf ../data/preprocessed-ixi-dataset-with-fs8.zip
# !rm -rf ../data/T1w_Processed_IXI_with_csv

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [5]:
th.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [6]:
device = th.device('cuda' if th.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [7]:
masterfile_path = "../data/IXI/subjects.csv"
masterfile = pd.read_csv(masterfile_path)
masterfile = masterfile[masterfile["age"] != -1.0]
masterfile.head(5)

Unnamed: 0,subject_id,age,sex,scanner,site,diagnosis,dataset_name,subject_key,session,run,registered_mni
0,IXI002,35.8,Female,Philips-1.5T,Guy’s-Hospital,Healthy,IXI,IXI002_IXI,1,1,sub-IXI002/ses-1/run-1/anat/sub-IXI002_acq-Phi...
1,IXI012,38.78,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,IXI012_IXI,1,1,sub-IXI012/ses-1/run-1/anat/sub-IXI012_acq-Phi...
2,IXI013,46.71,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,IXI013_IXI,1,1,sub-IXI013/ses-1/run-1/anat/sub-IXI013_acq-Phi...
3,IXI014,34.24,Female,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,IXI014_IXI,1,1,sub-IXI014/ses-1/run-1/anat/sub-IXI014_acq-Phi...
4,IXI015,24.28,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,IXI015_IXI,1,1,sub-IXI015/ses-1/run-1/anat/sub-IXI015_acq-Phi...


In [None]:
def grad_norm(model):
    """Compute total L2 norm of gradients over all parameters."""
    total_norm = 0.0
    for p in model.parameters():
        if p.grad is not None:
            param_norm = p.grad.data.norm(2).item()
            total_norm += param_norm ** 2
    return total_norm ** 0.5

def train_epoch(model, loader, optimizer, input_label="t1n", loss_fn=None, update=True, log_norm=False, scheduler=None, per_step=False):
    # Add per_step for per_step update of scheduler
    model.train()
    running = 0.0

    if log_norm:
      grad_norm_sum = 0.0
      grad_norm_count = 0
      max_grad_norm = 0.0

    for batch in tqdm.tqdm(loader):
        img = batch[input_label].to(device)
        age = batch["age"].to(device).view(-1, 1)
        pred = model(img).view(-1, 1)
        loss = loss_fn(pred, age)
        optimizer.zero_grad()
        loss.backward()

        if update:
          optimizer.step()
          if per_step:
             if scheduler is not None:
                if isinstance(scheduler, th.optim.lr_scheduler.ReduceLROnPlateau):
                    scheduler.step(loss)
                else:
                    scheduler.step()

        else:
          loss = loss.detach()
        running += loss.item() * img.size(0)

        if log_norm:
            gn = grad_norm(model)
            grad_norm_sum += gn
            grad_norm_count += 1
            if gn > max_grad_norm:
                max_grad_norm = gn
        
    for param_group in optimizer.param_groups:
        lr = param_group['lr']
    if log_norm and grad_norm_count > 0:
        avg_grad_norm = grad_norm_sum / grad_norm_count
    if update:
        if not per_step:
            if scheduler is not None:
                if isinstance(scheduler, th.optim.lr_scheduler.ReduceLROnPlateau):
                    scheduler.step(loss)
                else:
                    scheduler.step()

    return running / len(loader.dataset), lr, avg_grad_norm

def eval_model(model, loader, input_label="cort", loss_fns={}):
    model.eval()
    with th.no_grad():
        losses = {k:[] for k in loss_fns.keys()}
        for batch in loader:
            img = batch[input_label].to(device)
            age = batch["age"].to(device).view(-1, 1)
            pred = model(img).view(-1, 1)
            for k in loss_fns.keys():
                losses[k].append(loss_fns[k](pred, age).cpu().numpy())
        losses = {k: np.concatenate(val) for k, val in losses.items()}
    return {k: {"mean": np.mean(val), "std": np.std(val)} for k, val in losses.items()}

# First model: MLP

In [9]:
# Dataset (from regression lesson)

# Let's extract the per thickness matrix
def make_thickness_matrix(df,
                          sample_id_cols=['subject_id', 'session', 'run'],
                          metadata_cols=['age', 'sex', 'scanner', 'site', 'diagnosis', 'dataset_name', 'registered_mni'],
                          value_col='mean_thickness_weighted',
                          hemi_col='hemi',
                          region_col='region',
                          aggfunc='mean'):
    df = df.copy()
    df['sample_id'] = df[sample_id_cols].astype(str).agg('_'.join, axis=1)

    # pivoting
    index_cols = ['sample_id'] + list(sample_id_cols) + list(metadata_cols)
    pivot = df.pivot_table(
        index=index_cols,
        columns=[hemi_col, region_col],
        values=value_col,
        aggfunc=aggfunc  # if duplicates exist (e.g., multiple entries), aggregate
    )

    pivot.columns = [f"{hemi}_{region}" for hemi, region in sorted(pivot.columns)]
    wide_df = pivot.reset_index()

    return wide_df

thickness_df = pd.read_csv("../data/IXI/thickness.csv")
# Merge the matrix
merged = thickness_df.merge(masterfile, on='subject_id', how='inner')
merged['age'] = pd.to_numeric(merged['age'], errors='coerce')
merged['mean_thickness_weighted'] = pd.to_numeric(merged['mean_thickness_weighted'], errors='coerce')
merged['mean_thickness_simple'] = pd.to_numeric(merged['mean_thickness_simple'], errors='coerce')
merged = merged.dropna(subset=['age', 'sex', 'mean_thickness_weighted', 'mean_thickness_simple'])
# Filter the matrxi
merged_filtered = merged[~((merged["region"] == 'temporalpole') & (merged["hemi"] == "rh") | (merged["region"] == "unknown"))]
# obtain the df matrix
proper_thickness_matrix_df = make_thickness_matrix(
    df=merged_filtered,
    value_col='mean_thickness_weighted'
)

test_set = proper_thickness_matrix_df[proper_thickness_matrix_df["scanner"] == "GE-1.5T"]
train_set = proper_thickness_matrix_df[proper_thickness_matrix_df["scanner"] != "GE-1.5T"]

internal_df = train_set.copy()
internal_df["age_bin"] = pd.cut(train_set["age"], bins=10, labels=False, include_lowest=True)
internal_df["stratify_key"] = internal_df["age_bin"].astype(str) + "_" + internal_df["sex"].astype(str)
internal_df.head(5)

Unnamed: 0,sample_id,subject_id,session,run,age,sex,scanner,site,diagnosis,dataset_name,...,rh_precuneus,rh_rostralanteriorcingulate,rh_rostralmiddlefrontal,rh_superiorfrontal,rh_superiorparietal,rh_superiortemporal,rh_supramarginal,rh_transversetemporal,age_bin,stratify_key
0,IXI002_1_1,IXI002,1,1,35.8,Female,Philips-1.5T,Guy’s-Hospital,Healthy,IXI,...,2.474847,2.899147,2.635482,3.117611,2.317504,3.213092,2.725745,2.460538,2,2_Female
1,IXI012_1_1,IXI012,1,1,38.78,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,...,2.131758,2.652905,2.352631,2.635217,2.109884,2.871431,2.485718,2.019534,2,2_Male
2,IXI013_1_1,IXI013,1,1,46.71,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,...,1.949073,2.655105,2.347103,2.558824,2.037381,2.88712,2.578263,1.7917,4,4_Male
3,IXI014_1_1,IXI014,1,1,34.24,Female,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,...,2.035774,2.726204,2.491766,2.670686,2.029425,2.860698,2.461032,1.714474,2,2_Female
4,IXI015_1_1,IXI015,1,1,24.28,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,...,2.267934,2.859262,2.545136,2.780078,2.166804,2.979659,2.727619,2.119733,0,0_Male


In [10]:
# This time I do SKF
skf = StratifiedKFold(n_splits = 3, shuffle = False)
folds = [(train_fold, val_fold) for train_fold, val_fold in skf.split(np.arange(len(internal_df)), internal_df["stratify_key"].values)]
len(folds)



3

In [11]:
# Dataset
class CorthicalThicknessDataset(Dataset):
    def __init__(self,
                 metadata_df: pd.DataFrame,
                 features: List[str],
                 transform = None,
                 age_transforms = None):
        self.df = metadata_df
        self.transform = transform
        self.age_transforms = age_transforms
        self.features = features

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row: pd.Series = self.df.iloc[idx]
        age = float(row["age"])
        features = row[self.features].values.astype(np.float32)
        if self.transform:
            features = self.transform(features)
        if self.age_transforms:
            age = self.age_transforms(age)
        return {
            "cort": features if th.is_tensor(features) else th.as_tensor(features),
            "age": (age if th.is_tensor(age) else th.as_tensor(age)).type(dtype=th.float32),
        }

labels_features = [c for c in train_set.columns if "lh" in c or "rh" in c] 
cds = CorthicalThicknessDataset(metadata_df=train_set, 
                                features=labels_features)
cds[0]

{'cort': tensor([2.7676, 2.8286, 2.0199, 2.7128, 2.8419, 2.4874, 3.1024, 3.1808, 2.4890,
         2.1395, 2.6867, 1.9539, 2.5086, 2.9185, 2.6467, 2.8679, 2.6350, 2.8145,
         2.8272, 1.5510, 2.2510, 2.4814, 2.7461, 2.5553, 2.9170, 2.8129, 3.0472,
         2.3132, 3.1159, 2.7137, 2.6597, 2.6050, 2.9449, 1.9366, 2.8172, 2.8733,
         2.6456, 3.2003, 3.2687, 2.4619, 2.3114, 2.7239, 1.8627, 2.5346, 2.9950,
         2.6629, 2.7215, 2.7494, 2.9277, 2.7340, 1.5249, 2.0854, 2.5268, 2.6698,
         2.4748, 2.8991, 2.6355, 3.1176, 2.3175, 3.2131, 2.7257, 2.4605]),
 'age': tensor(35.8000)}

In [12]:
def apply_scaler(df, scaler, num_cols):
    df_scaled = df.copy()
    df_scaled[num_cols] = scaler.transform(df[num_cols])
    return df_scaled

def get_loaders_thk(train_set,
                    val_set,
                    features,
                    train_transforms=None,
                    val_transforms=None,
                    age_transforms=None,
                    batch_size=16):

    # Scale the corthical thicknesses 
    scaler = StandardScaler()
    scaler.fit(train_set[features])

    train_scaled = apply_scaler(train_set, scaler, features)
    val_scaled   = apply_scaler(val_set, scaler, features)

    ds_tr = CorthicalThicknessDataset(
        train_scaled,
        features=features,
        transform=train_transforms,
        age_transforms=age_transforms
    )

    train_dl = DataLoader(ds_tr, batch_size=batch_size,
                            shuffle=True,
                            num_workers=6,
                            pin_memory=True,
                            persistent_workers=True,
                            prefetch_factor=2)
    
    ds_val = CorthicalThicknessDataset(
        val_scaled,
        features=features,
        transform=val_transforms,
        age_transforms=age_transforms
    )
    val_dl = DataLoader(ds_val, batch_size=batch_size,
                            num_workers=2,
                            pin_memory=False,
                            persistent_workers=True,
                            prefetch_factor=None)
                            # persistent_workers=True,
                            # prefetch_factor=2)

    return train_dl, val_dl, scaler

In [13]:
train_dl, val_dl, scaler = get_loaders_thk(
    train_set.iloc[folds[0][0]],
    train_set.iloc[folds[0][1]],
    labels_features,
    age_transforms=lambda x: x/255.,
    batch_size=16
)
next(iter(train_dl))

{'cort': tensor([[-1.4895e+00, -8.6988e-01, -1.3153e+00,  2.4317e-01, -2.5327e-01,
          -1.5475e+00, -1.8220e+00, -2.6167e+00,  1.3202e-01, -2.5823e-01,
          -1.6089e+00, -4.5825e-01, -1.6113e+00, -2.2403e+00, -5.0683e-01,
          -1.2371e+00, -1.6097e+00, -2.0359e+00, -2.5105e+00, -8.9627e-01,
          -2.7559e+00, -4.5902e-01, -2.6403e+00, -9.9167e-01, -1.1777e+00,
          -9.9487e-01, -1.5530e+00, -1.8158e-01, -2.0718e+00, -1.3942e+00,
           5.2000e-01, -6.9565e-01, -1.5666e+00, -9.2259e-01,  6.8357e-01,
          -3.8094e-01, -8.4504e-01, -5.6212e-01, -2.7066e+00, -4.8013e-01,
          -1.3899e+00, -1.4115e+00, -9.8437e-01, -5.0582e-01, -8.6327e-01,
          -9.2207e-01, -4.8917e-01, -1.4247e+00, -1.7547e+00, -2.1948e+00,
          -3.3192e-01, -2.0167e+00, -1.1864e-01, -2.8528e+00, -1.0594e+00,
          -1.3328e+00, -1.2463e+00, -1.6035e+00, -1.0547e+00, -1.4242e+00,
          -1.8224e+00,  6.6554e-01],
         [-6.3447e-01,  6.9745e-01,  6.0751e-01,  9.360

In [14]:
class MLPCort(nn.Module):
    def __init__(self, in_dim: int, hidden: List[int], dropout: float = 0.0):
        super().__init__()
        layers: List[nn.Module] = []
        prev = in_dim
        for h in hidden:
            layers += [
                nn.Linear(prev, h),
                nn.ReLU(inplace=True),
                nn.Dropout(dropout),
            ]
            prev = h
        layers += [nn.Linear(prev, 1)]  # regression output
        self.net = nn.Sequential(*layers)

    def forward(self, x: th.Tensor) -> th.Tensor:
        return self.net(x)

In [None]:
num_epochs = 50
warmup_epochs = 10
exp_name = "CorthicalThickness"
base_ckpt_dir = f"../checkpoints/{exp_name}"
base_results_dir = f"../results/{exp_name}"
os.makedirs(base_ckpt_dir, exist_ok=True)
os.makedirs(base_results_dir, exist_ok=True)

device = th.device("cuda" if th.cuda.is_available() else "cpu")
delta = 0.10
criterion_train = nn.HuberLoss(delta=delta)


def build_scheduler(optimizer, train_dl, max_lr=1e-2, base_lr=1e-4, warmup_epochs=60):
    base_lrs = [g['lr'] for g in optimizer.param_groups]
    factors = [base_lr / b for b in base_lrs]  # -> usually 1.0 if base_lr == optimizer init lr
    lambdas = [(lambda f: (lambda _: f))(f) for f in factors]
    const = th.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambdas)

    total_warm_steps = len(train_dl) * warmup_epochs
    oc = th.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        max_lr=max_lr,
        total_steps=total_warm_steps,
        pct_start=0.4,
        anneal_strategy="cos",
        div_factor=100.0,      # initial_lr = max_lr / div_factor
        final_div_factor=max_lr/base_lr,  # final_lr ~ max_lr / 50
        cycle_momentum=False
    )
    # After OneCycle finishes, switch to "const" (restores base_lr)
    sched = th.optim.lr_scheduler.SequentialLR(
        optimizer=optimizer,
        schedulers=[oc, const],
        milestones=[total_warm_steps]
    )
    return sched, total_warm_steps

def save_fold_summary(all_best_rows, path):
    if len(all_best_rows) == 0:
        return
    df = pd.DataFrame(all_best_rows)
    df.to_csv(path, index=False)

cv_best_rows = []

for i, fold in enumerate(folds):
    train_idx, val_idx = fold
    ckpt_path = os.path.join(base_ckpt_dir, f"{exp_name}_fold{i}_best.pth")
    results_path = os.path.join(base_results_dir, f"{exp_name}_fold{i}_results.csv")
    summary_path = os.path.join(base_results_dir, f"{exp_name}_cv_summary.csv")

    train_dl, val_dl, _ = get_loaders_thk(
        train_set=train_set.iloc[train_idx], val_set=train_set.iloc[val_idx], features=labels_features,
        train_transforms=None, val_transforms=None, age_transforms=lambda x: x/255.,
        batch_size=32
    )

    network = MLPCort(
        in_dim=len(labels_features),
        hidden=[32,64,16],
        dropout=0.3
    ).to(device=device)
    optimizer = th.optim.AdamW(network.parameters(), lr=1e-3)

    scheduler, warm_steps = build_scheduler(
        optimizer=optimizer,
        train_dl=train_dl,
        max_lr=1e-2,
        base_lr=1e-4,
        warmup_epochs=warmup_epochs
    )

    best_val_mae = np.inf
    curr_epoch = -1

    if os.path.exists(ckpt_path):
        state_dict = th.load(ckpt_path, weights_only=False, map_location=device)
        curr_epoch = state_dict["epoch"]
        best_val_mae = state_dict["best_val_mae"]
        network.load_state_dict(state_dict["model_state_dict"])
        optimizer.load_state_dict(state_dict["optimizer_state_dict"])

        # Restore scheduler position (step-per-batch scheduler)
        scheduler.last_epoch = (curr_epoch + 1) * len(train_dl) - 1

        print(f"[Fold {i}] Reloaded checkpoint from epoch {curr_epoch} (best_val_mae={best_val_mae:.3f})")

    # Resume results table for this fold
    if os.path.exists(results_path):
        losses = pd.read_csv(results_path)
        if curr_epoch >= 0 and "epoch" in losses.columns:
            losses = losses[losses["epoch"] <= curr_epoch].copy()
    else:
        losses = pd.DataFrame()

    # ---- training loop ----
    for epoch in tqdm.tqdm(range(curr_epoch + 1, num_epochs), desc="Epochs"):
        tr_loss, lr, norm = train_epoch(
            network,
            train_dl,
            optimizer,
            input_label="cort",
            loss_fn=criterion_train,
            update=True,
            log_norm=True,
            scheduler=scheduler,
            per_step=True
        )

        val_metrics = eval_model(
            network,
            val_dl,
            loss_fns={
                "huber": nn.HuberLoss(delta=delta, reduction="none"),
                "mae": lambda pred, age: th.abs(pred - age) * 100
            }
        )

        # extract scalar means/stds
        def _to_float(x):
            try:
                return float(x.item())
            except Exception:
                return float(x)

        val_row = {
            "fold": i,
            "epoch": epoch,
            "tr_huber": float(tr_loss),
        }
        for k, v in val_metrics.items():
            val_row[f"val_{k}_mean"] = _to_float(v["mean"])
            val_row[f"val_{k}_std"]  = _to_float(v["std"])

        tqdm.tqdm.write(
            f"ep. {epoch+1}/{num_epochs} | "
            f"tr_loss {tr_loss:.4f} | "
            f"val_mae {val_metrics['mae']['mean']:.4f} | "
            f"best_val_MAE {best_val_mae} | "
            f"lr {lr}"
        )

        # Save best checkpoint per fold (by val MAE)
        if val_row["val_mae_mean"] < best_val_mae:
            best_val_mae = val_row["val_mae_mean"]
            # print(f"[Fold {i}] New lowest MAE: {best_val_mae:.3f}. Saving checkpoint.")
            checkpoint = {
                "epoch": epoch,
                "model_state_dict": network.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "best_val_mae": best_val_mae,
                "fold": i,
                "exp_name": exp_name
            }
            th.save(checkpoint, ckpt_path)

        # Append and persist per-fold results
        losses = pd.concat([losses, pd.DataFrame([val_row])], ignore_index=True)
        losses.to_csv(results_path, index=False)

    try:
        fold_df = pd.read_csv(results_path)
        best_idx = fold_df["val_mae_mean"].idxmin()
        best_row = fold_df.loc[int(best_idx)].to_dict()
        cv_best_rows.append(best_row)
        save_fold_summary(cv_best_rows, summary_path)
        print(f"[Fold {i}] Best MAE: {best_row['val_mae_mean']:.3f} at epoch {int(best_row['epoch'])+1}")
    except Exception as e:
        print(f"[Fold {i}] Could not compute CV summary: {e}")

TypeError: get_loaders_thk() got an unexpected keyword argument 'train_idx'

In [22]:
train_dl, test_dl, scaler = get_loaders_thk(
    train_set,
    test_set,
    labels_features,
    age_transforms=lambda x: x/255.,
    batch_size=32
)

In [35]:
network = MLPCort(
        in_dim=len(labels_features),
        hidden=[32,64,16],
        dropout=0.3
).to(device=device)
optimizer = th.optim.AdamW(network.parameters(), lr=1e-3)

scheduler, warm_steps = build_scheduler(
    optimizer=optimizer,
    train_dl=train_dl,
    max_lr=1e-2,
    base_lr=1e-4,
    warmup_epochs=warmup_epochs
)

for epoch in tqdm.tqdm(range(20), desc="Epochs"):
    tr_loss, lr, norm = train_epoch(
        network,
        train_dl,
        optimizer,
        input_label="cort",
        loss_fn=criterion_train,
        update=True,
        log_norm=True,
        scheduler=scheduler,
        per_step=True
    )

100%|██████████| 16/16 [00:00<00:00, 186.74it/s]
100%|██████████| 16/16 [00:00<00:00, 194.83it/s]
100%|██████████| 16/16 [00:00<00:00, 177.12it/s]4it/s]
100%|██████████| 16/16 [00:00<00:00, 179.01it/s]
100%|██████████| 16/16 [00:00<00:00, 183.27it/s]0it/s]
100%|██████████| 16/16 [00:00<00:00, 195.92it/s]
100%|██████████| 16/16 [00:00<00:00, 185.09it/s]8it/s]
100%|██████████| 16/16 [00:00<00:00, 214.88it/s]
100%|██████████| 16/16 [00:00<00:00, 233.54it/s]6it/s]
100%|██████████| 16/16 [00:00<00:00, 202.65it/s]
100%|██████████| 16/16 [00:00<00:00, 201.00it/s]03it/s]
100%|██████████| 16/16 [00:00<00:00, 201.94it/s]
100%|██████████| 16/16 [00:00<00:00, 224.13it/s]07it/s]
100%|██████████| 16/16 [00:00<00:00, 197.48it/s]
100%|██████████| 16/16 [00:00<00:00, 225.88it/s]27it/s]
100%|██████████| 16/16 [00:00<00:00, 209.31it/s]
100%|██████████| 16/16 [00:00<00:00, 203.84it/s]51it/s]
100%|██████████| 16/16 [00:00<00:00, 195.11it/s]
100%|██████████| 16/16 [00:00<00:00, 231.91it/s]37it/s]
100%|█████

In [None]:
test_losses = eval_model(network, test_dl, loss_fns={
        "mse": nn.HuberLoss(reduction='none', delta=delta), "mae": lambda pred, age: th.abs(pred-age)*100
    })
print(f'Test:')
for k, v in test_losses.items():
    print(f'{k}: {v["mean"]:.3f} ± {v["std"]:.3f}')

tensor(0.0482, device='cuda:0')
tensor(0.0484, device='cuda:0')
tensor(0.0141, device='cuda:0')
Test:
mse: 0.002 ± 0.002
mae: 4.724 ± 2.825


In [37]:
checkpoint = {
    "model_state_dict": network.state_dict()
}
th.save(checkpoint, "../checkpoints/MLPCort_best_model.pth")

Do you see something funny? :)
I should do also hyperparameter tuning but I will skip it for time reason...