In the previous lesson, we learned how to train a deep learning (DL) model for age prediction.

We got decent results given our limited model.

Now it's your turn:

- We assume you don't have access to large computational resources
- You have full access to the scans
- You have full access to the dataset

Can you think of a manageable way to improve the pipeline given these constraints?

In [None]:
# Optional setup: uncomment the line below to install the pinned MONAI release
# when it is not already available in the environment.
# !pip install monai==1.5.0 --no-dependencies

Collecting monai==1.5.0
  Downloading monai-1.5.0-py3-none-any.whl.metadata (13 kB)
Downloading monai-1.5.0-py3-none-any.whl (2.7 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/2.7 MB[0m [31m37.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: monai
Successfully installed monai-1.5.0


In [None]:
# Imports
# General purpose
import os
import random
import tqdm
from typing import List
# # DL
import torch as th
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
# Data
from sklearn.preprocessing import StandardScaler
import pandas as pd
import nibabel as nib
from torch.utils.data import Dataset, DataLoader
import monai.transforms as mtr
from sklearn.model_selection import StratifiedKFold, train_test_split
# Visualize
import matplotlib.pyplot as plt
import tqdm

In [None]:
import kagglehub
import os
# Fetch the latest version of the preprocessed IXI dataset via kagglehub; the
# value it returns is used below as the root directory of the downloaded files.
BASE_PATH = kagglehub.dataset_download("kingpowa/preprocessed-ixi-dataset-with-fs8")

# Both working folders live under the same processed-data directory.
BASE_PATH_IXI, BASE_PATH_EXE = (
    os.path.join(BASE_PATH, "T1w_Processed_IXI_with_csv", leaf)
    for leaf in ("IXI", "example_data")
)

In [2]:
# Expose only GPU index 0 to CUDA-aware libraries running in this process.
# NOTE(review): this only has an effect if it runs before CUDA is initialised —
# confirm nothing executed earlier in the notebook already touched the GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
# Single seed shared by every random number generator the notebook relies on,
# so that "Restart Kernel -> Run All" reproduces the same random draws.
SEED = 0

th.manual_seed(SEED)   # PyTorch generator
random.seed(SEED)      # Python's built-in `random` module
np.random.seed(SEED)   # NumPy legacy global RNG

In [4]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if th.cuda.is_available():
    device = th.device('cuda')
else:
    device = th.device('cpu')
print('Using device:', device)

Using device: cuda


In [None]:
# Read the subject-level master table from the IXI folder and keep only the
# rows whose age differs from -1.0 (presumably a "missing age" sentinel —
# TODO confirm against the dataset description).
masterfile_path = f"{BASE_PATH_IXI}/subjects.csv"
masterfile = pd.read_csv(masterfile_path).loc[lambda subjects: subjects["age"] != -1.0]
masterfile.head(5)

Unnamed: 0,subject_id,age,sex,scanner,site,diagnosis,dataset_name,subject_key,session,run,registered_mni
0,IXI002,35.8,Female,Philips-1.5T,Guy’s-Hospital,Healthy,IXI,IXI002_IXI,1,1,sub-IXI002/ses-1/run-1/anat/sub-IXI002_acq-Phi...
1,IXI012,38.78,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,IXI012_IXI,1,1,sub-IXI012/ses-1/run-1/anat/sub-IXI012_acq-Phi...
2,IXI013,46.71,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,IXI013_IXI,1,1,sub-IXI013/ses-1/run-1/anat/sub-IXI013_acq-Phi...
3,IXI014,34.24,Female,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,IXI014_IXI,1,1,sub-IXI014/ses-1/run-1/anat/sub-IXI014_acq-Phi...
4,IXI015,24.28,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,IXI015_IXI,1,1,sub-IXI015/ses-1/run-1/anat/sub-IXI015_acq-Phi...


In [None]:
# Dataset (from regression lesson)
# Let's extract the per-region thickness matrix (one column per hemisphere/region pair)
def make_thickness_matrix(df,
                          sample_id_cols=['subject_id', 'session', 'run'],
                          metadata_cols=['age', 'sex', 'scanner', 'site', 'diagnosis', 'dataset_name', 'registered_mni'],
                          value_col='mean_thickness_weighted',
                          hemi_col='hemi',
                          region_col='region',
                          aggfunc='mean'):
    """Pivot a long per-region table into a wide table with one row per sample.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table holding one row per (sample, hemisphere, region).
        It must contain every column named by the other arguments. The input
        is not modified; the function works on a copy.
    sample_id_cols : list of str
        Columns whose string values are joined with ``'_'`` to build the
        ``sample_id`` column.
    metadata_cols : list of str
        Extra per-sample columns carried over to the output unchanged.
    value_col : str
        Column holding the measurement placed in the wide cells.
    hemi_col, region_col : str
        Columns identifying the hemisphere and the region; each distinct
        pair becomes one output column named ``"{hemi}_{region}"``.
    aggfunc : str or callable
        Aggregation applied when several rows share the same
        (sample, hemisphere, region) combination.

    Returns
    -------
    pandas.DataFrame
        ``sample_id``, the ``sample_id_cols`` and the ``metadata_cols``,
        followed by one ``"{hemi}_{region}"`` column per pair.
    """
    # The list defaults are only read, never mutated, so sharing them is safe.
    df = df.copy()
    df['sample_id'] = df[sample_id_cols].astype(str).agg('_'.join, axis=1)

    # pivoting
    index_cols = ['sample_id'] + list(sample_id_cols) + list(metadata_cols)
    pivot = df.pivot_table(
        index=index_cols,
        columns=[hemi_col, region_col],
        values=value_col,
        aggfunc=aggfunc,  # if duplicates exist (e.g., multiple entries), aggregate
    )

    # Build the flat labels in the pivot's OWN column order. Re-sorting the
    # labels independently of the data (as done previously) would silently
    # attach the wrong name to a column whenever the pivot's order is not
    # plain lexicographic (e.g. a categorical hemisphere column).
    pivot.columns = [f"{hemi}_{region}" for hemi, region in pivot.columns]

    return pivot.reset_index()

# Locate the long-format thickness table. Prefer a copy inside the downloaded
# dataset folder (the same folder subjects.csv is read from) so the notebook
# runs on a fresh environment; otherwise fall back to the original local path.
# NOTE(review): assumes thickness.csv ships next to subjects.csv — confirm.
thickness_path = os.path.join(BASE_PATH_IXI, "thickness.csv")
if not os.path.exists(thickness_path):
    thickness_path = "../data/IXI/thickness.csv"
thickness_df = pd.read_csv(thickness_path)

# Merge the matrix: attach the subject metadata to every thickness row.
merged = thickness_df.merge(masterfile, on='subject_id', how='inner')

# Coerce the numeric columns; values that cannot be parsed become NaN and the
# affected rows are dropped right after.
merged['age'] = pd.to_numeric(merged['age'], errors='coerce')
merged['mean_thickness_weighted'] = pd.to_numeric(merged['mean_thickness_weighted'], errors='coerce')
merged['mean_thickness_simple'] = pd.to_numeric(merged['mean_thickness_simple'], errors='coerce')
merged = merged.dropna(subset=['age', 'sex', 'mean_thickness_weighted', 'mean_thickness_simple'])

# Filter the matrix: drop the right-hemisphere temporal pole and every
# "unknown" region (parentheses make the original `&`-before-`|` precedence explicit).
merged_filtered = merged[~(((merged["region"] == 'temporalpole') & (merged["hemi"] == "rh")) | (merged["region"] == "unknown"))]

# Obtain the wide matrix: one row per sample, one column per hemisphere/region.
proper_thickness_matrix_df = make_thickness_matrix(
    df=merged_filtered,
    value_col='mean_thickness_weighted'
)

# Hold out every GE-1.5T scan as the test set; all other scanners form the training pool.
test_set = proper_thickness_matrix_df[proper_thickness_matrix_df["scanner"] == "GE-1.5T"]
train_set = proper_thickness_matrix_df[proper_thickness_matrix_df["scanner"] != "GE-1.5T"]

# Build a stratification key on the training pool: 10 equal-width age bins combined with sex.
internal_df = train_set.copy()
internal_df["age_bin"] = pd.cut(train_set["age"], bins=10, labels=False, include_lowest=True)
internal_df["stratify_key"] = internal_df["age_bin"].astype(str) + "_" + internal_df["sex"].astype(str)
internal_df.head(5)

Unnamed: 0,sample_id,subject_id,session,run,age,sex,scanner,site,diagnosis,dataset_name,...,rh_precuneus,rh_rostralanteriorcingulate,rh_rostralmiddlefrontal,rh_superiorfrontal,rh_superiorparietal,rh_superiortemporal,rh_supramarginal,rh_transversetemporal,age_bin,stratify_key
0,IXI002_1_1,IXI002,1,1,35.8,Female,Philips-1.5T,Guy’s-Hospital,Healthy,IXI,...,2.474847,2.899147,2.635482,3.117611,2.317504,3.213092,2.725745,2.460538,2,2_Female
1,IXI012_1_1,IXI012,1,1,38.78,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,...,2.131758,2.652905,2.352631,2.635217,2.109884,2.871431,2.485718,2.019534,2,2_Male
2,IXI013_1_1,IXI013,1,1,46.71,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,...,1.949073,2.655105,2.347103,2.558824,2.037381,2.88712,2.578263,1.7917,4,4_Male
3,IXI014_1_1,IXI014,1,1,34.24,Female,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,...,2.035774,2.726204,2.491766,2.670686,2.029425,2.860698,2.461032,1.714474,2,2_Female
4,IXI015_1_1,IXI015,1,1,24.28,Male,Philips-3.0T,Hammersmith-Hospital,Healthy,IXI,...,2.267934,2.859262,2.545136,2.780078,2.166804,2.979659,2.727619,2.119733,0,0_Male


In [32]:
# This time we use stratified k-fold: 3 folds over the internal (training) set,
# stratified on the combined age-bin/sex key. Shuffling is off, so the split is
# deterministic.
skf = StratifiedKFold(n_splits=3, shuffle=False)
folds = list(
    skf.split(
        np.arange(len(internal_df)),
        internal_df["stratify_key"].values,
    )
)
len(folds)



3

Do your best!