# Initialize Notebook

**Scope of this notebook:** I will only import the necessary libraries and datasets to perform a quick EDA.

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv

# -----------------------------
# 1. Load datasets
# -----------------------------
# Dataset locations are read from the environment (optionally populated from
# a local .env file by load_dotenv) so no machine-specific path is hard-coded.
load_dotenv()
adni_path = os.getenv("DATASET_PATH_L")
oasis_path = os.getenv("DATASET_PATH_TWO_PL")

# os.getenv returns None for an unset variable; fail here with the variable
# name instead of letting pd.read_csv(None) fail later in a different cell.
_unset_vars = [
    var_name
    for var_name, var_value in (
        ("DATASET_PATH_L", adni_path),
        ("DATASET_PATH_TWO_PL", oasis_path),
    )
    if not var_value
]
if _unset_vars:
    raise RuntimeError(
        "Missing dataset path environment variable(s): " + ", ".join(_unset_vars)
    )

In [2]:
# Read the ADNI table from the configured path and preview its first rows.
adni = pd.read_csv(filepath_or_buffer=adni_path)
adni.head(n=5)

Unnamed: 0,RID,Gender,Ageatscreening,Diagnosis,MMSE0m,HipsASMbaseline,HipsContrastbaseline,HipsCorelationbaseline,HipsVariancebaseline,HipsSumAveragebaseline,...,ERCsContrastbaseline,ERCsCorelationbaseline,ERCsVariancebaseline,ERCsSumAveragebaseline,ERCsSumVariancebaseline,ERCsEntropybaseline,ERCsClusterShadebaseline,ERCs_thicknessbaseline,ERCsVolumebaseline,HipposcampusVolumebaseline
0,1140,0,84.5123,1,25.0,0.15,169.3,0.61,223.34,36.0,...,227.925,0.495,223.25,27.445,665.065,2.725,-516.185,2.63834,2397.0,3164.85
1,1051,1,75.3699,1,26.0,0.097733,175.103337,0.562332,197.321714,42.533788,...,222.355754,0.500577,218.584302,30.381415,651.981453,3.154282,2595.481588,3.1465,1662.5,2648.3
2,15,0,80.9068,0,29.0,0.1,161.28,0.54,174.53,35.94,...,221.76,0.445,,30.565,,3.12,4287.78,2.89773,2188.0,3602.5
3,680,0,77.8932,0,28.0,0.11,235.89,0.51,231.56,41.66,...,217.45,0.54,236.75,30.465,729.545,3.01,-741.895,2.73485,2292.5,3267.45
4,324,1,75.3534,1,24.0,0.14,192.29,0.55,218.5,35.28,...,269.565,0.39,219.405,26.56,608.05,2.565,456.55,2.444245,1082.0,2550.5


In [3]:
# Read the OASIS table from the configured path and preview its first rows.
oasis = pd.read_csv(filepath_or_buffer=oasis_path)
oasis.head(n=5)

Unnamed: 0,Label,Gender,Age,DIAGNOSIS,MMSE,Hip ASM,Hip Contrast,Hip Correlation,Hip Variance,Hip Sum Average,...,ERC CONTRAST,ERC CORRELATION,ERC VARIANCE,ERC SUM AVERAGE,ERC SUM VARIANCE,ERC ENTROPY,ERC CLUSTER SHADE,ERCs_thicknessbaseline,ERCsVolumebaseline,TOTAL_HIPPOCAMPUS_VOLUME
0,30879,0,64.5859,0,27.0,0.377501,128.715226,0.422342,115.630163,15.181458,...,286.270009,0.289728,206.373786,44.577974,539.225137,3.320292,22924.7391,3.3445,2251,7338.2
1,31129,0,68.07666,2,20.0,0.657212,51.830551,0.557254,84.225204,8.530596,...,282.873623,0.310507,206.276556,39.697933,542.232602,2.864138,18182.68767,2.9605,2924,5438.6
2,30605,1,76.22176,1,28.0,0.520032,86.558742,0.594543,123.662642,13.400504,...,244.274039,0.370189,201.216707,46.081559,560.59279,3.415186,28045.35099,3.3145,4042,8434.4
3,30039,0,73.221085,0,30.0,0.396133,60.731686,0.596733,122.546512,15.571183,...,226.713342,0.51743,232.98371,30.461877,705.221498,2.937422,1196.160472,3.1505,3485,7476.2
4,30079,0,45.87269,0,30.0,0.4048,108.80854,0.520363,98.065558,13.483247,...,202.074647,0.354851,169.528802,50.680275,476.040561,3.017047,47743.25412,3.542,3964,8021.8


In [4]:
# Translate OASIS column names into the ADNI naming scheme so both frames
# share one schema. Columns whose names already agree (e.g. 'Gender',
# 'ERCs_thicknessbaseline', 'ERCsVolumebaseline') are intentionally absent.
# NOTE(review): 'Hip Variance ' and 'ERC VARIANCE ' carry a trailing space and
# 'Hip Clusterhade' lacks the 'S' -- presumably these mirror the raw OASIS CSV
# headers; confirm against the file before normalising them.
rename_map = {
    # Identifiers, demographics and clinical scores
    "Label": "RID",
    "Age": "Ageatscreening",
    "DIAGNOSIS": "Diagnosis",
    "MMSE": "MMSE0m",
    # Hippocampus texture features
    "Hip ASM": "HipsASMbaseline",
    "Hip Contrast": "HipsContrastbaseline",
    "Hip Correlation": "HipsCorelationbaseline",
    "Hip Variance ": "HipsVariancebaseline",
    "Hip Sum Average": "HipsSumAveragebaseline",
    "Hip Sum Variance": "HipsSumVariancebaseline",
    "Hip Entropy": "HipsEntropybaseline",
    "Hip Clusterhade": "HipsClusterShadebaseline",
    # ERC texture features
    "ERC ASM": "ERCsASMbaseline",
    "ERC CONTRAST": "ERCsContrastbaseline",
    "ERC CORRELATION": "ERCsCorelationbaseline",
    "ERC VARIANCE ": "ERCsVariancebaseline",
    "ERC SUM AVERAGE": "ERCsSumAveragebaseline",
    "ERC SUM VARIANCE": "ERCsSumVariancebaseline",
    "ERC ENTROPY": "ERCsEntropybaseline",
    "ERC CLUSTER SHADE": "ERCsClusterShadebaseline",
    # Volume
    "TOTAL_HIPPOCAMPUS_VOLUME": "HipposcampusVolumebaseline",
}

# Apply the renaming, then select the columns in ADNI's order; selecting by
# adni.columns raises a KeyError if any ADNI column is still missing.
oasis = oasis.rename(columns=rename_map)[adni.columns]

In [5]:
# -----------------------------
# 2. Prepare datasets
# -----------------------------
# Drop rows without a diagnosis BEFORE casting to str: astype(str) turns a
# missing value into the literal string 'nan', after which dropna() no longer
# sees it and the row survives as a bogus class. Both cohorts are filtered so
# the later stratified splits only ever see real labels.
# .assign returns a new frame (no in-place mutation of a possibly sliced frame)
# and appends the 'dataset' provenance column at the end.
adni = adni.dropna(subset=['Diagnosis']).assign(
    Diagnosis=lambda frame: frame['Diagnosis'].astype(str),
    dataset='ADNI',
)
oasis = oasis.dropna(subset=['Diagnosis']).assign(
    Diagnosis=lambda frame: frame['Diagnosis'].astype(str),
    dataset='OASIS',
)

# -----------------------------
# 3. Split ADNI by subject
# -----------------------------
SPLIT_SEED = 42  # fixed seed so a fresh Restart & Run All reproduces the split


def split_subjects(df, test_size=0.3, random_state=SPLIT_SEED):
    """Split the subjects of ``df`` into stratified train/test ID arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'RID' subject identifier and a 'Diagnosis' label column.
    test_size : float
        Fraction of subjects assigned to the test split.
    random_state : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    (train_ids, test_ids)
        Two arrays of 'RID' values; every subject lands in exactly one.
    """
    # One label per subject (its first recorded diagnosis), indexed by RID.
    # IDs and labels are both taken from this single Series so they stay
    # aligned: pairing df['RID'].unique() (order of appearance) with a groupby
    # result (sorted by RID) stratifies on labels of the wrong subjects.
    subject_labels = df.groupby('RID')['Diagnosis'].first()
    train_ids, test_ids = train_test_split(
        subject_labels.index.to_numpy(),
        test_size=test_size,
        stratify=subject_labels.to_numpy(),  # preserves class distribution
        random_state=random_state,
    )
    return train_ids, test_ids


adni_subjects = adni['RID'].unique()
adni_train_subs, adni_test_subs = split_subjects(adni)
adni_train = adni[adni['RID'].isin(adni_train_subs)]
adni_test = adni[adni['RID'].isin(adni_test_subs)]

# Check subject-level overlap
adni_overlap = set(adni_train['RID']).intersection(set(adni_test['RID']))
print("ADNI train-test subject overlap:", adni_overlap)  # Should be empty set
assert not adni_overlap, "ADNI subjects leaked across train/test"

# -----------------------------
# 4. Split OASIS by subject
# -----------------------------
oasis_subjects = oasis['RID'].unique()
oasis_train_subs, oasis_test_subs = split_subjects(oasis)
oasis_train = oasis[oasis['RID'].isin(oasis_train_subs)]
oasis_test = oasis[oasis['RID'].isin(oasis_test_subs)]

# Check subject-level overlap
oasis_overlap = set(oasis_train['RID']).intersection(set(oasis_test['RID']))
print("OASIS train-test subject overlap:", oasis_overlap)  # Should be empty set
assert not oasis_overlap, "OASIS subjects leaked across train/test"

# -----------------------------
# 5. Merge train and test sets
# -----------------------------
# Stack the per-cohort splits row-wise; the original row indices are kept.
train_df = pd.concat(objs=[adni_train, oasis_train], axis=0)
test_df = pd.concat(objs=[adni_test, oasis_test], axis=0)

# -----------------------------
# 6. Check dataset distribution
# -----------------------------
# Report how many rows each source cohort contributes to each split.
for header, merged_split in (
    ("Training set distribution by dataset:\n", train_df),
    ("Test set distribution by dataset:\n", test_df),
):
    print(header, merged_split['dataset'].value_counts())


ADNI train-test subject overlap: set()
OASIS train-test subject overlap: set()
Training set distribution by dataset:
 dataset
OASIS    508
ADNI     363
Name: count, dtype: int64
Test set distribution by dataset:
 dataset
OASIS    219
ADNI     161
Name: count, dtype: int64


In [6]:
# Display the merged training frame (the bare last expression is rendered by the notebook).
train_df

Unnamed: 0,RID,Gender,Ageatscreening,Diagnosis,MMSE0m,HipsASMbaseline,HipsContrastbaseline,HipsCorelationbaseline,HipsVariancebaseline,HipsSumAveragebaseline,...,ERCsCorelationbaseline,ERCsVariancebaseline,ERCsSumAveragebaseline,ERCsSumVariancebaseline,ERCsEntropybaseline,ERCsClusterShadebaseline,ERCs_thicknessbaseline,ERCsVolumebaseline,HipposcampusVolumebaseline,dataset
1,1051,1,75.36990,1,26.0,0.097733,175.103337,0.562332,197.321714,42.533788,...,0.500577,218.584302,30.381415,651.981453,3.154282,2595.481588,3.146500,1662.5,2648.30,ADNI
3,680,0,77.89320,0,28.0,0.110000,235.890000,0.510000,231.560000,41.660000,...,0.540000,236.750000,30.465000,729.545000,3.010000,-741.895000,2.734850,2292.5,3267.45,ADNI
4,324,1,75.35340,1,24.0,0.140000,192.290000,0.550000,218.500000,35.280000,...,0.390000,219.405000,26.560000,608.050000,2.565000,456.550000,2.444245,1082.0,2550.50,ADNI
5,1306,1,74.62740,0,29.0,0.171489,,0.492663,237.593731,35.325773,...,0.492663,237.593731,35.325773,711.631342,3.113417,5788.690804,,1800.0,2763.75,ADNI
6,1257,0,85.08490,2,20.0,0.146456,147.659761,0.571869,180.780959,31.815222,...,0.388627,233.073690,29.953674,644.682797,2.953001,-467.359666,2.449500,1819.0,3630.95,ADNI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
721,31140,0,87.18412,2,25.0,0.576119,41.516993,0.611460,79.814266,8.965251,...,0.175542,216.250128,33.930873,496.754841,2.455708,8151.153118,2.497000,2210.0,5354.20,OASIS
722,30091,0,84.12594,2,16.0,0.441684,88.697197,0.409785,81.549373,9.657353,...,0.349637,201.670069,42.895778,558.109674,3.127915,24927.967080,3.003500,3273.0,5553.70,OASIS
724,30351,0,73.50582,0,28.0,0.293610,201.722318,0.243967,109.728432,15.638299,...,0.515571,223.491273,31.081235,679.123362,2.960271,2819.127225,3.389500,3201.0,6714.40,OASIS
725,30778,0,74.33813,0,29.0,0.290675,143.327666,0.222813,93.517107,11.592136,...,0.486951,239.812272,29.055669,711.556099,2.763789,-1667.941598,3.090500,2817.0,6020.90,OASIS
