# Initialize Notebook

**Scope of this notebook:** I will only proceed with importing the necessary libraries and the dataset to perform a quick exploratory data analysis (EDA).

In [1]:
# Import basic libraries for EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from dotenv import load_dotenv
import os
import pingouin as pg


In [2]:
# Import dataset
# The CSV location comes from the DATASET_PATH_L environment variable
# (load_dotenv() is called first so a local .env file can supply it),
# which keeps machine-specific paths out of the notebook.
load_dotenv()
dataset_path = os.getenv("DATASET_PATH_L")
if not dataset_path:
    # os.getenv returns None when the variable is unset; fail here with an
    # actionable message instead of an opaque error from inside read_csv.
    raise ValueError(
        "DATASET_PATH_L is not set; define it in the environment or in a .env file."
    )
dataset = pd.read_csv(dataset_path)
dataset.head()

Unnamed: 0,RID,Gender,Ageatscreening,Diagnosis,MMSE0m,HipsASMbaseline,HipsContrastbaseline,HipsCorelationbaseline,HipsVariancebaseline,HipsSumAveragebaseline,...,ERCsContrastbaseline,ERCsCorelationbaseline,ERCsVariancebaseline,ERCsSumAveragebaseline,ERCsSumVariancebaseline,ERCsEntropybaseline,ERCsClusterShadebaseline,ERCs_thicknessbaseline,ERCsVolumebaseline,HipposcampusVolumebaseline
0,1140,0,84.5123,1,25.0,0.15,169.3,0.61,223.34,36.0,...,227.925,0.495,223.25,27.445,665.065,2.725,-516.185,2.63834,2397.0,3164.85
1,1051,1,75.3699,1,26.0,0.097733,175.103337,0.562332,197.321714,42.533788,...,222.355754,0.500577,218.584302,30.381415,651.981453,3.154282,2595.481588,3.1465,1662.5,2648.3
2,15,0,80.9068,0,29.0,0.1,161.28,0.54,174.53,35.94,...,221.76,0.445,,30.565,,3.12,4287.78,2.89773,2188.0,3602.5
3,680,0,77.8932,0,28.0,0.11,235.89,0.51,231.56,41.66,...,217.45,0.54,236.75,30.465,729.545,3.01,-741.895,2.73485,2292.5,3267.45
4,324,1,75.3534,1,24.0,0.14,192.29,0.55,218.5,35.28,...,269.565,0.39,219.405,26.56,608.05,2.565,456.55,2.444245,1082.0,2550.5


## Data Inspection

In [3]:
# List every column name so that misspelled ones can be spotted
# and renamed before the analysis
dataset.columns

Index(['RID', 'Gender', 'Ageatscreening', 'Diagnosis', 'MMSE0m',
       'HipsASMbaseline', 'HipsContrastbaseline', 'HipsCorelationbaseline',
       'HipsVariancebaseline', 'HipsSumAveragebaseline',
       'HipsSumVariancebaseline', 'HipsEntropybaseline',
       'HipsClusterShadebaseline', 'ERCsASMbaseline', 'ERCsContrastbaseline',
       'ERCsCorelationbaseline', 'ERCsVariancebaseline',
       'ERCsSumAveragebaseline', 'ERCsSumVariancebaseline',
       'ERCsEntropybaseline', 'ERCsClusterShadebaseline',
       'ERCs_thicknessbaseline', 'ERCsVolumebaseline',
       'HipposcampusVolumebaseline'],
      dtype='object')

In [4]:
# Count the rows for each Gender value within every Diagnosis group
dataset.groupby("Diagnosis")["Gender"].value_counts()

Diagnosis  Gender
0          1          98
           0          96
1          0         127
           1          73
2          1          70
           0          60
Name: count, dtype: int64

In [5]:
# Group sizes: number of rows per Diagnosis code, ordered by code.
# Counting the Diagnosis column directly avoids the detour through Gender,
# which would silently leave out any row whose Gender value is missing.
dataset["Diagnosis"].value_counts().sort_index()

Diagnosis
0    194
1    200
2    130
Name: count, dtype: int64

In [6]:
# Number of missing (NaN) entries in each column
dataset.isna().sum()

RID                            0
Gender                         0
Ageatscreening                 0
Diagnosis                      0
MMSE0m                         3
HipsASMbaseline               25
HipsContrastbaseline          13
HipsCorelationbaseline        19
HipsVariancebaseline           5
HipsSumAveragebaseline         1
HipsSumVariancebaseline        6
HipsEntropybaseline           21
HipsClusterShadebaseline      17
ERCsASMbaseline                3
ERCsContrastbaseline          17
ERCsCorelationbaseline        25
ERCsVariancebaseline          12
ERCsSumAveragebaseline        11
ERCsSumVariancebaseline       10
ERCsEntropybaseline            9
ERCsClusterShadebaseline      21
ERCs_thicknessbaseline        34
ERCsVolumebaseline            11
HipposcampusVolumebaseline    16
dtype: int64

In [7]:
# Display names for the integer codes found in the "Diagnosis" column
diag_labels = {0: "Normal (NC)", 1: "MCI", 2: "AD"}

# Per-diagnosis report: gender split, then age and MMSE as mean ± std
for diag_code, diag_name in diag_labels.items():
    # Rows belonging to the current diagnosis group only
    subset = dataset.loc[dataset["Diagnosis"].eq(diag_code)]

    gender_counts = subset["Gender"].value_counts()

    # Each aggregation yields (mean, std) in that order
    age_mean, age_std = subset["Ageatscreening"].agg(["mean", "std"])
    mmse_mean, mmse_std = subset["MMSE0m"].agg(["mean", "std"])

    # One print call; sep="\n" puts every item on its own line
    print(
        f"--- {diag_name} ---",
        "Gender counts:",
        gender_counts,
        f"Ageatscreening: {age_mean:.3f} ± {age_std:.3f}",
        f"MMSE0m: {mmse_mean:.3f} ± {mmse_std:.3f}\n",
        sep="\n",
    )

--- Normal (NC) ---
Gender counts:
Gender
1    98
0    96
Name: count, dtype: int64
Ageatscreening: 76.175 ± 5.208
MMSE0m: 29.113 ± 1.001

--- MCI ---
Gender counts:
Gender
0    127
1     73
Name: count, dtype: int64
Ageatscreening: 74.745 ± 7.181
MMSE0m: 27.202 ± 1.825

--- AD ---
Gender counts:
Gender
1    70
0    60
Name: count, dtype: int64
Ageatscreening: 76.017 ± 7.348
MMSE0m: 23.140 ± 2.161

