Cleaned ADNI data with all the MRI features normalized with ICV. other features includes clinical scores and genetic variance APOE4.

In [23]:
"""
ADNI Baseline Data Preprocessing Script
- Filters baseline visits
- Selects and engineers features
- Handles missing values
- Saves clean dataset to CSV
"""

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Load data
df = pd.read_csv('ADNIMERGE_25Apr2025.csv')

# Filter for baseline visits
df_bl = df[df['VISCODE'] == 'bl'] # Creating df_bl by filtering baseline visits

# Select relevant features
features = [
    'PTID', 'AGE', 'APOE4', 'DX',
    'Hippocampus_bl', 'Entorhinal_bl', 'MidTemp_bl', 'Ventricles_bl',
    'Fusiform_bl', 'WholeBrain_bl', 'ICV_bl',
    'MMSE', 'CDRSB'
]
df_bl = df_bl[features]

  df = pd.read_csv('ADNIMERGE_25Apr2025.csv')


In [24]:


# Handle Missing data
print("\n=== Initial Missing Data ===")
missing_initial = df_bl.isna().sum()
missing_percent_initial = (missing_initial / len(df_bl)) * 100
print("Missing values per column (count):\n", missing_initial)
print("\nMissing values per column (%):\n", missing_percent_initial.round(2))





=== Initial Missing Data ===
Missing values per column (count):
 PTID                0
AGE                 4
APOE4             217
DX                 21
Hippocampus_bl    349
Entorhinal_bl     378
MidTemp_bl        378
Ventricles_bl     178
Fusiform_bl       378
WholeBrain_bl     138
ICV_bl             92
MMSE                1
CDRSB               0
dtype: int64

Missing values per column (%):
 PTID               0.00
AGE                0.16
APOE4              8.93
DX                 0.86
Hippocampus_bl    14.36
Entorhinal_bl     15.56
MidTemp_bl        15.56
Ventricles_bl      7.33
Fusiform_bl       15.56
WholeBrain_bl      5.68
ICV_bl             3.79
MMSE               0.04
CDRSB              0.00
dtype: float64


In [25]:
# Drop rows with missing MRI data
mri_columns = ['Hippocampus_bl', 'Entorhinal_bl', 'MidTemp_bl',
               'Ventricles_bl', 'Fusiform_bl', 'WholeBrain_bl', 'ICV_bl']
df_bl = df_bl.dropna(subset=mri_columns, how='any')

print(f"Dropped {len(df_bl) - len(df_bl)} rows with missing MRI data.")

Dropped 0 rows with missing MRI data.


In [26]:

# Handle Missing data
print("\n=== Initial Missing Data ===")
missing_initial = df_bl.isna().sum()
missing_percent_initial = (missing_initial / len(df_bl)) * 100
print("Missing values per column (count):\n", missing_initial)
print("\nMissing values per column (%):\n", missing_percent_initial.round(2))


=== Initial Missing Data ===
Missing values per column (count):
 PTID                0
AGE                 3
APOE4             141
DX                 12
Hippocampus_bl      0
Entorhinal_bl       0
MidTemp_bl          0
Ventricles_bl       0
Fusiform_bl         0
WholeBrain_bl       0
ICV_bl              0
MMSE                0
CDRSB               0
dtype: int64

Missing values per column (%):
 PTID              0.00
AGE               0.15
APOE4             7.25
DX                0.62
Hippocampus_bl    0.00
Entorhinal_bl     0.00
MidTemp_bl        0.00
Ventricles_bl     0.00
Fusiform_bl       0.00
WholeBrain_bl     0.00
ICV_bl            0.00
MMSE              0.00
CDRSB             0.00
dtype: float64


In [27]:
# Normalize MRI volumes by ICV_bl
roi_features = ['Hippocampus_bl', 'Entorhinal_bl', 'MidTemp_bl',
                'Ventricles_bl', 'Fusiform_bl', 'WholeBrain_bl']
for roi in roi_features:
    df_bl[f'{roi}_ICV'] = df_bl[roi] / df_bl['ICV_bl']

df_bl = df_bl.drop(columns=roi_features + ['ICV_bl'])

In [28]:

print(f"Rows before dropping missing DX: {len(df_bl)}")
df_bl = df_bl.dropna(subset=['DX'])  # Remove rows with missing diagnosis
print(f"Rows after dropping missing DX: {len(df_bl)}")

Rows before dropping missing DX: 1945
Rows after dropping missing DX: 1933


In [29]:
# Check missingness post-MRI cleanup
print("\n=== Missing Data After MRI Cleanup ===")
print("Missing values:\n", df_bl.isna().sum())

# Impute AGE with median (if missing)
df_bl['AGE'] = df_bl['AGE'].fillna(df_bl['AGE'].median())

# Impute MMSE/CDRSB with median within DX groups
# Impute MMSE/CDRSB using DX groups (now no missing DX)
for var in ['MMSE', 'CDRSB']:
    df_bl[var] = df_bl.groupby('DX')[var].transform(lambda x: x.fillna(x.median()))

# Impute APOE4 with mode (most frequent value in the dataset)
df_bl['APOE4'] = df_bl['APOE4'].fillna(df_bl['APOE4'].mode()[0])


=== Missing Data After MRI Cleanup ===
Missing values:
 PTID                    0
AGE                     3
APOE4                 133
DX                      0
MMSE                    0
CDRSB                   0
Hippocampus_bl_ICV      0
Entorhinal_bl_ICV       0
MidTemp_bl_ICV          0
Ventricles_bl_ICV       0
Fusiform_bl_ICV         0
WholeBrain_bl_ICV       0
dtype: int64


In [30]:
# Handle Missing data
print("\n=== Initial Missing Data ===")
missing_initial = df_bl.isna().sum()
missing_percent_initial = (missing_initial / len(df_bl)) * 100
print("Missing values per column (count):\n", missing_initial)
print("\nMissing values per column (%):\n", missing_percent_initial.round(2))


=== Initial Missing Data ===
Missing values per column (count):
 PTID                  0
AGE                   0
APOE4                 0
DX                    0
MMSE                  0
CDRSB                 0
Hippocampus_bl_ICV    0
Entorhinal_bl_ICV     0
MidTemp_bl_ICV        0
Ventricles_bl_ICV     0
Fusiform_bl_ICV       0
WholeBrain_bl_ICV     0
dtype: int64

Missing values per column (%):
 PTID                  0.0
AGE                   0.0
APOE4                 0.0
DX                    0.0
MMSE                  0.0
CDRSB                 0.0
Hippocampus_bl_ICV    0.0
Entorhinal_bl_ICV     0.0
MidTemp_bl_ICV        0.0
Ventricles_bl_ICV     0.0
Fusiform_bl_ICV       0.0
WholeBrain_bl_ICV     0.0
dtype: float64


In [32]:
print("\n=== Final Missing Data ===")
missing_final = df_bl.isna().sum().sum()
if missing_final == 0:
    print("No missing values remaining!")
else:
    print(f"{missing_final} missing values left. Investigate:\n")
    print(df_bl.isna().sum())
    # Optional: Drop rows with residual missingness



=== Final Missing Data ===
No missing values remaining!


In [33]:
#Explore data for completeness

df_bl.head()

Unnamed: 0,PTID,AGE,APOE4,DX,MMSE,CDRSB,Hippocampus_bl_ICV,Entorhinal_bl_ICV,MidTemp_bl_ICV,Ventricles_bl_ICV,Fusiform_bl_ICV,WholeBrain_bl_ICV
0,011_S_0002,74.3,0.0,CN,28.0,0.0,0.0042,0.002105,0.014076,0.059573,0.008343,0.619623
1,011_S_0003,81.3,1.0,Dementia,20.0,4.5,0.002769,0.000932,0.009591,0.044046,0.008073,0.588242
5,022_S_0004,67.5,0.0,MCI,27.0,1.0,0.00409,0.002372,0.011679,0.023582,0.011335,0.687717
10,011_S_0005,73.7,0.0,CN,29.0,0.0,0.004312,0.002702,0.013173,0.02076,0.015108,0.680552
15,100_S_0006,80.4,0.0,MCI,25.0,0.5,0.003599,0.001532,0.011981,0.026804,0.01209,0.624237


In [34]:
df_bl.count()

Unnamed: 0,0
PTID,1933
AGE,1933
APOE4,1933
DX,1933
MMSE,1933
CDRSB,1933
Hippocampus_bl_ICV,1933
Entorhinal_bl_ICV,1933
MidTemp_bl_ICV,1933
Ventricles_bl_ICV,1933


In [35]:
df_bl.head(100)

Unnamed: 0,PTID,AGE,APOE4,DX,MMSE,CDRSB,Hippocampus_bl_ICV,Entorhinal_bl_ICV,MidTemp_bl_ICV,Ventricles_bl_ICV,Fusiform_bl_ICV,WholeBrain_bl_ICV
0,011_S_0002,74.3,0.0,CN,28.0,0.0,0.004200,0.002105,0.014076,0.059573,0.008343,0.619623
1,011_S_0003,81.3,1.0,Dementia,20.0,4.5,0.002769,0.000932,0.009591,0.044046,0.008073,0.588242
5,022_S_0004,67.5,0.0,MCI,27.0,1.0,0.004090,0.002372,0.011679,0.023582,0.011335,0.687717
10,011_S_0005,73.7,0.0,CN,29.0,0.0,0.004312,0.002702,0.013173,0.020760,0.015108,0.680552
15,100_S_0006,80.4,0.0,MCI,25.0,0.5,0.003599,0.001532,0.011981,0.026804,0.012090,0.624237
...,...,...,...,...,...,...,...,...,...,...,...,...
453,136_S_0186,80.4,0.0,CN,27.0,0.5,0.004866,0.002498,0.012103,0.030544,0.011986,0.640510
460,153_S_5267,65.8,1.0,CN,28.0,0.0,0.005948,0.002621,0.015014,0.006078,0.014648,0.724784
461,032_S_0187,77.1,2.0,MCI,28.0,1.0,0.003717,0.002048,0.011209,0.019812,0.010005,0.611036
466,128_S_0188,86.1,1.0,MCI,27.0,2.0,0.003492,0.002096,0.010725,0.027965,0.010557,0.585669


In [36]:
# Save the cleaned DataFrame to a CSV file
output_path = "cleaned_ADNI_baseline2_data.csv"
df_bl.to_csv(output_path, index=False)

print(f"Cleaned dataset saved to: {output_path}")

Cleaned dataset saved to: cleaned_ADNI_baseline2_data.csv


In [37]:
from google.colab import files

# Save the cleaned DataFrame to CSV (if not already saved)
df_bl.to_csv("cleaned_ADNI_baseline_data.csv", index=False)

# Download the file+
files.download("cleaned_ADNI_baseline_data.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>