**Preprocessing the PPMI Dataset:** We initiate the preprocessing of the PPMI dataset and select relevant Freesurfer processed MRI Image data.

In [1]:
import pandas as pd


In [7]:
vol_df = pd.read_csv("FS7_ASEG_VOL_13May2025.csv")

In [8]:
vol_df.head()

Unnamed: 0,PATNO,EVENT_ID,Left_WM_hypointensities,Brain_Stem,Left_non_WM_hypointensities,Optic_Chiasm,Right_WM_hypointensities,BrainSegVol,Right_Lateral_Ventricle,CC_Central,...,Left_VentralDC,SupraTentorialVolNotVent,CC_Mid_Anterior,SupraTentorialVol,SubCortGrayVol,Right_Thalamus,Left_Lateral_Ventricle,CSF,SurfaceHoles,CerebralWhiteMatterVol
0,3000,BL,0,20725.6,0,96.7,0,1144050,9579.3,936.4,...,4188.8,984766,599.2,1010680,52729.0,7122.1,9970.7,1039.0,23,497730
1,3001,BL,0,23802.3,0,152.0,0,1297010,17897.3,887.0,...,4407.5,1094140,542.6,1142600,67357.0,7495.3,22811.9,1481.8,26,565148
2,3002,BL,0,19271.1,0,102.1,0,1040700,8822.4,568.9,...,3191.4,888974,725.7,910643,54813.0,6470.1,8445.8,835.3,23,425313
3,3003,BL,0,22191.0,0,107.3,0,1177320,8620.4,1020.8,...,4427.0,1010570,1279.4,1033260,60577.0,7458.9,9266.9,1235.7,16,510845
4,3004,BL,0,24473.0,0,135.3,0,1211960,9447.4,857.1,...,3837.1,1058590,831.4,1080080,56642.0,6451.1,6889.7,851.9,21,540211


In [6]:
vol_df.shape

(1713, 66)

In [9]:

vol_df.columns

Index(['PATNO', 'EVENT_ID', 'Left_WM_hypointensities', 'Brain_Stem',
       'Left_non_WM_hypointensities', 'Optic_Chiasm',
       'Right_WM_hypointensities', 'BrainSegVol', 'Right_Lateral_Ventricle',
       'CC_Central', '5th_Ventricle', 'Right_choroid_plexus',
       'Right_Cerebellum_White_Matter', 'Left_vessel',
       'Left_Cerebellum_Cortex', 'MaskVol_to_eTIV', 'MaskVol', 'TotalGrayVol',
       'Left_choroid_plexus', 'Right_Inf_Lat_Vent', 'Left_Pallidum',
       'Left_Thalamus', 'Right_VentralDC', 'rhCortexVol',
       'Right_non_WM_hypointensities', 'BrainSegVol_to_eTIV', 'Right_Amygdala',
       'Left_Amygdala', 'EstimatedTotalIntraCranialVol', '4th_Ventricle',
       'Left_Inf_Lat_Vent', 'CortexVol', 'Right_Pallidum', 'lhCortexVol',
       'CC_Anterior', 'CC_Posterior', 'Left_Accumbens_area', 'Right_vessel',
       'Right_Cerebellum_Cortex', 'Left_Putamen', '3rd_Ventricle',
       'non_WM_hypointensities', 'Right_Caudate', 'CC_Mid_Posterior',
       'lhSurfaceHoles', 'Left_Hipp

In [131]:

vol_keep_cols = [
    'PATNO', 'EVENT_ID',
    'Left_Hippocampus', 'Right_Hippocampus',
    'Left_Amygdala', 'Right_Amygdala',
    'Left_Thalamus', 'Right_Thalamus',
    'Left_Caudate', 'Right_Caudate',
    'Left_Putamen', 'Right_Putamen',
    'Left_Accumbens_area', 'Right_Accumbens_area',
    'Left_Lateral_Ventricle', 'Right_Lateral_Ventricle',
    '3rd_Ventricle', '4th_Ventricle',
    'BrainSegVol', 'EstimatedTotalIntraCranialVol'
]

vol_df = vol_df[vol_keep_cols].copy()

# Normalize by ICV (Estimated Total IntraCranial Volume)
icv = vol_df['EstimatedTotalIntraCranialVol']

# Averaging normalized left/right values
vol_df['hippocampus_norm'] = (vol_df['Left_Hippocampus'] + vol_df['Right_Hippocampus']) / (2 * icv)
vol_df['amygdala_norm'] = (vol_df['Left_Amygdala'] + vol_df['Right_Amygdala']) / (2 * icv)
vol_df['thalamus_norm'] = (vol_df['Left_Thalamus'] + vol_df['Right_Thalamus']) / (2 * icv)
vol_df['caudate_norm'] = (vol_df['Left_Caudate'] + vol_df['Right_Caudate']) / (2 * icv)
vol_df['putamen_norm'] = (vol_df['Left_Putamen'] + vol_df['Right_Putamen']) / (2 * icv)
vol_df['accumbens_norm'] = (vol_df['Left_Accumbens_area'] + vol_df['Right_Accumbens_area']) / (2 * icv)
vol_df['lateral_ventricle_norm'] = (vol_df['Left_Lateral_Ventricle'] + vol_df['Right_Lateral_Ventricle']) / (2 * icv)

# Other structures (single-sided)
vol_df['3rd_ventricle_norm'] = vol_df['3rd_Ventricle'] / icv
vol_df['4th_ventricle_norm'] = vol_df['4th_Ventricle'] / icv
vol_df['brainsegvol_norm'] = vol_df['BrainSegVol'] / icv

# Final volumetric dataframe
vol_norm_df = vol_df[['PATNO', 'EVENT_ID'] + [
    'hippocampus_norm', 'amygdala_norm', 'thalamus_norm', 'caudate_norm',
    'putamen_norm', 'accumbens_norm', 'lateral_ventricle_norm',
    '3rd_ventricle_norm', '4th_ventricle_norm', 'brainsegvol_norm'
]]


In [126]:
vol_norm_df.head()

Unnamed: 0,PATNO,EVENT_ID,hippocampus_norm,amygdala_norm,thalamus_norm,caudate_norm,putamen_norm,accumbens_norm,lateral_ventricle_norm,3rd_ventricle_norm,4th_ventricle_norm,brainsegvol_norm
0,3000,BL,0.002523,0.00084,0.004465,0.0019,0.002644,0.000181,0.0062,0.001143,0.001441,0.725653
1,3001,BL,0.002573,0.001072,0.004421,0.003017,0.003898,0.000199,0.011808,0.001074,0.001144,0.752405
2,3002,BL,0.002592,0.001023,0.00486,0.002694,0.00388,0.00027,0.006321,0.000628,0.001336,0.761921
3,3003,BL,0.002997,0.001032,0.004703,0.002122,0.00344,0.000256,0.005642,0.00055,0.000886,0.742723
4,3004,BL,0.00257,0.001068,0.004476,0.001961,0.003293,0.000272,0.005012,0.000554,0.001474,0.743598


In [14]:
vol_norm_df.shape

(1713, 12)

In [15]:
vol_norm_df.columns

Index(['PATNO', 'EVENT_ID', 'hippocampus_norm', 'amygdala_norm',
       'thalamus_norm', 'caudate_norm', 'putamen_norm', 'accumbens_norm',
       'lateral_ventricle_norm', '3rd_ventricle_norm', '4th_ventricle_norm',
       'brainsegvol_norm'],
      dtype='object')

In [127]:
cortical_df = pd.read_csv("FS7_APARC_SA_13May2025.csv")

In [132]:
cortical_df.columns

Index(['PATNO', 'EVENT_ID', 'lh_superiorfrontal', 'rh_superiorfrontal',
       'lh_caudalmiddlefrontal', 'rh_caudalmiddlefrontal',
       'lh_rostralmiddlefrontal', 'rh_rostralmiddlefrontal',
       'lh_superiorparietal', 'rh_superiorparietal',
       'lh_caudalanteriorcingulate', 'rh_caudalanteriorcingulate',
       'lh_posteriorcingulate', 'rh_posteriorcingulate', 'lh_entorhinal',
       'rh_entorhinal', 'lh_parahippocampal', 'rh_parahippocampal',
       'lh_insula', 'rh_insula', 'lh_supramarginal', 'rh_supramarginal'],
      dtype='object')

In [133]:
# PD-relevant cortical regions (adjust as needed)
cortical_keep = [
    # Frontal (motor planning, executive function)
    "lh_superiorfrontal", "rh_superiorfrontal",          # Motor planning
    "lh_caudalmiddlefrontal", "rh_caudalmiddlefrontal",  # Executive function
    "lh_rostralmiddlefrontal", "rh_rostralmiddlefrontal",# Decision-making

    # Parietal (visuospatial integration)
    "lh_superiorparietal", "rh_superiorparietal",        # Visuospatial function

    # Cingulate (attention, mood)
    "lh_caudalanteriorcingulate", "rh_caudalanteriorcingulate",  # Attention
    "lh_posteriorcingulate", "rh_posteriorcingulate",    # Memory

    # Temporal (non-motor symptoms)
    "lh_entorhinal", "rh_entorhinal",                    # Early cognitive decline
    "lh_parahippocampal", "rh_parahippocampal",          # Memory

    # Insula (autonomic dysfunction)
    "lh_insula", "rh_insula",                            # Non-motor symptoms

    # Optional (customize based on hypotheses)
    "lh_supramarginal", "rh_supramarginal"               # Sensorimotor integration
]

# Filter cortical columns + identifiers
cortical_df = cortical_df[['PATNO', 'EVENT_ID'] + [c for c in cortical_keep if c in cortical_df.columns]]

In [134]:
cortical_df.head()

Unnamed: 0,PATNO,EVENT_ID,lh_superiorfrontal,rh_superiorfrontal,lh_caudalmiddlefrontal,rh_caudalmiddlefrontal,lh_rostralmiddlefrontal,rh_rostralmiddlefrontal,lh_superiorparietal,rh_superiorparietal,...,lh_posteriorcingulate,rh_posteriorcingulate,lh_entorhinal,rh_entorhinal,lh_parahippocampal,rh_parahippocampal,lh_insula,rh_insula,lh_supramarginal,rh_supramarginal
0,3000,BL,6599,6879,2050,2102,5335,6067,5408,6145,...,864,940,501,525,682,744,2486,2558,3729,3882
1,3001,BL,7960,7826,2462,2147,6978,6963,5204,5799,...,981,734,532,488,816,719,2375,2726,4718,3992
2,3002,BL,5926,5378,1961,1982,5019,5228,4728,5490,...,1050,847,347,487,526,636,2240,1927,3267,2763
3,3003,BL,6600,6937,2174,1883,5888,6359,5158,4770,...,873,735,407,425,721,598,2463,2242,3526,3429
4,3004,BL,7558,7102,1775,2053,5050,5132,5483,6632,...,1008,1163,511,507,743,651,2401,2408,4765,4863


In [135]:
cortical_df.shape

(1716, 22)

In [139]:
# Average left-right cortical thickness for each region
for region in ['superiorfrontal', 'caudalmiddlefrontal', 'rostralmiddlefrontal',
               'superiorparietal', 'caudalanteriorcingulate', 'posteriorcingulate',
               'entorhinal', 'parahippocampal', 'insula', 'supramarginal']:
    lh_col = f'lh_{region}'
    rh_col = f'rh_{region}'

    if lh_col in cortical_df.columns and rh_col in cortical_df.columns:
        cortical_df[f'{region}_avg'] = (cortical_df[lh_col] + cortical_df[rh_col]) / 2

# Drop original left/right columns
cortical_df = cortical_df.drop(columns=[c for c in cortical_df.columns if c.startswith(('lh_', 'rh_'))])

In [140]:
cortical_df.head()

Unnamed: 0,PATNO,EVENT_ID,superiorfrontal_avg,caudalmiddlefrontal_avg,rostralmiddlefrontal_avg,superiorparietal_avg,caudalanteriorcingulate_avg,posteriorcingulate_avg,entorhinal_avg,parahippocampal_avg,insula_avg,supramarginal_avg
0,3000,BL,6739.0,2076.0,5701.0,5776.5,656.5,902.0,513.0,713.0,2522.0,3805.5
1,3001,BL,7893.0,2304.5,6970.5,5501.5,756.5,857.5,510.0,767.5,2550.5,4355.0
2,3002,BL,5652.0,1971.5,5123.5,5109.0,503.0,948.5,417.0,581.0,2083.5,3015.0
3,3003,BL,6768.5,2028.5,6123.5,4964.0,394.0,804.0,416.0,659.5,2352.5,3477.5
4,3004,BL,7330.0,1914.0,5091.0,6057.5,576.5,1085.5,509.0,697.0,2404.5,4814.0


In [141]:
cortical_df.columns

Index(['PATNO', 'EVENT_ID', 'superiorfrontal_avg', 'caudalmiddlefrontal_avg',
       'rostralmiddlefrontal_avg', 'superiorparietal_avg',
       'caudalanteriorcingulate_avg', 'posteriorcingulate_avg',
       'entorhinal_avg', 'parahippocampal_avg', 'insula_avg',
       'supramarginal_avg'],
      dtype='object')

In [142]:
cortical_df.shape[1]

12

Merge The volumetric and Cortical MRI image data

In [143]:
# Merge cortical and volumetric features
merged_df = pd.merge(
    vol_norm_df,
    cortical_df,
    on=['PATNO', 'EVENT_ID'],
    how='left',  # Keep only participants with both datasets
    validate='one_to_one'  # Ensure no duplicates
)

# Final feature set
print("Merged columns:", merged_df.columns.tolist())
print("Shape:", merged_df.shape)

Merged columns: ['PATNO', 'EVENT_ID', 'hippocampus_norm', 'amygdala_norm', 'thalamus_norm', 'caudate_norm', 'putamen_norm', 'accumbens_norm', 'lateral_ventricle_norm', '3rd_ventricle_norm', '4th_ventricle_norm', 'brainsegvol_norm', 'superiorfrontal_avg', 'caudalmiddlefrontal_avg', 'rostralmiddlefrontal_avg', 'superiorparietal_avg', 'caudalanteriorcingulate_avg', 'posteriorcingulate_avg', 'entorhinal_avg', 'parahippocampal_avg', 'insula_avg', 'supramarginal_avg']
Shape: (1713, 22)


In [144]:
merged_df.head()

Unnamed: 0,PATNO,EVENT_ID,hippocampus_norm,amygdala_norm,thalamus_norm,caudate_norm,putamen_norm,accumbens_norm,lateral_ventricle_norm,3rd_ventricle_norm,...,superiorfrontal_avg,caudalmiddlefrontal_avg,rostralmiddlefrontal_avg,superiorparietal_avg,caudalanteriorcingulate_avg,posteriorcingulate_avg,entorhinal_avg,parahippocampal_avg,insula_avg,supramarginal_avg
0,3000,BL,0.002523,0.00084,0.004465,0.0019,0.002644,0.000181,0.0062,0.001143,...,6739.0,2076.0,5701.0,5776.5,656.5,902.0,513.0,713.0,2522.0,3805.5
1,3001,BL,0.002573,0.001072,0.004421,0.003017,0.003898,0.000199,0.011808,0.001074,...,7893.0,2304.5,6970.5,5501.5,756.5,857.5,510.0,767.5,2550.5,4355.0
2,3002,BL,0.002592,0.001023,0.00486,0.002694,0.00388,0.00027,0.006321,0.000628,...,5652.0,1971.5,5123.5,5109.0,503.0,948.5,417.0,581.0,2083.5,3015.0
3,3003,BL,0.002997,0.001032,0.004703,0.002122,0.00344,0.000256,0.005642,0.00055,...,6768.5,2028.5,6123.5,4964.0,394.0,804.0,416.0,659.5,2352.5,3477.5
4,3004,BL,0.00257,0.001068,0.004476,0.001961,0.003293,0.000272,0.005012,0.000554,...,7330.0,1914.0,5091.0,6057.5,576.5,1085.5,509.0,697.0,2404.5,4814.0


**Clinical_Scores** Since we now have the Merged MRI volumetric and cortical thickness features, we will proceed to add clinical scores to the dataset.

In [145]:
MDS1_df = pd.read_csv("MDS-UPDRS_Part_I_13May2025.csv")

In [146]:
MDS2_df= pd.read_csv("MDS_UPDRS_Part_II__Patient_Questionnaire_16May2025.csv")

In [147]:
MDS3_df= pd.read_csv("MDS-UPDRS_Part_III_16May2025.csv")

  MDS3_df= pd.read_csv("MDS-UPDRS_Part_III_16May2025.csv")


In [148]:
print(MDS1_df.shape)
print(MDS2_df.shape)
print(MDS3_df.shape)

(28123, 15)
(29100, 22)
(33067, 63)


In [149]:
MDS1_df.columns

Index(['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT', 'NUPSOURC',
       'NP1COG', 'NP1HALL', 'NP1DPRS', 'NP1ANXS', 'NP1APAT', 'NP1DDS',
       'NP1RTOT', 'ORIG_ENTRY', 'LAST_UPDATE'],
      dtype='object')

In [150]:
MDS2_df.columns

Index(['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT', 'NUPSOURC',
       'NP2SPCH', 'NP2SALV', 'NP2SWAL', 'NP2EAT', 'NP2DRES', 'NP2HYGN',
       'NP2HWRT', 'NP2HOBB', 'NP2TURN', 'NP2TRMR', 'NP2RISE', 'NP2WALK',
       'NP2FREZ', 'NP2PTOT', 'ORIG_ENTRY', 'LAST_UPDATE'],
      dtype='object')

In [151]:
MDS3_df.columns

Index(['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT', 'PDTRTMNT',
       'PDSTATE', 'HRPOSTMED', 'HRDBSON', 'HRDBSOFF', 'PDMEDYN', 'DBSYN',
       'ONOFFORDER', 'OFFEXAM', 'OFFNORSN', 'DBSOFFTM', 'ONEXAM', 'ONNORSN',
       'HIFUYN', 'DBSONTM', 'PDMEDDT', 'PDMEDTM', 'EXAMDT', 'EXAMTM',
       'NP3SPCH', 'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU', 'NP3RIGRL',
       'NP3RIGLL', 'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR', 'NP3HMOVL', 'NP3PRSPR',
       'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL', 'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG',
       'NP3GAIT', 'NP3FRZGT', 'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3PTRMR',
       'NP3PTRML', 'NP3KTRMR', 'NP3KTRML', 'NP3RTARU', 'NP3RTALU', 'NP3RTARL',
       'NP3RTALL', 'NP3RTALJ', 'NP3RTCON', 'NP3TOT', 'DYSKPRES', 'DYSKIRAT',
       'NHY', 'ORIG_ENTRY', 'LAST_UPDATE'],
      dtype='object')

In [152]:
MDS3_df.head()

Unnamed: 0,REC_ID,PATNO,EVENT_ID,PAG_NAME,INFODT,PDTRTMNT,PDSTATE,HRPOSTMED,HRDBSON,HRDBSOFF,...,NP3RTARL,NP3RTALL,NP3RTALJ,NP3RTCON,NP3TOT,DYSKPRES,DYSKIRAT,NHY,ORIG_ENTRY,LAST_UPDATE
0,272451901,3000,BL,NUPDRS3,02/2011,,,,,,...,0.0,0.0,0.0,0.0,4.0,0.0,,0.0,02/2011,2020-06-25 16:02:19.0
1,338703101,3000,V04,NUPDRS3,03/2012,,,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,,0.0,03/2012,2020-06-25 16:02:22.0
2,385009801,3000,V06,NUPDRS3,02/2013,,,,,,...,0.0,0.0,0.0,0.0,4.0,0.0,,0.0,02/2013,2020-06-25 16:02:22.0
3,437131401,3000,V08,NUPDRS3,03/2014,,,,,,...,0.0,0.0,0.0,0.0,2.0,0.0,,0.0,05/2014,2020-06-25 16:02:22.0
4,512469901,3000,V10,NUPDRS3,03/2015,,,,,,...,0.0,0.0,0.0,0.0,19.0,0.0,,0.0,03/2015,2020-06-25 16:02:23.0


In [153]:
selected_items = [
    # Part I
    "NP1DDS",   # Sleep disturbances (predicts RBD → PD)
    "NP1COG",    # Cognitive impairment (early non-motor sign)

    # Part III
    "NP3TREM",   # Tremor severity (subtype-specific)
    "NP3RIG",    # Rigidity (core PD motor sign)
    "NP3POST"    # Postural instability (predicts falls)
]

In [168]:
updrs_merged = pd.merge(MDS1_df, MDS2_df, on=["PATNO", "EVENT_ID"], how="left")
updrs_merged = pd.merge(updrs_merged, MDS3_df, on=["PATNO", "EVENT_ID"], how="left")

In [169]:
updrs_merged.head()

Unnamed: 0,REC_ID_x,PATNO,EVENT_ID,PAG_NAME_x,INFODT_x,NUPSOURC_x,NP1COG,NP1HALL,NP1DPRS,NP1ANXS,...,NP3RTARL,NP3RTALL,NP3RTALJ,NP3RTCON,NP3TOT,DYSKPRES,DYSKIRAT,NHY,ORIG_ENTRY,LAST_UPDATE
0,272451201,3000,BL,NUPDRS1,02/2011,1.0,1,0,1.0,1.0,...,0.0,0.0,0.0,0.0,4.0,0.0,,0.0,02/2011,2020-06-25 16:02:19.0
1,338701901,3000,V04,NUPDRS1,03/2012,1.0,0,0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,,0.0,03/2012,2020-06-25 16:02:22.0
2,385008801,3000,V06,NUPDRS1,02/2013,1.0,1,0,1.0,1.0,...,0.0,0.0,0.0,0.0,4.0,0.0,,0.0,02/2013,2020-06-25 16:02:22.0
3,437130601,3000,V08,NUPDRS1,03/2014,1.0,1,0,0.0,2.0,...,0.0,0.0,0.0,0.0,2.0,0.0,,0.0,05/2014,2020-06-25 16:02:22.0
4,512466501,3000,V10,NUPDRS1,03/2015,1.0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,19.0,0.0,,0.0,03/2015,2020-06-25 16:02:23.0


In [170]:
updrs_merged.shape

(33721, 96)

In [171]:
updrs_merged.columns

Index(['REC_ID_x', 'PATNO', 'EVENT_ID', 'PAG_NAME_x', 'INFODT_x', 'NUPSOURC_x',
       'NP1COG', 'NP1HALL', 'NP1DPRS', 'NP1ANXS', 'NP1APAT', 'NP1DDS',
       'NP1RTOT', 'ORIG_ENTRY_x', 'LAST_UPDATE_x', 'REC_ID_y', 'PAG_NAME_y',
       'INFODT_y', 'NUPSOURC_y', 'NP2SPCH', 'NP2SALV', 'NP2SWAL', 'NP2EAT',
       'NP2DRES', 'NP2HYGN', 'NP2HWRT', 'NP2HOBB', 'NP2TURN', 'NP2TRMR',
       'NP2RISE', 'NP2WALK', 'NP2FREZ', 'NP2PTOT', 'ORIG_ENTRY_y',
       'LAST_UPDATE_y', 'REC_ID', 'PAG_NAME', 'INFODT', 'PDTRTMNT', 'PDSTATE',
       'HRPOSTMED', 'HRDBSON', 'HRDBSOFF', 'PDMEDYN', 'DBSYN', 'ONOFFORDER',
       'OFFEXAM', 'OFFNORSN', 'DBSOFFTM', 'ONEXAM', 'ONNORSN', 'HIFUYN',
       'DBSONTM', 'PDMEDDT', 'PDMEDTM', 'EXAMDT', 'EXAMTM', 'NP3SPCH',
       'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU', 'NP3RIGRL', 'NP3RIGLL',
       'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR', 'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL',
       'NP3TTAPR', 'NP3TTAPL', 'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT',
       'NP3FRZGT', 'N

In [172]:
cols_to_drop = [
    # Administrative columns
    'REC_ID_x', 'REC_ID_y', 'REC_ID', 'PAG_NAME_x', 'PAG_NAME_y', 'PAG_NAME',
    'INFODT_x', 'INFODT_y', 'INFODT', 'NUPSOURC_x', 'NUPSOURC_y', 'ORIG_ENTRY_x',
    'ORIG_ENTRY_y', 'ORIG_ENTRY', 'LAST_UPDATE_x', 'LAST_UPDATE_y', 'LAST_UPDATE',
    # Treatment-related columns (confounding variables)
    'PDTRTMNT', 'PDSTATE', 'HRPOSTMED', 'HRDBSON', 'HRDBSOFF', 'PDMEDYN', 'DBSYN',
    'ONOFFORDER', 'OFFEXAM', 'OFFNORSN', 'DBSOFFTM', 'ONEXAM', 'ONNORSN', 'HIFUYN',
    'DBSONTM', 'PDMEDDT', 'PDMEDTM', 'EXAMDT', 'EXAMTM', 'DYSKPRES', 'DYSKIRAT'
]
# Drop columns from the merged DataFrame
updrs_merged = updrs_merged.drop(columns=cols_to_drop)

In [173]:
updrs_merged.columns

Index(['PATNO', 'EVENT_ID', 'NP1COG', 'NP1HALL', 'NP1DPRS', 'NP1ANXS',
       'NP1APAT', 'NP1DDS', 'NP1RTOT', 'NP2SPCH', 'NP2SALV', 'NP2SWAL',
       'NP2EAT', 'NP2DRES', 'NP2HYGN', 'NP2HWRT', 'NP2HOBB', 'NP2TURN',
       'NP2TRMR', 'NP2RISE', 'NP2WALK', 'NP2FREZ', 'NP2PTOT', 'NP3SPCH',
       'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU', 'NP3RIGRL', 'NP3RIGLL',
       'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR', 'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL',
       'NP3TTAPR', 'NP3TTAPL', 'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT',
       'NP3FRZGT', 'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML',
       'NP3KTRMR', 'NP3KTRML', 'NP3RTARU', 'NP3RTALU', 'NP3RTARL', 'NP3RTALL',
       'NP3RTALJ', 'NP3RTCON', 'NP3TOT', 'NHY'],
      dtype='object')

In [174]:
composite_scores = [
    'NP1RTOT',  # Part I: Non-motor total
    'NP2PTOT',  # Part II: Motor daily living total
    'NP3TOT'    # Part III: Motor exam total
]

In [175]:
part1_items = [
    'NP1DDS',   # Sleep disturbances (predicts REM sleep behavior disorder → PD)
    'NP1COG',   # Cognitive impairment (early non-motor sign)
    'NP1APAT'   # Apathy (linked to PD dementia)
]

In [176]:
part2_items = [
    'NP2FREZ',  # Freezing of gait (specific to PD)
    'NP2WALK'   # Walking difficulty (core PD symptom)
]

In [177]:
part3_items = [
    'NP3TREM',    # Resting tremor (PD-specific)
    'NP3BRADY',   # Bradykinesia (cardinal PD sign)
    'NP3PSTBL',   # Postural instability (predicts falls)
    'NP3RIGN',    # Neck rigidity (common in PD)
    'NP3GAIT'     # Gait impairment (early marker)
]

In [178]:
# Corrected feature selection (only columns that exist)

selected_features = [
    'PATNO', 'EVENT_ID',
    'NP1RTOT', 'NP2PTOT', 'NP3TOT',  # Composite scores
    'NP1DDS', 'NP1APAT',              # Part I items
    'NP2WALK', 'NP2FREZ',             # Part II items
    'NP3BRADY', 'NP3PSTBL', 'NP3RIGN', 'NP3GAIT'  # Part III items
]


valid_features = [col for col in selected_features if col in updrs_merged.columns]


final_df = updrs_merged[valid_features]

In [179]:
final_df.head()

Unnamed: 0,PATNO,EVENT_ID,NP1RTOT,NP2PTOT,NP3TOT,NP1DDS,NP1APAT,NP2WALK,NP2FREZ,NP3BRADY,NP3PSTBL,NP3RIGN,NP3GAIT
0,3000,BL,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3000,V04,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3000,V06,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3000,V08,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3000,V10,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
final_df.tail()

Unnamed: 0,PATNO,EVENT_ID,NP1RTOT,NP2PTOT,NP3TOT,NP1DDS,NP1APAT,NP2WALK,NP2FREZ,NP3BRADY,NP3PSTBL,NP3RIGN,NP3GAIT
32810,432055,BL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32811,432060,BL,0.0,2.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
32812,433261,BL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32813,433274,BL,1.0,5.0,6.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
32814,439763,BL,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
final_df.shape

(32815, 13)

In [181]:
MoCA_df = pd.read_csv("Montreal_Cognitive_Assessment__MoCA__13May2025 (1).csv")

In [182]:
MoCA_df.columns

Index(['REC_ID', 'PATNO', 'EVENT_ID', 'PAG_NAME', 'INFODT', 'MCAALTTM',
       'MCACUBE', 'MCACLCKC', 'MCACLCKN', 'MCACLCKH', 'MCALION', 'MCARHINO',
       'MCACAMEL', 'MCAFDS', 'MCABDS', 'MCAVIGIL', 'MCASER7', 'MCASNTNC',
       'MCAVFNUM', 'MCAVF', 'MCAABSTR', 'MCAREC1', 'MCAREC2', 'MCAREC3',
       'MCAREC4', 'MCAREC5', 'MCADATE', 'MCAMONTH', 'MCAYR', 'MCADAY',
       'MCAPLACE', 'MCACITY', 'MCATOT', 'ORIG_ENTRY', 'LAST_UPDATE'],
      dtype='object')

In [187]:
MoCA_df = MoCA_df[['EVENT_ID',"PATNO","MCATOT"]]

In [188]:
MoCA_df.head()

Unnamed: 0,EVENT_ID,PATNO,MCATOT
0,SC,3000,27.0
1,V04,3000,29.0
2,V06,3000,28.0
3,V08,3000,30.0
4,V10,3000,29.0


In [190]:
# Replace all 'SC' with 'BL' in EVENT_ID
MoCA_df["EVENT_ID"] = MoCA_df["EVENT_ID"].replace("SC", "BL")


print(MoCA_df["EVENT_ID"].value_counts())
# Output: BL, V04, V06, V08, V10

EVENT_ID
BL     4662
V04    3009
V06    1858
V08    1247
V10     996
V12     932
V13     691
V14     673
V15     465
V17     320
V16     304
V18     252
V19     212
V20     120
ST       51
V21      23
PW        6
RS1       4
U01       3
R01       3
V05       2
V03       2
V01       2
R06       1
Name: count, dtype: int64


In [191]:
MoCA_df.shape

(15838, 3)

In [192]:
# Merge MoCA with UPDRS
merged_updrs_moca = pd.merge(
    final_df,
    MoCA_df,
    on=["PATNO", "EVENT_ID"],
    how="left"
)

In [193]:
merged_updrs_moca.head()

Unnamed: 0,PATNO,EVENT_ID,NP1RTOT,NP2PTOT,NP3TOT,NP1DDS,NP1APAT,NP2WALK,NP2FREZ,NP3BRADY,NP3PSTBL,NP3RIGN,NP3GAIT,MCATOT
0,3000,BL,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0
1,3000,V04,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0
2,3000,V06,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0
3,3000,V08,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0
4,3000,V10,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0


In [194]:
merged_updrs_moca.shape

(33769, 14)

In [195]:
# Merge UPDRS+MoCA with MRI
final_df = pd.merge(
    merged_updrs_moca,
    merged_df,
    on=["PATNO", "EVENT_ID"],
    how="left"  # Keep only participants with MRI + clinical data
)

In [197]:
final_df.head()

Unnamed: 0,PATNO,EVENT_ID,NP1RTOT,NP2PTOT,NP3TOT,NP1DDS,NP1APAT,NP2WALK,NP2FREZ,NP3BRADY,...,superiorfrontal_avg,caudalmiddlefrontal_avg,rostralmiddlefrontal_avg,superiorparietal_avg,caudalanteriorcingulate_avg,posteriorcingulate_avg,entorhinal_avg,parahippocampal_avg,insula_avg,supramarginal_avg
0,3000,BL,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,6739.0,2076.0,5701.0,5776.5,656.5,902.0,513.0,713.0,2522.0,3805.5
1,3000,V04,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,3000,V06,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,3000,V08,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,3000,V10,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [198]:
final_df.shape

(33769, 34)

In [199]:
final_df.columns

Index(['PATNO', 'EVENT_ID', 'NP1RTOT', 'NP2PTOT', 'NP3TOT', 'NP1DDS',
       'NP1APAT', 'NP2WALK', 'NP2FREZ', 'NP3BRADY', 'NP3PSTBL', 'NP3RIGN',
       'NP3GAIT', 'MCATOT', 'hippocampus_norm', 'amygdala_norm',
       'thalamus_norm', 'caudate_norm', 'putamen_norm', 'accumbens_norm',
       'lateral_ventricle_norm', '3rd_ventricle_norm', '4th_ventricle_norm',
       'brainsegvol_norm', 'superiorfrontal_avg', 'caudalmiddlefrontal_avg',
       'rostralmiddlefrontal_avg', 'superiorparietal_avg',
       'caudalanteriorcingulate_avg', 'posteriorcingulate_avg',
       'entorhinal_avg', 'parahippocampal_avg', 'insula_avg',
       'supramarginal_avg'],
      dtype='object')

In [200]:
Gen_df =  pd.read_csv("iu_genetic_consensus_20250225_13May2025.csv")

In [201]:
Gen_df.head()

Unnamed: 0,PATNO,CLIA,GWAS,WES,WGS,SVs,SANGER,IU_Fingerprint,RNASEQ,RNASEQ_VIS,...,PATHVAR_COUNT,VAR_GENE,LRRK2,GBA,VPS35,SNCA,PRKN,PARK7,PINK1,NOTES
0,3000,-,X,X,X,X,X,X,X,5,...,0.0,0,0,0,0,0,0,0,0,
1,3001,-,X,X,X,X,X,X,X,5,...,0.0,0,0,0,0,0,0,0,0,
2,3002,-,X,X,X,X,X,X,X,4,...,0.0,0,0,0,0,0,0,0,0,
3,3003,-,-,X,X,X,X,X,X,5,...,0.0,0,0,0,0,0,0,0,0,
4,3004,-,X,X,X,X,X,X,X,5,...,0.0,0,0,0,0,0,0,0,0,


In [204]:
Gen_df.columns

Index(['PATNO', 'APOE'], dtype='object')

In [205]:
Gen_df = Gen_df[["PATNO","APOE"]]

In [206]:
Gen_df.head()

Unnamed: 0,PATNO,APOE
0,3000,E3/E3
1,3001,E3/E3
2,3002,E3/E3
3,3003,E3/E4
4,3004,E2/E3


In [207]:
Gen_df.shape

(6265, 2)

In [208]:
Age_df = pd.read_csv("Age_at_visit_16May2025.csv")

In [209]:
Age_df.columns

Index(['PATNO', 'EVENT_ID', 'AGE_AT_VISIT'], dtype='object')

In [210]:
Age_df.head()

Unnamed: 0,PATNO,EVENT_ID,AGE_AT_VISIT
0,3000,BL,69.1
1,3000,R17,80.5
2,3000,R18,81.4
3,3000,SC,69.1
4,3000,V01,69.4


In [211]:
Age_df.shape

(35906, 3)

In [84]:
status_df = pd.read_csv("Participant_Status_16May2025.csv")

In [212]:
status_df.columns

Index(['PATNO', 'COHORT_DEFINITION'], dtype='object')

In [86]:
status_df.head()

Unnamed: 0,PATNO,COHORT,COHORT_DEFINITION,ENROLL_DATE,ENROLL_STATUS,STATUS_DATE,SCREENEDAM4,ENROLL_AGE,INEXPAGE,AV133STDY,...,PPMI_ONLINE_ENROLL,ENRLPINK1,ENRLPRKN,ENRLSRDC,ENRLNORM,ENRLHPSM,ENRLRBD,ENRLLRRK2,ENRLSNCA,ENRLGBA
0,3000,2,Healthy Control,02/2011,Withdrew,10/2024,0.0,69.1,,0.0,...,NO,0.0,0.0,0.0,,0,0,0,0,0
1,3001,1,Parkinson's Disease,03/2011,Enrolled,09/2021,0.0,65.1,,0.0,...,NO,0.0,0.0,1.0,,0,0,0,0,0
2,3002,1,Parkinson's Disease,03/2011,Withdrew,10/2024,0.0,67.6,,0.0,...,NO,0.0,0.0,1.0,,0,0,0,0,0
3,3003,1,Parkinson's Disease,04/2011,Enrolled,01/2022,0.0,56.7,,0.0,...,YES,0.0,0.0,1.0,,0,0,0,0,0
4,3004,2,Healthy Control,04/2011,Enrolled,01/2022,0.0,59.4,,0.0,...,YES,0.0,0.0,0.0,,0,0,0,0,0


In [213]:
status_df.shape

(5223, 2)

In [214]:
status_df = status_df[["PATNO","COHORT_DEFINITION",]]

In [215]:
status_df.head()

Unnamed: 0,PATNO,COHORT_DEFINITION
0,3000,Healthy Control
1,3001,Parkinson's Disease
2,3002,Parkinson's Disease
3,3003,Parkinson's Disease
4,3004,Healthy Control


In [216]:

# Merge UPDRS+MoCA+MRI+APOE4
final_APOE_df = pd.merge(
    final_df,
    Gen_df,
    on=["PATNO"],
    how="left"
)

In [217]:
final_APOE_df.head()

Unnamed: 0,PATNO,EVENT_ID,NP1RTOT,NP2PTOT,NP3TOT,NP1DDS,NP1APAT,NP2WALK,NP2FREZ,NP3BRADY,...,caudalmiddlefrontal_avg,rostralmiddlefrontal_avg,superiorparietal_avg,caudalanteriorcingulate_avg,posteriorcingulate_avg,entorhinal_avg,parahippocampal_avg,insula_avg,supramarginal_avg,APOE
0,3000,BL,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,2076.0,5701.0,5776.5,656.5,902.0,513.0,713.0,2522.0,3805.5,E3/E3
1,3000,V04,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,E3/E3
2,3000,V06,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,E3/E3
3,3000,V08,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,E3/E3
4,3000,V10,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,E3/E3


In [218]:
final_APOE_df.shape

(33769, 35)

In [219]:
final_APOE_df.columns

Index(['PATNO', 'EVENT_ID', 'NP1RTOT', 'NP2PTOT', 'NP3TOT', 'NP1DDS',
       'NP1APAT', 'NP2WALK', 'NP2FREZ', 'NP3BRADY', 'NP3PSTBL', 'NP3RIGN',
       'NP3GAIT', 'MCATOT', 'hippocampus_norm', 'amygdala_norm',
       'thalamus_norm', 'caudate_norm', 'putamen_norm', 'accumbens_norm',
       'lateral_ventricle_norm', '3rd_ventricle_norm', '4th_ventricle_norm',
       'brainsegvol_norm', 'superiorfrontal_avg', 'caudalmiddlefrontal_avg',
       'rostralmiddlefrontal_avg', 'superiorparietal_avg',
       'caudalanteriorcingulate_avg', 'posteriorcingulate_avg',
       'entorhinal_avg', 'parahippocampal_avg', 'insula_avg',
       'supramarginal_avg', 'APOE'],
      dtype='object')

In [230]:
# Merge UPDRS+MoCA+MRI+APOE4+Age
final_df_APOE_Age = pd.merge(
    final_APOE_df,
    status_df,
    on=["PATNO",],
    how="left"
)

In [235]:
final_df_APOE_Age.head()

Unnamed: 0,PATNO,EVENT_ID,NP1RTOT,NP2PTOT,NP3TOT,NP1DDS,NP1APAT,NP2WALK,NP2FREZ,NP3BRADY,...,rostralmiddlefrontal_avg,superiorparietal_avg,caudalanteriorcingulate_avg,posteriorcingulate_avg,entorhinal_avg,parahippocampal_avg,insula_avg,supramarginal_avg,APOE,COHORT_DEFINITION
0,3000,BL,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,5701.0,5776.5,656.5,902.0,513.0,713.0,2522.0,3805.5,E3/E3,Healthy Control
1,3000,V04,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,E3/E3,Healthy Control
2,3000,V06,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,E3/E3,Healthy Control
3,3000,V08,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,E3/E3,Healthy Control
4,3000,V10,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,E3/E3,Healthy Control


In [236]:
final_df_APOE_Age.columns

Index(['PATNO', 'EVENT_ID', 'NP1RTOT', 'NP2PTOT', 'NP3TOT', 'NP1DDS',
       'NP1APAT', 'NP2WALK', 'NP2FREZ', 'NP3BRADY', 'NP3PSTBL', 'NP3RIGN',
       'NP3GAIT', 'MCATOT', 'hippocampus_norm', 'amygdala_norm',
       'thalamus_norm', 'caudate_norm', 'putamen_norm', 'accumbens_norm',
       'lateral_ventricle_norm', '3rd_ventricle_norm', '4th_ventricle_norm',
       'brainsegvol_norm', 'superiorfrontal_avg', 'caudalmiddlefrontal_avg',
       'rostralmiddlefrontal_avg', 'superiorparietal_avg',
       'caudalanteriorcingulate_avg', 'posteriorcingulate_avg',
       'entorhinal_avg', 'parahippocampal_avg', 'insula_avg',
       'supramarginal_avg', 'APOE', 'COHORT_DEFINITION'],
      dtype='object')

In [237]:
# Merge UPDRS+MoCA+MRI+APOE4+Age+Dx
final_df_all = pd.merge(
    final_df_APOE_Age,
    Age_df,
    on=["PATNO"],
    how="left"
)

In [238]:
final_df_all.head()

Unnamed: 0,PATNO,EVENT_ID_x,NP1RTOT,NP2PTOT,NP3TOT,NP1DDS,NP1APAT,NP2WALK,NP2FREZ,NP3BRADY,...,caudalanteriorcingulate_avg,posteriorcingulate_avg,entorhinal_avg,parahippocampal_avg,insula_avg,supramarginal_avg,APOE,COHORT_DEFINITION,EVENT_ID_y,AGE_AT_VISIT
0,3000,BL,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,656.5,902.0,513.0,713.0,2522.0,3805.5,E3/E3,Healthy Control,BL,69.1
1,3000,BL,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,656.5,902.0,513.0,713.0,2522.0,3805.5,E3/E3,Healthy Control,R17,80.5
2,3000,BL,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,656.5,902.0,513.0,713.0,2522.0,3805.5,E3/E3,Healthy Control,R18,81.4
3,3000,BL,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,656.5,902.0,513.0,713.0,2522.0,3805.5,E3/E3,Healthy Control,SC,69.1
4,3000,BL,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,656.5,902.0,513.0,713.0,2522.0,3805.5,E3/E3,Healthy Control,V01,69.4


In [239]:
final_df_all.shape

(489408, 38)

In [245]:
final_df_all.columns

Index(['PATNO', 'EVENT_ID_x', 'NP1RTOT', 'NP2PTOT', 'NP3TOT', 'NP1DDS',
       'NP1APAT', 'NP2WALK', 'NP2FREZ', 'NP3BRADY', 'NP3PSTBL', 'NP3RIGN',
       'NP3GAIT', 'MCATOT', 'hippocampus_norm', 'amygdala_norm',
       'thalamus_norm', 'caudate_norm', 'putamen_norm', 'accumbens_norm',
       'lateral_ventricle_norm', '3rd_ventricle_norm', '4th_ventricle_norm',
       'brainsegvol_norm', 'superiorfrontal_avg', 'caudalmiddlefrontal_avg',
       'rostralmiddlefrontal_avg', 'superiorparietal_avg',
       'caudalanteriorcingulate_avg', 'posteriorcingulate_avg',
       'entorhinal_avg', 'parahippocampal_avg', 'insula_avg',
       'supramarginal_avg', 'APOE', 'COHORT_DEFINITION', 'EVENT_ID_y',
       'AGE_AT_VISIT'],
      dtype='object')

In [246]:
# Drop duplicate columns (keep '_x' as an example)
cols_to_drop = [col for col in final_df_all.columns if col.endswith('_y')]
final_df_all = final_df_all.drop(columns=cols_to_drop)

# Rename remaining columns (remove '_x' suffix)
final_df_all.columns = [col.replace('_x', '') if '_x' in col else col for col in final_df_all.columns]

In [249]:
final_df_all.columns

Index(['PATNO', 'EVENT_ID', 'NP1RTOT', 'NP2PTOT', 'NP3TOT', 'NP1DDS',
       'NP1APAT', 'NP2WALK', 'NP2FREZ', 'NP3BRADY', 'NP3PSTBL', 'NP3RIGN',
       'NP3GAIT', 'MCATOT', 'hippocampus_norm', 'amygdala_norm',
       'thalamus_norm', 'caudate_norm', 'putamen_norm', 'accumbens_norm',
       'lateral_ventricle_norm', '3rd_ventricle_norm', '4th_ventricle_norm',
       'brainsegvol_norm', 'superiorfrontal_avg', 'caudalmiddlefrontal_avg',
       'rostralmiddlefrontal_avg', 'superiorparietal_avg',
       'caudalanteriorcingulate_avg', 'posteriorcingulate_avg',
       'entorhinal_avg', 'parahippocampal_avg', 'insula_avg',
       'supramarginal_avg', 'APOE', 'COHORT_DEFINITION', 'AGE_AT_VISIT'],
      dtype='object')

In [250]:
final_df_all.shape

(489408, 37)

In [253]:
# Save the full dataset (all visits)
final_df_all.to_csv("full_pd_dataset_with_duplicates_resolved.csv", index=False)

**Ouptut**

The end of this preprocessing script output is a large PPMI dataset cosisting of selected merged PPMI features.

 MRI,
 Age,
 Clinical Scores,
 Genotype.

 **Next Steps**
 Futher preprocessing,
 Clean the data,
 Select by baseline,
 Handle missing data
 Prepare data for Machine Learning Pipeline


