In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os

In [None]:
## Folder containing tabulated MRI data from the PNC study.
## Only change directory when we are not already inside it, so that re-running
## this cell in a live kernel does not fail looking for a nested 'regularFSstats'.
if os.path.basename(os.getcwd()) != 'regularFSstats':
    os.chdir('regularFSstats')

In [None]:
## Read area, thickness, volume and subcortical volume data into data frames.
## The tables are split on runs of whitespace. The separator is a raw string so
## that "\s" reaches the regex engine untouched instead of being parsed as an
## (invalid) Python string escape, which newer Python versions warn about.
WHITESPACE_SEP = r'\s+'

arearhPNC = pd.read_csv('rh.area.txt', sep=WHITESPACE_SEP)
arealhPNC = pd.read_csv('lh.area.txt', sep=WHITESPACE_SEP)
thickrhPNC = pd.read_csv('rh.thicknessstd.txt', sep=WHITESPACE_SEP)
thicklhPNC = pd.read_csv('lh.thicknessstd.txt', sep=WHITESPACE_SEP)
volumerhPNC = pd.read_csv('rh.volume.txt', sep=WHITESPACE_SEP)
volumelhPNC = pd.read_csv('lh.volume.txt', sep=WHITESPACE_SEP)
subcorticalPNC = pd.read_csv('subcorticalstats.txt', sep=WHITESPACE_SEP)

In [None]:
## Give every PNC table the same subject-identifier column name ('ID') so the
## tables can be merged on it later.
FSlist = [
    arearhPNC, arealhPNC,
    thickrhPNC, thicklhPNC,
    volumerhPNC, volumelhPNC,
    subcorticalPNC,
]

## Any column whose name contains 'aparc' is renamed to 'ID' (the frames are modified in place)
for fs in FSlist:
    renamed_cols = fs.columns.str.replace(".*aparc.*", "ID", regex=True)
    fs.columns = renamed_cols

## The subcortical table labels its identifier column 'Measure:volume' instead
subcorticalPNC.rename(columns={'Measure:volume': 'ID'}, inplace=True)

In [None]:
## Join left- and right-hemisphere tables per measure on the shared 'ID' column.
## Outer joins keep subjects that appear in only one hemisphere's table.
areaPNC = pd.merge(arealhPNC, arearhPNC, on='ID', how='outer')
thickPNC = pd.merge(thicklhPNC, thickrhPNC, on='ID', how='outer')
volPNC = pd.merge(volumelhPNC, volumerhPNC, on='ID', how='outer')

In [None]:
## Load the harmonised ABCD structural-MRI tables for both the test and the
## training split. Paths are built with os.path.join so the separators are
## correct on every OS (hard-coded '..\\dir\\file' strings only resolve on Windows).

## Test split
MRITest = pd.read_csv(
    os.path.join(os.pardir, 'processedData', 'harmonizedTestdata_plusscannerdfONLYSMRI.csv')
)

## Training split
MRITrain = pd.read_csv(
    os.path.join(os.pardir, 'processedData', 'menarcheTrain_harm_red_matchedAgeScannerONLYSMRI.csv')
)

## Subject keys of each split as plain lists
testSubs = MRITest['subjectkey'].to_list()
trainSubs = MRITrain['subjectkey'].to_list()

## Stack both splits (training rows first); each split keeps its own row labels
MRI = pd.concat([MRITrain, MRITest])

In [None]:
## Load the tabulated ABCD sMRI file. The path is assembled with os.path.join so
## it works on any OS, and the whitespace separator is a raw-string regex so that
## "\s" is not treated as an (invalid) Python string escape.
subABCD = pd.read_csv(
    os.path.join(os.pardir, 'ABCDTabular', 'abcd_smrip10201.txt'),
    sep=r'\s+',
)

## Keep only subjects that are present in the combined train/test table ...
subABCD = subABCD[subABCD['subjectkey'].isin(MRI['subjectkey'])]
## ... and, of those, only the rows recorded for the 2-year follow-up event
subABCD = subABCD[subABCD['eventname'] == '2_year_follow_up_y_arm_1']

In [None]:
## Split the ABCD data into one frame per MRI measure. Columns are selected by
## name pattern and 'subjectkey' is attached to every frame as the join key.
abcd_keys = MRI['subjectkey']

thickABCD = MRI.filter(regex=".*smri_thick_cdk.*").assign(subjectkey=abcd_keys)
areaABCD = MRI.filter(regex=".*smri_area_cdk.*").assign(subjectkey=abcd_keys)
volABCD = MRI.filter(regex=".*smri_vol_cdk.*").assign(subjectkey=abcd_keys)

## Subcortical volumes come from the separately loaded tabular file
subcorticalABCD = subABCD.filter(regex=".*vol_scs.*").assign(subjectkey=subABCD['subjectkey'])

In [None]:
## Remove the suffixed (_x / _y) columns produced by the hemisphere merges
merge_leftovers = ['BrainSegVolNotVent_y', 'BrainSegVolNotVent_x', 'eTIV_y', 'eTIV_x']

for frame in (thickPNC, volPNC):
    frame.drop(columns=merge_leftovers, inplace=True)

## The area table additionally loses the per-hemisphere white-surface-area columns
areaPNC.drop(
    columns=merge_leftovers + ['rh_WhiteSurfArea_area', 'lh_WhiteSurfArea_area'],
    inplace=True,
)

In [None]:
## Inspect the column names of the PNC subcortical table
subcorticalPNC.columns

In [None]:
## Inspect the column names of the ABCD subcortical table
subcorticalABCD.columns

In [None]:
## Remove ABCD subcortical columns that are not used as features
## (e.g. the intracranial volume column); the frame is modified in place.
abcd_unused_cols = [
    'smri_vol_scs_wholeb',
    'smri_vol_scs_latventricles',
    'smri_vol_scs_allventricles',
    'smri_vol_scs_intracranialv',
    'smri_vol_scs_lesionlh',
    'smri_vol_scs_lesionrh',
]
subcorticalABCD.drop(columns=abcd_unused_cols, inplace=True)

In [None]:
## Remove PNC subcortical columns that are not used as features
## (e.g. the estimated total intracranial volume column).
pnc_unused_cols = [
    'Left-vessel', 'Left-choroid-plexus', 'Right-vessel', 'Right-choroid-plexus',
    '5th-Ventricle',
    'non-WM-hypointensities', 'Left-non-WM-hypointensities', 'Right-non-WM-hypointensities',
    'Optic-Chiasm',
    'BrainSegVol', 'BrainSegVolNotVent',
    'lhCortexVol', 'rhCortexVol', 'CortexVol',
    'CerebralWhiteMatterVol', 'TotalGrayVol', 'SupraTentorialVolNotVent', 'MaskVol',
    'BrainSegVol-to-eTIV', 'MaskVol-to-eTIV',
    'lhSurfaceHoles', 'rhSurfaceHoles', 'SurfaceHoles',
    'EstimatedTotalIntraCranialVol',
]
subcorticalPNC = subcorticalPNC.drop(columns=pnc_unused_cols)

In [None]:
## Load demographic PNC data to exclude subjects with med_rating >= 3.
## The file is ';'-delimited; the path is built with os.path.join so it resolves
## on any OS rather than only with Windows separators.
demographicsPNC = pd.read_csv(
    os.path.join(os.pardir, 'processedData', 'age_sex_med.csv'),
    sep=';',
)

In [None]:
## Load Euler data to exclude subjects with Euler number > 3 std below mean.
## The path is built with os.path.join so it resolves on any OS rather than
## only with Windows separators.
eulernumberPNC = pd.read_csv(
    os.path.join(os.pardir, 'processedData', 'allEulerPNC.csv')
)

In [None]:
## Delete bad Euler subjects: drop every subject whose mean Euler number
## (average of the two hemispheres) is more than 3 standard deviations below
## the sample mean.
eulernumberPNC['euler_mean'] = (eulernumberPNC['euler_lh'] + eulernumberPNC['euler_rh']) / 2

euler_center = eulernumberPNC['euler_mean'].mean()
euler_spread = eulernumberPNC['euler_mean'].std()
## Written as mean - 3*std so the cutoff is correct whatever the sign of the
## mean; a -(abs(mean) + 3*std) form matches this only while the mean is negative.
euler_cutoff = euler_center - 3 * euler_spread

delSubjects = eulernumberPNC[eulernumberPNC['euler_mean'] < euler_cutoff]
eulernumberPNC = eulernumberPNC.drop(index=delSubjects.index)

In [None]:
## Exclude subjects whose medical rating is 3 or higher
severe_mask = demographicsPNC['med_rating'] >= 3.0
todelete = demographicsPNC.loc[severe_mask]
demographicsPNC = demographicsPNC.drop(index=todelete.index)

In [None]:
## Keep only subjects present in both the demographic and the Euler table,
## then join the two on a common 'subject' column.
in_euler = demographicsPNC['SUBJID'].isin(eulernumberPNC['subject'])
demog_red = demographicsPNC.loc[in_euler]

in_demog = eulernumberPNC['subject'].isin(demog_red['SUBJID'])
euler_red = eulernumberPNC.loc[in_demog]

## Align the identifier column name with the Euler table before merging
demog_red = demog_red.rename(columns={'SUBJID': 'subject'})
demogandeuler = pd.merge(demog_red, euler_red, on='subject', how='outer')

In [None]:
## Read the ';'-delimited table that flags which subcortical features to
## include (in accordance to previous work)
subfeatselect = pd.read_csv('subcorticalFeatureSelection.txt', delimiter=';')

In [None]:
## Reduce from 64 to 30 subcortical features: take the feature names flagged
## with include1_exclude0 == 1 ...
include_mask = subfeatselect['include1_exclude0'] == 1
## ... minus the entry at row label 63
## NOTE(review): which feature sits at label 63 is not visible here - confirm.
subfeat = subfeatselect.loc[include_mask, 'feature'].drop(index=63)

## Plain list of names, with the identifier column added at the end
subfeat = subfeat.to_list() + ['ID']

## Restrict the PNC subcortical data to exactly these columns
subcorticalPNC = subcorticalPNC[subfeat]

In [None]:
## Further ABCD subcortical columns that are removed from the feature set
drop_sub = [
    'smri_vol_scs_cbwmatterlh',
    'smri_vol_scs_csf',
    'smri_vol_scs_vedclh',
    'smri_vol_scs_wmhint',
    'smri_vol_scs_cbwmatterrh',
    'smri_vol_scs_vedcrh',
    'smri_vol_scs_wmhintlh',
    'smri_vol_scs_wmhintrh',
    'smri_vol_scs_suprateialv',
    'smri_vol_scs_subcorticalgv',
]

## Drop them in place (axis=1 addresses columns)
subcorticalABCD.drop(labels=drop_sub, axis=1, inplace=True)

In [None]:
## Combine area, volume, thickness and subcortical PNC tables into one frame
from functools import reduce


def _outer_join_on_id(left, right):
    """Outer-join two frames on their shared 'ID' column."""
    return pd.merge(left, right, on=['ID'], how='outer')


mergePNC = [areaPNC, volPNC, thickPNC, subcorticalPNC]

## Fold the list left to right, joining one table at a time
allPNC = reduce(_outer_join_on_id, mergePNC)
allPNC

In [None]:
## Combine the per-measure ABCD frames into one frame that holds only the MRI
## features plus 'subjectkey'
from functools import reduce

mergeABCD = [areaABCD, volABCD, thickABCD, subcorticalABCD]

## Start from the first frame and outer-join the remaining ones in order
allABCD = mergeABCD[0]
for right in mergeABCD[1:]:
    allABCD = pd.merge(allABCD, right, on=['subjectkey'], how='outer')
allABCD

In [None]:
## Remove subjects with bad Euler or Med rating: keep only the MRI rows whose
## ID survived the demographic/Euler filtering.
demogandeuler = demogandeuler.rename(columns={'subject': 'ID'})

## .copy() makes the result an independent frame, so that sorting it or adding
## columns to it later does not operate on a slice of allPNC
## (which pandas flags with SettingWithCopyWarning).
allPNCred = allPNC.loc[allPNC['ID'].isin(demogandeuler['ID'])].copy()

In [None]:
## Bring PNC MRI and demographic Df in same order and add sex to MRI data
allPNCred = allPNCred.sort_values(by='ID', ignore_index=True)
demogandeuler = demogandeuler.sort_values(by='ID', ignore_index=True)

## Look sex up by subject ID instead of copying it over by row position:
## positional assignment only pairs the right rows when both frames hold
## exactly the same IDs, once each, and silently mislabels subjects otherwise.
## drop_duplicates keeps the first row per ID so the lookup index is unique.
sex_by_id = (
    demogandeuler
    .drop_duplicates(subset='ID')
    .set_index('ID')['sex_from_pedigree_data']
)
allPNCred = allPNCred.assign(sex=allPNCred['ID'].map(sex_by_id))

In [None]:
## Recode sex from numeric codes to letters: 2.0 -> 'F', 1.0 -> 'M'.
## Any other value (including missing) is left as it is.
sex_codes = {2.0: 'F', 1.0: 'M'}

## A new column is built instead of writing strings into the numeric column
## through .loc, which pandas deprecates as an incompatible-dtype assignment.
allPNCred = allPNCred.assign(
    sex=allPNCred['sex'].map(lambda code: sex_codes.get(code, code))
)

In [None]:
## Join the PNC MRI features with the demographic/Euler table on 'ID'
## (outer join: IDs found in only one of the two frames are kept as well)
PNC_MRIandDemog = allPNCred.merge(demogandeuler, how='outer', on='ID')

In [None]:
## Make training feature Df by removing every column that belongs to
## demogandeuler. 'ID' is one of those columns, so it is removed as well;
## what remains are the columns contributed by allPNCred.
demog_cols = list(demogandeuler.columns)
PNCtraining = PNC_MRIandDemog.drop(columns=demog_cols)

In [None]:
## Training labels: an independent copy of the 'ageAtScan' column
PNClabels = PNC_MRIandDemog.loc[:, 'ageAtScan'].copy()

In [None]:
## Split the merged ABCD feature frame back into test and training subjects
in_test = allABCD['subjectkey'].isin(MRITest['subjectkey'])
in_train = allABCD['subjectkey'].isin(MRITrain['subjectkey'])

MRITestABCD = allABCD[in_test]
MRITrainABCD = allABCD[in_train]

In [None]:
## Write the ABCD training and test feature tables (without the row index).
## Paths are built with os.path.join: a literal '..\\dir\\file' string is only
## a path on Windows and would create an oddly named file elsewhere.
MRITrainABCD.to_csv(
    os.path.join(os.pardir, 'processedData', 'TrainAgePredFeaturesHarmonised.csv'),
    index=False,
)
MRITestABCD.to_csv(
    os.path.join(os.pardir, 'processedData', 'TestAgePredFeaturesHarmonised.csv'),
    index=False,
)

In [None]:
## Write the PNC training features and labels (without the row index).
## Paths are built with os.path.join: a literal '..\\dir\\file' string is only
## a path on Windows and would create an oddly named file elsewhere.
PNCtraining.to_csv(
    os.path.join(os.pardir, 'processedData', 'PNCtraining.csv'),
    index=False,
)
PNClabels.to_csv(
    os.path.join(os.pardir, 'processedData', 'PNClabels.csv'),
    index=False,
)

In [None]:
## Make table of all features used to save as csv file.
## Two tables are built: allFeatures2 from ABCD column names and allFeatures
## from PNC column names. Both get one column per measure, and the names are
## laid out by row position, so the statement order below matters.

## ABCD: subcortical volume columns, and the columns of each cortical measure
## whose name contains 'lh'
ABCD_subc = allABCD.filter(regex=".*_vol_scs_.*").copy()
thickABCDlh = thickABCD.filter(regex=".*lh.*").copy()
areaABCDlh = areaABCD.filter(regex=".*lh.*").copy()
volABCDlh = volABCD.filter(regex=".*lh.*").copy()
## The three name lists must have equal length for this constructor to succeed
allFeatures2 = pd.DataFrame({'Thickness': thickABCDlh.columns, 'Volume': volABCDlh.columns, 'Area': areaABCDlh.columns}) 

## PNC: column names of the right-hemisphere thickness and volume tables
allFeatures = pd.DataFrame({'Thickness': thickrhPNC.columns, 'Volume': volumerhPNC.columns}) 
## The area names are taken without 'rh_WhiteSurfArea_area'
## (presumably so the length matches the other two measures - TODO confirm)
allFeatures['Area'] = (arearhPNC.drop(columns = 'rh_WhiteSurfArea_area')).columns
subvols = pd.Series(ABCD_subc.columns, name='SubcorticalVolumes')
## Remove the first row (presumably the 'ID' entry - verify) and renumber from 0
allFeatures.drop(0, inplace = True)
allFeatures.reset_index(inplace = True, drop = True)
## The subcortical names are attached by row label to both tables
allFeatures['SubcorticalVolumes'] = subvols
allFeatures2['SubcorticalVolumes'] = subvols
## Remove the rows labelled 34 and 35
## NOTE(review): presumably non-regional summary entries - confirm which ones
allFeatures.drop([34,35], inplace = True)

In [None]:
## Display the feature table built from the PNC column names
allFeatures

In [None]:
## Display the feature table built from the ABCD column names
allFeatures2

In [None]:
#allFeatures.to_csv('..\\processedData\\FeatureList.csv')

In [None]:
#allFeatures2.to_csv('..\\processedData\\FeatureListABCD.csv')