In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt 
import os
from neuroCombat import neuroCombat

In [None]:
## switch into the folder that holds the tabular ABCD data; no trailing Windows
## separator, so the call is also valid on macOS/Linux
## NOTE: the path is relative, so re-running this cell in the same kernel
## resolves it against the new working directory
os.chdir('ABCDTabular')

In [None]:
## read in mri meta information file (whitespace-delimited text);
## raw string so that the regex '\s+' is not parsed as an invalid string escape
scansite = pd.read_csv('abcd_mri01.txt', sep=r'\s+')

In [None]:
## read training and test data for menarche classification and order both by
## subject id; ignore_index=True renumbers the rows 0..n-1 after sorting.
## os.path.join builds the same '..\processedData\...' path on Windows and a
## valid path on other systems
traindata = pd.read_csv(
    os.path.join('..', 'processedData', 'menarcheTrain_redONLYSMRI.csv')
).sort_values(by='subjectkey', ignore_index=True)
testdata = pd.read_csv(
    os.path.join('..', 'processedData', 'menarcheTest_redONLYSMRI.csv')
).sort_values(by='subjectkey', ignore_index=True)

In [None]:
## the first data row is the cleartext description of each column:
## keep it for reference (dictscan), then remove the row labelled 0 from the data
## NOTE: re-running this cell would pick up the next row as "description"
dictscan = scansite.iloc[0]
scansite = scansite.drop(labels=0, axis=0)

In [None]:
## display the cleartext description of every column of the mri meta file
dictscan

In [None]:
## get scansite info for train and test subjects respectively;
## .copy() gives each subset its own data, because the 'interview_age' column of
## these subsets is overwritten further down (avoids SettingWithCopyWarning)
scansiteTrain = scansite[scansite['subjectkey'].isin(traindata['subjectkey'])].copy()
scansiteTest = scansite[scansite['subjectkey'].isin(testdata['subjectkey'])].copy()

In [None]:
## count how often every mri device (serial number) occurs and store the result
## as a dataframe with a 'count' column - for all rows and for the train/test subsets
scansitecounts = (
    scansite['mri_info_deviceserialnumber']
    .value_counts()
    .reset_index(name='count')
)
scansitecountsTrain = (
    scansiteTrain['mri_info_deviceserialnumber']
    .value_counts()
    .reset_index(name='count')
)
scansitecountsTest = (
    scansiteTest['mri_info_deviceserialnumber']
    .value_counts()
    .reset_index(name='count')
)

In [None]:
## give a column called 'index' the original device column name again so that
## the later merges on that column can work
scansitecounts = scansitecounts.rename(
    {'index': 'mri_info_deviceserialnumber'}, axis='columns')
scansitecountsTrain = scansitecountsTrain.rename(
    {'index': 'mri_info_deviceserialnumber'}, axis='columns')
scansitecountsTest = scansitecountsTest.rename(
    {'index': 'mri_info_deviceserialnumber'}, axis='columns')

In [None]:
## turn age into float to be able to calculate means and divide it by 12
## (presumably months -> years; confirm against the data dictionary)
## NOTE: re-running this cell divides the already converted values by 12 again
import math

## .assign returns a new dataframe instead of writing into the filtered subsets,
## which avoids pandas' SettingWithCopyWarning
scansite = scansite.assign(
    interview_age=scansite['interview_age'].astype(float) / 12)
scansiteTrain = scansiteTrain.assign(
    interview_age=scansiteTrain['interview_age'].astype(float) / 12)
scansiteTest = scansiteTest.assign(
    interview_age=scansiteTest['interview_age'].astype(float) / 12)

In [None]:
## mean age per mri device as a dataframe, ordered from the highest to the
## lowest mean - for all rows and for the train/test subsets
df_mean = (
    scansite.groupby('mri_info_deviceserialnumber')['interview_age']
    .mean()
    .reset_index()
    .sort_values('interview_age', ascending=False)
)

df_meanTrain = (
    scansiteTrain.groupby('mri_info_deviceserialnumber')['interview_age']
    .mean()
    .reset_index()
    .sort_values('interview_age', ascending=False)
)

df_meanTest = (
    scansiteTest.groupby('mri_info_deviceserialnumber')['interview_age']
    .mean()
    .reset_index()
    .sort_values('interview_age', ascending=False)
)

In [None]:
## show mean age and number of scans side by side for every mri device
pd.merge(df_mean, scansitecounts, how='outer', on='mri_info_deviceserialnumber')

In [None]:
## same overview (mean age + scan count per device) for the training subjects
pd.merge(df_meanTrain, scansitecountsTrain, how='outer', on='mri_info_deviceserialnumber')

In [None]:
## same overview (mean age + scan count per device) for the test subjects
pd.merge(df_meanTest, scansitecountsTest, how='outer', on='mri_info_deviceserialnumber')

### Harmonize Train and Test Data separately

In [None]:
## get scanner data from only 2-year follow-up and sort it by subjectkeys to
## bring it in the same order as the MRI data; the index is renumbered AFTER
## sorting (ignore_index=True) so it runs 0..n-1 in subject order
scansiteTest = (
    scansiteTest.loc[scansiteTest['eventname'] == '2_year_follow_up_y_arm_1']
    .sort_values(by='subjectkey', ignore_index=True)
)
scansiteTrain = (
    scansiteTrain.loc[scansiteTrain['eventname'] == '2_year_follow_up_y_arm_1']
    .sort_values(by='subjectkey', ignore_index=True)
)

## sort train and test data by subjectkeys (index labels are kept as they are)
testdata = testdata.sort_values(by='subjectkey')
traindata = traindata.sort_values(by='subjectkey')

In [None]:
## pull subject id and menarche variable (pds_f5_y_P) out of the train & test dfs
menarcheTrain = traindata.loc[:, ['subjectkey', 'pds_f5_y_P']]
menarcheTest = testdata.loc[:, ['subjectkey', 'pds_f5_y_P']]

In [None]:
## merge scanner with menarche data; validate='one_to_one' makes pandas raise a
## MergeError if a subject occurs more than once on either side
scan_menTrain = scansiteTrain.merge(
    menarcheTrain, how='outer', on='subjectkey', validate='one_to_one')
scan_menTest = scansiteTest.merge(
    menarcheTest, how='outer', on='subjectkey', validate='one_to_one')

## the merged rows are used further down as covariates for MRI arrays that are
## taken row by row from traindata/testdata, so both must list the same subjects
## in the same order and every subject needs a scanner serial number
for split_name, merged_df, mri_df in (('train', scan_menTrain, traindata),
                                      ('test', scan_menTest, testdata)):
    if merged_df['subjectkey'].tolist() != mri_df['subjectkey'].tolist():
        raise ValueError(
            f'{split_name}: subjects of the scanner/menarche merge do not match '
            'the rows of the MRI data (missing, extra or reordered subjects)')
    if merged_df['mri_info_deviceserialnumber'].isna().any():
        raise ValueError(
            f'{split_name}: some subjects have no scanner serial number '
            'for the 2-year follow-up')

In [None]:
## split the training data into one dataframe per MRI modality (selected by
## column-name pattern) because harmonisation is run for each modality individually
thicknessTrain, areaTrain, volumeTrain, subcorticalTrain = (
    traindata.filter(regex=pattern)
    for pattern in (
        ".*smri_thick_cdk.*",
        ".*smri_area_cdk.*",
        ".*smri_vol_cdk.*",
        ".*smri_vol_scs.*",
    )
)

In [None]:
## split the test data into one dataframe per MRI modality (selected by
## column-name pattern) because harmonisation is run for each modality individually
thicknessTest, areaTest, volumeTest, subcorticalTest = (
    testdata.filter(regex=pattern)
    for pattern in (
        ".*smri_thick_cdk.*",
        ".*smri_area_cdk.*",
        ".*smri_vol_cdk.*",
        ".*smri_vol_scs.*",
    )
)

In [None]:
## numpy arrays of the training modalities, collected in one list so that the
## harmonization can loop over them (order: thickness, area, volume, subcortical)
arraylistTrain = [
    modality_df.values
    for modality_df in (thicknessTrain, areaTrain, volumeTrain, subcorticalTrain)
]
thickness_arrayTr, area_arrayTr, volume_arrayTr, subcortical_arrayTr = arraylistTrain

## matching column names of every modality, in the same order as the arrays
columnslistTrain = [
    modality_df.columns
    for modality_df in (thicknessTrain, areaTrain, volumeTrain, subcorticalTrain)
]
thickness_columnsTr, area_columnsTr, volume_columnsTr, subcortical_columnsTr = columnslistTrain

In [None]:
## numpy arrays of the test modalities, collected in one list so that the
## harmonization can loop over them (order: thickness, area, volume, subcortical)
arraylistTest = [
    modality_df.values
    for modality_df in (thicknessTest, areaTest, volumeTest, subcorticalTest)
]
thickness_arrayTs, area_arrayTs, volume_arrayTs, subcortical_arrayTs = arraylistTest

## matching column names of every modality, in the same order as the arrays
columnslistTest = [
    modality_df.columns
    for modality_df in (thicknessTest, areaTest, volumeTest, subcorticalTest)
]
thickness_columnsTs, area_columnsTs, volume_columnsTs, subcortical_columnsTs = columnslistTest

In [None]:
## column that identifies the scanner (device serial number); handed to the
## harmonization as the batch variable
batchcol = "mri_info_deviceserialnumber"

In [None]:
## covariate dataframes for the harmonization: scanner serial number and
## menarche variable only, subjectkey left out
demogandscannerTrain = scan_menTrain.loc[:, ['mri_info_deviceserialnumber', 'pds_f5_y_P']]

demogandscannerTest = scan_menTest.loc[:, ['mri_info_deviceserialnumber', 'pds_f5_y_P']]

In [None]:
from neuroCombat import neuroCombat

## method to perform harmonization for every modality (volume, thickness etc.)
def site_harmonization(array_list, column_list, site_df, site_var,
                       categorical_cols=None, continuous_cols=None):
    '''
    This function performs Combat harmonization per MRI modality
    Input:
    - array_list: list of arrays containing area, volume etc. data with shape
    (subjects x features). Note: Combat expects shape (features x subjects),
    so every array is transposed before the call and transposed back afterwards
    - column_list: list of column labels, one entry per array in array_list
    (same order), used to name the columns of the returned dataframe
    - site_df: dataframe containing demographic and site information, one row
    per subject in the same order as the rows of the arrays
    - site_var: variable that encodes MRI site information (here: scanner number)
    - categorical_cols / continuous_cols: optional column names of site_df that
    are forwarded to neuroCombat as covariates. With the default (None) only
    site_df and site_var are passed on, exactly as before
    ------
    Output:
    - array_list_combat: combat harmonized MRI data with shape of input array_list
    - dataframe: combat harmonized MRI data as a pandas dataframe (all
    modalities side by side)
    ------
    Raises:
    - ValueError if array_list and column_list differ in length or if an array
    does not have one row per row of site_df
    '''
    if len(array_list) != len(column_list):
        raise ValueError(
            'array_list and column_list must have the same number of entries '
            f'(got {len(array_list)} and {len(column_list)})')

    # forward the optional covariate names only when they were given, so the
    # default call to neuroCombat stays unchanged
    combat_kwargs = {}
    if categorical_cols is not None:
        combat_kwargs['categorical_cols'] = categorical_cols
    if continuous_cols is not None:
        combat_kwargs['continuous_cols'] = continuous_cols

    n_subjects = len(site_df)
    array_list_combat = []
    modality_frames = []

    # loop through all modalities together with their column names
    for position, (array, columns) in enumerate(zip(array_list, column_list)):
        if array.shape[0] != n_subjects:
            raise ValueError(
                f'array {position} has {array.shape[0]} rows but site_df has '
                f'{n_subjects} rows; both must describe the same subjects')
        # perform harmonization on single modality (call neuroCombat method (https://github.com/Jfortin1/neuroCombat))
        harmonized = neuroCombat(
            dat=array.T, covars=site_df, batch_col=site_var, **combat_kwargs)["data"].T
        # collect result of this modality as array and as dataframe
        array_list_combat.append(harmonized)
        modality_frames.append(pd.DataFrame(harmonized, columns=columns))

    # concatenate all harmonized modalities once, column-wise
    dataframe = pd.concat(modality_frames, axis=1) if modality_frames else pd.DataFrame()

    return array_list_combat, dataframe

In [None]:
## run the per-modality harmonization on the training data; returns the
## harmonized arrays as a list and all modalities combined in one dataframe
arraylistcombatTrain, harmonizedDFTrain = site_harmonization(
    array_list=arraylistTrain,
    column_list=columnslistTrain,
    site_df=demogandscannerTrain,
    site_var=batchcol,
)

In [None]:
## run the per-modality harmonization on the test data; returns the
## harmonized arrays as a list and all modalities combined in one dataframe
arraylistcombatTest, harmonizedDFTest = site_harmonization(
    array_list=arraylistTest,
    column_list=columnslistTest,
    site_df=demogandscannerTest,
    site_var=batchcol,
)

In [None]:
# put subjectkeys back into harmonized mri dataframe; the harmonized rows were
# built row by row from traindata/testdata, so the keys are attached by
# position (to_numpy) rather than by index label - a length mismatch raises
# instead of silently producing missing keys
harmonizedDFTrain['subjectkey'] = traindata['subjectkey'].to_numpy()

harmonizedDFTest['subjectkey'] = testdata['subjectkey'].to_numpy()

In [None]:
## attach menarche variable and scanner information to the harmonized MRI
## data again, matched on subjectkey
harmonizedDFTest_m = pd.merge(
    harmonizedDFTest, scan_menTest, how='outer', on='subjectkey')
harmonizedDFTrain_m = pd.merge(
    harmonizedDFTrain, scan_menTrain, how='outer', on='subjectkey')

In [None]:
## attach just the menarche variable (no scanner information) to the
## harmonized MRI data, matched on subjectkey
harmonizedDFTest = pd.merge(
    harmonizedDFTest, menarcheTest, how='outer', on='subjectkey')
harmonizedDFTrain = pd.merge(
    harmonizedDFTrain, menarcheTrain, how='outer', on='subjectkey')

In [None]:
## save harmonized test data incl. scanner information; os.path.join yields the
## same '..\processedData\...' path on Windows and a valid path elsewhere
harmonizedDFTest_m.to_csv(
    os.path.join('..', 'processedData', 'harmonizedTestdata_plusscannerdfONLYSMRI.csv'),
    index=False)

In [None]:
## save harmonized training data incl. scanner information; os.path.join yields
## the same '..\processedData\...' path on Windows and a valid path elsewhere
harmonizedDFTrain_m.to_csv(
    os.path.join('..', 'processedData', 'harmonizedTraindata_plusscannerdfONLYSMRI.csv'),
    index=False)