This notebook performs scanner harmonization using [neuroCombat](https://github.com/Jfortin1/neuroCombat) (version 0.2.12).

In [42]:
import os

from neuroCombat import neuroCombat

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Folder (relative path) that the input table is read from and that all
# output files of this notebook are written to.
data_dir = 'data'

# Data preparation

Load the data sheet that contains the thalamic nuclei volumes, the covariates, and the total intracranial volume (TIV) information after subject and nuclei exclusion.

In [43]:
# Read the per-subject table; the first CSV column is used as the row index.
stats_csv = os.path.join(data_dir, 'thalamus_stats_exclsubs.csv')
df = pd.read_csv(stats_csv, sep=',', index_col=0)
df

Unnamed: 0_level_0,Left-AV,Left-CL,Left-CM,Left-CeM,Left-LD,Left-LP,Left-MDl,Left-MDm,Left-MV(Re),Left-Pc,...,atm2,atm3,Vent_dur_excl_zero,hemo,Age_at_scan,GA,BW,Scanner_ID,EstimatedTotalIntraCranialVol,TotalGrayVol
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BEST-BN-001,118.768534,36.140232,226.106870,66.512654,41.552272,147.987079,247.331292,761.177337,14.889298,3.480652,...,,,,,25.487671,38.0,3360.0,1,1.455519e+06,613094.623435
BEST-BN-002,133.201816,38.289918,240.742002,59.981622,39.535740,144.277092,270.039012,809.110424,12.622232,4.030428,...,0.0,0.0,,0.0,25.706849,30.0,1650.0,1,1.591702e+06,673738.828692
BEST-BN-004,152.779061,45.771427,280.919993,68.159510,37.400778,138.539143,287.620203,850.149134,15.164456,3.643198,...,,,,,25.643836,41.0,3470.0,1,1.546094e+06,667586.391249
BEST-BN-006,133.487379,33.118877,262.077472,72.482332,28.547389,114.361723,315.702712,870.938413,14.152003,3.968036,...,0.0,11.0,11.0,0.0,25.926027,31.0,1060.0,1,1.402174e+06,607113.061988
BEST-BN-008,169.298289,53.541844,299.876873,82.171782,57.381844,171.357372,325.752350,960.995924,18.526644,4.393606,...,,,,,26.131507,39.0,2120.0,1,1.660666e+06,693850.563884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BEST-MUC-150,169.017279,46.765600,302.716720,77.579887,34.241289,154.153372,346.151490,983.571077,13.779343,4.549750,...,,,,,28.041096,40.0,4670.0,4,1.838159e+06,729737.949935
BEST-MUC-151,116.334741,27.549775,238.082712,77.528147,23.451204,84.918997,408.963638,1125.098453,16.192208,4.116030,...,,,,,28.309589,40.0,3750.0,4,1.521013e+06,614574.979229
BEST-MUC-152,145.538973,37.207616,221.002605,64.722686,30.558595,131.099176,331.858370,1024.618297,11.621355,3.556432,...,,,,,27.734247,39.0,2610.0,4,1.574888e+06,624068.285618
BEST-MUC-153,145.362903,30.319255,223.045079,65.882500,29.602832,111.056765,309.540387,972.438172,10.782618,4.028710,...,,,,,28.468493,39.0,3400.0,4,1.839361e+06,575601.400530


In [44]:
# Columns whose name contains 'Left' or 'Right' form the ComBat input matrix.
left_cols = df.filter(like='Left').columns
right_cols = df.filter(like='Right').columns
nuclei = left_cols.union(right_cols)
data = df.loc[:, nuclei]
print("Data shape:", data.shape)

# Every remaining column is kept as a covariate.
covars = df.loc[:, df.columns.difference(nuclei)]

Data shape: (175, 46)


In [45]:
# Keep the ROI (column) names and subject IDs (row labels) for relabelling the
# ComBat output later; the ROI names are also written to disk, one per line.
rois = data.columns.tolist()
roi_file = os.path.join(data_dir, 'roi_names.txt')
with open(roi_file, 'w') as file:
    file.writelines(str(roi) + '\n' for roi in rois)

subjects = data.index.tolist()

In [46]:
# Combat needs the matrix transposed: drop the labels and flip it so that
# ROIs are rows and subjects are columns.
data_val = pd.DataFrame(data.to_numpy().T)
print("Data shape after transposing:", data_val.shape)

Data shape after transposing: (46, 175)


# Scanner harmonization of eTIV

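TIV itself must also be corrected for the scanner effect. It has to be harmonized separately from the other ROIs, because TIV is preserved as a covariate in the ComBat model used for those ROIs. Total gray matter volume is harmonized in the same step.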

In [47]:
# Put the two global volumes into rows (one column per subject) for Combat.
tiv_gm_cols = ['EstimatedTotalIntraCranialVol', 'TotalGrayVol']
GM_TIV = pd.DataFrame(covars[tiv_gm_cols].to_numpy().T)

print(GM_TIV.shape)
GM_TIV.head()

(2, 175)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
0,1455519.0,1591702.0,1546094.0,1402174.0,1660666.0,1665921.0,1608302.0,1509680.0,1568594.0,1581962.0,...,1447725.0,1516066.0,1426657.0,1368818.0,1673638.0,1838159.0,1521013.0,1574888.0,1839361.0,1696146.0
1,613094.6,673738.8,667586.4,607113.1,693850.6,690349.2,676198.4,623975.6,673670.3,685504.7,...,568956.4,618100.3,575637.7,537686.6,649448.0,729737.9,614575.0,624068.3,575601.4,693890.4


In [48]:
# Harmonize eTIV and total gray-matter volume across scanners while keeping
# the effects of sex, blsgroup and age at scan in the data.
#
# neuroCombat pairs the columns of `dat` with the rows of `covars` purely by
# position, so both must describe the same number of subjects.
assert GM_TIV.shape[1] == len(covars), "GM_TIV columns and covars rows differ in length"

# NOTE(review): only two features enter this model; consider whether the
# empirical-Bayes step (default eb=True) is appropriate here — confirm.
tiv_combat = neuroCombat(dat=GM_TIV,
                         covars=covars,
                         # batch_col is documented as a single column name (string);
                         # a one-element list only works by numpy broadcasting
                         batch_col='Scanner_ID',
                         categorical_cols=['sex', 'blsgroup'],
                         continuous_cols=['Age_at_scan'])['data']

[neuroCombat] Creating design matrix
[neuroCombat] Standardizing data across features
[neuroCombat] Fitting L/S model and finding priors
[neuroCombat] Finding parametric adjustments
[neuroCombat] Final adjustment of data


In [49]:
# Back to one row per subject; row 0 of the Combat input was eTIV and row 1
# total gray-matter volume, hence the column order below.
tiv_combat = pd.DataFrame(tiv_combat).T
tiv_combat.columns = ['TIV_corrected', 'GMV_corrected']
tiv_combat = tiv_combat.assign(Subject=subjects)  # add SubjectID column

In [50]:
# Replace the uncorrected TIV and GMV columns with their harmonized versions.
covars = covars.drop(columns=['EstimatedTotalIntraCranialVol', 'TotalGrayVol'])

# validate='one_to_one' makes pandas raise a MergeError if a subject ID occurs
# more than once on either side. Without it, duplicated IDs would silently add
# rows and break the positional subject order that the following ComBat call
# on the nuclei volumes depends on.
covars = pd.merge(covars, tiv_combat, on='Subject', how="left", validate='one_to_one')
print(covars.shape)

(175, 32)


# Scanner harmonization of thalamus nuclei volumes

Next, the thalamic nuclei volumes are corrected for scanner effects, while preserving `TIV_corrected` (in addition to sex, age, and diagnosis) as biological variables.

In [51]:
# Harmonize the nuclei volumes across scanners while keeping the effects of
# sex, blsgroup, age at scan and the harmonized TIV in the data.
#
# neuroCombat pairs the columns of `dat` with the rows of `covars` purely by
# position, so both must describe the same number of subjects.
assert data_val.shape[1] == len(covars), "data_val columns and covars rows differ in length"

combat = neuroCombat(dat=data_val,
                     covars=covars,
                     # batch_col is documented as a single column name (string);
                     # a one-element list only works by numpy broadcasting
                     batch_col='Scanner_ID',
                     categorical_cols=['sex', 'blsgroup'],
                     continuous_cols=['Age_at_scan', 'TIV_corrected'])['data']

[neuroCombat] Creating design matrix
[neuroCombat] Standardizing data across features
[neuroCombat] Fitting L/S model and finding priors
[neuroCombat] Finding parametric adjustments
[neuroCombat] Final adjustment of data


In [52]:
# Wrap the harmonized array in a DataFrame.
# NOTE(review): the following cell rebuilds combat_output from `combat` again,
# so this cell looks redundant — confirm and remove.
combat_output = pd.DataFrame(data=combat)

In [53]:
# Turn the harmonized array back into a labelled table: one row per subject,
# one column per ROI, plus the subject ID as a regular column.
combat_output = pd.DataFrame(combat.T, columns=rois)
combat_output = combat_output.assign(Subject=subjects)
combat_output.head(3)

Unnamed: 0,Left-AV,Left-CL,Left-CM,Left-CeM,Left-LD,Left-LP,Left-MDl,Left-MDm,Left-MV(Re),Left-Pc,...,Right-PuL,Right-PuM,Right-VA,Right-VAmc,Right-VLa,Right-VLp,Right-VM,Right-VPL,Right-Whole_thalamus,Subject
0,108.987197,32.940172,214.734026,62.727183,38.253734,140.918801,235.673108,728.019139,14.133652,3.27801,...,143.759018,1048.90825,454.14801,33.009275,581.939144,723.119299,18.337792,677.55913,6225.124147,BEST-BN-001
1,124.311589,35.156902,228.722334,56.147848,36.369998,137.758627,258.356131,776.737547,11.811479,3.858323,...,179.755594,896.38779,457.408144,33.405715,696.380328,865.592828,25.076973,807.354964,6580.098469,BEST-BN-002
2,144.96762,43.005783,268.11035,64.367554,33.860415,129.496519,275.909071,818.483024,14.442932,3.451785,...,143.151764,908.774362,395.516976,29.153522,617.965233,802.53969,19.199389,816.432781,6174.006278,BEST-BN-004


In [54]:
# Attach the covariates to the harmonized volumes and index the result by subject.
combat_output_all = (
    combat_output
    .merge(covars, on='Subject', how="left")
    .set_index('Subject')
)
print(combat_output_all.shape)

(175, 77)


In [55]:
# Write the harmonized volumes plus covariates to disk, keeping the subject index.
corrected_csv = os.path.join(data_dir, 'NeuroCombatCorrected_thalamusseg.csv')
combat_output_all.to_csv(corrected_csv, index=True)