# Extract structural data 
### script by Alina

In [2]:
import pandas as pd
import numpy as np
import os
import glob
import sys

In [3]:
#Dataset locations. Both strings end with '/' because other cells build file
#names by plain string concatenation (e.g. path_out + 'cortical_area.csv').

#path to folder with all subjects data
path = '/media/hcs-sci-psy-narun/OpenNEURO_adhd/ds002424-download/'
#folder the extracted tables are written to
path_out = '/media/hcs-sci-psy-narun/OpenNEURO_adhd/ds002424-download/3T_Structural/'

#exist_ok=True creates the folder only when it is missing, without a separate
#exists() check that another process could invalidate in between
os.makedirs(path_out, exist_ok=True)

In [4]:
#FreeSurfer labels used to index the extracted tables

#indexes for cortex, read as strings from the atlas header file
cortex_header_file = '/media/hcs-sci-psy-narun/BackupDataD800/Alina/atlases/destrieux2009_new_header_for_table_WITHOUT_med_wall.txt'
fs_cort_ind = np.loadtxt(cortex_header_file, dtype=str)

#indexes for subcortex (left/right pairs share a line; Brain-Stem has no pair)
fs_subc_ind = [
    'Left-Accumbens-area', 'Right-Accumbens-area',
    'Left-Amygdala', 'Right-Amygdala',
    'Brain-Stem',
    'Left-Caudate', 'Right-Caudate',
    'Left-Cerebellum-Cortex', 'Right-Cerebellum-Cortex',
    'Left-VentralDC', 'Right-VentralDC',
    'Left-Hippocampus', 'Right-Hippocampus',
    'Left-Pallidum', 'Right-Pallidum',
    'Left-Putamen', 'Right-Putamen',
    'Left-Thalamus-Proper', 'Right-Thalamus-Proper',
]

#indexes for subcortex like in wb_commands
subcortex_header_file = '/media/hcs-sci-psy-narun/BackupDataD800/Alina/atlases/fs_index_subc.txt'
fs_subc_ind_wb = np.loadtxt(subcortex_header_file, dtype=str)

#total brain volume index: each pair is
#(text looked up in the aseg.stats lines, label used in the output table)
tot_pairs = [
    ('Estimated Total Intracranial Volume', 'FS_IntraCranial_Vol'),
    ('Total cortical gray matter volume', 'FS_TotCort_GM_Vol'),
    ('Subcortical gray matter volume', 'FS_SubCort_GM_Vol'),
    ('Total cerebral white matter volume', 'FS_Tot_WM_Vol'),
    ('Ratio of BrainSegVol to eTIV', 'FS_BrainSegVol_eTIV_Ratio'),
]
tot_names = [stats_text for stats_text, _ in tot_pairs]
tot_names_ml = [table_label for _, table_label in tot_pairs]

In [4]:

#load the list of valid subjects

#`data_dir` was never assigned anywhere in this notebook, so on a fresh kernel
#this cell stopped with a NameError. It is defined here explicitly.
#NOTE(review): assumes valid_subj_list.csv sits in the dataset root (`path`);
#point data_dir at the right folder if the list is stored elsewhere.
data_dir = path

subject_ID_df = pd.read_csv(data_dir + "valid_subj_list.csv", dtype=str, index_col=0)
#the IDs are taken from the first data column (after the index column)
subject_ID = list(subject_ID_df.iloc[:, 0])
#drop the first 4 characters of every ID
#(presumably a 'sub-' style prefix - TODO confirm against the folder names)
subjects = [str(x[4:]) for x in subject_ID]

In [13]:
#FreeSurfer stats files to read for every subject
#(the order matters: the aseg file is addressed by position as files[2])
files = [
    'lh.aparc.a2009s.stats',
    'rh.aparc.a2009s.stats',
    'aseg.stats',
]

In [14]:
#read the FreeSurfer stats tables of every subject into DataFrames
#(nothing is written to disk here; the tables are kept in three dictionaries)

def _read_stats_table(stats_file):
    """Parse one FreeSurfer stats file into a DataFrame indexed by StructName.

    Data rows are read as strings (np.loadtxt skips the lines starting with
    '#'). Column names are taken from the '# ColHeaders ...' comment line.

    Raises ValueError when the file has no ColHeaders line or when the number
    of header names does not match the number of data columns.
    """
    #ndmin=2 keeps a file with a single data row as one row instead of letting
    #it collapse into a 1-D array (which would become a single column)
    table = pd.DataFrame(np.loadtxt(stats_file, dtype=str, ndmin=2))

    #the context manager closes the file; the handle used to be left open
    with open(stats_file) as stats_handle:
        header_line = next((line for line in stats_handle if 'ColHeaders' in line), None)
    if header_line is None:
        raise ValueError('no ColHeaders line in ' + str(stats_file))

    #split() without arguments copes with any run of blanks between the names
    #(replacing double spaces once still left empty names for 3+ spaces);
    #after dropping the leading '#', token 0 is the word 'ColHeaders'
    header_tokens = header_line.lstrip('#').split()
    table.columns = header_tokens[1:len(table.columns) + 1]

    return table.set_index('StructName')


dct1 = {}  #tables from lh.aparc.a2009s.stats, keyed by subject
dct2 = {}  #tables from rh.aparc.a2009s.stats, keyed by subject
dct3 = {}  #tables from any other file in `files` (aseg.stats), keyed by subject
cortex_tables = {
    'lh.aparc.a2009s.stats': dct1,
    'rh.aparc.a2009s.stats': dct2,
}

for subject in subjects:
    for filename in files:
        file = os.path.join(str(path), str(subject), 'T1w', str(subject), 'stats', str(filename))

        if os.path.isfile(file):
            cortex_tables.get(filename, dct3)[subject] = _read_stats_table(file)
        else:
            #report the missing file and carry on with the next one
            print('not exist')
            print(file)

In [16]:
#assemble total brain volumes into separate dictionary
#dct_tvol maps subject -> Series of the values found for tot_names,
#labelled with tot_names_ml
dct_tvol = {}
for subject in subjects:
    aseg_path = os.path.join(str(path), str(subject), 'T1w', str(subject), 'stats', str(files[2]))
    if not os.path.isfile(aseg_path):
        #report and skip instead of stopping the whole cell on one missing file
        print('not exist')
        print(aseg_path)
        continue

    #the context manager closes the file; the handle used to be left open
    with open(aseg_path) as aseg_handle:
        file_aseg = aseg_handle.readlines() #read aseg file

    tvolvalues = []
    for tot in tot_names:
        matching = [line for line in file_aseg if tot in line]
        #exactly one line per measure keeps the values aligned with
        #tot_names_ml; a missing measure combined with a duplicated one would
        #otherwise give 5 values in the wrong positions without any error
        if len(matching) != 1:
            raise ValueError('expected exactly one line containing "%s" in %s, found %d'
                             % (tot, aseg_path, len(matching)))
        #the value is the second-to-last comma-separated field;
        #strip() removes the blank that follows the preceding comma
        tvolvalues.append(matching[0].split(',')[-2].strip())
    dct_tvol[subject] = pd.Series(tvolvalues, index=tot_names_ml)

In [17]:
#assemble table by modalities
#each dictionary maps subject -> one vector of values for that subject

dct_thick = {}  #ThickAvg column, left hemisphere rows followed by right
dct_area = {}   #SurfArea column, left hemisphere rows followed by right
dct_subc = {}   #Volume_mm3 column for the structures listed in fs_subc_ind

for subject in subjects:
    #a subject whose stats file was not found has no entry in dct1/dct2/dct3;
    #skip it instead of stopping the whole cell with a KeyError
    if not (subject in dct1 and subject in dct2 and subject in dct3):
        print('skipped, stats table missing for', subject)
        continue

    #load individual tables
    lh_table = dct1[subject]
    rh_table = dct2[subject]
    aseg_table = dct3[subject]

    #combine lh and rh into one vector
    dct_thick[subject] = np.concatenate([lh_table['ThickAvg'], rh_table['ThickAvg']], axis=0)
    dct_area[subject] = np.concatenate([lh_table['SurfArea'], rh_table['SurfArea']], axis=0)

    #filter to chosen structures, in the order of fs_subc_ind;
    #a structure that is absent from the table becomes NaN
    dct_subc[subject] = aseg_table.reindex(index=fs_subc_ind)['Volume_mm3']

In [None]:
#check that every subject has vectors of the expected length and
#remove the subjects that do not

n_cortex = 148    #length of a thickness/area vector (lh and rh concatenated)
n_subcortex = 19  #number of subcortical structures kept per subject


def _wrong_length_keys(title, vectors, expected_len):
    """Print and return the keys of `vectors` whose value length is not expected_len."""
    print(title)
    bad_keys = []
    for key, values in vectors.items():
        #'!=' rather than '<': a vector that is too long cannot be put into
        #the output table either
        if len(values) != expected_len:
            print('wrong length  ', key, '  length is', len(values), '  expected', expected_len)
            bad_keys.append(key)
    return bad_keys


todel = (
    _wrong_length_keys('thickness', dct_thick, n_cortex)
    + _wrong_length_keys('areas', dct_area, n_cortex)
    + _wrong_length_keys('subcortex', dct_subc, n_subcortex)
)

#NOTE(review): the subcortex vectors come from a reindex onto the chosen
#structures, which pads absent structures with NaN, so their length alone
#cannot reveal a missing structure. Missing values are reported below but
#the subjects are NOT removed - decide whether they should be.
print('subcortex, missing structures (reported only)')
for key, volumes in dct_subc.items():
    n_missing = int(pd.isna(volumes).sum())
    if n_missing > 0:
        print('missing  ', key, '  number of missing structures is', n_missing)

todel = sorted(set(todel))
print(' ')
print('need to be removed', todel)


#removing this from dictionaries
#pop(..., None) tolerates a subject that is absent from one of the dictionaries
for d in todel:
    for dct in (dct_thick, dct_area, dct_subc, dct_tvol):
        dct.pop(d, None)

In [19]:
#turn every dictionary into a table with one column per subject
df_thick, df_area, df_subc, df_tvol = (
    pd.DataFrame(dct) for dct in (dct_thick, dct_area, dct_subc, dct_tvol)
)

#replace the row labels (assigned by position, so the label order has to
#match the row order of each table)
for frame, row_labels in (
    (df_thick, fs_cort_ind),
    (df_area, fs_cort_ind),
    (df_subc, fs_subc_ind_wb),
):
    frame.index = row_labels

#save the transposed tables (one row per subject) into path_out
for frame, csv_name in (
    (df_thick, 'cortical_thickness.csv'),
    (df_area, 'cortical_area.csv'),
    (df_subc, 'subcortical_volume.csv'),
    (df_tvol, 'total_brain_volume.csv'),
):
    frame.transpose().to_csv(path_out + csv_name)