In [1544]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [1]:
import os
os.chdir('/Users/ltran/Documents/TrueData29/CPC_ML_tutorial/')

import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
import seaborn as sns
import random
import joypy
from matplotlib import cm
from datetime import datetime
from scipy import stats

from scipy.stats import fisher_exact
from pcntoolkit.normative import estimate, predict, evaluate
from pcntoolkit.util.utils import compute_MSLL, create_design_matrix
from nm_utils import calibration_descriptives, remove_bad_subjects, load_2d
from sklearn.model_selection import train_test_split

# Project root used to build the output path below (this does not change the
# current working directory).
# NOTE(review): the os.chdir() call earlier in this cell targets a different
# folder (.../TrueData29/...) than this root (.../TrueData0104/...) — confirm
# which one is intended.
root_dir = '/Users/ltran/Documents/TrueData0104/CPC_ML_tutorial/'
out_dir = os.path.join(root_dir,'models','test')

# Create the output directory if it does not already exist
os.makedirs(out_dir, exist_ok=True)

# Load TCA

In [353]:
data_dir = '/Users/ltran/Documents/Data/'


## Compute BMI

In [505]:
pheno = pd.read_csv(os.path.join(data_dir, 'TCA_vol/MRI_RDB_TCA_20221109_anonym.tsv'), sep = '\t')

In [506]:
# Parse the day-first ('%d/%m/%Y') date strings into datetime columns.
# pd.to_datetime keeps one output value per input row (missing dates become
# NaT), so the new columns always have the same length as `pheno`; a string
# that does not match the format raises instead of being skipped.
pheno['date_MRI'] = pd.to_datetime(pheno['MRIDateOfMRI'], format='%d/%m/%Y')

# This value is written month-first; rewrite it day-first before parsing.
pheno.loc[pheno.date_first_hospi == '02/22/2021', 'date_first_hospi'] = '22/02/2021'
pheno['date_hospi'] = pd.to_datetime(pheno['date_first_hospi'], format='%d/%m/%Y')



In [507]:
time_delta = (pheno['date_MRI'] - pheno['date_hospi'])

In [508]:
# Delay from first hospitalisation to MRI in whole days, stored as float.
pheno['delta'] = [float(gap.days) for gap in time_delta]

# Rows where the MRI is dated before the first hospitalisation get no delay.
mri_before_hospi = pheno['date_MRI'] < pheno['date_hospi']
pheno.loc[mri_before_hospi, 'delta'] = np.nan

In [509]:
# Group label: default 'TCA', refined by the hospitalisation-to-MRI delay
# (under 14 days -> 'TCA_ac', 14 days or more -> 'TCA_pr'). Rows flagged
# TSA == 1 are labelled last, so they override the delay-based label.
pheno['ASD'] = 'TCA'
pheno.loc[pheno['delta'].lt(14), 'ASD'] = 'TCA_ac'
pheno.loc[pheno['delta'].ge(14), 'ASD'] = 'TCA_pr'
pheno.loc[pheno['TSA'].eq(1), 'ASD'] = 'TCA_Autism'

In [510]:
pheno = pheno.dropna(subset = ['age_at_scan', 'Sex', 'machine'])

In [511]:
pheno = pheno.drop_duplicates(subset = ['subject_id'])

In [512]:
# Height unit fix (m -> cm): a height recorded as 0.92 is rewritten as 92
pheno['size_at_scan'] = pheno['size_at_scan'].replace(0.92, 92)

In [513]:
# Build participant_id by appending the '_tca' suffix to subject_id
pheno['participant_id'] = pheno['subject_id'].astype(str)+'_tca'

In [514]:
# Subject label: 'sub-' followed by subject_id left-padded with zeros to 4 characters
pheno['Subject'] = ['sub-' + str(subject).zfill(4) for subject in pheno.subject_id]

In [515]:
pheno.Subject = pheno.Subject+('_ses-0')+pheno.session_id.astype(str)

In [516]:
pheno['Protocole'] = pheno['sequence_3DT1'].str.upper()

In [525]:
pheno.date_first_hospi

0      20/02/2006
1      24/02/2006
2      08/03/2006
3      12/11/2008
4      15/07/2009
          ...    
125    01/06/2015
126    23/02/2016
128    09/03/2016
129    22/10/2021
130    16/05/2022
Name: MRIDateOfMRI, Length: 118, dtype: object

In [527]:
pheno[pheno.subject_id == 144]['date_first_hospi']

99    09/03/2020
Name: date_first_hospi, dtype: object

In [526]:
pheno[pheno.subject_id == 144]['MRIDateOfMRI']

99    22/01/2020
Name: MRIDateOfMRI, dtype: object

## Remove failed

In [517]:
list_fail = [8, 12, 23, 18, 152, 26, 88, 73, 112, 194, 136]

In [518]:
pheno = pheno[~(pheno.subject_id.isin(list_fail))]

# Volume cortex

In [519]:
cv_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/volumes-vlob-iso.txt'), sep = '\t')

In [520]:
cv_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/volumes-vlob-tfe.txt'), sep = '\t')

In [521]:
cv_iso.Subject = cv_iso.Subject.str.split('.', expand = True)[0]
cv_tfe.Subject = cv_tfe.Subject.str.split('.', expand = True)[0]

In [453]:
cv_iso['Protocole'] = 'ISO'
cv_tfe['Protocole'] = 'TFE'

In [454]:
cv = pd.concat([cv_iso, cv_tfe])

In [455]:
cv['participant_id'] = cv.Subject.str.split('_', expand = True)[0].str.split('-', expand = True)[1].str.lstrip('0')
cv['participant_id'] = cv['participant_id'].astype(str)+'_tca'

# Merge Volume, Area, CT

In [456]:
# Load ISO volume
vol_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/volumes-fs-iso.txt'), sep = '\t')

# Load TFE volume
vol_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/volumes-fs-tfe.txt'), sep = '\t')

In [457]:
vol_iso['Protocole'] = 'ISO'
vol_tfe['Protocole'] = 'TFE'

In [458]:
# Merge volume ISO + TFE 

vol_tca = pd.concat([vol_tfe, vol_iso])

In [459]:
lh_area_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.lh.area.csv'), sep = '\t') 
rh_area_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.rh.area.csv'), sep = '\t') 


lh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.lh.area.csv'), sep = '\t') 
rh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.rh.area.csv'), sep = '\t') 

area_iso = lh_area_iso.merge(rh_area_iso.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

area_tfe = lh_area_tfe.merge(rh_area_tfe.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

In [460]:
area_iso['Protocole'] = 'ISO'
area_tfe['Protocole'] = 'TFE'

In [461]:
area = pd.concat([area_iso, area_tfe])
area = area.rename(columns= {'lh.aparc.area' : 'Subject'})

In [462]:
lh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.lh.thickness.csv'), sep = '\t') 
rh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.rh.thickness.csv'), sep = '\t') 

ct_iso = lh_ct_iso.merge(rh_ct_iso.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), 
               on = 'lh.aparc.thickness')


lh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.lh.thickness.csv'), sep = '\t') 
rh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.rh.thickness.csv'), sep = '\t') 

ct_tfe = lh_ct_tfe.merge(rh_ct_tfe.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), 
               on = 'lh.aparc.thickness')

ct_iso['Protocole'] = 'ISO'
ct_tfe['Protocole'] = 'TFE'
ct = pd.concat([ct_iso, ct_tfe])
ct = ct.rename(columns= {'lh.aparc.thickness' : 'Subject'})

In [463]:
df_TCA = area.merge(vol_tca, on = ['Subject', 'Protocole']).merge(ct, on=['Subject','Protocole'])

In [464]:
# Numeric subject id: the text between the first '-' and the following '_'
# of each Subject label, converted to int.
df_TCA['participant_id'] = [
    int(label.split('-')[1].split('_')[0]) for label in df_TCA['Subject']
]

In [465]:
df_TCA['participant_id'] = df_TCA['participant_id'].astype(str)+'_tca'

In [466]:
df_TCA = df_TCA.merge(cv, on = ['participant_id', 'Protocole'])

In [467]:
df_TCA = df_TCA.merge(pheno, on = ['participant_id', 'Protocole'])

In [468]:
df_TCA['sex'] = df_TCA['Sex'].replace({'Female' : 2, 'Male' : 1})

In [469]:
# df_TCA = df_TCA.drop_duplicates(subset='participant_id')
df_TCA = df_TCA.dropna(subset='Sex')

In [470]:
df_TCA.columns = df_TCA.columns.str.replace('-', '_')

In [471]:
# df_TCA = df_TCA[~(df_TCA['magnetic_field_strength'] == '3')]

In [472]:
df_TCA = df_TCA.drop_duplicates(subset='participant_id', keep = 'first')

In [473]:
df_TCA['size_at_scan'] = df_TCA['size_at_scan'].replace(0.92, 92)

In [474]:
df_TCA['Month'] = df_TCA['age_at_scan']*12

In [475]:
df_TCA['scanner'] = df_TCA['manufacturer_model_name']

In [476]:
df_TCA = df_TCA.rename(columns= {'3rd_Ventricle' : 'third_Ventricle',
                       '4th_Ventricle' : 'fourth_Ventricle',
                        '5th_Ventricle' : 'fifth_Ventricle'})

### Export dataframe

In [486]:
df_TCA.to_csv(os.path.join(data_dir, 'Outputs/df_TCA.csv'), index = False)

In [478]:
# Exclude one subject by exact participant id. A substring test for '069' on
# the scan label would also remove any other label that happens to contain
# those characters (e.g. ids 690-699 or 1069 once zero-padded).
# NOTE(review): assumes the scan to exclude belongs to subject 69 — confirm.
df_TCA = df_TCA[df_TCA['participant_id'] != '69_tca']

In [479]:
df_TCA = df_TCA.rename(columns= {'weight_at_scan': 'patient_weight'})

In [480]:
df_TCA = df_TCA[(df_TCA.age_at_scan < 16) & (df_TCA.age_at_scan > 6)]

In [481]:
df_TCA = df_TCA[df_TCA.machine != 'Ingenia_3T']

In [482]:
# Re-export after the exclusion / age / scanner filters above. This writes to
# the same path as the earlier "Export dataframe" cell, so it overwrites it.
# NOTE(review): index=False is not passed here (it is in the earlier export),
# so the DataFrame index is written as an extra first column — confirm intent.
df_TCA.to_csv(os.path.join(data_dir, 'Outputs/df_TCA.csv'))

In [483]:
df_TCA.groupby('ASD').size()

ASD
TCA            5
TCA_Autism     2
TCA_ac        37
TCA_pr        47
dtype: int64

In [484]:
df_TCA['ID'] = df_TCA.participant_id.str.split('_', expand = True)[0]

________________

# Load TSA

Load infos 

In [41]:
def is_duplicated(df, col):
    """Return the rows of ``df`` whose value(s) in ``col`` appear more than once.

    Every member of a duplicate group is returned (``keep=False``), not only
    the repeats after the first occurrence.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    col : label or list of labels
        Column(s) that define what counts as a duplicate.

    Returns
    -------
    pandas.DataFrame
        The duplicated rows, with their original index and column order.
    """
    repeated = df.duplicated(subset=col, keep=False)
    return df.loc[repeated]
    

In [42]:
infos_TSA = pd.read_csv(os.path.join(data_dir, 'TSA_vol/MRI_RDB_20220928_anonym.tsv'), sep = '\t')

In [43]:
# Scan label: 'sub-' + subject_id zero-padded to 4 characters + '_ses-0' + session_id
padded_subject = infos_TSA['subject_id'].astype(str).str.zfill(4)
session_text = infos_TSA['session_id'].astype(str)
infos_TSA['Subject'] = 'sub-' + padded_subject + '_ses-0' + session_text

In [44]:
# Drop rows with a missing sex, age or ASD status, then rows whose ASD status is '?'
required_columns = ['Sex', 'age_at_scan', 'ASD']
infos_TSA = infos_TSA.dropna(subset = required_columns)
infos_TSA = infos_TSA[infos_TSA.ASD != '?']

In [45]:
infos_TSA['sex'] = infos_TSA.Sex.replace({'Male' : 1, 'Female' : 2})

In [46]:
infos_TSA.loc[(infos_TSA.Group == 'Relative') & (infos_TSA.ASD == 'No'), 'ASD'] = 'Relative'

In [47]:
# maj_tsa = pd.read_csv(os.path.join(data_dir, 'TSA_vol/TSA_cohort_dcm_info_20221110_anonymized.csv'), sep = ',')

In [48]:
# maj_tsa = maj_tsa.rename(columns= {'SubCode' : 'subject_id'})

In [49]:
# infos_TSA = maj_tsa[['subject_id', 'patient_weight']].merge(infos_TSA,on = 'subject_id', suffixes = ('', '2'))

In [50]:
infos_TSA = infos_TSA[(infos_TSA.age_at_scan > 6) & (infos_TSA.age_at_scan < 16)]

Load QC

In [51]:
qc_alex = pd.read_csv(os.path.join(data_dir, 'TSA_vol/Imagerie_QC.csv'), sep = ';')

In [52]:
# qc_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/qc-fs-6.0.0-tfe_20171002.txt'), sep = '\t')
# qc_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/qc-fs-6.0.0-isotrope_20171009.txt'), sep = '\t')
# iso_fail = qc_iso[qc_iso.QC <= 1][['Subject', 'QC']]
# tfe_fail = qc_tfe[qc_tfe.QC <= 1][['Subject', 'QC']]

In [53]:
QC = pd.read_csv(os.path.join(data_dir, 'QC_trio.csv'), sep = ';', on_bad_lines= 'skip')

In [54]:
QC.BIDS = QC.BIDS.str.lower()

In [55]:
QC = QC.dropna(subset = 'QC')
qc_alex = qc_alex.dropna(subset = 'QC')

In [56]:
QC = QC[~(QC.BIDS.isin(qc_alex.BIDS))]

In [57]:
QC = pd.concat([QC, qc_alex])

In [58]:
QC.session = QC.session.replace(np.nan, 1)

In [59]:
# Manual correction: set the session of the row(s) labelled 495 to 2.
# NOTE(review): QC was built with pd.concat without ignore_index, so label 495
# may match more than one row, and .loc assignment adds a new row when the
# label is absent — confirm this hits exactly the intended scan.
QC.loc[495, 'session'] = 2

In [60]:
# bids_f = QC[(QC.QC == 'F') | (QC.QC == 'g') | (QC.QC == 'M')][['BIDS', 'session', 'Protocole']]

In [61]:
# Scans whose QC grade is 'P' or 'M'; the Subject labels built from this table
# are used further down as the list of scans to keep (via .isin).
# NOTE(review): the meaning of the grades is not defined in this notebook.
bids_f = QC[(QC.QC == 'P')| (QC.QC == 'M')][['BIDS', 'session', 'Protocole', 'QC']]

In [62]:
# Work on an independent copy so the column assignment below does not write
# into a slice of QC.
bids_f = bids_f.copy()

# Session number as text without a decimal part (1.0 -> '1').
# A missing session falls back to 1. Doing this column-wise keeps one value
# per row, so the assignment can never fail on a length mismatch.
bids_f['ses'] = bids_f['session'].fillna(1).astype(str).str.split('.').str[0]

# Any NaN still present in the table is replaced by the string '1'.
bids_f = bids_f.replace({np.nan : '1'})

In [63]:
bids_f['Subject'] = bids_f['BIDS'].astype(str)+'_ses-0'+bids_f['ses'].astype(str) 

In [64]:
# QC-selected scans acquired with the ISO protocol (three spellings occur)
iso_spellings = ['ISO', 'isotrope', 'isot']
bids_f_iso = bids_f[bids_f.Protocole.isin(iso_spellings)]

In [65]:
# QC-selected scans acquired with the TFE protocol (upper- or lower-case spelling)
bids_f_tfe = bids_f[bids_f.Protocole.isin(['TFE', 'tfe'])]

In [66]:
bids_f_iso = bids_f_iso[~ (bids_f_iso.BIDS == 'sub-0361')]

Volumes Cortex

In [274]:
# Load the 'volumes-vlob' tables for the two T1 protocols
cv_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/volumes-vlob-iso.txt'), sep = '\t')
cv_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/volumes-vlob-tfe.txt'), sep = '\t')

# Keep only the part of the scan label that precedes the first '.'
cv_iso.Subject = cv_iso.Subject.str.split('.', expand = True)[0]
cv_tfe.Subject = cv_tfe.Subject.str.split('.', expand = True)[0]

cv_iso['Protocole'] = 'ISO'
cv_tfe['Protocole'] = 'TFE'

# Apply the QC selection to both protocols first, and only then drop ISO scans
# that also exist as TFE. This is the order used for the aseg, area and
# thickness tables in this notebook; resolving the overlap before QC would
# discard an ISO scan even when its TFE counterpart is not in the QC selection.
cv_iso = cv_iso[cv_iso.Subject.isin(bids_f_iso.Subject)]
cv_tfe = cv_tfe[cv_tfe.Subject.isin(bids_f_tfe.Subject)]
cv_iso = cv_iso[~cv_iso.Subject.isin(cv_tfe.Subject)]
cv = pd.concat([cv_iso, cv_tfe])

In [275]:
# Integer subject id: the text between the first '-' and the first '_' of the
# scan label, with leading zeros stripped.
subject_digits = cv.Subject.str.split('_', expand = True)[0].str.split('-', expand = True)[1]
cv['participant_id'] = subject_digits.str.lstrip('0').astype(int)

Volumes

In [67]:
# Load ISO volume
vol_tsa_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.aseg.csv'), sep = '\t')

# Load TFE volume
vol_tsa_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.aseg.csv'), sep = '\t')

In [68]:
vol_tsa_iso = vol_tsa_iso.rename(columns= {'Measure:volume':'Subject'})
vol_tsa_tfe = vol_tsa_tfe.rename(columns= {'Measure:volume':'Subject'})

In [69]:
vol_tsa_iso = vol_tsa_iso[(vol_tsa_iso.Subject.isin(bids_f_iso.Subject))]

In [70]:
# vol_tsa_iso = vol_tsa_iso.merge(bids_f_iso, on = 'Subject')
# vol_tsa_tfe = vol_tsa_tfe.merge(bids_f_tfe, on = 'Subject')

In [71]:
vol_tsa_tfe = vol_tsa_tfe[(vol_tsa_tfe.Subject.isin(bids_f_tfe.Subject))]

In [72]:
vol_tsa_iso= vol_tsa_iso[~(vol_tsa_iso.Subject.isin(vol_tsa_tfe.Subject))]

In [73]:
vol_tsa_iso['Protocole'] = 'ISO'
vol_tsa_tfe['Protocole'] = 'TFE'

In [74]:
vol_tsa = pd.concat([vol_tsa_iso, vol_tsa_tfe])

In [75]:
# Integer subject id parsed from each scan label (text between '-' and '_')
participants = [int(label.split('-')[1].split('_')[0]) for label in vol_tsa['Subject']]

vol_tsa['participant_id'] = participants

In [76]:
vol_tsa = vol_tsa.drop_duplicates(subset= 'Subject', keep = 'first')

Surface area

In [77]:
rh_area_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.rh.area.csv'), sep = '\t')
lh_area_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.lh.area.csv'), sep = '\t')
rh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.rh.area.csv'), sep = '\t')
lh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.lh.area.csv'), sep = '\t')

In [78]:
area_iso = lh_area_iso.merge(rh_area_iso.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

In [79]:
area_tfe = lh_area_tfe.merge(rh_area_tfe.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

In [80]:
area_iso = area_iso.rename(columns= {'lh.aparc.area' : 'Subject'})
area_tfe = area_tfe.rename(columns= {'lh.aparc.area' : 'Subject'})

In [81]:
area_iso = area_iso[(area_iso.Subject.isin(bids_f_iso.Subject))]

In [82]:
area_tfe = area_tfe[(area_tfe.Subject.isin(bids_f_tfe.Subject))]

In [83]:
area_iso= area_iso[~(area_iso.Subject.isin(area_tfe.Subject))]

In [84]:
area_iso['Protocole'] = 'ISO'
area_tfe['Protocole'] = 'TFE'

In [85]:
area_tsa = pd.concat([area_iso, area_tfe])

In [86]:
# Integer subject id parsed from each scan label (text between '-' and '_')
participants = [int(label.split('-')[1].split('_')[0]) for label in area_tsa['Subject']]

area_tsa['participant_id'] = participants

In [87]:
area_tsa = area_tsa.drop_duplicates(subset= 'Subject', keep = 'first')

Cortical thickness

In [88]:
rh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.rh.thickness.csv'), sep = '\t')
lh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.lh.thickness.csv'), sep = '\t')
rh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.rh.thickness.csv'), sep = '\t')
lh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.lh.thickness.csv'), sep = '\t')

In [89]:
ct_iso = lh_ct_iso.merge(rh_ct_iso.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), on ='lh.aparc.thickness')

In [90]:
ct_tfe = lh_ct_tfe.merge(rh_ct_tfe.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), on ='lh.aparc.thickness')

In [91]:
ct_iso = ct_iso.rename(columns={'lh.aparc.thickness':'Subject'})
ct_tfe = ct_tfe.rename(columns={'lh.aparc.thickness':'Subject'})

In [92]:
ct_iso = ct_iso[(ct_iso.Subject.isin(bids_f_iso.Subject))]

In [93]:
ct_tfe = ct_tfe[(ct_tfe.Subject.isin(bids_f_tfe.Subject))]

In [94]:
ct_iso= ct_iso[~(ct_iso.Subject.isin(ct_tfe.Subject))]

In [95]:
ct_iso['Protocole'] = 'ISO'
ct_tfe['Protocole'] = 'TFE'

In [96]:
ct_tsa = pd.concat([ct_iso, ct_tfe])

In [97]:
# Integer subject id parsed from each scan label (text between '-' and '_')
participants = [int(label.split('-')[1].split('_')[0]) for label in ct_tsa['Subject']]

ct_tsa['participant_id'] = participants

In [98]:
ct_tsa = ct_tsa.drop_duplicates(subset= 'Subject', keep = 'first')

### Merge volume / area / ct

In [99]:
df_tsa = area_tsa.merge(ct_tsa, on = ['Subject', 'Protocole'], suffixes=('', '_y')).merge(vol_tsa, on = ['Subject','Protocole'], suffixes=('', '_z'))

In [100]:
# df_tsa = df_tsa.merge(cv, on = ['Subject', 'Protocole'])

In [101]:
# df_tsa = area_tsa.merge(ct_tsa, on = 'participant_id', suffixes=('', '_y')).merge(vol_tsa, on = 'participant_id', suffixes=('', '_z'))

### Merge with infos_TSA

In [102]:
# Rebuild the scan label used as the merge key:
# 'sub-' + subject_id zero-padded to 4 characters + '_ses-0' + session_id
label_prefix = ['sub-' + str(subject).zfill(4) + '_ses-0' for subject in infos_TSA.subject_id]

infos_TSA['Subject'] = label_prefix + infos_TSA.session_id.astype(str)

In [103]:
# df_TSA = infos_TSA.merge(cv, on = 'Subject')

In [104]:
df_TSA = infos_TSA.merge(df_tsa,on = 'Subject')

In [105]:
df_TSA['participant_id'] = df_TSA.Subject.str.split('-', expand = True)[1].str.split('_', expand = True)[0].str.lstrip('0')

In [106]:
df_TSA.columns = df_TSA.columns.str.replace('-', '_')

In [107]:
df_TSA['scanner'] = df_TSA['machine'].str.upper()

In [108]:
df_TSA = df_TSA.rename(columns= {'3rd_Ventricle' : 'third_Ventricle',
                       '4th_Ventricle' : 'fourth_Ventricle',
                        '5th_Ventricle' : 'fifth_Ventricle'})

## ASD Categories

In [109]:
df_TSA.loc[(df_TSA.ASD == 'Yes') & (df_TSA.Group !=  'Patient'), 'ASD'] = 'Pas_ASD'

In [110]:
df_TSA.loc[(df_TSA.ASD == 'No') & (df_TSA.Group ==  'Patient'), 'ASD'] = 'TD_Patient'

## Export TSA

In [111]:
df_TSA.to_csv(os.path.join(data_dir, 'Outputs/df_TSA.csv'), index = False)

# Merge TCA + TSA

In [112]:
# Drop the participant_id copy that the area/thickness merge added with the
# '_y' suffix.
# NOTE(review): the volume merge uses the '_z' suffix, so a 'participant_id_z'
# column presumably remains; this cell also raises KeyError if it is run a
# second time, because the column is already gone.
df_TSA = df_TSA.drop(columns= 'participant_id_y')

In [113]:
df_tsa_tca = pd.concat([df_TSA, df_TCA])

### Clean NA

In [114]:
df_tsa_tca = df_tsa_tca.dropna(subset=['Sex', 'age_at_scan', 'ASD', 'scanner'])

### Keep 1.5 T scans only (remove 3 T)


In [115]:
df_tsa_tca = df_tsa_tca[df_tsa_tca.scanner != 'INGENIA_3T']

In [116]:
# Remove rows whose magnetic_field_strength equals '3' or whose machine is 'Ingenia_3T'.
# NOTE(review): the comparison is against the string '3'; if
# magnetic_field_strength is stored as a number this condition never matches —
# confirm the column dtype.
df_tsa_tca = df_tsa_tca[~((df_tsa_tca['magnetic_field_strength'] == '3')|(df_tsa_tca['machine'] == 'Ingenia_3T'))]

### Filter on age

In [117]:
df_tsa_tca = df_tsa_tca[(df_tsa_tca['age_at_scan'] > 6) & (df_tsa_tca['age_at_scan'] < 16)]

In [118]:
df_tsa_tca = df_tsa_tca.drop_duplicates('participant_id')

In [119]:
df_tsa_tca.groupby(['Sex', 'ASD']).size().to_frame('Count')

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
Sex,ASD,Unnamed: 2_level_1
Female,No,37
Female,Pas_ASD,5
Female,Relative,16
Female,TCA,4
Female,TCA_ac,33
Female,TCA_pr,43
Female,TD_Patient,1
Female,Yes,49
Male,No,49
Male,Pas_ASD,3


In [120]:
df_tsa_tca = df_tsa_tca.rename(columns={'3rd_Ventricle':'third_Ventricle',
                           '4th_Ventricle' : 'fourth_Ventricle',
                           '5th_Ventricle' : 'fifth_Ventricle'
    
})

In [1116]:
import random

In [1117]:
df_tsa_tca = df_tsa_tca.reset_index(drop = True)

In [1118]:
index_td = df_tsa_tca[(df_tsa_tca.ASD == 'Relative') | (df_tsa_tca.ASD == 'No')].index

In [1119]:
# Draw 59 of the 'Relative'/'No' rows at random for group 1. A dedicated,
# seeded generator makes the split identical on every Restart & Run All;
# change SPLIT_SEED to obtain a different (but still reproducible) split.
SPLIT_SEED = 0
s1 = random.Random(SPLIT_SEED).sample(list(index_td), k = 59)

In [1120]:
s2 = list(index_td[~index_td.isin(s1)])

In [1121]:
df_tsa_tca['group_random'] = 0
df_tsa_tca.loc[s1, 'group_random'] = 1
df_tsa_tca.loc[s2, 'group_random'] = 2

## Export TSA + TCA

In [121]:
# Hemisphere-level mean thickness columns: excluded from the regional list and
# averaged into a single per-row mean (meanCT).
hemisphere_means = ['lh_MeanThickness_thickness', 'rh_MeanThickness_thickness']

# Regional thickness columns = every '_thickness' column except the two means
thick = [name for name in df_tsa_tca.columns if '_thickness' in name]
for mean_name in hemisphere_means:
    thick.remove(mean_name)

df_tsa_tca['meanCT'] = df_tsa_tca[hemisphere_means].mean(axis=1)

In [122]:
# Columns that contain '_area' but are left out of the total:
# the two WhiteSurfArea columns and the two Accumbens_area columns.
not_summed = ['rh_WhiteSurfArea_area', 'lh_WhiteSurfArea_area',
              'Left_Accumbens_area', 'Right_Accumbens_area']

arean = [name for name in df_tsa_tca.columns if '_area' in name]
for excluded in not_summed:
    arean.remove(excluded)

# Per-row sum over the remaining '_area' columns
df_tsa_tca['totalSA'] = df_tsa_tca[arean].sum(axis = 1)

In [123]:
df_tsa_tca.to_csv(os.path.join(data_dir, 'Outputs/df_tsa_tca.csv'))

In [124]:
df_tsa_tca[['Protocole', 'sex', 'ASD', 'scanner']].isna().sum()

Protocole    0
sex          0
ASD          0
scanner      0
dtype: int64

In [125]:
for i in area:
    print(i+',')

Subject,
lh_bankssts_area,
lh_caudalanteriorcingulate_area,
lh_caudalmiddlefrontal_area,
lh_cuneus_area,
lh_entorhinal_area,
lh_fusiform_area,
lh_inferiorparietal_area,
lh_inferiortemporal_area,
lh_isthmuscingulate_area,
lh_lateraloccipital_area,
lh_lateralorbitofrontal_area,
lh_lingual_area,
lh_medialorbitofrontal_area,
lh_middletemporal_area,
lh_parahippocampal_area,
lh_paracentral_area,
lh_parsopercularis_area,
lh_parsorbitalis_area,
lh_parstriangularis_area,
lh_pericalcarine_area,
lh_postcentral_area,
lh_posteriorcingulate_area,
lh_precentral_area,
lh_precuneus_area,
lh_rostralanteriorcingulate_area,
lh_rostralmiddlefrontal_area,
lh_superiorfrontal_area,
lh_superiorparietal_area,
lh_superiortemporal_area,
lh_supramarginal_area,
lh_frontalpole_area,
lh_temporalpole_area,
lh_transversetemporal_area,
lh_insula_area,
lh_WhiteSurfArea_area,
BrainSegVolNotVent_x,
eTIV_x,
rh_bankssts_area,
rh_caudalanteriorcingulate_area,
rh_caudalmiddlefrontal_area,
rh_cuneus_area,
rh_entorhinal_area,
rh

In [96]:
for i in thick:
    print(i+',')

lh_bankssts_thickness,
lh_caudalanteriorcingulate_thickness,
lh_caudalmiddlefrontal_thickness,
lh_cuneus_thickness,
lh_entorhinal_thickness,
lh_fusiform_thickness,
lh_inferiorparietal_thickness,
lh_inferiortemporal_thickness,
lh_isthmuscingulate_thickness,
lh_lateraloccipital_thickness,
lh_lateralorbitofrontal_thickness,
lh_lingual_thickness,
lh_medialorbitofrontal_thickness,
lh_middletemporal_thickness,
lh_parahippocampal_thickness,
lh_paracentral_thickness,
lh_parsopercularis_thickness,
lh_parsorbitalis_thickness,
lh_parstriangularis_thickness,
lh_pericalcarine_thickness,
lh_postcentral_thickness,
lh_posteriorcingulate_thickness,
lh_precentral_thickness,
lh_precuneus_thickness,
lh_rostralanteriorcingulate_thickness,
lh_rostralmiddlefrontal_thickness,
lh_superiorfrontal_thickness,
lh_superiorparietal_thickness,
lh_superiortemporal_thickness,
lh_supramarginal_thickness,
lh_frontalpole_thickness,
lh_temporalpole_thickness,
lh_transversetemporal_thickness,
lh_insula_thickness,
rh_banksst

In [97]:
thick

['lh_bankssts_thickness',
 'lh_caudalanteriorcingulate_thickness',
 'lh_caudalmiddlefrontal_thickness',
 'lh_cuneus_thickness',
 'lh_entorhinal_thickness',
 'lh_fusiform_thickness',
 'lh_inferiorparietal_thickness',
 'lh_inferiortemporal_thickness',
 'lh_isthmuscingulate_thickness',
 'lh_lateraloccipital_thickness',
 'lh_lateralorbitofrontal_thickness',
 'lh_lingual_thickness',
 'lh_medialorbitofrontal_thickness',
 'lh_middletemporal_thickness',
 'lh_parahippocampal_thickness',
 'lh_paracentral_thickness',
 'lh_parsopercularis_thickness',
 'lh_parsorbitalis_thickness',
 'lh_parstriangularis_thickness',
 'lh_pericalcarine_thickness',
 'lh_postcentral_thickness',
 'lh_posteriorcingulate_thickness',
 'lh_precentral_thickness',
 'lh_precuneus_thickness',
 'lh_rostralanteriorcingulate_thickness',
 'lh_rostralmiddlefrontal_thickness',
 'lh_superiorfrontal_thickness',
 'lh_superiorparietal_thickness',
 'lh_superiortemporal_thickness',
 'lh_supramarginal_thickness',
 'lh_frontalpole_thickness'

In [95]:
# Column names used to select the volume measures from df_tsa_tca.
# They follow the renamed form used in the merged table ('-' replaced by '_',
# and '3rd'/'4th'/'5th' ventricles spelled third/fourth/fifth).
vol = ['Left_Lateral_Ventricle', 'Left_Inf_Lat_Vent',
       'Left_Cerebellum_White_Matter', 'Left_Cerebellum_Cortex',
       'Left_Thalamus', 'Left_Caudate', 'Left_Putamen', 'Left_Pallidum',
       'third_Ventricle', 'fourth_Ventricle', 'Brain_Stem', 'Left_Hippocampus',
       'Left_Amygdala', 'CSF', 'Left_Accumbens_area', 'Left_VentralDC',
       'Left_vessel', 'Left_choroid_plexus', 'Right_Lateral_Ventricle',
       'Right_Inf_Lat_Vent', 'Right_Cerebellum_White_Matter',
       'Right_Cerebellum_Cortex', 'Right_Thalamus', 'Right_Caudate',
       'Right_Putamen', 'Right_Pallidum', 'Right_Hippocampus',
       'Right_Amygdala', 'Right_Accumbens_area', 'Right_VentralDC',
       'Right_vessel', 'Right_choroid_plexus', 'fifth_Ventricle']

In [164]:
df_tsa_tca[['lh_MeanThickness_thickness', 'rh_MeanThickness_thickness']].mean(axis = 1)

0      2.707100
1      2.662850
2      2.716115
3      2.729830
6      2.583240
         ...   
111    2.435210
112    2.480660
113    2.800270
114    2.758875
115    2.431040
Length: 506, dtype: float64

In [162]:
df_tsa_tca.meanCT

0      2.707100
1      2.662850
2      2.716115
3      2.729830
6      2.583240
         ...   
111    2.435210
112    2.480660
113    2.800270
114    2.758875
115    2.431040
Name: meanCT, Length: 506, dtype: float64

In [100]:
df_tsa_tca[vol]

Unnamed: 0,Left_Lateral_Ventricle,Left_Inf_Lat_Vent,Left_Cerebellum_White_Matter,Left_Cerebellum_Cortex,Left_Thalamus,Left_Caudate,Left_Putamen,Left_Pallidum,third_Ventricle,fourth_Ventricle,...,Right_Caudate,Right_Putamen,Right_Pallidum,Right_Hippocampus,Right_Amygdala,Right_Accumbens_area,Right_VentralDC,Right_vessel,Right_choroid_plexus,fifth_Ventricle
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,4147.4,383.5,12894.4,47780.2,6557.0,3589.7,4533.0,1556.8,846.3,1635.3,...,3730.6,4635.2,1486.8,3377.8,1416.7,675.5,3678.1,8.8,385.0,0.0
120,12727.6,739.4,14169.9,56411.3,6884.8,3235.6,4595.3,1883.0,1844.2,2455.2,...,3376.3,4518.4,1786.7,3902.4,1019.9,617.8,3464.5,40.5,573.7,0.0
121,8126.8,218.9,12007.8,58053.7,7268.1,3978.9,5122.6,1740.1,972.9,1439.5,...,4274.8,5149.8,1770.3,3363.6,1282.0,697.5,3427.6,10.7,498.5,0.0
122,2972.1,305.6,13193.5,53362.2,8082.3,3626.2,4908.1,1906.4,649.6,1272.4,...,4125.6,5128.1,1787.7,3803.0,1341.7,727.8,3531.7,16.1,588.7,0.0


In [96]:
for i in vol:
    print(i+',')

Left_Lateral_Ventricle,
Left_Inf_Lat_Vent,
Left_Cerebellum_White_Matter,
Left_Cerebellum_Cortex,
Left_Thalamus,
Left_Caudate,
Left_Putamen,
Left_Pallidum,
third_Ventricle,
fourth_Ventricle,
Brain_Stem,
Left_Hippocampus,
Left_Amygdala,
CSF,
Left_Accumbens_area,
Left_VentralDC,
Left_vessel,
Left_choroid_plexus,
Right_Lateral_Ventricle,
Right_Inf_Lat_Vent,
Right_Cerebellum_White_Matter,
Right_Cerebellum_Cortex,
Right_Thalamus,
Right_Caudate,
Right_Putamen,
Right_Pallidum,
Right_Hippocampus,
Right_Amygdala,
Right_Accumbens_area,
Right_VentralDC,
Right_vessel,
Right_choroid_plexus,
fifth_Ventricle,


In [101]:
vol_tsa.columns.str.replace('-','_')

Index(['Subject', 'BrainSeg', 'BrainSegNotVent', 'VentricleChoroidVol',
       'lhCortex', 'rhCortex', 'Cortex', 'lhCerebralWhiteMatter',
       'rhCerebralWhiteMatter', 'CerebralWhiteMatter', 'SubCortGray',
       'TotalGray', 'SupraTentorial', 'SupraTentorialNotVent', 'Mask',
       'BrainSegVol_to_eTIV', 'MaskVol_to_eTIV', 'lhSurfaceHoles',
       'rhSurfaceHoles', 'SurfaceHoles', 'EstimatedTotalIntraCranialVol',
       'Left_Lateral_Ventricle', 'Left_Inf_Lat_Vent',
       'Left_Cerebellum_White_Matter', 'Left_Cerebellum_Cortex',
       'Left_Thalamus', 'Left_Caudate', 'Left_Putamen', 'Left_Pallidum',
       '3rd_Ventricle', '4th_Ventricle', 'Brain_Stem', 'Left_Hippocampus',
       'Left_Amygdala', 'CSF', 'Left_Accumbens_area', 'Left_VentralDC',
       'Left_vessel', 'Left_choroid_plexus', 'Right_Lateral_Ventricle',
       'Right_Inf_Lat_Vent', 'Right_Cerebellum_White_Matter',
       'Right_Cerebellum_Cortex', 'Right_Thalamus', 'Right_Caudate',
       'Right_Putamen', 'Right_Pallidu