In [3]:
import os
os.chdir('/Users/ltran/Documents/TrueData29/CPC_ML_tutorial/')

import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
import seaborn as sns
import random
import joypy
from matplotlib import cm

from scipy.stats import fisher_exact
from pcntoolkit.normative import estimate, predict, evaluate
from pcntoolkit.util.utils import compute_MSLL, create_design_matrix
from nm_utils import calibration_descriptives, remove_bad_subjects, load_2d
from sklearn.model_selection import train_test_split

# Set working directory
root_dir = '/Users/ltran/Documents/TrueData0104/CPC_ML_tutorial/'
out_dir = os.path.join(root_dir,'models','test')

# create the output directory if it does not already exist
os.makedirs(out_dir, exist_ok=True)

# Load TCA

In [4]:
data_dir = '/Users/ltran/Documents/Data/'


## Compute BMI

In [5]:
pheno = pd.read_csv(os.path.join(data_dir, 'TCA_vol/MRI_RDB_TCA_20220629_anonym.tsv'), sep = '\t')

In [6]:
#Format height in meters

l_height = []
for i in pheno['size_at_scan']:
    if (i>100):
        l_height.append(i/100)
    else:
        l_height.append(i)

pheno['height_m'] = l_height
pheno = pheno.rename(columns={'subject_id' : 'participant_id'})

In [7]:
# Compute BMI 

pheno['BMI'] = pheno['weight_at_scan'] / (pheno['height_m'] ** 2)

# Merge Volume, Area, CT

In [8]:
# Load ISO volume
vol_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/volumes-fs-iso.txt'), sep = '\t')

# Load TFE volume
vol_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/volumes-fs-tfe.txt'), sep = '\t')

In [9]:
vol_tca = pd.concat([vol_tfe, vol_iso])

In [10]:
lh_area_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.lh.area.csv'), sep = '\t') 
rh_area_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.rh.area.csv'), sep = '\t') 


lh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.lh.area.csv'), sep = '\t') 
rh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.rh.area.csv'), sep = '\t') 

area_iso = lh_area_iso.merge(rh_area_iso.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

area_tfe = lh_area_tfe.merge(rh_area_tfe.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

In [11]:
area = pd.concat([area_iso, area_tfe])
area = area.rename(columns= {'lh.aparc.area' : 'Subject'})

In [12]:
lh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.lh.thickness.csv'), sep = '\t') 
rh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.rh.thickness.csv'), sep = '\t') 

ct_iso = lh_ct_iso.merge(rh_ct_iso.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), 
               on = 'lh.aparc.thickness')


lh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.lh.thickness.csv'), sep = '\t') 
rh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.rh.thickness.csv'), sep = '\t') 

ct_tfe = lh_ct_tfe.merge(rh_ct_tfe.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), 
               on = 'lh.aparc.thickness')


ct = pd.concat([ct_iso, ct_tfe])
ct = ct.rename(columns= {'lh.aparc.thickness' : 'Subject'})

In [13]:
df_TCA = area.merge(vol_tca, on = 'Subject').merge(ct, on='Subject')

In [14]:
l_pi = []
for i in df_TCA['Subject']:
    l_pi.append(int(i.split('-')[1].split('_')[0]))

df_TCA['participant_id'] = l_pi

In [15]:
df_TCA = df_TCA.merge(pheno.rename(columns={'subject_id':'participant_id'}), on = 'participant_id')

In [16]:
df_TCA = df_TCA.drop(index = [83, 85])

In [17]:
df_TCA['sex'] = df_TCA['Sex'].replace({'Female' : -1, 'Male' : 1})

In [18]:
# df_TCA = df_TCA.drop_duplicates(subset='participant_id')
df_TCA = df_TCA.dropna(subset='Sex')

In [19]:
df_TCA.columns = df_TCA.columns.str.replace('-', '_')

In [20]:
df_TCA['ASD'] = 'TCA'
df_TCA.loc[df_TCA[df_TCA.TSA == 1].index, 'ASD'] = 'TCA_Autism'

### Export dataframe

In [21]:
df_TCA.to_csv(os.path.join(data_dir, 'Outputs/df_TCA.csv'), index = False)

________________

# Load TSA

Load QC

In [736]:
QCtrio = pd.read_csv(os.path.join(data_dir, 'QC_trio.csv'), sep = ',')

In [737]:
# Remove missing scans
QC_trio = QCtrio.dropna(subset = ['cropped FOV',
       'cropped cerebellum', 'cropped orbitofrontal', 'defacing failed',
       'motion', 'low Quality brain', 'QC'], how = 'all')


In [738]:
QC_trio['Protocole'] = QC_trio['Protocole'].replace({'isotrope' : 'ISO', 'tfe' : 'TFE'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QC_trio['Protocole'] = QC_trio['Protocole'].replace({'isotrope' : 'ISO', 'tfe' : 'TFE'})


In [739]:
l_bids_dup = QC_trio[QC_trio.duplicated(subset = 'BIDS')].BIDS.to_list()

In [740]:
removed_dup = []
for i in l_bids_dup:
    temp = QC_trio[QC_trio['BIDS'] == i]

    if temp['best Vbrain'].isnull().all():
        temp = temp[temp['Protocole'] == 'TFE']
 
    else:
        temp = temp[temp['best Vbrain'] == 1]
    
    removed_dup.append(temp)
    
QCt = pd.concat(removed_dup)


In [741]:
QCt = QCt.drop_duplicates(subset=['Unnamed: 0', 'BIDS'])

In [742]:
QC = pd.concat([QC_trio[~(QC_trio.BIDS.isin(l_bids_dup))], QCt])

In [743]:
QC = QC.drop_duplicates(subset='BIDS')

In [744]:
participants = []
for i in QC['BIDS']:
    if (type(i) == str):
        participants.append(int(i.split('-')[1]))
    else: 
        participants.append(i)
QC['participant_id'] = participants

Load infos 

In [125]:
data_dir = '/Users/ltran/Documents/Data/'
isDir = os.path.isdir('/Users/ltran/Documents/Data/TSA_vol/Outputs/')
if isDir:
    ! rm -r '/Users/ltran/Documents/Data/TSA_vol/Outputs/'
! mkdir '/Users/ltran/Documents/Data/TSA_vol/Outputs/'

In [126]:
infos_TSA = pd.read_csv(os.path.join(data_dir, 'TSA_vol/TSA_cohort_dcm_info_anonymized_1.csv'), sep = ';')
infos_TSA = infos_TSA.rename(columns={'SubCode BIDS' : 'participant_id'})

In [127]:

# Load participants infos (age, sex, diagnosis, scores...)
new = pd.read_csv(os.path.join(data_dir, 'participants.tsv'), sep = "\t")

In [128]:
# Merge instruments with demography
new = new.merge(infos_TSA, on = 'participant_id')

In [129]:
# Drop NA in sex, age, diag
new = new[~(new.ASD == '?')].dropna(subset = ['sex', 'age_at_first_scan', 'ASD'])

In [130]:
# Drop duplicates

new = new.drop_duplicates(subset = 'participant_id', keep = 'first')

In [131]:
new.sex = new.sex.replace({'M' : 1, 'F' : -1})

In [132]:
new = new.rename(columns={'age_at_first_scan' : 'age_at_scan'}) 

In [133]:
new.loc[(new.group == 'Relative') & (new.ASD == 'No'), 'ASD'] = 'Relative'

Volumes

In [134]:
# Load ISO volume
vol_tsa_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/volumes-fs-iso.txt'), sep = '\t')

# Load TFE volume
vol_tsa_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/volumes-fs-tfe.txt'), sep = '\t')

In [135]:
vol_tsa = pd.concat([vol_tsa_iso, vol_tsa_tfe])

---> CHANGE WITH QC

In [136]:
vol_tsa = vol_tsa.drop_duplicates(subset='Subject')

In [137]:
participants = []
for i in vol_tsa['Subject']:
    participants.append(int(i.split('-')[1].split('_')[0]))
    
vol_tsa['participant_id'] = participants

In [138]:
TSA_volume = vol_tsa.merge(new, on = 'participant_id')

In [139]:
df_both = pd.concat([TSA_volume, df_TCA])

In [140]:
df_both.columns = df_both.columns.str.replace('-', '_')

In [141]:
df_both.to_csv(os.path.join(data_dir, 'Outputs/df_both.csv'), index = False)

Surface area

In [142]:
rh_area_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.rh.area.csv'), sep = '\t')
lh_area_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.lh.area.csv'), sep = '\t')
rh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.rh.area.csv'), sep = '\t')
lh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.lh.area.csv'), sep = '\t')

In [143]:
area_iso = lh_area_iso.merge(rh_area_iso.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

In [144]:
area_tfe = lh_area_tfe.merge(rh_area_tfe.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

In [145]:
area = pd.concat([area_iso, area_tfe])
area = area.rename(columns={'lh.aparc.area' : 'Subject'})

In [146]:
area = area.drop_duplicates(subset='Subject')

Cortical thickness

In [147]:
rh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.rh.thickness.csv'), sep = '\t')
lh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.lh.thickness.csv'), sep = '\t')
rh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.rh.thickness.csv'), sep = '\t')
lh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.lh.thickness.csv'), sep = '\t')

In [148]:
ct_iso = lh_ct_iso.merge(rh_ct_iso.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), on ='lh.aparc.thickness')

In [149]:
ct_tfe = lh_ct_tfe.merge(rh_ct_tfe.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), on ='lh.aparc.thickness')

In [150]:
ct = pd.concat([ct_iso, ct_tfe])

In [151]:
ct = ct.rename(columns={'lh.aparc.thickness' : 'Subject'})

In [152]:
ct = ct.drop_duplicates(subset='Subject')

In [160]:
df_RDB = pd.read_csv(os.path.join(data_dir, 'Outputs/df_RDB.csv'))
df_RDB

Unnamed: 0,subject_id,session_id,Sex,Group,FamilyStatus_x,Control,ASD,acq_time,age_at_scan,machine,...,IQ_Instrument,cognitive_level,Max_IQ,Performance_IQ,Verbal_IQ,Total_IQ,developmental_quotient,abide2_code,DNA_Fusion_code,MRI_code_old
0,3,1,Female,Patient,Proband,No,Yes,2006-02-16T19:18:00,9.4,Intera,...,WIS4,1.0,130.0,130.0,103.0,110.0,,29603.0,C0733-011-301-001,MRIPasteur-413
1,25,1,Male,Patient,Proband,No,Yes,2007-01-22T10:49:05,6.4,Intera,...,RAVEN,3.0,66.0,66.0,,,22,,20496,MRIPasteur-068
2,70,3,Male,Patient,Proband,No,Yes,2014-12-06T10:35:48,11.3,Ingenia,...,WIS4,1.0,103.0,96.0,83.0,87.0,,,C0733-011-314-001,MRIPasteur-081
3,95,1,Female,Patient,Proband,No,Yes,2008-05-07T10:50:19,14.5,Intera,...,WAI4,1.0,116.0,90.0,116.0,,,29613.0,C0733-011-305-001,MRIPasteur-128
4,96,1,Male,Patient,Proband,No,Yes,2008-05-16T13:45:57,4.9,Intera,...,WIP,1.0,94.0,94.0,61.0,,,,AU-RD-GAR-293-004,MRIPasteur-046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,1472,1,Female,Patient,Proband,No,Yes,2020-01-23T13:59:53,4.1,Ingenia_3T,...,,,,,,,,,,
786,1474,1,Male,Patient,Proband,No,Yes,2020-01-27T08:41:04,7.1,Ingenia_3T,...,,,,,,,,,,
787,1477,1,Male,Patient,Proband,No,Yes,2020-01-28T15:28:10,5.0,Ingenia_3T,...,,,,,,,,,,
788,1478,1,Female,Patient,Proband,No,Yes,2020-02-03T08:42:28,8.0,Ingenia_3T,...,,,,,,,,,,


### Merge volume / area / ct

In [161]:
df_tsa = area.merge(ct, on = 'Subject')

In [162]:
df_tsa = df_tsa.merge(vol_tsa, on = 'Subject')

In [163]:
participants = []
for i in df_tsa['Subject']:
    participants.append(int(i.split('-')[1].split('_')[0]))
    
df_tsa['participant_id'] = participants

In [164]:
df_tsa = df_tsa.merge(df_RDB, on = 'participant_id')

In [165]:
df_tsa.columns = df_tsa.columns.str.replace('-', '_')

In [170]:
df_tsa = df_tsa.drop_duplicates(subset='participant_id', keep = 'first')

## Export TSA

In [171]:
df_tsa.to_csv(os.path.join(data_dir, 'TSA_vol/Outputs/df_TSA.csv'), index = False)

## Export TSA + TCA

In [113]:
data_dir

'/Users/ltran/Documents/Data/'

In [115]:
df_tsa_tca = pd.concat([df_tsa, df_TCA])


In [771]:
df_tsa_tca.groupby('ASD').size()

ASD
No            135
Relative      203
TCA           110
TCA_Autism      5
Yes           437
dtype: int64

In [121]:
df_tsa_tca = df_tsa_tca.drop_duplicates(subset='participant_id', keep = 'first')

In [124]:
df_tsa_tca.to_csv(os.path.join(data_dir, 'Outputs/df_TCA_TSA.csv'), index = False)