In [1083]:
import os
os.chdir('/Users/ltran/Documents/TrueData29/CPC_ML_tutorial/')

import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
import seaborn as sns
import random
import joypy
from matplotlib import cm

from scipy.stats import fisher_exact
from pcntoolkit.normative import estimate, predict, evaluate
from pcntoolkit.util.utils import compute_MSLL, create_design_matrix
from nm_utils import calibration_descriptives, remove_bad_subjects, load_2d
from sklearn.model_selection import train_test_split

# Set working directory
root_dir = '/Users/ltran/Documents/TrueData0104/CPC_ML_tutorial/'
out_dir = os.path.join(root_dir,'models','test')

# create the output directory if it does not already exist
os.makedirs(out_dir, exist_ok=True)

# Load TCA

In [1084]:
data_dir = '/Users/ltran/Documents/Data/'


## Compute BMI

In [1377]:
pheno = pd.read_csv(os.path.join(data_dir, 'TCA_vol/MRI_RDB_TCA_20220629_anonym.tsv'), sep = '\t')

In [1378]:
#Format height in meters

l_height = []
for i in pheno['size_at_scan']:
    if (i>100):
        l_height.append(i/100)
    else:
        l_height.append(i)

pheno['height_m'] = l_height
pheno = pheno.rename(columns={'subject_id' : 'participant_id'})

In [1379]:
# Compute BMI 

pheno['BMI'] = pheno['weight_at_scan'] / (pheno['height_m'] ** 2)

# Merge Volume, Area, CT

In [1561]:
# Load ISO volume
vol_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/volumes-fs-iso.txt'), sep = '\t')

# Load TFE volume
vol_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/volumes-fs-tfe.txt'), sep = '\t')

In [1562]:
vol_tca = pd.concat([vol_tfe, vol_iso])

In [1563]:
lh_area_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.lh.area.csv'), sep = '\t') 
rh_area_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.rh.area.csv'), sep = '\t') 


lh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.lh.area.csv'), sep = '\t') 
rh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.rh.area.csv'), sep = '\t') 

area_iso = lh_area_iso.merge(rh_area_iso.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

area_tfe = lh_area_tfe.merge(rh_area_tfe.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

In [1564]:
area = pd.concat([area_iso, area_tfe])
area = area.rename(columns= {'lh.aparc.area' : 'Subject'})

In [1565]:
lh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.lh.thickness.csv'), sep = '\t') 
rh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-iso.rh.thickness.csv'), sep = '\t') 

ct_iso = lh_ct_iso.merge(rh_ct_iso.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), 
               on = 'lh.aparc.thickness')


lh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.lh.thickness.csv'), sep = '\t') 
rh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TCA_vol/fs-7.2.0-tfe.rh.thickness.csv'), sep = '\t') 

ct_tfe = lh_ct_tfe.merge(rh_ct_tfe.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), 
               on = 'lh.aparc.thickness')


ct = pd.concat([ct_iso, ct_tfe])
ct = ct.rename(columns= {'lh.aparc.thickness' : 'Subject'})

In [1566]:
df_TCA = area.merge(vol_tca, on = 'Subject').merge(ct, on='Subject')

In [1567]:
l_pi = []
for i in df_TCA['Subject']:
    l_pi.append(int(i.split('-')[1].split('_')[0]))

df_TCA['participant_id'] = l_pi

In [1568]:
df_TCA = df_TCA.merge(pheno.rename(columns={'subject_id':'participant_id'}), on = 'participant_id')

In [1569]:
df_TCA['sex'] = df_TCA['Sex'].replace({'Female' : -1, 'Male' : 1})

In [1570]:
# df_TCA = df_TCA.drop_duplicates(subset='participant_id')
df_TCA = df_TCA.dropna(subset='Sex')

In [1571]:
df_TCA.columns = df_TCA.columns.str.replace('-', '_')

In [1572]:
df_TCA['ASD'] = 'TCA'
df_TCA.loc[df_TCA[df_TCA.TSA == 1].index, 'ASD'] = 'TCA_Autism'

In [1573]:
df_TCA = df_TCA.drop_duplicates(subset='Subject', keep = 'first')

In [1574]:
df_TCA['participant_id'] = df_TCA['participant_id'].astype(str)+'_TCA'

In [1575]:
df_TCA['size_at_scan'] = df_TCA['size_at_scan'].replace(0.92, 92)

In [1576]:
df_TCA = df_TCA.dropna(subset='BMI')

In [1577]:
df_TCA['Month'] = df_TCA['age_at_scan']*12

In [1578]:
df_TCA.sex = df_TCA.sex.replace(-1, 2)

### Export dataframe

In [1579]:
data_dir

'/Users/ltran/Documents/Data/'

In [1580]:
df_TCA.to_csv(os.path.join(data_dir, 'TCA_vol/Outputs/df_TCA.csv'), index = False)

________________

# Load TSA

In [1581]:
data_dir

'/Users/ltran/Documents/Data/'

## Load infos

In [1582]:
infos = pd.read_csv(os.path.join(data_dir, 'TSA_vol/participants.tsv'), sep = '\t')

In [1583]:
infos_TSA = pd.read_csv(os.path.join(data_dir, 'TSA_vol/TSA_cohort_dcm_info_anonymized_1.csv'), sep = ';')
infos_TSA = infos_TSA.rename(columns={'SubCode BIDS' : 'participant_id'})

In [1584]:
infos_TSA =infos_TSA.drop_duplicates(subset='participant_id', keep = 'first')

In [1585]:
infos_TSA['participant_id'] = infos_TSA['participant_id'].astype(str)+'_ses01'



In [1586]:
l_dup = infos_TSA[infos_TSA.duplicated(subset='participant_id')].participant_id


In [1587]:

all_ses02 = []
for i in l_dup: 
    temp = infos_TSA[infos_TSA.participant_id == i].reset_index(drop = True)
    if temp.loc[0,'acquisition_date'] > temp.loc[1,'acquisition_date']:
        temp.loc[0, 'participant_id'] = temp.loc[0, 'participant_id'].replace('_ses01', '_ses02')
    elif temp.loc[0,'acquisition_date'] < temp.loc[1,'acquisition_date']:
        temp.loc[1, 'participant_id'] = temp.loc[1, 'participant_id'].replace('_ses01', '_ses02')

    all_ses02.append(temp)
    
df_ses02 = pd.concat(all_ses02)

ValueError: No objects to concatenate

In [1588]:
infos_TSA = df_ses02.merge(infos_TSA, on = ['patient_birth_date', 'patient_sex', 'patient_weight',
       'acquisition_date', 'manufacturer', 'manufacturer_model_name',
       'magnetic_field_strength'], how = 'left', suffixes=('', 'y')).drop(columns='participant_idy')

## Load QC

In [1589]:
QCtrio = pd.read_csv(os.path.join(data_dir, 'QC_trio.csv'), sep = ',')

In [1590]:
# Remove missing scans
QC_trio = QCtrio.dropna(subset = ['QC'], how = 'all')

In [1591]:
QC_trio['Protocole'] = QC_trio['Protocole'].replace({'isotrope' : 'ISO', 'tfe' : 'TFE'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QC_trio['Protocole'] = QC_trio['Protocole'].replace({'isotrope' : 'ISO', 'tfe' : 'TFE'})


In [1592]:
QC_trio = QC_trio[~((QC_trio.QC == 'g') | (QC_trio.QC == 'F'))]
# QC_trio = QC_trio.dropna(subset = 'BIDS')

In [1593]:
l_bids_dup = QC_trio[QC_trio.duplicated(subset = 'BIDS')].BIDS.to_list()

In [1429]:
bestVbrain = []
for i in l_bids_dup:
    if QC_trio[QC_trio.BIDS == i]['best Vbrain'].isna().all():
        temp = QC_trio[(QC_trio.BIDS == i) & (QC_trio['Protocole'] == 'TFE')]
    else:
        temp = QC_trio[(QC_trio.BIDS == i) & (QC_trio['best Vbrain'] == 1)]
        
    bestVbrain.append(temp)
    
bestVbrain = pd.concat(bestVbrain)


In [1434]:
bestVbrain = bestVbrain.drop_duplicates('BIDS', keep = 'first')

In [1435]:
QC_trio = QC_trio[~(QC_trio.BIDS.isin(l_bids_dup))]

In [1550]:
QC = pd.concat([QC_trio, bestVbrain])


In [1129]:
QC.BIDS = QC.BIDS.str.replace('Sub', 'sub')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QC.BIDS = QC.BIDS.str.replace('Sub', 'sub')


In [1440]:
QC_TFE = QC[QC['Protocole'] == 'TFE']

In [1441]:
QC_ISO = QC[QC['Protocole'] == 'ISO']

Load infos 

In [1594]:
data_dir = '/Users/ltran/Documents/Data/'
isDir = os.path.isdir('/Users/ltran/Documents/Data/TSA_vol/Outputs/')
if isDir:
    ! rm -r '/Users/ltran/Documents/Data/TSA_vol/Outputs/'
! mkdir '/Users/ltran/Documents/Data/TSA_vol/Outputs/'

In [1595]:
infos_TSA = pd.read_csv(os.path.join(data_dir, 'TSA_vol/TSA_cohort_dcm_info_anonymized_1.csv'), sep = ';')
infos_TSA = infos_TSA.rename(columns={'SubCode BIDS' : 'participant_id'})

In [1596]:
infos_TSA = infos_TSA.drop_duplicates(subset='participant_id', keep = 'first')

In [1598]:

# Load participants infos (age, sex, diagnosis, scores...)
new = pd.read_csv(os.path.join(data_dir, 'participants.tsv'), sep = "\t")

In [1599]:
# Merge instruments with demography
new = new.merge(infos_TSA, on = 'participant_id')

In [1600]:
# Drop NA in sex, age, diag
new = new[~(new.ASD == '?')].dropna(subset = ['sex', 'age_at_first_scan', 'ASD'])

In [1601]:
# Drop duplicates

new = new.drop_duplicates(subset = 'participant_id', keep = 'first')

In [1602]:
new.sex = new.sex.replace({'M' : 1, 'F' : -1})

In [1603]:
new = new.rename(columns={'age_at_first_scan' : 'age_at_scan'}) 

In [1604]:
new.loc[(new.group == 'Relative') & (new.ASD == 'No'), 'ASD'] = 'Relative'

Volumes

In [1605]:
# Load ISO volume
vol_tsa_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/volumes-fs-iso.txt'), sep = '\t')

# Load TFE volume
vol_tsa_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/volumes-fs-tfe.txt'), sep = '\t')

In [1606]:
# Convert Subject to BIDS format
def fitler_qc_iso_tfe(vol_tsa_iso, vol_tsa_tfe, colname):
    bids = []
    for i in vol_tsa_iso[colname]:
        bids.append(i.split('_')[0])
    vol_tsa_iso['BIDS'] = bids


    bids = []
    for i in vol_tsa_tfe[colname]:
        bids.append(i.split('_')[0])
    vol_tsa_tfe['BIDS'] = bids


    vol_tsa = pd.concat([vol_tsa_iso.merge(QC_ISO, on = 'BIDS'), vol_tsa_tfe.merge(QC_TFE, on = 'BIDS')])
    
    return vol_tsa

In [1607]:
vol_tsa = fitler_qc_iso_tfe(vol_tsa_iso, vol_tsa_tfe, 'Subject')

Surface area

In [1608]:
rh_area_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.rh.area.csv'), sep = '\t')
lh_area_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.lh.area.csv'), sep = '\t')
rh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.rh.area.csv'), sep = '\t')
lh_area_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.lh.area.csv'), sep = '\t')

In [1609]:
area_iso = lh_area_iso.merge(rh_area_iso.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

In [1610]:
area_tfe = lh_area_tfe.merge(rh_area_tfe.rename(columns={'rh.aparc.area': 'lh.aparc.area'}), on ='lh.aparc.area')

In [1611]:
area = fitler_qc_iso_tfe(area_iso, area_tfe, 'lh.aparc.area').rename(columns = {'lh.aparc.area':'Subject'})

Cortical thickness

In [1612]:
rh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.rh.thickness.csv'), sep = '\t')
lh_ct_iso = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-iso.lh.thickness.csv'), sep = '\t')
rh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.rh.thickness.csv'), sep = '\t')
lh_ct_tfe = pd.read_csv(os.path.join(data_dir, 'TSA_vol/fs-7.2.0-tfe.lh.thickness.csv'), sep = '\t')

In [1613]:
ct_iso = lh_ct_iso.merge(rh_ct_iso.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), on ='lh.aparc.thickness')

In [1614]:
ct_tfe = lh_ct_tfe.merge(rh_ct_tfe.rename(columns={'rh.aparc.thickness': 'lh.aparc.thickness'}), on ='lh.aparc.thickness')

In [1615]:
ct = fitler_qc_iso_tfe(ct_iso, ct_tfe, 'lh.aparc.thickness').rename(columns = {'lh.aparc.thickness':'Subject'})

### Merge volume / area / ct

In [1616]:
df_tsa = area.merge(ct, on = 'Subject').merge(vol_tsa, on = 'Subject')

In [1617]:
participants = []
for i in df_tsa['Subject']:
    participants.append(int(i.split('-')[1].split('_')[0]))
    
df_tsa['participant_id'] = participants

In [1618]:
df_tsa = df_tsa.drop_duplicates(subset = 'participant_id', keep = 'first')

In [1619]:
df_tsa = df_tsa.merge(new, on  = 'participant_id')

In [1620]:
df_tsa.columns = df_tsa.columns.str.replace('-', '_')

## Export TSA

In [1621]:
df_tsa.to_csv(os.path.join(data_dir, 'TSA_vol/Outputs/df_TSA.csv'), index = False)

## Export TSA + TCA

In [409]:
data_dir

'/Users/ltran/Documents/Data/'

In [413]:
df_tsa_tca = pd.concat([df_tsa, df_TCA])


In [416]:
df_tsa_tca.participant_id.to_csv(os.path.join(data_dir, 'Outputs/partcipant_id.csv'))

In [417]:
df_tsa_tca

Unnamed: 0,Subject,lh_bankssts_area,lh_caudalanteriorcingulate_area,lh_caudalmiddlefrontal_area,lh_cuneus_area,lh_entorhinal_area,lh_fusiform_area,lh_inferiorparietal_area,lh_inferiortemporal_area,lh_isthmuscingulate_area,...,magnetic_field_strength,principal_diag,TSA,delay_before_first_mri,weight_at_scan,size_at_scan,dwi_b1000,dwi_b1500,height_m,BMI
0,sub-0003_ses-01,1226.0,622.0,2909.0,1490.0,442.0,3107.0,4961.0,3823.0,938.0,...,,,,,,,,,,
2,sub-0025_ses-01,1067.0,526.0,2537.0,1772.0,394.0,3310.0,5077.0,3188.0,1062.0,...,,,,,,,,,,
4,sub-0070_ses-01,857.0,803.0,2087.0,1713.0,547.0,3153.0,5148.0,3845.0,1115.0,...,,,,,,,,,,
6,sub-0095_ses-01,1015.0,1071.0,2185.0,1622.0,358.0,3018.0,4830.0,3400.0,1202.0,...,,,,,,,,,,
8,sub-0096_ses-01,1495.0,550.0,2502.0,1852.0,606.0,4075.0,5700.0,3895.0,1016.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,sub-0182_ses-01,896.0,598.0,2109.0,1482.0,289.0,2859.0,4906.0,3639.0,810.0,...,3,2.0,0.0,,34.6,159.9,0.0,0.0,1.599,13.532535
113,sub-0183_ses-01,929.0,533.0,2045.0,1174.0,385.0,2513.0,4454.0,3271.0,951.0,...,3,1.0,0.0,,38.6,157.0,0.0,0.0,1.570,15.659864
114,sub-0184_ses-01,1306.0,555.0,2289.0,1495.0,323.0,3256.0,4454.0,3821.0,1050.0,...,3,1.0,0.0,,24.0,142.0,0.0,0.0,1.420,11.902400
115,sub-0185_ses-01,1036.0,621.0,2645.0,1450.0,541.0,3349.0,5397.0,4132.0,1139.0,...,3,1.0,0.0,,28.2,140.8,0.0,0.0,1.408,14.224722


In [412]:
adf_tsa_tca.groupby('ASD').size()

ASD
No            337
TCA           110
TCA_Autism      5
Yes           446
dtype: int64