In [None]:
import csv
import os
import random
import pickle
import gc
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import sklearn
import warnings
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats
from scipy.stats import spearmanr

# Stack dwMRI: Parcellations (structural matrices) + IDPs (TBSS/Probabilistic tractography)

In [None]:
# Names of the dwMRI first-level feature sets whose g predictions are stacked.
# Each name is the file-name prefix of a first-level prediction CSV.

# IDP sets: every diffusion metric crossed with the two extraction methods
# named in the section heading (probabilistic tractography and TBSS).
dti_all_modalities = [
    f'{metric}_{method}'
    for metric in ('fa', 'icvf', 'isovf', 'l1', 'l2', 'l3', 'md', 'mo', 'od')
    for method in ('prob', 'tbss')
]

# Structural-connectome sets: four edge measures for each numeric prefix
# 31020..31025 (presumably one prefix per parcellation — TODO confirm).
dti_all_modalities += [
    f'{prefix}_connectome_{measure}'
    for prefix in range(31020, 31026)
    for measure in ('fa', 'mean_length', 'sift2', 'streamline_count')
]

# The streamline-count set of prefix 31025 is the only name carrying a '_10M' suffix.
dti_all_modalities[-1] += '_10M'

In [None]:
# Sanity check: the dwMRI list holds 42 feature sets (18 IDP + 24 connectome).
len(dti_all_modalities)

In [None]:
# Build the stacked (second-level) feature tables for the dwMRI feature sets.
# For every fold, the first-level g predictions of all feature sets are joined
# on 'eid' into one wide table (one column per feature set) and written to disk,
# once as-is and once restricted to the eids present in the fold's g-factor file.
folds = ["0", "1", "2", "3", "4"]
warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:

    print(f'Started {fold}', flush=True)
    all_modalities_train = []
    all_modalities_test = []

    for modality in dti_all_modalities:
        # Load one feature set's predictions; the prediction column is renamed
        # to the feature-set name so columns stay distinguishable after merging.
        train_pred = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_train_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted train': modality})
        )
        print('g TRAIN shape BEFORE stacking:', train_pred.shape)

        test_pred = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_test_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted test': modality})
        )
        print('g TEST shape BEFORE stacking:', test_pred.shape)

        all_modalities_train.append(train_pred)
        all_modalities_test.append(test_pred)

    # --- train: inner joins keep only eids present in every feature set ---
    features_train_level1 = all_modalities_train[0]
    for next_frame in all_modalities_train[1:]:
        features_train_level1 = features_train_level1.merge(next_frame, on='eid', how='inner')
    print('g TRAIN shape AFTER stacking:', features_train_level1.shape)

    features_train_level1.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/dti_all/features_train_level1_fold_{fold}.csv', index=False)
    # Merging with the g-factor file filters to its eids; 'g' itself is dropped again.
    g_train = pd.read_csv(f'/PLS/g_factor/g_train_with_id_fold_{fold}.csv')
    features_train_level1_g_matched = features_train_level1.merge(g_train, on='eid').drop(columns='g')
    features_train_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/dti_all/features_train_level1_g_matched_fold_{fold}.csv', index=False)

    # --- test: same procedure ---
    features_test_level1 = all_modalities_test[0]
    for next_frame in all_modalities_test[1:]:
        features_test_level1 = features_test_level1.merge(next_frame, on='eid', how='inner')
    print('g TEST shape AFTER stacking:', features_test_level1.shape)

    features_test_level1.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/dti_all/features_test_level1_fold_{fold}.csv', index=False)
    g_test = pd.read_csv(f'/PLS/g_factor/g_test_with_id_fold_{fold}.csv')
    features_test_level1_g_matched = features_test_level1.merge(g_test, on='eid').drop(columns='g')
    features_test_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/dti_all/features_test_level1_g_matched_fold_{fold}.csv', index=False)

In [None]:
# Collect the stacked-model results of every algorithm and fold for the dwMRI
# stack into one table, clean the text columns, save it and display it.
algorithms = ['svr', 'eNet', 'xgb', 'rf']
folds = ["0", "1", "2", "3", "4"]

stack_folds_all_dti = []
for algorithm in algorithms:
    for fold in folds:
        # The result files have no header row, so column names are assigned here.
        model = pd.read_csv(f'/PLS/brain/stacking/result/DTI_All_{algorithm}_stacked_result_fold_{fold}.csv', header=None)
        model.columns = ['Algorithm', 'Fold', 'Best parameters', 'MSE', 'MAE', 'R2', 'Pearson r']
        stack_folds_all_dti.append(model)

# Concatenate once, after all files are read (the original re-concatenated the
# growing list on every iteration). ignore_index=True gives every row a unique
# label instead of repeating each file's own row labels.
stack_all_dti = pd.concat(stack_folds_all_dti, ignore_index=True)

# 'Pearson r' holds the text form of the SciPy result, e.g.
# "PearsonRResult(statistic=0.31, pvalue=1e-12)". Strip the wrapper so that only
# "r, p" remains; the optional "np.float64(" wrapper printed by NumPy >= 2 is
# stripped as well.
stack_all_dti['Pearson r'] = stack_all_dti['Pearson r'].astype(str).str.replace(
    r'PearsonRResult\(statistic=|pvalue=|np\.float64\(|\)', '', regex=True)
stack_all_dti[['Pearson r', 'p-value']] = (
    stack_all_dti['Pearson r'].str.split(',', expand=True).astype(float).round(decimals=3)
)

# Remove quotes and braces from the parameter-dict text. An explicit character
# class with regex=True does not depend on the pandas-version default of `regex`.
stack_all_dti['Best parameters'] = stack_all_dti['Best parameters'].str.replace(r"['{}]", "", regex=True)

stack_all_dti.round(3).to_csv('/PLS/brain/stacking/DTI_All_stacked_five_folds.csv', index=False)
stack_all_dti.round(3)

In [None]:
# Mean of each performance metric per algorithm over all loaded result rows,
# rounded to three decimals and shown best-first by Pearson r.
stack_all_dti_mean_dti = (
    stack_all_dti
    .groupby('Algorithm')[['MSE', 'MAE', 'R2', 'Pearson r']]
    .mean()
    .round(3)
    .reset_index()
)
stack_all_dti_mean_dti.sort_values(by='Pearson r', ascending=False)

# Stack rsMRI: Parcellations (full correlation) + IDPs (tangent + amplitudes)

## RS IDP best + FullCorr atlases matched to DTI

- aparc_Tian_s1
- aparc_2009_Tian_s1
- Glasser_Tian_s1
- Glasser_Tian_s4
- Schaefer_7n200_Tian_s1
- Schaefer_7n500_Tian_s4

In [None]:
# Names of the rsMRI first-level feature sets whose g predictions are stacked.
rs_idp_rs_best_metrics = [
    # RS TS
    # Full corr: one full-correlation set per listed atlas/subcortical-scale combination
    *(
        f'{atlas}_full_correlation'
        for atlas in (
            'aparc_s1', 'aparc_2009_s1',
            'glasser_s1', 'glasser_s4',
            'Schaefer7n200p_s1', 'Schaefer7n500p_s4',
        )
    ),
    # RS IDP: amplitudes and tangent matrices, each in a 21 and a 55 variant
    *(
        f'{kind}_{size}'
        for kind in ('amplitudes', 'tangent_matrices')
        for size in (21, 55)
    ),
]

In [None]:
# Sanity check: the rsMRI list holds 10 feature sets (6 full-correlation + 4 IDP).
len(rs_idp_rs_best_metrics)

In [None]:
# Build the stacked (second-level) feature tables for the rsMRI feature sets.
# For every fold, the first-level g predictions of all feature sets are joined
# on 'eid' into one wide table (one column per feature set) and written to disk,
# once as-is and once restricted to the eids present in the fold's g-factor file.
# Missing-value counts are printed at each stage as a check.
folds = ["0", "1", "2", "3", "4"]
warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:

    print(f'Started {fold}', flush=True)
    all_modalities_train = []
    all_modalities_test = []

    for modality in rs_idp_rs_best_metrics:
        # Load one feature set's predictions; the prediction column is renamed
        # to the feature-set name so columns stay distinguishable after merging.
        train_pred = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_train_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted train': modality})
        )
        print('g TRAIN shape BEFORE stacking:', train_pred.shape)
        print(train_pred.isna().sum().sort_values(ascending=False))

        test_pred = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_test_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted test': modality})
        )
        print('g TEST shape BEFORE stacking:', test_pred.shape)
        print(test_pred.isna().sum().sort_values(ascending=False))

        all_modalities_train.append(train_pred)
        all_modalities_test.append(test_pred)

    # --- train: inner joins keep only eids present in every feature set ---
    features_train_level1 = all_modalities_train[0]
    for next_frame in all_modalities_train[1:]:
        features_train_level1 = features_train_level1.merge(next_frame, on='eid', how='inner')
    print('g TRAIN shape AFTER stacking:', features_train_level1.shape)

    features_train_level1.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/rs_idp_ts_best_metrics/features_train_level1_fold_{fold}.csv', index=False)
    # Merging with the g-factor file filters to its eids; 'g' itself is dropped again.
    g_train = pd.read_csv(f'/PLS/g_factor/g_train_with_id_fold_{fold}.csv')
    features_train_level1_g_matched = features_train_level1.merge(g_train, on='eid').drop(columns='g')
    features_train_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/rs_idp_ts_best_metrics/features_train_level1_g_matched_fold_{fold}.csv', index=False)
    print(features_train_level1_g_matched.isna().sum().sort_values(ascending=False))

    # --- test: same procedure ---
    features_test_level1 = all_modalities_test[0]
    for next_frame in all_modalities_test[1:]:
        features_test_level1 = features_test_level1.merge(next_frame, on='eid', how='inner')
    print('g TEST shape AFTER stacking:', features_test_level1.shape)

    features_test_level1.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/rs_idp_ts_best_metrics/features_test_level1_fold_{fold}.csv', index=False)
    g_test = pd.read_csv(f'/PLS/g_factor/g_test_with_id_fold_{fold}.csv')
    features_test_level1_g_matched = features_test_level1.merge(g_test, on='eid').drop(columns='g')
    features_test_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/rs_idp_ts_best_metrics/features_test_level1_g_matched_fold_{fold}.csv', index=False)
    print(features_test_level1_g_matched.isna().sum().sort_values(ascending=False))

In [None]:
# Collect the stacked-model results of every algorithm and fold for the rsMRI
# stack into one table, clean the text columns, save it and display it.
algorithms = ['xgb', 'eNet', 'rf', 'svr']
folds = ["0", "1", "2", "3", "4"]

stack_folds_all = []
for algorithm in algorithms:
    for fold in folds:
        # The result files have no header row, so column names are assigned here.
        model = pd.read_csv(f'/PLS/brain/stacking/result/RS_IDP_Timeseries_best_metrics_{algorithm}_stacked_result_fold_{fold}.csv', header=None)
        model.columns = ['Algorithm', 'Fold', 'Best parameters', 'MSE', 'MAE', 'R2', 'Pearson r']
        stack_folds_all.append(model)

# Concatenate once, after all files are read (the original re-concatenated the
# growing list on every iteration). ignore_index=True gives every row a unique
# label instead of repeating each file's own row labels.
stack_all = pd.concat(stack_folds_all, ignore_index=True)

# 'Pearson r' holds the text form of the SciPy result, e.g.
# "PearsonRResult(statistic=0.31, pvalue=1e-12)". Strip the wrapper so that only
# "r, p" remains; the optional "np.float64(" wrapper printed by NumPy >= 2 is
# stripped as well.
stack_all['Pearson r'] = stack_all['Pearson r'].astype(str).str.replace(
    r'PearsonRResult\(statistic=|pvalue=|np\.float64\(|\)', '', regex=True)
stack_all[['Pearson r', 'p-value']] = (
    stack_all['Pearson r'].str.split(',', expand=True).astype(float).round(decimals=3)
)

# Remove quotes and braces from the parameter-dict text. An explicit character
# class with regex=True does not depend on the pandas-version default of `regex`.
stack_all['Best parameters'] = stack_all['Best parameters'].str.replace(r"['{}]", "", regex=True)

stack_all.round(3).to_csv('/PLS/brain/stacking/RS_IDP_Timeseries_best_metrics_stacked_five_folds.csv', index=False)
stack_all.round(3)

In [None]:
# Mean of each performance metric per algorithm over all loaded result rows,
# rounded to three decimals and shown best-first by R2.
# NOTE: reads `stack_all`, which later sections reassign — run directly after
# the rsMRI results cell.
stack_all_mean_rs = (
    stack_all
    .groupby('Algorithm')[['MSE', 'MAE', 'R2', 'Pearson r']]
    .mean()
    .round(3)
    .reset_index()
)
stack_all_mean_rs.sort_values(by='R2', ascending=False)

# Stack sMRI: T1w + whole-brain T1w/T2w

In [None]:
# Names of the sMRI first-level feature sets whose g predictions are stacked:
# nineteen 'struct_*' sets followed by the whole-brain T1w/T2w set.
modalities_struct = [
    f'struct_{suffix}'
    for suffix in (
        'fast', 'sub_first',
        'aseg_mean_intensity', 'aseg_volume',
        'ba_exvivo_area', 'ba_exvivo_mean_thickness', 'ba_exvivo_volume',
        'a2009s_area', 'a2009s_mean_thickness', 'a2009s_volume',
        'dkt_area', 'dkt_mean_thickness', 'dkt_volume',
        'desikan_gw', 'desikan_pial',
        'desikan_white_area', 'desikan_white_mean_thickness', 'desikan_white_volume',
        'subsegmentation',
    )
]
modalities_struct.append('T1_T2_whole_brain')

In [None]:
# Sanity check: the sMRI list holds 20 feature sets (19 'struct_*' + 'T1_T2_whole_brain').
len(modalities_struct)

In [None]:
# Build the stacked (second-level) feature tables for the sMRI feature sets.
# For every fold, the first-level g predictions of all feature sets are joined
# on 'eid' into one wide table (one column per feature set) and written to disk,
# once as-is and once restricted to the eids present in the fold's g-factor file.
# Missing-value counts are printed at each stage as a check.
folds = ["0", "1", "2", "3", "4"]
warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:

    print(f'Started {fold}', flush=True)
    all_modalities_train = []
    all_modalities_test = []

    for modality in modalities_struct:
        # Load one feature set's predictions; the prediction column is renamed
        # to the feature-set name so columns stay distinguishable after merging.
        train_pred = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_train_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted train': modality})
        )
        print('g TRAIN shape BEFORE stacking:', train_pred.shape)
        print(train_pred.isna().sum().sort_values(ascending=False))

        test_pred = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_test_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted test': modality})
        )
        print('g TEST shape BEFORE stacking:', test_pred.shape)
        print(test_pred.isna().sum().sort_values(ascending=False))

        all_modalities_train.append(train_pred)
        all_modalities_test.append(test_pred)

    # --- train: inner joins keep only eids present in every feature set ---
    features_train_level1 = all_modalities_train[0]
    for next_frame in all_modalities_train[1:]:
        features_train_level1 = features_train_level1.merge(next_frame, on='eid', how='inner')
    print('g TRAIN shape AFTER stacking:', features_train_level1.shape)

    features_train_level1.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/t1_t2_struct/features_train_level1_fold_{fold}.csv', index=False)
    # Merging with the g-factor file filters to its eids; 'g' itself is dropped again.
    g_train = pd.read_csv(f'/PLS/g_factor/g_train_with_id_fold_{fold}.csv')
    features_train_level1_g_matched = features_train_level1.merge(g_train, on='eid').drop(columns='g')
    features_train_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/t1_t2_struct/features_train_level1_g_matched_fold_{fold}.csv', index=False)
    print(features_train_level1_g_matched.isna().sum().sort_values(ascending=False))

    # --- test: same procedure ---
    features_test_level1 = all_modalities_test[0]
    for next_frame in all_modalities_test[1:]:
        features_test_level1 = features_test_level1.merge(next_frame, on='eid', how='inner')
    print('g TEST shape AFTER stacking:', features_test_level1.shape)

    features_test_level1.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/t1_t2_struct/features_test_level1_fold_{fold}.csv', index=False)
    g_test = pd.read_csv(f'/PLS/g_factor/g_test_with_id_fold_{fold}.csv')
    features_test_level1_g_matched = features_test_level1.merge(g_test, on='eid').drop(columns='g')
    features_test_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/t1_t2_struct/features_test_level1_g_matched_fold_{fold}.csv', index=False)
    print(features_test_level1_g_matched.isna().sum().sort_values(ascending=False))

In [None]:
# Collect the stacked-model results of every algorithm and fold for the sMRI
# stack into one table, clean the text columns, save it and display it.
algorithms = ['xgb', 'eNet', 'rf', 'svr']
folds = ["0", "1", "2", "3", "4"]

stack_folds_all = []
for algorithm in algorithms:
    for fold in folds:
        # The result files have no header row, so column names are assigned here.
        model = pd.read_csv(f'/PLS/brain/stacking/result/T1_T2_whole_brain_{algorithm}_stacked_result_fold_{fold}.csv', header=None)
        model.columns = ['Algorithm', 'Fold', 'Best parameters', 'MSE', 'MAE', 'R2', 'Pearson r']
        stack_folds_all.append(model)

# Concatenate once, after all files are read (the original re-concatenated the
# growing list on every iteration). ignore_index=True gives every row a unique
# label instead of repeating each file's own row labels.
stack_all = pd.concat(stack_folds_all, ignore_index=True)

# 'Pearson r' holds the text form of the SciPy result, e.g.
# "PearsonRResult(statistic=0.31, pvalue=1e-12)". Strip the wrapper so that only
# "r, p" remains; the optional "np.float64(" wrapper printed by NumPy >= 2 is
# stripped as well.
stack_all['Pearson r'] = stack_all['Pearson r'].astype(str).str.replace(
    r'PearsonRResult\(statistic=|pvalue=|np\.float64\(|\)', '', regex=True)
stack_all[['Pearson r', 'p-value']] = (
    stack_all['Pearson r'].str.split(',', expand=True).astype(float).round(decimals=3)
)

# Remove quotes and braces from the parameter-dict text. An explicit character
# class with regex=True does not depend on the pandas-version default of `regex`.
stack_all['Best parameters'] = stack_all['Best parameters'].str.replace(r"['{}]", "", regex=True)

stack_all.round(3).to_csv('/PLS/brain/stacking/T1_T2_whole_brain_stacked_five_folds.csv', index=False)
stack_all.round(3)

In [None]:
# Mean of each performance metric per algorithm over all loaded result rows,
# rounded to three decimals and shown best-first by R2.
# NOTE: reads `stack_all`, which other sections reassign — run directly after
# the sMRI results cell.
stack_all_mean_t1t2 = (
    stack_all
    .groupby('Algorithm')[['MSE', 'MAE', 'R2', 'Pearson r']]
    .mean()
    .round(3)
    .reset_index()
)
stack_all_mean_t1t2.sort_values(by='R2', ascending=False)

# Stack all modalities

In [None]:
# Names of all first-level feature sets combined in the all-modality stack,
# in the order: rsMRI full correlation, rsMRI IDPs, dwMRI IDPs, sMRI, dwMRI connectomes.
mod_best = [
    # Full corr
    *(
        f'{atlas}_full_correlation'
        for atlas in (
            'aparc_s1', 'aparc_2009_s1',
            'glasser_s1', 'glasser_s4',
            'Schaefer7n200p_s1', 'Schaefer7n500p_s4',
        )
    ),

    # RS IDP
    *(
        f'{kind}_{size}'
        for kind in ('amplitudes', 'tangent_matrices')
        for size in (21, 55)
    ),

    # DTI IDP: every diffusion metric with both extraction methods
    *(
        f'{metric}_{method}'
        for metric in ('fa', 'icvf', 'isovf', 'l1', 'l2', 'l3', 'md', 'mo', 'od')
        for method in ('prob', 'tbss')
    ),

    # T1
    *(
        f'struct_{suffix}'
        for suffix in (
            'fast', 'sub_first',
            'aseg_mean_intensity', 'aseg_volume',
            'ba_exvivo_area', 'ba_exvivo_mean_thickness', 'ba_exvivo_volume',
            'a2009s_area', 'a2009s_mean_thickness', 'a2009s_volume',
            'dkt_area', 'dkt_mean_thickness', 'dkt_volume',
            'desikan_gw', 'desikan_pial',
            'desikan_white_area', 'desikan_white_mean_thickness', 'desikan_white_volume',
            'subsegmentation',
        )
    ),
    'T1_T2_whole_brain',

    # DTI Struct: four edge measures for each numeric prefix 31020..31025
    *(
        f'{prefix}_connectome_{measure}'
        for prefix in range(31020, 31026)
        for measure in ('fa', 'mean_length', 'sift2', 'streamline_count')
    ),
]

# The streamline-count set of prefix 31025 is the only name carrying a '_10M' suffix.
mod_best[-1] += '_10M'

In [None]:
# Build the stacked (second-level) feature tables for all modalities together.
# For every fold, the first-level g predictions of all feature sets are joined
# on 'eid' into one wide table (one column per feature set) and written to disk,
# once as-is and once restricted to the eids present in the fold's g-factor file.
folds = ["0", "1", "2", "3", "4"]
warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:

    print(f'Started {fold}', flush=True)
    all_modalities_train = []
    all_modalities_test = []

    for modality in mod_best:
        # Load one feature set's predictions; the prediction column is renamed
        # to the feature-set name so columns stay distinguishable after merging.
        train_pred = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_train_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted train': modality})
        )
        print('g TRAIN shape BEFORE stacking:', train_pred.shape)

        test_pred = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_test_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted test': modality})
        )
        print('g TEST shape BEFORE stacking:', test_pred.shape)

        all_modalities_train.append(train_pred)
        all_modalities_test.append(test_pred)

    # --- train: inner joins keep only eids present in every feature set ---
    features_train_level1 = all_modalities_train[0]
    for next_frame in all_modalities_train[1:]:
        features_train_level1 = features_train_level1.merge(next_frame, on='eid', how='inner')
    print('g TRAIN shape AFTER stacking:', features_train_level1.shape)

    # NOTE(review): only this train file is cast to float before writing, which
    # also converts the 'eid' column; the test file below and the g-matched files
    # are written without the cast — confirm whether this asymmetry is intended.
    features_train_level1.astype(float).to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/all_modalities/features_train_level1_fold_{fold}.csv', index=False)
    # Merging with the g-factor file filters to its eids; 'g' itself is dropped again.
    g_train = pd.read_csv(f'/PLS/g_factor/g_train_with_id_fold_{fold}.csv')
    features_train_level1_g_matched = features_train_level1.merge(g_train, on='eid').drop(columns='g')
    features_train_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/all_modalities/features_train_level1_g_matched_fold_{fold}.csv', index=False)

    # --- test: same procedure ---
    features_test_level1 = all_modalities_test[0]
    for next_frame in all_modalities_test[1:]:
        features_test_level1 = features_test_level1.merge(next_frame, on='eid', how='inner')
    print('g TEST shape AFTER stacking:', features_test_level1.shape)

    features_test_level1.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/all_modalities/features_test_level1_fold_{fold}.csv', index=False)
    g_test = pd.read_csv(f'/PLS/g_factor/g_test_with_id_fold_{fold}.csv')
    features_test_level1_g_matched = features_test_level1.merge(g_test, on='eid').drop(columns='g')
    features_test_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/all_modalities/features_test_level1_g_matched_fold_{fold}.csv', index=False)

In [None]:
# Collect the stacked-model results of every algorithm and fold for the
# all-modality stack into one table, clean the text columns, save it and display it.
algorithms = ['eNet', 'xgb', 'rf', 'svr']
folds = ["0", "1", "2", "3", "4"]

stack_folds_all = []
for algorithm in algorithms:
    for fold in folds:
        # The result files have no header row, so column names are assigned here.
        model = pd.read_csv(f'/PLS/brain/stacking/result/All_modalities_{algorithm}_stacked_result_fold_{fold}.csv', header=None)
        model.columns = ['Algorithm', 'Fold', 'Best parameters', 'MSE', 'MAE', 'R2', 'Pearson r']
        stack_folds_all.append(model)

# Concatenate once, after all files are read (the original re-concatenated the
# growing list on every iteration). ignore_index=True gives every row a unique
# label instead of repeating each file's own row labels.
stack_all = pd.concat(stack_folds_all, ignore_index=True)

# 'Pearson r' holds the text form of the SciPy result, e.g.
# "PearsonRResult(statistic=0.31, pvalue=1e-12)". Strip the wrapper so that only
# "r, p" remains; the optional "np.float64(" wrapper printed by NumPy >= 2 is
# stripped as well.
stack_all['Pearson r'] = stack_all['Pearson r'].astype(str).str.replace(
    r'PearsonRResult\(statistic=|pvalue=|np\.float64\(|\)', '', regex=True)
stack_all[['Pearson r', 'p-value']] = (
    stack_all['Pearson r'].str.split(',', expand=True).astype(float).round(decimals=3)
)

# Remove quotes and braces from the parameter-dict text in one pass (the original
# repeated the "{" replacement). An explicit character class with regex=True does
# not depend on the pandas-version default of `regex`.
stack_all['Best parameters'] = stack_all['Best parameters'].str.replace(r"['{}]", "", regex=True)

stack_all.round(3).to_csv('/PLS/brain/stacking/All_modalities_stacked_five_folds.csv', index=False)
stack_all.round(3)

In [None]:
# Mean of each performance metric per algorithm over all loaded result rows,
# rounded to three decimals and shown best-first by R2.
# NOTE: reads `stack_all`, which other sections reassign — run directly after
# the all-modality results cell.
stack_all_mean = (
    stack_all
    .groupby('Algorithm')[['MSE', 'MAE', 'R2', 'Pearson r']]
    .mean()
    .round(3)
    .reset_index()
)
stack_all_mean.sort_values(by='R2', ascending=False)