In [1]:
import csv
import os
import random
import pickle
import gc
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import sklearn
import warnings
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats
from scipy.stats import spearmanr

# Stack dwMRI: Parcellations (structural matrices) + IDPs (TBSS/Probabilistic tractography)

In [2]:
# First-level dwMRI feature sets whose g predictions are stacked in this section.

# IDP sets: every scalar-metric prefix paired with both extraction methods
# (probabilistic tractography and TBSS).
_dti_idp_sets = [
    f'{metric}_{method}'
    for metric in ('fa', 'icvf', 'isovf', 'l1', 'l2', 'l3', 'md', 'mo', 'od')
    for method in ('prob', 'tbss')
]

# Connectome sets: four measures for each of the numeric prefixes 31020-31025.
_dti_connectome_sets = [
    f'{prefix}_connectome_{measure}'
    for prefix in range(31020, 31026)
    for measure in ('fa', 'mean_length', 'sift2', 'streamline_count')
]
# Only the very last entry deviates from the naming pattern: it ends in `_10M`.
_dti_connectome_sets[-1] += '_10M'

dti_all_modalities = _dti_idp_sets + _dti_connectome_sets

In [3]:
# Sanity check: number of dwMRI feature sets that will be stacked
len(dti_all_modalities)

42

In [None]:
# Merge predicted g-factors
# For every fold: load each dwMRI feature set's first-level g predictions, inner-join
# them on `eid` into one table per split, and save each table twice - as merged, and
# restricted to the rows whose `eid` also appears in the fold's g-factor file.
folds = ["0", "1", "2", "3", "4"]
warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:

    print(f'Started {fold}', flush=True)
    all_modalities_train, all_modalities_test = [], []

    for modality in dti_all_modalities:

        # Name the prediction column after its feature set so it stays identifiable after merging
        g_train_pred_level1 = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_train_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted train': f'{modality}'})
        )
        print('g TRAIN shape BEFORE stacking:', g_train_pred_level1.shape)

        g_test_pred_level1 = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_test_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted test': f'{modality}'})
        )
        print('g TEST shape BEFORE stacking:', g_test_pred_level1.shape)

        all_modalities_train.append(g_train_pred_level1)
        all_modalities_test.append(g_test_pred_level1)

    # --- Train split: start from the first table and join the remaining ones one by one ---
    features_train_level1 = all_modalities_train[0]
    for next_train in all_modalities_train[1:]:
        features_train_level1 = pd.merge(features_train_level1, next_train, on='eid', how='inner')
    print('g TRAIN shape AFTER stacking:', features_train_level1.shape)

    features_train_level1.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/dti_all/features_train_level1_fold_{fold}.csv', index=False)
    # The g file is used only to select rows; its `g` column is dropped again
    g_train = pd.read_csv(f'/PLS/g_factor/g_train_with_id_fold_{fold}.csv')
    features_train_level1_g_matched = features_train_level1.merge(g_train, on='eid').drop(columns='g')
    features_train_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/dti_all/features_train_level1_g_matched_fold_{fold}.csv', index=False)

    # --- Test split: same procedure ---
    features_test_level1 = all_modalities_test[0]
    for next_test in all_modalities_test[1:]:
        features_test_level1 = pd.merge(features_test_level1, next_test, on='eid', how='inner')
    print('g TEST shape AFTER stacking:', features_test_level1.shape)

    features_test_level1.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/dti_all/features_test_level1_fold_{fold}.csv', index=False)
    g_test = pd.read_csv(f'/PLS/g_factor/g_test_with_id_fold_{fold}.csv')
    features_test_level1_g_matched = features_test_level1.merge(g_test, on='eid').drop(columns='g')
    features_test_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/dti_all/features_test_level1_g_matched_fold_{fold}.csv', index=False)

In [None]:
# Display results for all algorithms and folds
# Reads one headerless result file per (algorithm, fold), stacks them into a single
# table, tidies the text columns, saves the table and displays it.
algorithms = ['svr', 'eNet', 'xgb', 'rf']
folds = ["0", "1", "2", "3", "4"]

stack_folds_all_dti = []
for algorithm in algorithms:
    for fold in folds:
        # The result files carry no header row, so column names are assigned here
        model = pd.read_csv(f'/PLS/brain/stacking/result/DTI_All_{algorithm}_stacked_result_fold_{fold}.csv', header=None)
        model.columns = ['Algorithm', 'Fold', 'Best parameters', 'MSE', 'MAE', 'R2', 'Pearson r']
        stack_folds_all_dti.append(model)

# Concatenate once, after all files are read (not on every loop iteration).
# ignore_index=True gives each row its own label instead of the repeated 0
# inherited from the one-row input frames.
stack_all_dti = pd.concat(stack_folds_all_dti, ignore_index=True)

# 'Pearson r' is parsed as text of the form "PearsonRResult(statistic=<r>, pvalue=<p>)":
# strip the wrapper, then split the remaining "<r>, <p>" into two numeric columns.
stack_all_dti['Pearson r'] = stack_all_dti['Pearson r'].astype(str).str.replace(r'PearsonRResult\(statistic=|pvalue=|\)', '', regex=True)
stack_all_dti[['Pearson r', 'p-value']] = stack_all_dti['Pearson r'].str.split(',', expand=True).astype(float).round(decimals=3)

# Remove quotes and braces from the parameter text. regex=False makes these literal
# replacements on every pandas version (the default for `regex` changed in pandas 2.0).
stack_all_dti['Best parameters'] = (
    stack_all_dti['Best parameters']
    .str.replace("'", "", regex=False)
    .str.replace("{", "", regex=False)
    .str.replace("}", "", regex=False)
)

stack_all_dti.round(3).to_csv(f'/PLS/brain/stacking/DTI_All_stacked_five_folds.csv', index=False)
stack_all_dti.round(3)

Unnamed: 0,Algorithm,Fold,Best parameters,MSE,MAE,R2,Pearson r,p-value
0,svr,0,"C: 12, gamma: 0.003, kernel: rbf",0.934,0.764,0.043,0.277,0.0
0,svr,1,"C: 50, gamma: 0.0006, kernel: rbf",0.991,0.795,-0.011,0.165,0.0
0,svr,2,"C: 50, gamma: 0.001, kernel: rbf",1.023,0.811,0.011,0.173,0.0
0,svr,3,"C: 12, gamma: 0.003, kernel: rbf",0.883,0.741,0.082,0.327,0.0
0,svr,4,"C: 15, gamma: 0.003, kernel: rbf",0.971,0.776,0.056,0.294,0.0
0,eNet,0,"alpha: 0.0001106848488549412, l1_ratio: 1.0, m...",0.939,0.768,0.038,0.26,0.0
0,eNet,1,"alpha: 4.010572880855496e-05, l1_ratio: 1.0, m...",0.995,0.797,-0.015,0.153,0.0
0,eNet,2,"alpha: 1e-06, l1_ratio: 1.0, max_iter: 1000",1.031,0.815,0.004,0.157,0.0
0,eNet,3,"alpha: 2.903526884977814e-05, l1_ratio: 1.0, m...",0.897,0.748,0.067,0.302,0.0
0,eNet,4,"alpha: 2.3052893870517107e-05, l1_ratio: 1.0, ...",0.988,0.783,0.04,0.262,0.0


In [68]:
# Average each metric per stacking algorithm over all rows of the dwMRI results table
stack_all_dti_mean_dti = (
    stack_all_dti[['Algorithm', 'MSE', 'MAE', 'R2', 'Pearson r']]
    .groupby(['Algorithm'])
    .mean()
    .round(3)
    .reset_index()
)
# Show the algorithms ordered by mean Pearson r, highest first
stack_all_dti_mean_dti.sort_values(by='Pearson r', ascending=False)

Unnamed: 0,Algorithm,MSE,MAE,R2,Pearson r
1,rf,0.924,0.764,0.073,0.265
3,xgb,0.936,0.768,0.061,0.26
2,svr,0.961,0.777,0.036,0.247
0,eNet,0.97,0.782,0.027,0.227


# Stack rsMRI: Parcellations (full correlation) + IDPs (tangent + amplitudes)

## RS IDP best + FullCorr atlases matched to DTI

- aparc_Tian_s1
- aparc_2009_Tian_s1
- Glasser_Tian_s1
- Glasser_Tian_s4
- Schaefer_7n200_Tian_s1
- Schaefer_7n500_Tian_s4

In [4]:
# First-level rsMRI feature sets whose g predictions are stacked in this section.

# RS TS - full correlation matrices, one per parcellation
_rs_full_corr_sets = [
    f'{parcellation}_full_correlation'
    for parcellation in (
        'aparc_s1',
        'aparc_2009_s1',
        'glasser_s1',
        'glasser_s4',
        'Schaefer7n200p_s1',
        'Schaefer7n500p_s4',
    )
]

# RS IDP - amplitudes and tangent matrices, each in a 21 and a 55 variant
_rs_idp_sets = [
    f'{idp}_{size}'
    for idp in ('amplitudes', 'tangent_matrices')
    for size in (21, 55)
]

rs_idp_rs_best_metrics = _rs_full_corr_sets + _rs_idp_sets

In [5]:
# Sanity check: number of rsMRI feature sets that will be stacked
len(rs_idp_rs_best_metrics)

10

In [None]:
# Merge predicted g-factors
# For every fold: load each rsMRI feature set's first-level g predictions, inner-join
# them on `eid` into one table per split, and save each table twice - as merged, and
# restricted to the rows whose `eid` also appears in the fold's g-factor file.
# Missing-value counts are printed for every loaded and every g-matched table.
folds = ["0", "1", "2", "3", "4"]
warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:

    print(f'Started {fold}', flush=True)
    all_modalities_train, all_modalities_test = [], []

    for modality in rs_idp_rs_best_metrics:

        # Name the prediction column after its feature set so it stays identifiable after merging
        g_train_pred_level1 = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_train_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted train': f'{modality}'})
        )
        print('g TRAIN shape BEFORE stacking:', g_train_pred_level1.shape)
        print(g_train_pred_level1.isna().sum().sort_values(ascending=False))

        g_test_pred_level1 = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_test_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted test': f'{modality}'})
        )
        print('g TEST shape BEFORE stacking:', g_test_pred_level1.shape)
        print(g_test_pred_level1.isna().sum().sort_values(ascending=False))

        all_modalities_train.append(g_train_pred_level1)
        all_modalities_test.append(g_test_pred_level1)

    # --- Train split: start from the first table and join the remaining ones one by one ---
    features_train_level1 = all_modalities_train[0]
    for next_train in all_modalities_train[1:]:
        features_train_level1 = pd.merge(features_train_level1, next_train, on='eid', how='inner')
    print('g TRAIN shape AFTER stacking:', features_train_level1.shape)

    features_train_level1.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/rs_idp_ts_best_metrics/features_train_level1_fold_{fold}.csv', index=False)
    # The g file is used only to select rows; its `g` column is dropped again
    g_train = pd.read_csv(f'/PLS/g_factor/g_train_with_id_fold_{fold}.csv')
    features_train_level1_g_matched = features_train_level1.merge(g_train, on='eid').drop(columns='g')
    features_train_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/rs_idp_ts_best_metrics/features_train_level1_g_matched_fold_{fold}.csv', index=False)
    print(features_train_level1_g_matched.isna().sum().sort_values(ascending=False))

    # --- Test split: same procedure ---
    features_test_level1 = all_modalities_test[0]
    for next_test in all_modalities_test[1:]:
        features_test_level1 = pd.merge(features_test_level1, next_test, on='eid', how='inner')
    print('g TEST shape AFTER stacking:', features_test_level1.shape)

    features_test_level1.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/rs_idp_ts_best_metrics/features_test_level1_fold_{fold}.csv', index=False)
    g_test = pd.read_csv(f'/PLS/g_factor/g_test_with_id_fold_{fold}.csv')
    features_test_level1_g_matched = features_test_level1.merge(g_test, on='eid').drop(columns='g')
    features_test_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/rs_idp_ts_best_metrics/features_test_level1_g_matched_fold_{fold}.csv', index=False)
    print(features_test_level1_g_matched.isna().sum().sort_values(ascending=False))

In [None]:
# Display results for all algorithms and folds
# Reads one headerless result file per (algorithm, fold), stacks them into a single
# table, tidies the text columns, saves the table and displays it.
# NOTE: the name `stack_all` is reassigned by the result cells of later sections too,
# so any cell that reads it sees whichever of those cells ran last.
algorithms = ['xgb', 'eNet', 'rf', 'svr']
folds = [ "0", "1", "2", "3", "4"]

stack_folds_all = []
for algorithm in algorithms:
    for fold in folds:
        # The result files carry no header row, so column names are assigned here
        model = pd.read_csv(f'/PLS/brain/stacking/result/RS_IDP_Timeseries_best_metrics_{algorithm}_stacked_result_fold_{fold}.csv', header=None)
        model.columns = ['Algorithm', 'Fold', 'Best parameters', 'MSE', 'MAE', 'R2', 'Pearson r']
        stack_folds_all.append(model)

# Concatenate once, after all files are read (not on every loop iteration).
# ignore_index=True gives each row its own label instead of the repeated 0
# inherited from the one-row input frames.
stack_all = pd.concat(stack_folds_all, ignore_index=True)

# 'Pearson r' is parsed as text of the form "PearsonRResult(statistic=<r>, pvalue=<p>)":
# strip the wrapper, then split the remaining "<r>, <p>" into two numeric columns.
stack_all['Pearson r'] = stack_all['Pearson r'].astype(str).str.replace(r'PearsonRResult\(statistic=|pvalue=|\)', '', regex=True)
stack_all[['Pearson r', 'p-value']] = stack_all['Pearson r'].str.split(',', expand=True).astype(float).round(decimals=3)

# Remove quotes and braces from the parameter text. regex=False makes these literal
# replacements on every pandas version (the default for `regex` changed in pandas 2.0).
stack_all['Best parameters'] = (
    stack_all['Best parameters']
    .str.replace("'", "", regex=False)
    .str.replace("{", "", regex=False)
    .str.replace("}", "", regex=False)
)

stack_all.round(3).to_csv(f'/PLS/brain/stacking/RS_IDP_Timeseries_best_metrics_stacked_five_folds.csv', index=False)
stack_all.round(3)

Unnamed: 0,Algorithm,Fold,Best parameters,MSE,MAE,R2,Pearson r,p-value
0,xgb,0,"alpha: 0.5, booster: gbtree, eta: 0.1, lambda:...",0.839,0.727,0.142,0.385,0.0
0,xgb,1,"alpha: 1, booster: gbtree, eta: 0.1, lambda: 0...",0.921,0.764,0.062,0.266,0.0
0,xgb,2,"alpha: 0.5, booster: gbtree, eta: 0.1, lambda:...",0.982,0.792,0.052,0.248,0.0
0,xgb,3,"alpha: 1, booster: gbtree, eta: 0.1, lambda: 0...",0.836,0.725,0.128,0.367,0.0
0,xgb,4,"alpha: 0.5, booster: gbtree, eta: 0.1, lambda:...",0.897,0.749,0.125,0.362,0.0
0,eNet,0,"alpha: 0.0005565330778427646, l1_ratio: 0.9898...",0.837,0.727,0.144,0.387,0.0
0,eNet,1,"alpha: 0.0010618229410993836, l1_ratio: 0.4141...",0.924,0.766,0.059,0.264,0.0
0,eNet,2,"alpha: 0.0027982968756551157, l1_ratio: 0.0, m...",0.983,0.792,0.051,0.251,0.0
0,eNet,3,"alpha: 0.0011644806183726856, l1_ratio: 0.0, m...",0.843,0.726,0.12,0.361,0.0
0,eNet,4,"alpha: 0.0012194734366967382, l1_ratio: 0.0, m...",0.896,0.748,0.127,0.363,0.0


In [15]:
# Average each metric per stacking algorithm over all rows of `stack_all`.
# NOTE: `stack_all` is reassigned by several result cells in this notebook; this cell
# summarises whichever one ran last, so run it right after the rsMRI results cell.
stack_all_mean_rs = (
    stack_all[['Algorithm', 'MSE', 'MAE', 'R2', 'Pearson r']]
    .groupby(['Algorithm'])
    .mean()
    .round(3)
    .reset_index()
)
# Show the algorithms ordered by mean R2, highest first
stack_all_mean_rs.sort_values(by='R2', ascending=False)

Unnamed: 0,Algorithm,MSE,MAE,R2,Pearson r
1,rf,0.891,0.75,0.105,0.325
3,xgb,0.895,0.751,0.102,0.326
2,svr,0.896,0.751,0.101,0.327
0,eNet,0.897,0.752,0.1,0.325


# Stack sMRI: T1w + whole-brain T1w/T2w

In [6]:
# First-level sMRI feature sets whose g predictions are stacked in this section:
# every `struct_*` set plus the whole-brain T1w/T2w set.
_struct_suffixes = (
    'fast',
    'sub_first',
    'aseg_mean_intensity',
    'aseg_volume',
    'ba_exvivo_area',
    'ba_exvivo_mean_thickness',
    'ba_exvivo_volume',
    'a2009s_area',
    'a2009s_mean_thickness',
    'a2009s_volume',
    'dkt_area',
    'dkt_mean_thickness',
    'dkt_volume',
    'desikan_gw',
    'desikan_pial',
    'desikan_white_area',
    'desikan_white_mean_thickness',
    'desikan_white_volume',
    'subsegmentation',
)

modalities_struct = ['struct_' + suffix for suffix in _struct_suffixes] + ['T1_T2_whole_brain']

In [7]:
# Sanity check: number of sMRI feature sets that will be stacked
len(modalities_struct)

20

In [None]:
# Merge predicted g-factors
# For every fold: load each sMRI feature set's first-level g predictions, inner-join
# them on `eid` into one table per split, and save each table twice - as merged, and
# restricted to the rows whose `eid` also appears in the fold's g-factor file.
# Missing-value counts are printed for every loaded and every g-matched table.
folds = ["0", "1", "2", "3", "4"]
warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:

    print(f'Started {fold}', flush=True)
    all_modalities_train, all_modalities_test = [], []

    for modality in modalities_struct:

        # Name the prediction column after its feature set so it stays identifiable after merging
        g_train_pred_level1 = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_train_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted train': f'{modality}'})
        )
        print('g TRAIN shape BEFORE stacking:', g_train_pred_level1.shape)
        print(g_train_pred_level1.isna().sum().sort_values(ascending=False))

        g_test_pred_level1 = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_test_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted test': f'{modality}'})
        )
        print('g TEST shape BEFORE stacking:', g_test_pred_level1.shape)
        print(g_test_pred_level1.isna().sum().sort_values(ascending=False))

        all_modalities_train.append(g_train_pred_level1)
        all_modalities_test.append(g_test_pred_level1)

    # --- Train split: start from the first table and join the remaining ones one by one ---
    features_train_level1 = all_modalities_train[0]
    for next_train in all_modalities_train[1:]:
        features_train_level1 = pd.merge(features_train_level1, next_train, on='eid', how='inner')
    print('g TRAIN shape AFTER stacking:', features_train_level1.shape)

    features_train_level1.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/t1_t2_struct/features_train_level1_fold_{fold}.csv', index=False)
    # The g file is used only to select rows; its `g` column is dropped again
    g_train = pd.read_csv(f'/PLS/g_factor/g_train_with_id_fold_{fold}.csv')
    features_train_level1_g_matched = features_train_level1.merge(g_train, on='eid').drop(columns='g')
    features_train_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/t1_t2_struct/features_train_level1_g_matched_fold_{fold}.csv', index=False)
    print(features_train_level1_g_matched.isna().sum().sort_values(ascending=False))

    # --- Test split: same procedure ---
    features_test_level1 = all_modalities_test[0]
    for next_test in all_modalities_test[1:]:
        features_test_level1 = pd.merge(features_test_level1, next_test, on='eid', how='inner')
    print('g TEST shape AFTER stacking:', features_test_level1.shape)

    features_test_level1.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/t1_t2_struct/features_test_level1_fold_{fold}.csv', index=False)
    g_test = pd.read_csv(f'/PLS/g_factor/g_test_with_id_fold_{fold}.csv')
    features_test_level1_g_matched = features_test_level1.merge(g_test, on='eid').drop(columns='g')
    features_test_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/t1_t2_struct/features_test_level1_g_matched_fold_{fold}.csv', index=False)
    print(features_test_level1_g_matched.isna().sum().sort_values(ascending=False))

In [None]:
# Display results for all algorithms and folds
# Reads one headerless result file per (algorithm, fold), stacks them into a single
# table, tidies the text columns, saves the table and displays it.
# NOTE: the name `stack_all` is reassigned by the result cells of other sections too,
# so any cell that reads it sees whichever of those cells ran last.
algorithms = ['xgb', 'eNet', 'rf', 'svr']
folds = [ "0", "1", "2", "3", "4"]

stack_folds_all = []
for algorithm in algorithms:
    for fold in folds:
        # The result files carry no header row, so column names are assigned here
        model = pd.read_csv(f'/PLS/brain/stacking/result/T1_T2_whole_brain_{algorithm}_stacked_result_fold_{fold}.csv', header=None)
        model.columns = ['Algorithm', 'Fold', 'Best parameters', 'MSE', 'MAE', 'R2', 'Pearson r']
        stack_folds_all.append(model)

# Concatenate once, after all files are read (not on every loop iteration).
# ignore_index=True gives each row its own label instead of the repeated 0
# inherited from the one-row input frames.
stack_all = pd.concat(stack_folds_all, ignore_index=True)

# 'Pearson r' is parsed as text of the form "PearsonRResult(statistic=<r>, pvalue=<p>)":
# strip the wrapper, then split the remaining "<r>, <p>" into two numeric columns.
stack_all['Pearson r'] = stack_all['Pearson r'].astype(str).str.replace(r'PearsonRResult\(statistic=|pvalue=|\)', '', regex=True)
stack_all[['Pearson r', 'p-value']] = stack_all['Pearson r'].str.split(',', expand=True).astype(float).round(decimals=3)

# Remove quotes and braces from the parameter text. regex=False makes these literal
# replacements on every pandas version (the default for `regex` changed in pandas 2.0).
stack_all['Best parameters'] = (
    stack_all['Best parameters']
    .str.replace("'", "", regex=False)
    .str.replace("{", "", regex=False)
    .str.replace("}", "", regex=False)
)

stack_all.round(3).to_csv(f'/PLS/brain/stacking/T1_T2_whole_brain_stacked_five_folds.csv', index=False)
stack_all.round(3)

Unnamed: 0,Algorithm,Fold,Best parameters,MSE,MAE,R2,Pearson r,p-value
0,xgb,0,"alpha: 1, booster: gbtree, eta: 0.1, lambda: 1...",0.854,0.738,0.13,0.361,0.0
0,xgb,1,"alpha: 1, booster: gbtree, eta: 0.1, lambda: 1...",0.942,0.775,0.039,0.198,0.0
0,xgb,2,"alpha: 0.5, booster: gbtree, eta: 0.1, lambda:...",0.998,0.802,0.03,0.175,0.0
0,xgb,3,"alpha: 0, booster: gbtree, eta: 0.1, lambda: 1...",0.865,0.732,0.124,0.353,0.0
0,xgb,4,"alpha: 0, booster: gbtree, eta: 0.1, lambda: 1...",0.85,0.724,0.155,0.394,0.0
0,eNet,0,"alpha: 0.002436540009125466, l1_ratio: 1.0, ma...",0.86,0.741,0.124,0.353,0.0
0,eNet,1,"alpha: 0.0014005305453932196, l1_ratio: 0.9595...",0.944,0.777,0.037,0.194,0.0
0,eNet,2,"alpha: 0.0027982968756551157, l1_ratio: 0.3232...",0.999,0.801,0.029,0.174,0.0
0,eNet,3,"alpha: 0.012251135766041178, l1_ratio: 0.0, ma...",0.862,0.731,0.127,0.356,0.0
0,eNet,4,"alpha: 0.011171106505048241, l1_ratio: 0.0, ma...",0.849,0.724,0.155,0.395,0.0


In [64]:
# Average each metric per stacking algorithm over all rows of `stack_all`.
# NOTE: `stack_all` is reassigned by several result cells in this notebook; this cell
# summarises whichever one ran last, so run it right after the sMRI results cell.
stack_all_mean_t1t2 = (
    stack_all[['Algorithm', 'MSE', 'MAE', 'R2', 'Pearson r']]
    .groupby(['Algorithm'])
    .mean()
    .round(3)
    .reset_index()
)
# Show the algorithms ordered by mean R2, highest first
stack_all_mean_t1t2.sort_values(by='R2', ascending=False)

Unnamed: 0,Algorithm,MSE,MAE,R2,Pearson r
2,svr,0.902,0.753,0.095,0.298
3,xgb,0.902,0.754,0.095,0.296
0,eNet,0.903,0.755,0.094,0.294
1,rf,0.904,0.755,0.093,0.293


# Stack all modalities

In [20]:
# Every first-level feature set entering the all-modality stack. The groups are
# concatenated in the same order as before: rsMRI full correlation, rsMRI IDPs,
# dwMRI IDPs, T1 / T1-T2, dwMRI connectomes.

# Full corr
_best_rs_full_corr = [
    f'{parcellation}_full_correlation'
    for parcellation in (
        'aparc_s1',
        'aparc_2009_s1',
        'glasser_s1',
        'glasser_s4',
        'Schaefer7n200p_s1',
        'Schaefer7n500p_s4',
    )
]

# RS IDP
_best_rs_idp = [
    f'{idp}_{size}'
    for idp in ('amplitudes', 'tangent_matrices')
    for size in (21, 55)
]

# DTI IDP
_best_dti_idp = [
    f'{metric}_{method}'
    for metric in ('fa', 'icvf', 'isovf', 'l1', 'l2', 'l3', 'md', 'mo', 'od')
    for method in ('prob', 'tbss')
]

# T1
_best_t1 = [
    'struct_' + suffix
    for suffix in (
        'fast', 'sub_first',
        'aseg_mean_intensity', 'aseg_volume',
        'ba_exvivo_area', 'ba_exvivo_mean_thickness', 'ba_exvivo_volume',
        'a2009s_area', 'a2009s_mean_thickness', 'a2009s_volume',
        'dkt_area', 'dkt_mean_thickness', 'dkt_volume',
        'desikan_gw', 'desikan_pial', 'desikan_white_area', 'desikan_white_mean_thickness', 'desikan_white_volume',
        'subsegmentation',
    )
] + ['T1_T2_whole_brain']

# DTI Struct
_best_dti_connectome = [
    f'{prefix}_connectome_{measure}'
    for prefix in range(31020, 31026)
    for measure in ('fa', 'mean_length', 'sift2', 'streamline_count')
]
# Only the very last entry deviates from the naming pattern: it ends in `_10M`.
_best_dti_connectome[-1] += '_10M'

mod_best = _best_rs_full_corr + _best_rs_idp + _best_dti_idp + _best_t1 + _best_dti_connectome

In [None]:
# Merge predicted g-factors
# For every fold: load the first-level g predictions of every feature set in `mod_best`,
# inner-join them on `eid` into one table per split, and save each table twice - as
# merged, and restricted to the rows whose `eid` also appears in the fold's g-factor file.
folds = ["0", "1", "2", "3", "4"]
warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:

    print(f'Started {fold}', flush=True)
    all_modalities_train, all_modalities_test = [], []

    for modality in mod_best:

        # Name the prediction column after its feature set so it stays identifiable after merging
        g_train_pred_level1 = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_train_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted train': f'{modality}'})
        )
        print('g TRAIN shape BEFORE stacking:', g_train_pred_level1.shape)

        g_test_pred_level1 = (
            pd.read_csv(f'/PLS/brain/stacking/g_pred_first_level/{modality}_g_pred_test_id_fold_{fold}.csv')
            .drop(columns='Unnamed: 0')
            .rename(columns={'g predicted test': f'{modality}'})
        )
        print('g TEST shape BEFORE stacking:', g_test_pred_level1.shape)

        all_modalities_train.append(g_train_pred_level1)
        all_modalities_test.append(g_test_pred_level1)

    # --- Train split: start from the first table and join the remaining ones one by one ---
    features_train_level1 = all_modalities_train[0]
    for next_train in all_modalities_train[1:]:
        features_train_level1 = pd.merge(features_train_level1, next_train, on='eid', how='inner')
    print('g TRAIN shape AFTER stacking:', features_train_level1.shape)

    # NOTE(review): only this one file is cast to float before writing (so `eid` is
    # written as a float too); the g-matched and test files below, and the other
    # sections' merge cells, write without the cast - confirm this is intended.
    features_train_level1.astype(float).to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/all_modalities/features_train_level1_fold_{fold}.csv', index=False)
    # The g file is used only to select rows; its `g` column is dropped again
    g_train = pd.read_csv(f'/PLS/g_factor/g_train_with_id_fold_{fold}.csv')
    features_train_level1_g_matched = features_train_level1.merge(g_train, on='eid').drop(columns='g')
    features_train_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_train_level1_stacked/all_modalities/features_train_level1_g_matched_fold_{fold}.csv', index=False)

    # --- Test split: same procedure ---
    features_test_level1 = all_modalities_test[0]
    for next_test in all_modalities_test[1:]:
        features_test_level1 = pd.merge(features_test_level1, next_test, on='eid', how='inner')
    print('g TEST shape AFTER stacking:', features_test_level1.shape)

    features_test_level1.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/all_modalities/features_test_level1_fold_{fold}.csv', index=False)
    g_test = pd.read_csv(f'/PLS/g_factor/g_test_with_id_fold_{fold}.csv')
    features_test_level1_g_matched = features_test_level1.merge(g_test, on='eid').drop(columns='g')
    features_test_level1_g_matched.to_csv(f'/PLS/brain/stacking/features_test_level1_stacked/all_modalities/features_test_level1_g_matched_fold_{fold}.csv', index=False)

In [None]:
# Display results for all algorithms and folds
# Reads one headerless result file per (algorithm, fold), stacks them into a single
# table, tidies the text columns, saves the table and displays it.
# NOTE: the name `stack_all` is reassigned by the result cells of earlier sections too,
# so any cell that reads it sees whichever of those cells ran last.
algorithms = ['eNet', 'xgb', 'rf', 'svr']
folds = [ "0", "1", "2", "3", "4"]

stack_folds_all = []
for algorithm in algorithms:
    for fold in folds:
        # The result files carry no header row, so column names are assigned here
        model = pd.read_csv(f'/PLS/brain/stacking/result/All_modalities_{algorithm}_stacked_result_fold_{fold}.csv', header=None)
        model.columns = ['Algorithm', 'Fold', 'Best parameters', 'MSE', 'MAE', 'R2', 'Pearson r']
        stack_folds_all.append(model)

# Concatenate once, after all files are read (not on every loop iteration).
# ignore_index=True gives each row its own label instead of the repeated 0
# inherited from the one-row input frames.
stack_all = pd.concat(stack_folds_all, ignore_index=True)

# 'Pearson r' is parsed as text of the form "PearsonRResult(statistic=<r>, pvalue=<p>)":
# strip the wrapper, then split the remaining "<r>, <p>" into two numeric columns.
stack_all['Pearson r'] = stack_all['Pearson r'].astype(str).str.replace(r'PearsonRResult\(statistic=|pvalue=|\)', '', regex=True)
stack_all[['Pearson r', 'p-value']] = stack_all['Pearson r'].str.split(',', expand=True).astype(float).round(decimals=3)

# Remove quotes and braces from the parameter text (each character once; the repeated
# "{" replacement was redundant). regex=False makes these literal replacements on
# every pandas version (the default for `regex` changed in pandas 2.0).
stack_all['Best parameters'] = (
    stack_all['Best parameters']
    .str.replace("'", "", regex=False)
    .str.replace("{", "", regex=False)
    .str.replace("}", "", regex=False)
)

stack_all.round(3).to_csv(f'/PLS/brain/stacking/All_modalities_stacked_five_folds.csv', index=False)
stack_all.round(3)

Unnamed: 0,Algorithm,Fold,Best parameters,MSE,MAE,R2,Pearson r,p-value
0,eNet,0,"alpha: 6.075291689016077e-05, l1_ratio: 1.0, m...",0.802,0.71,0.182,0.441,0.0
0,eNet,1,"alpha: 0.00011591196185988857, l1_ratio: 1.0, ...",0.939,0.77,0.041,0.266,0.0
0,eNet,2,"alpha: 2.3052893870517107e-05, l1_ratio: 1.0, ...",0.995,0.797,0.042,0.26,0.0
0,eNet,3,"alpha: 4.199972946715306e-05, l1_ratio: 1.0, m...",0.796,0.704,0.184,0.442,0.0
0,eNet,4,"alpha: 1e-06, l1_ratio: 0.0, max_iter: 1000",0.8,0.71,0.206,0.463,0.0
0,xgb,0,"alpha: 1, booster: gbtree, eta: 0.1, lambda: 0...",0.763,0.693,0.222,0.475,0.0
0,xgb,1,"alpha: 0.5, booster: gbtree, eta: 0.2, lambda:...",0.906,0.757,0.075,0.295,0.0
0,xgb,2,"alpha: 0.5, booster: gbtree, eta: 0.2, lambda:...",0.974,0.789,0.062,0.274,0.0
0,xgb,3,"alpha: 1, booster: gbtree, eta: 0.2, lambda: 0...",0.783,0.699,0.197,0.452,0.0
0,xgb,4,"alpha: 0.5, booster: gbtree, eta: 0.1, lambda:...",0.765,0.693,0.24,0.492,0.0


In [66]:
# Average each metric per stacking algorithm over all rows of `stack_all`.
# NOTE: `stack_all` is reassigned by several result cells in this notebook; this cell
# summarises whichever one ran last, so run it right after the all-modalities results cell.
stack_all_mean = (
    stack_all[['Algorithm', 'MSE', 'MAE', 'R2', 'Pearson r']]
    .groupby(['Algorithm'])
    .mean()
    .round(3)
    .reset_index()
)
# Show the algorithms ordered by mean R2, highest first
stack_all_mean.sort_values(by='R2', ascending=False)

Unnamed: 0,Algorithm,MSE,MAE,R2,Pearson r
3,xgb,0.838,0.726,0.159,0.398
1,rf,0.845,0.729,0.152,0.383
2,svr,0.859,0.734,0.139,0.383
0,eNet,0.866,0.738,0.131,0.374
