In [None]:
import csv
import os
import random
import pickle
import gc
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import sklearn
import warnings
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import ElasticNet
from sklearn.exceptions import ConvergenceWarning

# Select the best rsMRI modality based on cross-validated performance in the training set

In [None]:
# Candidate resting-state feature sets to compare, grouped by how the
# connectivity features were derived. Each string is interpolated verbatim
# into the model file name when the saved models are loaded, so spelling and
# letter case matter.
# NOTE(review): "tian" vs "Tian" casing differs between the Schaefer entries;
# presumably this mirrors the file names on disk - confirm before normalising.
rs_modalities = (
    # RS TS - full correlation
    [
        'aparc_Tian_s1_full_correlation',
        'aparc_2009_Tian_s1_full_correlation',
        'glasser_Tian_s1_full_correlation',
        'glasser_Tian_s4_full_correlation',
        'Schaefer7n200p_tian_s1_full_correlation',
        'Schaefer7n500p_Tian_s4_full_correlation',
    ]
    # RS TS - partial correlation
    + [
        'aparc_Tian_s1_partial_correlation',
        'aparc_2009_Tian_s1_partial_correlation',
        'glasser_Tian_s1_partial_correlation',
        'glasser_Tian_s4_partial_correlation',
        'Schaefer7n200p_tian_s1_partial_correlation',
        'Schaefer7n500p_Tian_s4_partial_correlation',
    ]
    # RS TS - tangent
    + [
        'aparc_Tian_s1',
        'aparc_2009_Tian_s1',
        'glasser_Tian_s1',
        'glasser_Tian_s4',
        'Schaefer7n200p_tian_s1',
        'Schaefer7n500p_tian_s4',
    ]
    # RS IDP
    + [
        'full_correlation_21',
        'full_correlation_55',
        'partial_correlation_21',
        'partial_correlation_55',
        'tangent_matrices_21',
        'tangent_matrices_55',
    ]
)

In [None]:
# Build one summary row per saved (fold, modality) model.
#
# NOTE: `fold`, `modality` and `model` remain bound to the values of the last
# iteration once the loop finishes; the plotting cell further down reads
# `model`, so these loop variable names must not be changed.
RS_MODEL_DIR = '/PLS/brain/rs/parcellations'  # root folder; holds one fold_N sub-folder per fold

performance_list = []
folds = ["1", "2", "3", "4"]
for fold in folds:
    for modality in rs_modalities:
        model_path = f'{RS_MODEL_DIR}/fold_{fold}/{modality}_model_fold_{fold}.pkl'
        # SECURITY: unpickling can execute arbitrary code embedded in the file.
        # Only load model files written by a trusted pipeline.
        with open(model_path, "rb") as mo:
            model = pickle.load(mo)  # presumably a fitted GridSearchCV - TODO confirm

        # `mean_test_score` is the per-candidate score averaged over the
        # held-out splits of the search's internal cross-validation. It is
        # computed on the data the search was fitted on (the training set),
        # not on a separate test set.
        cv_scores = model.cv_results_['mean_test_score']

        perf = {
            'Fold': fold,
            'Modality': modality,
            # Full best-parameter dict as stored on the search object.
            'n_components': model.best_params_,
            'Mean_test_scores': cv_scores,
            # nanmax instead of .max(): a single NaN entry (e.g. a candidate
            # whose fit failed) would otherwise turn the best score into NaN.
            'Best_score': np.nanmax(cv_scores),
        }
        performance_list.append(perf)

performance = pd.DataFrame(performance_list)

In [None]:
# Lift pandas' row-display cap only for the duration of this block so that
# every row of the summary table is rendered; the option reverts afterwards.
show_all_rows = pd.option_context('display.max_rows', None)
with show_all_rows:
    display(performance)

In [None]:
# Keep only the columns needed for the ranking. Scores are negative, so the
# less negative a value is, the better.
performance_mean = performance.loc[:, ['Modality', 'Best_score']]

# Average the best score of each modality over its rows (one per fold), round
# to three decimals and list the best-scoring modality first.
(
    performance_mean
    .groupby(['Modality'])
    .agg('mean')
    .round(3)
    .sort_values(by='Best_score', ascending=False)
)

In [None]:
# Performance plot for one of the models.
# `model` is not chosen in this cell: it is whatever object the model-loading
# loop assigned last, so run that loop first.
n_components = model.cv_results_['param_n_components'].data  # .data unwraps the masked array
mean_test_scores = model.cv_results_['mean_test_score']

# Explicit figure/axes instead of the implicit pyplot state machine, plus a
# title so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(n_components, mean_test_scores, marker='o')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Mean Test Score')
ax.set_title('Cross-validated score vs. number of components')
plt.show()  # render the figure without echoing a Text(...) repr as cell output