In [None]:
import csv
import os
import random
import pickle
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import sklearn
import sys
import warnings
import skbold
from skbold.preproc import ConfoundRegressor
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


## Define ConfoundRegressor
def confound_regressor(features_train, features_test, confounds_train, confounds_test):
    """Standardise features and confounds, then pass the features through skbold's ConfoundRegressor.

    Each StandardScaler is fitted on the training frame only and then applied
    to the matching test frame. The scaled train and test rows are stacked
    (train first) and handed to ``ConfoundRegressor`` as its ``X`` / ``confound``
    arguments; the regressor is then fitted on the scaled training features and
    applied to the scaled test features.

    Parameters
    ----------
    features_train, features_test : objects with a ``.columns`` attribute
        Feature tables for the train and test participants.
    confounds_train, confounds_test : objects with a ``.columns`` attribute
        Confound tables for the same participants.
        NOTE(review): rows are presumably in the same participant order as the
        feature tables - confirm at the call site.

    Returns
    -------
    tuple of six items
        ``(features_train_corrected, features_test_corrected,
        features_train_scaled, features_test_scaled,
        scaler_features, scaler_confounds)``
    """
    def _standardise(train_frame, test_frame):
        # Fit on train only, then reuse the fitted scaler for test.
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(train_frame)
        test_scaled = scaler.transform(test_frame)
        return scaler, train_scaled, test_scaled

    def _stack(train_values, test_values, train_frame, test_frame):
        # Re-attach the column labels before concatenating, then return a plain array.
        labelled_train = pd.DataFrame(train_values, columns=train_frame.columns)
        labelled_test = pd.DataFrame(test_values, columns=test_frame.columns)
        return np.array(pd.concat([labelled_train, labelled_test], axis=0))

    scaler_features, features_train_scaled, features_test_scaled = _standardise(
        features_train, features_test
    )
    scaler_confounds, confounds_train_scaled, confounds_test_scaled = _standardise(
        confounds_train, confounds_test
    )

    # ConfoundRegressor is constructed with the FULL (train + test) arrays
    full_features = _stack(features_train_scaled, features_test_scaled, features_train, features_test)
    full_confounds = _stack(confounds_train_scaled, confounds_test_scaled, confounds_train, confounds_test)
    cfr = ConfoundRegressor(confound=full_confounds, X=full_features)

    features_train_corrected = cfr.fit_transform(features_train_scaled)
    features_test_corrected = cfr.transform(features_test_scaled)

    return (
        features_train_corrected,
        features_test_corrected,
        features_train_scaled,
        features_test_scaled,
        scaler_features,
        scaler_confounds,
    )

## Define modalities
# Every available set of input files, keyed by atlas. Only ONE set is analysed
# per run; switch `_ACTIVE_ATLAS` to change it.
_MODALITIES_BY_ATLAS = {
    'aparc_2009': [
        'aparc_2009_Tian_s1_arrays_full_correlation',
        'aparc_2009_Tian_s2_arrays_full_correlation',
        'aparc_2009_Tian_s3_arrays_full_correlation',
        'aparc_2009_Tian_s4_arrays_full_correlation',
    ],
    'glasser': [
        'glasser_Tian_s1_arrays_full_correlation',
        'glasser_Tian_s2_arrays_full_correlation',
        'glasser_Tian_s3_arrays_full_correlation',
        'glasser_Tian_s4_arrays_full_correlation',
    ],
    'aparc': [
        'aparc_Tian_s1_arrays_full_correlation',
        'aparc_Tian_s2_arrays_full_correlation',
        'aparc_Tian_s3_arrays_full_correlation',
        'aparc_Tian_s4_arrays_full_correlation',
    ],
    'Schaefer7n200p': [
        'Schaefer7n200p_tian_s1_arrays_full_correlation_with_id',
        'Schaefer7n200p_tian_s2_arrays_full_correlation_with_id',
        'Schaefer7n200p_tian_s3_arrays_full_correlation_with_id',
        'Schaefer7n200p_tian_s4_arrays_full_correlation_with_id',
    ],
}

# The set that is actually used below.
_ACTIVE_ATLAS = 'Schaefer7n200p'
modalities = _MODALITIES_BY_ATLAS[_ACTIVE_ATLAS]

# Fold labels as they appear in the file names.
folds = [str(k) for k in range(5)]

###################################################### Preparatory steps
# The script handles exactly one (modality, fold) pair per invocation. The pair
# is selected by a single integer task index given as the first command-line
# argument:
#     fold  = index %  len(folds)
#     modal = index // len(folds)

warnings.simplefilter(action='ignore', category=FutureWarning)
seed = 42  # random_state for the cross-validation splitter used in the grid search
pls_result = {}

# Validate the task index up front so a missing or bad argument fails with a
# clear message instead of a NameError / IndexError further down.
n_tasks = len(modalities) * len(folds)
if len(sys.argv) < 2:
    sys.exit(f'Usage: {sys.argv[0]} <task index 0..{n_tasks - 1}>')
try:
    task_index = int(sys.argv[1])
except ValueError:
    sys.exit(f'Task index must be an integer, got {sys.argv[1]!r}')
if not 0 <= task_index < n_tasks:
    sys.exit(f'Task index {task_index} is outside the valid range 0..{n_tasks - 1}')

fold = task_index % len(folds)
modal = task_index // len(folds)

print(f'Started {modalities[modal]} fold {folds[fold]}', flush=True)
#####################################################

print('Reading confounds', flush=True)
confounds = pd.read_csv('/PLS/pls_rs/pls_idp/main/resting_files/rs_confounds.csv')

############## 1
print(f'Reading {modalities[modal]}', flush=True)

# The column pandas labels 'Unnamed: 0' is renamed to 'eid' so it can serve as the join key.
file = pd.read_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/resting_files/{modalities[modal]}.csv').rename(columns={'Unnamed: 0':'eid'})

# Match confounds to MRI
print(f'Matching full brain data to confounds in {modalities[modal]}', flush=True)

# Inner joins: only participants present in both tables survive, and rows keep
# the order of the LEFT table (here: the confounds file).
conf_to_brain_match = pd.merge(confounds, file['eid'], on='eid')
brain_to_conf_match = pd.merge(conf_to_brain_match['eid'], file, on='eid')

print('Reading train/test sets and g-factor', flush=True)
test_id = pd.read_csv(f'/PLS/pls_dti/fold_id/test_id_fold_{folds[fold]}.csv')
train_id = pd.read_csv(f'/PLS/pls_dti/fold_id/train_id_fold_{folds[fold]}.csv')

# Upload g-factor with ID
g_train_full = pd.read_csv(f'/PLS/pls_dti/g_factor/g_train_with_id_fold_{folds[fold]}.csv')
g_test_full = pd.read_csv(f'/PLS/pls_dti/g_factor/g_test_with_id_fold_{folds[fold]}.csv')


# Match brain data to cognitive data
print(f'Matching brain data to train/test confounds in {modalities[modal]} fold {folds[fold]}', flush=True)
# Each split is merged once; the feature matrix and the id column are both taken from that single result.
brain_train_matched = pd.merge(brain_to_conf_match, train_id, on='eid')
brain_test_matched = pd.merge(brain_to_conf_match, test_id, on='eid')
brain_train, brain_train_id = brain_train_matched.drop(columns=['eid']), brain_train_matched['eid']
brain_test, brain_test_id = brain_test_matched.drop(columns=['eid']), brain_test_matched['eid']

brain_train.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/suppl/{modalities[modal]}_train_fold_{folds[fold]}.csv', index=False)
brain_test.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/suppl/{modalities[modal]}_test_fold_{folds[fold]}.csv', index=False)
brain_train_id.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/suppl/{modalities[modal]}_train_id_fold_{folds[fold]}.csv', index=False)
brain_test_id.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/suppl/{modalities[modal]}_test_id_fold_{folds[fold]}.csv', index=False)

############## 2
# Match confounds to MRI
brain_conf_train = pd.merge(conf_to_brain_match, brain_train_id, on='eid').drop(columns=['eid'])
brain_conf_test = pd.merge(conf_to_brain_match, brain_test_id, on='eid').drop(columns=['eid'])
brain_conf_train.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/suppl/{modalities[modal]}_conf_train_fold_{folds[fold]}.csv', index=False)
brain_conf_test.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/suppl/{modalities[modal]}_conf_test_fold_{folds[fold]}.csv', index=False)

############## 3
print(f'Matching g-factor to {modalities[modal]} fold {folds[fold]}', flush=True)

# Match g-factor to the brain data.
# The brain ids are the LEFT operand on purpose: an inner merge keeps the row
# order of its left table, so the targets come out in the same participant
# order as brain_train / brain_test instead of in the g-factor file's order.
g_train_matched = pd.merge(brain_train_id, g_train_full, on='eid')
g_test_matched = pd.merge(brain_test_id, g_test_full, on='eid')

# Features and targets must describe the same participants, row for row.
if len(g_train_matched) != len(brain_train) or len(g_test_matched) != len(brain_test):
    raise ValueError(
        'g-factor rows do not match brain rows: '
        f'train {len(g_train_matched)} vs {len(brain_train)}, '
        f'test {len(g_test_matched)} vs {len(brain_test)}'
    )

g_train, g_train_id = g_train_matched.drop(columns=['eid']), g_train_matched['eid']
g_test, g_test_id = g_test_matched.drop(columns=['eid']), g_test_matched['eid']
g_train.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/suppl/g_train_{modalities[modal]}_matched_fold_{folds[fold]}.csv', index=False)
g_test.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/suppl/g_test_{modalities[modal]}_matched_fold_{folds[fold]}.csv', index=False)

############## 4
print(f'Applying ConfoundRegressor to {modalities[modal]} fold {folds[fold]}', flush=True)

# Apply ConfoundRegressor
(
    features_train_corr,
    features_test_corr,
    features_train_scaled,
    features_test_scaled,
    scaler_features,
    scaler_confounds,
) = confound_regressor(brain_train, brain_test, brain_conf_train, brain_conf_test)

# Confound-corrected and merely scaled matrices are written with the original column labels.
matrix_outputs = [
    (features_train_corr, brain_train.columns, f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/scaling/{modalities[modal]}_train_corr_{folds[fold]}.csv'),
    (features_test_corr, brain_test.columns, f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/scaling/{modalities[modal]}_test_corr_{folds[fold]}.csv'),
    (features_train_scaled, brain_train.columns, f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/scaling/{modalities[modal]}_train_scaled_{folds[fold]}.csv'),
    (features_test_scaled, brain_test.columns, f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/scaling/{modalities[modal]}_test_scaled_{folds[fold]}.csv'),
]
for matrix_values, matrix_columns, matrix_path in matrix_outputs:
    pd.DataFrame(matrix_values, columns=matrix_columns).to_csv(matrix_path, index=False)

# Persist both fitted scalers.
scaler_outputs = [
    (scaler_features, f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/scaling/scaler_features_{modalities[modal]}_fold_{folds[fold]}.pkl'),
    (scaler_confounds, f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/scaling/scaler_confounds_{modalities[modal]}_fold_{folds[fold]}.pkl'),
]
for fitted_scaler, scaler_path in scaler_outputs:
    with open(scaler_path, "wb") as f:
        pickle.dump(fitted_scaler, f)

# Initiate and run PLS
# Grid search over the number of PLS components (1..35), scored by negative
# mean absolute error on a shuffled 10-fold split seeded with `seed`.
parameters = {'n_components': range(1, 36)}
pls = PLSRegression()
inner_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
model = GridSearchCV(
    estimator=pls,
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    cv=inner_cv,
    verbose=4,
    n_jobs=25,
)


print(f'Fitting PLS to {modalities[modal]} fold {folds[fold]}', flush=True)

model.fit(features_train_corr, np.asarray(g_train))

# Report the full grid before the winner.
cv_results = model.cv_results_
print(f'Model parameters for fold {folds[fold]}:', cv_results['params'])
print(f'Mean test score for fold {folds[fold]}:', cv_results['mean_test_score'])
print(f'Rank test score for fold {folds[fold]}:', cv_results['rank_test_score'])
print(model)

print(f'Saving PLS model for {modalities[modal]} fold {folds[fold]}')
model_path = f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/models/pkl/{modalities[modal]}_model_fold_{folds[fold]}.pkl'
with open(model_path, "wb") as f:
    pickle.dump(model, f)

print(f'Best params in fold {folds[fold]} = ', model.best_params_)
print(f'Best score (neg_mean_absolute_error) in fold {folds[fold]} = ', model.best_score_)

# Predict the values
print(f'Predicting & saving g_test for {modalities[modal]} fold {folds[fold]}', flush=True)
g_pred_test = model.predict(np.array(features_test_corr))
test_pred_frame = pd.DataFrame(g_pred_test, columns=['g predicted test'])
test_pred_frame.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/g_pred/{modalities[modal]}_test_fold_{folds[fold]}.csv')

# Same predictions with the participant id in front. (`to_csv` with a path returns None.)
test_pred_with_id_frame = pd.concat([g_test_id.astype(int), test_pred_frame], axis=1)
g_pred_test_with_id = test_pred_with_id_frame.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/g_pred/{modalities[modal]}_g_pred_test_id_fold_{folds[fold]}.csv')


print(f'Predicting & saving g_train for {modalities[modal]} fold {folds[fold]}', flush=True)
g_pred_train = model.predict(np.array(features_train_corr))
train_pred_frame = pd.DataFrame(g_pred_train, columns=['g predicted train'])
train_pred_frame.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/g_pred/{modalities[modal]}_g_pred_train_fold_{folds[fold]}.csv')


train_pred_with_id_frame = pd.concat([g_train_id.astype(int), train_pred_frame], axis=1)
g_pred_train_with_id = train_pred_with_id_frame.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/g_pred/{modalities[modal]}_g_pred_train_id_fold_{folds[fold]}.csv')


# Test-set metrics on the first column of observed and predicted values,
# computed once and reused for both the log and the CSV row.
observed_g = np.array(g_test)[:, 0]
predicted_g = g_pred_test[:, 0]
fold_metrics = {
    'MSE': mean_squared_error(observed_g, predicted_g),
    'MAE': mean_absolute_error(observed_g, predicted_g),
    'R2': r2_score(observed_g, predicted_g),
    'Pearson r': pearsonr(observed_g, predicted_g),
}

print(f"Fold = {folds[fold]}")
print("----------")
print("MSE = ", fold_metrics['MSE'])
print("MAE = ", fold_metrics['MAE'])
print("R2 = ", fold_metrics['R2'])
print("Pearson's r = ", fold_metrics['Pearson r'])
print("----------")

pls_result.update({
    'fold': folds[fold],
    'modalities': modalities[modal],
    'n_components': model.best_params_,
    **fold_metrics,
})

print(f'Saving PLS result for {modalities[modal]} fold {folds[fold]}', flush=True)
# Append mode, no header row: each run adds one line of values.
with open(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/models/csv/{modalities[modal]}_fold_{folds[fold]}_PLS_result.csv', 'a', newline='') as f:
    csv.DictWriter(f, fieldnames=pls_result.keys()).writerow(pls_result)

pls_result.clear()

# One-column summary table of the same run.
observed_flat = np.squeeze(np.array(g_test))
predicted_flat = np.squeeze(g_pred_test)
corr, pval = stats.pearsonr(observed_flat, predicted_flat)
r2 = r2_score(observed_flat, predicted_flat)
mse = mean_squared_error(observed_flat, predicted_flat)
summary_frame = pd.DataFrame(
    [modalities[modal], fold, corr, pval, r2, mse, model.best_params_],
    index=['modalities[modal]', 'Fold', 'Correlation', 'P-value', 'R2', 'MSE', 'n components'],
    columns=['Values'],
)
result = summary_frame.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/aparc_2009/fold_{folds[fold]}/models/csv/{modalities[modal]}_fold_{folds[fold]}_full_result.csv')

# Schaefer 7n500p + MSA-IV

Because the input files are very large, deconfounding and PLS fitting are run as separate steps.

In [None]:
# Deconfound
folds = ["0", "1", "2", "3", "4"]
subcortical = ['s1', 's2', 's3', 's4']

###################################################### Preparatory steps
# One (subcortical variant, fold) pair is processed per invocation, selected by
# a single integer task index given as the first command-line argument:
#     fold    = index %  len(folds)
#     subcort = index // len(folds)

warnings.simplefilter(action='ignore', category=FutureWarning)
seed = 42
pls_result = {}

# Validate the task index up front so a missing or bad argument fails with a
# clear message instead of a NameError / IndexError further down.
n_tasks = len(subcortical) * len(folds)
if len(sys.argv) < 2:
    sys.exit(f'Usage: {sys.argv[0]} <task index 0..{n_tasks - 1}>')
try:
    task_index = int(sys.argv[1])
except ValueError:
    sys.exit(f'Task index must be an integer, got {sys.argv[1]!r}')
if not 0 <= task_index < n_tasks:
    sys.exit(f'Task index {task_index} is outside the valid range 0..{n_tasks - 1}')

fold = task_index % len(folds)
subcort = task_index // len(folds)

print(f'Started {subcortical[subcort]} fold {folds[fold]}', flush=True)
#####################################################

print('Reading confounds', flush=True)
confounds = pd.read_csv('/PLS/pls_rs/pls_idp/main/resting_files/rs_confounds.csv')

############## 1
print(f'Reading {subcortical[subcort]}', flush=True)

# The column pandas labels 'Unnamed: 0' is renamed to 'eid' so it can serve as the join key.
file = pd.read_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/resting_files/Schaefer7n500p_tian_{subcortical[subcort]}_arrays_full_correlation.csv').rename(columns={'Unnamed: 0':'eid'})

# Match confounds to MRI
print(f'Matching full brain data to confounds in {subcortical[subcort]}', flush=True)

# Inner joins: only participants present in both tables survive, and rows keep
# the order of the LEFT table (here: the confounds file).
conf_to_brain_match = pd.merge(confounds, file['eid'], on='eid')
brain_to_conf_match = pd.merge(conf_to_brain_match['eid'], file, on='eid')

print('Reading train/test sets and g-factor', flush=True)
# Upload g-factor with ID
g_train_full = pd.read_csv(f'/PLS/pls_dti/g_factor/g_train_with_id_fold_{folds[fold]}.csv')
g_test_full = pd.read_csv(f'/PLS/pls_dti/g_factor/g_test_with_id_fold_{folds[fold]}.csv')


# Match brain data to cognitive data
print(f'Matching brain data to train/test confounds in {subcortical[subcort]} fold {folds[fold]}', flush=True)
# The ids in the g-factor tables define the train/test split. Each (large)
# merge is done once; features and id column are taken from that single result.
brain_train_matched = pd.merge(brain_to_conf_match, g_train_full['eid'], on='eid')
brain_test_matched = pd.merge(brain_to_conf_match, g_test_full['eid'], on='eid')
brain_train, brain_train_id = brain_train_matched.drop(columns=['eid']), brain_train_matched['eid']
brain_test, brain_test_id = brain_test_matched.drop(columns=['eid']), brain_test_matched['eid']

brain_train.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_train_fold_{folds[fold]}.csv', index=False)
brain_test.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_test_fold_{folds[fold]}.csv', index=False)
brain_train_id.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_train_id_fold_{folds[fold]}.csv', index=False)
brain_test_id.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_test_id_fold_{folds[fold]}.csv', index=False)

############## 2
# Match confounds to MRI
brain_conf_train = pd.merge(conf_to_brain_match, brain_train_id, on='eid').drop(columns=['eid'])
brain_conf_test = pd.merge(conf_to_brain_match, brain_test_id, on='eid').drop(columns=['eid'])
brain_conf_train.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_conf_train_fold_{folds[fold]}.csv', index=False)
brain_conf_test.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_conf_test_fold_{folds[fold]}.csv', index=False)

############## 3
print(f'Matching g-factor to {subcortical[subcort]} fold {folds[fold]}', flush=True)

# Match g-factor back to MRI
# The brain ids are the LEFT operand on purpose: an inner merge keeps the row
# order of its left table, so the targets come out in the same participant
# order as brain_train / brain_test instead of in the g-factor file's order.
g_train_matched = pd.merge(brain_train_id, g_train_full, on='eid')
g_test_matched = pd.merge(brain_test_id, g_test_full, on='eid')

# Features and targets must describe the same participants, row for row.
if len(g_train_matched) != len(brain_train) or len(g_test_matched) != len(brain_test):
    raise ValueError(
        'g-factor rows do not match brain rows: '
        f'train {len(g_train_matched)} vs {len(brain_train)}, '
        f'test {len(g_test_matched)} vs {len(brain_test)}'
    )

g_train, g_train_id = g_train_matched.drop(columns=['eid']), g_train_matched['eid']
g_test, g_test_id = g_test_matched.drop(columns=['eid']), g_test_matched['eid']
g_train.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/g_train_Schaefer7n500p_{subcortical[subcort]}_full_correlation_matched_fold_{folds[fold]}.csv', index=False)
g_test.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/g_test_Schaefer7n500p_{subcortical[subcort]}_full_correlation_matched_fold_{folds[fold]}.csv', index=False)
g_train_id.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/g_train_id_Schaefer7n500p_{subcortical[subcort]}_full_correlation_matched_fold_{folds[fold]}.csv', index=False)
g_test_id.to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/g_test_id_Schaefer7n500p_{subcortical[subcort]}_full_correlation_matched_fold_{folds[fold]}.csv', index=False)

############## 4
print(f'Applying ConfoundRegressor to {subcortical[subcort]} fold {folds[fold]}', flush=True)

# Apply ConfoundRegressor
# confound_regressor returns SIX values; the sixth (the fitted confound scaler)
# must be unpacked too, otherwise the assignment raises "too many values to
# unpack". Only the feature scaler is written to disk in this step.
features_train_corr, features_test_corr, features_train_scaled, features_test_scaled, scaler_features, scaler_confounds = confound_regressor(brain_train, brain_test, brain_conf_train, brain_conf_test)
pd.DataFrame(features_train_corr, columns = brain_train.columns).to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/scaling/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_train_corr_{folds[fold]}.csv', index=False)
pd.DataFrame(features_test_corr, columns = brain_test.columns).to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/scaling/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_test_corr_{folds[fold]}.csv', index=False)

pd.DataFrame(features_train_scaled, columns = brain_train.columns).to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/scaling/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_train_scaled_{folds[fold]}.csv', index=False)
pd.DataFrame(features_test_scaled, columns = brain_test.columns).to_csv(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/scaling/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_test_scaled_{folds[fold]}.csv', index=False)


with open(f'/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/scaling/fc/scaler_features_Schaefer7n500p_{subcortical[subcort]}_full_correlation_fold_{folds[fold]}.pkl', "wb") as f:
    pickle.dump(scaler_features, f)

In [None]:
## Define modalities
subcortical = ['s1', 's2', 's3', 's4']
folds = ["0", "1", "2", "3", "4"]

warnings.simplefilter(action='ignore', category=FutureWarning)
seed = 42  # random_state for the cross-validation splitter used in the grid search
pls_result = {}

# One (subcortical variant, fold) pair is processed per invocation, selected by
# a single integer task index given as the first command-line argument:
#     fold    = index %  len(folds)
#     subcort = index // len(folds)
# Validate it up front so a missing or bad argument fails with a clear message
# instead of a NameError / IndexError further down.
n_tasks = len(subcortical) * len(folds)
if len(sys.argv) < 2:
    sys.exit(f'Usage: {sys.argv[0]} <task index 0..{n_tasks - 1}>')
try:
    task_index = int(sys.argv[1])
except ValueError:
    sys.exit(f'Task index must be an integer, got {sys.argv[1]!r}')
if not 0 <= task_index < n_tasks:
    sys.exit(f'Task index {task_index} is outside the valid range 0..{n_tasks - 1}')

fold = task_index % len(folds)
subcort = task_index // len(folds)

###################################################### Preparatory steps
# Read the files produced by the deconfounding step. That step writes every
# file name with a `_full_correlation` infix, so the same infix is used here.
# NOTE(review): these paths start with /projects/PLS while the deconfounding
# step writes under /PLS - confirm both refer to the same storage location.
features_train_corr = pd.read_csv(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/scaling/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_train_corr_{folds[fold]}.csv')
features_test_corr = pd.read_csv(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/scaling/fc/Schaefer7n500p_{subcortical[subcort]}_full_correlation_test_corr_{folds[fold]}.csv')

g_test_id = pd.read_csv(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/g_test_id_Schaefer7n500p_{subcortical[subcort]}_full_correlation_matched_fold_{folds[fold]}.csv')
g_train_id = pd.read_csv(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/g_train_id_Schaefer7n500p_{subcortical[subcort]}_full_correlation_matched_fold_{folds[fold]}.csv')

g_test = pd.read_csv(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/g_test_Schaefer7n500p_{subcortical[subcort]}_full_correlation_matched_fold_{folds[fold]}.csv')
g_train = pd.read_csv(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/suppl/fc/g_train_Schaefer7n500p_{subcortical[subcort]}_full_correlation_matched_fold_{folds[fold]}.csv')


# Initiate and run PLS
# Grid search over the number of PLS components (1..25), scored by negative
# mean absolute error on a shuffled 10-fold split seeded with `seed`.
parameters = {'n_components': range(1, 26)}
pls = PLSRegression()
inner_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
model = GridSearchCV(
    estimator=pls,
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    cv=inner_cv,
    verbose=4,
)

#############
print(f'Fitting PLS to {subcortical[subcort]} fold {folds[fold]}', flush=True)

model.fit(features_train_corr, np.asarray(g_train))

# Report the full grid before the winner.
cv_results = model.cv_results_
print(f'Model parameters for fold {folds[fold]}:', cv_results['params'])
print(f'Mean test score for fold {folds[fold]}:', cv_results['mean_test_score'])
print(f'Rank test score for fold {folds[fold]}:', cv_results['rank_test_score'])
print(model)

print(f'Saving PLS model for {subcortical[subcort]} fold {folds[fold]}')
model_path = f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/models/pkl/Schaefer7n500p_{subcortical[subcort]}_full_correlation_model_fold_{folds[fold]}.pkl'
with open(model_path, "wb") as f:
    pickle.dump(model, f)

print(f'Best params in fold {folds[fold]} = ', model.best_params_)
print(f'Best score (neg_mean_absolute_error) in fold {folds[fold]} = ', model.best_score_)

# Predict the values
print(f'Predicting & saving g_test for {subcortical[subcort]} fold {folds[fold]}', flush=True)
g_pred_test = model.predict(np.array(features_test_corr))
test_pred_frame = pd.DataFrame(g_pred_test, columns=['g predicted test'])
test_pred_frame.to_csv(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/g_pred/Schaefer7n500p_{subcortical[subcort]}_full_correlation_test_fold_{folds[fold]}.csv')

# Same predictions with the participant id in front. (`to_csv` with a path returns None.)
test_pred_with_id_frame = pd.concat([g_test_id.astype(int), test_pred_frame], axis=1)
g_pred_test_with_id = test_pred_with_id_frame.to_csv(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/g_pred/Schaefer7n500p_{subcortical[subcort]}_full_correlation_g_pred_test_id_fold_{folds[fold]}.csv')


print(f'Predicting & saving g_train for {subcortical[subcort]} fold {folds[fold]}', flush=True)
g_pred_train = model.predict(np.array(features_train_corr))
train_pred_frame = pd.DataFrame(g_pred_train, columns=['g predicted train'])
train_pred_frame.to_csv(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/g_pred/Schaefer7n500p_{subcortical[subcort]}_full_correlation_g_pred_train_fold_{folds[fold]}.csv')


train_pred_with_id_frame = pd.concat([g_train_id.astype(int), train_pred_frame], axis=1)
g_pred_train_with_id = train_pred_with_id_frame.to_csv(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/g_pred/Schaefer7n500p_{subcortical[subcort]}_full_correlation_g_pred_train_id_fold_{folds[fold]}.csv')


# Test-set metrics on the first column of observed and predicted values,
# computed once and reused for both the log and the CSV row.
observed_g = np.array(g_test)[:, 0]
predicted_g = g_pred_test[:, 0]
fold_metrics = {
    'MSE': mean_squared_error(observed_g, predicted_g),
    'MAE': mean_absolute_error(observed_g, predicted_g),
    'R2': r2_score(observed_g, predicted_g),
    'Pearson r': pearsonr(observed_g, predicted_g),
}

print(f"Fold = {folds[fold]}")
print("----------")
print("MSE = ", fold_metrics['MSE'])
print("MAE = ", fold_metrics['MAE'])
print("R2 = ", fold_metrics['R2'])
print("Pearson's r = ", fold_metrics['Pearson r'])
print("----------")

pls_result.update({
    'fold': folds[fold],
    'modalities': subcortical[subcort],
    'n_components': model.best_params_,
    **fold_metrics,
})

print(f'Saving PLS result for {subcortical[subcort]} fold {folds[fold]}', flush=True)
# Append mode, no header row: each run adds one line of values.
with open(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/models/csv/Schaefer7n500p_{subcortical[subcort]}_full_correlation_fold_{folds[fold]}_PLS_result.csv', 'a', newline='') as f:
    csv.DictWriter(f, fieldnames=pls_result.keys()).writerow(pls_result)

pls_result.clear()

# One-column summary table of the same run.
observed_flat = np.squeeze(np.array(g_test))
predicted_flat = np.squeeze(g_pred_test)
corr, pval = stats.pearsonr(observed_flat, predicted_flat)
r2 = r2_score(observed_flat, predicted_flat)
mse = mean_squared_error(observed_flat, predicted_flat)
summary_frame = pd.DataFrame(
    [subcortical[subcort], fold, corr, pval, r2, mse, model.best_params_],
    index=['Modality', 'Fold', 'Correlation', 'P-value', 'R2', 'MSE', 'n components'],
    columns=['Values'],
)
result = summary_frame.to_csv(f'/projects/PLS/pls_rs/pls_rs_timeseries/shaefer_7n500_600/fold_{folds[fold]}/models/csv/Schaefer7n500p_{subcortical[subcort]}_full_correlation_fold_{folds[fold]}_full_result.csv')