In [None]:
import csv
import os
import warnings
import pickle
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, MaxNLocator
import matplotlib.ticker as ticker
from matplotlib import colors
import matplotlib.ticker as plticker
import sklearn
from wordcloud import WordCloud
import matplotlib.colors as mcolors
import seaborn as sns
from textwrap import wrap
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV
from matplotlib.ticker import FormatStrFormatter
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# PLS on 5 folds

## Match mental health to g-factor: 5 folds

In [None]:
# Mental-health feature table; it carries the `eid` column that the fold-matching merges use as key.
mh_csv_path = 'Cog-Ment/CSVs/2024/mental_health/mental_health_full_renamed.csv'
mh = pd.read_csv(mh_csv_path)

In [None]:
# Match mental health to g-factor: 5 folds
#
# For every fold this cell
#   1. reads the fold's train/test participant IDs and g-factor scores,
#   2. keeps the mental-health rows whose `eid` appears in the fold's ID list,
#   3. keeps the g-factor rows whose `eid` survived step 2, in the SAME row order, and
#   4. writes the matched features, targets and IDs to the fold's `suppl/` folder.
folds = ["0", "1", "2", "3", "4"]
seed = 42
pls_result = {}
result_list = []

warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:
    id_dir = f'Cog-Ment/g_factor_5_folds_python/fold_{fold}'
    g_dir = f'Cog-Ment/R/g_factor_5_folds/fold_{fold}'
    out_dir = f'Cog-Ment/PLS/2024/mental_health/folds/fold_{fold}/suppl'

    train_id = pd.read_csv(f'{id_dir}/train_id_fold_{fold}.csv')
    test_id = pd.read_csv(f'{id_dir}/test_id_fold_{fold}.csv')

    # Upload g-factor with ID (IDs are attached positionally, row i of the score file <-> row i of the ID file)
    g_train_full = pd.concat([pd.read_csv(f'{g_dir}/g_train_{fold}.csv'), train_id.astype(int)], axis=1)
    g_test_full = pd.concat([pd.read_csv(f'{g_dir}/g_test_{fold}.csv'), test_id.astype(int)], axis=1)

    # Match mental health data to cognitive data.
    # An inner merge keeps the row order of its LEFT frame, so these rows follow the order of `mh`.
    mh_train_match = pd.merge(mh, train_id, on='eid')
    mh_test_match = pd.merge(mh, test_id, on='eid')
    mh_train = mh_train_match.drop(columns=['eid'])
    mh_test = mh_test_match.drop(columns=['eid'])
    mh_train.to_csv(f'{out_dir}/mh_train_fold_{fold}.csv', index=False)
    mh_test.to_csv(f'{out_dir}/mh_test_fold_{fold}.csv', index=False)

    # Get IDs
    mh_train_id = mh_train_match['eid']
    mh_test_id = mh_test_match['eid']
    mh_train_id.to_csv(f'{out_dir}/mh_train_id_fold_{fold}.csv', index=False)
    mh_test_id.to_csv(f'{out_dir}/mh_test_id_fold_{fold}.csv', index=False)

    # Match g-factor back to mental health.
    # The mental-health IDs are the LEFT frame on purpose: the g rows then come out in exactly the
    # same order as the mental-health rows written above. Merging with the g table on the left
    # would keep the g table's order instead, and features/targets (which are later paired by row
    # position) could be silently misaligned.
    g_train_match = pd.merge(mh_train_id.to_frame(), g_train_full, on='eid')
    g_test_match = pd.merge(mh_test_id.to_frame(), g_test_full, on='eid')

    # Fail loudly if features and targets are not row-aligned (e.g. duplicated IDs).
    assert np.array_equal(mh_train_id.to_numpy(), g_train_match['eid'].to_numpy()), \
        f'fold {fold}: train features and g-factor rows are not aligned on eid'
    assert np.array_equal(mh_test_id.to_numpy(), g_test_match['eid'].to_numpy()), \
        f'fold {fold}: test features and g-factor rows are not aligned on eid'

    g_train_match['eid'].to_csv(f'{out_dir}/g_train_matched_id_fold_{fold}.csv', index=False)
    g_test_match['eid'].to_csv(f'{out_dir}/g_test_matched_id_fold_{fold}.csv', index=False)
    g_train = g_train_match.drop(columns=['eid'])
    g_test = g_test_match.drop(columns=['eid'])
    g_train.to_csv(f'{out_dir}/g_train_matched_fold_{fold}.csv', index=False)
    g_test.to_csv(f'{out_dir}/g_test_matched_fold_{fold}.csv', index=False)

## Run PLS on 5 folds

In [None]:
# PLS per fold: standardise every mental-health column (scaler fitted on the training rows only),
# tune n_components with a seeded 10-fold CV grid search, predict the held-out fold and store
# the scaler, the fitted search object, the predictions and the test metrics.
folds = ["0", "1", "2", "3", "4"]
seed = 42

warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:
    fold_dir = f'Cog-Ment/PLS/2024/mental_health/folds/fold_{fold}'

    mh_train = pd.read_csv(f'{fold_dir}/suppl/mh_train_fold_{fold}.csv')
    mh_test = pd.read_csv(f'{fold_dir}/suppl/mh_test_fold_{fold}.csv')

    # Scale: fit on train, apply the same transform to test
    scaler = StandardScaler()
    mh_train_scaled = pd.DataFrame(scaler.fit_transform(mh_train), columns=mh_train.columns)
    mh_test_scaled = pd.DataFrame(scaler.transform(mh_test), columns=mh_test.columns)

    with open(f'{fold_dir}/models/scaler_features_fold_{fold}.pkl', "wb") as f:
        pickle.dump(scaler, f)

    # Targets; rows are paired with the feature rows purely by position.
    # NOTE(review): assumes the matched files were written row-aligned — confirm.
    g_train = pd.read_csv(f'{fold_dir}/suppl/g_train_matched_fold_{fold}.csv')
    g_test = pd.read_csv(f'{fold_dir}/suppl/g_test_matched_fold_{fold}.csv')

    # Initiate and run PLS: try every number of components from 1 to the number of features
    parameters = {'n_components': range(1, mh_train_scaled.shape[1] + 1)}
    pls = PLSRegression()
    # random_state makes the shuffled inner CV splits (and hence the chosen n_components) reproducible
    inner_cv = KFold(10, shuffle=True, random_state=seed)
    model = GridSearchCV(pls, parameters, scoring='neg_mean_absolute_error', cv=inner_cv, verbose=4, n_jobs=17)

    print("Fitting PLS")
    model.fit(np.array(mh_train_scaled), np.array(g_train))
    print(f'Best params in fold {fold} = ', model.best_params_)
    print(f'Best score (neg_mean_absolute_error) in fold {fold} = ', model.best_score_)

    with open(f'{fold_dir}/models/mh_pls_model_fold_{fold}.pkl', "wb") as f:
        pickle.dump(model, f)

    # Predict the values
    print(f'Predicting g_test in fold {fold}')
    g_pred_test = model.predict(np.array(mh_test_scaled))
    print(f'Saving g pred fold {fold}')
    pd.DataFrame(g_pred_test, columns=['g_pred_mh']).to_csv(f'{fold_dir}/g_pred/g_pred_mh_fold_{fold}.csv', index=False)

    # Compute each test metric once and reuse it for both the log and the result file
    g_true_1d = np.array(g_test)[:, 0]
    g_pred_1d = g_pred_test[:, 0]
    mse = mean_squared_error(g_true_1d, g_pred_1d)
    mae = mean_absolute_error(g_true_1d, g_pred_1d)
    r2 = r2_score(g_true_1d, g_pred_1d)
    r = pearsonr(g_true_1d, g_pred_1d)

    print(f"Fold = {fold}")
    print("----------")
    print("MSE = ", mse)
    print("MAE = ", mae)
    print("R2 = ", r2)
    print("Pearson's r = ", r)
    print("----------")

    # Field order defines the column order of the header-less result file
    pls_result = {
        'fold': fold,
        'n_components': model.best_params_,
        'MSE': mse,
        'MAE': mae,
        'R2': r2,
        'r': r,
    }

    # 'w' (not 'a'): re-running the cell must replace the fold's row instead of appending a duplicate
    with open(f'{fold_dir}/models/MH_Result_PLS_fold_{fold}.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=pls_result.keys())
        writer.writerow(pls_result)

In [None]:
# Upload individual pls results and merge them into one table
folds = ["0", "1", "2", "3", "4"]
result_columns = ['Fold', 'n components', 'MSE', 'MAE', 'R2', 'r']

pls_5_folds = []
for fold in folds:
    # The per-fold files have no header row, so the column names are supplied here
    fold_result = pd.read_csv(f'Cog-Ment/PLS/2024/mental_health/folds/fold_{fold}/models/MH_Result_PLS_fold_{fold}.csv', header=None)
    fold_result.columns = result_columns
    pls_5_folds.append(fold_result)

# Concatenate once, after all folds are collected
mh_result = pd.concat(pls_5_folds, ignore_index=True)

# The 'r' column holds the text form of scipy's PearsonRResult, e.g.
#   PearsonRResult(statistic=0.31, pvalue=1e-10)
# or, when the values are rendered with their NumPy type,
#   PearsonRResult(statistic=np.float64(0.31), pvalue=np.float64(1e-10)).
# Strip everything except the two numbers, then split them into separate columns.
mh_result['r'] = mh_result['r'].str.replace(r'PearsonRResult\(statistic=|pvalue=|np\.float\d+\(|\)', '', regex=True)
mh_result[['r', 'p-value']] = mh_result['r'].str.split(',', expand=True).astype(float).round(decimals=3)
mh_result = mh_result.round(decimals=3)
mh_result.to_csv('Cog-Ment/PLS/2024/mental_health/folds/pls_mh_result_5_folds.csv', index=False)
mh_result

In [None]:
# Mean of the headline test metrics across the folds in `mh_result`, rounded to 2 decimals
metric_columns = ['R2', 'r', 'MSE', 'MAE']
result_5_folds_mean = mh_result[metric_columns].mean().round(2)
result_5_folds_mean

### Visualize real and predicted g-factors

In [None]:
# Reference list of colormap names (presumably those accepted by seaborn/matplotlib `cmap=` arguments).
# This is a bare tuple of strings that is not assigned to any name: running the cell only displays it.
'Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r',
'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r',
'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Grays', 'Greens',
'Greens_r', 'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges',
'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1',
'Pastel1_r', 'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu',
'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r',
'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu',
'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r',
'Set1', 'Set1_r', 'Set2', 'Set2_r', 'Set3', 'Set3_r', 'Spectral',
'Spectral_r', 'Wistia', 'Wistia_r', 'YlGn', 'YlGnBu', 'YlGnBu_r',
'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 'afmhot',
'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'bone', 'bone_r',
'brg', 'brg_r', 'bwr', 'bwr_r', 'cividis', 'cividis_r', 'cool', 'cool_r',
'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'crest', 'crest_r', 'cubehelix',
'cubehelix_r', 'flag', 'flag_r', 'flare', 'flare_r', 'gist_earth', 'gist_earth_r',
'gist_gray', 'gist_gray_r', 'gist_grey', 'gist_heat', 'gist_heat_r', 'gist_ncar',
'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg',
'gist_yarg_r', 'gist_yerg', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r',
'grey', 'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'inferno', 'inferno_r', 'jet',
'jet_r', 'magma', 'magma_r', 'mako', 'mako_r', 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r',
'pink', 'pink_r', 'plasma', 'plasma_r', 'prism', 'prism_r', 'rainbow', 'rainbow_r', 'rocket', 'rocket_r',
'seismic', 'seismic_r', 'spring', 'spring_r', 'summer', 'summer_r', 'tab10', 'tab10_r', 'tab20', 'tab20_r',
'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'terrain', 'terrain_r', 'turbo', 'turbo_r', 'twilight', 'twilight_r',
'twilight_shifted', 'twilight_shifted_r', 'viridis', 'viridis_r', 'vlag', 'vlag_r', 'winter', 'winter_r'

In [None]:
# Grid: one observed-vs-predicted scatter panel per fold, annotated with Pearson's r and R²
folds = ["0", "1", "2", "3", "4"]

fig, axes = plt.subplots(1, 5, figsize=(18.54,3.54), dpi=600) #17, 4
# With a single row `axes` is already 1-D; flatten() is a harmless no-op here and keeps the
# loop below valid if the grid is ever changed to several rows.
axes = axes.flatten()

for ax, fold in zip(axes, folds):
    y_pred = pd.read_csv(f'Cog-Ment/PLS/2024/mental_health/folds/fold_{fold}/g_pred/g_pred_mh_fold_{fold}.csv')['g_pred_mh']
    y_true = pd.read_csv(f'Cog-Ment/PLS/2024/mental_health/folds/fold_{fold}/suppl/g_test_matched_fold_{fold}.csv')['g']
    corr, p = pearsonr(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Colour each point by its Euclidean distance from the centre of the cloud
    # (each axis centred on its OWN mean).
    dist_i = np.sqrt((y_true - y_true.mean())**2 + (y_pred - y_pred.mean())**2)
    sns.scatterplot(x=y_true, y=y_pred, c=dist_i, cmap='Greens', s=40, alpha=0.8, ax=ax) #'flare'
    sns.regplot(x=y_true, y=y_pred, line_kws={"color": "red", "linewidth": 1}, scatter=False, ax=ax)
    sns.despine (top=True, right=True, ax=ax)
    ax.set_xlabel('Observed Cognitive Ability (Z)', fontsize=15)
    ax.set_ylabel('Predicted Cognitive Ability (Z)', fontsize=15)
    ax.tick_params(axis='x', labelsize=16)
    ax.tick_params(axis='y', labelsize=16)
    ax.set_title(f'Fold {fold}', fontsize=20, y=1.1)

    # Annotate the plot with Pearson correlation and R² score.
    # round(float(...)) works whether the metric is a NumPy scalar or a built-in float
    # (a built-in float has no .round() method).
    ax.text(0.05, 1.0, f'$r$ = {round(float(corr), 2)}', transform=ax.transAxes, fontsize=15)
    ax.text(0.05, 0.91, f'$R$² = {round(float(r2), 2)}', transform=ax.transAxes, fontsize=15)
    ax.xaxis.set_major_locator(MultipleLocator(1))

# Remove the empty subplot (if any): every axis beyond the number of folds
for unused_ax in axes[len(folds):]:
    fig.delaxes(unused_ax)

plt.tight_layout()

# NOTE(review): this is an absolute path at the filesystem root, unlike the relative
# 'Cog-Ment/...' paths used for every other file in this notebook — confirm it is intended.
plt.savefig("/Plots_and_Tables/MH/greal_gpred_r_MH_5Folds.png",
            bbox_inches="tight",
            pad_inches=1,
            transparent=False,
            facecolor="w",
            edgecolor='w',
            orientation='landscape')

plt.show()

## Extract and visualize feature loadings for each fold

In [None]:
# Extract loadings: for every fold, load the fitted grid-search object and tabulate the
# x-loadings of its best PLS estimator (one row per feature, one column per component).
folds = ["0", "1", "2", "3", "4"]
mh_names = mh.drop(columns=['eid'])
# Feature names as a one-column frame (the column is labelled 0 and renamed to 'Features' below)
feature_names = pd.DataFrame(mh_names.columns)

loadings = {}                # 'Fold_<k>' -> loadings only
loadings_with_features = {}  # '<k>'      -> 'Features' column + loadings
for fold in folds:
    # NOTE: pickle.load can execute arbitrary code; only load model files produced by this project.
    with open(f'Cog-Ment/PLS/2024/mental_health/folds/fold_{fold}/models/mh_pls_model_fold_{fold}.pkl', "rb") as f:
        model = pickle.load(f)
    x_loadings = model.best_estimator_.x_loadings_
    loading = pd.DataFrame(x_loadings, columns=[f'Component {i+1}' for i in range(x_loadings.shape[1])])
    loadings[f'Fold_{fold}'] = loading
    # Feature names are attached positionally: row i of the loadings <-> i-th column of `mh_names`
    loadings_with_features[fold] = pd.concat([feature_names, loading], axis=1).rename(columns={0: 'Features'})

# Keep the per-fold variable names that the plotting cells use
loadings_fold_0 = loadings_with_features["0"]
loadings_fold_1 = loadings_with_features["1"]
loadings_fold_2 = loadings_with_features["2"]
loadings_fold_3 = loadings_with_features["3"]
loadings_fold_4 = loadings_with_features["4"]

In [None]:
# Loading plot Fold 0: first 10 components, sorted
# Column 0 is 'Features', so components 1..10 are columns 1..10 (slice end is exclusive -> 11).
# The slice simply yields fewer columns if the model has fewer than 10 components.
n_components_to_plot = 10
for component in loadings_fold_0.columns[1:n_components_to_plot + 1]:
    loadings_fold_0_sorted = loadings_fold_0.sort_values(by=component, ascending=False)

    fig, ax = plt.subplots(figsize=(20, 20))  # Adjust the figure size as needed
    ax.barh(loadings_fold_0_sorted['Features'], loadings_fold_0_sorted[component], color='skyblue')
    ax.set_xlabel('Loadings', fontsize=12)
    ax.set_title(f'Loading Plot for {component}', fontsize=30)
    ax.invert_yaxis()  # Invert the y-axis to have the first feature at the top
    fig.tight_layout()  # Adjust layout to fit all labels
    plt.show()
    plt.close(fig)  # Release the figure so the loop does not accumulate open figures

# Check correlations between g-factor and cognitive test scores to verify that g-factor adequately reflects test performance

It is more or less expected that the g-factor correlates negatively with time variables, such as reaction time and TMT performance (i.e., the higher the [reaction] time, the slower and therefore the worse the performance), as well as with the number of incorrect matches in the Pairs matching task (i.e., the more incorrect matches, the worse the performance).

In [None]:
# Correlation of every column of the single-split scaled training scores with the g-factor.
# NOTE(review): `g_train` is not loaded in this cell; it is whatever an earlier cell last
# assigned to that name — confirm it belongs to the same split as `y_train_scaled`
# (Series.corr pairs values by index label).
y_train_scaled = pd.read_csv('Cog-Ment/PLS/2024/mental_health/single_split/y_train_scaled.csv')
g_cog_corr = {
    test_name: y_train_scaled[test_name].corr(g_train['g'])
    for test_name in y_train_scaled.columns
}
g_cog_corr

In [None]:
# Per-fold training tables: y_train_<k> is read from the fold's 'train_scaled' file,
# g_train_<k> from the fold's g-factor file with the 'eid' column dropped.
y_train_0, y_train_1, y_train_2, y_train_3, y_train_4 = (
    pd.read_csv(f"Cog-Ment/g_factor_5_folds_python/fold_{k}/train_scaled_fold_{k}.csv")
    for k in range(5)
)

g_train_0, g_train_1, g_train_2, g_train_3, g_train_4 = (
    pd.read_csv(f'Cog-Ment/PLS/g_factor/g_train_with_id_fold_{k}.csv').drop(columns='eid')
    for k in range(5)
)

In [None]:
# Fold 0: correlation of each column of y_train_0 with the fold's g-factor
g_cog_corr_0 = {
    test_name: y_train_0[test_name].corr(g_train_0['g'])
    for test_name in y_train_0.columns
}
g_cog_corr_0

In [None]:
# Fold 1: correlation of each column of y_train_1 with the fold's g-factor
g_cog_corr_1 = {
    test_name: y_train_1[test_name].corr(g_train_1['g'])
    for test_name in y_train_1.columns
}
g_cog_corr_1

In [None]:
# Fold 2: correlation of each column of y_train_2 with the fold's g-factor
g_cog_corr_2 = {
    test_name: y_train_2[test_name].corr(g_train_2['g'])
    for test_name in y_train_2.columns
}
g_cog_corr_2

In [None]:
# Fold 4: correlation of each column of y_train_4 with the fold's g-factor
g_cog_corr_4 = {
    test_name: y_train_4[test_name].corr(g_train_4['g'])
    for test_name in y_train_4.columns
}
g_cog_corr_4