In [None]:
import csv
import os
import warnings
import pickle
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, MaxNLocator
import matplotlib.ticker as ticker
from matplotlib import colors
import matplotlib.ticker as plticker
import sklearn
from sklearn.neighbors import KernelDensity
import matplotlib as mpl
import matplotlib.gridspec as grid_spec
import matplotlib.colors as mcolors
import seaborn as sns
from textwrap import wrap
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV
from matplotlib.ticker import FormatStrFormatter
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Structure of the Mental Health variables: correlations, distributions

In [None]:
# Restrict the mental health features to participants that also appear in the target table.
mh = pd.read_csv('/mental_health/mental_health_full_renamed.csv')
targets = pd.read_csv('/Cog-Ment/R/g_factor_5_folds/target.csv')

# Only the participant id of `targets` takes part in the (inner) join,
# so no target column is carried over into the result.
target_ids = targets['eid']
mh_matched_to_targets = mh.merge(target_ids, on='eid')
mh_matched_to_targets

### Age and sex distributions

In [None]:
# Attach sex and age to the matched mental health table and save the result.
AGE_COLUMN = 'Age when attended assessment centre'
basic_confounds_min = pd.read_csv('/ML_DATASETS/basic_confounds_min.csv')[['Sex', AGE_COLUMN, 'eid']]

# Rename through an explicit mapping instead of overwriting `.columns` positionally:
# the result no longer depends on the column order, and `rename` returns a new frame,
# so `basic_confounds_min` keeps its original labels.
age_sex = basic_confounds_min.rename(columns={AGE_COLUMN: 'Age'})

# Inner join on the participant id: rows without age/sex information are dropped.
mh_age_sex = mh_matched_to_targets.merge(age_sex, on='eid')
mh_age_sex.to_csv('/mental_health/mh_age_sex.csv', index=False)

In [None]:
# Descriptive statistics for age and sex in the matched sample.
age = mh_age_sex['Age']
sex = mh_age_sex['Sex']

age_mean = age.mean().round(2)
age_sd = f"{age.std():.3f}"

print('Mean age', age_mean)
print(f"SD age {age_sd}")

print('Age when attended assessment centre, mean:', age_mean, 'SD:', age_sd)
print('Age max range:', age.max())
print('Age min range:', age.min())

# value_counts() is indexed by the Sex code; the labels below treat code 1 as male
# and code 0 as female. The printed figures are percentages of all rows.
sex_counts = sex.value_counts()
n_rows = len(sex)
print('Proportion of males:', (sex_counts[1] / n_rows * 100).round(2))
print('Proportion of females:', (sex_counts[0] / n_rows * 100).round(2))

### Distribution of mental health scores

In [None]:
# Distribution of mental health scores: one histogram per feature on a shared grid.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Histograms per row. The number of rows is derived from this same value, so the grid is
# only as large as needed (previously rows were computed for 5 per row while 9 were drawn,
# which created many empty axes that then had to be deleted).
N_COLS = 9

# Drop the participant id once instead of on every loop iteration.
mh_scores = mh_matched_to_targets.drop(columns='eid')
columns = mh_scores.columns
num_rows = -(-len(columns) // N_COLS)  # ceiling division

fig, axes = plt.subplots(nrows=num_rows, ncols=N_COLS, figsize=(50, num_rows * 5))
axes = axes.flatten()
for ax, col in zip(axes, columns):
    column_data = mh_scores[col]
    min_val = column_data.min()

    # y axis tick locator: up to 10 "nice" intervals, or a single tick at 0 when the
    # feature has no spread.
    if column_data.max() - min_val > 0:
        locator = ticker.MaxNLocator(nbins=10, steps=[1, 2, 5, 10])
    else:
        locator = ticker.FixedLocator([0])

    sns.histplot(column_data, ax=ax, color='wheat', binwidth=0.3, linewidth=0.5)
    ax.tick_params(axis='x', labelsize=10)

    # Long feature names are wrapped so they do not run into the neighbouring panel.
    wrapped_col = '\n'.join(wrap(col, width=25))

    ax.set_xlabel(wrapped_col, fontsize=20)  # Name of the x axis
    ax.set_ylabel(None)  # No y axis name
    ax.tick_params(axis='y', labelsize=12)  # Size of the y ticks

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))  # x axis tick interval
    ax.yaxis.set_major_locator(locator)  # y axis tick interval

    ax.set_xlim(left=min_val)

# Remove the panels of the last row that have no feature assigned.
for unused_ax in axes[len(columns):]:
    fig.delaxes(unused_ax)

plt.subplots_adjust(hspace=0.5, wspace=0.7)
plt.suptitle('Distribution of Mental Health Scores', fontsize=40, y=1.0)
plt.tight_layout()
plt.show()

### Correlation plot for mental health scores

In [None]:
# Correlation plot for mental health scores: only significant correlations
mh_scores = mh_matched_to_targets.drop(columns='eid')  # drop the id once, not inside the loops
score_names = mh_scores.columns
correlation_features = mh_scores.corr(method='pearson')

# Pairwise p-values of Pearson's r. The test is symmetric, so every pair is tested once and
# the value is mirrored. The matrix is filled as a NumPy array: the former chained
# assignment `p_values[row][col] = ...` does not write through under pandas Copy-on-Write.
# The diagonal stays NaN, so it is never masked below.
n_scores = len(score_names)
p_matrix = np.full((n_scores, n_scores), np.nan)
for i in range(n_scores):
    for j in range(i + 1, n_scores):
        _, p_value = pearsonr(mh_scores.iloc[:, i], mh_scores.iloc[:, j])
        p_matrix[i, j] = p_value
        p_matrix[j, i] = p_value
p_values = pd.DataFrame(p_matrix, index=score_names, columns=score_names)

significance_level = 0.05
significant_correlations = p_values[p_values < significance_level]
# Hide every coefficient whose p-value does not reach the significance level.
mask = p_values >= significance_level
significant_corr_matrix = correlation_features.mask(mask)

plt.figure(figsize=(20, 10))
sns.heatmap(significant_corr_matrix, cmap='coolwarm', xticklabels=correlation_features.columns, yticklabels=correlation_features.columns)
plt.tick_params(labelsize=8)
plt.title('Heatmap of Significant Correlations')
plt.show()

In [None]:
# Correlation plot for mental health scores: lower triangle only
correlation_features = mh_matched_to_targets.drop(columns='eid').corr(method='pearson')
plt.figure(figsize=(70, 70))
# k=1 hides the cells strictly above the diagonal; the diagonal itself stays visible.
mask = np.triu(np.ones_like(correlation_features, dtype=bool), k=1)
plot = sns.heatmap(correlation_features, cmap='coolwarm',
                   xticklabels=correlation_features.columns,
                   yticklabels=correlation_features.columns,
                   mask=mask)
# Restyle each axis using its own tick labels (the x axis previously borrowed the y labels).
plot.set_xticklabels(plot.get_xticklabels(), rotation=70, ha='right', fontsize=30)
plot.set_yticklabels(plot.get_yticklabels(), fontsize=30)
cbar = plot.collections[0].colorbar
cbar.ax.tick_params(labelsize=20)
plt.show()

In [None]:
# Sanity check of the correlation matrix: Wellbeing is expected to correlate positively
# only with happiness/wellbeing-related variables.
wellbeing_corr = correlation_features['Wellbeing']
with pd.option_context('display.max_rows', None):
    # Show every row instead of the truncated default view.
    display(wellbeing_corr)

# Single split to explain the model

In [None]:
# Single 80:20 split of the mental health features (X) and the cognitive targets (y).
# Saves the participant ids, the unscaled and standardised splits, and the fitted scalers.

# One output directory for every artefact. The scalers used to be written to
# '/mental_health/Single_split/', which is a different directory on case-sensitive filesystems.
SPLIT_DIR = '/mental_health/single_split'

target_cols = ['log_RT', 'FIS',
       'Numeric memory:Max digits remembered correctly',
       'log_TMT:Duration to complete numeric path',
       'log_TMT:Duration to complete alphabetic path',
       'SDS:Numb of symbol digit matches made correctly',
       'PAL:Numb of word pairs correctly associated',
       'Tower rearranging:Numb of puzzles correct',
       'Matrix PC: Numb of puzzles correctly solved',
       'log1p_Pairs match:Incorrect matches (round 2)',
       'Picture vocab:Specific cognitive ability',
       'Prosp memory:Initial answer']

targets = pd.read_csv('/Cog-Ment/R/g_factor_5_folds/target.csv')
ds = pd.merge(mh, targets, on='eid')

# Both halves keep 'eid' until after the split so the ids of each partition can be saved.
X = ds.drop(columns=target_cols)
y = ds[['eid'] + target_cols]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X_train[['eid']].to_csv(f'{SPLIT_DIR}/train_id.csv', index=False)
X_test[['eid']].to_csv(f'{SPLIT_DIR}/test_id.csv', index=False)

X_train = X_train.drop(columns='eid').reset_index(drop=True)
X_test = X_test.drop(columns='eid').reset_index(drop=True)
y_train = y_train.drop(columns='eid').reset_index(drop=True)
y_test = y_test.drop(columns='eid').reset_index(drop=True)

X_train.to_csv(f'{SPLIT_DIR}/X_train_original.csv', index=False)
X_test.to_csv(f'{SPLIT_DIR}/X_test_original.csv', index=False)

y_train.to_csv(f'{SPLIT_DIR}/y_train_original.csv', index=False)
y_test.to_csv(f'{SPLIT_DIR}/y_test_original.csv', index=False)

# The scalers are fitted on the training part only and then applied to the test part.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled, X_test_scaled = scaler_X.fit_transform(X_train), scaler_X.transform(X_test)
with open(f'{SPLIT_DIR}/scaler_X.pkl', "wb") as f:
    pickle.dump(scaler_X, f)

y_train_scaled, y_test_scaled = scaler_y.fit_transform(y_train), scaler_y.transform(y_test)
with open(f'{SPLIT_DIR}/scaler_y.pkl', "wb") as f:
    pickle.dump(scaler_y, f)

# The scalers return plain arrays, so the column names are restored before saving.
pd.DataFrame(X_train_scaled, columns=X_train.columns).to_csv(f'{SPLIT_DIR}/X_train_scaled.csv', index=False)
pd.DataFrame(X_test_scaled, columns=X_train.columns).to_csv(f'{SPLIT_DIR}/X_test_scaled.csv', index=False)

pd.DataFrame(y_train_scaled, columns=y_train.columns).to_csv(f'{SPLIT_DIR}/y_train_scaled.csv', index=False)
pd.DataFrame(y_test_scaled, columns=y_train.columns).to_csv(f'{SPLIT_DIR}/y_test_scaled.csv', index=False)

Remove composite scores and test model performance

In [None]:
# Largest value of every column (features and targets), rounded and sorted in ascending order.
column_maxima = ds.max().round(2).sort_values()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    # Lift the display limits so that no entry is truncated.
    display(column_maxima)

In [None]:
# Same 80:20 split as before, but with the listed composite scores removed from the features.
targets = pd.read_csv('/Cog-Ment/R/g_factor_5_folds/target.csv')
ds = pd.merge(mh, targets, on='eid')

print('Mental health shape before removing scores:', ds.shape)

scores_to_remove = [
    'NS-12',
    'RDS-4',
    'Wellbeing',
    'GAD-7',
    'PHQ-9',
    'PCL-6',
    'PDS',
    'Ever self-harmed (non-suicidal)',
    'Ever self-harmed',
    'Ever attempted suicide',
    'Substance addiction',
    'Current addiction',
    'Alcohol dependence (AUDIT≥15)',
    'GAD ever',
    'Current GAD',
    'Current GAD mild',
    'Current GAD moderate',
    'PTSD',
    'Current GAD severe',
    'Hazardous alcohol use (AUDIT≥8)',
    'Current severe depression',
    'Current depression',
    'Depression triggered by loss',
    'Recurrent depression',
    'Depression single episode',
    'Bipolar II',
    'Bipolar I',
    'Subthreshold depression',
    'Depression ever',
    'Lifetime frequency of contemplating self-harm',
    "Frequency of 'life not worth living' thoughts",
    '(log)AUDIT-C',
    'Ever depressed for a whole week',
    '(log)AUDIT-P',
    '(log)AUDIT',
    'Childhood adverse events',
    'Adult adverse events',
    'Catastrophic trauma',
    'Any distress',
    'Unusual experience',
]

target_cols = [
    'log_RT',
    'FIS',
    'Numeric memory:Max digits remembered correctly',
    'log_TMT:Duration to complete numeric path',
    'log_TMT:Duration to complete alphabetic path',
    'SDS:Numb of symbol digit matches made correctly',
    'PAL:Numb of word pairs correctly associated',
    'Tower rearranging:Numb of puzzles correct',
    'Matrix PC: Numb of puzzles correctly solved',
    'log1p_Pairs match:Incorrect matches (round 2)',
    'Picture vocab:Specific cognitive ability',
    'Prosp memory:Initial answer',
]

ds = ds.drop(columns=scores_to_remove)
X = ds.drop(columns=target_cols)
y = ds[['eid', *target_cols]]

print('Mental health shape after removing scores and target:', X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Save the participant ids of each partition before the id column is dropped.
for id_source, id_path in ((X_train, '/mh/train_id.csv'), (X_test, '/mh/test_id.csv')):
    id_source[['eid']].to_csv(id_path, index=False)

X_train, X_test, y_train, y_test = (
    part.drop(columns='eid').reset_index(drop=True)
    for part in (X_train, X_test, y_train, y_test)
)

# Unscaled partitions.
for part, csv_path in (
    (X_train, '/mh/X_train_original.csv'),
    (X_test, '/mh/X_test_original.csv'),
    (y_train, '/mh/y_train_original.csv'),
    (y_test, '/mh/y_test_original.csv'),
):
    part.to_csv(csv_path, index=False)

# Standardise with statistics of the training part only; keep the fitted scalers on disk.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
with open('/mh/scaler_X.pkl', "wb") as f:
    pickle.dump(scaler_X, f)

y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)
with open('/mh/scaler_y.pkl', "wb") as f:
    pickle.dump(scaler_y, f)

# Scaled partitions: the scalers return arrays, so the column names are re-attached.
for values, names, csv_path in (
    (X_train_scaled, X_train.columns, '/mh/X_train_scaled.csv'),
    (X_test_scaled, X_train.columns, '/mh/X_test_scaled.csv'),
    (y_train_scaled, y_train.columns, '/mh/y_train_scaled.csv'),
    (y_test_scaled, y_train.columns, '/mh/y_test_scaled.csv'),
):
    pd.DataFrame(values, columns=names).to_csv(csv_path, index=False)

Keep scores, but remove questionnaire items

In [None]:
# Same 80:20 split again, this time keeping only the listed scores as features
# (all other columns of the mental health table are left out).
targets = pd.read_csv('/Cog-Ment/R/g_factor_5_folds/target.csv')
ds = pd.merge(mh, targets, on='eid')

print('Mental health shape before removing questionnaire items:', ds.shape)

scores_to_keep = [
    'NS-12',
    'RDS-4',
    'Wellbeing',
    'GAD-7',
    'PHQ-9',
    'PCL-6',
    'PDS',
    'Ever self-harmed (non-suicidal)',
    'Ever self-harmed',
    'Ever attempted suicide',
    'Substance addiction',
    'Current addiction',
    'Alcohol dependence (AUDIT≥15)',
    'GAD ever',
    'Current GAD',
    'Current GAD mild',
    'Current GAD moderate',
    'PTSD',
    'Current GAD severe',
    'Hazardous alcohol use (AUDIT≥8)',
    'Current severe depression',
    'Current depression',
    'Depression triggered by loss',
    'Recurrent depression',
    'Depression single episode',
    'Bipolar II',
    'Bipolar I',
    'Subthreshold depression',
    'Depression ever',
    'Lifetime frequency of contemplating self-harm',
    "Frequency of 'life not worth living' thoughts",
    '(log)AUDIT-C',
    'Ever depressed for a whole week',
    '(log)AUDIT-P',
    '(log)AUDIT',
    'Childhood adverse events',
    'Adult adverse events',
    'Catastrophic trauma',
    'Any distress',
    'Unusual experience',
]

target_cols = [
    'log_RT',
    'FIS',
    'Numeric memory:Max digits remembered correctly',
    'log_TMT:Duration to complete numeric path',
    'log_TMT:Duration to complete alphabetic path',
    'SDS:Numb of symbol digit matches made correctly',
    'PAL:Numb of word pairs correctly associated',
    'Tower rearranging:Numb of puzzles correct',
    'Matrix PC: Numb of puzzles correctly solved',
    'log1p_Pairs match:Incorrect matches (round 2)',
    'Picture vocab:Specific cognitive ability',
    'Prosp memory:Initial answer',
]

# 'eid' travels with both halves until the partition ids have been written.
X = ds[['eid', *scores_to_keep]]
y = ds[['eid', *target_cols]]

print('Mental health shape after removing questionnaire items and target:', X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

for id_source, id_path in (
    (X_train, '/mh/scores_only/train_id.csv'),
    (X_test, '/mh/scores_only/test_id.csv'),
):
    id_source[['eid']].to_csv(id_path, index=False)

X_train, X_test, y_train, y_test = (
    part.drop(columns='eid').reset_index(drop=True)
    for part in (X_train, X_test, y_train, y_test)
)

# Unscaled partitions.
for part, csv_path in (
    (X_train, '/mh/scores_only/X_train_original.csv'),
    (X_test, '/mh/scores_only/X_test_original.csv'),
    (y_train, '/mh/scores_only/y_train_original.csv'),
    (y_test, '/mh/scores_only/y_test_original.csv'),
):
    part.to_csv(csv_path, index=False)

# Scalers are fitted on the training part and reused for the test part.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
with open('/mh/scores_only/scaler_X.pkl', "wb") as f:
    pickle.dump(scaler_X, f)

y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)
with open('/mh/scores_only/scaler_y.pkl', "wb") as f:
    pickle.dump(scaler_y, f)

# Scaled partitions, with the column names restored on top of the scaler output.
for values, names, csv_path in (
    (X_train_scaled, X_train.columns, '/mh/scores_only/X_train_scaled.csv'),
    (X_test_scaled, X_train.columns, '/mh/scores_only/X_test_scaled.csv'),
    (y_train_scaled, y_train.columns, '/mh/scores_only/y_train_scaled.csv'),
    (y_test_scaled, y_train.columns, '/mh/scores_only/y_test_scaled.csv'),
):
    pd.DataFrame(values, columns=names).to_csv(csv_path, index=False)

# Correlations between the target and mental health features: whole set

Correlations between mental health features and a single, arbitrarily chosen cognitive score (FIS) for the whole set

In [None]:
# Pearson correlation of every (unscaled) mental health feature in the training split
# with the scaled FIS score of the same split.
X_train_original = pd.read_csv('/mental_health/single_split/X_train_original.csv')
y_train = pd.read_csv('/mental_health/single_split/y_train_scaled.csv')
feature_names = X_train_original.columns

fis_train = y_train['FIS']
correlations_train_full, p_values_train_full = {}, {}
for feature in feature_names:
    # pearsonr returns the coefficient and its p-value, in that order.
    correlations_train_full[feature], p_values_train_full[feature] = pearsonr(
        X_train_original[feature], fis_train
    )
result = {'correlations': correlations_train_full, 'p_values': p_values_train_full}

# One row per feature, with the coefficient and the p-value side by side.
corr_p_full = pd.DataFrame(result)
corr_full = corr_p_full[['correlations']].copy()
corr_full.index = feature_names

# Keep features with p < 0.05 and list the strongest positive correlations first.
is_significant = corr_p_full['p_values'] < 0.05
significant_corr = corr_p_full.loc[is_significant].sort_values(by='correlations', ascending=False)
with pd.option_context('display.max_rows', None):
    display(significant_corr)

Correlations between g-factor and mental health features for the whole set: g-factor

In [None]:
# Pearson correlation of every (unscaled) mental health feature in the training split
# with the g-factor of the training split.
X_train_original = pd.read_csv('/mental_health/single_split/X_train_original.csv')
g_train = pd.read_csv('/g_factor/g_train.csv')
feature_names = X_train_original.columns

g_values = g_train['g']
correlations_train_full, p_values_train_full = {}, {}
for feature in feature_names:
    # pearsonr returns the coefficient and its p-value, in that order.
    correlations_train_full[feature], p_values_train_full[feature] = pearsonr(
        X_train_original[feature], g_values
    )
result = {'correlations': correlations_train_full, 'p_values': p_values_train_full}

# One row per feature, with the coefficient and the p-value side by side.
corr_p_full = pd.DataFrame(result)
corr_full = corr_p_full[['correlations']].copy()
corr_full.index = feature_names

# Keep features with p <= 0.05 (threshold inclusive here) and sort by coefficient, largest first.
is_significant = corr_p_full['p_values'] <= 0.05
significant_corr = corr_p_full.loc[is_significant].sort_values(by='correlations', ascending=False)
with pd.option_context('display.max_rows', None):
    display(significant_corr)

# Run PLS on the whole sample to infer overall structure of relationships between mental health and g (20:80 split)

## PLS on single split

In [None]:
# PLS on the whole set: tune the number of components by 10-fold CV on the training split,
# then evaluate the refitted best model on the held-out test split.
g_train = pd.read_csv('/g_factor/g_train.csv')
g_test = pd.read_csv('/g_factor/g_test.csv')
X_train_scaled = pd.read_csv('/mental_health/single_split/X_train_scaled.csv')
X_test_scaled = pd.read_csv('/mental_health/single_split/X_test_scaled.csv')
pls_result = {}
warnings.simplefilter(action='ignore', category=FutureWarning)

# Candidate numbers of components: from 1 up to the number of features.
parameters = {'n_components': range(1, X_train_scaled.shape[1] + 1)}
pls = PLSRegression()
# A fixed random_state makes the shuffled folds, and therefore the selected number of
# components, reproducible across runs.
cv = KFold(10, shuffle=True, random_state=42)
model = GridSearchCV(pls, parameters, scoring='neg_mean_absolute_error', cv=cv, verbose=4, n_jobs=17)

print("Fitting PLS")
model.fit(X_train_scaled, g_train.values)

print('Model parameters:', model.cv_results_['params'])
print('Mean test score:', model.cv_results_['mean_test_score'])
print('Rank test score:', model.cv_results_['rank_test_score'])
print(model)

print('Saving PLS model')
with open('/mental_health/single_split/mh_pls_model.pkl', "wb") as f:
    pickle.dump(model, f)

print('Best params = ', model.best_params_)
print('Best score (neg_mean_absolute_error) = ', model.best_score_)

# Predict the values
print('Predicting g_test')
g_pred_test = model.predict(X_test_scaled)
print('Saving g pred')
pd.DataFrame(g_pred_test, columns=['g_pred_mh']).to_csv('/mental_health/single_split/g_pred_mh.csv')

# Every metric is computed once and reused for both the printout and the saved row.
g_true = np.array(g_test)[:, 0]
g_pred = g_pred_test[:, 0]
pls_result['n_components'] = model.best_params_
pls_result['MSE'] = mean_squared_error(g_true, g_pred)
pls_result['MAE'] = mean_absolute_error(g_true, g_pred)
pls_result['R2'] = r2_score(g_true, g_pred)
pls_result['r'] = pearsonr(g_true, g_pred)

print("----------")
print("MSE = ", pls_result['MSE'])
print("MAE = ", pls_result['MAE'])
print("R2 = ", pls_result['R2'])
print("Pearson's r = ", pls_result['r'])
print("----------")

# Append mode without a header row: every run of this cell adds one more line to the file.
with open('/mental_health/single_split/MH_Result_PLS.csv', 'a', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=pls_result.keys())
    writer.writerow(pls_result)


In [None]:
# PLS on the whole set without composite scores: tune the number of components by 10-fold CV
# on the training split, then evaluate the refitted best model on the held-out test split.
g_train = pd.read_csv('/g_factor/g_train.csv')
g_test = pd.read_csv('/g_factor/g_test.csv')
X_train_scaled = pd.read_csv('/mh/X_train_scaled.csv')
X_test_scaled = pd.read_csv('/mh/X_test_scaled.csv')
pls_result = {}
warnings.simplefilter(action='ignore', category=FutureWarning)

# Candidate numbers of components: from 1 up to the number of features.
parameters = {'n_components': range(1, X_train_scaled.shape[1] + 1)}
pls = PLSRegression()
# A fixed random_state makes the shuffled folds, and therefore the selected number of
# components, reproducible across runs.
cv = KFold(10, shuffle=True, random_state=42)
model = GridSearchCV(pls, parameters, scoring='neg_mean_absolute_error', cv=cv, verbose=4, n_jobs=17)

print("Fitting PLS")
model.fit(X_train_scaled, g_train.values)

print('Model parameters:', model.cv_results_['params'])
print('Mean test score:', model.cv_results_['mean_test_score'])
print('Rank test score:', model.cv_results_['rank_test_score'])
print(model)

print('Saving PLS model')
with open('/mh/mh_pls_model.pkl', "wb") as f:
    pickle.dump(model, f)

print('Best params = ', model.best_params_)
print('Best score (neg_mean_absolute_error) = ', model.best_score_)

# Predict the values
print('Predicting g_test')
g_pred_test = model.predict(X_test_scaled)
print('Saving g pred')
pd.DataFrame(g_pred_test, columns=['g_pred_mh']).to_csv('/mh/g_pred_mh.csv')

# Every metric is computed once and reused for both the printout and the saved row.
g_true = np.array(g_test)[:, 0]
g_pred = g_pred_test[:, 0]
pls_result['n_components'] = model.best_params_
pls_result['MSE'] = mean_squared_error(g_true, g_pred)
pls_result['MAE'] = mean_absolute_error(g_true, g_pred)
pls_result['R2'] = r2_score(g_true, g_pred)
pls_result['r'] = pearsonr(g_true, g_pred)

print("----------")
print("MSE = ", pls_result['MSE'])
print("MAE = ", pls_result['MAE'])
print("R2 = ", pls_result['R2'])
print("Pearson's r = ", pls_result['r'])
print("----------")

# Append mode without a header row: every run of this cell adds one more line to the file.
with open('/mh/MH_Result_PLS.csv', 'a', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=pls_result.keys())
    writer.writerow(pls_result)


In [None]:
# PLS on the whole set without questionnaire items: tune the number of components by 10-fold
# CV on the training split, then evaluate the refitted best model on the held-out test split.
g_train = pd.read_csv('/g_factor/g_train.csv')
g_test = pd.read_csv('/g_factor/g_test.csv')
X_train_scaled = pd.read_csv('/mh/scores_only/X_train_scaled.csv')
X_test_scaled = pd.read_csv('/mh/scores_only/X_test_scaled.csv')
pls_result = {}
warnings.simplefilter(action='ignore', category=FutureWarning)

# Candidate numbers of components: from 1 up to the number of features.
parameters = {'n_components': range(1, X_train_scaled.shape[1] + 1)}
pls = PLSRegression()
# A fixed random_state makes the shuffled folds, and therefore the selected number of
# components, reproducible across runs.
cv = KFold(10, shuffle=True, random_state=42)
model = GridSearchCV(pls, parameters, scoring='neg_mean_absolute_error', cv=cv, verbose=4, n_jobs=17)

print("Fitting PLS")
model.fit(X_train_scaled, g_train.values)

print('Model parameters:', model.cv_results_['params'])
print('Mean test score:', model.cv_results_['mean_test_score'])
print('Rank test score:', model.cv_results_['rank_test_score'])
print(model)

print('Saving PLS model')
with open('/mh/scores_only/mh_pls_model.pkl', "wb") as f:
    pickle.dump(model, f)

print('Best params = ', model.best_params_)
print('Best score (neg_mean_absolute_error) = ', model.best_score_)

# Predict the values
print('Predicting g_test')
g_pred_test = model.predict(X_test_scaled)
print('Saving g pred')
pd.DataFrame(g_pred_test, columns=['g_pred_mh']).to_csv('/mh/scores_only/g_pred_mh.csv')

# Every metric is computed once and reused for both the printout and the saved row.
g_true = np.array(g_test)[:, 0]
g_pred = g_pred_test[:, 0]
pls_result['n_components'] = model.best_params_
pls_result['MSE'] = mean_squared_error(g_true, g_pred)
pls_result['MAE'] = mean_absolute_error(g_true, g_pred)
pls_result['R2'] = r2_score(g_true, g_pred)
pls_result['r'] = pearsonr(g_true, g_pred)

print("----------")
print("MSE = ", pls_result['MSE'])
print("MAE = ", pls_result['MAE'])
print("R2 = ", pls_result['R2'])
print("Pearson's r = ", pls_result['r'])
print("----------")

# Append mode without a header row: every run of this cell adds one more line to the file.
with open('/mh/scores_only/MH_Result_PLS.csv', 'a', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=pls_result.keys())
    writer.writerow(pls_result)


In [None]:
# Without composite scores
# NOTE(review): `pls_result` is re-created by every PLS cell, so this displays the result of
# whichever PLS cell ran last - confirm that it is the run this label refers to.
pls_result

In [None]:
# Without questionnaire items
# NOTE(review): `pls_result` is re-created by every PLS cell, so this displays the result of
# whichever PLS cell ran last - confirm that it is the run this label refers to.
pls_result

In [None]:
# New
# NOTE(review): `pls_result` is re-created by every PLS cell, so this displays the result of
# whichever PLS cell ran last - confirm which run "New" refers to.
pls_result

## Calculate R2 for each component

In [None]:
# R2 of each PLS component taken on its own, and x-loadings weighted by those R2 values.
# Relies on `g_train` as loaded by the PLS cell.
with open('/mental_health/single_split/mh_pls_model.pkl', "rb") as f:
    model = pickle.load(f)

best_pls = model.best_estimator_
n_comp = best_pls.n_components
# Used to bring the single-component reconstruction back to the scale of g.
g_sd = g_train.values.std(axis=0, ddof=1)
g_mean = g_train['g'].mean(axis=0)

r2_sum = 0
r2_vector = np.empty(n_comp)
for i in range(n_comp):
    # Prediction from component i alone: its x-scores times its y-loadings.
    scores_i = best_pls.x_scores_[:, i].reshape(-1, 1)
    y_loading_i = best_pls.y_loadings_[:, i].reshape(-1, 1)
    Y_pred = np.dot(scores_i, y_loading_i.T) * g_sd + g_mean
    # Computed once per component (it used to be evaluated three times).
    component_r2 = r2_score(g_train.values, Y_pred)
    r2_sum += component_r2
    print('R2 for %d component: %g' % (i + 1, component_r2))
    r2_vector[i] = component_r2
# Built-in round() works whether r2_score returns a NumPy scalar or a plain float;
# the former `r2_sum.round(2)` fails for a plain float.
print('R2 for all components: %g' % round(r2_sum, 2))
#print('R2 for all components: %g' %(r2_score(g_train.values, model.best_estimator_.predict(X_train_scaled))))

# Weight each component's loadings (one column per component) by that component's R2 and
# sum across components: one value per feature. The "scaled" variant z-scores the loadings
# within each component first (stats.zscore works along axis 0 by default).
x_loading_by_r2 = best_pls.x_loadings_ * r2_vector
x_loading_by_r2_scaled = stats.zscore(best_pls.x_loadings_) * r2_vector
weighted_x_loading = np.sum(x_loading_by_r2, axis=1)
weighted_x_loading_scaled = np.sum(x_loading_by_r2_scaled, axis=1)

## Build MSE plot

In [None]:
# Mean cross-validated score for every candidate number of components.
# NOTE: despite the "MSE" naming, the grid search was scored with 'neg_mean_absolute_error',
# so the curve shows negative MAE (higher is better).
mse = model.cv_results_['mean_test_score']
# Take the x values from the search grid itself instead of a hard-coded range(1, 134),
# so the plot stays valid when the number of features changes.
n_components_grid = [params['n_components'] for params in model.cv_results_['params']]
plt.figure(figsize=(20, 5))
plt.plot(n_components_grid, mse) #, "-o")
plt.xticks(n_components_grid, fontsize = 5)
optimal_n_components = model.best_params_['n_components']
plt.xlabel(f"Number of PLS components, optimal: {optimal_n_components}")
plt.ylabel("Negative MAE (mean CV score)")
plt.ylim(-0.530, -0.5275)
plt.title("PLSRegression MSE plot")
# Mark the number of components selected by the grid search.
plt.axvline(x=optimal_n_components, color='red', linestyle='--')
plt.show()

Although GridSearchCV suggests 117 components, they seem to be mostly noise. This may happen due to the large proportion of 0/1 features. According to the MSE plot, 16 components may be enough to capture the maximum variance.

## Rerun PLS on 16 components

In [None]:
# PLS on the whole set: select the number of components suggested by MSE plot
# (the grid is limited to 1-16 components).

# Load the single-split data explicitly. Without this the cell silently fits on whatever
# `X_train_scaled` the previously executed PLS cell left behind.
g_train = pd.read_csv('/g_factor/g_train.csv')
g_test = pd.read_csv('/g_factor/g_test.csv')
X_train_scaled = pd.read_csv('/mental_health/single_split/X_train_scaled.csv')
X_test_scaled = pd.read_csv('/mental_health/single_split/X_test_scaled.csv')
pls_result_16 = {}
warnings.simplefilter(action='ignore', category=FutureWarning)

# Initiate and run PLS
parameters = {'n_components': range(1, 17)}
pls = PLSRegression()
# A fixed random_state makes the shuffled folds, and therefore the selected number of
# components, reproducible across runs.
cv = KFold(10, shuffle=True, random_state=42)
model_16 = GridSearchCV(pls, parameters, scoring='neg_mean_absolute_error', cv=cv, verbose=4, n_jobs=17)

print("Fitting PLS")
model_16.fit(X_train_scaled, g_train.values)

print('Model parameters:', model_16.cv_results_['params'])
print('Mean test score:', model_16.cv_results_['mean_test_score'])
print('Rank test score:', model_16.cv_results_['rank_test_score'])
print(model_16)

print('Saving PLS model')
with open('/mental_health/single_split/mh_pls_model_16comp.pkl', "wb") as f:
    pickle.dump(model_16, f)

print('Best params = ', model_16.best_params_)
print('Best score (neg_mean_absolute_error) = ', model_16.best_score_)

# Predict the values
print('Predicting g_test')
g_pred_test_16 = model_16.predict(X_test_scaled)
print('Saving g pred')
pd.DataFrame(g_pred_test_16, columns=['g_pred_mh']).to_csv('/mental_health/single_split/g_pred_mh_16comp.csv')

# Every metric is computed once and reused for both the printout and the saved row.
g_true = np.array(g_test)[:, 0]
g_pred_16 = g_pred_test_16[:, 0]
pls_result_16['n_components'] = model_16.best_params_
pls_result_16['MSE'] = mean_squared_error(g_true, g_pred_16)
pls_result_16['MAE'] = mean_absolute_error(g_true, g_pred_16)
pls_result_16['R2'] = r2_score(g_true, g_pred_16)
pls_result_16['r'] = pearsonr(g_true, g_pred_16)

print("----------")
print("MSE = ", pls_result_16['MSE'])
print("MAE = ", pls_result_16['MAE'])
print("R2 = ", pls_result_16['R2'])
print("Pearson's r = ", pls_result_16['r'])
print("----------")

# Field names come from this cell's own result dict (previously taken from `pls_result`,
# a leftover of another cell). Append mode without a header: each run adds one more line.
with open('/mental_health/single_split/MH_Result_PLS_16Comp.csv', 'a', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=pls_result_16.keys())
    writer.writerow(pls_result_16)


In [None]:
# Reload the saved predictions and metrics of the 16-component model from disk.
g_pred_test_16 = pd.read_csv('/mental_health/single_split/g_pred_mh_16comp.csv')

# The metrics file has no header row, so the column names are attached after reading.
result_columns = ['n_components', 'MSE', 'MAE', 'R2', 'r']
result_16 = pd.read_csv(
    '/mental_health/single_split/MH_Result_PLS_16Comp.csv',
    header=None,
)
result_16.columns = result_columns
result_16

## Calculate R2 for each component and sum up all components into one: 16 components

In [None]:
# R2 of each PLS component of the 16-component search taken on its own, and x-loadings
# weighted by those R2 values. Relies on `g_train` as loaded by the PLS cell.
with open('/mental_health/single_split/mh_pls_model_16comp.pkl', "rb") as f:
    model_16 = pickle.load(f)

best_pls_16 = model_16.best_estimator_
n_comp = best_pls_16.n_components
# Used to bring the single-component reconstruction back to the scale of g.
g_sd = g_train.values.std(axis=0, ddof=1)
g_mean = g_train['g'].mean(axis=0)

r2_sum = 0
r2_vector = np.empty(n_comp)
for i in range(n_comp):
    # Prediction from component i alone: its x-scores times its y-loadings.
    scores_i = best_pls_16.x_scores_[:, i].reshape(-1, 1)
    y_loading_i = best_pls_16.y_loadings_[:, i].reshape(-1, 1)
    Y_pred = np.dot(scores_i, y_loading_i.T) * g_sd + g_mean
    # Computed once per component (it used to be evaluated three times).
    component_r2 = r2_score(g_train.values, Y_pred)
    r2_sum += component_r2
    print('R2 for %d component: %g' % (i + 1, component_r2))
    r2_vector[i] = component_r2
# Built-in round() works whether r2_score returns a NumPy scalar or a plain float;
# the former `r2_sum.round(2)` fails for a plain float.
print('R2 for all components: %g' % round(r2_sum, 2))
#print('R2 for all components: %g' %(r2_score(g_train.values, model_16.best_estimator_.predict(X_train_scaled))))

# Weight each component's loadings (one column per component) by that component's R2 and
# sum across components: one value per feature. The "scaled" variant z-scores the loadings
# within each component first (stats.zscore works along axis 0 by default).
x_loading_by_r2 = best_pls_16.x_loadings_ * r2_vector
x_loading_by_r2_scaled = stats.zscore(best_pls_16.x_loadings_) * r2_vector
weighted_x_loading = np.sum(x_loading_by_r2, axis=1)
weighted_x_loading_scaled = np.sum(x_loading_by_r2_scaled, axis=1)
#pd.DataFrame(weighted_x_loading_scaled, columns = ['Loadings weighted']).to_csv('/mental_health/single_split/mh_pls_model_16comp_weighted_x_loading_scaled.csv')

In [None]:
# MSE plot: mean cross-validated score for every candidate number of PLS components.
# NOTE(review): the y-limits below are negative, so 'mean_test_score' is presumably a negated
# MSE (e.g. scoring='neg_mean_squared_error') — confirm against the GridSearchCV definition.
mse = model_16.cv_results_['mean_test_score']
optimal_n_components = model_16.best_params_['n_components']

# The x-axis follows the number of evaluated candidates instead of a hard-coded 16.
# Assumes the searched grid is 1, 2, ..., len(mse) — TODO confirm against the parameter grid.
component_counts = range(1, len(mse) + 1)

plt.figure(figsize=(20, 5))
plt.plot(component_counts, mse)
plt.xticks(component_counts, fontsize=10)
plt.xlabel(f"Number of PLS components, optimal: {optimal_n_components}")
plt.ylabel("MSE")
plt.ylim(-0.530, -0.528)
plt.title("PLSRegression MSE plot")
# Mark the number of components selected by the grid search.
plt.axvline(x=optimal_n_components, color='red', linestyle='--')
plt.show()

# Get factor loadings

In [None]:
# Restore the pickled grid search and read the scaled training feature table,
# whose column names label the rows of the loading table.
with open('/mental_health/single_split/mh_pls_model_16comp.pkl', "rb") as f:
    model = pickle.load(f)
X_train_scaled = pd.read_csv('/mental_health/single_split/X_train_scaled.csv')

# One column per PLS component ("Component 1", "Component 2", ...).
loading = pd.DataFrame(
    model.best_estimator_.x_loadings_,
    columns=[
        f'Component {number}'
        for number in range(1, model.best_estimator_.x_loadings_.shape[1] + 1)
    ],
)
loading.index = X_train_scaled.columns
loading

## Sort correlations in descending order and match to PLS loadings

In [None]:
# Largest correlation first.
corr_full_sorted = corr_full.sort_values(by='correlations', ascending=False)

# Two-column table: feature name next to its R2-weighted, z-scored loading.
weighted_x_loading_scaled_df = pd.concat(
    [
        pd.DataFrame(feature_names, columns=['Features']),
        pd.DataFrame(weighted_x_loading_scaled, columns=['Loadings']),
    ],
    axis=1,
)

# Put the loadings in the same feature order as the sorted correlations.
weighted_loadings_full_matched = (
    weighted_x_loading_scaled_df
    .set_index('Features')
    .loc[corr_full_sorted.index]
)

# Save
for _frame, _csv_path in (
    (corr_full_sorted, '/mental_health/single_split/mh_pls_model_16comp_corr_full_sorted.csv'),
    (weighted_x_loading_scaled_df, '/mental_health/single_split/mh_pls_model_16comp_weighted_x_loading_scaled_df.csv'),
    (weighted_loadings_full_matched, '/mental_health/single_split/mh_pls_model_16comp_weighted_loadings_full_matched.csv'),
):
    _frame.to_csv(_csv_path)

In [None]:
# Read the sorted correlations and the matched loadings back from disk,
# using the first CSV column (the feature names) as the index of each table.
corr_full_sorted, weighted_loadings_full_matched = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '/mental_health/single_split/mh_pls_model_16comp_corr_full_sorted.csv',
        '/mental_health/single_split/mh_pls_model_16comp_weighted_loadings_full_matched.csv',
    )
)

In [None]:
# Give selected features (index labels) clearer names; the same mapping is applied to
# the correlation table and to the loading table so that their labels stay aligned.
_clearer_feature_labels = {
    "Diagnoses 'G'": "Diseases of the nervous system",
    "Diagnoses: Neurological problem, NS injury, epilepsy": "Neurological problems, nervous system injury, epilepsy",
    "Diagnoses 'F'": "Mental and behavioural disorders",
    "Diagnoses: Stress, insomnia, migraine, nervous/mental problems": "Stress, insomnia, migraine, nervous/mental problems",
}
corr_full_sorted_renamed = corr_full_sorted.rename(index=_clearer_feature_labels)
weighted_loadings_full_matched_renamed = weighted_loadings_full_matched.rename(index=_clearer_feature_labels)

In [None]:
# Summary statistics of the feature correlations, rounded to two decimals.
_correlations = corr_full_sorted_renamed['correlations']
for _label, _statistic in (('Mean', 'mean'), ('SD', 'std'), ('MIN', 'min'), ('MAX', 'max')):
    print(_label, getattr(_correlations, _statistic)().round(2))

In [None]:
# Join the loadings onto the correlations by feature name (index); the left join keeps
# the row order of the sorted correlations, so nothing is reindexed.
loadings_match_to_corr = corr_full_sorted.merge(
    weighted_loadings_full_matched,
    how='left',
    left_index=True,
    right_index=True,
)

# Split by the sign of the correlation. The "*_load" frames use the same split on purpose,
# so that every feature's loading lands in the same group as its correlation.
_has_negative_corr = loadings_match_to_corr['correlations'] < 0
_has_non_negative_corr = loadings_match_to_corr['correlations'] >= 0

negative_corr = loadings_match_to_corr[_has_negative_corr]
positive_corr = loadings_match_to_corr[_has_non_negative_corr]
negative_load = loadings_match_to_corr[_has_negative_corr]
positive_load = loadings_match_to_corr[_has_non_negative_corr]

## Select only significant p-values and match to loadings

In [None]:
# Label the correlation/p-value table with the feature names.
corr_p_full.index = feature_names

# Keep the features with p <= 0.05, ordered from the largest to the smallest correlation.
_is_significant = corr_p_full['p_values'] <= 0.05
significant_corr = corr_p_full.loc[_is_significant].sort_values(by='correlations', ascending=False)
significant_corr_sorted = significant_corr.sort_values(by='correlations', ascending=False)

# Pick the loadings of exactly those features, in the same order.
weighted_loadings_full_matched_significant = (
    weighted_x_loading_scaled_df
    .set_index('Features')
    .loc[significant_corr_sorted.index]
)
weighted_loadings_full_matched_significant

### Loading + corr plot sorted larger to smaller, mirror

In [None]:
# Full loading plot: two mirrored horizontal bar panels. The left panel shows the features
# with a negative correlation, the right panel those with a correlation >= 0. In each panel
# the filled bars are the correlations and the outlined bars are the weighted loadings.
fig, ax = plt.subplots(1, 2, figsize=(60, 220), sharey='col')


def get_edge_color(value):
    """Return the edge colour of a loading bar.

    Both signs currently map to 'black'; the sign test is kept so that separate colours
    for negative and positive loadings can be restored in a single place.
    """
    return 'black' if value < 0 else 'black'  # e.g. 'blue' if value < 0 else 'green'


# `negative_edge_colors` is not used by the plot below (the sorted variant is); it is still
# computed in case later cells rely on it.
negative_edge_colors = negative_load['Loadings'].apply(get_edge_color).tolist()
positive_edge_colors = positive_load['Loadings'].apply(get_edge_color).tolist()

# Order the negative panel from the most negative correlation upwards; together with the
# inverted y-axis below this puts the strongest negative correlation at the top.
negative_corr_sorted = negative_corr.sort_values(by='correlations', ascending=True)
negative_load_sorted = negative_load.reindex(negative_corr_sorted.index)
negative_edge_colors_sorted = negative_load_sorted['Loadings'].apply(get_edge_color).tolist()

# Left panel: correlations (filled) and loadings (outline only).
# The bars carry labels so that the legend calls further down have entries to show;
# without labels matplotlib draws no legend and warns that no labelled artists exist.
ax[0].barh(negative_corr_sorted.index,
           negative_corr_sorted['correlations'],
           color='#79AF9799', alpha=0.4, height=0.9, label="Pearson's $r$")
ax[0].barh(negative_load_sorted.index,
           negative_load_sorted['Loadings'],
           color='none', edgecolor=negative_edge_colors_sorted, linewidth=2, height=0.9,
           label='Loadings')

# Right panel: correlations (filled) and loadings (outline only).
ax[1].barh(positive_corr.index,
           positive_corr['correlations'],
           color='#79AF9799', alpha=0.4, height=0.9, label="Pearson's $r$")
ax[1].barh(positive_load.index,
           positive_load['Loadings'],
           color='none', edgecolor=positive_edge_colors, linewidth=2, height=0.9,
           label='Loadings')

# Tick styling; the right panel prints its feature names on the right-hand side.
ax[0].tick_params(axis='y', labelsize=90)
ax[0].tick_params(axis='x', labelsize=80, rotation=50)
ax[0].invert_yaxis()

ax[1].tick_params(axis='y', labelsize=90, labelright=True, labelleft=False)
ax[1].tick_params(axis='x', labelsize=80, rotation=50)
ax[1].invert_yaxis()

# Remove the y-axis tick marks of the right panel.
ax[1].yaxis.set_ticks_position('none')

# Hide the frame around the bars: top, bottom and right on both panels, plus the left spine.
for axs in ax:
    for side in ["top", "bottom", "right", "left"]:
        axs.spines[side].set_visible(False)

# Set a common x-axis label for both correlations and loadings
fig.text(0.55, 0.08, "Pearson's $r$ and Loadings", ha='center', fontsize=120)

# x-limits are derived from the most extreme correlation/loading on either side.
min_negative = min(negative_corr['correlations'].min(), negative_load['Loadings'].min())
max_positive = max(positive_corr['correlations'].max(), positive_load['Loadings'].max())
ax[0].set_xlim(min_negative * 1.5, max_positive * 0.2)
ax[1].set_xlim(min_negative * 1.5, max_positive * 1.1)

# Format the x-axis labels to two decimal places (one formatter instance per axis).
for axs in ax:
    axs.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))

# Add legends to both subplots; the font size matches the x tick labels so the legend
# stays readable on this very large figure.
ax[0].legend(loc='lower left', fontsize=80)
ax[1].legend(loc='lower right', fontsize=80)

# Zero reference line in each panel.
for axs in ax:
    axs.axvline(x=0, color='black', linewidth=1)

# Set x-tick intervals
for axs in ax:
    axs.xaxis.set_major_locator(MultipleLocator(0.05))

# Adjust the spacing between the subplots
fig.subplots_adjust(wspace=0.1)

plt.show()

In [None]:
# Rename features for clarity.
# The mappings are defined once and reused, instead of being pasted into every rename call.

# Labels applied to the sign-split correlation/loading tables.
split_feature_labels = {
    "Diagnoses 'G'": "Diseases of the nervous system",
    "Diagnoses: Neurological problem, NS injury, epilepsy": "Neurological problems, nervous system injury, epilepsy",
    "Diagnoses 'F'": "Mental and behavioural disorders",
    "Diagnoses: Stress, insomnia, migraine, nervous/mental problems": "Stress, insomnia, migraine, nervous/mental problems",
    'Diagnoses: Depression': 'Depression',
}

# Labels applied to the full and the significant tables: the entries above plus four more.
# NOTE(review): the sign-split tables do not receive these four extra renames — confirm
# whether that difference is intended.
full_feature_labels = {
    **split_feature_labels,
    'Ever had period extreme irritability': 'Ever had period of extreme irritability',
    'Frequency of "life not worth living" thoughts': "Frequency of 'life not worth living' thoughts",
    'Diagnoses: Anxiety/panic attacks': 'Anxiety/panic attacks',
    'NS-12': 'N-12',
}

negative_corr_renamed = negative_corr.rename(index=split_feature_labels)
negative_load_renamed = negative_load.rename(index=split_feature_labels)
positive_corr_renamed = positive_corr.rename(index=split_feature_labels)
positive_load_renamed = positive_load.rename(index=split_feature_labels)

corr_full_sorted_renamed = corr_full_sorted.rename(index=full_feature_labels)
weighted_loadings_full_matched_renamed = weighted_loadings_full_matched.rename(index=full_feature_labels)
# This table used to be assigned twice with identical code; once is enough.
significant_corr_renamed = significant_corr.rename(index=full_feature_labels)

# Move the feature names from the index into a regular column. An unnamed index comes out
# of reset_index() as 'index' and is then called 'features'; a named index keeps its name.
corr_full_sorted_renamed_df = pd.DataFrame(corr_full_sorted_renamed).reset_index().rename(columns={'index': 'features'})
loadings_renamed_df = weighted_loadings_full_matched_renamed.reset_index().rename(columns={'index': 'features'})
significant_corr_renamed_df = significant_corr_renamed.reset_index().rename(columns={'index': 'features'})

# Save
corr_full_sorted_renamed_df.to_csv('/mental_health/single_split/corr_full_sorted_renamed_df.csv', index=False)
significant_corr_renamed_df.to_csv('/mental_health/single_split/significant_corr_renamed_df.csv', index=False)
loadings_renamed_df.to_csv('/mental_health/single_split/loadings_renamed_df.csv', index=False)