In [None]:
import csv
import os
import random
import pickle
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
import skbold
import textwrap
from skbold.preproc import ConfoundRegressor
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load data

In [None]:
# Read the confound table and the DTI table from disk (in that order),
# then show the confound column names as the cell output.
dti_confounds, dti = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/ML_DATASETS/paper/brain/DTI/dti_confounds_final_set.csv',
        '/BRAIN/CSV_brain/dti_names_nona_names.csv',
    )
)
dti_confounds.columns

In [None]:
# Display the (rows, columns) dimensions of the confound table
dti_confounds.shape

# Define ConfoundRegressor

In [None]:
# Define ConfoundRegressor: skbold
def confound_regressor_skbold(features_train, features_test, confounds_train, confounds_test):
    """Standardise features and confounds, then remove confound effects with skbold.

    Both scalers are fitted on the training split only and then applied to the
    test split. The skbold ConfoundRegressor is constructed with the stacked
    (train followed by test) scaled confounds and features, fitted on the scaled
    training features and applied to the scaled test features.
    NOTE(review): presumably skbold needs the stacked arrays to look up the
    confound rows belonging to whatever is passed to fit/transform - confirm
    against the skbold documentation.

    Parameters
    ----------
    features_train, features_test : pandas.DataFrame
        Feature tables for the two splits (their ``.columns`` are read).
    confounds_train, confounds_test : pandas.DataFrame
        Confound tables for the two splits (their ``.columns`` are read).

    Returns
    -------
    tuple
        ``(train_corrected, test_corrected, train_scaled, test_scaled,
        scaler_features)`` where ``scaler_features`` is the fitted feature scaler.
    """

    def _standardise(train_df, test_df):
        # Fit on train only, apply the same transform to test
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(train_df)
        test_scaled = scaler.transform(test_df)
        return train_scaled, test_scaled, scaler

    def _stack(train_values, train_columns, test_values, test_columns):
        # Train rows first, test rows second, returned as a plain array
        stacked = pd.concat(
            [pd.DataFrame(train_values, columns=train_columns),
             pd.DataFrame(test_values, columns=test_columns)],
            axis=0,
        )
        return np.array(stacked)

    features_train_scaled, features_test_scaled, scaler_features = _standardise(features_train, features_test)
    confounds_train_scaled, confounds_test_scaled, _ = _standardise(confounds_train, confounds_test)

    features_full_scaled_np = _stack(features_train_scaled, features_train.columns,
                                     features_test_scaled, features_test.columns)
    confounds_full_scaled_np = _stack(confounds_train_scaled, confounds_train.columns,
                                      confounds_test_scaled, confounds_test.columns)

    # The regressor is built on the FULL set, fitted on train, applied to test
    cfr = ConfoundRegressor(confound=confounds_full_scaled_np, X=features_full_scaled_np)
    features_train_corrected = cfr.fit_transform(features_train_scaled)
    features_test_corrected = cfr.transform(features_test_scaled)

    return (
        features_train_corrected,
        features_test_corrected,
        features_train_scaled,
        features_test_scaled,
        scaler_features,
    )

In [None]:
# Define ConfoundRegressor: Linear Model
def confound_regressor(features_train, features_test, confounds_train, confounds_test):
    """Standardise features and confounds, then residualise features on confounds.

    A single multi-output linear regression (confounds -> features) is fitted on
    the training split; its predictions are subtracted from the scaled features
    of both splits. Scalers are fitted on the training split only.

    Returns
    -------
    tuple
        ``(train_residuals, test_residuals, train_scaled, test_scaled,
        scaler_features)`` where ``scaler_features`` is the fitted feature scaler.
    """
    from sklearn.linear_model import LinearRegression

    # Scale features (train and test sets)
    scaler_features = StandardScaler()
    scaled_train = scaler_features.fit_transform(features_train)
    scaled_test = scaler_features.transform(features_test)

    # Scale confounds (train and test sets)
    scaler_confounds = StandardScaler()
    conf_train = scaler_confounds.fit_transform(confounds_train)
    conf_test = scaler_confounds.transform(confounds_test)

    # Learn the confound -> feature mapping from the training split only
    confound_model = LinearRegression()
    confound_model.fit(conf_train, scaled_train)

    residual_train = scaled_train - confound_model.predict(conf_train)
    residual_test = scaled_test - confound_model.predict(conf_test)

    return residual_train, residual_test, scaled_train, scaled_test, scaler_features

### Extract modalities from DTI

**FA**
- TBSS

fa_tbss_train

fa_tbss_test

- Probabilistic tractography

fa_prob_train

fa_prob_test

**MD**
- TBSS

md_tbss_train

md_tbss_test

- Probabilistic tractography

md_prob_train

md_prob_test

**L1**
- TBSS

l1_tbss_train

l1_tbss_test

- Probabilistic tractography

l1_prob_train

l1_prob_test

**L2**
- TBSS

l2_tbss_train

l2_tbss_test

- Probabilistic tractography

l2_prob_train

l2_prob_test

**L3**
- TBSS

l3_tbss_train

l3_tbss_test

- Probabilistic tractography

l3_prob_train

l3_prob_test

**MO**
- TBSS

mo_tbss_train

mo_tbss_test

- Probabilistic tractography

mo_prob_train

mo_prob_test

**OD**
- TBSS

od_tbss_train

od_tbss_test

- Probabilistic tractography

od_prob_train

od_prob_test

**ICVF**
- TBSS

icvf_tbss_train

icvf_tbss_test

- Probabilistic tractography

icvf_prob_train

icvf_prob_test

**ISOVF**
- TBSS

isovf_tbss_train

isovf_tbss_test

- Probabilistic tractography

isovf_prob_train

isovf_prob_test

### Get columns containing individual metrics

In [None]:
# Get columns containing individual metrics
def _metric_columns(column_regex):
    """Return the `dti` columns whose names match `column_regex`, followed by 'eid'."""
    return pd.concat([dti.filter(regex=column_regex), dti['eid']], axis=1)


# For every metric: first the TBSS columns ("Mean <metric> in ..."),
# then the probabilistic tractography columns ("Weighted-mean <metric> in ...").

## FA
fa_tbss = _metric_columns("Mean FA in")
fa_prob = _metric_columns("Weighted-mean FA in")

## MD
md_tbss = _metric_columns("Mean MD in")
md_prob = _metric_columns("Weighted-mean MD in")

## L1
l1_tbss = _metric_columns("Mean L1 in")
l1_prob = _metric_columns("Weighted-mean L1 in")

## L2
l2_tbss = _metric_columns("Mean L2 in")
l2_prob = _metric_columns("Weighted-mean L2 in")

## L3
l3_tbss = _metric_columns("Mean L3 in")
l3_prob = _metric_columns("Weighted-mean L3 in")

## MO
mo_tbss = _metric_columns("Mean MO in")
mo_prob = _metric_columns("Weighted-mean MO in")

## OD
od_tbss = _metric_columns("Mean OD in")
od_prob = _metric_columns("Weighted-mean OD in")

## ICVF
icvf_tbss = _metric_columns("Mean ICVF in")
icvf_prob = _metric_columns("Weighted-mean ICVF in")

## ISOVF
isovf_tbss = _metric_columns("Mean ISOVF in")
isovf_prob = _metric_columns("Weighted-mean ISOVF in")

# PLS

## Get train and test set for g-factor

In [None]:
# Names of the per-modality DataFrames: every diffusion metric paired with
# the two pipelines, TBSS first and probabilistic tractography second.
modalities = [
    f"{metric}_{pipeline}"
    for metric in ("fa", "md", "l1", "l2", "l3", "mo", "od", "icvf", "isovf")
    for pipeline in ("tbss", "prob")
]

In [None]:
folds = ["0", "1", "2", "3", "4"]

# Attach participant IDs to the g-factor scores of every fold and save the result.
# The scores and the IDs live in separate files and are paired purely by row position.
for fold in folds:
    train_id = pd.read_csv(f'/Cog-Ment/g_factor_5_folds_python/fold_{fold}/train_id_fold_{fold}.csv')
    test_id = pd.read_csv(f'/Cog-Ment/g_factor_5_folds_python/fold_{fold}/test_id_fold_{fold}.csv')

    g_train_scores = pd.read_csv(f'/Cog-Ment/R/g_factor_5_folds/fold_{fold}/g_train_{fold}.csv')
    g_test_scores = pd.read_csv(f'/Cog-Ment/R/g_factor_5_folds/fold_{fold}/g_test_{fold}.csv')

    # concat(axis=1) pads the shorter table with NaN instead of failing, so a
    # row-count mismatch would silently produce scores without IDs (or vice versa).
    for split, scores, ids in (('train', g_train_scores, train_id), ('test', g_test_scores, test_id)):
        if len(scores) != len(ids):
            raise ValueError(
                f'fold {fold}: {split} g-factor has {len(scores)} rows but the ID file has {len(ids)}'
            )

    # Match g-factor to ID (to_csv returns None, so there is nothing worth binding to a name)
    pd.concat([g_train_scores, train_id.astype(int)], axis=1).to_csv(f'/g_factor/g_train_with_id_fold_{fold}.csv', index=False)
    pd.concat([g_test_scores, test_id.astype(int)], axis=1).to_csv(f'/g_factor/g_test_with_id_fold_{fold}.csv', index=False)

## Run PLS

In [None]:
# Train one PLS model per modality and fold, and save every intermediate table.
#
# For each modality: restrict it to participants that have confounds. For each fold:
# split into train/test by the fold's ID files, remove confound effects, tune
# n_components with a 10-fold grid search, predict g for both splits and write
# predictions, the fitted model and the test-set metrics to disk.
confounds = pd.read_csv('/ML_DATASETS/paper/brain/DTI/dti_confounds_final_set.csv')

warnings.simplefilter(action='ignore', category=FutureWarning)

############## 1
print('Matching mental health data to cognitive data')
seed = 42
folds = ["0", "1", "2", "3", "4"]

for modality in modalities:

    # Each modality name is also the name of the notebook-level DataFrame holding its columns
    modality_data = globals()[modality]

    # Match confounds to MRI
    conf_to_brain_match = pd.merge(confounds, modality_data['eid'], on='eid')
    brain_to_conf_match = pd.merge(conf_to_brain_match['eid'], modality_data, on='eid')

    for fold in folds:
        # Create the output folders up front; to_csv/open do not create missing directories
        for subdir in ('models', 'g_pred'):
            os.makedirs(f'/brain/dti/dti_idp/fold_{fold}/{subdir}', exist_ok=True)

        train_id = pd.read_csv(f'/Cog-Ment/g_factor_5_folds_python/fold_{fold}/train_id_fold_{fold}.csv')
        test_id = pd.read_csv(f'/Cog-Ment/g_factor_5_folds_python/fold_{fold}/test_id_fold_{fold}.csv')

        # Upload g-factor with ID
        g_train_full = pd.read_csv(f'/g_factor/g_train_with_id_fold_{fold}.csv')
        g_test_full = pd.read_csv(f'/g_factor/g_test_with_id_fold_{fold}.csv')

        # Match brain data to cognitive data (each merge is done once and then split
        # into the feature columns and the ID column)
        brain_train_matched = pd.merge(brain_to_conf_match, train_id, on='eid')
        brain_test_matched = pd.merge(brain_to_conf_match, test_id, on='eid')
        brain_train, brain_train_id = brain_train_matched.drop(columns=['eid']), brain_train_matched['eid']
        brain_test, brain_test_id = brain_test_matched.drop(columns=['eid']), brain_test_matched['eid']

        brain_train.to_csv(f'/brain/dti/dti_idp/{modality}_train_fold_{fold}.csv', index=False)
        brain_test.to_csv(f'/brain/dti/dti_idp/{modality}_test_fold_{fold}.csv', index=False)
        brain_train_id.to_csv(f'/brain/dti/dti_idp/{modality}_train_id_fold_{fold}.csv', index=False)
        brain_test_id.to_csv(f'/brain/dti/dti_idp/{modality}_test_id_fold_{fold}.csv', index=False)

        ############## 2
        print(f'Matching confounds to {modality}')

        # Match confounds to MRI
        brain_conf_train = pd.merge(conf_to_brain_match, brain_train_id, on='eid').drop(columns=['eid'])
        brain_conf_test = pd.merge(conf_to_brain_match, brain_test_id, on='eid').drop(columns=['eid'])
        brain_conf_train.to_csv(f'/brain/dti/dti_idp/{modality}_conf_train_fold_{fold}.csv', index=False)
        brain_conf_test.to_csv(f'/brain/dti/dti_idp/{modality}_conf_test_fold_{fold}.csv', index=False)

        ############## 3
        print(f'Matching g-factor to {modality}')

        # Match g-factor back to brain data.
        # The brain IDs go on the LEFT so that the g rows come out in the same order as
        # the feature rows; with the g-factor table on the left the targets would follow
        # the g-factor file's order and could be paired with the wrong participant.
        g_train_matched = pd.merge(brain_train_id, g_train_full, on='eid')
        g_test_matched = pd.merge(brain_test_id, g_test_full, on='eid')
        g_train, g_train_id = g_train_matched.drop(columns=['eid']), g_train_matched['eid']
        g_test, g_test_id = g_test_matched.drop(columns=['eid']), g_test_matched['eid']

        # Fail loudly if features and targets do not describe the same participants row by row
        for split, g_ids, brain_ids in (('train', g_train_id, brain_train_id), ('test', g_test_id, brain_test_id)):
            if not np.array_equal(g_ids.to_numpy(), brain_ids.to_numpy()):
                raise ValueError(f'{modality} fold {fold}: {split} g-factor rows are not aligned with the brain rows')

        g_train.to_csv(f'/brain/dti/dti_idp/g_train_{modality}_matched_fold_{fold}.csv', index=False)
        g_test.to_csv(f'/brain/dti/dti_idp/g_test_{modality}_matched_fold_{fold}.csv', index=False)

        ############## 4
        print('Applying ConfoundRegressor')

        # Apply ConfoundRegressor
        features_train_corr, features_test_corr, features_train_scaled, features_test_scaled, scaler_features = confound_regressor_skbold(brain_train, brain_test, brain_conf_train, brain_conf_test)
        pd.DataFrame(features_train_corr, columns = brain_train.columns).to_csv(f'/brain/dti/dti_idp/fold_{fold}/{modality}_train_corr_{fold}.csv', index=False)
        pd.DataFrame(features_test_corr, columns = brain_train.columns).to_csv(f'/brain/dti/dti_idp/fold_{fold}/{modality}_test_corr_{fold}.csv', index=False)

        pd.DataFrame(features_train_scaled, columns = brain_train.columns).to_csv(f'/brain/dti/dti_idp/fold_{fold}/{modality}_train_scaled_{fold}.csv', index=False)
        pd.DataFrame(features_test_scaled, columns = brain_train.columns).to_csv(f'/brain/dti/dti_idp/fold_{fold}/{modality}_test_scaled_{fold}.csv', index=False)

        with open(f'/brain/dti/dti_idp/fold_{fold}/scaler_features_{modality}_fold_{fold}.pkl', "wb") as f:
            pickle.dump(scaler_features, f)

        # Initiate and run PLS: try every component count from 1 up to the number of features
        parameters = {'n_components': range(1, features_train_corr.shape[1]+1)}
        pls = PLSRegression()
        model = GridSearchCV(pls, parameters, scoring = 'neg_mean_absolute_error', cv=KFold(10, shuffle = True, random_state=seed), verbose=4, n_jobs = 8)

        print("Fitting PLS")
        model.fit(features_train_corr, np.array(g_train))

        print(f'Model parameters for fold {fold}:', model.cv_results_['params'])
        print(f'Mean test score for fold {fold}:', model.cv_results_['mean_test_score'])
        print(f'Rank test score for fold {fold}:', model.cv_results_['rank_test_score'])
        print(model)

        print(f'Saving PLS model for {modality} fold {fold}')
        with open(f'/brain/dti/dti_idp/fold_{fold}/models/{modality}_model_fold_{fold}.pkl', "wb") as f:
            pickle.dump(model, f)

        print(f'Best params in fold {fold} = ', model.best_params_)
        print(f'Best score (neg_mean_absolute_error) in fold {fold} = ', model.best_score_)

        # Predict the values
        print(f'Predicting & saving g_test for {modality} fold {fold}')
        g_pred_test = model.predict(np.array(features_test_corr))
        pd.DataFrame(g_pred_test, columns=['g predicted test']).to_csv(f'/brain/dti/dti_idp/fold_{fold}/g_pred/{modality}_g_pred_test_fold_{fold}.csv')
        pd.concat([g_test_id.astype(int), pd.DataFrame(g_pred_test, columns=['g predicted test'])], axis=1).to_csv(f'/brain/dti/dti_idp/fold_{fold}/g_pred/{modality}_g_pred_test_id_fold_{fold}.csv')

        print(f'Predicting & saving g_train for {modality} fold {fold}')
        g_pred_train = model.predict(np.array(features_train_corr))
        pd.DataFrame(g_pred_train, columns=['g predicted train']).to_csv(f'/brain/dti/dti_idp/fold_{fold}/g_pred/{modality}_g_pred_train_fold_{fold}.csv')
        pd.concat([g_train_id.astype(int), pd.DataFrame(g_pred_train, columns=['g predicted train'])], axis=1).to_csv(f'/brain/dti/dti_idp/fold_{fold}/g_pred/{modality}_g_pred_train_id_fold_{fold}.csv')

        # Test-set metrics, computed once and reused for printing and for both result files
        y_true = np.array(g_test)[:,0]
        y_pred = g_pred_test[:,0]
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)
        pearson = pearsonr(y_true, y_pred)

        print(f"Fold = {fold}")
        print("----------")
        print("MSE = ", mse)
        print("MAE = ", mae)
        print("R2 = ", r2)
        print("Pearson's r = ", pearson)
        print("----------")

        # 'n_components' and 'Pearson r' are written as their text representation;
        # the result-merging cell parses these strings, so the objects are stored unchanged.
        pls_result = {
            'fold': fold,
            'modality': modality,
            'n_components': model.best_params_,
            'MSE': mse,
            'MAE': mae,
            'R2': r2,
            'Pearson r': pearson,
        }

        # Overwrite rather than append: the file holds exactly one header-less row per
        # modality and fold, and appending would duplicate that row on every re-run.
        with open(f'/brain/dti/dti_idp/fold_{fold}/models/{modality}_fold_{fold}_PLS_result.csv', 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=pls_result.keys())
            writer.writerow(pls_result)

        corr, pval = pearson
        pd.DataFrame([modality, fold, corr, pval, r2, mse, model.best_params_], index=['Modality', 'Fold', 'Correlation', 'P-value', 'R2', 'MSE', 'n components'], columns=['Values']).to_csv(f'/brain/dti/dti_idp/fold_{fold}/models/{modality}_fold_{fold}_full_result.csv')

# Compare deconfounded and original features

In [None]:
# Plot original values vs residuals (deconfounded)
# One panel per feature (first 25 columns) for fold 0 of fa_tbss: x = value before
# confound removal, y = value after, with a fitted regression line.
features_train_corr_check = pd.read_csv('/brain/dti/dti_idp/fold_0/fa_tbss_train_corr_0.csv')
brain_train_orig_check =  pd.read_csv('/brain/dti/dti_idp/fa_tbss_train_fold_0.csv')
cols = brain_train_orig_check.columns[:25]
fig, axes = plt.subplots(nrows=5, ncols=5, figsize=(15, 15))
axes = axes.flatten()
for ax, col in zip(axes, cols):
    sns.regplot(x=brain_train_orig_check[col], y=features_train_corr_check[col], line_kws={'color': 'red', 'linewidth': 1}, scatter=True, scatter_kws = {"color": ".7", "s": 10, "alpha": 0.2}, ax=ax)
    ax.set_xlabel("\n".join(textwrap.wrap(col + ' (Original)', 20)), fontsize=7)
    ax.set_ylabel("\n".join(textwrap.wrap(col + ' (Residuals)', 20)), fontsize=7)
    # Set tick sizes on this panel; plt.xticks/plt.yticks only touch the current
    # axes, which after plt.subplots is the last panel of the grid.
    ax.tick_params(axis='both', labelsize=6)
# Hide panels that have no feature to show (fewer than 25 columns)
for ax in axes[len(cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

# Merge results across 5 folds and average R2 and r

In [None]:
# Upload individual pls results and merge them into one table
folds = ["0", "1", "2", "3", "4"]
five_folds = []
for modality in modalities:
    for fold in folds:
        # The result files are written without a header row, so name the columns here
        fold_result = pd.read_csv(f'/brain/dti/dti_idp/fold_{fold}/models/{modality}_fold_{fold}_PLS_result.csv', header=None)
        fold_result.columns = ['Fold', 'Modality', 'n components', 'MSE', 'MAE', 'R2', 'Pearson r']
        five_folds.append(fold_result)

# Concatenate once, after all files are collected, and give every row its own index label
five_folds_all_modalities = pd.concat(five_folds, ignore_index=True)

# 'Pearson r' holds the text form of the scipy result; strip the wrapper so that
# only "statistic, pvalue" remains, then split it into two numeric columns.
five_folds_all_modalities['Pearson r'] = five_folds_all_modalities['Pearson r'].astype(str).str.replace(r'PearsonRResult\(statistic=|pvalue=|\)', '', regex=True)
five_folds_all_modalities[['Pearson r', 'p-value']] = five_folds_all_modalities['Pearson r'].str.split(',', expand=True).astype(float).round(decimals=3)
five_folds_all_modalities = five_folds_all_modalities.round(decimals=3)
#five_folds_all_modalities.to_csv('/brain/dti/pls_5_folds_all_modalities.csv', index=False)
# 'n components' holds the text form of the best-params dict; keep only the number
five_folds_all_modalities['n components'] = five_folds_all_modalities['n components'].astype(str).str.replace(r"{'n_components':", '', regex=True)
five_folds_all_modalities['n components'] = five_folds_all_modalities['n components'].astype(str).str.replace(r"}", '', regex=True)
with pd.option_context('display.max_rows', None):
    display(five_folds_all_modalities)

In [None]:
# Average across folds
# Keep the metric columns plus the grouping key, then show the per-modality means
# ordered from the highest to the lowest R2.
five_folds_all_modalities_mean = five_folds_all_modalities.loc[:, ['R2', 'Pearson r', 'Modality', 'MSE', 'MAE']]
(
    five_folds_all_modalities_mean
    .groupby(['Modality'])
    .mean()
    .sort_values(by='R2', ascending=False)
    .round(3)
)

## Get PLS loadings (weighted)

In [None]:
from warnings import simplefilter
from sklearn.exceptions import InconsistentVersionWarning
simplefilter("ignore", category=InconsistentVersionWarning)

# For every saved model: print the R2 obtained from each PLS component on its own,
# then weight the x-loadings by those per-component R2 values.
folds = ["0", "1", "2", "3", "4"]
for modality in modalities:
    for fold in folds:
        # NOTE: pickle.load can execute arbitrary code; only load model files written by this notebook.
        with open(f'/brain/dti/dti_idp/fold_{fold}/models/{modality}_model_fold_{fold}.pkl', "rb") as f:
            model = pickle.load(f)

        g_train = pd.read_csv(f'/brain/dti/dti_idp/g_train_{modality}_matched_fold_{fold}.csv')

        r2_sum = 0
        r2_vector = np.empty(model.best_estimator_.n_components)

        for i in range(0,model.best_estimator_.n_components):
            # Prediction from component i alone (scores x y-loadings), put back on the scale of g
            Y_pred = np.dot(model.best_estimator_.x_scores_[:,i].reshape(-1,1), model.best_estimator_.y_loadings_[:,i].reshape(-1,1).T) * g_train.values.std(axis=0, ddof=1) + g_train['g'].mean(axis=0)
            component_r2 = r2_score(g_train.values, Y_pred)
            r2_sum += component_r2
            print('R2 for %d component: %g' %(i+1, component_r2))
            r2_vector[i] = component_r2

        # Weight the loadings only once every entry of r2_vector has been filled in;
        # np.empty leaves unfilled entries uninitialised.
        x_loading_by_r2 = model.best_estimator_.x_loadings_ *  r2_vector
        x_loading_by_r2_scaled = stats.zscore(model.best_estimator_.x_loadings_) *  r2_vector
        weighted_x_loading = np.sum(x_loading_by_r2, axis=1)
        weighted_x_loading_scaled = np.sum(x_loading_by_r2_scaled, axis=1)
        # Built-in round() works whether r2_score returned a Python float or a NumPy scalar
        print(f'R2 for all components in {modality} fold {fold}: %g' %round(r2_sum, 2))

Combine all weighted loadings into one dictionary

In [None]:
from warnings import simplefilter
from sklearn.exceptions import InconsistentVersionWarning
simplefilter("ignore", category=InconsistentVersionWarning)
###############################
# Build weighted_x_loadings[modality][fold] -> {'weighted_x_loading', 'weighted_x_loading_scaled'}:
# the x-loadings of the best PLS estimator, weighted by the R2 of each component
# and summed over components (raw and z-scored variants).
weighted_x_loadings = {}
folds = ["0", "1", "2", "3", "4"]
for modality in modalities:
    weighted_x_loadings[modality] = {}
    for fold in folds:
        with open(f'/brain/dti/dti_idp/fold_{fold}/models/{modality}_model_fold_{fold}.pkl', "rb") as f:
            model = pickle.load(f)
        g_train = pd.read_csv(f'/brain/dti/dti_idp/g_train_{modality}_matched_fold_{fold}.csv')['g']

        best_pls = model.best_estimator_
        n_comp = best_pls.n_components

        # R2 of the prediction made from each component on its own
        r2_vector = np.empty(n_comp)
        for comp in range(n_comp):
            scores_col = best_pls.x_scores_[:, comp].reshape(-1, 1)
            y_loading_row = best_pls.y_loadings_[:, comp].reshape(-1, 1).T
            Y_pred = np.dot(scores_col, y_loading_row) * g_train.values.std(ddof=1) + g_train.mean()
            r2_vector[comp] = r2_score(g_train.values, Y_pred)

        # Weight each component's loadings by its R2, then sum across components
        weighted_x_loading = np.sum(best_pls.x_loadings_ * r2_vector, axis=1)
        weighted_x_loading_scaled = np.sum(stats.zscore(best_pls.x_loadings_) * r2_vector, axis=1)

        weighted_x_loadings[modality][fold] = {
            'weighted_x_loading': weighted_x_loading,
            'weighted_x_loading_scaled': weighted_x_loading_scaled
        }

# Print the accumulated weighted_x_loadings
for modality, per_fold in weighted_x_loadings.items():
    for fold, entry in per_fold.items():
        print(f"Modality: {modality}, Fold: {fold}, Weighted X Loading: {entry['weighted_x_loading']}")
        print(f"Modality: {modality}, Fold: {fold}, Weighted X Loading Scaled: {entry['weighted_x_loading_scaled']}")

# Extract weighted loadings for each modality in the form of a data frame

In [None]:
# Look up the notebook-level DataFrame of every modality by its name
modality_dataframes = {name: globals()[name] for name in modalities}

# For every modality, build one table of (feature name, scaled weighted loading)
# per fold and stack the folds under a ('Fold', 'Index') row index.
weighted_x_loading_scaled_dfs = {}
for modality, per_fold in weighted_x_loadings.items():
    # Feature names are every column of the modality table except the ID column
    feature_names = modality_dataframes[modality].drop(columns='eid').columns.tolist()
    dfs = []
    for entry in per_fold.values():
        names_col = pd.DataFrame(feature_names, columns=['Features'])
        loadings_col = pd.DataFrame(entry['weighted_x_loading_scaled'], columns=['Loadings'])
        dfs.append(pd.concat([names_col, loadings_col], axis=1))
    weighted_x_loading_scaled_dfs[modality] = pd.concat(dfs, keys=folds, names=['Fold', 'Index'])

# Print the DataFrames for each modality
for modality in weighted_x_loading_scaled_dfs:
    print(f"DataFrame for {modality}:")
    print(weighted_x_loading_scaled_dfs[modality])

# Save modality_dataframes
with open(f'/brain/dti/dti_idp/dti_idp_modality_dataframes.pkl', "wb") as pkl_file:
    pickle.dump(modality_dataframes, pkl_file)

In [None]:
# Show one modality's loading table without pandas truncating the rows.
# NOTE(review): `modality` is not assigned in this cell; it is whatever value an
# earlier loop left bound - confirm which modality is meant to be shown here.
with pd.option_context('display.max_rows', None):
    display(weighted_x_loading_scaled_dfs[modality]) 

In [None]:
# Save the dictionary
# Pickle the {modality: loading table} mapping so it can be reloaded later
with open('/brain/dti/dti_idp/dti_idp_weighted_x_loading_scaled_dfs.pkl', "wb") as pkl_file:
    pickle.dump(weighted_x_loading_scaled_dfs, pkl_file)

In [None]:
# One figure per modality (fold 0 only): negative loadings as purple bars growing to
# the left with labels on the right, positive loadings as crimson bars growing to the
# right with labels on the left. Both halves share the same absolute x-range.
for modality in weighted_x_loading_scaled_dfs:
    fold_0 = weighted_x_loading_scaled_dfs[modality].loc['0'].reset_index().drop(columns='Index')

    negative_load = fold_0[fold_0['Loadings'] < 0].sort_values(by='Loadings', ascending=False).reset_index(drop=True)
    positive_load = fold_0[fold_0['Loadings'] >= 0].sort_values(by='Loadings', ascending=False).reset_index(drop=True)

    # Largest absolute loading over both signs. Taking it from the full column keeps
    # the value finite when one sign is absent; min()/max() of an empty selection is
    # NaN, which would turn the axis limits below into NaN.
    max_abs_value = fold_0['Loadings'].abs().max()

    fig, ax_neg = plt.subplots(figsize=(12, 20))

    # Skip drawing when there are no negative loadings to draw
    if not negative_load.empty:
        negative_load['Loadings'].plot.barh(ax=ax_neg, color='purple')
    ax_neg.set_xlim(-max_abs_value - 0.1, 0)
    ax_neg.tick_params(axis='y', labelcolor='purple', labelleft=False, labelright=True, labelsize=15)
    ax_neg.tick_params(axis='x', colors='purple', labelsize=15)
    ax_neg.set_yticks(range(len(negative_load)))
    ax_neg.set_yticklabels(negative_load['Features']) 
    ax_neg.yaxis.set_ticks_position('none')

    ax_pos = ax_neg.twiny().twinx()  # Create a secondary x-axis and y-axis
    # Skip drawing when there are no positive loadings to draw
    if not positive_load.empty:
        positive_load['Loadings'].plot.barh(ax=ax_pos, color='crimson')
    ax_pos.set_xlim(0, max_abs_value + 0.1)
    ax_pos.tick_params(axis='y', labelcolor='crimson', right=False, labelleft=True, labelsize=15)
    ax_pos.set_yticks(range(len(positive_load)))
    ax_pos.set_yticklabels(positive_load['Features'])
    ax_pos.yaxis.set_ticks_position('none')
    ax_pos.set_xticklabels([])
    ax_pos.set_xticks([])
    ax_pos.invert_yaxis()

    # Set top x-axis labels
    ax_pos_twin = ax_neg.twiny()
    ax_pos_twin.set_xlim(0, max_abs_value + 0.1)
    ax_pos_twin.tick_params(axis='x', labelcolor='crimson', labelsize=15)

    # Remove the positive loading labels from the right side
    ax_pos.tick_params(axis='y', labelright=False)

    plt.show()