In [None]:
import os
import openpyxl
import xlsxwriter
from datetime import datetime
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.utils import resample
from scipy.stats import pearsonr
import pickle
import seaborn as sns
from matplotlib.gridspec import GridSpec
import matplotlib.patches as mpatches
from sklearn.neighbors import KernelDensity
import matplotlib as mpl
import matplotlib.gridspec as grid_spec
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.io as pio
from matplotlib.ticker import MultipleLocator
from typing import List, Dict, Optional

In [None]:
def save_dataframe_to_files(df, pathname, filename):
    """
    Write a DataFrame next to each other as ``<name>.csv`` and ``<name>.xlsx``.

    Any extension on ``filename`` is stripped first, and ``pathname`` is
    created (including parents) when it does not exist yet. Failures while
    writing are reported on stdout rather than raised, so a failed Excel
    export does not abort the calling cell.

    Args:
        df: DataFrame to persist (written without its index).
        pathname: Target directory.
        filename: Output name, with or without an extension.
    """
    stem = os.path.splitext(filename)[0]

    # Make sure the target directory exists before building the output paths
    os.makedirs(pathname, exist_ok=True)
    csv_target, excel_target = (
        os.path.join(pathname, f"{stem}{extension}") for extension in (".csv", ".xlsx")
    )

    try:
        # CSV goes first, so it is still produced when the Excel export fails
        df.to_csv(csv_target, index=False)
        df.to_excel(excel_target, index=False, engine='openpyxl')

        print(f"Files saved successfully:\n- {csv_target}\n- {excel_target}")
    except Exception as err:
        print(f"Error saving files: {err}")

In [None]:
def get_bootstrap_ci(bootstrap_distribution, confidence_level=0.95):
    """Compute a percentile bootstrap confidence interval with input validation.

    Args:
        bootstrap_distribution: Array-like of bootstrap replicates. Multi-dimensional
            input is flattened before the percentiles are taken.
        confidence_level: Coverage of the interval, between 0 and 1 (default 0.95).

    Returns:
        Tuple ``(lower, upper)`` holding the ``alpha/2`` and ``1 - alpha/2``
        percentiles (``alpha = 1 - confidence_level``), each rounded to 3 decimals.

    Raises:
        ValueError: If ``confidence_level`` is outside [0, 1] or the distribution is empty.
    """
    if not 0 <= confidence_level <= 1:
        raise ValueError(
            f"confidence_level must be between 0 and 1, got {confidence_level!r}"
        )

    samples = np.asarray(bootstrap_distribution, dtype=float).ravel()
    if samples.size == 0:
        raise ValueError("bootstrap_distribution is empty; cannot compute a confidence interval")

    alpha = 1 - confidence_level
    # One percentile call for both bounds; NaN replicates still propagate to NaN bounds
    lower, upper = np.percentile(
        samples, [(alpha / 2) * 100, (1 - alpha / 2) * 100]
    ).round(3)
    return (lower, upper)

In [None]:
def compute_and_save_bootstrap_results_1level(
    metrics: List[str],
    modality_name: str,
    base_path: str,
    output_path: str,
    ci_level: float = 0.95,
    column_order: Optional[List[str]] = None,
    print_summary: bool = True
) -> pd.DataFrame:
    """
    Compute bootstrap confidence intervals for multiple metrics and modalities.

    For every metric, the file
    ``<base_path>/<modality_name>_<metric>_bootstrapped_renamed.csv`` is read and
    each of its columns is summarised (mean, median, std, percentile CI).
    Missing values inside a column are ignored; columns with no values at all
    are skipped. The summary table is written to ``output_path`` as
    ``bootstrap_<modality_name>_pls_CI`` (CSV and Excel).

    Args:
        metrics: List of metric names (e.g., ['r2', 'r', 'mse', 'mae'])
        modality_name: Name of the modality being analyzed
        base_path: Base directory path containing bootstrap CSV files
        output_path: Directory where results will be saved
        ci_level: Confidence level (default: 0.95)
        column_order: Optional list specifying column order in output
        print_summary: Whether to print formatted summary (default: True)

    Returns:
        DataFrame containing computed statistics and confidence intervals

    Raises:
        ValueError: If a column requested in ``column_order`` is absent from the
            results, which includes the case where no bootstrap file could be read.
    """
    # Default column order
    if column_order is None:
        column_order = ['Modality', 'Metric', 'Mean', 'Median', 'Std', 'CI_lower', 'CI_upper']

    ci_results = []

    for metric in metrics:
        file_path = os.path.join(base_path, f"{modality_name}_{metric}_bootstrapped_renamed.csv")

        # Only the read can legitimately fail with FileNotFoundError
        try:
            df = pd.read_csv(file_path)
        except FileNotFoundError:
            print(f"Warning: File not found for {modality_name}_{metric}")
            continue

        # Drop leftover index columns written by earlier to_csv calls
        df = df.drop(
            columns=df.columns[df.columns.str.contains('^Unnamed|^index$', case=False)],
            errors='ignore'
        )

        # Each remaining column holds the bootstrap replicates of one modality
        for modality in df.columns:
            # Ignore missing replicates so one NaN does not turn every statistic into NaN
            values = df[modality].dropna().to_numpy()
            if values.size == 0:
                print(f"Skipping empty modality: {modality}")
                continue

            lower, upper = get_bootstrap_ci(values, ci_level)

            ci_results.append({
                'Modality': modality,
                'Metric': metric.upper(),
                'Mean': np.mean(values).round(3),
                'Median': np.median(values).round(3),
                'Std': np.std(values).round(3),
                'CI_lower': lower,
                'CI_upper': upper
            })

    # Create and format DataFrame
    ci_df = pd.DataFrame(ci_results)

    # Ensure all expected columns exist
    missing_cols = [col for col in column_order if col not in ci_df.columns]
    if missing_cols:
        if not ci_results:
            # Nothing was loaded at all: say so instead of blaming the column layout
            raise ValueError(
                f"No bootstrap results could be loaded for '{modality_name}' from "
                f"{base_path} (metrics: {list(metrics)})"
            )
        raise ValueError(f"Missing expected columns: {missing_cols}")

    ci_df = ci_df[column_order]

    # Save results
    output_filename = f'bootstrap_{modality_name}_pls_CI'
    save_dataframe_to_files(ci_df, output_path, output_filename)

    # Print results
    if print_summary:
        print_results(ci_df, output_path)

    return ci_df


def print_results(ci_df: pd.DataFrame, output_path: str) -> None:
    """Print the CI table followed by a per-modality, per-metric summary.

    Args:
        ci_df: Table with 'Modality', 'Metric', 'Mean', 'Median', 'Std',
            'CI_lower' and 'CI_upper' columns.
        output_path: Location reported in the "Results saved to" line.

    Only the first row of each (modality, metric) group is summarised, and the
    interval is always labelled "95% CI" regardless of the level used upstream.
    """
    print("=== Confidence Intervals ===")
    print(ci_df)
    print(f"\nResults saved to: {output_path}")

    for modality_label, modality_rows in ci_df.groupby('Modality'):
        print(f"\n=== {modality_label} ===")
        for _, metric_rows in modality_rows.groupby('Metric'):
            first = metric_rows.iloc[0]
            summary_lines = (
                f"{first['Metric']}:",
                f"  Mean ± SD: {first['Mean']} ± {first['Std']}",
                f"  Median: {first['Median']}",
                f"  95% CI: [{first['CI_lower']}, {first['CI_upper']}]",
            )
            print("\n".join(summary_lines))

In [None]:
def compute_and_save_bootstrap_results_2level(
    bootstrap_dists: dict,
    output_path: str,
    modality_name: str = None,
    algorithms: list = ['SVR', 'eNet', 'RF', 'XGB']
) -> pd.DataFrame:
    """
    Compute bootstrap statistics and save results for multiple algorithms and metrics

    Parameters:
    -----------
    bootstrap_dists : dict
        Dictionary containing bootstrap distributions (format: {'Metric': {'Algorithm': data}})
    output_path : str
        Directory where results will be saved
    modality_name : str, optional
        Name of the modality being analyzed. When given, it is added as a
        'Modality' column and used in the output filename.
    algorithms : list, optional
        List of algorithms to analyze (default: ['SVR', 'eNet', 'RF', 'XGB']).
        The list is only read, never modified.

    Returns:
    --------
    pd.DataFrame or None
        One row per (algorithm, metric) with Mean, Median, Std and the 95%
        percentile CI. None is returned if building or saving the table fails
        (the error is printed).
    """
    # The 'Modality' column only exists when a modality name was supplied
    expected_columns = (['Modality'] if modality_name else []) + [
        'Algorithm', 'Metric', 'Mean', 'Median', 'Std', 'CI_lower', 'CI_upper'
    ]

    # One dict per (algorithm, metric). Collecting rows rather than parallel
    # lists keeps every column the same length even without a modality name.
    records = []

    for algorithm in algorithms:
        print(f"\n=== {algorithm} ===")

        for metric, dist in bootstrap_dists.items():
            # Only the lookup is expected to fail with KeyError
            try:
                alg_data = dist[algorithm]
            except KeyError:
                print(f"Warning: {algorithm} not found in {metric} data")
                continue

            # Calculate statistics
            mean_val = np.mean(alg_data).round(3)
            median_val = np.median(alg_data).round(3)
            std_val = np.std(alg_data).round(3)
            lower, upper = get_bootstrap_ci(alg_data)

            record = {
                'Algorithm': algorithm,
                'Metric': metric,
                'Mean': mean_val,
                'Median': median_val,
                'Std': std_val,
                'CI_lower': lower,
                'CI_upper': upper
            }
            if modality_name:
                record['Modality'] = modality_name
            records.append(record)

            print(f"{metric}:")
            print(f"  Mean ± SD: {mean_val} ± {std_val}")
            print(f"  Median: {median_val}")
            print(f"  95% CI: [{lower}, {upper}]")

    try:
        # Passing `columns` fixes the column order and keeps the layout even
        # when no algorithm/metric pair was found.
        ci_df = pd.DataFrame(records, columns=expected_columns)

        # Create filename with modality if provided
        base_filename = f"bootstrap_{modality_name}_CI" if modality_name else "bootstrap_CI"

        save_dataframe_to_files(ci_df, output_path, base_filename)

        print("\n=== Final Results ===")
        print(ci_df)
        print(f"\nResults saved to: {output_path}")

        return ci_df

    except Exception as e:
        # Best-effort contract kept from the original: report and return None
        print(f"Error processing results: {str(e)}")
        return None

# Mental Health

In [None]:
# Fold identifiers as strings, as they appear in the fold directory and file names
folds = [str(fold_index) for fold_index in range(5)]

In [None]:
# Stack the per-fold test-set predictions and observations for mental health
pred_test = [
    pd.read_csv(f'/mental_health/folds/fold_{fold}/g_pred/g_pred_mh_fold_{fold}.csv')
    for fold in folds
]
obs_test = [
    pd.read_csv(f'/mental_health/folds/fold_{fold}/suppl/g_test_matched_fold_{fold}.csv')
    for fold in folds
]

# Row order is fold 0..4; the index is rebuilt so both frames share 0..n-1
g_five_folds_pred_test = pd.concat(pred_test, axis=0, ignore_index=True)
g_five_folds_obs_test = pd.concat(obs_test, axis=0, ignore_index=True)

g_five_folds_pred_test.to_csv(
    '/PLS/brain/stacking/BOOTSTRAP/mh/mental_health_target_pred_test.csv',
    index=False,
)
g_five_folds_obs_test.to_csv(
    '/PLS/brain/stacking/BOOTSTRAP/mh/mental_health_target_obs_test.csv',
    index=False,
)

print('Shape of the (predicted) test set in mental health is', g_five_folds_pred_test.shape)
print('Shape of the (observed) test set in mental health is', g_five_folds_obs_test.shape)

In [None]:
# Bootstrap mental health
# =============================================
# PART 1: Combine 5-fold cross-validation results
# =============================================

# Per-fold test-set predictions and observations, collected in fold order
pred_test = []
obs_test = []
folds = range(5)

for fold in folds:
    # Predicted values for this fold's held-out subjects
    g_pred_test = pd.read_csv(
        f'/mental_health/folds/fold_{fold}/g_pred/g_pred_mh_fold_{fold}.csv'
    )

    # Observed values for the same fold
    g_obs_test = pd.read_csv(
        f'/mental_health/folds/fold_{fold}/suppl/g_test_matched_fold_{fold}.csv'
    )

    pred_test.append(g_pred_test)
    obs_test.append(g_obs_test)

# Combine all folds (index rebuilt so both frames are indexed 0..n-1)
g_five_folds_pred_test = pd.concat(pred_test, axis=0, ignore_index=True)
g_five_folds_obs_test = pd.concat(obs_test, axis=0, ignore_index=True)

# Save combined results
g_five_folds_pred_test.to_csv(
    '/PLS/brain/stacking/BOOTSTRAP/mh/mental_health_target_pred_test.csv',
    index=False
)

g_five_folds_obs_test.to_csv(
    '/PLS/brain/stacking/BOOTSTRAP/mh/mental_health_target_obs_test.csv',
    index=False
)

print(f'Shape of the (predicted) test set for mental health: {g_five_folds_pred_test.shape}')
print(f'Shape of the (observed) test set for mental health: {g_five_folds_obs_test.shape}')

# =============================================
# PART 2: Bootstrapping for mental health
# =============================================

g_real = g_five_folds_obs_test
g_pred = g_five_folds_pred_test

# The two frames are paired by row position below, so they must have the same length.
# NOTE(review): this also assumes both files list subjects in the same order — confirm.
if len(g_real) != len(g_pred):
    raise ValueError(
        f"Observed ({len(g_real)}) and predicted ({len(g_pred)}) test sets differ in length; "
        "cannot pair them row by row"
    )

# Merge observed and predicted values side by side
g = pd.concat([g_real, g_pred], axis=1)
g = g.rename(columns={
    'g': 'g_mh_observed',
    'g_pred_mh': 'g_mh_predicted'
})

# Bootstrapping configuration
boot = 5000
MH_BOOTSTRAP_SEED = 42  # fixed seed so the bootstrap draws are reproducible on re-run
mh_bootstrap_rng = np.random.default_rng(MH_BOOTSTRAP_SEED)
metrics = ['R2', 'r', 'MSE', 'MAE']
bootstrap_results = {metric: [] for metric in metrics}

# Run bootstrap: resample subjects with replacement, same size as the data set
for i in range(boot):
    df_sample = g.sample(len(g), replace=True, random_state=mh_bootstrap_rng)
    observed = df_sample['g_mh_observed']
    predicted = df_sample['g_mh_predicted']

    # Calculate metrics
    corr, _ = pearsonr(observed, predicted)
    r2 = r2_score(observed, predicted)
    mse = mean_squared_error(observed, predicted)
    mae = mean_absolute_error(observed, predicted)

    bootstrap_results['r'].append(corr)
    bootstrap_results['R2'].append(r2)
    bootstrap_results['MSE'].append(mse)
    bootstrap_results['MAE'].append(mae)

# Convert results to single-column DataFrames named mh_<metric>
for metric in bootstrap_results:
    bootstrap_results[metric] = pd.DataFrame(bootstrap_results[metric], columns=[f'mh_{metric}'])

# Save bootstrap results
with open(
    '/PLS/brain/stacking/BOOTSTRAP/mh/MH_bootstrap_results.pkl',
    'wb'
) as f:
    pickle.dump(bootstrap_results, f)

print("Bootstrap results for mental health were successfully saved as MH_bootstrap_results.pkl")

# =============================================
# Optional: Calculate confidence intervals
# =============================================

# Calculate 95% confidence intervals for each metric
column_order = ['Modality', 'Metric', 'Mean', 'Median', 'Std', 'CI_lower', 'CI_upper']
ci_results = []

for metric, df in bootstrap_results.items():
    values = df[f'mh_{metric}'].values

    # Percentile method: 2.5th and 97.5th percentiles of the bootstrap distribution
    lower = np.percentile(values, 2.5)
    upper = np.percentile(values, 97.5)

    ci_results.append({
        'Modality': 'mh',
        'Metric': metric.upper(),
        'Mean': np.mean(values).round(3),
        'Median': np.median(values).round(3),
        'Std': np.std(values).round(3),
        'CI_lower': lower.round(3),
        'CI_upper': upper.round(3)
    })

# Create and format DataFrame
ci_df = pd.DataFrame(ci_results)
missing_cols = [col for col in column_order if col not in ci_df.columns]
if missing_cols:
    raise ValueError(f"Missing expected columns: {missing_cols}")

ci_df = ci_df[column_order]

# Save results
output_filename = 'bootstrap_mh_pls_CI'
ci_df.to_csv(
    f'/PLS/brain/stacking/BOOTSTRAP/mh/{output_filename}.csv',
    index=False
)
ci_df.to_pickle(
    f'/PLS/brain/stacking/BOOTSTRAP/mh/{output_filename}.pkl'
)

# Second copy (CSV + Excel) in the revision output folder
output_path = '/rev1'
save_dataframe_to_files(ci_df, output_path, 'bootstrap_mental_health_pls_CI')


# Print results
print("\n=== Confidence Intervals ===")
print(ci_df)
print(f"\nResults saved to: {output_path}")

for _, row in ci_df.iterrows():
    print(f"\n=== {row['Modality']} - {row['Metric']} ===")
    print(f"Mean ± SD: {row['Mean']} ± {row['Std']}")
    print(f"Median: {row['Median']}")
    print(f"95% CI: [{row['CI_lower']}, {row['CI_upper']}]")

# DTI

## Bootstrap DTI IDP + Parcellations Stacked

In [None]:
# Stack the five cross-validation folds of the second-level (stacked) DTI predictions
algorithms = ['svr', 'eNet', 'xgb', 'rf']
folds = ["0", "1", "2", "3", "4"]

g_five_folds_train = {}
g_five_folds_test = {}

for algorithm in algorithms:
    # One frame per fold, read in fold order
    train = [
        pd.read_csv(f'/PLS/brain/stacking/g/DTI_All_target_pred_2nd_level_{algorithm}_train_fold_{fold}.csv')
        for fold in folds
    ]
    test = [
        pd.read_csv(f'/PLS/brain/stacking/g/DTI_All_target_pred_2nd_level_{algorithm}_test_fold_{fold}.csv')
        for fold in folds
    ]

    stacked_train = pd.concat(train, axis=0, ignore_index=True)
    stacked_test = pd.concat(test, axis=0, ignore_index=True)
    g_five_folds_train[algorithm] = stacked_train
    g_five_folds_test[algorithm] = stacked_test

    stacked_train.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/dti_all/DTI_All_target_pred_{algorithm}_train.csv', index=False)
    stacked_test.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/dti_all/DTI_All_target_pred_{algorithm}_test.csv', index=False)

    print(f'Shape of the train set in {algorithm} is', stacked_train.shape)
    print(f'Shape of the test set in {algorithm} is', stacked_test.shape)

In [None]:
# Keep only the observed g values of subjects that have a stacked (SVR) test prediction
g_five_folds_real_test = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/target_real_test.csv')
g_five_folds_pred_test = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/dti_all/DTI_All_target_pred_svr_test.csv')

# Inner join on the subject id; only the id column of the predictions takes part
matched_on_eid = pd.merge(g_five_folds_real_test, g_five_folds_pred_test['eid'], on='eid')
g_real_to_pred_match = pd.DataFrame(matched_on_eid, columns=['g_real_test', 'eid'])

# The index is written on purpose: the bootstrap cell drops the resulting 'Unnamed: 0' column
g_real_to_pred_match.to_csv('/PLS/brain/stacking/BOOTSTRAP/dti_all/g_real_to_pred_match_dti_all_test.csv')
print('Shape of g real test matched to g pred is', g_real_to_pred_match.shape)

In [None]:
# Bootstrapping of the stacked DTI models (one distribution per algorithm and metric)
boot = 5000
DTI_ALL_BOOTSTRAP_SEED = 42  # fixed seed so the bootstrap draws are reproducible on re-run
dti_all_bootstrap_rng = np.random.default_rng(DTI_ALL_BOOTSTRAP_SEED)
dti_all_metric_names = ['R2', 'r', 'MSE', 'MAE']

g_real = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/dti_all/g_real_to_pred_match_dti_all_test.csv')
preds = []

bootstrap_results_dfs = {}

for algorithm in algorithms:
    pred_col = f'g_pred_dti_all_{algorithm}_test'
    g_pred = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/dti_all/DTI_All_target_pred_{algorithm}_test.csv')
    # Drop any stray index column so it cannot collide with the one in g_real
    g_pred = g_pred.loc[:, ~g_pred.columns.str.startswith('Unnamed')]

    # Pair observed and predicted values by subject id, not by row position:
    # the match file follows the row order of target_real_test.csv, which is
    # not guaranteed to be the row order of the prediction file.
    df = g_real.merge(g_pred, on='eid', how='inner')
    if len(df) != len(g_pred):
        raise ValueError(
            f"{algorithm}: {len(g_pred)} predictions but {len(df)} rows after matching on 'eid'"
        )
    df = df.drop(columns=['eid', 'Unnamed: 0'])
    # Expects exactly two columns to remain: observed g and this algorithm's prediction
    df.columns = ['g_real_test', pred_col]

    # Initialize a dictionary for each algorithm separately
    bootstrap_results = {metric: [] for metric in dti_all_metric_names}

    for i in range(boot):
        # Random sample with replacement, same size as the data set
        df_sample = df.sample(len(df), replace=True, random_state=dti_all_bootstrap_rng)
        observed = df_sample['g_real_test']
        predicted = df_sample[pred_col]

        corr, _ = pearsonr(observed, predicted)
        r2 = r2_score(observed, predicted)
        mse = mean_squared_error(observed, predicted)
        mae = mean_absolute_error(observed, predicted)

        bootstrap_results['r'].append(corr)
        bootstrap_results['R2'].append(r2)
        bootstrap_results['MSE'].append(mse)
        bootstrap_results['MAE'].append(mae)

    # Single-column DataFrames named <algorithm>_<metric>
    for key in bootstrap_results:
        bootstrap_results[key] = pd.DataFrame(bootstrap_results[key], columns=[f'{algorithm}_{key}'])

    bootstrap_results_dfs[algorithm] = bootstrap_results


def _stack_metric_columns(per_metric):
    """Place one algorithm's R2, r, MSE and MAE bootstrap columns side by side."""
    return pd.concat(
        [per_metric[metric].reset_index(drop=True) for metric in dti_all_metric_names],
        axis=1,
    )


bootstrap_svr = _stack_metric_columns(bootstrap_results_dfs['svr'])
bootstrap_enet = _stack_metric_columns(bootstrap_results_dfs['eNet'])
bootstrap_rf = _stack_metric_columns(bootstrap_results_dfs['rf'])
bootstrap_xgb = _stack_metric_columns(bootstrap_results_dfs['xgb'])

In [None]:
# Put the four per-algorithm bootstrap tables side by side and persist them
bootstrap_fullres = pd.concat(
    [bootstrap_svr, bootstrap_enet, bootstrap_rf, bootstrap_xgb],
    axis=1,
)

bootstrap_fullres.to_csv(
    '/PLS/brain/stacking/BOOTSTRAP/dti_all/bootstrap_dti_all.csv',
    index=False,
)

In [None]:
# Split the saved bootstrap table into one frame per metric, with one column per algorithm
bootstrap_fullres_dti_stacked_rf = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/dti_all/bootstrap_dti_all.csv')

bootstrap_R2 = bootstrap_fullres_dti_stacked_rf[
    ['svr_R2', 'eNet_R2', 'rf_R2', 'xgb_R2']
].set_axis(['SVR', 'eNet', 'RF', 'XGB'], axis=1)

bootstrap_r = bootstrap_fullres_dti_stacked_rf[
    ['svr_r', 'eNet_r', 'rf_r', 'xgb_r']
].set_axis(['SVR', 'eNet', 'RF', 'XGB'], axis=1)

bootstrap_mse = bootstrap_fullres_dti_stacked_rf[
    ['svr_MSE', 'eNet_MSE', 'rf_MSE', 'xgb_MSE']
].set_axis(['SVR', 'eNet', 'RF', 'XGB'], axis=1)

bootstrap_mae = bootstrap_fullres_dti_stacked_rf[
    ['svr_MAE', 'eNet_MAE', 'rf_MAE', 'xgb_MAE']
].set_axis(['SVR', 'eNet', 'RF', 'XGB'], axis=1)

In [None]:
# Per-metric views of the stacked DTI bootstrap table (columns relabelled by algorithm)
bootstrap_fullres_dti_stacked_rf = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/dti_all/bootstrap_dti_all.csv')

bootstrap_dti_R2 = bootstrap_fullres_dti_stacked_rf[
    ['svr_R2', 'eNet_R2', 'rf_R2', 'xgb_R2']
].set_axis(['SVR', 'eNet', 'RF', 'XGB'], axis=1)

bootstrap_dti_r = bootstrap_fullres_dti_stacked_rf[
    ['svr_r', 'eNet_r', 'rf_r', 'xgb_r']
].set_axis(['SVR', 'eNet', 'RF', 'XGB'], axis=1)

bootstrap_dti_mse = bootstrap_fullres_dti_stacked_rf[
    ['svr_MSE', 'eNet_MSE', 'rf_MSE', 'xgb_MSE']
].set_axis(['SVR', 'eNet', 'RF', 'XGB'], axis=1)

bootstrap_dti_mae = bootstrap_fullres_dti_stacked_rf[
    ['svr_MAE', 'eNet_MAE', 'rf_MAE', 'xgb_MAE']
].set_axis(['SVR', 'eNet', 'RF', 'XGB'], axis=1)

# Bootstrap distributions of the random-forest stacker, kept under their own names
bootstrap_dti_stacked_rf_r = bootstrap_dti_r['RF']
bootstrap_dti_stacked_rf_R2 = bootstrap_dti_R2['RF']

## Bootstrap DTI IDP PLSR

### Bootstrap DTI IDP PLSR: Prepare files

In [None]:
# DTI IDP modalities: every diffusion measure, once per tract-extraction method
dti_idp_modalities = [
    f"{measure}_{method}"
    for measure in ("fa", "md", "l1", "l2", "l3", "mo", "od", "icvf", "isovf")
    for method in ("tbss", "prob")
]

In [None]:
# Stack the five cross-validation folds of the PLS predictions for every DTI IDP modality
g_five_folds_train = {}
g_five_folds_test = {}

for modality in dti_idp_modalities:
    # One frame per fold, read in fold order, without the stored index column
    train = [
        pd.read_csv(f'/PLS/brain/dti/dti_idp/fold_{fold}/g_pred/{modality}_g_pred_train_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]
    test = [
        pd.read_csv(f'/PLS/brain/dti/dti_idp/fold_{fold}/g_pred/{modality}_g_pred_test_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]

    stacked_train = pd.concat(train, axis=0, ignore_index=True)
    stacked_test = pd.concat(test, axis=0, ignore_index=True)
    g_five_folds_train[modality] = stacked_train
    g_five_folds_test[modality] = stacked_test

    stacked_train.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/DTI_IDP_target_pred_{modality}_train.csv', index=False)
    stacked_test.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/DTI_IDP_target_pred_{modality}_test.csv', index=False)

    print(f'Shape of the train set in {modality} is', stacked_train.shape)
    print(f'Shape of the test set in {modality} is', stacked_test.shape)

In [None]:
# Match real to predicted g for each DTI IDP modality
g_real_to_pred_match_dict = {}
g_five_folds_test_real = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/target_real_test.csv')

for modality in dti_idp_modalities:

    # Read the stacked test predictions under the name the fold-concatenation
    # cell writes them with (DTI_IDP_ prefix).
    g_five_folds_pred_test = pd.read_csv(
        f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/DTI_IDP_target_pred_{modality}_test.csv'
    )

    # Inner join on the subject id keeps observed and predicted g on the same row
    g_real_to_pred_match_dict[modality] = g_five_folds_test_real.merge(g_five_folds_pred_test, on='eid')

    # The index is written on purpose: the bootstrap cell drops the resulting 'Unnamed: 0' column
    g_real_to_pred_match_dict[modality].to_csv(
        f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/g_real_to_pred_match_{modality}_test.csv'
    )

### Bootstrap DTI Parcellations PLSR: Prepare files

In [None]:
# Structural connectome feature sets: four connectome measures for each of the
# field ids 31020-31025
dti_parcellations = [
    f'{field_id}_connectome_{measure}'
    for field_id in range(31020, 31026)
    for measure in ('fa', 'mean_length', 'sift2', 'streamline_count')
]

In [None]:
# Stack the five cross-validation folds of the PLS predictions for every DTI parcellation
g_five_folds_train = {}
g_five_folds_test = {}

for atlas in dti_parcellations:
    # One frame per fold, read in fold order, without the stored index column
    train = [
        pd.read_csv(f'/PLS/brain/dti/dti_struct/fold_{fold}/g_pred/{atlas}_g_pred_train_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]
    test = [
        pd.read_csv(f'/PLS/brain/dti/dti_struct/fold_{fold}/g_pred/{atlas}_g_pred_test_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]

    stacked_train = pd.concat(train, axis=0, ignore_index=True)
    stacked_test = pd.concat(test, axis=0, ignore_index=True)
    g_five_folds_train[atlas] = stacked_train
    g_five_folds_test[atlas] = stacked_test

    stacked_train.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/DTI_Struct_target_pred_{atlas}_train.csv', index=False)
    stacked_test.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/DTI_Struct_target_pred_{atlas}_test.csv', index=False)

    print(f'Shape of the train set in {atlas} is', stacked_train.shape)
    print(f'Shape of the test set in {atlas} is', stacked_test.shape)

In [None]:
# Match real to predicted g for each DTI parcellation
g_real_to_pred_match_dict = {}
g_five_folds_test_real = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/target_real_test.csv')

for atlas in dti_parcellations:

    # Read the stacked test predictions under the name the fold-concatenation
    # cell writes them with (DTI_Struct_ prefix).
    g_five_folds_pred_test = pd.read_csv(
        f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/DTI_Struct_target_pred_{atlas}_test.csv'
    )

    # Inner join on the subject id keeps observed and predicted g on the same row
    g_real_to_pred_match_dict[atlas] = g_five_folds_test_real.merge(g_five_folds_pred_test, on='eid')

    # The index is written on purpose: the bootstrap cell drops the resulting 'Unnamed: 0' column
    g_real_to_pred_match_dict[atlas].to_csv(
        f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/g_real_to_pred_match_{atlas}_test.csv'
    )

### Bootstrap DTI IDP and Parcellations PLSR

In [None]:
# Every DTI feature set: the IDP modalities first, then the structural connectomes
all_dti_modalities = [
    f"{measure}_{method}"
    for measure in ("fa", "md", "l1", "l2", "l3", "mo", "od", "icvf", "isovf")
    for method in ("tbss", "prob")
] + [
    f"{field_id}_connectome_{measure}"
    for field_id in range(31020, 31026)
    for measure in ("fa", "mean_length", "sift2", "streamline_count")
]

In [None]:
# Bootstrapping of the single-modality PLS models (one distribution per modality and metric)
boot = 5000
metrics = ['R2', 'r', 'MSE', 'MAE']
DTI_PLS_BOOTSTRAP_SEED = 42  # fixed seed so the bootstrap draws are reproducible on re-run
dti_pls_bootstrap_rng = np.random.default_rng(DTI_PLS_BOOTSTRAP_SEED)
bootstrap_results_dfs = {}

for modality in all_dti_modalities:
    pred_col = f'g_predicted_pls_{modality}'

    # The match file already holds eid, observed g and this modality's prediction
    # ('g predicted test'), joined on eid when it was written. Merging the
    # prediction file in a second time only duplicated that column.
    g_real_pls = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/g_real_to_pred_match_{modality}_test.csv')
    g = g_real_pls.drop(columns=['Unnamed: 0']).rename(columns={'g predicted test': pred_col})

    bootstrap_results = {metric: [] for metric in metrics}

    for i in range(boot):
        # Random sample with replacement, same size as the data set
        df_sample = g.sample(len(g), replace=True, random_state=dti_pls_bootstrap_rng)
        observed = df_sample['g_real_test']
        predicted = df_sample[pred_col]

        corr_pls, _ = pearsonr(observed, predicted)
        r2_pls = r2_score(observed, predicted)
        mse_pls = mean_squared_error(observed, predicted)
        mae_pls = mean_absolute_error(observed, predicted)

        bootstrap_results['r'].append(corr_pls)
        bootstrap_results['R2'].append(r2_pls)
        bootstrap_results['MSE'].append(mse_pls)
        bootstrap_results['MAE'].append(mae_pls)

    # Single-column DataFrames named <modality>_<metric>
    for key in bootstrap_results:
        bootstrap_results[key] = pd.DataFrame(bootstrap_results[key], columns=[f'{modality}_{key}'])

    bootstrap_results_dfs[modality] = bootstrap_results

with open('/PLS/brain/stacking/BOOTSTRAP/DTI_PLS_bootstrap_results_dfs.pkl', 'wb') as f:
    pickle.dump(bootstrap_results_dfs, f)

print("bootstrap_results_dfs dictionary was successfully saved as DTI_PLS_bootstrap_results_dfs.pkl")

If you need to reload the saved bootstrap results

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('/PLS/brain/stacking/BOOTSTRAP/DTI_PLS_bootstrap_results_dfs.pkl', 'rb') as f:
    bootstrap_results_dfs = pickle.load(f)

# One wide table per metric (one column per modality), built with a single
# concat per metric instead of growing a DataFrame inside the loop.
metric_dfs = {}
for metric in ['R2', 'r', 'MSE', 'MAE']:
    metric_frames = [
        results[metric]
        for results in bootstrap_results_dfs.values()
        if metric in results
    ]
    metric_dfs[metric] = pd.concat(metric_frames, axis=1) if metric_frames else pd.DataFrame()

Get dataframes for each metric

In [None]:
# One wide frame per metric: the single-column bootstrap frames of all modalities side by side
dti_r2, dti_r, dti_mse, dti_mae = (
    pd.concat(
        [per_metric[metric_key] for per_metric in bootstrap_results_dfs.values()],
        axis=1,
    )
    for metric_key in ('R2', 'r', 'MSE', 'MAE')
)

# NOTE(review): the next cell reads these four files back, so the saves below
# have to be enabled at least once on a fresh run.
#dti_r2.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/output/r2_dti_bootstrapped.csv', index=False)
#dti_r.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/output/r_dti_bootstrapped.csv', index=False)
#dti_mse.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/output/mse_dti_bootstrapped.csv', index=False)
#dti_mae.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/output/mae_dti_bootstrapped.csv', index=False)

In [None]:
# Reload the per-metric bootstrap tables from disk (read in the order r2, r, mse, mae)
dti_r2, dti_r, dti_mse, dti_mae = (
    pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/output/{metric_tag}_dti_bootstrapped.csv')
    for metric_tag in ('r2', 'r', 'mse', 'mae')
)

In [None]:
# Display labels for every DTI feature set, keyed by the raw modality name.
# IDP modalities: "<MEASURE> <method label>", e.g. 'fa_tbss' -> 'FA TBSS'.
base_rename_dict = {
    f'{measure}_{method}': f'{measure.upper()} {method_label}'
    for measure in ('fa', 'md', 'l1', 'l2', 'l3', 'mo', 'od', 'icvf', 'isovf')
    for method, method_label in (('tbss', 'TBSS'), ('prob', 'Probabilistic'))
}

# Structural connectomes: "<atlas label> Connectome <measure label>"
base_rename_dict.update({
    f'{field_id}_connectome_{measure}': f'{atlas_label} Connectome {measure_label}'
    for field_id, atlas_label in (
        ('31020', 'aparc a2009s MSA I'),
        ('31021', 'aparc MSA I'),
        ('31022', 'Glasser MSA I'),
        ('31023', 'Glasser MSA IV'),
        ('31024', 'Schaefer7n200p MSA I'),
        ('31025', 'Schaefer7n500p MSA IV'),
    )
    for measure, measure_label in (
        ('fa', 'FA'),
        ('mean_length', 'Mean Length'),
        ('sift2', 'SIFT2'),
        ('streamline_count', 'Streamline Count'),
    )
})

metrics = ['R2', 'r', 'MSE', 'MAE']



Rename columns

In [None]:
def rename_columns(df, base_rename_dict, metric):
    """Return ``df`` with '<key>_<metric>' columns renamed to their display labels.

    Every key of ``base_rename_dict`` is suffixed with ``_<metric>`` and mapped to
    the corresponding value. Columns without a matching entry keep their name,
    and the input frame itself is not modified.
    """
    suffix = f'_{metric}'
    column_labels = {}
    for raw_name, label in base_rename_dict.items():
        column_labels[f'{raw_name}{suffix}'] = label
    return df.rename(columns=column_labels)

In [None]:
# Swap the raw '<modality>_<metric>' column names for their display labels.
dti_r2 = rename_columns(dti_r2, base_rename_dict, 'R2')
dti_r = rename_columns(dti_r, base_rename_dict, 'r')
dti_mse = rename_columns(dti_mse, base_rename_dict, 'MSE')
# The MAE table must be derived from dti_mae. Passing dti_mse here (as before)
# replaced the MAE results with the already-renamed MSE table.
dti_mae = rename_columns(dti_mae, base_rename_dict, 'MAE')

In [None]:
# Save metrics
# One CSV per metric table; the index is not written.
names_dti = ['dti_r2', 'dti_r', 'dti_mse', 'dti_mae']
dataframes_dti = [dti_r2, dti_r, dti_mse, dti_mae]
for position in range(len(names_dti)):
    name = names_dti[position]
    df = dataframes_dti[position]
    df.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/output/{name}_bootstrapped_renamed.csv', index=False)

In [None]:
# Check dimensions
#31024_Schaefer7n1000p_Tian_S4
#31025_Schaefer7n200p_Tian_S1

# Get dimensions without loading full files
def count_lines(filepath):
    """Return the number of lines in ``filepath`` minus one for the header row.

    The file is streamed line by line, so it is never held in memory as a whole.
    """
    total = 0
    with open(filepath) as handle:
        for _ in handle:
            total += 1
    return total - 1  # first line is the header
                    
def count_columns(filepath):
    """Return the number of columns in the header row of a CSV file.

    Only the header record is read. It is parsed with the ``csv`` module so that
    commas inside quoted column names are not mistaken for separators. An empty
    file yields 0 instead of raising ``StopIteration``.
    """
    import csv  # local import so this helper works without touching the imports cell

    # newline='' is what the csv module expects for correct quoted-newline handling.
    with open(filepath, newline='') as f:
        header = next(csv.reader(f), None)
    return 0 if header is None else len(header)
                
# Header column count of the fold-0 FA test file for connectome 31024.
# NOTE(review): the variable name says Schaefer7n1000p / Tian S4, but the rename table
# earlier in this notebook labels 31024 as 'Schaefer7n200p MSA I' - confirm which is right.
Schaefer7n1000p_Tian_S4_dim = count_columns('/brainbody/brain/data/dwMRI/connectomes/31024_connectome_fa_test_corr_0.csv')
print(Schaefer7n1000p_Tian_S4_dim)
# Same check for connectome 31025.
# NOTE(review): the rename table labels 31025 as 'Schaefer7n500p MSA IV', not Schaefer7n200p / S1.
Schaefer7n200p_Tian_S1_dim = count_columns('/brainbody/brain/data/dwMRI/connectomes/31025_connectome_fa_test_corr_0.csv')
print(Schaefer7n200p_Tian_S1_dim)

In [None]:
# Compute CIs
# Summarise every bootstrap distribution (one column per modality, one file per
# metric) as mean / median / SD plus a percentile confidence interval.
# ===== Configuration =====
metrics = ['r2', 'r', 'mse', 'mae']  # lower-case stems used in the CSV file names
ci_level = 0.95
modality_name = 'dti'
base_path = f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/{modality_name}/output'
output_path = '/rev1-analysis'
column_order = ['Modality', 'Metric', 'Mean', 'Median', 'Std', 'CI_lower', 'CI_upper']
# Derived from ci_level so the printed label cannot drift from the configured level.
ci_label = f"{ci_level * 100:g}% CI"

# ===== Process All Metrics =====
ci_results = []

for metric in metrics:
    # Load bootstrap results; a missing file is reported and skipped.
    try:
        df = pd.read_csv(f'{base_path}/{modality_name}_{metric}_bootstrapped_renamed.csv')
    except FileNotFoundError:
        print(f"Warning: File not found for {modality_name}_{metric}")
        continue

    # Drop Unnamed columns and any index columns
    df = df.drop(columns=df.columns[df.columns.str.contains('^Unnamed|^index$', case=False)], errors='ignore')

    # Compute CIs for each modality (column)
    for modality in df.columns:
        # Skip if column is empty or all NA
        if df[modality].isna().all():
            print(f"Skipping empty modality: {modality}")
            continue

        # Calculate statistics
        values = df[modality].values
        lower, upper = get_bootstrap_ci(values, ci_level)

        ci_results.append({
            'Modality': modality,
            'Metric': metric.upper(),  # 'R2', 'R', 'MSE', 'MAE' (Pearson r is stored as 'R')
            'Mean': np.mean(values).round(3),
            'Median': np.median(values).round(3),
            'Std': np.std(values).round(3),
            'CI_lower': lower,
            'CI_upper': upper
        })

# Stop before writing anything if no input could be read; otherwise empty output
# files would be produced and the column selection below would fail with a KeyError.
if not ci_results:
    raise RuntimeError(f"No bootstrap results could be loaded from {base_path}")

# Convert to DataFrame with the columns already in their final order, then save.
ci_df = pd.DataFrame(ci_results, columns=column_order)
save_dataframe_to_files(ci_df, output_path, f'bootstrap_{modality_name}_pls_CI')

print("=== Confidence Intervals ===")
print(ci_df)
print(f"\nResults saved to: {output_path}")

# Print formatted summary
for modality_label, group in ci_df.groupby('Modality'):
    print(f"\n=== {modality_label} ===")
    for _, metric_group in group.groupby('Metric'):
        row = metric_group.iloc[0]  # Get first row for this metric
        print(f"{row['Metric']}:")
        print(f"  Mean ± SD: {row['Mean']} ± {row['Std']}")
        print(f"  Median: {row['Median']}")
        print(f"  {ci_label}: [{row['CI_lower']}, {row['CI_upper']}]")

# RS

## Bootstrap RS IDP + Parcellations Stacked

In [None]:
# Concat 5 folds
# For every second-level algorithm, stack the five per-fold prediction files into
# one train table and one test table and write them out.
algorithms = ['svr', 'eNet', 'xgb', 'rf']
folds = ["0", "1", "2", "3", "4"]

g_five_folds_train = {}
g_five_folds_test = {}

for algorithm in algorithms:
    train = [
        pd.read_csv(f'/PLS/brain/stacking/g/rs_idp_ts_new/RS_IDP_Timeseries_best_metrics_target_pred_2nd_level_{algorithm}_train_fold_{fold}.csv')
        for fold in folds
    ]
    test = [
        pd.read_csv(f'/PLS/brain/stacking/g/rs_idp_ts_new/RS_IDP_Timeseries_best_metrics_target_pred_2nd_level_{algorithm}_test_fold_{fold}.csv')
        for fold in folds
    ]

    # Row-wise stack with a fresh 0..n-1 index.
    stacked_train = pd.concat(train, axis=0, ignore_index=True)
    stacked_test = pd.concat(test, axis=0, ignore_index=True)
    g_five_folds_train[algorithm] = stacked_train
    g_five_folds_test[algorithm] = stacked_test

    stacked_train.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/rs_idp_ts/RS_IDP_Timeseries_best_metrics_target_pred_{algorithm}_train.csv', index=False)
    stacked_test.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/rs_idp_ts/RS_IDP_Timeseries_best_metrics_target_pred_{algorithm}_test.csv', index=False)

    print(f'Shape of the train set in {algorithm} is', stacked_train.shape)
    print(f'Shape of the test set in {algorithm} is', stacked_test.shape)

In [None]:
# Match real to predicted g
# Keep only the observed g values whose eid also appears in the stacked (SVR) test predictions.
g_five_folds_real_test = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/rs_idp_ts/target_real_test.csv')
g_five_folds_pred_test = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/rs_idp_ts/RS_IDP_Timeseries_best_metrics_target_pred_svr_test.csv')
# Use the frame loaded two lines above. The previous code referenced
# g_five_folds_test_real, which this cell never defines.
g_real_to_pred_match = pd.DataFrame(g_five_folds_real_test.merge(g_five_folds_pred_test['eid'], on='eid'), columns = ['g_real_test', 'eid'])
# The index is written as well; the bootstrap cell drops it again as 'Unnamed: 0'.
g_real_to_pred_match.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/rs_idp_ts/g_real_to_pred_match_rs_ts_idp_test.csv')
print('Shape of g real test matched to g pred is', g_real_to_pred_match.shape)

In [None]:
# Bootstrapping
# For each second-level algorithm, resample the test participants with replacement
# `boot` times and record R2, Pearson r, MSE and MAE of predicted vs. observed g.
boot = 5000
bootstrap_seed = 42  # fixed seed so the resamples (and the CIs built from them) are reproducible
rng = np.random.default_rng(bootstrap_seed)
metric_names = ['R2', 'r', 'MSE', 'MAE']  # column order of each per-algorithm table

g_real = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/rs_idp_ts/g_real_to_pred_match_rs_ts_idp_test.csv')
preds=[]  # never filled in this cell; kept only in case another cell refers to it

bootstrap_results_dfs = {}  # algorithm -> {metric -> one-column DataFrame}
bootstrap_tables = {}       # algorithm -> DataFrame with one column per metric

for algorithm in algorithms:
    g_pred = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/rs_idp_ts/RS_IDP_Timeseries_best_metrics_target_pred_{algorithm}_test.csv')

    # Observed and predicted values are paired by row position below, so both
    # tables must list the same participants in the same order.
    if 'eid' in g_real.columns and 'eid' in g_pred.columns:
        if not np.array_equal(g_real['eid'].values, g_pred['eid'].values):
            raise ValueError(f"eid order differs between observed g and the {algorithm} predictions; rows cannot be paired by position")

    df = pd.concat([g_real, g_pred], axis=1).drop(columns=['eid', 'Unnamed: 0'])
    pred_col = f'g_pred_rs_ts_idp_{algorithm}_test'
    df.columns = ['g_real_test', pred_col]

    # Fresh accumulator for each algorithm.
    bootstrap_results = {name: [] for name in metric_names}

    for _ in range(boot):
        # Sample as many rows as the data set has, with replacement. The shared
        # generator advances on every call, so each resample is different.
        df_sample = df.sample(len(df), replace=True, random_state=rng)
        y_true = df_sample['g_real_test']
        y_pred = df_sample[pred_col]

        corr, _p_value = pearsonr(y_true, y_pred)
        bootstrap_results['r'].append(corr)
        bootstrap_results['R2'].append(r2_score(y_true, y_pred))
        bootstrap_results['MSE'].append(mean_squared_error(y_true, y_pred))
        bootstrap_results['MAE'].append(mean_absolute_error(y_true, y_pred))

    for key in metric_names:
        bootstrap_results[key] = pd.DataFrame(bootstrap_results[key], columns=[f'{algorithm}_{key}'])

    bootstrap_results_dfs[algorithm] = bootstrap_results
    # Side-by-side table in R2, r, MSE, MAE order (replaces four copy-pasted concat blocks).
    bootstrap_tables[algorithm] = pd.concat([bootstrap_results[key] for key in metric_names], axis=1)

bootstrap_svr = bootstrap_tables['svr']
bootstrap_enet = bootstrap_tables['eNet']
bootstrap_rf = bootstrap_tables['rf']
bootstrap_xgb = bootstrap_tables['xgb']

In [None]:
# Combine algorithms
# Place the per-algorithm bootstrap tables next to each other (svr, eNet, rf, xgb)
# and store the combined table without its index.
per_algorithm_tables = [bootstrap_svr, bootstrap_enet, bootstrap_rf, bootstrap_xgb]
bootstrap_fullres = pd.concat(per_algorithm_tables, axis=1)

bootstrap_fullres.to_csv('/PLS/brain/stacking/BOOTSTRAP/rs_idp_ts/bootstrap_rs_ts_idp.csv', index=False)

In [None]:
# Extract metrics
# Split the combined bootstrap table into one frame per metric, with the
# algorithm columns relabelled SVR / eNet / RF / XGB.
bootstrap_fullres_rs_stacked_rf = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/rs_idp_ts/bootstrap_rs_ts_idp.csv')

bootstrap_rs_R2 = bootstrap_fullres_rs_stacked_rf[['svr_R2', 'eNet_R2', 'rf_R2', 'xgb_R2']].rename(
    columns={'svr_R2': 'SVR', 'eNet_R2': 'eNet', 'rf_R2': 'RF', 'xgb_R2': 'XGB'})

bootstrap_rs_r = bootstrap_fullres_rs_stacked_rf[['svr_r', 'eNet_r', 'rf_r', 'xgb_r']].rename(
    columns={'svr_r': 'SVR', 'eNet_r': 'eNet', 'rf_r': 'RF', 'xgb_r': 'XGB'})

bootstrap_rs_mse = bootstrap_fullres_rs_stacked_rf[['svr_MSE', 'eNet_MSE', 'rf_MSE', 'xgb_MSE']].rename(
    columns={'svr_MSE': 'SVR', 'eNet_MSE': 'eNet', 'rf_MSE': 'RF', 'xgb_MSE': 'XGB'})

bootstrap_rs_mae = bootstrap_fullres_rs_stacked_rf[['svr_MAE', 'eNet_MAE', 'rf_MAE', 'xgb_MAE']].rename(
    columns={'svr_MAE': 'SVR', 'eNet_MAE': 'eNet', 'rf_MAE': 'RF', 'xgb_MAE': 'XGB'})

# Get best algorithm
# The RF column is the one carried forward for r and R2.
bootstrap_rs_stacked_rf_r = bootstrap_rs_r['RF']
bootstrap_rs_stacked_rf_R2 = bootstrap_rs_R2['RF']

## Bootstrap RS PLSR

### Bootstrap RS IDP PLSR: Prepare files

In [None]:
# Concat 5 folds
# Stack the per-fold PLS predictions of each resting-state IDP feature set
# (amplitudes first, then tangent matrices) and write one train/test CSV per set.
amplitudes = ['amplitudes_21','amplitudes_55']

tangent = ['tangent_matrices_21', 'tangent_matrices_55']

folds = ["0", "1", "2", "3", "4"]

# Amplitudes
g_five_folds_train = {}
g_five_folds_test = {}

for a in amplitudes:
    train = [
        pd.read_csv(f'/PLS/brain/rs/ica_main/fold_{fold}/g_pred/{a}_g_pred_train_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]
    test = [
        pd.read_csv(f'/PLS/brain/rs/ica_main/fold_{fold}/g_pred/{a}_g_pred_test_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]

    stacked_train = pd.concat(train, axis=0, ignore_index=True)
    stacked_test = pd.concat(test, axis=0, ignore_index=True)
    g_five_folds_train[a] = stacked_train
    g_five_folds_test[a] = stacked_test

    stacked_train.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/target_pred_{a}_train.csv', index=False)
    stacked_test.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/target_pred_{a}_test.csv', index=False)

    print(f'Shape of the train set in {a} is', stacked_train.shape)
    print(f'Shape of the test set in {a} is', stacked_test.shape)


# Tangent
# The dictionaries are started afresh, so after this cell they hold the tangent sets only.
g_five_folds_train = {}
g_five_folds_test = {}
for tg in tangent:
    train = [
        pd.read_csv(f'/PLS/brain/rs/ica_tangent/fold_{fold}/g_pred/{tg}_g_pred_train_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]
    test = [
        pd.read_csv(f'/PLS/brain/rs/ica_tangent/fold_{fold}/g_pred/{tg}_g_pred_test_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]

    stacked_train = pd.concat(train, axis=0, ignore_index=True)
    stacked_test = pd.concat(test, axis=0, ignore_index=True)
    g_five_folds_train[tg] = stacked_train
    g_five_folds_test[tg] = stacked_test

    stacked_train.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/target_pred_{tg}_train.csv', index=False)
    stacked_test.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/target_pred_{tg}_test.csv', index=False)

    print(f'Shape of the train set in {tg} is', stacked_train.shape)
    print(f'Shape of the test set in {tg} is', stacked_test.shape)

In [None]:
# Match real to predicted g for each modality
# Join the observed g values with each feature set's stacked test predictions on eid.
#
# All paths now point at the directory the fold-concatenation cell writes to and the
# bootstrap cell reads from. Previously the amplitudes branch used a different root
# ('/PLS/brain/stacking_all_mod/all/four_alg/BOOTSTRAP') and the tangent branch read
# predictions from 'pls_vs_stack/' without the 'rs/' sub-folder.
rs_pls_dir = '/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs'

g_real_to_pred_match_dict = {}
g_five_folds_test_real = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/target_real_test.csv')

# Amplitudes first, then tangent matrices; one loop serves both groups.
for modality in amplitudes + tangent:
    g_five_folds_pred_test = pd.read_csv(f'{rs_pls_dir}/target_pred_{modality}_test.csv')
    g_real_to_pred_match_dict[modality] = g_five_folds_test_real.merge(g_five_folds_pred_test, on='eid')
    # The index is written too; the bootstrap cell drops it again as 'Unnamed: 0'.
    g_real_to_pred_match_dict[modality].to_csv(f'{rs_pls_dir}/g_real_to_pred_match_rs_{modality}_test.csv')

### Bootstrap RS Parcellations PLSR: Prepare files

In [None]:
# Concat 5 folds
# Each atlas is paired, by position, with the folder its per-fold predictions live in.
folds = ["0", "1", "2", "3", "4"]
folders = ['aparc', 'aparc_2009', 'glasser', 'glasser', 'shaefer_7n100_200', 'shaefer_7n500_600']
atlases = [
    'aparc_Tian_s1_arrays_full_correlation',
    'aparc_2009_Tian_s1_arrays_full_correlation',
    'glasser_Tian_s1_arrays_full_correlation',
    'glasser_Tian_s4_arrays_full_correlation',
    'Schaefer7n200p_tian_s1_arrays_full_correlation',
    'Schaefer7n500p_s4_full_correlation',
]


g_five_folds_train = {}
g_five_folds_test = {}

for folder, atlas in zip(folders, atlases):
    train = [
        pd.read_csv(f'/PLS/brain/rs/parcellation_main/{folder}/fold_{fold}/g_pred/{atlas}_g_pred_train_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]
    test = [
        pd.read_csv(f'/PLS/brain/rs/parcellation_main/{folder}/fold_{fold}/g_pred/{atlas}_g_pred_test_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]

    # Row-wise stack of the five folds with a fresh index.
    stacked_train = pd.concat(train, axis=0, ignore_index=True)
    stacked_test = pd.concat(test, axis=0, ignore_index=True)
    g_five_folds_train[atlas] = stacked_train
    g_five_folds_test[atlas] = stacked_test

    stacked_train.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/target_pred_{atlas}_train.csv', index=False)
    stacked_test.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/target_pred_{atlas}_test.csv', index=False)

    print(f'Shape of the train set in {atlas} is', stacked_train.shape)
    print(f'Shape of the test set in {atlas} is', stacked_test.shape)

In [None]:
# Match real to predicted g for each modality
# Join observed g with each atlas's stacked test predictions on eid and save the result.
g_five_folds_test_real = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/target_real_test.csv')
g_real_to_pred_match_dict = {}

for atlas in atlases:
    g_five_folds_pred_test = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/target_pred_{atlas}_test.csv')
    matched = g_five_folds_test_real.merge(g_five_folds_pred_test, on='eid')
    g_real_to_pred_match_dict[atlas] = matched
    # No index=False here, so the row index is stored as the first CSV column.
    matched.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/g_real_to_pred_match_rs_{atlas}_test.csv')

### Bootstrap RS IDP and Parcellations PLSR

In [None]:
# Every resting-state feature set that is bootstrapped as a single PLS model.
all_rs_modalities = [
    # Parcellation-based full-correlation matrices
    'aparc_Tian_s1_arrays_full_correlation',
    'aparc_2009_Tian_s1_arrays_full_correlation',
    'glasser_Tian_s1_arrays_full_correlation',
    'glasser_Tian_s4_arrays_full_correlation',
    'Schaefer7n200p_tian_s1_arrays_full_correlation',
    'Schaefer7n500p_s4_full_correlation',
    # Amplitudes
    'amplitudes_21',
    'amplitudes_55',
    # Tangent matrices
    'tangent_matrices_21',
    'tangent_matrices_55',
]

In [None]:
# Bootstrapping
# For each resting-state feature set, resample the matched test participants with
# replacement `boot` times and record R2, Pearson r, MSE and MAE of the PLS predictions.
boot = 5000
bootstrap_seed = 42  # fixed seed so the resamples (and the CIs built from them) are reproducible
rng = np.random.default_rng(bootstrap_seed)
metrics = ['R2', 'r', 'MSE', 'MAE']
rs_pls_pickle_path = '/PLS/brain/stacking/BOOTSTRAP/RS_PLS_bootstrap_results_dfs.pkl'

bootstrap_results_dfs = {}  # modality -> {metric -> one-column DataFrame}

for modality in all_rs_modalities:
    g_real_pls = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/g_real_to_pred_match_rs_{modality}_test.csv')
    g_pred_pls = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/target_pred_{modality}_test.csv')
    # Both tables carry a 'g predicted test' column, so the merge suffixes them
    # with _x/_y; the _x copy and the saved row index are discarded.
    g = g_real_pls.merge(g_pred_pls, on = 'eid').drop(columns=['Unnamed: 0', 'g predicted test_x'])
    pred_col = f'g_predicted_pls_{modality}'
    g = g.rename(columns={'g predicted test_y': pred_col})

    bootstrap_results = {metric: [] for metric in metrics}

    for _ in range(boot):
        # Same size as the data set, drawn with replacement. The shared generator
        # advances on every call, so each resample is different.
        df_sample = g.sample(len(g), replace=True, random_state=rng)
        y_true = df_sample['g_real_test']
        y_pred = df_sample[pred_col]

        corr_pls, _p_value = pearsonr(y_true, y_pred)
        bootstrap_results['r'].append(corr_pls)
        bootstrap_results['R2'].append(r2_score(y_true, y_pred))
        bootstrap_results['MSE'].append(mean_squared_error(y_true, y_pred))
        bootstrap_results['MAE'].append(mean_absolute_error(y_true, y_pred))

    for key in bootstrap_results:
        bootstrap_results[key] = pd.DataFrame(bootstrap_results[key], columns=[f'{modality}_{key}'])

    bootstrap_results_dfs[modality] = bootstrap_results

with open(rs_pls_pickle_path, 'wb') as f:
    pickle.dump(bootstrap_results_dfs, f)

# Report the file that was actually written (the old message named a different file).
print(f"bootstrap_results_dfs dictionary was successfully saved as {rs_pls_pickle_path}")

If you need to load the previously saved bootstrap results

In [None]:
# Reload the bootstrap results written by the bootstrapping cell.
# `pickle` is already imported in the notebook's imports cell.
# NOTE: unpickling can execute arbitrary code, so only load pickle files produced by this notebook.
with open('/PLS/brain/stacking/BOOTSTRAP/RS_PLS_bootstrap_results_dfs.pkl', 'rb') as f:
    bootstrap_results_dfs = pickle.load(f)

# Collect the one-column frames per metric first and concatenate once, instead of
# repeatedly concatenating onto an (initially empty) DataFrame inside the loop.
frames_by_metric = {metric: [] for metric in ['R2', 'r', 'MSE', 'MAE']}
for modality, results in bootstrap_results_dfs.items():
    for metric, df in results.items():
        if metric in frames_by_metric:
            frames_by_metric[metric].append(df)

# metric -> table with one column per modality (empty table if nothing was collected)
metric_dfs = {
    metric: pd.concat(frames, axis=1) if frames else pd.DataFrame()
    for metric, frames in frames_by_metric.items()
}

Get dataframes for each metric

In [None]:
# One table per metric, with one column per resting-state modality.
rs_r2, rs_r, rs_mse, rs_mae = (
    pd.concat([per_modality[metric_key] for per_modality in bootstrap_results_dfs.values()], axis=1)
    for metric_key in ('R2', 'r', 'MSE', 'MAE')
)

# NOTE(review): the exports below are commented out, yet the following cell reads
# exactly these files - re-enable them if the CSVs do not exist yet.
#rs_r2.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/output/r2_rs_bootstrapped.csv', index=False)
#rs_r.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/output/r_rs_bootstrapped.csv', index=False)
#rs_mse.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/output/mse_rs_bootstrapped.csv', index=False)
#rs_mae.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/output/mae_rs_bootstrapped.csv', index=False)

In [None]:
# Load the stored per-metric bootstrap tables (R2, r, MSE, MAE in that order).
rs_r2, rs_r, rs_mse, rs_mae = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/output/r2_rs_bootstrapped.csv',
        '/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/output/r_rs_bootstrapped.csv',
        '/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/output/mse_rs_bootstrapped.csv',
        '/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/output/mae_rs_bootstrapped.csv',
    )
]

In [None]:
# Metric suffixes found in the bootstrap column names.
metrics = ['R2', 'r', 'MSE', 'MAE']

# Raw resting-state modality name -> label used in the renamed tables.
base_rename_dict = {
    # Parcellation-based full-correlation matrices
    'aparc_Tian_s1_arrays_full_correlation': 'aparc MSA I Full correlation',
    'aparc_2009_Tian_s1_arrays_full_correlation': 'aparc a2009s MSA I Full correlation',
    'glasser_Tian_s1_arrays_full_correlation': 'Glasser MSA I Full correlation',
    'glasser_Tian_s4_arrays_full_correlation': 'Glasser MSA IV Full correlation',
    'Schaefer7n200p_tian_s1_arrays_full_correlation': 'Schaefer7n200p MSA I Full correlation',
    'Schaefer7n500p_s4_full_correlation': 'Schaefer7n500p MSA IV Full correlation',
    # Amplitudes
    'amplitudes_21': 'Amplitudes 21 IC',
    'amplitudes_55': 'Amplitudes 55 IC',
    # Tangent matrices
    'tangent_matrices_21': 'Tangent matrices 21 IC',
    'tangent_matrices_55': 'Tangent matrices 55 IC',
}

Rename columns

In [None]:
def rename_columns(df, base_rename_dict, metric):
    """Return ``df`` with '<key>_<metric>' columns renamed to their display labels.

    NOTE: this has the same body as the ``rename_columns`` defined in the DTI
    section of this notebook; whichever cell runs last provides the active definition.

    Every key of ``base_rename_dict`` is suffixed with ``_<metric>`` and mapped to
    the corresponding value. Columns without a matching entry keep their name,
    and the input frame itself is not modified.
    """
    suffix = f'_{metric}'
    column_labels = {}
    for raw_name, label in base_rename_dict.items():
        column_labels[f'{raw_name}{suffix}'] = label
    return df.rename(columns=column_labels)

In [None]:
# Swap the raw '<modality>_<metric>' column names for their display labels,
# pairing each table with the suffix its columns carry.
rs_r2, rs_r, rs_mse, rs_mae = (
    rename_columns(table, base_rename_dict, suffix)
    for table, suffix in ((rs_r2, 'R2'), (rs_r, 'r'), (rs_mse, 'MSE'), (rs_mae, 'MAE'))
)

In [None]:
# Save metrics
# One CSV per metric table; the index is not written.
names_rs = ['rs_r2', 'rs_r', 'rs_mse', 'rs_mae']
dataframes_rs = [rs_r2, rs_r, rs_mse, rs_mae]
for position in range(len(names_rs)):
    name = names_rs[position]
    df = dataframes_rs[position]
    df.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/output/{name}_bootstrapped_renamed.csv', index=False)

In [None]:
# Compute CIs
# Summarise every bootstrap distribution (one column per modality, one file per
# metric) as mean / median / SD plus a percentile confidence interval.
# ===== Configuration =====
metrics = ['r2', 'r', 'mse', 'mae']  # lower-case stems used in the CSV file names
ci_level = 0.95
modality_name = 'rs'
base_path = f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/{modality_name}/output'
output_path = '/rev1-analysis'
column_order = ['Modality', 'Metric', 'Mean', 'Median', 'Std', 'CI_lower', 'CI_upper']
# Derived from ci_level so the printed label cannot drift from the configured level.
ci_label = f"{ci_level * 100:g}% CI"

# ===== Process All Metrics =====
ci_results = []

for metric in metrics:
    # Load bootstrap results; a missing file is reported and skipped.
    try:
        df = pd.read_csv(f'{base_path}/{modality_name}_{metric}_bootstrapped_renamed.csv')
    except FileNotFoundError:
        print(f"Warning: File not found for {modality_name}_{metric}")
        continue

    # Drop Unnamed columns and any index columns
    df = df.drop(columns=df.columns[df.columns.str.contains('^Unnamed|^index$', case=False)], errors='ignore')

    # Compute CIs for each modality (column)
    for modality in df.columns:
        # Skip if column is empty or all NA
        if df[modality].isna().all():
            print(f"Skipping empty modality: {modality}")
            continue

        # Calculate statistics
        values = df[modality].values
        lower, upper = get_bootstrap_ci(values, ci_level)

        ci_results.append({
            'Modality': modality,
            'Metric': metric.upper(),  # 'R2', 'R', 'MSE', 'MAE' (Pearson r is stored as 'R')
            'Mean': np.mean(values).round(3),
            'Median': np.median(values).round(3),
            'Std': np.std(values).round(3),
            'CI_lower': lower,
            'CI_upper': upper
        })

# Stop before writing anything if no input could be read; otherwise empty output
# files would be produced and the column selection below would fail with a KeyError.
if not ci_results:
    raise RuntimeError(f"No bootstrap results could be loaded from {base_path}")

# Convert to DataFrame with the columns already in their final order, then save.
ci_df = pd.DataFrame(ci_results, columns=column_order)
save_dataframe_to_files(ci_df, output_path, f'bootstrap_{modality_name}_pls_CI')

print("=== Confidence Intervals ===")
print(ci_df)
print(f"\nResults saved to: {output_path}")

# Print formatted summary
for modality_label, group in ci_df.groupby('Modality'):
    print(f"\n=== {modality_label} ===")
    for _, metric_group in group.groupby('Metric'):
        row = metric_group.iloc[0]  # Get first row for this metric
        print(f"{row['Metric']}:")
        print(f"  Mean ± SD: {row['Mean']} ± {row['Std']}")
        print(f"  Median: {row['Median']}")
        print(f"  {ci_label}: [{row['CI_lower']}, {row['CI_upper']}]")

# T1/T2

## Bootstrap T1w/T2w Stacked

In [None]:
# For every second-level algorithm, stack the five per-fold T1/T2 whole-brain
# prediction files into one train table and one test table and write them out.
algorithms = ['svr', 'eNet', 'xgb', 'rf']
folds = ["0", "1", "2", "3", "4"]

g_five_folds_train = {}
g_five_folds_test = {}

for algorithm in algorithms:
    train = [
        pd.read_csv(f'/PLS/brain/stacking/g/T1_T2_whole_brain_target_pred_2nd_level_{algorithm}_train_fold_{fold}.csv')
        for fold in folds
    ]
    test = [
        pd.read_csv(f'/PLS/brain/stacking/g/T1_T2_whole_brain_target_pred_2nd_level_{algorithm}_test_fold_{fold}.csv')
        for fold in folds
    ]

    # Row-wise stack with a fresh 0..n-1 index.
    stacked_train = pd.concat(train, axis=0, ignore_index=True)
    stacked_test = pd.concat(test, axis=0, ignore_index=True)
    g_five_folds_train[algorithm] = stacked_train
    g_five_folds_test[algorithm] = stacked_test

    stacked_train.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/t1_t2/T1_T2_whole_brain_target_pred_{algorithm}_train.csv', index=False)
    stacked_test.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/t1_t2/T1_T2_whole_brain_target_pred_{algorithm}_test.csv', index=False)

    print(f'Shape of the train set in {algorithm} is', stacked_train.shape)
    print(f'Shape of the test set in {algorithm} is', stacked_test.shape)

In [None]:
# Match real to predicted g (SVR)
# Keep only the observed g values whose eid also appears in the SVR test predictions.
g_five_folds_real_test = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/target_real_test.csv')
g_five_folds_pred_test = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/t1_t2/T1_T2_whole_brain_target_pred_svr_test.csv')

matched_on_eid = g_five_folds_real_test.merge(g_five_folds_pred_test['eid'], on='eid')
g_real_to_pred_match = pd.DataFrame(matched_on_eid, columns = ['g_real_test', 'eid'])

# The index is written too; the bootstrap cell drops it again as 'Unnamed: 0'.
g_real_to_pred_match.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/t1_t2/g_real_to_pred_match_t1_t2_test.csv')
print('Shape of g real test matched to g pred is', g_real_to_pred_match.shape)

In [None]:
# Bootstrapping
# For each second-level algorithm, resample the test participants with replacement
# `boot` times and record R2, Pearson r, MSE and MAE of predicted vs. observed g.
boot = 5000
bootstrap_seed = 42  # fixed seed so the resamples (and the CIs built from them) are reproducible
rng = np.random.default_rng(bootstrap_seed)
metric_names = ['R2', 'r', 'MSE', 'MAE']  # column order of each per-algorithm table
t1_t2_pickle_path = '/PLS/brain/stacking/BOOTSTRAP/t1_t2/t1_t2_bootstrap_results_dfs.pkl'

g_real = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/t1_t2/g_real_to_pred_match_t1_t2_test.csv')
preds=[]  # never filled in this cell; kept only in case another cell refers to it

bootstrap_results_dfs = {}  # algorithm -> {metric -> one-column DataFrame}
bootstrap_tables = {}       # algorithm -> DataFrame with one column per metric

for algorithm in algorithms:
    g_pred = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/t1_t2/T1_T2_whole_brain_target_pred_{algorithm}_test.csv')

    # Observed and predicted values are paired by row position below, so both
    # tables must list the same participants in the same order.
    if 'eid' in g_real.columns and 'eid' in g_pred.columns:
        if not np.array_equal(g_real['eid'].values, g_pred['eid'].values):
            raise ValueError(f"eid order differs between observed g and the {algorithm} predictions; rows cannot be paired by position")

    df = pd.concat([g_real, g_pred], axis=1).drop(columns=['eid', 'Unnamed: 0'])
    pred_col = f'g_pred_t1_t2_{algorithm}_test'
    df.columns = ['g_real_test', pred_col]

    # Fresh accumulator for each algorithm.
    bootstrap_results = {name: [] for name in metric_names}

    for _ in range(boot):
        # Same size as the data set, drawn with replacement. The shared generator
        # advances on every call, so each resample is different.
        df_sample = df.sample(len(df), replace=True, random_state=rng)
        y_true = df_sample['g_real_test']
        y_pred = df_sample[pred_col]

        corr, _p_value = pearsonr(y_true, y_pred)
        bootstrap_results['r'].append(corr)
        bootstrap_results['R2'].append(r2_score(y_true, y_pred))
        bootstrap_results['MSE'].append(mean_squared_error(y_true, y_pred))
        bootstrap_results['MAE'].append(mean_absolute_error(y_true, y_pred))

    for key in metric_names:
        bootstrap_results[key] = pd.DataFrame(bootstrap_results[key], columns=[f'{algorithm}_{key}'])

    bootstrap_results_dfs[algorithm] = bootstrap_results
    # Side-by-side table in R2, r, MSE, MAE order (replaces four copy-pasted concat blocks).
    bootstrap_tables[algorithm] = pd.concat([bootstrap_results[key] for key in metric_names], axis=1)

with open(t1_t2_pickle_path, 'wb') as f:
    pickle.dump(bootstrap_results_dfs, f)

# Report the file that was actually written (the old message named a different file).
print(f"bootstrap_results_dfs dictionary was successfully saved as {t1_t2_pickle_path}")

bootstrap_svr = bootstrap_tables['svr']
bootstrap_enet = bootstrap_tables['eNet']
bootstrap_rf = bootstrap_tables['rf']
bootstrap_xgb = bootstrap_tables['xgb']

In [None]:
# Combine algorithms
# Place the per-algorithm bootstrap tables next to each other (svr, eNet, rf, xgb)
# and store the combined table without its index.
per_algorithm_tables = [bootstrap_svr, bootstrap_enet, bootstrap_rf, bootstrap_xgb]
bootstrap_fullres = pd.concat(per_algorithm_tables, axis=1)

bootstrap_fullres.to_csv('/PLS/brain/stacking/BOOTSTRAP/t1_t2/bootstrap_t1_t2.csv', index=False)

In [None]:
# Extract metrics
# Split the combined bootstrap table into one frame per metric, with the
# algorithm columns relabelled SVR / eNet / RF / XGB.
bootstrap_fullres_t1t2_stacked_svr = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/t1_t2/bootstrap_t1_t2.csv')

bootstrap_t1t2_R2 = bootstrap_fullres_t1t2_stacked_svr[['svr_R2', 'eNet_R2', 'rf_R2', 'xgb_R2']].rename(
    columns={'svr_R2': 'SVR', 'eNet_R2': 'eNet', 'rf_R2': 'RF', 'xgb_R2': 'XGB'})

bootstrap_t1t2_r = bootstrap_fullres_t1t2_stacked_svr[['svr_r', 'eNet_r', 'rf_r', 'xgb_r']].rename(
    columns={'svr_r': 'SVR', 'eNet_r': 'eNet', 'rf_r': 'RF', 'xgb_r': 'XGB'})

bootstrap_t1t2_mse = bootstrap_fullres_t1t2_stacked_svr[['svr_MSE', 'eNet_MSE', 'rf_MSE', 'xgb_MSE']].rename(
    columns={'svr_MSE': 'SVR', 'eNet_MSE': 'eNet', 'rf_MSE': 'RF', 'xgb_MSE': 'XGB'})

bootstrap_t1t2_mae = bootstrap_fullres_t1t2_stacked_svr[['svr_MAE', 'eNet_MAE', 'rf_MAE', 'xgb_MAE']].rename(
    columns={'svr_MAE': 'SVR', 'eNet_MAE': 'eNet', 'rf_MAE': 'RF', 'xgb_MAE': 'XGB'})

# Get best algorithm
# The SVR column is the one carried forward for r and R2.
bootstrap_t1t2_stacked_svr_r = bootstrap_t1t2_r['SVR']
bootstrap_t1t2_stacked_svr_R2 = bootstrap_t1t2_R2['SVR']

## Bootstrap T1w/T2w PLSR

### Bootstrap T1w PLSR: Prepare files

In [None]:
# T1-weighted feature sets; each name is the file-name prefix of its per-fold prediction files.
t1_modalities = [
    'struct_fast',
    'struct_sub_first',
    # aseg
    'struct_aseg_mean_intensity',
    'struct_aseg_volume',
    # ba_exvivo
    'struct_ba_exvivo_area',
    'struct_ba_exvivo_mean_thickness',
    'struct_ba_exvivo_volume',
    # a2009s
    'struct_a2009s_area',
    'struct_a2009s_mean_thickness',
    'struct_a2009s_volume',
    # dkt
    'struct_dkt_area',
    'struct_dkt_mean_thickness',
    'struct_dkt_volume',
    # desikan
    'struct_desikan_gw',
    'struct_desikan_pial',
    'struct_desikan_white_area',
    'struct_desikan_white_mean_thickness',
    'struct_desikan_white_volume',
    'struct_subsegmentation',
]

In [None]:
# Concat 5 folds
# Stack the per-fold PLS predictions of every T1 feature set and write one
# train CSV and one test CSV per set. Relies on `folds` from an earlier cell.
g_five_folds_train = {}
g_five_folds_test = {}

for modality in t1_modalities:
    train = [
        pd.read_csv(f'/PLS/brain/t1/fold_{fold}/g_pred/{modality}_g_pred_train_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]
    test = [
        pd.read_csv(f'/PLS/brain/t1/fold_{fold}/g_pred/{modality}_g_pred_test_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]

    # Row-wise stack with a fresh 0..n-1 index.
    stacked_train = pd.concat(train, axis=0, ignore_index=True)
    stacked_test = pd.concat(test, axis=0, ignore_index=True)
    g_five_folds_train[modality] = stacked_train
    g_five_folds_test[modality] = stacked_test

    stacked_train.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/target_pred_{modality}_train.csv', index=False)
    stacked_test.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/target_pred_{modality}_test.csv', index=False)

    print(f'Shape of the train set in {modality} is', stacked_train.shape)
    print(f'Shape of the test set in {modality} is', stacked_test.shape)

In [None]:
# Match real to predicted g for each modality
# Join observed g with each T1 feature set's stacked test predictions on eid and save the result.
g_five_folds_test_real = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/target_real_test.csv')
g_real_to_pred_match_dict = {}

for modality in t1_modalities:
    g_five_folds_pred_test = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/target_pred_{modality}_test.csv')
    matched = g_five_folds_test_real.merge(g_five_folds_pred_test, on='eid')
    g_real_to_pred_match_dict[modality] = matched
    # No index=False here, so the row index is stored as the first CSV column.
    matched.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/g_real_to_pred_match_{modality}_test.csv')

### Bootstrap T1w/T2w Whole-brain PLSR: Prepare files

In [None]:
# Single-entry modality list, so the whole-brain T1/T2 predictions can be driven by the
# same `for modality in ...` loops as the other modality lists.
t1t2_whole_brain = ['T1_T2_whole_brain']

In [None]:
# Concat 5 folds
# Pool the per-fold predicted-g tables of the whole-brain modality into one train and one test table.
g_five_folds_train = {}
g_five_folds_test = {}

for modality in t1t2_whole_brain:
    # One frame per fold; the 'Unnamed: 0' column stored in each fold file is discarded.
    train = [
        pd.read_csv(f'/PLS/brain/additional/fold_{fold}/g_pred/{modality}_g_pred_train_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]
    test = [
        pd.read_csv(f'/PLS/brain/additional/fold_{fold}/g_pred/{modality}_g_pred_test_id_fold_{fold}.csv').drop(columns='Unnamed: 0')
        for fold in folds
    ]

    # Stack the folds row-wise and renumber the rows from zero.
    pooled_train = pd.concat(train, axis=0, ignore_index=True)
    pooled_test = pd.concat(test, axis=0, ignore_index=True)
    g_five_folds_train[modality] = pooled_train
    g_five_folds_test[modality] = pooled_test

    # Written next to the other structural modalities, in the 't1' folder.
    pooled_train.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/target_pred_{modality}_train.csv', index=False)
    pooled_test.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/target_pred_{modality}_test.csv', index=False)

    print(f'Shape of the train set in {modality} is', pooled_train.shape)
    print(f'Shape of the test set in {modality} is', pooled_test.shape)

In [None]:
# Match real to predicted g for each modality
# The observed g table is joined to the pooled whole-brain test predictions on the 'eid' key.
g_real_to_pred_match_dict = {}
g_five_folds_test_real = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/target_real_test.csv')

for modality in t1t2_whole_brain:
    pred_csv = f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/target_pred_{modality}_test.csv'
    match_csv = f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/g_real_to_pred_match_{modality}_test.csv'

    g_five_folds_pred_test = pd.read_csv(pred_csv)
    matched = g_five_folds_test_real.merge(g_five_folds_pred_test, on='eid')
    g_real_to_pred_match_dict[modality] = matched

    # The row index is written too (no index=False); the bootstrap cell that reads this
    # file back drops the resulting 'Unnamed: 0' column.
    matched.to_csv(match_csv)

### Bootstrap T1w/T2w PLSR

In [None]:
# Every structural feature set that gets bootstrapped below, with the whole-brain set last.
all_t1t2_modalities = [
    'struct_fast',
    'struct_sub_first',
    'struct_aseg_mean_intensity',
    'struct_aseg_volume',
    'struct_ba_exvivo_area',
    'struct_ba_exvivo_mean_thickness',
    'struct_ba_exvivo_volume',
    'struct_a2009s_area',
    'struct_a2009s_mean_thickness',
    'struct_a2009s_volume',
    'struct_dkt_area',
    'struct_dkt_mean_thickness',
    'struct_dkt_volume',
    'struct_desikan_gw',
    'struct_desikan_pial',
    'struct_desikan_white_area',
    'struct_desikan_white_mean_thickness',
    'struct_desikan_white_volume',
    'struct_subsegmentation',
    'T1_T2_whole_brain',
]

In [None]:
# Bootstrapping
# For every modality, resample the matched (real g, predicted g) test rows with replacement
# `boot` times and record R2, Pearson r, MSE and MAE of each resample.
boot = 5000
metrics = ['R2', 'r', 'MSE', 'MAE']
pls_bootstrap_pickle = '/PLS/brain/stacking/BOOTSTRAP/T1T2_PLS_bootstrap_results_dfs.pkl'

# Fixed seed so that a Restart & Run All reproduces the same bootstrap distributions.
# One generator is shared by all modalities, so every draw advances the same stream.
bootstrap_rng = np.random.RandomState(42)

bootstrap_results_dfs = {}

for modality in all_t1t2_modalities:
    g_real_pls = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/g_real_to_pred_match_{modality}_test.csv')
    g_pred_pls = pd.read_csv(f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/target_pred_{modality}_test.csv')

    # Both files carry a 'g predicted test' column, so the merge suffixes them _x/_y.
    # The copy from the prediction file (_y) is kept; 'Unnamed: 0' is the index column
    # that was written into the match file.
    pred_col = f'g_predicted_pls_{modality}'
    g = g_real_pls.merge(g_pred_pls, on='eid').drop(columns=['Unnamed: 0', 'g predicted test_x'])
    g = g.rename(columns={'g predicted test_y': pred_col})

    bootstrap_results = {metric: [] for metric in metrics}

    for i in range(boot):
        df_sample = g.sample(len(g), replace=True, random_state=bootstrap_rng)
        y_true = df_sample['g_real_test']
        y_pred = df_sample[pred_col]

        corr_pls, _ = pearsonr(y_true, y_pred)
        bootstrap_results['r'].append(corr_pls)
        bootstrap_results['R2'].append(r2_score(y_true, y_pred))
        bootstrap_results['MSE'].append(mean_squared_error(y_true, y_pred))
        bootstrap_results['MAE'].append(mean_absolute_error(y_true, y_pred))

    # One single-column frame per metric, named '<modality>_<metric>'.
    for key in bootstrap_results:
        bootstrap_results[key] = pd.DataFrame(bootstrap_results[key], columns=[f'{modality}_{key}'])

    bootstrap_results_dfs[modality] = bootstrap_results

with open(pls_bootstrap_pickle, 'wb') as f:
    pickle.dump(bootstrap_results_dfs, f)

print(f"bootstrap_results_dfs dictionary was successfully saved as {pls_bootstrap_pickle}")

To reload the previously saved bootstrap results instead of recomputing them, run the cell below.

In [None]:
# NOTE: pickle.load can execute arbitrary code stored in the file; only load a pickle
# that was produced by this notebook.
with open('/PLS/brain/stacking/BOOTSTRAP/T1T2_PLS_bootstrap_results_dfs.pkl', 'rb') as f:
    bootstrap_results_dfs = pickle.load(f)

# Collect the per-modality frames of each metric first and concatenate once per metric,
# instead of re-copying a growing DataFrame on every loop iteration.
frames_by_metric = {metric: [] for metric in ['R2', 'r', 'MSE', 'MAE']}
for modality, results in bootstrap_results_dfs.items():
    for metric, df in results.items():
        if metric in frames_by_metric:
            frames_by_metric[metric].append(df)

# A metric with no frames stays an empty DataFrame (pd.concat rejects an empty list).
metric_dfs = {
    metric: pd.concat(frames, axis=1) if frames else pd.DataFrame()
    for metric, frames in frames_by_metric.items()
}

In [None]:
def _across_modalities(metric_key):
    """Place every modality's bootstrap column for one metric side by side."""
    return pd.concat(
        [per_modality[metric_key] for per_modality in bootstrap_results_dfs.values()],
        axis=1,
    )


t1t2_r2 = _across_modalities('R2')
t1t2_r = _across_modalities('r')
t1t2_mse = _across_modalities('MSE')
t1t2_mae = _across_modalities('MAE')

# Saving is disabled; the next cell reads these four files back from disk.
#t1t2_r2.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/output/r2_t1t2_bootstrapped.csv', index=False)
#t1t2_r.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/output/r_t1t2_bootstrapped.csv', index=False)
#t1t2_mse.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/output/mse_t1t2_bootstrapped.csv', index=False)
#t1t2_mae.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/output/mae_t1t2_bootstrapped.csv', index=False)

In [None]:
# Reload the four per-metric bootstrap tables from disk, replacing the in-memory frames.
# NOTE(review): the files are named *_t1t2_* but are read from the 'dti/output' folder, and
# the only visible to_csv calls that would write them are commented out in the previous
# cell — confirm that this location is intended and that the files are up to date.
t1t2_bootstrap_csv_dir = '/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/output'

t1t2_r2 = pd.read_csv(f'{t1t2_bootstrap_csv_dir}/r2_t1t2_bootstrapped.csv')
t1t2_r = pd.read_csv(f'{t1t2_bootstrap_csv_dir}/r_t1t2_bootstrapped.csv')
t1t2_mse = pd.read_csv(f'{t1t2_bootstrap_csv_dir}/mse_t1t2_bootstrapped.csv')
t1t2_mae = pd.read_csv(f'{t1t2_bootstrap_csv_dir}/mae_t1t2_bootstrapped.csv')

In [None]:
metrics = ['R2', 'r', 'MSE', 'MAE']

# Raw modality identifier -> label used for the columns of the renamed bootstrap tables.
base_rename_dict = {
    'struct_fast': 'FSL FAST',
    'struct_sub_first': 'FSL FIRST',
    'struct_aseg_mean_intensity': 'ASEG Mean Intensity',
    'struct_aseg_volume': 'ASEG Volume',
    'struct_ba_exvivo_area': 'BA ex-vivo Area',
    'struct_ba_exvivo_mean_thickness': 'BA ex-vivo Mean Thickness',
    'struct_ba_exvivo_volume': 'BA ex-vivo Volume',
    'struct_a2009s_area': 'aparc a2009s Area',
    'struct_a2009s_mean_thickness': 'aparc a2009s Mean Thickness',
    'struct_a2009s_volume': 'aparc a2009s volume',
    'struct_dkt_area': 'Desikan-Killiany-Tourville Area',
    'struct_dkt_mean_thickness': 'Desikan-Killiany-Tourville Mean Thickness',
    'struct_dkt_volume': 'Desikan-Killiany-Tourville volume',
    'struct_desikan_gw': 'Desikan Grey/White Matter intensity',
    'struct_desikan_pial': 'Desikan pial',
    'struct_desikan_white_area': 'Desikan White Matter Area',
    'struct_desikan_white_mean_thickness': 'Desikan White Matter Mean Thickness',
    'struct_desikan_white_volume': 'Desikan White Matter volume',
    'struct_subsegmentation': 'Subcortical Volumetric Subsegmentation',
    'T1_T2_whole_brain': 'Whole-brain T1/T2',
}

In [None]:
def rename_columns(df, base_rename_dict, metric):
    """
    Return a copy of `df` whose '<key>_<metric>' columns carry display labels.

    Each key of `base_rename_dict` is suffixed with '_<metric>' to form the column name
    to look for; a matching column is renamed to the corresponding dictionary value.
    Columns without a match are left untouched and `df` itself is not modified.
    """
    suffix = f'_{metric}'
    column_labels = {}
    for raw_name, display_name in base_rename_dict.items():
        column_labels[f'{raw_name}{suffix}'] = display_name
    return df.rename(columns=column_labels)

In [None]:
# Swap the '<modality>_<metric>' column names of each metric table for display labels.
t1t2_r2, t1t2_r, t1t2_mse, t1t2_mae = (
    rename_columns(metric_table, base_rename_dict, metric_suffix)
    for metric_table, metric_suffix in (
        (t1t2_r2, 'R2'),
        (t1t2_r, 'r'),
        (t1t2_mse, 'MSE'),
        (t1t2_mae, 'MAE'),
    )
)

In [None]:
# Save metrics
# Each renamed table is written to '<name>_bootstrapped_renamed.csv' without its row index.
names_t1t2 = ['t1t2_r2', 't1t2_r', 't1t2_mse', 't1t2_mae']
dataframes_t1t2 = [t1t2_r2, t1t2_r, t1t2_mse, t1t2_mae]
for position, name in enumerate(names_t1t2):
    dataframes_t1t2[position].to_csv(
        f'/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/output/{name}_bootstrapped_renamed.csv',
        index=False,
    )

In [None]:
# Compute CIs
# Summarise every column of the renamed T1/T2 PLS bootstrap tables (mean, median, SD and
# percentile confidence interval) and save the summary as CSV and Excel.
# ===== Configuration =====
metrics = ['r2', 'r', 'mse', 'mae']
# File-name stem -> label written to the 'Metric' column. A plain .upper() would turn
# Pearson's 'r' into 'R', which does not match the R2 / r / MSE / MAE labels used elsewhere.
metric_labels = {'r2': 'R2', 'r': 'r', 'mse': 'MSE', 'mae': 'MAE'}
ci_level = 0.95
modality_name = 't1t2'
base_path = '/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/output'
output_path = '/rev1-analysis'
column_order = ['Modality', 'Metric', 'Mean', 'Median', 'Std', 'CI_lower', 'CI_upper']

# ===== Process All Metrics =====
ci_results = []

for metric in metrics:
    # Load bootstrap results
    try:
        df = pd.read_csv(f'{base_path}/{modality_name}_{metric}_bootstrapped_renamed.csv')
    except FileNotFoundError:
        print(f"Warning: File not found for {modality_name}_{metric}")
        continue

    # Drop Unnamed columns and any index columns
    df = df.drop(columns=df.columns[df.columns.str.contains('^Unnamed|^index$', case=False)], errors='ignore')

    # Compute CIs for each modality (column)
    for modality in df.columns:
        # Skip if column is empty or all NA
        if df[modality].isna().all():
            print(f"Skipping empty modality: {modality}")
            continue

        # Calculate statistics
        values = df[modality].values
        lower, upper = get_bootstrap_ci(values, ci_level)

        ci_results.append({
            'Modality': modality,
            'Metric': metric_labels.get(metric, metric.upper()),
            'Mean': np.mean(values).round(3),
            'Median': np.median(values).round(3),
            'Std': np.std(values).round(3),
            'CI_lower': lower,
            'CI_upper': upper
        })

# Passing `columns` fixes the column order and keeps the frame usable (empty, but with the
# expected columns) when none of the input files could be read.
ci_df = pd.DataFrame(ci_results, columns=column_order)
save_dataframe_to_files(ci_df, output_path, f'bootstrap_{modality_name}_pls_CI')

print("=== Confidence Intervals ===")
print(ci_df)
print(f"\nResults saved to: {output_path}")

# Print formatted summary
for modality, group in ci_df.groupby('Modality'):
    print(f"\n=== {modality} ===")
    for metric, metric_group in group.groupby('Metric'):
        row = metric_group.iloc[0]  # Get first row for this metric
        print(f"{row['Metric']}:")
        print(f"  Mean ± SD: {row['Mean']} ± {row['Std']}")
        print(f"  Median: {row['Median']}")
        print(f"  {ci_level:.0%} CI: [{row['CI_lower']}, {row['CI_upper']}]")

# All modalities

### Bootstrap all stacked

In [None]:
algorithms = ['svr', 'eNet', 'xgb', 'rf']
folds = ["0", "1", "2", "3", "4"]

# Pool the per-fold second-level predictions of every stacking algorithm into one train
# and one test table.
g_five_folds_train = {}
g_five_folds_test = {}

for algorithm in algorithms:
    train = [
        pd.read_csv(f'/PLS/brain/stacking/g/All_modalities_target_pred_2nd_level_{algorithm}_train_fold_{fold}.csv')
        for fold in folds
    ]
    test = [
        pd.read_csv(f'/PLS/brain/stacking/g/All_modalities_target_pred_2nd_level_{algorithm}_test_fold_{fold}.csv')
        for fold in folds
    ]

    # Stack the folds row-wise and renumber the rows from zero.
    pooled_train = pd.concat(train, axis=0, ignore_index=True)
    pooled_test = pd.concat(test, axis=0, ignore_index=True)
    g_five_folds_train[algorithm] = pooled_train
    g_five_folds_test[algorithm] = pooled_test

    pooled_train.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/all_mod/All_modalities_target_pred_{algorithm}_train.csv', index=False)
    pooled_test.to_csv(f'/PLS/brain/stacking/BOOTSTRAP/all_mod/All_modalities_target_pred_{algorithm}_test.csv', index=False)

    print(f'Shape of the train set in {algorithm} is', pooled_train.shape)
    print(f'Shape of the test set in {algorithm} is', pooled_test.shape)

In [None]:
# Match real to predicted g
# Keep the observed g of exactly those participants that have a pooled SVR test prediction.
g_five_folds_real_test = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/target_real_test.csv')
g_five_folds_pred_test = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/all_mod/All_modalities_target_pred_svr_test.csv')

predicted_eids = g_five_folds_pred_test['eid']
real_for_predicted = g_five_folds_real_test.merge(predicted_eids, on='eid')
g_real_to_pred_match = pd.DataFrame(real_for_predicted, columns=['g_real_test', 'eid'])

# The row index is written as well (no index=False).
g_real_to_pred_match.to_csv('/PLS/brain/stacking/BOOTSTRAP/all_mod/g_real_to_pred_match_all_mod_test.csv')
print('Shape of g real test matched to g pred is', g_real_to_pred_match.shape)

In [None]:
# Bootstrapping
# For every stacking algorithm, resample the matched (real g, predicted g) test rows with
# replacement `boot` times and record R2, Pearson r, MSE and MAE of each resample.
boot = 5000
all_mod_dir = '/PLS/brain/stacking/BOOTSTRAP/all_mod'
all_mod_bootstrap_pickle = f'{all_mod_dir}/All_modalities_bootstrap_results_dfs.pkl'
metric_order = ['R2', 'r', 'MSE', 'MAE']

# Fixed seed so that a Restart & Run All reproduces the same bootstrap distributions.
# One generator is shared by all algorithms, so every draw advances the same stream.
bootstrap_rng = np.random.RandomState(42)

g_real = pd.read_csv(f'{all_mod_dir}/g_real_to_pred_match_all_mod_test.csv')
# 'Unnamed: 0' is the row index that was written into the match file.
real_by_eid = g_real.drop(columns='Unnamed: 0', errors='ignore')

bootstrap_results_dfs = {}

for algorithm in algorithms:
    g_pred = pd.read_csv(f'{all_mod_dir}/All_modalities_target_pred_{algorithm}_test.csv')
    pred_by_eid = g_pred.drop(columns='Unnamed: 0', errors='ignore')
    pred_col = f'g_pred_all_mod_mri_{algorithm}_test'

    # Pair real and predicted values through the shared 'eid' key. Gluing the two tables
    # together by row position would silently mismatch participants whenever the two
    # files are not in the same row order.
    df = real_by_eid.merge(pred_by_eid, on='eid').drop(columns='eid')
    df.columns = ['g_real_test', pred_col]

    bootstrap_results = {metric: [] for metric in metric_order}

    for i in range(boot):
        df_sample = df.sample(len(df), replace=True, random_state=bootstrap_rng)
        y_true = df_sample['g_real_test']
        y_pred = df_sample[pred_col]

        corr, _ = pearsonr(y_true, y_pred)
        bootstrap_results['r'].append(corr)
        bootstrap_results['R2'].append(r2_score(y_true, y_pred))
        bootstrap_results['MSE'].append(mean_squared_error(y_true, y_pred))
        bootstrap_results['MAE'].append(mean_absolute_error(y_true, y_pred))

    # One single-column frame per metric, named '<algorithm>_<metric>'.
    for key in bootstrap_results:
        bootstrap_results[key] = pd.DataFrame(bootstrap_results[key], columns=[f'{algorithm}_{key}'])

    bootstrap_results_dfs[algorithm] = bootstrap_results

with open(all_mod_bootstrap_pickle, 'wb') as f:
    pickle.dump(bootstrap_results_dfs, f)

print(f"bootstrap_results_dfs dictionary was successfully saved as {all_mod_bootstrap_pickle}")


def _metrics_side_by_side(per_metric):
    """Join one algorithm's R2, r, MSE and MAE columns into a single four-column frame."""
    return pd.concat(
        [pd.DataFrame(per_metric[metric]).reset_index(drop=True) for metric in metric_order],
        axis=1,
    )


bootstrap_svr = _metrics_side_by_side(bootstrap_results_dfs['svr'])
bootstrap_enet = _metrics_side_by_side(bootstrap_results_dfs['eNet'])
bootstrap_rf = _metrics_side_by_side(bootstrap_results_dfs['rf'])
bootstrap_xgb = _metrics_side_by_side(bootstrap_results_dfs['xgb'])

In [None]:
# Combine algorithms
# The four per-algorithm tables are placed side by side: SVR, eNet, RF, XGB.
per_algorithm_tables = [bootstrap_svr, bootstrap_enet, bootstrap_rf, bootstrap_xgb]
bootstrap_fullres = pd.concat(per_algorithm_tables, axis=1)

bootstrap_fullres.to_csv(
    '/PLS/brain/stacking/BOOTSTRAP/all_mod/bootstrap_all_modalities.csv',
    index=False,
)

In [None]:
# Extract metrics
# Despite the '_xgb' in its name, this table holds the columns of all four algorithms.
bootstrap_fullres_all_stacked_xgb = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/all_mod/bootstrap_all_modalities.csv')


def _algorithms_for_metric(metric_suffix):
    """Select the four '<algorithm>_<metric>' columns and label them SVR, eNet, RF, XGB."""
    labels = {
        f'svr_{metric_suffix}': 'SVR',
        f'eNet_{metric_suffix}': 'eNet',
        f'rf_{metric_suffix}': 'RF',
        f'xgb_{metric_suffix}': 'XGB',
    }
    return bootstrap_fullres_all_stacked_xgb[list(labels)].rename(columns=labels)


bootstrap_all_stack_R2 = _algorithms_for_metric('R2')
bootstrap_all_stack_r = _algorithms_for_metric('r')
bootstrap_all_stack_mse = _algorithms_for_metric('MSE')
bootstrap_all_stack_mae = _algorithms_for_metric('MAE')

# Extract best algorithm
bootstrap_all_stacked_xgb_r = bootstrap_all_stack_r['XGB']
bootstrap_all_stacked_xgb_R2 = bootstrap_all_stack_R2['XGB']

In [None]:
# Compute CIs
# Summarise the bootstrap distributions of the all-modalities stacked models (mean, median,
# SD and percentile confidence interval per algorithm and metric) and save the summary.
#
# This cell previously summarised the bootstrap_t1t2_* frames and wrote the result to
# 'bootstrap_t1t2_stacked_CI', although it belongs to the "all stacked" section and follows
# the cell that builds bootstrap_all_stack_*. It now uses those frames and its own file name.
stack_name = 'all_stack'
output_path = '/rev1-analysis'

# Dictionary of bootstrap distributions
bootstrap_dists = {
    'R2': bootstrap_all_stack_R2,
    'r': bootstrap_all_stack_r,
    'MSE': bootstrap_all_stack_mse,
    'MAE': bootstrap_all_stack_mae
}

ci_results = []

for algorithm in ['SVR', 'eNet', 'RF', 'XGB']:
    print(f"\n=== {algorithm} ===")

    for metric, dist in bootstrap_dists.items():
        # Get the specific algorithm's data
        try:
            alg_data = dist[algorithm]
        except KeyError:
            print(f"Warning: {algorithm} not found in {metric} data")
            continue

        # Calculate statistics
        mean_val = np.mean(alg_data).round(3)
        median_val = np.median(alg_data).round(3)
        std_val = np.std(alg_data).round(3)
        lower, upper = get_bootstrap_ci(alg_data)

        # Store results
        ci_results.append({
            'Algorithm': algorithm,
            'Metric': metric,
            'Mean': mean_val,
            'Median': median_val,
            'Std': std_val,
            'CI_lower': lower,
            'CI_upper': upper
        })

        # Print summary
        print(f"{metric}:")
        print(f"  Mean ± SD: {mean_val} ± {std_val}")
        print(f"  Median: {median_val}")
        print(f"  95% CI: [{lower}, {upper}]")

# Convert to DataFrame with proper column order (also valid when nothing was collected)
ci_df = pd.DataFrame(
    ci_results,
    columns=['Algorithm', 'Metric', 'Mean', 'Median', 'Std', 'CI_lower', 'CI_upper'],
)

# Save as CSV and Excel; the helper creates the output directory when it is missing.
save_dataframe_to_files(ci_df, output_path, f'bootstrap_{stack_name}_stacked_CI')

print("\n=== Final Results ===")
print(ci_df)
print(f"\nResults saved to: {output_path}")

# Compute CIs for PLS

In [None]:
# Compute CIs for PLS
common_metrics = ['r2', 'r', 'mse', 'mae']
base_root = '/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack'
modalities = ['dti', 'rs', 't1t2']
output_path = '/rev1'

if not os.path.exists(output_path):
    print(f"Creating output directory: {output_path}")
    os.makedirs(output_path, exist_ok=True)

# The 't1t2' tables are read from the 't1' folder; every other modality uses a folder
# named after itself.
folder_override = {'t1t2': 't1'}
banner = '=' * 40

# Then run your analysis
for modality in modalities:
    print(f"\n{banner}")
    print(f"Processing modality: {modality.upper()}")
    print(banner)

    modality_path = os.path.join(base_root, folder_override.get(modality, modality), 'output')

    # Run analysis
    results = compute_and_save_bootstrap_results_1level(
        metrics=common_metrics,
        modality_name=modality,
        base_path=modality_path,
        output_path=output_path
    )

# Compute CIs for stacked models

In [None]:
# Define modalities
# One entry per stacked model family: its name plus the bootstrap table of each metric.
modality_definitions = [
    dict(
        name='dti',
        metrics=dict(R2=bootstrap_dti_R2, r=bootstrap_dti_r, MSE=bootstrap_dti_mse, MAE=bootstrap_dti_mae),
    ),
    dict(
        name='rs',
        metrics=dict(R2=bootstrap_rs_R2, r=bootstrap_rs_r, MSE=bootstrap_rs_mse, MAE=bootstrap_rs_mae),
    ),
    dict(
        name='t1t2',
        metrics=dict(R2=bootstrap_t1t2_R2, r=bootstrap_t1t2_r, MSE=bootstrap_t1t2_mse, MAE=bootstrap_t1t2_mae),
    ),
    dict(
        name='all_stack',
        metrics=dict(
            R2=bootstrap_all_stack_R2,
            r=bootstrap_all_stack_r,
            MSE=bootstrap_all_stack_mse,
            MAE=bootstrap_all_stack_mae,
        ),
    ),
]

In [None]:
# Clean up any existing files before processing
# Candidate paths: 'bootstrap_<name>_CI.csv' and '.xlsx' for every defined modality.
stale_ci_files = [
    os.path.join(output_path, f"bootstrap_{modality['name']}_CI" + ext)
    for modality in modality_definitions
    for ext in ['.csv', '.xlsx']
]

for file_path in stale_ci_files:
    try:
        if not os.path.exists(file_path):
            continue
        os.remove(file_path)
        print(f"Removed old file: {file_path}")
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files.
        print(f"Could not remove {file_path}: {str(e)}")

In [None]:
# Compute CIs for stacked models
# `results` maps each modality name to the value returned for it; modalities for which
# the helper returned None are reported as failed and left out.
results = {}

for modality in modality_definitions:
    stack_label = modality['name']
    print(f"\nProcessing modality: {stack_label.upper()}")

    # Compute results for this modality
    modality_results = compute_and_save_bootstrap_results_2level(
        bootstrap_dists=modality['metrics'],
        output_path=output_path,
        modality_name=stack_label
    )

    if modality_results is None:
        print(f"Failed to process {stack_label}")
        continue

    # Store the result in the results dictionary
    results[stack_label] = modality_results
    print(f"Successfully processed {stack_label}")

## Prepare data

In [None]:
# Save files
# Each distribution is written to '<variable name>.csv' (r files first, then R2 files),
# wrapped in a DataFrame and without its row index.
pls_vs_stack_dir = '/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack'

best_stacked_distributions = {
    'bootstrap_all_stacked_xgb_r': bootstrap_all_stacked_xgb_r,
    'bootstrap_t1t2_stacked_svr_r': bootstrap_t1t2_stacked_svr_r,
    'bootstrap_rs_stacked_rf_r': bootstrap_rs_stacked_rf_r,
    'bootstrap_dti_stacked_rf_r': bootstrap_dti_stacked_rf_r,
    'bootstrap_all_stacked_xgb_R2': bootstrap_all_stacked_xgb_R2,
    'bootstrap_t1t2_stacked_svr_R2': bootstrap_t1t2_stacked_svr_R2,
    'bootstrap_rs_stacked_rf_R2': bootstrap_rs_stacked_rf_R2,
    'bootstrap_dti_stacked_rf_R2': bootstrap_dti_stacked_rf_R2,
}

for file_stem, distribution in best_stacked_distributions.items():
    pd.DataFrame(distribution).to_csv(f'{pls_vs_stack_dir}/{file_stem}.csv', index=False)

In [None]:
# Read files
# Reload the best stacked model of each family and the renamed per-modality PLS tables,
# for Pearson's r first and then for R2.
pls_vs_stack_dir = '/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack'

bootstrap_all_stacked_xgb_r = pd.read_csv(f'{pls_vs_stack_dir}/bootstrap_all_stacked_xgb_r.csv')
bootstrap_t1t2_stacked_svr_r = pd.read_csv(f'{pls_vs_stack_dir}/bootstrap_t1t2_stacked_svr_r.csv')
bootstrap_rs_stacked_rf_r = pd.read_csv(f'{pls_vs_stack_dir}/bootstrap_rs_stacked_rf_r.csv')
bootstrap_dti_stacked_rf_r = pd.read_csv(f'{pls_vs_stack_dir}/bootstrap_dti_stacked_rf_r.csv')

# The t1t2 tables live in the 't1' folder.
t1t2_r = pd.read_csv(f'{pls_vs_stack_dir}/t1/output/t1t2_r_bootstrapped_renamed.csv')
rs_r = pd.read_csv(f'{pls_vs_stack_dir}/rs/output/rs_r_bootstrapped_renamed.csv')
dti_r = pd.read_csv(f'{pls_vs_stack_dir}/dti/output/dti_r_bootstrapped_renamed.csv')


bootstrap_all_stacked_xgb_R2 = pd.read_csv(f'{pls_vs_stack_dir}/bootstrap_all_stacked_xgb_R2.csv')
bootstrap_t1t2_stacked_svr_R2 = pd.read_csv(f'{pls_vs_stack_dir}/bootstrap_t1t2_stacked_svr_R2.csv')
bootstrap_rs_stacked_rf_R2 = pd.read_csv(f'{pls_vs_stack_dir}/bootstrap_rs_stacked_rf_R2.csv')
bootstrap_dti_stacked_rf_R2 = pd.read_csv(f'{pls_vs_stack_dir}/bootstrap_dti_stacked_rf_R2.csv')

# This path used to start with a stray double slash ('//PLS/...'); it now shares the
# same root as every other read in this cell.
t1t2_r2 = pd.read_csv(f'{pls_vs_stack_dir}/t1/output/t1t2_r2_bootstrapped_renamed.csv')
rs_r2 = pd.read_csv(f'{pls_vs_stack_dir}/rs/output/rs_r2_bootstrapped_renamed.csv')
dti_r2 = pd.read_csv(f'{pls_vs_stack_dir}/dti/output/dti_r2_bootstrapped_renamed.csv')

Combine and melt dataframes with r and R2

In [None]:
# Pearson's r
# Give the best stacked model of each family its plot label (a column that is absent is
# simply left as is).
bootstrap_all_stacked_xgb_r = pd.DataFrame(bootstrap_all_stacked_xgb_r).rename(
    {'XGB': 'All MRI Modalities Stacked (XGB)'}, axis='columns')
bootstrap_t1t2_stacked_svr_r = pd.DataFrame(bootstrap_t1t2_stacked_svr_r).rename(
    {'SVR': 'T1w/T2w Structural Stacked (SVR)'}, axis='columns')
bootstrap_rs_stacked_rf_r = pd.DataFrame(bootstrap_rs_stacked_rf_r).rename(
    {'RF': 'rsMRI Stacked (RF)'}, axis='columns')
bootstrap_dti_stacked_rf_r = pd.DataFrame(bootstrap_dti_stacked_rf_r).rename(
    {'RF': 'dwMRI Stacked (RF)'}, axis='columns')

# Stacked models first, then the individual PLS tables, all side by side.
tables_for_r_plot = [
    bootstrap_all_stacked_xgb_r,
    bootstrap_t1t2_stacked_svr_r,
    bootstrap_rs_stacked_rf_r,
    bootstrap_dti_stacked_rf_r,
    t1t2_r,
    rs_r,
    dti_r,
]
all_mod_plus_stacked_plot_bootstrapped_r = pd.concat(tables_for_r_plot, axis=1)
#all_mod_plus_stacked_plot_bootstrapped_r.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/all_mod_plus_stacked_plot_bootstrapped_r.csv', index=False)

# Melt
# Long format: one row per (modality, bootstrap value).
all_mod_plus_stacked_plot_bootstrapped_r_melted = all_mod_plus_stacked_plot_bootstrapped_r.melt(
    var_name="modality",
    value_name='r',
)
#all_mod_plus_stacked_plot_bootstrapped_r_melted.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/all_mod_plus_stacked_plot_bootstrapped_r_melted.csv', index=False)
all_mod_plus_stacked_plot_bootstrapped_r

In [None]:
# R2
# Give the best stacked model of each family its plot label (a column that is absent is
# simply left as is).
bootstrap_all_stacked_xgb_R2 = pd.DataFrame(bootstrap_all_stacked_xgb_R2).rename(
    {'XGB': 'All MRI Modalities Stacked (XGB)'}, axis='columns')
bootstrap_t1t2_stacked_svr_R2 = pd.DataFrame(bootstrap_t1t2_stacked_svr_R2).rename(
    {'SVR': 'T1w/T2w Structural Stacked (SVR)'}, axis='columns')
bootstrap_rs_stacked_rf_R2 = pd.DataFrame(bootstrap_rs_stacked_rf_R2).rename(
    {'RF': 'rsMRI Stacked (RF)'}, axis='columns')
bootstrap_dti_stacked_rf_R2 = pd.DataFrame(bootstrap_dti_stacked_rf_R2).rename(
    {'RF': 'dwMRI Stacked (RF)'}, axis='columns')

# Stacked models first, then the individual PLS tables, all side by side.
tables_for_R2_plot = [
    bootstrap_all_stacked_xgb_R2,
    bootstrap_t1t2_stacked_svr_R2,
    bootstrap_rs_stacked_rf_R2,
    bootstrap_dti_stacked_rf_R2,
    t1t2_r2,
    rs_r2,
    dti_r2,
]
all_mod_plus_stacked_plot_bootstrapped_R2 = pd.concat(tables_for_R2_plot, axis=1)
#all_mod_plus_stacked_plot_bootstrapped_R2.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/all_mod_plus_stacked_plot_bootstrapped_R2.csv', index=False)

# Melt
# Long format: one row per (modality, bootstrap value).
all_mod_plus_stacked_plot_bootstrapped_R2_melted = all_mod_plus_stacked_plot_bootstrapped_R2.melt(
    var_name="modality",
    value_name='R2',
)
#all_mod_plus_stacked_plot_bootstrapped_R2_melted.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/all_mod_plus_stacked_plot_bootstrapped_R2_melted.csv', index=False)
all_mod_plus_stacked_plot_bootstrapped_R2

In [None]:
# Reload two previously exported tables from disk, overwriting the in-memory versions:
# the R2 table and the melted (long-format) r table.
# NOTE(review): the `to_csv` calls that would write these two files are commented out in the
# preceding cells, so the files must already exist from an earlier run — confirm.
# NOTE(review): this loads the non-melted R2 table but the melted r table; verify the
# asymmetry is intended.
all_mod_plus_stacked_plot_bootstrapped_R2 = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/all_mod_plus_stacked_plot_bootstrapped_R2.csv')
all_mod_plus_stacked_plot_bootstrapped_r_melted = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/all_mod_plus_stacked_plot_bootstrapped_r_melted.csv')

## Extract and rename individual modalities for plotting

In [None]:
# Per-modality tables (RS, DTI, T1/T2). Each one joins, column-wise:
#   [that modality's stacked model | that modality's individual table | all-modality stacked model]

# Pearson r
rs_bootstrapped_r = pd.concat(
    [bootstrap_rs_stacked_rf_r, rs_r, bootstrap_all_stacked_xgb_r],
    axis=1,
)
dti_bootstrapped_r = pd.concat(
    [bootstrap_dti_stacked_rf_r, dti_r, bootstrap_all_stacked_xgb_r],
    axis=1,
)
t1t2_bootstrapped_r = pd.concat(
    [bootstrap_t1t2_stacked_svr_r, t1t2_r, bootstrap_all_stacked_xgb_r],
    axis=1,
)

# R2
rs_bootstrapped_r2 = pd.concat(
    [bootstrap_rs_stacked_rf_R2, rs_r2, bootstrap_all_stacked_xgb_R2],
    axis=1,
)
dti_bootstrapped_r2 = pd.concat(
    [bootstrap_dti_stacked_rf_R2, dti_r2, bootstrap_all_stacked_xgb_R2],
    axis=1,
)
t1t2_bootstrapped_r2 = pd.concat(
    [bootstrap_t1t2_stacked_svr_R2, t1t2_r2, bootstrap_all_stacked_xgb_R2],
    axis=1,
)

In [None]:
# Rename RS modalities for plotting.
# A single mapping is applied to both the r and the R2 table so their labels cannot drift apart.
rs_plot_labels = {
    'XGB': 'All MRI Modalities Stacked (XGB)',
    'RF': 'rsMRI Stacked (RF)',
    'aparc MSA I Full correlation': 'aparc-I Functional Connectivity',
    'aparc a2009s MSA I Full correlation': 'aparc.a2009s-I Functional Connectivity',
    'Glasser MSA I Full correlation': 'Glasser-I Functional Connectivity',
    'Glasser MSA IV Full correlation': 'Glasser-IV Functional Connectivity',
    'Schaefer7n200p MSA I Full correlation': 'Schaefer200-I Functional Connectivity',
    'Schaefer7n500p MSA IV Full correlation': 'Schaefer500-IV Functional Connectivity',
    # Fixed: these two labels were previously swapped (21 IC was shown as "55 IC" and
    # vice versa), unlike the tangent-matrix entries below which keep 21 -> 21 and 55 -> 55.
    'Amplitudes 21 IC': '21 IC Amplitudes',
    'Amplitudes 55 IC': '55 IC Amplitudes',
    'Tangent matrices 21 IC': '21 IC Functional Connectivity',
    'Tangent matrices 55 IC': '55 IC Functional Connectivity',
}

# `rename` ignores mapping keys that are not present as columns, so one dict serves both tables.
rs_bootstrapped_r = pd.DataFrame(rs_bootstrapped_r).rename(columns=rs_plot_labels)
rs_bootstrapped_r2 = pd.DataFrame(rs_bootstrapped_r2).rename(columns=rs_plot_labels)

# Persist the renamed tables; the plotting cells reload the r table from this path.
rs_bootstrapped_r.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/rs_bootstrapped_r_renamed.csv', index=False)
rs_bootstrapped_r2.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/rs_bootstrapped_r2_renamed.csv', index=False)

In [None]:
# Rename DTI modalities for plotting.
# The r and R2 tables use the same label mapping, so it is written once and applied twice.
dti_plot_labels = {
    'XGB': 'All MRI Modalities Stacked (XGB)',
    'RF': 'dwMRI Stacked (RF)',
    # Connectome features, grouped by parcellation
    'aparc a2009s MSA I Connectome FA': 'aparc.a2009s-I FA',
    'aparc a2009s MSA I Connectome Mean Length': 'aparc.a2009s-I Mean Length',
    'aparc a2009s MSA I Connectome SIFT2': 'aparc.a2009s-I SIFT2',
    'aparc a2009s MSA I Connectome Streamline Count': 'aparc.a2009s-I Streamline Count',
    'aparc MSA I Connectome FA': 'aparc-I FA',
    'aparc MSA I Connectome Mean Length': 'aparc-I Mean Length',
    'aparc MSA I Connectome SIFT2': 'aparc-I SIFT2',
    'aparc MSA I Connectome Streamline Count': 'aparc-I Streamline Count',
    'Glasser MSA I Connectome FA': 'Glasser-I FA',
    'Glasser MSA I Connectome Mean Length': 'Glasser-I Mean Length',
    'Glasser MSA I Connectome SIFT2': 'Glasser-I SIFT2',
    'Glasser MSA I Connectome Streamline Count': 'Glasser-I Streamline Count',
    'Glasser MSA IV Connectome FA': 'Glasser-IV FA',
    'Glasser MSA IV Connectome Mean Length': 'Glasser-IV Mean Length',
    'Glasser MSA IV Connectome SIFT2': 'Glasser-IV SIFT2',
    'Glasser MSA IV Connectome Streamline Count': 'Glasser-IV Streamline Count',
    'Schaefer7n200p MSA I Connectome FA': 'Schaefer200-I FA',
    'Schaefer7n200p MSA I Connectome Mean Length': 'Schaefer200-I Mean Length',
    'Schaefer7n200p MSA I Connectome SIFT2': 'Schaefer200-I SIFT2',
    'Schaefer7n200p MSA I Connectome Streamline Count': 'Schaefer200-I Streamline Count',
    'Schaefer7n500p MSA IV Connectome FA': 'Schaefer500-IV FA',
    'Schaefer7n500p MSA IV Connectome Mean Length': 'Schaefer500-IV Mean Length',
    'Schaefer7n500p MSA IV Connectome SIFT2': 'Schaefer500-IV SIFT2',
    'Schaefer7n500p MSA IV Connectome Streamline Count': 'Schaefer500-IV Streamline Count',
    # "Probabilistic" columns get an abbreviated suffix
    'FA Probabilistic': 'FA Prob.',
    'MD Probabilistic': 'MD Prob.',
    'L1 Probabilistic': 'L1 Prob.',
    'L2 Probabilistic': 'L2 Prob.',
    'L3 Probabilistic': 'L3 Prob.',
    'MO Probabilistic': 'MO Prob.',
    'OD Probabilistic': 'OD Prob.',
    'ICVF Probabilistic': 'ICVF Prob.',
    'ISOVF Probabilistic': 'ISOVF Prob.',
}

dti_bootstrapped_r = pd.DataFrame(dti_bootstrapped_r).rename(columns=dti_plot_labels)
dti_bootstrapped_r2 = pd.DataFrame(dti_bootstrapped_r2).rename(columns=dti_plot_labels)

# Persist the renamed tables
dti_bootstrapped_r.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/dti_bootstrapped_r_renamed.csv', index=False)
dti_bootstrapped_r2.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/dti_bootstrapped_r2_renamed.csv', index=False)

In [None]:
# Rename T1/T2 modalities for plotting.
# One mapping is shared by the r and R2 tables. Previously the R2 copy listed
# 'Subcortical Volumetric Subsegmentation' twice and lacked the 'ASEG volume' entry,
# so the two tables could end up with different labels.
t1t2_plot_labels = {
    'XGB': 'All MRI Modalities Stacked (XGB)',
    'SVR': 'T1w/T2w Structural Stacked (SVR)',
    'aparc a2009s Area': 'aparc.a2009s Area',
    'aparc a2009s Mean Thickness': 'aparc.a2009s Mean Thickness',
    'aparc a2009s volume': 'aparc.a2009s Volume',
    'Desikan-Killiany-Tourville Area': 'DKT Area',
    'Desikan-Killiany-Tourville Mean Thickness': 'DKT Mean Thickness',
    'Desikan-Killiany-Tourville volume': 'DKT Volume',
    'Desikan Grey/White Matter intensity': 'Desikan GM/WM Intensity',
    'Desikan pial': 'Desikan Pial',
    'Desikan White Matter Area': 'Desikan WM Area',
    'Desikan White Matter Mean Thickness': 'Desikan WM Mean Thickness',
    'Desikan White Matter volume': 'Desikan WM Volume',
    'Subcortical Volumetric Subsegmentation': 'Subcor. Volumetric Subsegment.',
    'ASEG volume': "ASEG Volume",
}

# `rename` ignores mapping keys that are not present as columns, so one dict serves both tables.
t1t2_bootstrapped_r = pd.DataFrame(t1t2_bootstrapped_r).rename(columns=t1t2_plot_labels)
t1t2_bootstrapped_r2 = pd.DataFrame(t1t2_bootstrapped_r2).rename(columns=t1t2_plot_labels)

# Persist the renamed tables; the plotting cells reload the r table from this path.
t1t2_bootstrapped_r.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/t1t2_bootstrapped_r_renamed.csv', index=False)
t1t2_bootstrapped_r2.to_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/t1t2_bootstrapped_r2_renamed.csv', index=False)

In [None]:
# Define modalities: before renaming
# (original column names, grouped by MRI modality and feature family)

# Resting-state: parcellation-based full-correlation features
rs_parcellations = [
    'aparc MSA I Full correlation',
    'aparc a2009s MSA I Full correlation',
    'Glasser MSA I Full correlation',
    'Glasser MSA IV Full correlation',
    'Schaefer7n200p MSA I Full correlation',
    'Schaefer7n500p MSA IV Full correlation',
]

# Resting-state: IC amplitudes and tangent matrices
rs_idp = [
    'Amplitudes 21 IC',
    'Amplitudes 55 IC',
    'Tangent matrices 21 IC',
    'Tangent matrices 55 IC',
]

# Diffusion: connectome features, four metrics per parcellation
dti_parcellations = [
    'aparc a2009s MSA I Connectome FA',
    'aparc a2009s MSA I Connectome Mean Length',
    'aparc a2009s MSA I Connectome SIFT2',
    'aparc a2009s MSA I Connectome Streamline Count',
    'aparc MSA I Connectome FA',
    'aparc MSA I Connectome Mean Length',
    'aparc MSA I Connectome SIFT2',
    'aparc MSA I Connectome Streamline Count',
    'Glasser MSA I Connectome FA',
    'Glasser MSA I Connectome Mean Length',
    'Glasser MSA I Connectome SIFT2',
    'Glasser MSA I Connectome Streamline Count',
    'Glasser MSA IV Connectome FA',
    'Glasser MSA IV Connectome Mean Length',
    'Glasser MSA IV Connectome SIFT2',
    'Glasser MSA IV Connectome Streamline Count',
    'Schaefer7n200p MSA I Connectome FA',
    'Schaefer7n200p MSA I Connectome Mean Length',
    'Schaefer7n200p MSA I Connectome SIFT2',
    'Schaefer7n200p MSA I Connectome Streamline Count',
    'Schaefer7n500p MSA IV Connectome FA',
    'Schaefer7n500p MSA IV Connectome Mean Length',
    'Schaefer7n500p MSA IV Connectome SIFT2',
    'Schaefer7n500p MSA IV Connectome Streamline Count',
]

# Diffusion: TBSS / Probabilistic pairs, one pair per metric
dti_idp = [
    'FA TBSS', 'FA Probabilistic',
    'MD TBSS', 'MD Probabilistic',
    'L1 TBSS', 'L1 Probabilistic',
    'L2 TBSS', 'L2 Probabilistic',
    'L3 TBSS', 'L3 Probabilistic',
    'MO TBSS', 'MO Probabilistic',
    'OD TBSS', 'OD Probabilistic',
    'ICVF TBSS', 'ICVF Probabilistic',
    'ISOVF TBSS', 'ISOVF Probabilistic',
]

# T1/T2 feature sets
t1t2_modalities = [
    'FSL FAST',
    'FSL FIRST',
    'ASEG Mean Thickness',
    'ASEG Volume',
    'BA ex-vivo Area',
    'BA ex-vivo Mean Thickness',
    'BA ex-vivo Volume',
    'aparc a2009s Area',
    'aparc a2009s Mean Thickness',
    'aparc a2009s volume',
    'Desikan-Killiany-Tourville Area',
    'Desikan-Killiany-Tourville Mean Thickness',
    'Desikan-Killiany-Tourville volume',
    'Desikan Grey/White Matter intensity',
    'Desikan pial',
    'Desikan White Matter Area',
    'Desikan White Matter Mean Thickness',
    'Desikan White Matter volume',
    'Subcortical Volumetric Subsegmentation',
    'Whole-brain T1/T2',
]

# FINAL PLOT: All modalities on one plot, three subplots, only r, no stacked

________________________________________________________________________________________

In [None]:
# Reload the renamed Pearson r tables from disk
rs_bootstrapped_r = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/rs/rs_bootstrapped_r_renamed.csv')
dti_bootstrapped_r = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/dti/dti_bootstrapped_r_renamed.csv')
t1t2_bootstrapped_r = pd.read_csv('/PLS/brain/stacking/BOOTSTRAP/pls_vs_stack/t1/t1t2_bootstrapped_r_renamed.csv')

# Adjust a few column labels for the figure
t1t2_bootstrapped_r_short_names = t1t2_bootstrapped_r.rename(
    columns={
        'Subcor. Volumetric Subsegment.': 'Subcortical Volumetric Subseg.',
        'ASEG Mean Thickness': 'ASEG Mean Intensity',
    }
)
# str.replace returns the label unchanged when the phrase is absent, so no membership test is needed
rs_bootstrapped_r_short_names = rs_bootstrapped_r.rename(
    columns=lambda label: label.replace('Functional Connectivity', 'Func. Connectivity')
)

# Long format (one row per bootstrap value), tagged with the source modality in 'mod'
rs_bootstrapped_r_melt = (
    rs_bootstrapped_r_short_names
    .melt(var_name="modality", value_name='r')
    .assign(mod='rs')
)
dti_bootstrapped_r_melt = (
    dti_bootstrapped_r
    .melt(var_name="modality", value_name='r')
    .assign(mod='dti')
)
t1t2_bootstrapped_r_melt = (
    t1t2_bootstrapped_r_short_names
    .melt(var_name="modality", value_name='r')
    .assign(mod='t1t2')
)
#all_bootstrapped_r = pd.concat([rs_bootstrapped_r_melt, dti_bootstrapped_r_melt, t1t2_bootstrapped_r_melt])
#all_bootstrapped_r_melt = all_bootstrapped_r.melt(id_vars=['modality', 'mod'], var_name='variable', value_name='value').drop(columns = 'variable')

In [None]:
# Define modalities: after renaming
# (plot labels, grouped by MRI modality and feature family; these replace the
# lists of the same names that hold the original column names)

# Resting-state: parcellation-based functional connectivity
rs_parcellations = [
    'aparc-I Func. Connectivity',
    'aparc.a2009s-I Func. Connectivity',
    'Glasser-I Func. Connectivity',
    'Glasser-IV Func. Connectivity',
    'Schaefer200-I Func. Connectivity',
    'Schaefer500-IV Func. Connectivity',
]

# Resting-state: IC amplitudes and IC functional connectivity
rs_idp = [
    '55 IC Amplitudes',
    '21 IC Amplitudes',
    '21 IC Func. Connectivity',
    '55 IC Func. Connectivity',
]

# Diffusion: connectome features, four metrics per parcellation
dti_parcellations = [
    'aparc.a2009s-I FA',
    'aparc.a2009s-I Mean Length',
    'aparc.a2009s-I SIFT2',
    'aparc.a2009s-I Streamline Count',
    'aparc-I FA',
    'aparc-I Mean Length',
    'aparc-I SIFT2',
    'aparc-I Streamline Count',
    'Glasser-I FA',
    'Glasser-I Mean Length',
    'Glasser-I SIFT2',
    'Glasser-I Streamline Count',
    'Glasser-IV FA',
    'Glasser-IV Mean Length',
    'Glasser-IV SIFT2',
    'Glasser-IV Streamline Count',
    'Schaefer200-I FA',
    'Schaefer200-I Mean Length',
    'Schaefer200-I SIFT2',
    'Schaefer200-I Streamline Count',
    'Schaefer500-IV FA',
    'Schaefer500-IV Mean Length',
    'Schaefer500-IV SIFT2',
    'Schaefer500-IV Streamline Count',
]

# Diffusion: TBSS / Prob. pairs, one pair per metric
dti_idp = [
    'FA TBSS', 'FA Prob.',
    'MD TBSS', 'MD Prob.',
    'L1 TBSS', 'L1 Prob.',
    'L2 TBSS', 'L2 Prob.',
    'L3 TBSS', 'L3 Prob.',
    'MO TBSS', 'MO Prob.',
    'OD TBSS', 'OD Prob.',
    'ICVF TBSS', 'ICVF Prob.',
    'ISOVF TBSS', 'ISOVF Prob.',
]

# T1/T2 feature sets
t1t2_modalities = [
    'FSL FAST',
    'FSL FIRST',
    'ASEG Mean Intensity',
    'ASEG Volume',
    'BA ex-vivo Area',
    'BA ex-vivo Mean Thickness',
    'BA ex-vivo Volume',
    'aparc.a2009s Area',
    'aparc.a2009s Mean Thickness',
    'aparc.a2009s Volume',
    'DKT Area',
    'DKT Mean Thickness',
    'DKT Volume',
    'Desikan GM/WM Intensity',
    'Desikan Pial',
    'Desikan WM Area',
    'Desikan WM Mean Thickness',
    'Desikan WM Volume',
    'Subcortical Volumetric Subseg.',
    'Whole-brain T1/T2',
]

In [None]:
# Define color map: every label in a feature group gets that group's rgba colour.
# Groups are applied in order, so a label present in two groups keeps the later colour.
color_mapping_violin = {}
for group_labels, group_rgba in (
    (rs_parcellations, 'rgba(121, 175, 151, 1.0)'),   # mediumaquamarine
    (rs_idp, 'rgba(75, 111, 90, 0.6)'),               # seagreen
    (dti_parcellations, 'rgba(31, 120, 180, 1.0)'),   # steelblue
    (dti_idp, 'rgba(0, 161, 213, 0.6)'),              # dodgerblue
    (t1t2_modalities, 'rgba(247, 200, 158, 1.0)'),    # wheat
):
    color_mapping_violin.update(dict.fromkeys(group_labels, group_rgba))

# Colours for the stacked models
color_mapping_violin.update({
    'rsMRI Stacked': 'rgba(210, 105, 105, 0.6)',          # salmon
    'dwMRI Stacked': 'rgba(106, 101, 153, 0.6)',          # blue
    'sMRI Stacked': 'rgba(223, 143, 68, 1.0)',            # darkgoldenrod
    'MRI Modalities Stacked': 'rgba(178, 71, 69, 1.0)',   # Crimson
})

In [None]:
# Define color map (rgba strings); this rebuilds `color_mapping_violin` from scratch.
group_palette = [
    # rsMRI - Greens/Teals
    (rs_parcellations, 'rgba(15, 77, 18, 1.0)'),
    (rs_idp, 'rgba(17, 173, 25, 1.0)'),
    # dwMRI - Blues
    (dti_parcellations, 'rgba(37, 95, 122, 1.0)'),
    (dti_idp, 'rgba(62, 161, 207, 1.0)'),
    # sMRI - Warm tones
    (t1t2_modalities, 'rgba(201, 168, 20, 1.0)'),
]

# One entry per label; groups are visited in list order, so later groups win on duplicates
color_mapping_violin = {
    label: rgba
    for labels, rgba in group_palette
    for label in labels
}

# Stacked models
color_mapping_violin['rsMRI Stacked'] = 'rgba(210, 105, 105, 0.6)'  # salmon
color_mapping_violin['dwMRI Stacked'] = 'rgba(106, 101, 153, 0.6)'  # blue
color_mapping_violin['sMRI Stacked'] = 'rgba(223, 143, 68, 1.0)'  # darkgoldenrod
color_mapping_violin['MRI Modalities Stacked'] = 'rgba(178, 71, 69, 1.0)'  # Crimson

In [None]:
# Prepare data frames
def _sorted_without(melted, excluded_labels):
    """Sort long-format rows by ascending 'r', then drop rows whose 'modality' is excluded."""
    ordered = melted.sort_values(by='r', ascending=True).reset_index(drop=True)
    for label in excluded_labels:
        ordered = ordered[ordered['modality'] != label]
    return ordered


def _first_and_last_median(melt_sorted):
    """Median 'r' per modality, in order of first appearance, keeping only the first and last row."""
    per_modality = melt_sorted.groupby('modality')['r'].median().reset_index()
    appearance_order = melt_sorted['modality'].unique()
    per_modality = per_modality.set_index('modality').loc[appearance_order].reset_index()
    return pd.concat([per_modality.head(1), per_modality.tail(1)]).reset_index(drop=True)


# Individual feature sets only: both stacked-model columns are removed from each table
rs_bootstrapped_r_melt_sort = _sorted_without(
    rs_bootstrapped_r_melt,
    ['All MRI Modalities Stacked (XGB)', 'rsMRI Stacked (RF)'],
)
dti_bootstrapped_r_melt_sort = _sorted_without(
    dti_bootstrapped_r_melt,
    ['All MRI Modalities Stacked (XGB)', 'dwMRI Stacked (RF)'],
)
t1t2_bootstrapped_r_melt_sort = _sorted_without(
    t1t2_bootstrapped_r_melt,
    ['All MRI Modalities Stacked (XGB)', 'T1w/T2w Structural Stacked (SVR)'],
)

# Calculate median values for each modality, ordered like the sorted data
rs_median = _first_and_last_median(rs_bootstrapped_r_melt_sort)
dti_median = _first_and_last_median(dti_bootstrapped_r_melt_sort)
t1t2_median = _first_and_last_median(t1t2_bootstrapped_r_melt_sort)

In [None]:
# Violins: bootstrap distributions of Pearson r, one subplot per MRI modality
fig = make_subplots(rows=1, cols=3, shared_yaxes=False, subplot_titles=("dwMRI", "rsMRI", "sMRI"),
                    horizontal_spacing=0.27)


def add_half_violin_trace(fig, data, row, col, color_mapping):
    """Add one horizontal half-violin per modality to a subplot.

    Parameters
    ----------
    fig : plotly figure that receives the traces.
    data : long-format DataFrame with 'modality' and 'r' columns.
    row, col : 1-based subplot position.
    color_mapping : dict mapping modality label -> colour; labels missing
        from the dict are drawn in black.
    """
    for modality in data['modality'].unique():
        subset = data[data['modality'] == modality]
        colour = color_mapping.get(modality, 'black')
        fig.add_trace(
            go.Violin(x=subset['r'], y=subset['modality'], orientation='h', side='positive', name=modality,
                      box_visible=False, meanline_visible=False, points=False,
                      line_color=colour, fillcolor=colour, line_width=1.2, showlegend=False),
            row=row, col=col
        )


def add_median_trace(fig, medians, row, col):
    """Add a small grey marker with a two-decimal text label for each row of `medians`.

    Parameters
    ----------
    fig : plotly figure that receives the traces.
    medians : DataFrame with 'modality' and 'r' columns (one row per labelled median).
    row, col : 1-based subplot position.
    """
    # Pairing the columns directly avoids relying on a 0..n-1 index
    for modality, median_value in zip(medians['modality'], medians['r']):
        fig.add_trace(
            go.Scatter(
                x=[median_value],
                y=[modality],
                mode='markers+text',
                marker=dict(color='grey', size=3),
                showlegend=False,
                text=[f"{median_value:.2f}"],
                textposition="bottom center",
                textfont=dict(size=23, family="DejaVu Sans", color='black')
            ),
            row=row, col=col
        )


# Add half-violin plots for each modality
add_half_violin_trace(fig, dti_bootstrapped_r_melt_sort, 1, 1, color_mapping_violin)
add_half_violin_trace(fig, rs_bootstrapped_r_melt_sort, 1, 2, color_mapping_violin)
add_half_violin_trace(fig, t1t2_bootstrapped_r_melt_sort, 1, 3, color_mapping_violin)

# Add median markers for each subplot
add_median_trace(fig, dti_median, 1, 1)
add_median_trace(fig, rs_median, 1, 2)
add_median_trace(fig, t1t2_median, 1, 3)

# Create custom legend.
# Legend colours are looked up in `color_mapping_violin` (every label of a group shares
# one colour there) instead of being hard-coded, so the legend always matches the violins
# whichever palette cell defined the mapping.
legend_groups = [
    ('rsMRI IDPs', rs_idp),
    ('rsMRI Parcellations', rs_parcellations),
    ('dwMRI IDPs', dti_idp),
    ('dwMRI Parcellations', dti_parcellations),
    ('sMRI', t1t2_modalities),
]
legend_elements = [
    go.Scatter(x=[None], y=[None], mode='lines',
               line=dict(color=color_mapping_violin.get(group_labels[0], 'black'), width=6),
               name=legend_name, showlegend=True)
    for legend_name, group_labels in legend_groups
]

# Add custom legend to the figure (empty traces that only contribute legend entries)
for trace in legend_elements:
    fig.add_trace(trace)

# Shared axis styling for the three subplots
for i in range(1, 4):
    fig.update_xaxes(showline=True, linewidth=0.5, linecolor='grey', mirror=True, dtick=0.1,
                     range=[0.05, 0.35], row=1, col=i, tickangle=-50, tickfont=dict(size=33, family="DejaVu Sans", color='black'),
                     tickvals=[0.1, 0.15, 0.2, 0.25, 0.3], ticktext=['0.1', '0.15', '0.2', '0.25', '0.3'])
    fig.update_yaxes(showline=True, linewidth=0.5, linecolor='grey', mirror=True, row=1, col=i, tickfont=dict(size=15, family="DejaVu Sans", color='black'))

# Per-subplot y-axis label sizes (these override the size set in the loop above)
fig.update_yaxes(tickfont=dict(size=22, family="DejaVu Sans", color='black'), row=1, col=1)
fig.update_yaxes(tickfont=dict(size=26.5, family="DejaVu Sans", color='black'), row=1, col=2)
fig.update_yaxes(tickfont=dict(size=28, family="DejaVu Sans", color='black'), row=1, col=3)

# Update layout
fig.update_layout(
    height=6.0*300,  # 1800 px
    width=7.3*300,   # 2190 px
    plot_bgcolor='white', paper_bgcolor='white',
    showlegend=True,

    title=dict(
        text="Bootstrap Distribution of Pearson <i>r</i>: PLSR",
        x=0.5,
        y=0.95,
        xanchor='center',
        font=dict(size=55, family="DejaVu Sans", color='black')
    ),
    margin=dict(t=250),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.15,
        xanchor="center",
        x=0.5,
        font=dict(size=45, family="DejaVu Sans", color='black')),
    annotations=[
        dict(
            text="dwMRI", x=0.08, y=1, xref="paper", yref="paper",
            showarrow=False, font=dict(size=40, family="DejaVu Sans", color='black')),
        dict(
            text="rsMRI", x=0.498, y=1, xref="paper", yref="paper",
            showarrow=False, font=dict(size=40, family="DejaVu Sans", color='black')),
        dict(
            text="sMRI", x=0.915, y=1, xref="paper", yref="paper",
            showarrow=False, font=dict(size=40, family="DejaVu Sans", color='black'))
    ],
)

# Dotted red vertical line at the left edge of the x-range (x = 0.05) in every subplot.
# NOTE(review): the line sits at x = 0.05 but is annotated "0" — presumably it flags that
# the axis does not start at zero; confirm this label is intended.
for col in [1, 2, 3]:
    fig.add_vline(
        x=0.05,
        line=dict(color="red", width=2, dash="dot"),
        opacity=0.8,
        row=1,
        col=col,
        annotation_text="0",
        annotation_position="bottom right",
        annotation_font=dict(
            size=24,
            color="#660000"
        )
    )

pio.write_image(fig, '/figures/Fig3b.png', scale=1, format='png')
fig.show()

In [None]:
# CI line plots: bootstrap confidence interval and median of Pearson r per modality
fig = make_subplots(rows=1, cols=3, shared_yaxes=False, subplot_titles=("dwMRI", "rsMRI", "sMRI"),
                    horizontal_spacing=0.27)


def add_ci_trace(fig, data, row, col, color_mapping):
    """Draw, for every modality, a dashed CI line with end ticks and a median tick.

    The interval comes from `get_bootstrap_ci` with its default confidence level.
    The first and last modality (in order of appearance in `data`) additionally
    get their lower/upper bounds printed below the line ends.

    Parameters
    ----------
    fig : plotly figure that receives the traces.
    data : long-format DataFrame with 'modality' and 'r' columns.
    row, col : 1-based subplot position.
    color_mapping : dict mapping modality label -> colour; labels missing
        from the dict are drawn in black.
    """
    modalities = data['modality'].unique()
    last_position = len(modalities) - 1

    for i, modality in enumerate(modalities):
        values = data[data['modality'] == modality]['r']
        colour = color_mapping.get(modality, 'black')

        median = np.median(values)
        lower, upper = get_bootstrap_ci(values)

        # Main CI line with end markers
        fig.add_trace(
            go.Scatter(
                x=[lower, upper],
                y=[modality, modality],
                mode='lines+markers',
                line=dict(color=colour, dash='longdash', width=3),
                marker=dict(
                    color=colour,
                    size=8,
                    symbol='line-ns-open',  # vertical tick at each end
                    line=dict(width=1.5),
                    opacity=1
                ),
                showlegend=False
            ),
            row=row, col=col
        )

        # Median marker (vertical tick outlined in dark red)
        fig.add_trace(
            go.Scatter(
                x=[median],
                y=[modality],
                mode='markers+text',
                marker=dict(
                    symbol='line-ns',
                    color=colour,
                    size=4,
                    line=dict(color='#660000', width=2),
                    opacity=1
                ),
                showlegend=False
            ),
            row=row, col=col
        )

        # Numeric CI bounds only for the first and last modality
        if i == 0 or i == last_position:
            for bound, position in ((lower, "bottom left"), (upper, "bottom right")):
                fig.add_trace(
                    go.Scatter(
                        x=[bound],
                        y=[modality],
                        mode='text',
                        text=[f"{bound:.2f}"],
                        textposition=position,
                        textfont=dict(size=22, family="DejaVu Sans", color='black'),
                        showlegend=False
                    ),
                    row=row, col=col
                )


# CI lines for each modality
add_ci_trace(fig, dti_bootstrapped_r_melt_sort, 1, 1, color_mapping_violin)
add_ci_trace(fig, rs_bootstrapped_r_melt_sort, 1, 2, color_mapping_violin)
add_ci_trace(fig, t1t2_bootstrapped_r_melt_sort, 1, 3, color_mapping_violin)

# Create custom legend.
# Legend colours are looked up in `color_mapping_violin` (every label of a group shares
# one colour there) rather than copied by hand, so the legend cannot drift from the
# colours used for the CI lines.
legend_groups = [
    ('dwMRI IDPs', dti_idp),
    ('dwMRI Parcellations', dti_parcellations),
    ('rsMRI IDPs', rs_idp),
    ('rsMRI Parcellations', rs_parcellations),
    ('sMRI', t1t2_modalities),
]
legend_elements = [
    go.Scatter(x=[None], y=[None], mode='lines',
               line=dict(color=color_mapping_violin.get(group_labels[0], 'black'), width=6),
               name=legend_name, showlegend=True)
    for legend_name, group_labels in legend_groups
]

# Add custom legend to the figure (empty traces that only contribute legend entries)
for trace in legend_elements:
    fig.add_trace(trace)

# Shared x-axis / frame styling; y tick-label size differs per subplot
y_tick_sizes = {1: 22, 2: 26.5, 3: 28}
for i in range(1, 4):
    fig.update_xaxes(showline=True, linewidth=0.5, linecolor='grey', mirror=True, dtick=0.1,
                     range=[-0.01, 0.38], row=1, col=i, tickangle=-50, tickfont=dict(size=33, family="DejaVu Sans", color='black'),
                     tickvals=[-0.01, 0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.30, 0.35], ticktext=['', '0', '0.05', '0.1', '0.15', '0.2', '0.25', '0.30', '0.35'])
    fig.update_yaxes(showline=True, linewidth=0.5, linecolor='grey', mirror=True, row=1, col=i,
                     tickfont=dict(size=y_tick_sizes[i], family="DejaVu Sans", color='black'))

# Update layout
fig.update_layout(
    height=6.0*300,  # 1800 px
    width=7.3*300,   # 2190 px
    plot_bgcolor='white', paper_bgcolor='white',
    showlegend=True,

    title=dict(
        text="Bootstrap Distribution of Pearson <i>r</i>: PLSR",
        x=0.5,
        y=0.95,
        xanchor='center',
        font=dict(size=55, family="DejaVu Sans", color='black')
    ),
    margin=dict(t=250),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.15,
        xanchor="center",
        x=0.5,
        font=dict(size=45, family="DejaVu Sans", color='black')),
    annotations=[
        dict(
            text="dwMRI", x=0.08, y=1, xref="paper", yref="paper",
            showarrow=False, font=dict(size=40, family="DejaVu Sans", color='black')),
        dict(
            text="rsMRI", x=0.498, y=1, xref="paper", yref="paper",
            showarrow=False, font=dict(size=40, family="DejaVu Sans", color='black')),
        dict(
            text="sMRI", x=0.915, y=1, xref="paper", yref="paper",
            showarrow=False, font=dict(size=40, family="DejaVu Sans", color='black'))
    ],
)

# Dashed dark-red vertical reference line at x = 0.0009 in every subplot.
# NOTE(review): presumably this marks r = 0 with a small offset — confirm the offset is intended.
for col in [1, 2, 3]:
    fig.add_vline(
        x=0.0009,
        line=dict(color="#660000", width=2, dash="dash"),
        opacity=0.8,
        row=1,
        col=col,
    )

pio.write_image(fig, '/figures/Fig3b-ci.pdf', scale=1, format='pdf')
fig.show()

In [None]:
# Hex shades of dark red, listed from lightest to darkest.
dark_red, blood_red, darker_red = "#8B0000", "#660000", "#5C0000"

# dark_red (#8B0000) again in functional notation: fully opaque, then at 80% opacity.
dark_red_rgb, dark_red_rgba = "rgb(139, 0, 0)", "rgba(139, 0, 0, 0.8)"

# KDE plots for stacked modalities

In [None]:
# Prepare data frames: sort each modality's bootstrap results by r (ascending)
# and keep only the rows of its stacked model.
rs_bootstrapped_r_melt_sort = rs_bootstrapped_r_melt.sort_values(by='r', ascending=True).reset_index(drop=True)
rs_bootstrapped_r_melt_sort = rs_bootstrapped_r_melt_sort[rs_bootstrapped_r_melt_sort['modality'] == 'rsMRI Stacked (RF)']

dti_bootstrapped_r_melt_sort = dti_bootstrapped_r_melt.sort_values(by='r', ascending=True).reset_index(drop=True)
dti_bootstrapped_r_melt_sort = dti_bootstrapped_r_melt_sort[dti_bootstrapped_r_melt_sort['modality'] == 'dwMRI Stacked (RF)']

# The t1t2 frame supplies two selections: the all-modality stacked model and
# the structural stacked model.
t1t2_bootstrapped_r_melt_sort = t1t2_bootstrapped_r_melt.sort_values(by='r', ascending=True).reset_index(drop=True)
# .assign builds a new frame, so relabelling 'mod' never writes into a slice of
# the sorted frame (the original in-place assignment triggered
# SettingWithCopyWarning).
all_stacked = (
    t1t2_bootstrapped_r_melt_sort[t1t2_bootstrapped_r_melt_sort['modality'] == 'All MRI Modalities Stacked (XGB)']
    .assign(mod=lambda frame: frame['mod'].replace('t1t2', 'all'))
)
t1t2_bootstrapped_r_melt_sort = t1t2_bootstrapped_r_melt_sort[t1t2_bootstrapped_r_melt_sort['modality'] == 'T1w/T2w Structural Stacked (SVR)']


def _first_and_last_modality_median(sorted_frame):
    """Return the median r of the first and the last modality in `sorted_frame`.

    Medians are computed per 'modality', reordered to the order in which the
    modalities first appear in `sorted_frame`, and reduced to the first and
    last rows (columns: 'modality', 'r').

    NOTE: when `sorted_frame` holds a single modality, as the frames filtered
    above do, head(1) and tail(1) pick the same row and the result contains
    that row twice.
    """
    medians = sorted_frame.groupby('modality')['r'].median().reset_index()
    medians = medians.set_index('modality').loc[sorted_frame['modality'].unique()].reset_index()
    return pd.concat([medians.head(1), medians.tail(1)]).reset_index(drop=True)


# Calculate median values for each modality, in the order of the sorted data.
rs_median = _first_and_last_modality_median(rs_bootstrapped_r_melt_sort)
dti_median = _first_and_last_modality_median(dti_bootstrapped_r_melt_sort)
t1t2_median = _first_and_last_modality_median(t1t2_bootstrapped_r_melt_sort)

In [None]:
# Stack the four stacked-model frames into one long frame for plotting.
# ignore_index=True gives the result a fresh 0..n-1 index: the inputs carry
# overlapping integer indexes, and duplicate index labels can break
# label-aligned operations later on.
combined_df = pd.concat(
    [dti_bootstrapped_r_melt_sort, rs_bootstrapped_r_melt_sort, t1t2_bootstrapped_r_melt_sort, all_stacked],
    ignore_index=True,
)

# Shorten the modality names to the labels used in the figure legend.
combined_df['modality'] = combined_df['modality'].replace({
    'All MRI Modalities Stacked (XGB)': 'MRI Modalities Stacked',
    'dwMRI Stacked (RF)': 'dwMRI Stacked',
    'rsMRI Stacked (RF)': 'rsMRI Stacked',
    'T1w/T2w Structural Stacked (SVR)': 'sMRI Stacked',
})

In [None]:
# Figure 3a: overlaid KDEs of the bootstrapped Pearson r, one curve per
# stacked model, with each model's median marked by a dashed line.
fig, ax = plt.subplots(figsize=(18, 4))

# Fill colour per legend label (some entries are 8-digit hex, i.e. include alpha).
custom_palette = {
    'dwMRI Stacked': '#0d648f',
    'rsMRI Stacked': '#4B6F5A99',
    'sMRI Stacked': '#f8b976',
    'MRI Modalities Stacked': '#6A659999'
}

# common_norm=False normalises every curve on its own; sns.kdeplot returns the Axes.
kde = sns.kdeplot(
    data=combined_df, x="r", hue="modality",
    fill=True, common_norm=False, palette=custom_palette,
    alpha=.5, linewidth=1, legend=False, ax=ax,
)

# Mark each model's median with a dashed line and a two-decimal label.
# The label height is read once: neither axvline nor text changes the y-limits.
median_label_y = ax.get_ylim()[1] * 0.9
for mod in combined_df['modality'].unique():
    median_r = combined_df[combined_df['modality'] == mod]['r'].median()
    ax.axvline(median_r, color='grey', linestyle='--', linewidth=0.7)
    ax.text(median_r + 0.003, median_label_y, f'{median_r:.2f}', color='black', fontsize=25)

sns.despine(ax=ax, left=True)
ax.set_title("Bootstrap Distribution of Pearson $r$:\nStacked MRI Modalities", fontsize=40, y=1.3)
ax.set_xlabel("Pearson $r$", fontsize=25)
ax.set_ylabel("Density", fontsize=25)
ax.tick_params(axis='y', labelsize=20)

# Put x ticks on exact multiples of the step that span the data. Starting the
# grid at the raw minimum would place ticks at arbitrary offsets while the
# two-decimal labels showed rounded values, so labels and positions disagreed.
tick_step = 0.02
first_tick_index = np.floor(combined_df['r'].min() / tick_step)
last_tick_index = np.ceil(combined_df['r'].max() / tick_step)
x_ticks = np.arange(first_tick_index, last_tick_index + 1) * tick_step + 0.0  # + 0.0 turns -0.0 into 0.0
ax.set_xticks(x_ticks)
ax.set_xticklabels([f'{x:.2f}' for x in x_ticks], fontsize=17)

# The legend is built by hand from the palette because kdeplot's own legend is disabled.
legend_elements = [mpatches.Patch(color=color, label=mod, alpha=.5) for mod, color in custom_palette.items()]
fig.legend(handles=legend_elements, loc='lower center', ncol=4, fontsize=26, frameon=False, bbox_to_anchor=(0.5, -0.4))

plt.savefig("/figures/Fig3a.jpg",
            bbox_inches="tight", 
            pad_inches=1, 
            transparent=False, 
            facecolor="w", 
            edgecolor='w', 
            orientation='landscape',
            format='jpg')
plt.show()