In [2]:
import csv
import os
import random
import pickle
import gc
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import sklearn
import warnings
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats

In [3]:
def analyze_g_matched(fold, paths):
    """Summarise the g-matched stacking tables saved for one fold.

    Parameters
    ----------
    fold
        Value substituted into every path template via ``str.format``.
    paths : dict
        Must provide ``paths['stacking_output']`` with the templates
        ``train_g_matched_inner``, ``test_g_matched_inner``,
        ``train_g_matched_outer`` and ``test_g_matched_outer``.

    Returns
    -------
    dict
        ``Fold``; row counts of the four tables (``N Train Inner``,
        ``N Test Inner``, ``N Train Outer``, ``N Test Outer``); ``N Features``
        (number of inner-train columns left after dropping ``eid`` and ``g``);
        and the remaining column names of the inner train/test tables under
        ``ncols Train`` / ``ncols Test``.
    """
    templates = paths['stacking_output']

    def _read(key):
        # Resolve the template for this fold and load the CSV it points at.
        return pd.read_csv(templates[key].format(fold))

    # The inner tables lose the id and target columns so only features remain.
    non_feature_columns = ['eid', 'g']
    inner_train = _read('train_g_matched_inner').drop(columns=non_feature_columns)
    inner_test = _read('test_g_matched_inner').drop(columns=non_feature_columns)

    # The outer tables are loaded unchanged; only their row counts are reported.
    outer_train = _read('train_g_matched_outer')
    outer_test = _read('test_g_matched_outer')

    return {
        'Fold': fold,
        'N Train Inner': inner_train.shape[0],
        'N Test Inner': inner_test.shape[0],
        'N Train Outer': outer_train.shape[0],
        'N Test Outer': outer_test.shape[0],
        'N Features': len(inner_train.columns),
        'ncols Train': inner_train.columns.tolist(),
        'ncols Test': inner_test.columns.tolist(),
    }

# Prepare data

In [None]:
# Shared configuration: fold indices 0-4 and the project root directory
# that every input/output path below is built from.
folds = range(5)
base_path = '/UK_BB/brainbody'

## Brain

In [11]:
# Define modalities
# Each name is substituted into the per-fold prediction file template
# ('{}_g_pred_XGB_{train,test}_with_id_fold_{}.csv') by the stacking cells,
# so every string must match the file names on disk exactly.

# Names used by the sMRI stacking cell.
modalities_smri = [
    'struct_fast',
    'struct_sub_first',
    'struct_fs_aseg_mean_intensity',
    'struct_fs_aseg_volume',
    'struct_ba_exvivo_area',
    'struct_ba_exvivo_mean_thickness',
    'struct_ba_exvivo_volume',
    'struct_a2009s_area',
    'struct_a2009s_mean_thickness',
    'struct_a2009s_volume',
    'struct_dkt_area',
    'struct_dkt_mean_thickness',
    'struct_dkt_volume',
    'struct_desikan_gw',
    'struct_desikan_pial',
    'struct_desikan_white_area',
    'struct_desikan_white_mean_thickness',
    'struct_desikan_white_volume',
    'struct_subsegmentation',
    'add_t1',
    'add_t2',
]
print('smri:', len(modalities_smri))

# Names used by the dwMRI stacking cell: the dwi_* entries first, then the
# per-atlas '*_i2' entries.
modalities_dwi = [
    "dwi_FA_tbss", "dwi_FA_prob",
    "dwi_MD_tbss", "dwi_MD_prob",
    "dwi_L1_tbss", "dwi_L1_prob",
    "dwi_L2_tbss", "dwi_L2_prob",
    "dwi_L3_tbss", "dwi_L3_prob",
    "dwi_MO_tbss", "dwi_MO_prob",
    "dwi_OD_tbss", "dwi_OD_prob",
    "dwi_ICVF_tbss", "dwi_ICVF_prob",
    "dwi_ISOVF_tbss", "dwi_ISOVF_prob",

    'aparc_Tian_S1_FA_i2',
    'aparc_Tian_S1_Length_i2',
    'aparc_Tian_S1_SIFT2_FBC_i2',
    'aparc_Tian_S1_Streamline_Count_i2',

    'aparc_a2009s_Tian_S1_FA_i2',
    'aparc_a2009s_Tian_S1_Length_i2',
    'aparc_a2009s_Tian_S1_SIFT2_FBC_i2',
    'aparc_a2009s_Tian_S1_Streamline_Count_i2',

    'Glasser_Tian_S1_FA_i2',
    'Glasser_Tian_S1_Length_i2',
    'Glasser_Tian_S1_SIFT2_FBC_i2',
    'Glasser_Tian_S1_Streamline_Count_i2',

    'Glasser_Tian_S4_FA_i2',
    'Glasser_Tian_S4_Length_i2',
    'Glasser_Tian_S4_SIFT2_FBC_i2',
    'Glasser_Tian_S4_Streamline_Count_i2',

    'Schaefer7n200p_Tian_S1_FA_i2',
    'Schaefer7n200p_Tian_S1_Length_i2',
    'Schaefer7n200p_Tian_S1_SIFT2_FBC_i2',
    'Schaefer7n200p_Tian_S1_Streamline_Count_i2',

    # NOTE(review): this atlas is 'Schaefer7n1000p' here but 'Schaefer7n500p'
    # in modalities_rs — confirm both match the file names on disk.
    'Schaefer7n1000p_Tian_S4_FA_i2',
    'Schaefer7n1000p_Tian_S4_Length_i2',
    'Schaefer7n1000p_Tian_S4_SIFT2_FBC_i2',
    'Schaefer7n1000p_Tian_S4_Streamline_Count_i2',
]
print('dwi:', len(modalities_dwi))

# Names used by the rsMRI stacking cell.
modalities_rs = [
    "amplitudes_21",
    "full_correlation_21",
    "partial_correlation_21",
    "amplitudes_55",
    "full_correlation_55",
    "partial_correlation_55",
    'full_correlation_aparc_a2009s_Tian_S1',
    'full_correlation_aparc_Tian_S1',
    'full_correlation_Glasser_Tian_S1',
    'full_correlation_Glasser_Tian_S4',
    'full_correlation_Schaefer7n200p_Tian_S1',
    'full_correlation_Schaefer7n500p_Tian_S4',
    'partial_correlation_aparc_a2009s_Tian_S1',
    'partial_correlation_aparc_Tian_S1',
    'partial_correlation_Glasser_Tian_S1',
    'partial_correlation_Glasser_Tian_S4',
    'partial_correlation_Schaefer7n200p_Tian_S1',
    'partial_correlation_Schaefer7n500p_Tian_S4',
]
print('rs:', len(modalities_rs))

# The combined list is derived from the three lists above (same elements, same
# order as the previous hand-written copy) so it can never drift out of sync.
# Concatenation builds a new list, so mutating it does not affect the sources.
modalities_mri = modalities_smri + modalities_dwi + modalities_rs

# A repeated name would make the stacking cells merge the same file twice.
assert len(set(modalities_mri)) == len(modalities_mri), 'duplicate modality name'

print('all mri:', len(modalities_mri))

smri: 21
dwi: 42
rs: 18
all mri: 81


### sMRI

In [None]:
# Stack the per-modality sMRI g predictions of every fold into one table per
# split (inner and outer join on 'eid'), then attach the observed g values.
base_path = '/UK_BB/brainbody'

# Directory templates; 'fold_{}' is filled in together with the file name.
g_pred_dir = os.path.join(base_path, 'brain', 'folds', 'fold_{}', 'g_pred')
g_obs_dir = os.path.join(base_path, 'cognition', 'folds', 'fold_{}', 'g')
stack_root = os.path.join(base_path, 'stacking', 'brain', 'smri')
train_inner_dir = os.path.join(stack_root, 'features_train_level1_stacked_inner')
train_outer_dir = os.path.join(stack_root, 'features_train_level1_outer')
test_inner_dir = os.path.join(stack_root, 'features_test_level1_stacked_inner')
test_outer_dir = os.path.join(stack_root, 'features_test_level1_outer')

paths = {
    # Formatted with (fold, modality, fold).
    'g_pred': {
        'train': os.path.join(g_pred_dir, '{}_g_pred_XGB_train_with_id_fold_{}.csv'),
        'test': os.path.join(g_pred_dir, '{}_g_pred_XGB_test_with_id_fold_{}.csv'),
    },
    # Formatted with (fold, fold).
    'g_observed': {
        'train': os.path.join(g_obs_dir, 'g_train_with_id_{}.csv'),
        'test': os.path.join(g_obs_dir, 'g_test_with_id_{}.csv'),
    },
    # Formatted with (fold,).
    'stacking_output': {
        'train_stacked_inner': os.path.join(train_inner_dir, 'features_train_level1_inner_fold_{}.csv'),
        'train_stacked_outer': os.path.join(train_outer_dir, 'features_train_level1_outer_fold_{}.csv'),
        'train_g_matched_inner': os.path.join(train_inner_dir, 'features_train_level1_inner_g_matched_fold_{}.csv'),
        'train_g_matched_outer': os.path.join(train_outer_dir, 'features_train_level1_outer_g_matched_fold_{}.csv'),
        'test_stacked_inner': os.path.join(test_inner_dir, 'features_test_inner_level1_fold_{}.csv'),
        'test_stacked_outer': os.path.join(test_outer_dir, 'features_test_level1_outer_fold_{}.csv'),
        'test_g_matched_inner': os.path.join(test_inner_dir, 'features_test_level1_inner_g_matched_fold_{}.csv'),
        'test_g_matched_outer': os.path.join(test_outer_dir, 'features_test_level1_outer_g_matched_fold_{}.csv'),
    },
}

# Output directories do not depend on the fold, so formatting with fold 0 is
# enough to create all of them.
for output_template in paths['stacking_output'].values():
    os.makedirs(os.path.dirname(output_template.format(0)), exist_ok=True)

folds = range(5)
# This filter is process-wide and stays active for later cells as well.
# NOTE(review): if the prediction files share a non-'eid' column name, pandas'
# duplicate-suffix warning is among the FutureWarnings hidden here — confirm
# the column names are unique per modality.
warnings.simplefilter('ignore', category=FutureWarning)

for fold in folds:
    print(f'Started {fold}', flush=True)
    outputs = paths['stacking_output']
    all_modalities_train, all_modalities_test = [], []

    # Load the train/test prediction table of every sMRI modality.
    for modality in modalities_smri:
        g_train_pred_level1 = pd.read_csv(paths['g_pred']['train'].format(fold, modality, fold))
        g_test_pred_level1 = pd.read_csv(paths['g_pred']['test'].format(fold, modality, fold))

        print(f'g TRAIN shape BEFORE stacking ({modality}):', g_train_pred_level1.shape)
        print(f'g TEST shape BEFORE stacking ({modality}):', g_test_pred_level1.shape)

        all_modalities_train.append(g_train_pred_level1)
        all_modalities_test.append(g_test_pred_level1)

    # ---- train split ----
    # Start from the first modality and join the rest on 'eid': the inner join
    # keeps eids present in every modality, the outer join keeps eids present
    # in any modality.
    features_train_level1_inner = features_train_level1_outer = all_modalities_train[0]
    for next_train in all_modalities_train[1:]:
        features_train_level1_inner = features_train_level1_inner.merge(next_train, on='eid', how='inner')
        features_train_level1_outer = features_train_level1_outer.merge(next_train, on='eid', how='outer')

    features_train_level1_inner.to_csv(outputs['train_stacked_inner'].format(fold), index=False)
    features_train_level1_outer.to_csv(outputs['train_stacked_outer'].format(fold), index=False)

    g_observed_train = pd.read_csv(paths['g_observed']['train'].format(fold, fold))

    # Inner join: only eids that also appear in the observed-g table survive.
    features_train_level1_inner_g_matched = features_train_level1_inner.merge(g_observed_train, on='eid', how='inner')
    features_train_level1_inner_g_matched.to_csv(outputs['train_g_matched_inner'].format(fold), index=False)

    # Left join: every eid of the outer-stacked table is kept; eids that exist
    # only in the observed-g table are not added.
    features_train_level1_outer_g_matched = features_train_level1_outer.merge(g_observed_train, on='eid', how='left')
    features_train_level1_outer_g_matched.to_csv(outputs['train_g_matched_outer'].format(fold), index=False)

    print('g TRAIN shape AFTER inner stacking:', features_train_level1_inner_g_matched.shape)
    print('g TRAIN shape AFTER outer stacking:', features_train_level1_outer_g_matched.shape)

    # ---- test split (same steps as for train) ----
    features_test_level1_inner = features_test_level1_outer = all_modalities_test[0]
    for next_test in all_modalities_test[1:]:
        features_test_level1_inner = features_test_level1_inner.merge(next_test, on='eid', how='inner')
        features_test_level1_outer = features_test_level1_outer.merge(next_test, on='eid', how='outer')

    features_test_level1_inner.to_csv(outputs['test_stacked_inner'].format(fold), index=False)
    features_test_level1_outer.to_csv(outputs['test_stacked_outer'].format(fold), index=False)

    g_observed_test = pd.read_csv(paths['g_observed']['test'].format(fold, fold))

    features_test_level1_inner_g_matched = features_test_level1_inner.merge(g_observed_test, on='eid', how='inner')
    features_test_level1_inner_g_matched.to_csv(outputs['test_g_matched_inner'].format(fold), index=False)

    features_test_level1_outer_g_matched = features_test_level1_outer.merge(g_observed_test, on='eid', how='left')
    features_test_level1_outer_g_matched.to_csv(outputs['test_g_matched_outer'].format(fold), index=False)

    print('g TEST shape AFTER inner stacking:', features_test_level1_inner_g_matched.shape)
    print('g TEST shape AFTER outer stacking:', features_test_level1_outer_g_matched.shape)

Started 0


g TRAIN shape BEFORE stacking (struct_fast): (21977, 2)
g TEST shape BEFORE stacking (struct_fast): (5474, 2)
g TRAIN shape BEFORE stacking (struct_sub_first): (21977, 2)
g TEST shape BEFORE stacking (struct_sub_first): (5474, 2)
g TRAIN shape BEFORE stacking (struct_fs_aseg_mean_intensity): (21977, 2)
g TEST shape BEFORE stacking (struct_fs_aseg_mean_intensity): (5474, 2)
g TRAIN shape BEFORE stacking (struct_fs_aseg_volume): (21977, 2)
g TEST shape BEFORE stacking (struct_fs_aseg_volume): (5474, 2)
g TRAIN shape BEFORE stacking (struct_ba_exvivo_area): (21977, 2)
g TEST shape BEFORE stacking (struct_ba_exvivo_area): (5474, 2)
g TRAIN shape BEFORE stacking (struct_ba_exvivo_mean_thickness): (21977, 2)
g TEST shape BEFORE stacking (struct_ba_exvivo_mean_thickness): (5474, 2)
g TRAIN shape BEFORE stacking (struct_ba_exvivo_volume): (21977, 2)
g TEST shape BEFORE stacking (struct_ba_exvivo_volume): (5474, 2)
g TRAIN shape BEFORE stacking (struct_a2009s_area): (21950, 2)
g TEST shape BEFO

In [None]:
# Analyze all folds
# Summarise the sMRI stacking outputs with analyze_g_matched and report whether
# the feature columns agree across folds and between train and test.
# Note: this rebinds the global `paths` to a dict holding only the
# 'stacking_output' templates (each formatted with the fold index).
smri_stack_root = os.path.join(base_path, 'stacking', 'brain', 'smri')
paths = {
    'stacking_output': {
        'train_stacked_inner': os.path.join(smri_stack_root, 'features_train_level1_stacked_inner', 'features_train_level1_inner_fold_{}.csv'),
        'train_stacked_outer': os.path.join(smri_stack_root, 'features_train_level1_outer', 'features_train_level1_outer_fold_{}.csv'),
        'train_g_matched_inner': os.path.join(smri_stack_root, 'features_train_level1_stacked_inner', 'features_train_level1_inner_g_matched_fold_{}.csv'),
        'train_g_matched_outer': os.path.join(smri_stack_root, 'features_train_level1_outer', 'features_train_level1_outer_g_matched_fold_{}.csv'),
        'test_stacked_inner': os.path.join(smri_stack_root, 'features_test_level1_stacked_inner', 'features_test_inner_level1_fold_{}.csv'),
        'test_stacked_outer': os.path.join(smri_stack_root, 'features_test_level1_outer', 'features_test_level1_outer_fold_{}.csv'),
        'test_g_matched_inner': os.path.join(smri_stack_root, 'features_test_level1_stacked_inner', 'features_test_level1_inner_g_matched_fold_{}.csv'),
        'test_g_matched_outer': os.path.join(smri_stack_root, 'features_test_level1_outer', 'features_test_level1_outer_g_matched_fold_{}.csv')
    }
}

results = [analyze_g_matched(fold, paths) for fold in folds]
results_df = pd.DataFrame(results)

# Same definition as before: every fold's train columns equal fold 0's
# (vacuously True when there are no folds).
reference_columns = set(results[0]['ncols Train']) if results else set()
all_features_consistent = all(
    set(r['ncols Train']) == reference_columns
    for r in results)
# Additional check: within each fold, test exposes the same feature columns as train.
train_test_features_match = all(
    set(r['ncols Test']) == set(r['ncols Train'])
    for r in results)

# Previously the consistency flag was computed but never shown.
print("Train feature columns identical across folds:", all_features_consistent)
print("Train and test feature columns match in every fold:", train_test_features_match)

print("Sample Sizes and Feature Counts:")
results_df[['Fold', 'N Train Inner', 'N Test Inner', 'N Train Outer', 'N Test Outer', 'N Features']]

Sample Sizes and Feature Counts:


Unnamed: 0,Fold,N Train Inner,N Test Inner,N Train Outer,N Test Outer,N Features
0,0,21710,5415,21977,5474,21
1,1,21665,5460,21920,5531,21
2,2,21702,5423,21965,5486,21
3,3,21760,5365,22031,5420,21
4,4,21663,5462,21911,5540,21


### dwMRI

In [None]:
# Stack the per-modality dwMRI g predictions of every fold into one table per
# split (inner and outer join on 'eid'), then attach the observed g values.
base_path = '/UK_BB/brainbody'

# Directory templates; 'fold_{}' is filled in together with the file name.
g_pred_dir = os.path.join(base_path, 'brain', 'folds', 'fold_{}', 'g_pred')
g_obs_dir = os.path.join(base_path, 'cognition', 'folds', 'fold_{}', 'g')
stack_root = os.path.join(base_path, 'stacking', 'brain', 'dwi')
train_inner_dir = os.path.join(stack_root, 'features_train_level1_stacked_inner')
train_outer_dir = os.path.join(stack_root, 'features_train_level1_outer')
test_inner_dir = os.path.join(stack_root, 'features_test_level1_stacked_inner')
test_outer_dir = os.path.join(stack_root, 'features_test_level1_outer')

paths = {
    # Formatted with (fold, modality, fold).
    'g_pred': {
        'train': os.path.join(g_pred_dir, '{}_g_pred_XGB_train_with_id_fold_{}.csv'),
        'test': os.path.join(g_pred_dir, '{}_g_pred_XGB_test_with_id_fold_{}.csv'),
    },
    # Formatted with (fold, fold).
    'g_observed': {
        'train': os.path.join(g_obs_dir, 'g_train_with_id_{}.csv'),
        'test': os.path.join(g_obs_dir, 'g_test_with_id_{}.csv'),
    },
    # Formatted with (fold,).
    'stacking_output': {
        'train_stacked_inner': os.path.join(train_inner_dir, 'features_train_level1_inner_fold_{}.csv'),
        'train_stacked_outer': os.path.join(train_outer_dir, 'features_train_level1_outer_fold_{}.csv'),
        'train_g_matched_inner': os.path.join(train_inner_dir, 'features_train_level1_inner_g_matched_fold_{}.csv'),
        'train_g_matched_outer': os.path.join(train_outer_dir, 'features_train_level1_outer_g_matched_fold_{}.csv'),
        'test_stacked_inner': os.path.join(test_inner_dir, 'features_test_inner_level1_fold_{}.csv'),
        'test_stacked_outer': os.path.join(test_outer_dir, 'features_test_level1_outer_fold_{}.csv'),
        'test_g_matched_inner': os.path.join(test_inner_dir, 'features_test_level1_inner_g_matched_fold_{}.csv'),
        'test_g_matched_outer': os.path.join(test_outer_dir, 'features_test_level1_outer_g_matched_fold_{}.csv'),
    },
}

# Output directories do not depend on the fold, so formatting with fold 0 is
# enough to create all of them.
for output_template in paths['stacking_output'].values():
    os.makedirs(os.path.dirname(output_template.format(0)), exist_ok=True)

folds = range(5)
# This filter is process-wide and stays active for later cells as well.
# NOTE(review): if the prediction files share a non-'eid' column name, pandas'
# duplicate-suffix warning is among the FutureWarnings hidden here — confirm
# the column names are unique per modality.
warnings.simplefilter('ignore', category=FutureWarning)

for fold in folds:
    print(f'Started {fold}', flush=True)
    outputs = paths['stacking_output']
    all_modalities_train, all_modalities_test = [], []

    # Load the train/test prediction table of every dwMRI modality.
    for modality in modalities_dwi:
        g_train_pred_level1 = pd.read_csv(paths['g_pred']['train'].format(fold, modality, fold))
        g_test_pred_level1 = pd.read_csv(paths['g_pred']['test'].format(fold, modality, fold))

        print(f'g TRAIN shape BEFORE stacking ({modality}):', g_train_pred_level1.shape)
        print(f'g TEST shape BEFORE stacking ({modality}):', g_test_pred_level1.shape)

        all_modalities_train.append(g_train_pred_level1)
        all_modalities_test.append(g_test_pred_level1)

    # ---- train split ----
    # Start from the first modality and join the rest on 'eid': the inner join
    # keeps eids present in every modality, the outer join keeps eids present
    # in any modality.
    features_train_level1_inner = features_train_level1_outer = all_modalities_train[0]
    for next_train in all_modalities_train[1:]:
        features_train_level1_inner = features_train_level1_inner.merge(next_train, on='eid', how='inner')
        features_train_level1_outer = features_train_level1_outer.merge(next_train, on='eid', how='outer')

    features_train_level1_inner.to_csv(outputs['train_stacked_inner'].format(fold), index=False)
    features_train_level1_outer.to_csv(outputs['train_stacked_outer'].format(fold), index=False)

    g_observed_train = pd.read_csv(paths['g_observed']['train'].format(fold, fold))

    # Inner join: only eids that also appear in the observed-g table survive.
    features_train_level1_inner_g_matched = features_train_level1_inner.merge(g_observed_train, on='eid', how='inner')
    features_train_level1_inner_g_matched.to_csv(outputs['train_g_matched_inner'].format(fold), index=False)

    # Left join: every eid of the outer-stacked table is kept; eids that exist
    # only in the observed-g table are not added.
    features_train_level1_outer_g_matched = features_train_level1_outer.merge(g_observed_train, on='eid', how='left')
    features_train_level1_outer_g_matched.to_csv(outputs['train_g_matched_outer'].format(fold), index=False)

    print('g TRAIN shape AFTER inner stacking:', features_train_level1_inner_g_matched.shape)
    print('g TRAIN shape AFTER outer stacking:', features_train_level1_outer_g_matched.shape)

    # ---- test split (same steps as for train) ----
    features_test_level1_inner = features_test_level1_outer = all_modalities_test[0]
    for next_test in all_modalities_test[1:]:
        features_test_level1_inner = features_test_level1_inner.merge(next_test, on='eid', how='inner')
        features_test_level1_outer = features_test_level1_outer.merge(next_test, on='eid', how='outer')

    features_test_level1_inner.to_csv(outputs['test_stacked_inner'].format(fold), index=False)
    features_test_level1_outer.to_csv(outputs['test_stacked_outer'].format(fold), index=False)

    g_observed_test = pd.read_csv(paths['g_observed']['test'].format(fold, fold))

    features_test_level1_inner_g_matched = features_test_level1_inner.merge(g_observed_test, on='eid', how='inner')
    features_test_level1_inner_g_matched.to_csv(outputs['test_g_matched_inner'].format(fold), index=False)

    features_test_level1_outer_g_matched = features_test_level1_outer.merge(g_observed_test, on='eid', how='left')
    features_test_level1_outer_g_matched.to_csv(outputs['test_g_matched_outer'].format(fold), index=False)

    print('g TEST shape AFTER inner stacking:', features_test_level1_inner_g_matched.shape)
    print('g TEST shape AFTER outer stacking:', features_test_level1_outer_g_matched.shape)

In [None]:
# Analyze all folds
# Summarise the dwMRI stacking outputs with analyze_g_matched and report whether
# the feature columns agree across folds and between train and test.
# Note: this rebinds the global `paths` to a dict holding only the
# 'stacking_output' templates (each formatted with the fold index).
dwi_stack_root = os.path.join(base_path, 'stacking', 'brain', 'dwi')
paths = {
    'stacking_output': {
        'train_stacked_inner': os.path.join(dwi_stack_root, 'features_train_level1_stacked_inner', 'features_train_level1_inner_fold_{}.csv'),
        'train_stacked_outer': os.path.join(dwi_stack_root, 'features_train_level1_outer', 'features_train_level1_outer_fold_{}.csv'),
        'train_g_matched_inner': os.path.join(dwi_stack_root, 'features_train_level1_stacked_inner', 'features_train_level1_inner_g_matched_fold_{}.csv'),
        'train_g_matched_outer': os.path.join(dwi_stack_root, 'features_train_level1_outer', 'features_train_level1_outer_g_matched_fold_{}.csv'),
        'test_stacked_inner': os.path.join(dwi_stack_root, 'features_test_level1_stacked_inner', 'features_test_inner_level1_fold_{}.csv'),
        'test_stacked_outer': os.path.join(dwi_stack_root, 'features_test_level1_outer', 'features_test_level1_outer_fold_{}.csv'),
        'test_g_matched_inner': os.path.join(dwi_stack_root, 'features_test_level1_stacked_inner', 'features_test_level1_inner_g_matched_fold_{}.csv'),
        'test_g_matched_outer': os.path.join(dwi_stack_root, 'features_test_level1_outer', 'features_test_level1_outer_g_matched_fold_{}.csv')
    }
}

results = [analyze_g_matched(fold, paths) for fold in folds]
results_df = pd.DataFrame(results)

# Same definition as before: every fold's train columns equal fold 0's
# (vacuously True when there are no folds).
reference_columns = set(results[0]['ncols Train']) if results else set()
all_features_consistent = all(
    set(r['ncols Train']) == reference_columns
    for r in results)
# Additional check: within each fold, test exposes the same feature columns as train.
train_test_features_match = all(
    set(r['ncols Test']) == set(r['ncols Train'])
    for r in results)

# Previously the consistency flag was computed but never shown.
print("Train feature columns identical across folds:", all_features_consistent)
print("Train and test feature columns match in every fold:", train_test_features_match)

print("Sample Sizes and Feature Counts:")
results_df[['Fold', 'N Train Inner', 'N Test Inner', 'N Train Outer', 'N Test Outer', 'N Features']]

Sample Sizes and Feature Counts:


Unnamed: 0,Fold,N Train Inner,N Test Inner,N Train Outer,N Test Outer,N Features
0,0,20646,5162,21555,5389,42
1,1,20601,5207,21518,5426,42
2,2,20647,5161,21556,5388,42
3,3,20738,5070,21626,5318,42
4,4,20600,5208,21521,5423,42


### rsMRI

In [None]:
# Stack the per-modality rsMRI g predictions of every fold into one table per
# split (inner and outer join on 'eid'), then attach the observed g values.
base_path = '/UK_BB/brainbody'

# Directory templates; 'fold_{}' is filled in together with the file name.
g_pred_dir = os.path.join(base_path, 'brain', 'folds', 'fold_{}', 'g_pred')
g_obs_dir = os.path.join(base_path, 'cognition', 'folds', 'fold_{}', 'g')
stack_root = os.path.join(base_path, 'stacking', 'brain', 'rs')
train_inner_dir = os.path.join(stack_root, 'features_train_level1_stacked_inner')
train_outer_dir = os.path.join(stack_root, 'features_train_level1_outer')
test_inner_dir = os.path.join(stack_root, 'features_test_level1_stacked_inner')
test_outer_dir = os.path.join(stack_root, 'features_test_level1_outer')

paths = {
    # Formatted with (fold, modality, fold).
    'g_pred': {
        'train': os.path.join(g_pred_dir, '{}_g_pred_XGB_train_with_id_fold_{}.csv'),
        'test': os.path.join(g_pred_dir, '{}_g_pred_XGB_test_with_id_fold_{}.csv'),
    },
    # Formatted with (fold, fold).
    'g_observed': {
        'train': os.path.join(g_obs_dir, 'g_train_with_id_{}.csv'),
        'test': os.path.join(g_obs_dir, 'g_test_with_id_{}.csv'),
    },
    # Formatted with (fold,).
    'stacking_output': {
        'train_stacked_inner': os.path.join(train_inner_dir, 'features_train_level1_inner_fold_{}.csv'),
        'train_stacked_outer': os.path.join(train_outer_dir, 'features_train_level1_outer_fold_{}.csv'),
        'train_g_matched_inner': os.path.join(train_inner_dir, 'features_train_level1_inner_g_matched_fold_{}.csv'),
        'train_g_matched_outer': os.path.join(train_outer_dir, 'features_train_level1_outer_g_matched_fold_{}.csv'),
        'test_stacked_inner': os.path.join(test_inner_dir, 'features_test_inner_level1_fold_{}.csv'),
        'test_stacked_outer': os.path.join(test_outer_dir, 'features_test_level1_outer_fold_{}.csv'),
        'test_g_matched_inner': os.path.join(test_inner_dir, 'features_test_level1_inner_g_matched_fold_{}.csv'),
        'test_g_matched_outer': os.path.join(test_outer_dir, 'features_test_level1_outer_g_matched_fold_{}.csv'),
    },
}

# Output directories do not depend on the fold, so formatting with fold 0 is
# enough to create all of them.
for output_template in paths['stacking_output'].values():
    os.makedirs(os.path.dirname(output_template.format(0)), exist_ok=True)

folds = range(5)
# This filter is process-wide and stays active for later cells as well.
# NOTE(review): if the prediction files share a non-'eid' column name, pandas'
# duplicate-suffix warning is among the FutureWarnings hidden here — confirm
# the column names are unique per modality.
warnings.simplefilter('ignore', category=FutureWarning)

for fold in folds:
    print(f'Started {fold}', flush=True)
    outputs = paths['stacking_output']
    all_modalities_train, all_modalities_test = [], []

    # Load the train/test prediction table of every rsMRI modality.
    for modality in modalities_rs:
        g_train_pred_level1 = pd.read_csv(paths['g_pred']['train'].format(fold, modality, fold))
        g_test_pred_level1 = pd.read_csv(paths['g_pred']['test'].format(fold, modality, fold))

        print(f'g TRAIN shape BEFORE stacking ({modality}):', g_train_pred_level1.shape)
        print(f'g TEST shape BEFORE stacking ({modality}):', g_test_pred_level1.shape)

        all_modalities_train.append(g_train_pred_level1)
        all_modalities_test.append(g_test_pred_level1)

    # ---- train split ----
    # Start from the first modality and join the rest on 'eid': the inner join
    # keeps eids present in every modality, the outer join keeps eids present
    # in any modality.
    features_train_level1_inner = features_train_level1_outer = all_modalities_train[0]
    for next_train in all_modalities_train[1:]:
        features_train_level1_inner = features_train_level1_inner.merge(next_train, on='eid', how='inner')
        features_train_level1_outer = features_train_level1_outer.merge(next_train, on='eid', how='outer')

    features_train_level1_inner.to_csv(outputs['train_stacked_inner'].format(fold), index=False)
    features_train_level1_outer.to_csv(outputs['train_stacked_outer'].format(fold), index=False)

    g_observed_train = pd.read_csv(paths['g_observed']['train'].format(fold, fold))

    # Inner join: only eids that also appear in the observed-g table survive.
    features_train_level1_inner_g_matched = features_train_level1_inner.merge(g_observed_train, on='eid', how='inner')
    features_train_level1_inner_g_matched.to_csv(outputs['train_g_matched_inner'].format(fold), index=False)

    # Left join: every eid of the outer-stacked table is kept; eids that exist
    # only in the observed-g table are not added.
    features_train_level1_outer_g_matched = features_train_level1_outer.merge(g_observed_train, on='eid', how='left')
    features_train_level1_outer_g_matched.to_csv(outputs['train_g_matched_outer'].format(fold), index=False)

    print('g TRAIN shape AFTER inner stacking:', features_train_level1_inner_g_matched.shape)
    print('g TRAIN shape AFTER outer stacking:', features_train_level1_outer_g_matched.shape)

    # ---- test split (same steps as for train) ----
    features_test_level1_inner = features_test_level1_outer = all_modalities_test[0]
    for next_test in all_modalities_test[1:]:
        features_test_level1_inner = features_test_level1_inner.merge(next_test, on='eid', how='inner')
        features_test_level1_outer = features_test_level1_outer.merge(next_test, on='eid', how='outer')

    features_test_level1_inner.to_csv(outputs['test_stacked_inner'].format(fold), index=False)
    features_test_level1_outer.to_csv(outputs['test_stacked_outer'].format(fold), index=False)

    g_observed_test = pd.read_csv(paths['g_observed']['test'].format(fold, fold))

    features_test_level1_inner_g_matched = features_test_level1_inner.merge(g_observed_test, on='eid', how='inner')
    features_test_level1_inner_g_matched.to_csv(outputs['test_g_matched_inner'].format(fold), index=False)

    features_test_level1_outer_g_matched = features_test_level1_outer.merge(g_observed_test, on='eid', how='left')
    features_test_level1_outer_g_matched.to_csv(outputs['test_g_matched_outer'].format(fold), index=False)

    print('g TEST shape AFTER inner stacking:', features_test_level1_inner_g_matched.shape)
    print('g TEST shape AFTER outer stacking:', features_test_level1_outer_g_matched.shape)

Started 0
g TRAIN shape BEFORE stacking (amplitudes_21): (21518, 2)
g TEST shape BEFORE stacking (amplitudes_21): (5376, 2)
g TRAIN shape BEFORE stacking (full_correlation_21): (21518, 2)
g TEST shape BEFORE stacking (full_correlation_21): (5376, 2)
g TRAIN shape BEFORE stacking (partial_correlation_21): (21518, 2)
g TEST shape BEFORE stacking (partial_correlation_21): (5376, 2)
g TRAIN shape BEFORE stacking (amplitudes_55): (21518, 2)
g TEST shape BEFORE stacking (amplitudes_55): (5376, 2)
g TRAIN shape BEFORE stacking (full_correlation_55): (21518, 2)
g TEST shape BEFORE stacking (full_correlation_55): (5376, 2)
g TRAIN shape BEFORE stacking (partial_correlation_55): (21518, 2)
g TEST shape BEFORE stacking (partial_correlation_55): (5376, 2)
g TRAIN shape BEFORE stacking (full_correlation_aparc_a2009s_Tian_S1): (20600, 2)
g TEST shape BEFORE stacking (full_correlation_aparc_a2009s_Tian_S1): (5151, 2)
g TRAIN shape BEFORE stacking (full_correlation_aparc_Tian_S1): (20660, 2)
g TEST sh

In [None]:
# Analyze all folds
# Summarise the rsMRI stacking outputs with analyze_g_matched and report whether
# the feature columns agree across folds and between train and test.
# Note: this rebinds the global `paths` to a dict holding only the
# 'stacking_output' templates (each formatted with the fold index).
rs_stack_root = os.path.join(base_path, 'stacking', 'brain', 'rs')
paths = {
    'stacking_output': {
        'train_stacked_inner': os.path.join(rs_stack_root, 'features_train_level1_stacked_inner', 'features_train_level1_inner_fold_{}.csv'),
        'train_stacked_outer': os.path.join(rs_stack_root, 'features_train_level1_outer', 'features_train_level1_outer_fold_{}.csv'),
        'train_g_matched_inner': os.path.join(rs_stack_root, 'features_train_level1_stacked_inner', 'features_train_level1_inner_g_matched_fold_{}.csv'),
        'train_g_matched_outer': os.path.join(rs_stack_root, 'features_train_level1_outer', 'features_train_level1_outer_g_matched_fold_{}.csv'),
        'test_stacked_inner': os.path.join(rs_stack_root, 'features_test_level1_stacked_inner', 'features_test_inner_level1_fold_{}.csv'),
        'test_stacked_outer': os.path.join(rs_stack_root, 'features_test_level1_outer', 'features_test_level1_outer_fold_{}.csv'),
        'test_g_matched_inner': os.path.join(rs_stack_root, 'features_test_level1_stacked_inner', 'features_test_level1_inner_g_matched_fold_{}.csv'),
        'test_g_matched_outer': os.path.join(rs_stack_root, 'features_test_level1_outer', 'features_test_level1_outer_g_matched_fold_{}.csv')
    }
}

results = [analyze_g_matched(fold, paths) for fold in folds]
results_df = pd.DataFrame(results)

# Same definition as before: every fold's train columns equal fold 0's
# (vacuously True when there are no folds).
reference_columns = set(results[0]['ncols Train']) if results else set()
all_features_consistent = all(
    set(r['ncols Train']) == reference_columns
    for r in results)
# Additional check: within each fold, test exposes the same feature columns as train.
train_test_features_match = all(
    set(r['ncols Test']) == set(r['ncols Train'])
    for r in results)

# Previously the consistency flag was computed but never shown.
print("Train feature columns identical across folds:", all_features_consistent)
print("Train and test feature columns match in every fold:", train_test_features_match)

print("Sample Sizes and Feature Counts:")
results_df[['Fold', 'N Train Inner', 'N Test Inner', 'N Train Outer', 'N Test Outer', 'N Features']]

Sample Sizes and Feature Counts:


Unnamed: 0,Fold,N Train Inner,N Test Inner,N Train Outer,N Test Outer,N Features
0,0,20324,5085,21520,5376,18
1,1,20279,5130,21481,5415,18
2,2,20330,5079,21518,5378,18
3,3,20419,4990,21585,5311,18
4,4,20284,5125,21480,5416,18


### All brain phenotypes

In [None]:
# Stack the level-1 g predictions of all MRI modalities and match them to observed g
base_path = '/UK_BB/brainbody'
allmri_pred_dir = os.path.join(base_path, 'brain', 'folds', 'fold_{}', 'g_pred')
g_observed_dir = os.path.join(base_path, 'cognition', 'folds', 'fold_{}', 'g')
allmri_stacking_dir = os.path.join(base_path, 'stacking', 'brain', 'allmri')
paths = {
    # Per-modality predictions; placeholders are (fold, modality, fold)
    'g_pred': {
        'train': os.path.join(allmri_pred_dir, '{}_g_pred_XGB_train_with_id_fold_{}.csv'),
        'test': os.path.join(allmri_pred_dir, '{}_g_pred_XGB_test_with_id_fold_{}.csv'),
    },
    # Observed g; placeholders are (fold, fold)
    'g_observed': {
        'train': os.path.join(g_observed_dir, 'g_train_with_id_{}.csv'),
        'test': os.path.join(g_observed_dir, 'g_test_with_id_{}.csv'),
    },
    # Stacked outputs; the single placeholder is the fold
    'stacking_output': {
        'train_stacked_inner': os.path.join(allmri_stacking_dir, 'features_train_level1_stacked_inner', 'features_train_level1_inner_fold_{}.csv'),
        'train_stacked_outer': os.path.join(allmri_stacking_dir, 'features_train_level1_outer', 'features_train_level1_outer_fold_{}.csv'),
        'train_g_matched_inner': os.path.join(allmri_stacking_dir, 'features_train_level1_stacked_inner', 'features_train_level1_inner_g_matched_fold_{}.csv'),
        'train_g_matched_outer': os.path.join(allmri_stacking_dir, 'features_train_level1_outer', 'features_train_level1_outer_g_matched_fold_{}.csv'),
        'test_stacked_inner': os.path.join(allmri_stacking_dir, 'features_test_level1_stacked_inner', 'features_test_inner_level1_fold_{}.csv'),
        'test_stacked_outer': os.path.join(allmri_stacking_dir, 'features_test_level1_outer', 'features_test_level1_outer_fold_{}.csv'),
        'test_g_matched_inner': os.path.join(allmri_stacking_dir, 'features_test_level1_stacked_inner', 'features_test_level1_inner_g_matched_fold_{}.csv'),
        'test_g_matched_outer': os.path.join(allmri_stacking_dir, 'features_test_level1_outer', 'features_test_level1_outer_g_matched_fold_{}.csv'),
    },
}

# Output directories do not depend on the fold (the placeholder is only in the
# file name), so formatting with fold 0 is enough to derive them.
for output_template in paths['stacking_output'].values():
    os.makedirs(os.path.dirname(output_template.format(0)), exist_ok=True)

folds = range(0, 5)
warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:
    print(f'Started {fold}', flush=True)
    output_templates = paths['stacking_output']

    # Load the train/test level-1 predictions of every modality for this fold
    all_modalities_train = []
    all_modalities_test = []
    for modality in modalities_mri:
        g_train_pred_level1 = pd.read_csv(paths['g_pred']['train'].format(fold, modality, fold))
        g_test_pred_level1 = pd.read_csv(paths['g_pred']['test'].format(fold, modality, fold))

        print(f'g TRAIN shape BEFORE stacking ({modality}):', g_train_pred_level1.shape)
        print(f'g TEST shape BEFORE stacking ({modality}):', g_test_pred_level1.shape)

        all_modalities_train.append(g_train_pred_level1)
        all_modalities_test.append(g_test_pred_level1)

    # ---- Train ----
    # Start from the first modality and join the rest on 'eid':
    # inner keeps eids present in every modality, outer keeps eids present in any.
    features_train_level1_inner = all_modalities_train[0]
    features_train_level1_outer = all_modalities_train[0]
    for next_modality_train in all_modalities_train[1:]:
        features_train_level1_inner = features_train_level1_inner.merge(next_modality_train, on='eid', how='inner')
        features_train_level1_outer = features_train_level1_outer.merge(next_modality_train, on='eid', how='outer')

    features_train_level1_inner.to_csv(output_templates['train_stacked_inner'].format(fold), index=False)
    features_train_level1_outer.to_csv(output_templates['train_stacked_outer'].format(fold), index=False)

    g_observed_train = pd.read_csv(paths['g_observed']['train'].format(fold, fold))

    # Inner-stacked features: keep only eids that also appear in the observed-g file
    features_train_level1_inner_g_matched = features_train_level1_inner.merge(g_observed_train, on='eid', how='inner')
    features_train_level1_inner_g_matched.to_csv(output_templates['train_g_matched_inner'].format(fold), index=False)

    # Outer-stacked features: LEFT join, so every stacked eid is kept even when
    # it has no row in the observed-g file
    features_train_level1_outer_g_matched = features_train_level1_outer.merge(g_observed_train, on='eid', how='left')
    features_train_level1_outer_g_matched.to_csv(output_templates['train_g_matched_outer'].format(fold), index=False)

    print('g TRAIN shape AFTER inner stacking:', features_train_level1_inner_g_matched.shape)
    print('g TRAIN shape AFTER outer stacking:', features_train_level1_outer_g_matched.shape)

    # ---- Test (same procedure as train) ----
    features_test_level1_inner = all_modalities_test[0]
    features_test_level1_outer = all_modalities_test[0]
    for next_modality_test in all_modalities_test[1:]:
        features_test_level1_inner = features_test_level1_inner.merge(next_modality_test, on='eid', how='inner')
        features_test_level1_outer = features_test_level1_outer.merge(next_modality_test, on='eid', how='outer')

    features_test_level1_inner.to_csv(output_templates['test_stacked_inner'].format(fold), index=False)
    features_test_level1_outer.to_csv(output_templates['test_stacked_outer'].format(fold), index=False)

    g_observed_test = pd.read_csv(paths['g_observed']['test'].format(fold, fold))

    features_test_level1_inner_g_matched = features_test_level1_inner.merge(g_observed_test, on='eid', how='inner')
    features_test_level1_inner_g_matched.to_csv(output_templates['test_g_matched_inner'].format(fold), index=False)

    features_test_level1_outer_g_matched = features_test_level1_outer.merge(g_observed_test, on='eid', how='left')
    features_test_level1_outer_g_matched.to_csv(output_templates['test_g_matched_outer'].format(fold), index=False)

    print('g TEST shape AFTER inner stacking:', features_test_level1_inner_g_matched.shape)
    print('g TEST shape AFTER outer stacking:', features_test_level1_outer_g_matched.shape)

Started 0
g TRAIN shape BEFORE stacking (struct_fast): (21977, 2)
g TEST shape BEFORE stacking (struct_fast): (5474, 2)
g TRAIN shape BEFORE stacking (struct_sub_first): (21977, 2)
g TEST shape BEFORE stacking (struct_sub_first): (5474, 2)
g TRAIN shape BEFORE stacking (struct_fs_aseg_mean_intensity): (21977, 2)
g TEST shape BEFORE stacking (struct_fs_aseg_mean_intensity): (5474, 2)
g TRAIN shape BEFORE stacking (struct_fs_aseg_volume): (21977, 2)
g TEST shape BEFORE stacking (struct_fs_aseg_volume): (5474, 2)
g TRAIN shape BEFORE stacking (struct_ba_exvivo_area): (21977, 2)
g TEST shape BEFORE stacking (struct_ba_exvivo_area): (5474, 2)
g TRAIN shape BEFORE stacking (struct_ba_exvivo_mean_thickness): (21977, 2)
g TEST shape BEFORE stacking (struct_ba_exvivo_mean_thickness): (5474, 2)
g TRAIN shape BEFORE stacking (struct_ba_exvivo_volume): (21977, 2)
g TEST shape BEFORE stacking (struct_ba_exvivo_volume): (5474, 2)
g TRAIN shape BEFORE stacking (struct_a2009s_area): (21950, 2)
g TEST 

In [None]:
# Summarise the all-MRI stacking outputs for every fold.
# Every output file lives at <base_path>/stacking/brain/allmri/<subdir>/<filename>.
allmri_stacking_dir = os.path.join(base_path, 'stacking', 'brain', 'allmri')
allmri_output_files = {
    'train_stacked_inner': ('features_train_level1_stacked_inner', 'features_train_level1_inner_fold_{}.csv'),
    'train_stacked_outer': ('features_train_level1_outer', 'features_train_level1_outer_fold_{}.csv'),
    'train_g_matched_inner': ('features_train_level1_stacked_inner', 'features_train_level1_inner_g_matched_fold_{}.csv'),
    'train_g_matched_outer': ('features_train_level1_outer', 'features_train_level1_outer_g_matched_fold_{}.csv'),
    'test_stacked_inner': ('features_test_level1_stacked_inner', 'features_test_inner_level1_fold_{}.csv'),
    'test_stacked_outer': ('features_test_level1_outer', 'features_test_level1_outer_fold_{}.csv'),
    'test_g_matched_inner': ('features_test_level1_stacked_inner', 'features_test_level1_inner_g_matched_fold_{}.csv'),
    'test_g_matched_outer': ('features_test_level1_outer', 'features_test_level1_outer_g_matched_fold_{}.csv'),
}
paths = {
    'stacking_output': {
        key: os.path.join(allmri_stacking_dir, subdir, filename)
        for key, (subdir, filename) in allmri_output_files.items()
    }
}

# One metrics dict per fold; analyze_g_matched only reads the four *_g_matched_* templates.
results = [analyze_g_matched(fold, paths) for fold in folds]
results_df = pd.DataFrame(results)

# True when all folds share one set of training feature columns
# (at most one distinct column set across folds).
all_features_consistent = len({frozenset(r['ncols Train']) for r in results}) <= 1

print("Sample Sizes and Feature Counts:")
summary_columns = ['Fold', 'N Train Inner', 'N Test Inner', 'N Train Outer', 'N Test Outer', 'N Features']
results_df[summary_columns]

Sample Sizes and Feature Counts:


Unnamed: 0,Fold,N Train Inner,N Test Inner,N Train Outer,N Test Outer,N Features
0,0,20273,5073,22210,5539,81
1,1,20231,5115,22156,5593,81
2,2,20275,5071,22202,5547,81
3,3,20373,4973,22265,5484,81
4,4,20232,5114,22163,5586,81


## Body

In [None]:
# Define modalities
###############################################################
# Full list of body modalities, in the order they are stacked.
modalities_body = [
    'immune',
    'renalhepatic',
    'metabolic',
    'cardiopulmonary',
    'musculoskeletal',
    'bone_densitometry',
    'pwa',
    'heart_mri',
    'carotid_ultrasound',
    'arterial_stiffness',
    'ecg_rest',
    'body_composition_by_impedance',
    'body_composition_dxa',
    'bone_dxa',
    'kidneys_mri',
    'liver_mri',
    'abdominal_composition_mri_18_vars',  # 17 vars
    'abdominal_organ_composition_mri_13_vars',  # 12 vars
    'hearing',
]

# Groupings of the body modalities above; together the six groups cover
# every entry of modalities_body exactly once.
body_composition = [
    'body_composition_by_impedance',
    'bone_dxa',
    'abdominal_composition_mri_18_vars',  # 17 vars
    'body_composition_dxa',
    'abdominal_organ_composition_mri_13_vars',  # 12 vars
    'bone_densitometry',
    'musculoskeletal',
]

cardiopulmonary = [
    'cardiopulmonary',
    'pwa',
    'carotid_ultrasound',
    'heart_mri',
    'arterial_stiffness',
    'ecg_rest',
]

renal_hepatic = [
    'renalhepatic',
    'kidneys_mri',
    'liver_mri',
]

# Single-modality groups
immune = ['immune']
metabolic = ['metabolic']
hearing = ['hearing']

In [None]:
# Stack the level-1 g predictions of all body modalities and match them to observed g
base_path = '/UK_BB/brainbody'
body_pred_dir = os.path.join(base_path, 'lifestyle-envir-body', 'folds', 'fold_{}', 'g_pred')
g_observed_dir = os.path.join(base_path, 'cognition', 'folds', 'fold_{}', 'g')
body_stacking_dir = os.path.join(base_path, 'stacking', 'body')
paths = {
    # Per-modality predictions; placeholders are (fold, modality, fold)
    'g_pred': {
        'train': os.path.join(body_pred_dir, '{}_g_pred_XGB_train_with_id_fold_{}.csv'),
        'test': os.path.join(body_pred_dir, '{}_g_pred_XGB_test_with_id_fold_{}.csv'),
    },
    # Observed g; placeholders are (fold, fold)
    'g_observed': {
        'train': os.path.join(g_observed_dir, 'g_train_with_id_{}.csv'),
        'test': os.path.join(g_observed_dir, 'g_test_with_id_{}.csv'),
    },
    # Stacked outputs; the single placeholder is the fold
    'stacking_output': {
        'train_stacked_inner': os.path.join(body_stacking_dir, 'features_train_level1_stacked_inner', 'features_train_level1_inner_fold_{}.csv'),
        'train_stacked_outer': os.path.join(body_stacking_dir, 'features_train_level1_stacked_outer', 'features_train_level1_outer_fold_{}.csv'),
        'train_g_matched_inner': os.path.join(body_stacking_dir, 'features_train_level1_stacked_inner', 'features_train_level1_inner_g_matched_fold_{}.csv'),
        'train_g_matched_outer': os.path.join(body_stacking_dir, 'features_train_level1_stacked_outer', 'features_train_level1_outer_g_matched_fold_{}.csv'),
        'test_stacked_inner': os.path.join(body_stacking_dir, 'features_test_level1_stacked_inner', 'features_test_inner_level1_fold_{}.csv'),
        'test_stacked_outer': os.path.join(body_stacking_dir, 'features_test_level1_stacked_outer', 'features_test_level1_outer_fold_{}.csv'),
        'test_g_matched_inner': os.path.join(body_stacking_dir, 'features_test_level1_stacked_inner', 'features_test_level1_inner_g_matched_fold_{}.csv'),
        'test_g_matched_outer': os.path.join(body_stacking_dir, 'features_test_level1_stacked_outer', 'features_test_level1_outer_g_matched_fold_{}.csv'),
    },
}

# Output directories do not depend on the fold (the placeholder is only in the
# file name), so formatting with fold 0 is enough to derive them.
for output_template in paths['stacking_output'].values():
    os.makedirs(os.path.dirname(output_template.format(0)), exist_ok=True)

folds = range(0, 5)
warnings.simplefilter(action='ignore', category=FutureWarning)

for fold in folds:
    print(f'Started {fold}', flush=True)
    output_templates = paths['stacking_output']

    # Load the train/test level-1 predictions of every modality for this fold
    all_modalities_train = []
    all_modalities_test = []
    for modality in modalities_body:
        g_train_pred_level1 = pd.read_csv(paths['g_pred']['train'].format(fold, modality, fold))
        g_test_pred_level1 = pd.read_csv(paths['g_pred']['test'].format(fold, modality, fold))

        print(f'g TRAIN shape BEFORE stacking ({modality}):', g_train_pred_level1.shape)
        print(f'g TEST shape BEFORE stacking ({modality}):', g_test_pred_level1.shape)

        all_modalities_train.append(g_train_pred_level1)
        all_modalities_test.append(g_test_pred_level1)

    # ---- Train ----
    # Start from the first modality and join the rest on 'eid':
    # inner keeps eids present in every modality, outer keeps eids present in any.
    features_train_level1_inner = all_modalities_train[0]
    features_train_level1_outer = all_modalities_train[0]
    for next_modality_train in all_modalities_train[1:]:
        features_train_level1_inner = features_train_level1_inner.merge(next_modality_train, on='eid', how='inner')
        features_train_level1_outer = features_train_level1_outer.merge(next_modality_train, on='eid', how='outer')

    features_train_level1_inner.to_csv(output_templates['train_stacked_inner'].format(fold), index=False)
    features_train_level1_outer.to_csv(output_templates['train_stacked_outer'].format(fold), index=False)

    g_observed_train = pd.read_csv(paths['g_observed']['train'].format(fold, fold))

    # Inner-stacked features: keep only eids that also appear in the observed-g file
    features_train_level1_inner_g_matched = features_train_level1_inner.merge(g_observed_train, on='eid', how='inner')
    features_train_level1_inner_g_matched.to_csv(output_templates['train_g_matched_inner'].format(fold), index=False)

    # Outer-stacked features: LEFT join, so every stacked eid is kept even when
    # it has no row in the observed-g file
    features_train_level1_outer_g_matched = features_train_level1_outer.merge(g_observed_train, on='eid', how='left')
    features_train_level1_outer_g_matched.to_csv(output_templates['train_g_matched_outer'].format(fold), index=False)

    print('g TRAIN shape AFTER inner stacking:', features_train_level1_inner_g_matched.shape)
    print('g TRAIN shape AFTER outer stacking:', features_train_level1_outer_g_matched.shape)

    # ---- Test (same procedure as train) ----
    features_test_level1_inner = all_modalities_test[0]
    features_test_level1_outer = all_modalities_test[0]
    for next_modality_test in all_modalities_test[1:]:
        features_test_level1_inner = features_test_level1_inner.merge(next_modality_test, on='eid', how='inner')
        features_test_level1_outer = features_test_level1_outer.merge(next_modality_test, on='eid', how='outer')

    features_test_level1_inner.to_csv(output_templates['test_stacked_inner'].format(fold), index=False)
    features_test_level1_outer.to_csv(output_templates['test_stacked_outer'].format(fold), index=False)

    g_observed_test = pd.read_csv(paths['g_observed']['test'].format(fold, fold))

    features_test_level1_inner_g_matched = features_test_level1_inner.merge(g_observed_test, on='eid', how='inner')
    features_test_level1_inner_g_matched.to_csv(output_templates['test_g_matched_inner'].format(fold), index=False)

    features_test_level1_outer_g_matched = features_test_level1_outer.merge(g_observed_test, on='eid', how='left')
    features_test_level1_outer_g_matched.to_csv(output_templates['test_g_matched_outer'].format(fold), index=False)

    print('g TEST shape AFTER inner stacking:', features_test_level1_inner_g_matched.shape)
    print('g TEST shape AFTER outer stacking:', features_test_level1_outer_g_matched.shape)

Started 0
g TRAIN shape BEFORE stacking (immune): (22911, 2)
g TEST shape BEFORE stacking (immune): (5738, 2)
g TRAIN shape BEFORE stacking (renalhepatic): (18407, 2)
g TEST shape BEFORE stacking (renalhepatic): (4598, 2)
g TRAIN shape BEFORE stacking (metabolic): (14548, 2)
g TEST shape BEFORE stacking (metabolic): (3572, 2)
g TRAIN shape BEFORE stacking (cardiopulmonary): (16285, 2)
g TEST shape BEFORE stacking (cardiopulmonary): (4116, 2)
g TRAIN shape BEFORE stacking (musculoskeletal): (14558, 2)
g TEST shape BEFORE stacking (musculoskeletal): (3701, 2)
g TRAIN shape BEFORE stacking (bone_densitometry): (13957, 2)
g TEST shape BEFORE stacking (bone_densitometry): (3535, 2)
g TRAIN shape BEFORE stacking (pwa): (19418, 2)
g TEST shape BEFORE stacking (pwa): (4821, 2)
g TRAIN shape BEFORE stacking (heart_mri): (19704, 2)
g TEST shape BEFORE stacking (heart_mri): (5002, 2)
g TRAIN shape BEFORE stacking (carotid_ultrasound): (23976, 2)
g TEST shape BEFORE stacking (carotid_ultrasound): 

In [7]:
# Summarise the body stacking outputs for every fold.
# Every output file lives at <base_path>/stacking/body/<subdir>/<filename>.
body_stacking_dir = os.path.join(base_path, 'stacking', 'body')
body_output_files = {
    'train_stacked_inner': ('features_train_level1_stacked_inner', 'features_train_level1_inner_fold_{}.csv'),
    'train_stacked_outer': ('features_train_level1_stacked_outer', 'features_train_level1_outer_fold_{}.csv'),
    'train_g_matched_inner': ('features_train_level1_stacked_inner', 'features_train_level1_inner_g_matched_fold_{}.csv'),
    'train_g_matched_outer': ('features_train_level1_stacked_outer', 'features_train_level1_outer_g_matched_fold_{}.csv'),
    'test_stacked_inner': ('features_test_level1_stacked_inner', 'features_test_inner_level1_fold_{}.csv'),
    'test_stacked_outer': ('features_test_level1_stacked_outer', 'features_test_level1_outer_fold_{}.csv'),
    'test_g_matched_inner': ('features_test_level1_stacked_inner', 'features_test_level1_inner_g_matched_fold_{}.csv'),
    'test_g_matched_outer': ('features_test_level1_stacked_outer', 'features_test_level1_outer_g_matched_fold_{}.csv'),
}
paths = {
    'stacking_output': {
        key: os.path.join(body_stacking_dir, subdir, filename)
        for key, (subdir, filename) in body_output_files.items()
    }
}
folds = range(0, 5)

# One metrics dict per fold; analyze_g_matched only reads the four *_g_matched_* templates.
results = [analyze_g_matched(fold, paths) for fold in folds]
results_df = pd.DataFrame(results)

# True when all folds share one set of training feature columns
# (at most one distinct column set across folds).
all_features_consistent = len({frozenset(r['ncols Train']) for r in results}) <= 1

print("Sample Sizes and Feature Counts:")
summary_columns = ['Fold', 'N Train Inner', 'N Test Inner', 'N Train Outer', 'N Test Outer', 'N Features']
results_df[summary_columns]

Sample Sizes and Feature Counts:


Unnamed: 0,Fold,N Train Inner,N Test Inner,N Train Outer,N Test Outer,N Features
0,0,1109,323,25517,6380,19
1,1,1165,267,25517,6380,19
2,2,1169,263,25518,6379,19
3,3,1136,296,25518,6379,19
4,4,1149,283,25518,6379,19


## Body and brain

In [None]:
# Define modalities
###############################################################
# Body modalities. The stacking cell tests membership in this list to decide
# whether a modality's predictions are read from the body or the brain folder.
modalities_body = [
    'immune',
    'renalhepatic',
    'metabolic',
    'cardiopulmonary',
    'musculoskeletal',
    'bone_densitometry',
    'pwa',
    'heart_mri',
    'carotid_ultrasound',
    'arterial_stiffness',
    'ecg_rest',
    'body_composition_by_impedance',
    'body_composition_dxa',
    'bone_dxa',
    'kidneys_mri',
    'liver_mri',
    'abdominal_composition_mri_18_vars',  # 17 vars
    'abdominal_organ_composition_mri_13_vars',  # 12 vars
    'hearing',
]

# Body modalities first, followed by every brain modality.
# The concatenation builds a new list, so the two lists stay independent objects.
modalities_brain_body = modalities_body + [
    # struct_* and add_t1/add_t2
    'struct_fast',
    'struct_sub_first',
    'struct_fs_aseg_mean_intensity',
    'struct_fs_aseg_volume',
    'struct_ba_exvivo_area',
    'struct_ba_exvivo_mean_thickness',
    'struct_ba_exvivo_volume',
    'struct_a2009s_area',
    'struct_a2009s_mean_thickness',
    'struct_a2009s_volume',
    'struct_dkt_area',
    'struct_dkt_mean_thickness',
    'struct_dkt_volume',
    'struct_desikan_gw',
    'struct_desikan_pial',
    'struct_desikan_white_area',
    'struct_desikan_white_mean_thickness',
    'struct_desikan_white_volume',
    'struct_subsegmentation',
    'add_t1',
    'add_t2',

    # dwi_* (tbss and prob variant of each metric)
    "dwi_FA_tbss", "dwi_FA_prob",
    "dwi_MD_tbss", "dwi_MD_prob",
    "dwi_L1_tbss", "dwi_L1_prob",
    "dwi_L2_tbss", "dwi_L2_prob",
    "dwi_L3_tbss", "dwi_L3_prob",
    "dwi_MO_tbss", "dwi_MO_prob",
    "dwi_OD_tbss", "dwi_OD_prob",
    "dwi_ICVF_tbss", "dwi_ICVF_prob",
    "dwi_ISOVF_tbss", "dwi_ISOVF_prob",

    # *_i2 measures (FA, Length, SIFT2_FBC, Streamline_Count) per atlas
    'aparc_Tian_S1_FA_i2',
    'aparc_Tian_S1_Length_i2',
    'aparc_Tian_S1_SIFT2_FBC_i2',
    'aparc_Tian_S1_Streamline_Count_i2',

    'aparc_a2009s_Tian_S1_FA_i2',
    'aparc_a2009s_Tian_S1_Length_i2',
    'aparc_a2009s_Tian_S1_SIFT2_FBC_i2',
    'aparc_a2009s_Tian_S1_Streamline_Count_i2',

    'Glasser_Tian_S1_FA_i2',
    'Glasser_Tian_S1_Length_i2',
    'Glasser_Tian_S1_SIFT2_FBC_i2',
    'Glasser_Tian_S1_Streamline_Count_i2',

    'Glasser_Tian_S4_FA_i2',
    'Glasser_Tian_S4_Length_i2',
    'Glasser_Tian_S4_SIFT2_FBC_i2',
    'Glasser_Tian_S4_Streamline_Count_i2',

    'Schaefer7n200p_Tian_S1_FA_i2',
    'Schaefer7n200p_Tian_S1_Length_i2',
    'Schaefer7n200p_Tian_S1_SIFT2_FBC_i2',
    'Schaefer7n200p_Tian_S1_Streamline_Count_i2',

    'Schaefer7n1000p_Tian_S4_FA_i2',
    'Schaefer7n1000p_Tian_S4_Length_i2',
    'Schaefer7n1000p_Tian_S4_SIFT2_FBC_i2',
    'Schaefer7n1000p_Tian_S4_Streamline_Count_i2',

    # amplitudes / full_correlation / partial_correlation
    # NOTE(review): these use Schaefer7n500p_Tian_S4 while the *_i2 block above
    # uses Schaefer7n1000p_Tian_S4 - confirm this difference is intended.
    "amplitudes_21",
    "full_correlation_21",
    "partial_correlation_21",
    "amplitudes_55",
    "full_correlation_55",
    "partial_correlation_55",
    'full_correlation_aparc_a2009s_Tian_S1',
    'full_correlation_aparc_Tian_S1',
    'full_correlation_Glasser_Tian_S1',
    'full_correlation_Glasser_Tian_S4',
    'full_correlation_Schaefer7n200p_Tian_S1',
    'full_correlation_Schaefer7n500p_Tian_S4',
    'partial_correlation_aparc_a2009s_Tian_S1',
    'partial_correlation_aparc_Tian_S1',
    'partial_correlation_Glasser_Tian_S1',
    'partial_correlation_Glasser_Tian_S4',
    'partial_correlation_Schaefer7n200p_Tian_S1',
    'partial_correlation_Schaefer7n500p_Tian_S4',
]

In [None]:
# Stack level-1 g predictions from all body and brain modalities and match
# them to observed g, writing inner- and outer-joined versions per fold.
base_path = '/UK_BB/brainbody'
folds = range(0, 5)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Path templates do not depend on the fold, so they are built once here
# instead of on every loop iteration.
# 'body'/'brain' templates take (fold, modality, fold); 'g_observed' takes (fold, fold).
paths = {
    'body': {
        'train': os.path.join(base_path, 'lifestyle-envir-body', 'folds', 'fold_{}', 'g_pred', '{}_g_pred_XGB_train_with_id_fold_{}.csv'),
        'test': os.path.join(base_path, 'lifestyle-envir-body', 'folds', 'fold_{}', 'g_pred', '{}_g_pred_XGB_test_with_id_fold_{}.csv')
    },
    'brain': {
        'train': os.path.join(base_path, 'brain', 'folds', 'fold_{}', 'g_pred', '{}_g_pred_XGB_train_with_id_fold_{}.csv'),
        'test': os.path.join(base_path, 'brain', 'folds', 'fold_{}', 'g_pred', '{}_g_pred_XGB_test_with_id_fold_{}.csv')
    },
    'g_observed': {
        'train': os.path.join(base_path, 'cognition', 'folds', 'fold_{}', 'g', 'g_train_with_id_{}.csv'),
        'test': os.path.join(base_path, 'cognition', 'folds', 'fold_{}', 'g', 'g_test_with_id_{}.csv')
    },
    'stacking_output': {
        'train': os.path.join(base_path, 'stacking', 'brain-body'),
        'test': os.path.join(base_path, 'stacking', 'brain-body')
    }
}

# Create output directories once. The train and test roots are currently the
# same directory, so iterate over the distinct roots only.
for output_root in sorted(set(paths['stacking_output'].values())):
    for output_subdir in (
        'features_train_level1_stacked_inner',
        'features_test_level1_stacked_inner',
        'features_train_level1_stacked_outer',
        'features_test_level1_stacked_outer',
    ):
        os.makedirs(os.path.join(output_root, output_subdir), exist_ok=True)


def merge_modalities(df_list, how):
    """Successively merge a list of DataFrames on 'eid'.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames to combine; each must contain an 'eid' column.
    how : str
        Join type passed to ``pd.merge`` (e.g. 'inner' or 'outer').

    Returns
    -------
    pandas.DataFrame
        The merged frame, or an empty DataFrame when ``df_list`` is empty.
    """
    if not df_list:
        return pd.DataFrame()
    merged = df_list[0]
    for df in df_list[1:]:
        merged = pd.merge(merged, df, on='eid', how=how)
    return merged


def merge_modalities_inner(df_list):
    """Merge all frames on 'eid', keeping only eids present in every frame."""
    return merge_modalities(df_list, how='inner')


def merge_modalities_outer(df_list):
    """Merge all frames on 'eid', keeping eids present in any frame."""
    return merge_modalities(df_list, how='outer')


for fold in folds:
    print(f'Started {fold}', flush=True)
    all_modalities_train = []
    all_modalities_test = []
    missing_files = []

    for modality in modalities_brain_body:
        # Body modalities are read from the body prediction folder, everything else from brain
        modality_type = 'body' if modality in modalities_body else 'brain'
        train_path = paths[modality_type]['train'].format(fold, modality, fold)
        test_path = paths[modality_type]['test'].format(fold, modality, fold)

        # Missing files are still skipped (best effort), but recorded so the
        # omission is visible: a modality missing from only one of train/test
        # would otherwise silently give the two sets different feature columns.
        if os.path.exists(train_path):
            train_df = pd.read_csv(train_path)
            all_modalities_train.append(train_df)
        else:
            missing_files.append(train_path)

        if os.path.exists(test_path):
            test_df = pd.read_csv(test_path)
            all_modalities_test.append(test_df)
        else:
            missing_files.append(test_path)

    if missing_files:
        print(f'WARNING: fold {fold}: {len(missing_files)} prediction file(s) not found and skipped:', flush=True)
        for missing_file in missing_files:
            print(f'    {missing_file}', flush=True)

    # Process train data - both merge types
    if all_modalities_train:
        g_observed_train = pd.read_csv(paths['g_observed']['train'].format(fold, fold))

        # Inner-stacked features, then inner join with observed g
        features_train_inner = merge_modalities_inner(all_modalities_train)
        features_train_inner.to_csv(
            os.path.join(paths['stacking_output']['train'], 'features_train_level1_stacked_inner', f'features_train_level1_inner_fold_{fold}.csv'),
            index=False
        )
        pd.merge(features_train_inner, g_observed_train, on='eid', how='inner').to_csv(
            os.path.join(paths['stacking_output']['train'], 'features_train_level1_stacked_inner', f'features_train_level1_inner_g_matched_fold_{fold}.csv'),
            index=False
        )

        # Outer-stacked features, then LEFT join with observed g so every
        # stacked eid is kept even without a row in the observed-g file
        features_train_outer = merge_modalities_outer(all_modalities_train)
        features_train_outer.to_csv(
            os.path.join(paths['stacking_output']['train'], 'features_train_level1_stacked_outer', f'features_train_level1_outer_fold_{fold}.csv'),
            index=False
        )
        pd.merge(features_train_outer, g_observed_train, on='eid', how='left').to_csv(
            os.path.join(paths['stacking_output']['train'], 'features_train_level1_stacked_outer', f'features_train_level1_outer_g_matched_fold_{fold}.csv'),
            index=False
        )

    # Process test data - both merge types (same procedure as train)
    if all_modalities_test:
        g_observed_test = pd.read_csv(paths['g_observed']['test'].format(fold, fold))

        features_test_inner = merge_modalities_inner(all_modalities_test)
        features_test_inner.to_csv(
            os.path.join(paths['stacking_output']['test'], 'features_test_level1_stacked_inner', f'features_test_level1_inner_fold_{fold}.csv'),
            index=False
        )
        pd.merge(features_test_inner, g_observed_test, on='eid', how='inner').to_csv(
            os.path.join(paths['stacking_output']['test'], 'features_test_level1_stacked_inner', f'features_test_level1_inner_g_matched_fold_{fold}.csv'),
            index=False
        )

        features_test_outer = merge_modalities_outer(all_modalities_test)
        features_test_outer.to_csv(
            os.path.join(paths['stacking_output']['test'], 'features_test_level1_stacked_outer', f'features_test_level1_outer_fold_{fold}.csv'),
            index=False
        )
        pd.merge(features_test_outer, g_observed_test, on='eid', how='left').to_csv(
            os.path.join(paths['stacking_output']['test'], 'features_test_level1_stacked_outer', f'features_test_level1_outer_g_matched_fold_{fold}.csv'),
            index=False
        )

In [58]:
# Analyze all folds of the brain-body stacking outputs.
# Each template takes the fold number as its single placeholder.
paths = {
    'stacking_output': {
        'train_stacked_inner': os.path.join(base_path, 'stacking', 'brain-body', 'features_train_level1_stacked_inner', 'features_train_level1_inner_fold_{}.csv'),
        'train_stacked_outer': os.path.join(base_path, 'stacking', 'brain-body', 'features_train_level1_stacked_outer', 'features_train_level1_outer_fold_{}.csv'),
        'train_g_matched_inner': os.path.join(base_path, 'stacking', 'brain-body','features_train_level1_stacked_inner', 'features_train_level1_inner_g_matched_fold_{}.csv'),
        'train_g_matched_outer': os.path.join(base_path, 'stacking', 'brain-body', 'features_train_level1_stacked_outer', 'features_train_level1_outer_g_matched_fold_{}.csv'),
        # The brain-body stacking cell writes this file as
        # 'features_test_level1_inner_fold_<fold>.csv' (not 'features_test_inner_level1_...'
        # as in the brain/body path tables), so the template must use that name.
        'test_stacked_inner': os.path.join(base_path, 'stacking', 'brain-body', 'features_test_level1_stacked_inner', 'features_test_level1_inner_fold_{}.csv'),
        'test_stacked_outer': os.path.join(base_path, 'stacking', 'brain-body', 'features_test_level1_stacked_outer', 'features_test_level1_outer_fold_{}.csv'),
        'test_g_matched_inner': os.path.join(base_path, 'stacking', 'brain-body', 'features_test_level1_stacked_inner', 'features_test_level1_inner_g_matched_fold_{}.csv'),
        'test_g_matched_outer': os.path.join(base_path, 'stacking', 'brain-body', 'features_test_level1_stacked_outer', 'features_test_level1_outer_g_matched_fold_{}.csv')
    }
}

# One metrics dict per fold; analyze_g_matched only reads the four *_g_matched_* templates.
results = [analyze_g_matched(fold, paths) for fold in folds]
results_df = pd.DataFrame(results)

# True when every fold has the same set of training feature columns as fold 0.
all_features_consistent = all(
    set(r['ncols Train']) == set(results[0]['ncols Train']) 
    for r in results)

print("Sample Sizes and Feature Counts:")
results_df[['Fold', 'N Train Inner', 'N Test Inner', 'N Train Outer', 'N Test Outer', 'N Features']]

Sample Sizes and Feature Counts:


Unnamed: 0,Fold,N Train Inner,N Test Inner,N Train Outer,N Test Outer,N Features
0,0,949,289,25517,6380,100
1,1,1002,236,25517,6380,100
2,2,1007,231,25518,6379,100
3,3,996,242,25518,6379,100
4,4,998,240,25518,6379,100
