In [None]:
import csv
import os
import warnings
import pandas as pd
import numpy as np
import logging
import tqdm
from scipy.stats import pearsonr
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
#####
import ukbiobank
import ukbiobank.utils.utils
from ukbiobank.utils import loadCsv
from ukbiobank.utils import addFields
from ukbiobank.utils.utils import fieldIdsToNames

# Abdominal MRI
- liver MRI
- kidney MRI
- abdominal composition
- abdominal organ composition

In [None]:
# Upload UK Bioabank csv
csv_path = '/UK_BB/ukbbdata/ukb.csv'
ukb = ukbiobank.ukbio(ukb_csv=csv_path)

In [None]:
# Create directories & save
base_path = f'/UK_BB/brainbody'
lifestyle_path = os.path.join(base_path, 'body')
data_path = os.path.join(lifestyle_path, 'data')
shap_path = os.path.join(lifestyle_path, 'shap')
os.makedirs(base_path, exist_ok=True)
os.makedirs(lifestyle_path, exist_ok=True)
os.makedirs(data_path, exist_ok=True)
os.makedirs(shap_path, exist_ok=True)

In [None]:
# Upload variables
abdominal_mri_columns = [
'eid',  
# Kidneys
'21163', # Kidney distance
'21164', # Kidney fusion
'21161', # Kidney parenchyma (left)
'21162', # Kidney parenchyma (right)
'21160', # Kidney parenchyma total
# Liver
'40062', # Liver iron corrected T1 (ct1)
'40060', # Liver iron (Fe)
# Abdominal composition
'22436', # 10P Liver PDFF (proton density fat fraction)
'22434', # Abdominal fat ratio
'23359', # Anterior thigh error indicator (left)
'24353', # Anterior thigh muscle fat infiltration (MFI) (left)
'24354', # Anterior thigh muscle fat infiltration (MFI) (right)
'24352', # FR liver PDFF mean
'22435', # Muscle fat infiltration
'23355', # Posterior thigh muscle fat infiltration (MFI) (left)
'23356', # Posterior thigh muscle fat infiltration (MFI) (right)
'22432', # Total abdominal adipose tissue index
'22433', # Weight-to-muscle ratio
'22410', # Total trunk fat volume
'22415', # Total adipose tissue volume
'22416', # Total lean tissue volume
'22408', # Abdominal subcutaneous adipose tissue volume (ASAT)
'22407', # Visceral adipose tissue volume (VAT)
'22409', # Total thigh fat-free muscle volume
'22405', # Anterior thigh fat-free muscle volume (left)
'22403', # Anterior thigh fat-free muscle volume (right)
'22406', # Posterior thigh fat-free muscle volume (left)
'22404', # Posterior thigh fat-free muscle volume (right)
# Abdominal organ composition
'21084', # Lung volume
'21080', # Liver volume
'21088', # Liver PDFF (fat fraction)
'21089', # Liver iron
'21087', # Pancreas volume
'21090', # Pancreas PDFF (fat fraction)
'21091', # Pancreas iron
'21083', # Spleen volume
'21081', # Left kidney volume
'21082', # Right kidney volume
'21086', # Subcutaneous fat volume
'21085' # Visceral fat volume
]
file_path = '/UK_BB/ukbbdata/ukb.csv'
with open(file_path, 'r') as f:
    headers = f.readline().strip().split(',')
    headers = [header.strip('"') for header in headers]

missing_columns = [col for col in abdominal_mri_columns if not any(header.startswith(col) for header in headers)]
if missing_columns:
    raise ValueError(f"The following required columns are missing: {missing_columns}")
else:
    print("All required columns are present. Proceeding with loading the data.")

    df_abdominal_mri = pd.read_csv(file_path, usecols=lambda col: (col == 'eid' or any(col.startswith(prefix) for prefix in abdominal_mri_columns[1:])))
    df_abdominal_mri.to_csv(os.path.join(data_path, 'abdominal_mri_vars.csv'), index=False)

All required columns are present. Proceeding with loading the data.


In [7]:
print('% missing')
with pd.option_context('display.max_rows', None):
    display(((df_abdominal_mri.isna().sum() / len(df_abdominal_mri)).round(2)*100).sort_values(ascending=False))

% missing


23359-2.2    100.0
21160-3.0    100.0
21090-3.0    100.0
21161-3.0    100.0
21089-3.0    100.0
23359-2.1    100.0
21088-3.0    100.0
21087-3.0    100.0
23359-3.0    100.0
21086-3.0    100.0
23359-3.1    100.0
21085-3.0    100.0
23359-3.2    100.0
21084-3.0    100.0
21162-3.0    100.0
21083-3.0    100.0
21082-3.0    100.0
21163-3.0    100.0
21081-3.0    100.0
21164-0.0    100.0
21080-3.0    100.0
21091-3.0    100.0
22406-3.0     99.0
22407-3.0     99.0
22408-3.0     99.0
22404-3.0     99.0
23355-3.0     99.0
23356-3.0     99.0
23359-2.0     99.0
22403-3.0     99.0
24352-3.0     99.0
24353-3.0     99.0
24354-3.0     99.0
40060-3.0     99.0
22405-3.0     99.0
40062-3.0     99.0
22436-2.0     98.0
22415-2.0     98.0
22416-2.0     98.0
21091-2.0     94.0
21088-2.0     94.0
21089-2.0     94.0
21090-2.0     94.0
21161-2.0     93.0
40062-2.0     93.0
21160-2.0     93.0
21163-2.0     93.0
21087-2.0     93.0
21162-2.0     93.0
21083-2.0     92.0
22433-2.0     92.0
21086-2.0     92.0
21085-2.0   

In [8]:
# Filter Instance 2
abdominal_mri_i2 = df_abdominal_mri[['eid'] + df_abdominal_mri.filter(regex=r'2\.\d$').columns.tolist()]
with pd.option_context('display.max_rows', None):
    display(((abdominal_mri_i2.isna().sum() / len(abdominal_mri_i2)).round(2)*100).sort_values(ascending=False))

23359-2.1    100.0
23359-2.2    100.0
23359-2.0     99.0
22436-2.0     98.0
22416-2.0     98.0
22415-2.0     98.0
21088-2.0     94.0
21089-2.0     94.0
21090-2.0     94.0
21091-2.0     94.0
21160-2.0     93.0
21162-2.0     93.0
21161-2.0     93.0
40062-2.0     93.0
21087-2.0     93.0
21163-2.0     93.0
21085-2.0     92.0
21083-2.0     92.0
21081-2.0     92.0
22435-2.0     92.0
22434-2.0     92.0
22433-2.0     92.0
22432-2.0     92.0
21082-2.0     92.0
22410-2.0     92.0
40060-2.0     92.0
21086-2.0     92.0
21084-2.0     92.0
21080-2.0     92.0
22409-2.0     92.0
24352-2.0     91.0
24353-2.0     89.0
24354-2.0     89.0
22407-2.0     89.0
23356-2.0     89.0
23355-2.0     89.0
22408-2.0     89.0
22406-2.0     89.0
22405-2.0     89.0
22404-2.0     89.0
22403-2.0     89.0
eid            0.0
dtype: float64

In [None]:
# Count % missing
abdominal_mri_i2 = abdominal_mri_i2.drop(columns=['23359-2.0', '23359-2.1', '23359-2.2']).reset_index(drop=True)
abdominal_mri_i2 = abdominal_mri_i2.dropna(subset=[
'24353-2.0',
'24354-2.0',
'22404-2.0',
'23355-2.0',
'23356-2.0',
'22408-2.0',
'22407-2.0',
'22406-2.0',
'22403-2.0',
'22405-2.0']).reset_index(drop=True)
abdominal_mri_i2.columns = abdominal_mri_i2.columns.str.replace('-2.0', '')
with pd.option_context('display.max_rows', None):
    display(((abdominal_mri_i2.isna().sum() / len(abdominal_mri_i2)).round(2)*100).sort_values(ascending=False))

The default value of regex will change from True to False in a future version.


22416    85.0
22415    85.0
22436    83.0
21091    47.0
21090    47.0
21088    45.0
21089    45.0
40062    39.0
21087    33.0
21160    33.0
21162    33.0
21161    33.0
21163    33.0
21083    31.0
21085    31.0
21080    31.0
21086    31.0
21084    31.0
21082    31.0
21081    31.0
22433    29.0
22435    29.0
22434    29.0
22409    29.0
22410    29.0
22432    29.0
40060    28.0
24352    16.0
23356     0.0
24354     0.0
23355     0.0
24353     0.0
eid       0.0
22408     0.0
22407     0.0
22406     0.0
22404     0.0
22403     0.0
22405     0.0
dtype: float64

In [59]:
# Select modalities
kidneys = abdominal_mri_i2[['eid','21163','21161','21162','21160']]
#21164: Kidney fusion, Instance 0 only, <100 participants
liver = abdominal_mri_i2[['eid', '40062','40060']]
abdominal_composition_mri = abdominal_mri_i2[['eid',
'22436', '22434', '24353',
'24354', '24352', '22435',
'23355', '23356', '22432',
'22433', '22410', '22415',
'22416', '22408', '22407',
'22409', '22405', '22403',
'22406', '22404']]
abdominal_organ_composition_mri = abdominal_mri_i2[['eid',
'21084','21080','21088',
'21089','21087','21090',
'21091','21083','21081',
'21082','21086','21085']]

In [60]:
# Rename variables
kidneys = kidneys.rename(columns = {
'21163': 'Kidney distance',
'21161': 'Kidney parenchyma (left)',
'21162': 'Kidney parenchyma (right)',
'21160': 'Kidney parenchyma total'})

liver = liver.rename(columns = {
'40062': 'Liver iron corrected T1 (ct1)',
'40060': 'Liver iron (Fe)'})

abdominal_composition_mri = abdominal_composition_mri.rename(columns = {
'22436': '10P Liver PDFF (proton density fat fraction)',
'22434': 'Abdominal fat ratio',
'23359': 'Anterior thigh error indicator (left)',
'24353': 'Anterior thigh muscle fat infiltration (MFI) (left)',
'24354': 'Anterior thigh muscle fat infiltration (MFI) (right)',
'24352': 'FR liver PDFF mean',
'22435': 'Muscle fat infiltration',
'23355': 'Posterior thigh muscle fat infiltration (MFI) (left)',
'23356': 'Posterior thigh muscle fat infiltration (MFI) (right)',
'22432': 'Total abdominal adipose tissue index',
'22433': 'Weight-to-muscle ratio',
'22410': 'Total trunk fat volume',
'22415': 'Total adipose tissue volume',
'22416': 'Total lean tissue volume',
'22408': 'Abdominal subcutaneous adipose tissue volume (ASAT)',
'22407': 'Visceral adipose tissue volume (VAT)',
'22409': 'Total thigh fat-free muscle volume',
'22405': 'Anterior thigh fat-free muscle volume (left)',
'22403': 'Anterior thigh fat-free muscle volume (right)',
'22406': 'Posterior thigh fat-free muscle volume (left)',
'22404': 'Posterior thigh fat-free muscle volume (right)'
})

abdominal_organ_composition_mri = abdominal_organ_composition_mri.rename(columns = {
'21084': 'Lung volume',
'21080': 'Liver volume',
'21088': 'Liver PDFF (fat fraction)',
'21089': 'Liver iron',
'21087': 'Pancreas volume',
'21090': 'Pancreas PDFF (fat fraction)',
'21091': 'Pancreas iron',
'21083': 'Spleen volume',
'21081': 'Left kidney volume',
'21082': 'Right kidney volume',
'21086': 'Subcutaneous fat volume',
'21085': 'Visceral fat volume'})

In [None]:
# Count % missing in each data frame
abd_mri_dfs = [kidneys, liver, abdominal_composition_mri, abdominal_organ_composition_mri]
abd_mri_dfs_names = ['kidneys', 'liver', 'abdominal composition mri', 'abdominal organ composition mri']
for df_name, df in zip(abd_mri_dfs_names,abd_mri_dfs):
    print('SHAPE of', f'{df_name}:', df.shape)
    with pd.option_context('display.max_rows', None):
        display(((df.isna().sum() / len(df)).round(2)*100).sort_values(ascending=False))

    print('SHAPE of', f'{df_name} after removing NAs:', df.dropna().shape)
    print('______________________')

SHAPE of kidneys: (55091, 5)


Kidney distance              33.0
Kidney parenchyma (left)     33.0
Kidney parenchyma (right)    33.0
Kidney parenchyma total      33.0
eid                           0.0
dtype: float64

SHAPE of kidneys after removing NAs: (36828, 5)
______________________
SHAPE of liver: (55091, 3)


Liver iron corrected T1 (ct1)    39.0
Liver iron (Fe)                  28.0
eid                               0.0
dtype: float64

SHAPE of liver after removing NAs: (33511, 3)
______________________
SHAPE of abdominal composition mri: (55091, 21)


Total adipose tissue volume                              85.0
Total lean tissue volume                                 85.0
10P Liver PDFF (proton density fat fraction)             83.0
Weight-to-muscle ratio                                   29.0
Abdominal fat ratio                                      29.0
Muscle fat infiltration                                  29.0
Total thigh fat-free muscle volume                       29.0
Total abdominal adipose tissue index                     29.0
Total trunk fat volume                                   29.0
FR liver PDFF mean                                       16.0
Visceral adipose tissue volume (VAT)                      0.0
Posterior thigh fat-free muscle volume (left)             0.0
Anterior thigh fat-free muscle volume (right)             0.0
Anterior thigh fat-free muscle volume (left)              0.0
eid                                                       0.0
Abdominal subcutaneous adipose tissue volume (ASAT)       0.0
Posterio

SHAPE of abdominal composition mri after removing NAs: (903, 21)
______________________
SHAPE of abdominal organ composition mri: (55091, 13)


Pancreas PDFF (fat fraction)    47.0
Pancreas iron                   47.0
Liver PDFF (fat fraction)       45.0
Liver iron                      45.0
Pancreas volume                 33.0
Lung volume                     31.0
Liver volume                    31.0
Spleen volume                   31.0
Left kidney volume              31.0
Right kidney volume             31.0
Subcutaneous fat volume         31.0
Visceral fat volume             31.0
eid                              0.0
dtype: float64

SHAPE of abdominal organ composition mri after removing NAs: (26888, 13)
______________________


Drop missing values

In [62]:
# Kidneys
kidneys = kidneys.dropna().reset_index(drop=True)
print('SHAPE:', kidneys.shape)
kidneys.to_csv(os.path.join(data_path, 'kidneys_mri_vars.csv'), index=False)
kidneys.to_csv(os.path.join(data_path, 'kidneys_mri.csv'), index=False)

SHAPE: (36828, 5)


In [63]:
# Liver
liver = liver.dropna().reset_index(drop=True)
print('SHAPE:', liver.shape)
liver.to_csv(os.path.join(data_path, 'liver_mri_vars.csv'), index=False)
liver.to_csv(os.path.join(data_path, 'liver_mri.csv'), index=False)

SHAPE: (33511, 3)


In [None]:
# Abdominal composition
abdominal_composition_mri_11var = abdominal_composition_mri.drop(columns=[
# 85% missing
'Total adipose tissue volume',
'Total lean tissue volume',
'10P Liver PDFF (proton density fat fraction)',
# 29% missing
'Weight-to-muscle ratio',
'Abdominal fat ratio',
'Muscle fat infiltration',
'Total thigh fat-free muscle volume',
'Total abdominal adipose tissue index',
'Total trunk fat volume',
]).dropna().reset_index(drop=True)
print('SHAPE:', abdominal_composition_mri_11var.shape)

abdominal_composition_mri_11var.to_csv(os.path.join(data_path, 'abdominal_composition_mri_11_vars.csv'), index=False)
############################################################################
abdominal_composition_mri_18var = abdominal_composition_mri.drop(columns=[
# 85% missing
'Total adipose tissue volume',
'Total lean tissue volume',
'10P Liver PDFF (proton density fat fraction)',
]).dropna().reset_index(drop=True)
print('SHAPE:', abdominal_composition_mri_18var.shape)

abdominal_composition_mri_18var.to_csv(os.path.join(data_path, 'abdominal_composition_mri_18_vars.csv'), index=False)

SHAPE: (46510, 12)
SHAPE: (30968, 18)


In [None]:
# Abdominal organ composition
abdominal_organ_composition_mri_9var = abdominal_organ_composition_mri.drop(columns=[
# 47% missing
'Pancreas PDFF (fat fraction)',
'Pancreas iron',
'Liver PDFF (fat fraction)',
'Liver iron'
]).dropna().reset_index(drop=True)
print('SHAPE:', abdominal_organ_composition_mri_9var.shape)

abdominal_organ_composition_mri_9var.to_csv(os.path.join(data_path, 'abdominal_organ_composition_mri_9_vars.csv'), index=False)
############################################################################
abdominal_composition_mri_13var = abdominal_organ_composition_mri.dropna().reset_index(drop=True)
print('SHAPE:', abdominal_composition_mri_13var.shape)

abdominal_composition_mri_13var.to_csv(os.path.join(data_path, 'abdominal_composition_mri_13_vars.csv'), index=False)

SHAPE: (36671, 9)
SHAPE: (26888, 13)


Match features to the target and compute sample sizes

In [None]:
# Define body modalities
modalities = [
'kidneys_mri',
'liver_mri',
'abdominal_composition_mri_11_vars',
'abdominal_composition_mri_18_vars',
'abdominal_organ_composition_mri_9_vars',
'abdominal_composition_mri_13_vars']

In [None]:
# Match features to the target and compute sample sizes
folds = range(0,5)
modality_observations = {modality: {'train': 0, 'test': 0} for modality in modalities}

for modality in modalities:
    for fold in folds:
        base_path = f'/UK_BB/brainbody/body'
        folds_path = os.path.join(base_path, f'folds/fold_{fold}')
        suppl_path = os.path.join(folds_path, 'suppl')
        scaling_path = os.path.join(folds_path, 'scaling')
        models_path = os.path.join(folds_path, 'models')
        g_pred_path = os.path.join(folds_path, 'g_pred')
        
        os.makedirs(folds_path, exist_ok=True)
        os.makedirs(suppl_path, exist_ok=True)
        os.makedirs(scaling_path, exist_ok=True)
        os.makedirs(models_path, exist_ok=True)
        os.makedirs(g_pred_path, exist_ok=True)

        g_train = pd.read_csv(f'/UK_BB/brainbody/cognition/folds/fold_{fold}/g/g_train_with_id_{fold}.csv')
        g_test = pd.read_csv(f'/UK_BB/brainbody/cognition/folds/fold_{fold}/g/g_test_with_id_{fold}.csv')
        features = pd.read_csv(os.path.join(base_path, f'data/{modality}.csv'))
        print('Features shape', features.shape)

        feature_columns = features.drop(columns='eid').columns

        # Merge features with targets
        train_merge_all = pd.merge(features, g_train, on='eid').reset_index(drop=True)
        test_merge_all = pd.merge(features, g_test, on='eid').reset_index(drop=True)

        # Save merged data
        train_merge_all.to_csv(os.path.join(suppl_path, f'{modality}_train_feat_targ_fold_{fold}.csv'), index=False)
        test_merge_all.to_csv(os.path.join(suppl_path, f'{modality}_test_feat_targ_fold_{fold}.csv'), index=False)

        print(f'==== Train shape ====\n {modality} - Fold {fold}', train_merge_all.shape)
        print(f'==== Test shape ====\n {modality} - Fold {fold}', test_merge_all.shape)

        # Update the number of observations for the current modality
        modality_observations[modality]['train'] += train_merge_all.shape[0]
        modality_observations[modality]['test'] += test_merge_all.shape[0]

        # Extract features and save
        train_merge_all[feature_columns].to_csv(os.path.join(suppl_path, f'{modality}_train_fold_{fold}.csv'), index=False)
        test_merge_all[feature_columns].to_csv(os.path.join(suppl_path, f'{modality}_test_fold_{fold}.csv'), index=False)

        # Scale features
        print('Scaling')
        scaler_features = StandardScaler()
        features_train_scaled = scaler_features.fit_transform(train_merge_all[feature_columns])
        features_test_scaled = scaler_features.transform(test_merge_all[feature_columns])
        pd.DataFrame(features_train_scaled, columns=feature_columns).to_csv(os.path.join(scaling_path, f'{modality}_train_scaled_fold_{fold}.csv'), index=False)
        pd.DataFrame(features_test_scaled, columns=feature_columns).to_csv(os.path.join(scaling_path, f'{modality}_test_scaled_fold_{fold}.csv'), index=False)

        # Save scaler
        with open(os.path.join(scaling_path, f'{modality}_scaler_features_fold_{fold}.pkl'), "wb") as f:
            pickle.dump(scaler_features, f)

best_modality = max(modality_observations, key=lambda x: modality_observations[x]['train'] + modality_observations[x]['test'])
total_observations = modality_observations[best_modality]['train'] + modality_observations[best_modality]['test']

print('======================================================================================================')
print(f'The modality with the highest number of observations is: {best_modality}: n = {total_observations}')
print(f'Observations in each modality train/test sets:\n', modality_observations)

Features shape (36828, 5)
==== Train shape ====
 kidneys_mri - Fold 0 (18690, 6)
==== Test shape ====
 kidneys_mri - Fold 0 (4761, 6)
Scaling
Features shape (36828, 5)
==== Train shape ====
 kidneys_mri - Fold 1 (18753, 6)
==== Test shape ====
 kidneys_mri - Fold 1 (4698, 6)
Scaling
Features shape (36828, 5)
==== Train shape ====
 kidneys_mri - Fold 2 (18748, 6)
==== Test shape ====
 kidneys_mri - Fold 2 (4703, 6)
Scaling
Features shape (36828, 5)
==== Train shape ====
 kidneys_mri - Fold 3 (18832, 6)
==== Test shape ====
 kidneys_mri - Fold 3 (4619, 6)
Scaling
Features shape (36828, 5)
==== Train shape ====
 kidneys_mri - Fold 4 (18781, 6)
==== Test shape ====
 kidneys_mri - Fold 4 (4670, 6)
Scaling
Features shape (33511, 3)
==== Train shape ====
 liver_mri - Fold 0 (20368, 4)
==== Test shape ====
 liver_mri - Fold 0 (5147, 4)
Scaling
Features shape (33511, 3)
==== Train shape ====
 liver_mri - Fold 1 (20416, 4)
==== Test shape ====
 liver_mri - Fold 1 (5099, 4)
Scaling
Features shape

# Body composition
- body composition by impedance
- body composition by DXA

### Body composition by impedance

In [None]:
df_body_composition_impedance = ukbiobank.utils.utils.loadCsv(ukbio=ukb, fields=['eid',
23098, #Weight
23104, #Body mass index (BMI)
23113, #Leg fat-free mass (right)
23118, #Leg predicted mass (left)
23114, #Leg predicted mass (right)
23123, #Arm fat percentage (left)
23119, #Arm fat percentage (right)
23124, #Arm fat mass (left)
23120, #Arm fat mass (right)
23121, #Arm fat-free mass (right)
23125, #Arm fat-free mass (left)
23126, #Arm predicted mass (left)
23122, #Arm predicted mass (right)
23127, #Trunk fat percentage
23128, #Trunk fat mass
23129, #Trunk fat-free mass
23130, #Trunk predicted mass
23105, #Basal metabolic rate
23099, #Body fat percentage
23100, #Whole body fat mass
23101, #Whole body fat-free mass
23102, #Whole body water mass
23115, #Leg fat percentage (left)
23111, #Leg fat percentage (right)
23116, #Leg fat mass (left)
23112, #Leg fat mass (right)
23117, #Leg fat-free mass (left)
23106, #Impedance of whole body
23110, #Impedance of arm (left)
23109, #Impedance of arm (right)
23108, #Impedance of leg (left)
23107 #Impedance of leg (right)
])
df_body_composition_impedance_names = ukbiobank.utils.utils.fieldIdsToNames(ukbio=ukb, df=df_body_composition_impedance)
df_body_composition_impedance_names.to_csv(os.path.join(data_path, 'body_composition_by_impedance_vars.csv'), index=False)

In [None]:
print('% missing')
with pd.option_context('display.max_rows', None):
    display(((df_body_composition_impedance_names.isna().sum() / len(df_body_composition_impedance_names)).round(2)*100).sort_values(ascending=False))

% missing


Leg predicted mass (right)-3.0                 99.0
Leg predicted mass (left)-3.0                  99.0
Impedance of leg (right)-3.0                   99.0
Impedance of leg (left)-3.0                    99.0
Impedance of arm (right)-3.0                   99.0
Impedance of arm (left)-3.0                    99.0
Leg fat percentage (right)-3.0                 99.0
Leg fat mass (right) (Field ID: 23112)-3.0     99.0
Leg fat-free mass (right)-3.0                  99.0
Leg fat percentage (left)-3.0                  99.0
Leg fat mass (left) (Field ID: 23116)-3.0      99.0
Leg fat-free mass (left)-3.0                   99.0
Arm fat percentage (right)-3.0                 99.0
Basal metabolic rate-3.0                       99.0
Arm fat mass (right) (Field ID: 23120)-3.0     99.0
Arm fat-free mass (right)-3.0                  99.0
Arm predicted mass (right)-3.0                 99.0
Arm fat percentage (left)-3.0                  99.0
Arm fat mass (left) (Field ID: 23124)-3.0      99.0
Arm fat-free

In [None]:
# Filter Instance 2
body_composition_impedance_i2 = df_body_composition_impedance_names[['eid'] + df_body_composition_impedance_names.filter(regex=r'2\.\d$').columns.tolist()]
body_composition_impedance_i2 = body_composition_impedance_i2.dropna(axis=0).reset_index(drop=True)
body_composition_impedance_i2.columns = body_composition_impedance_i2.columns.str.replace('-2.0', '')
body_composition_impedance_i2.to_csv(os.path.join(data_path, 'body_composition_by_impedance_i2_raw.csv'), index=False)
with pd.option_context('display.max_rows', None):
    display(((body_composition_impedance_i2.isna().sum() / len(body_composition_impedance_i2)).round(2)*100).sort_values(ascending=False))

The default value of regex will change from True to False in a future version.


eid                                        0.0
Leg fat percentage (left)                  0.0
Trunk fat-free mass                        0.0
Trunk fat mass (Field ID: 23128)           0.0
Trunk fat percentage                       0.0
Arm predicted mass (left)                  0.0
Arm fat-free mass (left)                   0.0
Arm fat mass (left) (Field ID: 23124)      0.0
Arm fat percentage (left)                  0.0
Arm predicted mass (right)                 0.0
Arm fat-free mass (right)                  0.0
Arm fat mass (right) (Field ID: 23120)     0.0
Arm fat percentage (right)                 0.0
Leg predicted mass (left)                  0.0
Leg fat-free mass (left)                   0.0
Leg fat mass (left) (Field ID: 23116)      0.0
Leg predicted mass (right)                 0.0
Weight (Field ID: 23098)                   0.0
Leg fat-free mass (right)                  0.0
Leg fat mass (right) (Field ID: 23112)     0.0
Leg fat percentage (right)                 0.0
Impedance of 

In [None]:
# Rename some variables
body_composition_impedance_i2 = body_composition_impedance_i2.rename(columns=
{'Weight (Field ID: 23098)': 'Weight',
'Body mass index (BMI) (Field ID: 23104)': 'Body mass index (BMI)',
'Leg fat mass (right) (Field ID: 23112)': 'Leg fat mass (right)',
'Leg fat mass (left) (Field ID: 23116)': 'Leg fat mass (left)',
'Arm fat mass (right) (Field ID: 23120)': 'Arm fat mass (right)',
'Arm fat mass (left) (Field ID: 23124)': 'Arm fat mass (left)',
'Trunk fat mass (Field ID: 23128)': 'Trunk fat mass'
})

body_composition_impedance_i2.to_csv(os.path.join(data_path, 'body_composition_by_impedance.csv'), index=False)

Match features to the target and compute sample sizes

In [None]:
# Match features to the target and compute sample sizes
folds = range(0,5)
modalities = ['body_composition_by_impedance']

modality_observations = {modality: {'train': 0, 'test': 0} for modality in modalities}

for modality in modalities:
    for fold in folds:
        base_path = f'/UK_BB/brainbody/body'
        folds_path = os.path.join(base_path, f'folds/fold_{fold}')
        suppl_path = os.path.join(folds_path, 'suppl')
        scaling_path = os.path.join(folds_path, 'scaling')
        models_path = os.path.join(folds_path, 'models')
        g_pred_path = os.path.join(folds_path, 'g_pred')
        
        os.makedirs(folds_path, exist_ok=True)
        os.makedirs(suppl_path, exist_ok=True)
        os.makedirs(scaling_path, exist_ok=True)
        os.makedirs(models_path, exist_ok=True)
        os.makedirs(g_pred_path, exist_ok=True)

        g_train = pd.read_csv(f'/UK_BB/brainbody/cognition/folds/fold_{fold}/g/g_train_with_id_{fold}.csv')
        g_test = pd.read_csv(f'/UK_BB/brainbody/cognition/folds/fold_{fold}/g/g_test_with_id_{fold}.csv')
        features = pd.read_csv(os.path.join(base_path, f'data/{modality}.csv'))
        print('Features shape', features.shape)

        feature_columns = features.drop(columns='eid').columns

        # Merge features with targets
        train_merge_all = pd.merge(features, g_train, on='eid').reset_index(drop=True)
        test_merge_all = pd.merge(features, g_test, on='eid').reset_index(drop=True)

        # Save merged data
        train_merge_all.to_csv(os.path.join(suppl_path, f'{modality}_train_feat_targ_fold_{fold}.csv'), index=False)
        test_merge_all.to_csv(os.path.join(suppl_path, f'{modality}_test_feat_targ_fold_{fold}.csv'), index=False)

        print(f'==== Train shape ====\n {modality} - Fold {fold}', train_merge_all.shape)
        print(f'==== Test shape ====\n {modality} - Fold {fold}', test_merge_all.shape)

        # Update the number of observations for the current modality
        modality_observations[modality]['train'] += train_merge_all.shape[0]
        modality_observations[modality]['test'] += test_merge_all.shape[0]

        # Extract features and save
        train_merge_all[feature_columns].to_csv(os.path.join(suppl_path, f'{modality}_train_fold_{fold}.csv'), index=False)
        test_merge_all[feature_columns].to_csv(os.path.join(suppl_path, f'{modality}_test_fold_{fold}.csv'), index=False)

        # Scale features
        print('Scaling')
        scaler_features = StandardScaler()
        features_train_scaled = scaler_features.fit_transform(train_merge_all[feature_columns])
        features_test_scaled = scaler_features.transform(test_merge_all[feature_columns])
        pd.DataFrame(features_train_scaled, columns=feature_columns).to_csv(os.path.join(scaling_path, f'{modality}_train_scaled_fold_{fold}.csv'), index=False)
        pd.DataFrame(features_test_scaled, columns=feature_columns).to_csv(os.path.join(scaling_path, f'{modality}_test_scaled_fold_{fold}.csv'), index=False)

        # Save scaler
        with open(os.path.join(scaling_path, f'{modality}_scaler_features_fold_{fold}.pkl'), "wb") as f:
            pickle.dump(scaler_features, f)

best_modality = max(modality_observations, key=lambda x: modality_observations[x]['train'] + modality_observations[x]['test'])
total_observations = modality_observations[best_modality]['train'] + modality_observations[best_modality]['test']

print('======================================================================================================')
print(f'The modality with the highest number of observations is: {best_modality}: n = {total_observations}')
print(f'Observations in each modality train/test sets:\n', modality_observations)

Features shape (55611, 33)
==== Train shape ====
 body_composition_by_impedance - Fold 0 (23912, 34)
==== Test shape ====
 body_composition_by_impedance - Fold 0 (5966, 34)
Scaling
Features shape (55611, 33)
==== Train shape ====
 body_composition_by_impedance - Fold 1 (23895, 34)
==== Test shape ====
 body_composition_by_impedance - Fold 1 (5983, 34)
Scaling
Features shape (55611, 33)
==== Train shape ====
 body_composition_by_impedance - Fold 2 (23923, 34)
==== Test shape ====
 body_composition_by_impedance - Fold 2 (5955, 34)
Scaling
Features shape (55611, 33)
==== Train shape ====
 body_composition_by_impedance - Fold 3 (23878, 34)
==== Test shape ====
 body_composition_by_impedance - Fold 3 (6000, 34)
Scaling
Features shape (55611, 33)
==== Train shape ====
 body_composition_by_impedance - Fold 4 (23904, 34)
==== Test shape ====
 body_composition_by_impedance - Fold 4 (5974, 34)
Scaling
The modality with the highest number of observations is: body_composition_by_impedance: n = 149

###  Body composition by DXA

In [None]:
# Upload variables
body_composition_dxa_columns = [
'eid',  # Keep 'eid' as is
'21110', #Android fat free mass
'21111', #Android tissue mass
'21112', #Arm bone mass (left)
'21116', #Arm bone mass (right)
'21113', #Arm fat free mass (left)
'21117', #Arm fat free mass (right)
'21114', #Arm tissue mass (left)
'21118', #Arm tissue mass (right)
'21119', #Arms bone mass
'21120', #Arms fat free mass
'21121', #Arms tissue mass
'21123', #Gynoid fat free mass
'21124', #Gynoid tissue mass
'21125', #Leg bone mass (left)
'21128', #Leg bone mass (right)
'21126', #Leg fat free mass (left)
'21129', #Leg fat free mass (right)
'21127', #Leg tissue mass (left)
'21130', #Leg tissue mass (right)
'21131', #Legs bone mass
'21132', #Legs fat free mass
'21133', #Legs tissue mass
'21122', #Total bone mass
'21134', #Trunk bone mass
'21135', #Trunk tissue mass
'23244', #Android bone mass
'23245', #Android fat mass
'23246', #Android lean mass
'23247', #Android tissue fat percentage
'23248', #Android total mass
'23249', #Arm fat mass (left)
'23253', #Arm fat mass (right)
'23250', #Arm lean mass (left)
'23254', #Arm lean mass (right)
'23251', #Arm tissue fat percentage (left)
'23255', #Arm tissue fat percentage (right)
'23252', #Arm total mass (left)
'23256', #Arm total mass (right)
'23257', #Arms fat mass
'23258', #Arms lean mass
'23259', #Arms tissue fat percentage
'23260', #Arms total mass
'23261', #Gynoid bone mass
'23262', #Gynoid fat mass
'23263', #Gynoid lean mass
'23264', #Gynoid tissue fat percentage
'23265', #Gynoid total mass
'23266', #Leg fat mass (left)
'23270', #Leg fat mass (right)
'23267', #Leg lean mass (left)
'23271', #Leg lean mass (right)
'23268', #Leg tissue fat percentage (left)
'23272', #Leg tissue fat percentage (right)
'23269', #Leg total mass (left)
'23273', #Leg total mass (right)
'23274', #Legs fat mass
'23275', #Legs lean mass
'23276', #Legs tissue fat percentage
'23277', #Legs total mass
'23278', #Total fat mass
'23279', #Total fat-free mass
'23280', #Total lean mass
'23281', #Total tissue fat percentage
'23282', #Total tissue mass
'23283', #Total mass
'23284', #Trunk fat mass
'23285', #Trunk lean mass
'23286', #Trunk tissue fat percentage
'23287', #Trunk total mass
'23288', #VAT (visceral adipose tissue) mass
'23289' #VAT (visceral adipose tissue) volume
]

file_path = '/UK_BB/ukbbdata/ukb.csv'
with open(file_path, 'r') as f:
    headers = f.readline().strip().split(',')
    headers = [header.strip('"') for header in headers]

missing_columns = [col for col in body_composition_dxa_columns if not any(header.startswith(col) for header in headers)]
if missing_columns:
    raise ValueError(f"The following required columns are missing: {missing_columns}")
else:
    print("All required columns are present. Proceeding with loading the data.")

    df_body_composition_dxa = pd.read_csv(file_path, usecols=lambda col: (col == 'eid' or any(col.startswith(prefix) for prefix in body_composition_dxa_columns[1:])))
    df_body_composition_dxa.to_csv(os.path.join(data_path, 'body_composition_by_dxa_vars.csv'), index=False)

#lstrip(): Removes leading characters only.
#rstrip(): Removes trailing characters only.
#header.strip('"'): Removes the leading and trailing " from each header.

In [None]:
print('% missing')
df_body_composition_dxa = pd.read_csv(os.path.join(data_path, 'body_composition_by_dxa_vars.csv'))
with pd.option_context('display.max_rows', None):
    display(((df_body_composition_dxa.isna().sum() / len(df_body_composition_dxa)).round(2)*100).sort_values(ascending=False))

% missing


23252-3.0    100.0
23270-3.0    100.0
23254-3.0    100.0
23268-3.0    100.0
23253-3.0    100.0
23269-3.0    100.0
21129-3.0    100.0
23251-3.0    100.0
23250-3.0    100.0
23273-3.0    100.0
23249-3.0    100.0
23271-3.0    100.0
21125-3.0    100.0
23272-3.0    100.0
21126-3.0    100.0
21128-3.0    100.0
23267-3.0    100.0
23255-3.0    100.0
21118-3.0    100.0
21114-3.0    100.0
21112-3.0    100.0
23256-3.0    100.0
21130-3.0    100.0
21113-3.0    100.0
21116-3.0    100.0
23266-3.0    100.0
21117-3.0    100.0
21127-3.0    100.0
23274-3.0     99.0
23245-3.0     99.0
23246-3.0     99.0
23275-3.0     99.0
23244-3.0     99.0
23276-3.0     99.0
21135-3.0     99.0
23289-3.0     99.0
23247-3.0     99.0
23248-3.0     99.0
21134-3.0     99.0
23257-3.0     99.0
23258-3.0     99.0
23265-3.0     99.0
23259-3.0     99.0
23264-3.0     99.0
23260-3.0     99.0
23263-3.0     99.0
23277-3.0     99.0
23280-3.0     99.0
23278-3.0     99.0
21123-3.0     99.0
21110-3.0     99.0
23288-3.0     99.0
21111-3.0   

In [None]:
# Filter Instance 2
body_composition_dxa_i2 = df_body_composition_dxa[['eid'] + df_body_composition_dxa.filter(regex=r'2\.\d$').columns.tolist()]
body_composition_dxa_i2.columns = body_composition_dxa_i2.columns.str.replace('-2.0', '')
print('SHAPE after removing NA:', body_composition_dxa_i2.dropna(axis=0).reset_index(drop=True).shape)

SHAPE after removing NA: (16874, 72)


The default value of regex will change from True to False in a future version.


In [None]:
# Rename columns
body_composition_dxa_i2_renamed = body_composition_dxa_i2.rename(columns=
{'eid': 'eid',
'21110': 'Android fat free mass',
'21111': 'Android tissue mass',
'21112': 'Arm bone mass (left)',
'21116': 'Arm bone mass (right)',
'21113': 'Arm fat free mass (left)',
'21117': 'Arm fat free mass (right)',
'21114': 'Arm tissue mass (left)',
'21118': 'Arm tissue mass (right)',
'21119': 'Arms bone mass',
'21120': 'Arms fat free mass',
'21121': 'Arms tissue mass',
'21123': 'Gynoid fat free mass',
'21124': 'Gynoid tissue mass',
'21125': 'Leg bone mass (left)',
'21128': 'Leg bone mass (right)',
'21126': 'Leg fat free mass (left)',
'21129': 'Leg fat free mass (right)',
'21127': 'Leg tissue mass (left)',
'21130': 'Leg tissue mass (right)',
'21131': 'Legs bone mass',
'21132': 'Legs fat free mass',
'21133': 'Legs tissue mass',
'21122': 'Total bone mass',
'21134': 'Trunk bone mass',
'21135': 'Trunk tissue mass',
'23244': 'Android bone mass',
'23245': 'Android fat mass',
'23246': 'Android lean mass',
'23247': 'Android tissue fat percentage',
'23248': 'Android total mass',
'23249': 'Arm fat mass (left)',
'23253': 'Arm fat mass (right)',
'23250': 'Arm lean mass (left)',
'23254': 'Arm lean mass (right)',
'23251': 'Arm tissue fat percentage (left)',
'23255': 'Arm tissue fat percentage (right)',
'23252': 'Arm total mass (left)',
'23256': 'Arm total mass (right)',
'23257': 'Arms fat mass',
'23258': 'Arms lean mass',
'23259': 'Arms tissue fat percentage',
'23260': 'Arms total mass',
'23261': 'Gynoid bone mass',
'23262': 'Gynoid fat mass',
'23263': 'Gynoid lean mass',
'23264': 'Gynoid tissue fat percentage',
'23265': 'Gynoid total mass',
'23266': 'Leg fat mass (left)',
'23270': 'Leg fat mass (right)',
'23267': 'Leg lean mass (left)',
'23271': 'Leg lean mass (right)',
'23268': 'Leg tissue fat percentage (left)',
'23272': 'Leg tissue fat percentage (right)',
'23269': 'Leg total mass (left)',
'23273': 'Leg total mass (right)',
'23274': 'Legs fat mass',
'23275': 'Legs lean mass',
'23276': 'Legs tissue fat percentage',
'23277': 'Legs total mass',
'23278': 'Total fat mass',
'23279': 'Total fat-free mass',
'23280': 'Total lean mass',
'23281': 'Total tissue fat percentage',
'23282': 'Total tissue mass',
'23283': 'Total mass',
'23284': 'Trunk fat mass',
'23285': 'Trunk lean mass',
'23286': 'Trunk tissue fat percentage',
'23287': 'Trunk total mass',
'23288': 'VAT (visceral adipose tissue) mass',
'23289': 'VAT (visceral adipose tissue) volume'
})
with pd.option_context('display.max_rows', None):
    display(((body_composition_dxa_i2_renamed.isna().sum() / len(body_composition_dxa_i2_renamed)).round(2)*100).sort_values(ascending=False))

Leg bone mass (right)                   97.0
Leg tissue mass (left)                  97.0
Arm bone mass (left)                    97.0
Arm fat free mass (left)                97.0
Arm tissue mass (left)                  97.0
Arm bone mass (right)                   97.0
Arm fat free mass (right)               97.0
Arm tissue mass (right)                 97.0
Leg fat free mass (right)               97.0
Leg fat free mass (left)                97.0
Leg bone mass (left)                    97.0
Leg tissue mass (right)                 97.0
Arm fat mass (left)                     96.0
Arm lean mass (right)                   96.0
Arm lean mass (left)                    96.0
Arm fat mass (right)                    96.0
Leg lean mass (right)                   96.0
Leg fat mass (left)                     96.0
Leg fat mass (right)                    96.0
Leg lean mass (left)                    96.0
Arm total mass (left)                   95.0
Leg total mass (left)                   95.0
Leg tissue

In [None]:
# Drop columns with >93% missing
columns_to_remove = [
'Leg bone mass (right)',
'Leg tissue mass (left)',
'Arm bone mass (left)',
'Arm fat free mass (left)',
'Arm tissue mass (left)',
'Arm bone mass (right)',
'Arm fat free mass (right)',
'Arm tissue mass (right)',
'Leg fat free mass (right)',
'Leg fat free mass (left)',
'Leg bone mass (left)',
'Leg tissue mass (right)',
'Arm fat mass (left)',
'Arm lean mass (right)',
'Arm lean mass (left)',
'Arm fat mass (right)',
'Leg lean mass (right)',
'Leg fat mass (left)',
'Leg fat mass (right)',
'Leg lean mass (left)',
'Arm total mass (left)',
'Leg total mass (left)',
'Leg tissue fat percentage (left)',
'Leg tissue fat percentage (right)',
'Leg total mass (right)',
'Arm total mass (right)',
'Arm tissue fat percentage (right)',
'Arm tissue fat percentage (left)'
]
body_composition_dxa_i2_renamed = body_composition_dxa_i2_renamed.drop(columns=columns_to_remove).dropna().reset_index(drop=True)
body_composition_dxa_i2_renamed.to_csv(os.path.join(data_path, 'body_composition_dxa.csv'), index=False)

Match features to the target and compute sample sizes

In [None]:
# Match features to the target and compute sample sizes
folds = range(0,5)
modalities = ['body_composition_dxa']

modality_observations = {modality: {'train': 0, 'test': 0} for modality in modalities}

for modality in modalities:
    for fold in folds:
        base_path = f'/UK_BB/brainbody/body'
        folds_path = os.path.join(base_path, f'folds/fold_{fold}')
        suppl_path = os.path.join(folds_path, 'suppl')
        scaling_path = os.path.join(folds_path, 'scaling')
        models_path = os.path.join(folds_path, 'models')
        g_pred_path = os.path.join(folds_path, 'g_pred')
        
        os.makedirs(folds_path, exist_ok=True)
        os.makedirs(suppl_path, exist_ok=True)
        os.makedirs(scaling_path, exist_ok=True)
        os.makedirs(models_path, exist_ok=True)
        os.makedirs(g_pred_path, exist_ok=True)

        g_train = pd.read_csv(f'/UK_BB/brainbody/cognition/folds/fold_{fold}/g/g_train_with_id_{fold}.csv')
        g_test = pd.read_csv(f'/UK_BB/brainbody/cognition/folds/fold_{fold}/g/g_test_with_id_{fold}.csv')
        features = pd.read_csv(os.path.join(base_path, f'data/{modality}.csv'))
        print('Features shape', features.shape)

        feature_columns = features.drop(columns='eid').columns

        # Merge features with targets
        train_merge_all = pd.merge(features, g_train, on='eid').reset_index(drop=True)
        test_merge_all = pd.merge(features, g_test, on='eid').reset_index(drop=True)

        # Save merged data
        train_merge_all.to_csv(os.path.join(suppl_path, f'{modality}_train_feat_targ_fold_{fold}.csv'), index=False)
        test_merge_all.to_csv(os.path.join(suppl_path, f'{modality}_test_feat_targ_fold_{fold}.csv'), index=False)

        print(f'==== Train shape ====\n {modality} - Fold {fold}', train_merge_all.shape)
        print(f'==== Test shape ====\n {modality} - Fold {fold}', test_merge_all.shape)

        # Update the number of observations for the current modality
        modality_observations[modality]['train'] += train_merge_all.shape[0]
        modality_observations[modality]['test'] += test_merge_all.shape[0]

        # Extract features and save
        train_merge_all[feature_columns].to_csv(os.path.join(suppl_path, f'{modality}_train_fold_{fold}.csv'), index=False)
        test_merge_all[feature_columns].to_csv(os.path.join(suppl_path, f'{modality}_test_fold_{fold}.csv'), index=False)

        # Scale features
        print('Scaling')
        scaler_features = StandardScaler()
        features_train_scaled = scaler_features.fit_transform(train_merge_all[feature_columns])
        features_test_scaled = scaler_features.transform(test_merge_all[feature_columns])
        pd.DataFrame(features_train_scaled, columns=feature_columns).to_csv(os.path.join(scaling_path, f'{modality}_train_scaled_fold_{fold}.csv'), index=False)
        pd.DataFrame(features_test_scaled, columns=feature_columns).to_csv(os.path.join(scaling_path, f'{modality}_test_scaled_fold_{fold}.csv'), index=False)

        # Save scaler
        with open(os.path.join(scaling_path, f'{modality}_scaler_features_fold_{fold}.pkl'), "wb") as f:
            pickle.dump(scaler_features, f)

best_modality = max(modality_observations, key=lambda x: modality_observations[x]['train'] + modality_observations[x]['test'])
total_observations = modality_observations[best_modality]['train'] + modality_observations[best_modality]['test']

print('======================================================================================================')
print(f'The modality with the highest number of observations is: {best_modality}: n = {total_observations}')
print(f'Observations in each modality train/test sets:\n', modality_observations)

Features shape (34964, 44)
==== Train shape ====
 body_composition_dxa - Fold 0 (18163, 45)
==== Test shape ====
 body_composition_dxa - Fold 0 (4637, 45)
Scaling
Features shape (34964, 44)
==== Train shape ====
 body_composition_dxa - Fold 1 (18304, 45)
==== Test shape ====
 body_composition_dxa - Fold 1 (4496, 45)
Scaling
Features shape (34964, 44)
==== Train shape ====
 body_composition_dxa - Fold 2 (18275, 45)
==== Test shape ====
 body_composition_dxa - Fold 2 (4525, 45)
Scaling
Features shape (34964, 44)
==== Train shape ====
 body_composition_dxa - Fold 3 (18189, 45)
==== Test shape ====
 body_composition_dxa - Fold 3 (4611, 45)
Scaling
Features shape (34964, 44)
==== Train shape ====
 body_composition_dxa - Fold 4 (18269, 45)
==== Test shape ====
 body_composition_dxa - Fold 4 (4531, 45)
Scaling
The modality with the highest number of observations is: body_composition_dxa: n = 114000
Observations in each modality train/test sets:
 {'body_composition_dxa': {'train': 91200, 'test

# Bone size, mineral and density by DXA

In [None]:
df_bone_dxa_fields = ukbiobank.utils.utils.loadCsv(ukbio=ukb, fields=['eid',
20310, #Alpha angle
21137, #B2-B4 BMC (bone mineral content)
21138, #B2-B4 BMD (bine mineral density)
21136, #B2-B4 area
20317, #Grading of a superior femoral osteophyte
20316, #Grading of an acetabular osteophyte
20318, #Grading of an inferior femoral osteophyte
20315, #Grading of any osteophyte at any site
20319, #Grading of osteophytes at all three site
20300, #Hip shape mode score 1
20309, #Hip shape mode score 10
20301, #Hip shape mode score 2
20302, #Hip shape mode score 3
20303, #Hip shape mode score 4
20304, #Hip shape mode score 5
20305, #Hip shape mode score 6
20306, #Hip shape mode score 7
20307, #Hip shape mode score 8
20308, #Hip shape mode score 9
20313, #Index mJSW
20311, #Lateral centre-edge angle
20312, #Minimum joint space width (mJSW)
20320, #Osteophyte grading at any site
20321, #Radiographic osteoarthritis grading
23305, #Head bone area
23220, #Arm BMC (bone mineral content) (left)
23222, #Arm BMC (bone mineral content) (right)
23317, #Arms combined bone area
23221, #Arm BMD (bone mineral density) (left)
23223, #Arm BMD (bone mineral density) (right)
23313, #Arm bone area (left)
23314, #Arm bone area (right)
23309, #Ribs bone area
23224, #Arms BMC (bone mineral content)
23225, #Arms BMD (bone mineral density)
23311, #Spine bone area
23302, #Femur lower neck BMD (bone mineral density) (left)
23206, #Femur lower neck BMD (bone mineral density) (right)
23304, #Trunk bone area
23307, #Pelvis bone area
23318, #Legs combined bone area
23315, #Leg bone area (left)
23207, #Femur lower neck BMD (bone mineral density) T-score (right)
23294, #Femur lower neck BMD (bone mineral density) T-score (left)
23299, #Femur neck BMD (bone mineral density) (left)
23208, #Femur neck BMD (bone mineral density) (right)
23316, #Leg bone area (right)
23327, #Femur neck BMC (bone mineral content) (left)
23328, #Femur neck BMC (bone mineral content) (right)
23325, #Femur neck bone area (left)
23326, #Femur neck bone area (right)
23331, #Femur shaft BMC (bone mineral content) (left)
23332, #Femur shaft BMC (bone mineral content) (right)
23329, #Femur shaft bone area (left)
23330, #Femur shaft bone area (right)
23335, #Femur total BMC (bone mineral content) (left)
23336, #Femur total BMC (bone mineral content) (right)
23333, #Femur total area (left)
23334, #Femur total area (right)
23339, #Femur troch BMC (bone mineral content) (left)
23340, #Femur troch BMC (bone mineral content) (right)
23337, #Femur troch bone area (left)
23338, #Femur troch bone area (right)
23343, #Femur wards BMC (bone mineral content) (left)
23344, #Femur wards BMC (bone mineral content) (right)
23341, #Femur wards bone area (left)
23342, #Femur wards bone area (right)
23300, #Femur neck BMD (bone mineral density) T-score (left)
23209, #Femur neck BMD (bone mineral density) T-score (right)
23290, #Femur shaft BMD (bone mineral density) (left)
23210, #Femur shaft BMD (bone mineral density) (right)
23303, #Femur shaft BMD (bone mineral density) T-score (left)
23211, #Femur shaft BMD (bone mineral density) T-score (right)
23291, #Femur total BMD (bone mineral density) (left)
23212, #Femur total BMD (bone mineral density) (right)
23293, #Femur total BMD (bone mineral density) T-score (left)
23213, #Femur total BMD (bone mineral density) T-score (right)
23295, #Femur troch BMD (bone mineral density) (left)
23214, #Femur troch BMD (bone mineral density) (right)
23298, #Femur troch BMD (bone mineral density) T-score (left)
23215, #Femur troch BMD (bone mineral density) T-score (right)
23200, #L1-L4 area
23201, #L1-L4 average height
23202, #L1-L4 average width
21005, #L1-L4 TBS (trabecular bone score)
23292, #Femur upper neck BMD (bone mineral density) (left)
23216, #Femur upper neck BMD (bone mineral density) (right)
23296, #Femur upper neck BMD (bone mineral density) T-score (left)
23217, #Femur upper neck BMD (bone mineral density) T-score (right)
23297, #Femur wards BMD (bone mineral density) (left)
23218, #Femur wards BMD (bone mineral density) (right)
23301, #Femur wards BMD (bone mineral density) T-score (left)
23219, #Femur wards BMD (bone mineral density) T-score (right)
23306, #Head BMC (bone mineral content)
23226, #Head BMD (bone mineral density)
23203, #L1-L4 BMC (bone mineral content)
23204, #L1-L4 BMD (bone mineral density)
23205, #L1-L4 BMD (bone mineral density) T-score
23320, #Leg BMC (bone mineral content) (left)
23228, #Leg BMC (bone mineral content) (right)
23227, #Leg BMD (bone mineral density) (left)
23229, #Leg BMD (bone mineral density) (right)
23230, #Legs BMC (bone mineral content)
23231, #Legs BMD (bone mineral density)
23308, #Pelvis BMC (bone mineral content)
23232, #Pelvis BMD (bone mineral density)
23310, #Ribs BMC (bone mineral content)
23233, #Ribs BMD (bone mineral density)
23312, #Spine BMC (bone mineral content)
23234, #Spine BMD (bone mineral density)
23235, #Total BMC (bone mineral content)
23236, #Total BMD (bone mineral density)
23237, #Total BMD (bone mineral density) (left)
23238, #Total BMD (bone mineral density) (right)
23239, #Total BMD (bone mineral density) T-score
23240, #Trunk BMC (bone mineral content)
23241, #Trunk BMD (bone mineral density)
23242, #Trunk BMD (bone mineral density) (left)
23243 #Trunk BMD (bone mineral density) (right)
])
df_bone_dxa_fields.to_csv(os.path.join(data_path, 'bone_dxa_ukbb_raw.csv'), index=False)

The following variables were not found:
 [20310, 21137, 21138, 21136, 20317, 20316, 20318, 20315, 20319, 20300, 20309, 20301, 20302, 20303, 20304, 20305, 20306, 20307, 20308, 20313, 20311, 20312, 20320, 20321, 23207, 23294, 23327, 23328, 23325, 23326, 23331, 23332, 23329, 23330, 23335, 23336, 23333, 23334, 23339, 23340, 23337, 23338, 23343, 23344, 23341, 23342, 23303, 23211, 21005]. 

 This may be because the same fieldname is used across multiple field IDs.

 Either check the spelling of the variables or alternatively try using the field ID


In [None]:
# Upload variables
bone_dxa_columns = [
'eid',
'20310', #Alpha angle
'21137', #B2-B4 BMC (bone mineral content)
'21138', #B2-B4 BMD (bone mineral density)
'21136', #B2-B4 area
'20317', #Grading of a superior femoral osteophyte
'20316', #Grading of an acetabular osteophyte
'20318', #Grading of an inferior femoral osteophyte
'20315', #Grading of any osteophyte at any site
'20319', #Grading of osteophytes at all three site
'20300', #Hip shape mode score 1
'20309', #Hip shape mode score 10
'20301', #Hip shape mode score 2
'20302', #Hip shape mode score 3
'20303', #Hip shape mode score 4
'20304', #Hip shape mode score 5
'20305', #Hip shape mode score 6
'20306', #Hip shape mode score 7
'20307', #Hip shape mode score 8
'20308', #Hip shape mode score 9
'20313', #Index mJSW
'20311', #Lateral centre-edge angle
'20312', #Minimum joint space width (mJSW)
'20320', #Osteophyte grading at any site
'20321', #Radiographic osteoarthritis grading
'23305', #Head bone area
'23220', #Arm BMC (bone mineral content) (left)
'23222', #Arm BMC (bone mineral content) (right)
'23317', #Arms combined bone area
'23221', #Arm BMD (bone mineral density) (left)
'23223', #Arm BMD (bone mineral density) (right)
'23313', #Arm bone area (left)
'23314', #Arm bone area (right)
'23309', #Ribs bone area
'23224', #Arms BMC (bone mineral content)
'23225', #Arms BMD (bone mineral density)
'23311', #Spine bone area
'23302', #Femur lower neck BMD (bone mineral density) (left)
'23206', #Femur lower neck BMD (bone mineral density) (right)
'23304', #Trunk bone area
'23307', #Pelvis bone area
'23318', #Legs combined bone area
'23315', #Leg bone area (left)
'23207', #Femur lower neck BMD (bone mineral density) T-score (right)
'23294', #Femur lower neck BMD (bone mineral density) T-score (left)
'23299', #Femur neck BMD (bone mineral density) (left)
'23208', #Femur neck BMD (bone mineral density) (right)
'23316', #Leg bone area (right)
'23327', #Femur neck BMC (bone mineral content) (left)
'23328', #Femur neck BMC (bone mineral content) (right)
'23325', #Femur neck bone area (left)
'23326', #Femur neck bone area (right)
'23331', #Femur shaft BMC (bone mineral content) (left)
'23332', #Femur shaft BMC (bone mineral content) (right)
'23329', #Femur shaft bone area (left)
'23330', #Femur shaft bone area (right)
'23335', #Femur total BMC (bone mineral content) (left)
'23336', #Femur total BMC (bone mineral content) (right)
'23333', #Femur total area (left)
'23334', #Femur total area (right)
'23339', #Femur troch BMC (bone mineral content) (left)
'23340', #Femur troch BMC (bone mineral content) (right)
'23337', #Femur troch bone area (left)
'23338', #Femur troch bone area (right)
'23343', #Femur wards BMC (bone mineral content) (left)
'23344', #Femur wards BMC (bone mineral content) (right)
'23341', #Femur wards bone area (left)
'23342', #Femur wards bone area (right)
'23300', #Femur neck BMD (bone mineral density) T-score (left)
'23209', #Femur neck BMD (bone mineral density) T-score (right)
'23290', #Femur shaft BMD (bone mineral density) (left)
'23210', #Femur shaft BMD (bone mineral density) (right)
'23303', #Femur shaft BMD (bone mineral density) T-score (left)
'23211', #Femur shaft BMD (bone mineral density) T-score (right)
'23291', #Femur total BMD (bone mineral density) (left)
'23212', #Femur total BMD (bone mineral density) (right)
'23293', #Femur total BMD (bone mineral density) T-score (left)
'23213', #Femur total BMD (bone mineral density) T-score (right)
'23295', #Femur troch BMD (bone mineral density) (left)
'23214', #Femur troch BMD (bone mineral density) (right)
'23298', #Femur troch BMD (bone mineral density) T-score (left)
'23215', #Femur troch BMD (bone mineral density) T-score (right)
'23200', #L1-L4 area
'23201', #L1-L4 average height
'23202', #L1-L4 average width
'21005', #L1-L4 TBS (trabecular bone score)
'23292', #Femur upper neck BMD (bone mineral density) (left)
'23216', #Femur upper neck BMD (bone mineral density) (right)
'23296', #Femur upper neck BMD (bone mineral density) T-score (left)
'23217', #Femur upper neck BMD (bone mineral density) T-score (right)
'23297', #Femur wards BMD (bone mineral density) (left)
'23218', #Femur wards BMD (bone mineral density) (right)
'23301', #Femur wards BMD (bone mineral density) T-score (left)
'23219', #Femur wards BMD (bone mineral density) T-score (right)
'23306', #Head BMC (bone mineral content)
'23226', #Head BMD (bone mineral density)
'23203', #L1-L4 BMC (bone mineral content)
'23204', #L1-L4 BMD (bone mineral density)
'23205', #L1-L4 BMD (bone mineral density) T-score
'23320', #Leg BMC (bone mineral content) (left)
'23228', #Leg BMC (bone mineral content) (right)
'23227', #Leg BMD (bone mineral density) (left)
'23229', #Leg BMD (bone mineral density) (right)
'23230', #Legs BMC (bone mineral content)
'23231', #Legs BMD (bone mineral density)
'23308', #Pelvis BMC (bone mineral content)
'23232', #Pelvis BMD (bone mineral density)
'23310', #Ribs BMC (bone mineral content)
'23233', #Ribs BMD (bone mineral density)
'23312', #Spine BMC (bone mineral content)
'23234', #Spine BMD (bone mineral density)
'23235', #Total BMC (bone mineral content)
'23236', #Total BMD (bone mineral density)
'23237', #Total BMD (bone mineral density) (left)
'23238', #Total BMD (bone mineral density) (right)
'23239', #Total BMD (bone mineral density) T-score
'23240', #Trunk BMC (bone mineral content)
'23241', #Trunk BMD (bone mineral density)
'23242', #Trunk BMD (bone mineral density) (left)
'23243' #Trunk BMD (bone mineral density) (right)
]

file_path = '/UK_BB/ukbbdata/ukb.csv'
with open(file_path, 'r') as f:
    headers = f.readline().strip().split(',')
    headers = [header.strip('"') for header in headers]

missing_columns = [col for col in bone_dxa_columns if not any(header.startswith(col) for header in headers)]
available_columns = [col for col in bone_dxa_columns if col not in missing_columns]

if missing_columns:
    print(f"The following required columns are missing: {missing_columns}") #raise ValueError(
else:
    print("All required columns are present. Proceeding with loading the data.")

df_bone_dxa = pd.read_csv(file_path, usecols=lambda col: (col == 'eid' or any(col.startswith(prefix) for prefix in bone_dxa_columns[1:])))
df_bone_dxa.to_csv(os.path.join(data_path, 'bone_dxa_manual_upload_vars.csv'), index=False)

The following required columns are missing: ['20310', '20317', '20316', '20318', '20315', '20319', '20300', '20309', '20301', '20302', '20303', '20304', '20305', '20306', '20307', '20308', '20313', '20311', '20312', '20320', '20321', '23207', '23294', '23303', '23211']


Manually upload variables that were not found by ukb utils

In [None]:
# Manually upload variables that were not found by ukb utils
bone_dxa_man = pd.read_csv(os.path.join(data_path, 'bone_dxa_manual_upload_vars.csv'))
bone_dxa_ukbb = pd.read_csv(os.path.join(data_path, 'bone_dxa_ukbb_raw.csv'))
bone_dxa = bone_dxa_man.merge(bone_dxa_ukbb.loc[:, ~bone_dxa_ukbb.columns.isin(bone_dxa_man.columns.difference(['eid']))], on='eid')
#bone_dxa = pd.merge(bone_dxa_man, bone_dxa_ukbb, on='eid')
print(bone_dxa.shape)
bone_dxa.columns.to_list()

(502356, 188)


['eid',
 '21005-2.0',
 '21136-2.0',
 '21136-3.0',
 '21137-2.0',
 '21137-3.0',
 '21138-2.0',
 '21138-3.0',
 '23200-2.0',
 '23200-3.0',
 '23201-2.0',
 '23201-3.0',
 '23202-2.0',
 '23202-3.0',
 '23203-2.0',
 '23203-3.0',
 '23204-2.0',
 '23204-3.0',
 '23205-2.0',
 '23205-3.0',
 '23206-2.0',
 '23206-3.0',
 '23208-2.0',
 '23208-3.0',
 '23209-2.0',
 '23209-3.0',
 '23210-2.0',
 '23210-3.0',
 '23212-2.0',
 '23212-3.0',
 '23213-2.0',
 '23213-3.0',
 '23214-2.0',
 '23214-3.0',
 '23215-2.0',
 '23215-3.0',
 '23216-2.0',
 '23216-3.0',
 '23217-2.0',
 '23217-3.0',
 '23218-2.0',
 '23218-3.0',
 '23219-2.0',
 '23219-3.0',
 '23220-2.0',
 '23220-3.0',
 '23221-2.0',
 '23221-3.0',
 '23222-2.0',
 '23222-3.0',
 '23223-2.0',
 '23223-3.0',
 '23224-2.0',
 '23224-3.0',
 '23225-2.0',
 '23225-3.0',
 '23226-2.0',
 '23226-3.0',
 '23227-2.0',
 '23227-3.0',
 '23228-2.0',
 '23228-3.0',
 '23229-2.0',
 '23229-3.0',
 '23230-2.0',
 '23230-3.0',
 '23231-2.0',
 '23231-3.0',
 '23232-2.0',
 '23232-3.0',
 '23233-2.0',
 '23233-3.0'

In [None]:
# Filter Instance 2
bone_dxa_i2 = bone_dxa[['eid'] + bone_dxa.filter(regex=r'2\.\d$').columns.tolist()]
bone_dxa_i2.columns = bone_dxa_i2.columns.str.replace('-2.0', '')
print('SHAPE after removing NA:', bone_dxa_i2.dropna(axis=0).reset_index(drop=True).shape)
print('% missing')
with pd.option_context('display.max_rows', None):
    display(((bone_dxa_i2.isna().sum() / len(bone_dxa_i2)).round(2)*100).sort_values(ascending=False))
bone_dxa_i2.to_csv(os.path.join(data_path, 'bone_dxa_i2_raw.csv'), index=False)

SHAPE after removing NA: (122, 95)
% missing


The default value of regex will change from True to False in a future version.


23292    98.0
23217    98.0
23302    98.0
23216    98.0
23206    98.0
23296    98.0
21136    97.0
21137    97.0
21138    97.0
23229    95.0
23220    95.0
23242    95.0
23243    95.0
23223    95.0
23227    95.0
23237    95.0
23228    95.0
23238    95.0
23222    95.0
23320    95.0
23316    95.0
23315    95.0
23314    95.0
23313    95.0
23221    95.0
21005    94.0
23336    91.0
23331    91.0
23332    91.0
23333    91.0
23330    91.0
23329    91.0
23328    91.0
23326    91.0
23334    91.0
23335    91.0
23344    91.0
23210    91.0
23218    91.0
23204    91.0
23208    91.0
23209    91.0
23338    91.0
23212    91.0
23213    91.0
23214    91.0
23215    91.0
23219    91.0
23203    91.0
23342    91.0
23202    91.0
23201    91.0
23340    91.0
23200    91.0
23205    91.0
23318    90.0
23317    90.0
23337    90.0
23325    90.0
23327    90.0
23343    90.0
23341    90.0
23311    90.0
23339    90.0
23312    90.0
23290    90.0
23310    90.0
23241    90.0
23224    90.0
23225    90.0
23226    90.0
23230 

In [None]:
# Remove columns with >90% missing
columns_to_remove_bone_dxa = [
'23292', #98.0
'23217', #98.0
'23302', #98.0
'23216', #98.0
'23206', #98.0
'23296', #98.0
'21136', #97.0
'21137', #97.0
'21138', #97.0
'23229', #95.0
'23220', #95.0
'23242', #95.0
'23243', #95.0
'23223', #95.0
'23227', #95.0
'23237', #95.0
'23228', #95.0
'23238', #95.0
'23222', #95.0
'23320', #95.0
'23316', #95.0
'23315', #95.0
'23314', #95.0
'23313', #95.0
'23221', #95.0
'21005', #94.0
]
bone_dxa_i2_nona = bone_dxa_i2.drop(columns=columns_to_remove_bone_dxa).dropna().reset_index(drop=True)
print(bone_dxa_i2_nona.shape)
bone_dxa_i2_nona.columns.to_list()

(45093, 69)


['eid',
 '23200',
 '23201',
 '23202',
 '23203',
 '23204',
 '23205',
 '23208',
 '23209',
 '23210',
 '23212',
 '23213',
 '23214',
 '23215',
 '23218',
 '23219',
 '23224',
 '23225',
 '23226',
 '23230',
 '23231',
 '23232',
 '23233',
 '23234',
 '23235',
 '23236',
 '23239',
 '23240',
 '23241',
 '23290',
 '23291',
 '23293',
 '23295',
 '23297',
 '23298',
 '23299',
 '23300',
 '23301',
 '23304',
 '23305',
 '23306',
 '23307',
 '23308',
 '23309',
 '23310',
 '23311',
 '23312',
 '23317',
 '23318',
 '23325',
 '23326',
 '23327',
 '23328',
 '23329',
 '23330',
 '23331',
 '23332',
 '23333',
 '23334',
 '23335',
 '23336',
 '23337',
 '23338',
 '23339',
 '23340',
 '23341',
 '23342',
 '23343',
 '23344']

In [None]:
# Original columns
'20310', #Alpha angle
'21137', #B2-B4 BMC (bone mineral content)
'21138', #B2-B4 BMD (bone mineral density)
'21136', #B2-B4 area
'20317', #Grading of a superior femoral osteophyte
'20316', #Grading of an acetabular osteophyte
'20318', #Grading of an inferior femoral osteophyte
'20315', #Grading of any osteophyte at any site
'20319', #Grading of osteophytes at all three site
'20300', #Hip shape mode score 1
'20309', #Hip shape mode score 10
'20301', #Hip shape mode score 2
'20302', #Hip shape mode score 3
'20303', #Hip shape mode score 4
'20304', #Hip shape mode score 5
'20305', #Hip shape mode score 6
'20306', #Hip shape mode score 7
'20307', #Hip shape mode score 8
'20308', #Hip shape mode score 9
'20313', #Index mJSW
'20311', #Lateral centre-edge angle
'20312', #Minimum joint space width (mJSW)
'20320', #Osteophyte grading at any site
'20321', #Radiographic osteoarthritis grading
'23305', #Head bone area
'23220', #Arm BMC (bone mineral content) (left)
'23222', #Arm BMC (bone mineral content) (right)
'23317', #Arms combined bone area
'23221', #Arm BMD (bone mineral density) (left)
'23223', #Arm BMD (bone mineral density) (right)
'23313', #Arm bone area (left)
'23314', #Arm bone area (right)
'23309', #Ribs bone area
'23224', #Arms BMC (bone mineral content)
'23225', #Arms BMD (bone mineral density)
'23311', #Spine bone area
'23302', #Femur lower neck BMD (bone mineral density) (left)
'23206', #Femur lower neck BMD (bone mineral density) (right)
'23304', #Trunk bone area
'23307', #Pelvis bone area
'23318', #Legs combined bone area
'23315', #Leg bone area (left)
'23207', #Femur lower neck BMD (bone mineral density) T-score (right)
'23294', #Femur lower neck BMD (bone mineral density) T-score (left)
'23299', #Femur neck BMD (bone mineral density) (left)
'23208', #Femur neck BMD (bone mineral density) (right)
'23316', #Leg bone area (right)
'23327', #Femur neck BMC (bone mineral content) (left)
'23328', #Femur neck BMC (bone mineral content) (right)
'23325', #Femur neck bone area (left)
'23326', #Femur neck bone area (right)
'23331', #Femur shaft BMC (bone mineral content) (left)
'23332', #Femur shaft BMC (bone mineral content) (right)
'23329', #Femur shaft bone area (left)
'23330', #Femur shaft bone area (right)
'23335', #Femur total BMC (bone mineral content) (left)
'23336', #Femur total BMC (bone mineral content) (right)
'23333', #Femur total area (left)
'23334', #Femur total area (right)
'23339', #Femur troch BMC (bone mineral content) (left)
'23340', #Femur troch BMC (bone mineral content) (right)
'23337', #Femur troch bone area (left)
'23338', #Femur troch bone area (right)
'23343', #Femur wards BMC (bone mineral content) (left)
'23344', #Femur wards BMC (bone mineral content) (right)
'23341', #Femur wards bone area (left)
'23342', #Femur wards bone area (right)
'23300', #Femur neck BMD (bone mineral density) T-score (left)
'23209', #Femur neck BMD (bone mineral density) T-score (right)
'23290', #Femur shaft BMD (bone mineral density) (left)
'23210', #Femur shaft BMD (bone mineral density) (right)
'23303', #Femur shaft BMD (bone mineral density) T-score (left)
'23211', #Femur shaft BMD (bone mineral density) T-score (right)
'23291', #Femur total BMD (bone mineral density) (left)
'23212', #Femur total BMD (bone mineral density) (right)
'23293', #Femur total BMD (bone mineral density) T-score (left)
'23213', #Femur total BMD (bone mineral density) T-score (right)
'23295', #Femur troch BMD (bone mineral density) (left)
'23214', #Femur troch BMD (bone mineral density) (right)
'23298', #Femur troch BMD (bone mineral density) T-score (left)
'23215', #Femur troch BMD (bone mineral density) T-score (right)
'23200', #L1-L4 area
'23201', #L1-L4 average height
'23202', #L1-L4 average width
'21005', #L1-L4 TBS (trabecular bone score)
'23292', #Femur upper neck BMD (bone mineral density) (left)
'23216', #Femur upper neck BMD (bone mineral density) (right)
'23296', #Femur upper neck BMD (bone mineral density) T-score (left)
'23217', #Femur upper neck BMD (bone mineral density) T-score (right)
'23297', #Femur wards BMD (bone mineral density) (left)
'23218', #Femur wards BMD (bone mineral density) (right)
'23301', #Femur wards BMD (bone mineral density) T-score (left)
'23219', #Femur wards BMD (bone mineral density) T-score (right)
'23306', #Head BMC (bone mineral content)
'23226', #Head BMD (bone mineral density)
'23203', #L1-L4 BMC (bone mineral content)
'23204', #L1-L4 BMD (bone mineral density)
'23205', #L1-L4 BMD (bone mineral density) T-score
'23320', #Leg BMC (bone mineral content) (left)
'23228', #Leg BMC (bone mineral content) (right)
'23227', #Leg BMD (bone mineral density) (left)
'23229', #Leg BMD (bone mineral density) (right)
'23230', #Legs BMC (bone mineral content)
'23231', #Legs BMD (bone mineral density)
'23308', #Pelvis BMC (bone mineral content)
'23232', #Pelvis BMD (bone mineral density)
'23310', #Ribs BMC (bone mineral content)
'23233', #Ribs BMD (bone mineral density)
'23312', #Spine BMC (bone mineral content)
'23234', #Spine BMD (bone mineral density)
'23235', #Total BMC (bone mineral content)
'23236', #Total BMD (bone mineral density)
'23237', #Total BMD (bone mineral density) (left)
'23238', #Total BMD (bone mineral density) (right)
'23239', #Total BMD (bone mineral density) T-score
'23240', #Trunk BMC (bone mineral content)
'23241', #Trunk BMD (bone mineral density)
'23242', #Trunk BMD (bone mineral density) (left)
'23243' #Trunk BMD (bone mineral density) (right)

In [None]:
# Rename remaining columns
renaming_dict_bone_dxa = {
    '23200': 'L1-L4 area',
    '23201': 'L1-L4 average height',
    '23202': 'L1-L4 average width',
    '23203': 'L1-L4 BMC (bone mineral content)',
    '23204': 'L1-L4 BMD (bone mineral density)',
    '23205': 'L1-L4 BMD (bone mineral density) T-score',
    '23208': 'Femur neck BMD (bone mineral density) (right)',
    '23209': 'Femur neck BMD (bone mineral density) T-score (right)',
    '23210': 'Femur shaft BMD (bone mineral density) (right)',
    '23212': 'Femur total BMD (bone mineral density) (right)',
    '23213': 'Femur total BMD (bone mineral density) T-score (right)',
    '23214': 'Femur troch BMD (bone mineral density) (right)',
    '23215': 'Femur troch BMD (bone mineral density) T-score (right)',
    '23218': 'Femur wards BMD (bone mineral density) (right)',
    '23219': 'Femur wards BMD (bone mineral density) T-score (right)',
    '23224': 'Arms BMC (bone mineral content)',
    '23225': 'Arms BMD (bone mineral density)',
    '23226': 'Head BMD (bone mineral density)',
    '23230': 'Legs BMC (bone mineral content)',
    '23231': 'Legs BMD (bone mineral density)',
    '23232': 'Pelvis BMD (bone mineral density)',
    '23233': 'Ribs BMD (bone mineral density)',
    '23234': 'Spine BMD (bone mineral density)',
    '23235': 'Total BMC (bone mineral content)',
    '23236': 'Total BMD (bone mineral density)',
    '23239': 'Total BMD (bone mineral density) T-score',
    '23240': 'Trunk BMC (bone mineral content)',
    '23241': 'Trunk BMD (bone mineral density)',
    '23290': 'Femur shaft BMD (bone mineral density) (left)',
    '23291': 'Femur total BMD (bone mineral density) (left)',
    '23293': 'Femur total BMD (bone mineral density) T-score (left)',
    '23295': 'Femur troch BMD (bone mineral density) (left)',
    '23297': 'Femur wards BMD (bone mineral density) (left)',
    '23298': 'Femur troch BMD (bone mineral density) T-score (left)',
    '23299': 'Femur neck BMD (bone mineral density) (left)',
    '23300': 'Femur neck BMD (bone mineral density) T-score (left)',
    '23301': 'Femur wards BMD (bone mineral density) T-score (left)',
    '23304': 'Trunk bone area',
    '23305': 'Head bone area',
    '23306': 'Head BMC (bone mineral content)',
    '23307': 'Pelvis bone area',
    '23308': 'Pelvis BMC (bone mineral content)',
    '23309': 'Ribs bone area',
    '23310': 'Ribs BMC (bone mineral content)',
    '23311': 'Spine bone area',
    '23312': 'Spine BMC (bone mineral content)',
    '23317': 'Arms combined bone area',
    '23318': 'Legs combined bone area',
    '23325': 'Femur neck bone area (left)',
    '23326': 'Femur neck bone area (right)',
    '23327': 'Femur neck BMC (bone mineral content) (left)',
    '23328': 'Femur neck BMC (bone mineral content) (right)',
    '23329': 'Femur shaft bone area (left)',
    '23330': 'Femur shaft bone area (right)',
    '23331': 'Femur shaft BMC (bone mineral content) (left)',
    '23332': 'Femur shaft BMC (bone mineral content) (right)',
    '23333': 'Femur total area (left)',
    '23334': 'Femur total area (right)',
    '23335': 'Femur total BMC (bone mineral content) (left)',
    '23336': 'Femur total BMC (bone mineral content) (right)',
    '23337': 'Femur troch bone area (left)',
    '23338': 'Femur troch bone area (right)',
    '23339': 'Femur troch BMC (bone mineral content) (left)',
    '23340': 'Femur troch BMC (bone mineral content) (right)',
    '23341': 'Femur wards bone area (left)',
    '23342': 'Femur wards bone area (right)',
    '23343': 'Femur wards BMC (bone mineral content) (left)',
    '23344': 'Femur wards BMC (bone mineral content) (right)'
}
bone_dxa_i2_fin = bone_dxa_i2_nona.rename(columns=renaming_dict_bone_dxa)
bone_dxa_i2_fin.to_csv(os.path.join(data_path, 'bone_dxa.csv'), index=False)

In [None]:
# Count negative values, min, and max
print('MIN\n', bone_dxa_i2_fin.min().round(2))
print('MAX\n', bone_dxa_i2_fin.max().round(2))
print('NEG\n', (bone_dxa_i2_fin < 0).sum().sort_values(ascending=False))

MIN
 eid                                               1000046.00
L1-L4 area                                             33.92
L1-L4 average height                                    8.85
L1-L4 average width                                     2.75
L1-L4 BMC (bone mineral content)                        8.34
                                                     ...    
Femur troch BMC (bone mineral content) (right)          0.98
Femur wards bone area (left)                            0.04
Femur wards bone area (right)                           0.68
Femur wards BMC (bone mineral content) (left)           0.03
Femur wards BMC (bone mineral content) (right)          0.43
Length: 69, dtype: float64
MAX
 eid                                               6024614.00
L1-L4 area                                            138.44
L1-L4 average height                                   17.22
L1-L4 average width                                    11.68
L1-L4 BMC (bone mineral content)                

Match features to the target and compute sample sizes

In [None]:
# Match features to the target and compute sample sizes
folds = range(0,5)
modalities = ['bone_dxa']

modality_observations = {modality: {'train': 0, 'test': 0} for modality in modalities}

for modality in modalities:
    for fold in folds:
        base_path = f'/UK_BB/brainbody/body'
        folds_path = os.path.join(base_path, f'folds/fold_{fold}')
        suppl_path = os.path.join(folds_path, 'suppl')
        scaling_path = os.path.join(folds_path, 'scaling')
        models_path = os.path.join(folds_path, 'models')
        g_pred_path = os.path.join(folds_path, 'g_pred')
        
        os.makedirs(folds_path, exist_ok=True)
        os.makedirs(suppl_path, exist_ok=True)
        os.makedirs(scaling_path, exist_ok=True)
        os.makedirs(models_path, exist_ok=True)
        os.makedirs(g_pred_path, exist_ok=True)

        g_train = pd.read_csv(f'/UK_BB/brainbody/cognition/folds/fold_{fold}/g/g_train_with_id_{fold}.csv')
        g_test = pd.read_csv(f'/UK_BB/brainbody/cognition/folds/fold_{fold}/g/g_test_with_id_{fold}.csv')
        features = pd.read_csv(os.path.join(base_path, f'data/{modality}.csv'))
        print('Features shape', features.shape)

        feature_columns = features.drop(columns='eid').columns

        # Merge features with targets
        train_merge_all = pd.merge(features, g_train, on='eid').reset_index(drop=True)
        test_merge_all = pd.merge(features, g_test, on='eid').reset_index(drop=True)

        # Save merged data
        train_merge_all.to_csv(os.path.join(suppl_path, f'{modality}_train_feat_targ_fold_{fold}.csv'), index=False)
        test_merge_all.to_csv(os.path.join(suppl_path, f'{modality}_test_feat_targ_fold_{fold}.csv'), index=False)

        print(f'==== Train shape ====\n {modality} - Fold {fold}', train_merge_all.shape)
        print(f'==== Test shape ====\n {modality} - Fold {fold}', test_merge_all.shape)

        # Update the number of observations for the current modality
        modality_observations[modality]['train'] += train_merge_all.shape[0]
        modality_observations[modality]['test'] += test_merge_all.shape[0]

        # Extract features and save
        train_merge_all[feature_columns].to_csv(os.path.join(suppl_path, f'{modality}_train_fold_{fold}.csv'), index=False)
        test_merge_all[feature_columns].to_csv(os.path.join(suppl_path, f'{modality}_test_fold_{fold}.csv'), index=False)

        # Scale features
        print('Scaling')
        scaler_features = StandardScaler()
        features_train_scaled = scaler_features.fit_transform(train_merge_all[feature_columns])
        features_test_scaled = scaler_features.transform(test_merge_all[feature_columns])
        pd.DataFrame(features_train_scaled, columns=feature_columns).to_csv(os.path.join(scaling_path, f'{modality}_train_scaled_fold_{fold}.csv'), index=False)
        pd.DataFrame(features_test_scaled, columns=feature_columns).to_csv(os.path.join(scaling_path, f'{modality}_test_scaled_fold_{fold}.csv'), index=False)

        # Save scaler
        with open(os.path.join(scaling_path, f'{modality}_scaler_features_fold_{fold}.pkl'), "wb") as f:
            pickle.dump(scaler_features, f)

best_modality = max(modality_observations, key=lambda x: modality_observations[x]['train'] + modality_observations[x]['test'])
total_observations = modality_observations[best_modality]['train'] + modality_observations[best_modality]['test']

print('======================================================================================================')
print(f'The modality with the highest number of observations is: {best_modality}: n = {total_observations}')
print(f'Observations in each modality train/test sets:\n', modality_observations)

Features shape (45093, 69)
==== Train shape ====
 bone_dxa - Fold 0 (20703, 70)
==== Test shape ====
 bone_dxa - Fold 0 (5172, 70)
Scaling
Features shape (45093, 69)
==== Train shape ====
 bone_dxa - Fold 1 (20701, 70)
==== Test shape ====
 bone_dxa - Fold 1 (5174, 70)
Scaling
Features shape (45093, 69)
==== Train shape ====
 bone_dxa - Fold 2 (20698, 70)
==== Test shape ====
 bone_dxa - Fold 2 (5177, 70)
Scaling
Features shape (45093, 69)
==== Train shape ====
 bone_dxa - Fold 3 (20669, 70)
==== Test shape ====
 bone_dxa - Fold 3 (5206, 70)
Scaling
Features shape (45093, 69)
==== Train shape ====
 bone_dxa - Fold 4 (20729, 70)
==== Test shape ====
 bone_dxa - Fold 4 (5146, 70)
Scaling
The modality with the highest number of observations is: bone_dxa: n = 129375
Observations in each modality train/test sets:
 {'bone_dxa': {'train': 103500, 'test': 25875}}


# Physiology

### Immune system

In [None]:
# Immune system
df_immune = ukbiobank.utils.utils.loadCsv(ukbio=ukb, fields=['eid',
30000,	#White blood cell (leukocyte) count
30010,	#Red blood cell (erythrocyte) count
30020,	#Haemoglobin concentration
30080,	#Platelet count
30710,	#C-reactive protein
30030,	#Haematocrit percentage
30040,	#Mean corpuscular volume
30050,	#Mean corpuscular haemoglobin
30060,	#Mean corpuscular haemoglobin concentration
30070,	#Red blood cell (erythrocyte) distribution width
30090,	#Platelet crit
30100,	#Mean platelet (thrombocyte) volume
30110,	#Platelet distribution width
30120,	#Lymphocyte count
30130,	#Monocyte count
30140,	#Neutrophill count
30150,	#Eosinophill count
30160,	#Basophill count
30170,	#Nucleated red blood cell count
30180,	#Lymphocyte percentage
30190,	#Monocyte percentage
30200,	#Neutrophill percentage
30210,	#Eosinophill percentage
30220,	#Basophill percentage
30230,	#Nucleated red blood cell percentage
30240,	#Reticulocyte percentage
30250,	#Reticulocyte count
30260,	#Mean reticulocyte volume
30270,	#Mean sphered cell volume
30280,	#Immature reticulocyte fraction
30290,	#High light scatter reticulocyte percentage
30300,	#High light scatter reticulocyte count
])
df_immune_names = ukbiobank.utils.utils.fieldIdsToNames(ukbio=ukb, df=df_immune)
df_immune_names.to_csv('/UK_BB/brainbody/body/data/immune_vars.csv', index=False)

### Cardiopulmonary, renal, hepatic, and metabolic systems

In [None]:
# Cardiac, pulmonary, renal, hepatic, and metabolic systems
df_physio = ukbiobank.utils.utils.loadCsv(ukbio=ukb, fields=['eid', 
21003,  #age                                                         
102, 	#Pulse rate, automated reading
4079,	#Diastolic blood pressure, automated reading
4080,	#Systolic blood pressure, automated reading
3062,	#Forced vital capacity (FVC)
3063,	#Forced expiratory volume in 1-second (FEV1)
3064,	#Peak expiratory flow (PEF)
30510,	#Creatinine (enzymatic) in urine
30520,	#Potassium in urine
30530,	#Sodium in urine
30670,	#Urea
30700,	#Creatinine
30720,	#Cystatin C
30880,	#Urate
30500,	#Microalbumin in urine
30600,	#Albumin
30860,	#Total protein
30620,	#Alanine aminotransferase (ALT)
30650,	#Aspartate aminotransferase (AST)
30660,	#Direct bilirubin
30730,	#Gamma glutamyltransferase
30840,	#Total bilirubin
30610,	#Alkaline phosphatase
30810,	#Phosphate
30630,	#Apolipoprotein A
30640,	#Apolipoprotein B
30690,	#Cholesterol
30740,	#Glucose
30750,	#Glycated haemoglobin (HbA1c)
30760,	#HDL cholesterol
30780,	#LDL direct
30790,	#Lipoprotein A
30870,	#Triglycerides
30770,	#IGF-1
30850,	#Testosterone
30830,	#SHBG
30820,	#Rheumatoid factor
30800,	#Oestradiol
30680, #Calcium
30890, #Vitamin D
])
df_physio_names = ukbiobank.utils.utils.fieldIdsToNames(ukbio=ukb, df=df_physio)
df_physio_names.to_csv('/UK_BB/brainbody/body/data/physio_vars.csv', index=False)

### Musculoskeletal system

In [None]:
# Musculoskeletal system
df_musculoskeletal = ukbiobank.utils.utils.loadCsv(ukbio=ukb, fields=['eid', 
46, #Handgrip strength (left)
47, #Handgrip strength (right)
48, #Waist circumference
49, #Hip circumference
50, #Standing height
21001, #Body mass index (BMI)
21002, #Weight
4105, #Heel bone mineral density (BMD) (left)
4124, #Heel bone mineral density (BMD) (right)
4100, #Ankle spacing width (left)
4119, #Ankle spacing width (right)
30610, #Alkaline phosphatase
30810, #Phosphate
12144, #Height
50, #Standing height
51, #Seated height
20015, #Sitting height
3077, #Seating box height
3146, #Speed of sound through heel
3143, #Ankle spacing width
3144, #Heel Broadband ultrasound attenuation, direct entry
3147, #Heel quantitative ultrasound index (QUI), direct entry
3148, #Heel bone mineral density (BMD)
78, #Heel bone mineral density (BMD) T-score, automated
30038 #V02max per kg bodyweight estimated from the exercise test
])
df_musculoskeletal_names = ukbiobank.utils.utils.fieldIdsToNames(ukbio=ukb, df=df_musculoskeletal)
df_musculoskeletal_names.to_csv('/UK_BB/brainbody/body/data/musculoskeletal_vars.csv', index=False)

### Cardiovascular: MRI, ECG, and ultrasound

In [None]:
# Cardiovascular: MRI, ECG, and ultrasound
df_cardiovascular = ukbiobank.utils.utils.loadCsv(ukbio=ukb, fields=['eid', 
#Carotid ultrasound
22672, #Maximum carotid IMT (intima-medial thickness) at 120 degrees
22675, #Maximum carotid IMT (intima-medial thickness) at 150 degrees
22678, #Maximum carotid IMT (intima-medial thickness) at 210 degrees
22681, #Maximum carotid IMT (intima-medial thickness) at 240 degrees
22671, #Mean carotid IMT (intima-medial thickness) at 120 degrees
22674, #Mean carotid IMT (intima-medial thickness) at 150 degrees
22677, #Mean carotid IMT (intima-medial thickness) at 210 degrees
22680, #Mean carotid IMT (intima-medial thickness) at 240 degrees
22670, #Minimum carotid IMT (intima-medial thickness) at 120 degrees
22673, #Minimum carotid IMT (intima-medial thickness) at 150 degrees
22676, #Minimum carotid IMT (intima-medial thickness) at 210 degrees
22679, #Minimum carotid IMT (intima-medial thickness) at 240 degrees
#Arterial stiffness
4194, #Pulse rate
21021, #Pulse wave Arterial Stiffness index
4196, #Pulse wave peak to peak time
4205, #Pulse wave pressure versus time response curve
4195, #Pulse wave reflection index
# Heart MRI (Pulse wave analysis)
12681, #Augmentation index for PWA - PVR
12702, #Cardiac index during PWA - PVR
12682, #Cardiac output during PWA - PVR
12680, #Central augmentation pressure during PWA - PVR
12678, #Central pulse pressure during PWA - PVR
12677, #Central systolic blood pressure during PWA - PVR
12698, #Diastolic brachial blood pressure - OSC
12675, #Diastolic brachial blood pressure during PWA - PVR
12683, #End systolic pressure during PWA - PVR
12684, #End systolic pressure index during PWA - PVR
12673, #Heart rate during PWA - PVR
12687, #Mean arterial pressure during PWA - PVR
12679, #Number of beats in waveform average for PWA - PVR
12676, #Peripheral pulse pressure during PWA - PVR
12686, #Stroke volume during PWA - PVR
12697, #Systolic brachial blood pressure - OSC
12674, #Systolic brachial blood pressure during PWA - PVR
12685, #Total peripheral resistance during PWA - PVR
#Heart MRI  (Left ventricular size and function)
22426, #Average heart rate
22427, #Body surface area
22425, #Cardiac index
22424, #Cardiac output
22420, #LV ejection fraction
22421, #LV end diastolic volume
22422, #LV end systolic volume
22423, #LV stroke volume
#ECG at rest, 12-lead (Physical measures)
12336, #Ventricular rate
12338, #P duration
22334, #PP interval
22330, #PQ interval
22338, #QRS num
12340, #QRS duration
22331, #QT interval
22332, #QTC interval
22333, #RR interval
#ECG during exercise  (Physical measures)
6014,	#Doctor restricts physical activity due to heart condition
6015,	#Chest pain felt during physical activity
6016,	#Chest pain felt outside physical activity
6017,	#Able to walk or cycle unaided for 10 minutes
5985,	#Bicycle speed
5984,	#ECG, load
5983,	#ECG, heart rate
6033,	#Maximum heart rate during fitness test
6032,	#Maximum workload during fitness test
6034	#Target heart rate achieved
])
df_cardiovascular_names = ukbiobank.utils.utils.fieldIdsToNames(ukbio=ukb, df=df_cardiovascular)
df_cardiovascular_names.to_csv('/UK_BB/brainbody/body/data/cardiovascular_vars.csv', index=False)

## Explore and prepare the data

### Immune

Instances 0-2

In immune system, grab Instance 0 because Instance 2 has only 5800 observation

In [None]:
# Immune system
immune = pd.read_csv('/UK_BB/brainbody/body/data/immune_vars.csv')
immune_i0 = immune[['eid'] + immune.filter(regex=r'0\.\d$').columns.tolist()]
immune_i0.columns = immune_i0.columns.str.replace('-0.0', '')
immune_i0.isna().sum().sort_values(ascending=False)

C-reactive protein                                 33925
Immature reticulocyte fraction                     32639
Mean reticulocyte volume                           32639
High light scatter reticulocyte count              32638
Reticulocyte count                                 32638
Reticulocyte percentage                            32638
High light scatter reticulocyte percentage         32637
Mean sphered cell volume                           32637
Nucleated red blood cell percentage                25229
Nucleated red blood cell count                     25225
Monocyte count                                     25214
Basophill count                                    25214
Neutrophill count                                  25214
Eosinophill count                                  25214
Lymphocyte count                                   25214
Lymphocyte percentage                              25208
Eosinophill percentage                             25208
Neutrophill percentage         

In [None]:
# Drop NA & save
immune_i0_nona = immune_i0.dropna(axis=0)
immune_i0_nona.to_csv('/UK_BB/brainbody/body/data/immune_vars_i0_nona.csv', index=False)
immune_i0_nona.to_csv('/UK_BB/brainbody/body/data/immune.csv', index=False)

#Display max, min, and negative values
print('MIN\n', immune_i0_nona.min())
print('MAX\n', immune_i0_nona.max())
print('NEG\n', (immune_i0_nona < 0).sum().sort_values(ascending=False)) #immune_i0_nona.iloc[:,1:] < 0).sum().sort_values()
print('NA\n', (immune_i0_nona.isna()).sum().sort_values(ascending=False))

MIN eid                                                1000019.000
White blood cell (leukocyte) count                       0.040
Red blood cell (erythrocyte) count                       0.259
Haemoglobin concentration                                0.970
Haematocrit percentage                                   2.420
Mean corpuscular volume                                 52.100
Mean corpuscular haemoglobin                            14.200
Mean corpuscular haemoglobin concentration              16.100
Red blood cell (erythrocyte) distribution width         10.780
Platelet count                                           3.000
Platelet crit                                            0.002
Mean platelet (thrombocyte) volume                       5.730
Platelet distribution width                             13.270
Lymphocyte count                                         0.000
Monocyte count                                           0.000
Neutrophill count                                  

### General physiology (blood biochemistry)

In [None]:
# General physiology
physio = pd.read_csv('/UK_BB/brainbody/body/data/physio_vars.csv')
print(physio.shape)
physio.columns.to_list()

(502356, 131)


['eid',
 'Pulse rate, automated reading-0.0',
 'Pulse rate, automated reading-0.1',
 'Pulse rate, automated reading-1.0',
 'Pulse rate, automated reading-1.1',
 'Pulse rate, automated reading-2.0',
 'Pulse rate, automated reading-2.1',
 'Pulse rate, automated reading-3.0',
 'Pulse rate, automated reading-3.1',
 'Forced vital capacity (FVC)-0.0',
 'Forced vital capacity (FVC)-0.1',
 'Forced vital capacity (FVC)-0.2',
 'Forced vital capacity (FVC)-1.0',
 'Forced vital capacity (FVC)-1.1',
 'Forced vital capacity (FVC)-1.2',
 'Forced vital capacity (FVC)-2.0',
 'Forced vital capacity (FVC)-2.1',
 'Forced vital capacity (FVC)-2.2',
 'Forced vital capacity (FVC)-3.0',
 'Forced vital capacity (FVC)-3.1',
 'Forced vital capacity (FVC)-3.2',
 'Forced expiratory volume in 1-second (FEV1)-0.0',
 'Forced expiratory volume in 1-second (FEV1)-0.1',
 'Forced expiratory volume in 1-second (FEV1)-0.2',
 'Forced expiratory volume in 1-second (FEV1)-1.0',
 'Forced expiratory volume in 1-second (FEV1)-1.

In [None]:
# Split physiology variables into systems
physio_cardiopulmonary = physio[['eid',
'Pulse rate, automated reading-0.0',
'Pulse rate, automated reading-0.1',
'Pulse rate, automated reading-1.0',
'Pulse rate, automated reading-1.1',
'Pulse rate, automated reading-2.0',
'Pulse rate, automated reading-2.1',
'Pulse rate, automated reading-3.0',
'Pulse rate, automated reading-3.1',
'Forced vital capacity (FVC)-0.0',
'Forced vital capacity (FVC)-0.1',
'Forced vital capacity (FVC)-0.2',
'Forced vital capacity (FVC)-1.0',
'Forced vital capacity (FVC)-1.1',
'Forced vital capacity (FVC)-1.2',
'Forced vital capacity (FVC)-2.0',
'Forced vital capacity (FVC)-2.1',
'Forced vital capacity (FVC)-2.2',
'Forced vital capacity (FVC)-3.0',
'Forced vital capacity (FVC)-3.1',
'Forced vital capacity (FVC)-3.2',
'Forced expiratory volume in 1-second (FEV1)-0.0',
'Forced expiratory volume in 1-second (FEV1)-0.1',
'Forced expiratory volume in 1-second (FEV1)-0.2',
'Forced expiratory volume in 1-second (FEV1)-1.0',
'Forced expiratory volume in 1-second (FEV1)-1.1',
'Forced expiratory volume in 1-second (FEV1)-1.2',
'Forced expiratory volume in 1-second (FEV1)-2.0',
'Forced expiratory volume in 1-second (FEV1)-2.1',
'Forced expiratory volume in 1-second (FEV1)-2.2',
'Forced expiratory volume in 1-second (FEV1)-3.0',
'Forced expiratory volume in 1-second (FEV1)-3.1',
'Forced expiratory volume in 1-second (FEV1)-3.2',
'Peak expiratory flow (PEF)-0.0',
'Peak expiratory flow (PEF)-0.1',
'Peak expiratory flow (PEF)-0.2',
'Peak expiratory flow (PEF)-1.0',
'Peak expiratory flow (PEF)-1.1',
'Peak expiratory flow (PEF)-1.2',
'Peak expiratory flow (PEF)-2.0',
'Peak expiratory flow (PEF)-2.1',
'Peak expiratory flow (PEF)-2.2',
'Peak expiratory flow (PEF)-3.0',
'Peak expiratory flow (PEF)-3.1',
'Peak expiratory flow (PEF)-3.2',
'Diastolic blood pressure, automated reading-0.0',
'Diastolic blood pressure, automated reading-0.1',
'Diastolic blood pressure, automated reading-1.0',
'Diastolic blood pressure, automated reading-1.1',
'Diastolic blood pressure, automated reading-2.0',
'Diastolic blood pressure, automated reading-2.1',
'Diastolic blood pressure, automated reading-3.0',
'Diastolic blood pressure, automated reading-3.1',
'Systolic blood pressure, automated reading-0.0',
'Systolic blood pressure, automated reading-0.1',
'Systolic blood pressure, automated reading-1.0',
'Systolic blood pressure, automated reading-1.1',
'Systolic blood pressure, automated reading-2.0',
'Systolic blood pressure, automated reading-2.1',
'Systolic blood pressure, automated reading-3.0',
'Systolic blood pressure, automated reading-3.1']]
physio_renal_hepatic = physio[['eid', 
'Microalbumin in urine-0.0',
'Microalbumin in urine-1.0',
'Creatinine (enzymatic) in urine-0.0',
'Creatinine (enzymatic) in urine-1.0',
'Potassium in urine-0.0',
'Potassium in urine-1.0',
'Sodium in urine-0.0',
'Sodium in urine-1.0',
'Albumin-0.0',
'Albumin-1.0',
'Alkaline phosphatase-0.0',
'Alkaline phosphatase-1.0',
'Alanine aminotransferase-0.0',
'Alanine aminotransferase-1.0',
'Aspartate aminotransferase-0.0',
'Aspartate aminotransferase-1.0',
'Direct bilirubin-0.0',
'Direct bilirubin-1.0',
'Urea-0.0',
'Urea-1.0',
'Creatinine-0.0',
'Creatinine-1.0',
'Cystatin C-0.0',
'Cystatin C-1.0',
'Gamma glutamyltransferase-0.0',
'Gamma glutamyltransferase-1.0',
'Phosphate-0.0',
'Phosphate-1.0',
'Total bilirubin-0.0',
'Total bilirubin-1.0',
'Total protein-0.0',
'Total protein-1.0',
'Urate-0.0',
'Urate-1.0']]
physio_metabolic = physio[['eid',
'Apolipoprotein A-0.0',
'Apolipoprotein A-1.0',
'Apolipoprotein B-0.0',
'Apolipoprotein B-1.0',
'Glucose-0.0',
'Glucose-1.0',
'Glycated haemoglobin (HbA1c)-0.0',
'Glycated haemoglobin (HbA1c)-1.0',
'HDL cholesterol-0.0',
'HDL cholesterol-1.0',
'LDL direct-0.0',
'LDL direct-1.0',
'Lipoprotein A-0.0',
'Lipoprotein A-1.0',
'Cholesterol-0.0',
'Cholesterol-1.0',
'Calcium (Field ID: 30680)-0.0',
'Vitamin D (Field ID: 30890)-0.0', 
'IGF-1-0.0',
'IGF-1-1.0',
'Oestradiol-0.0',
'Oestradiol-1.0',
'Rheumatoid factor-0.0',
'Rheumatoid factor-1.0',
'SHBG-0.0',
'SHBG-1.0',
'Testosterone-0.0',
'Testosterone-1.0',
'Triglycerides-0.0',
'Triglycerides-1.0']]

Exclude measures with missing responses in more than 30% of individuals (Tian et al.):

- urine microalbumin (70%)
- blood estradiol (85%)
- rheumatoid factor (92%)
- measures of cardiorespiratory fitness (missing proportion, %)
- arterial stiffness (%)

### Renal, hepatic

Instances 0-1: use Instance 0

In [None]:
# Renal and hepatic
physio_renhep_i0 = physio_renal_hepatic[['eid'] + physio_renal_hepatic.filter(regex=r'0\.\d$').columns.tolist()]
physio_renhep_i0.columns = physio_renhep_i0.columns.str.replace('-0.0', '')
# Calculate proportion of missing values
print('% missing')
((physio_renhep_i0.isna().sum() / len(physio_renhep_i0)).round(2)*100).sort_values(ascending=False)

% missing


Microalbumin in urine              70.0
Direct bilirubin                   21.0
Total protein                      15.0
Phosphate                          15.0
Albumin                            14.0
Urea                                7.0
Total bilirubin                     7.0
Gamma glutamyltransferase           7.0
Cystatin C                          7.0
Creatinine                          7.0
Urate                               7.0
Aspartate aminotransferase          7.0
Alanine aminotransferase            7.0
Alkaline phosphatase                7.0
Sodium in urine                     4.0
Potassium in urine                  4.0
Creatinine (enzymatic) in urine     4.0
eid                                 0.0
dtype: float64

In [None]:
# Drop NA & save
physio_renhep_i0 = physio_renhep_i0.drop(columns = ['Microalbumin in urine']).dropna(axis=0).reset_index(drop=True)
physio_renhep_i0.to_csv('/UK_BB/brainbody/body/data/renhep_vars_i0_nona.csv', index=False)
physio_renhep_i0.to_csv('/UK_BB/brainbody/body/data/renalhepatic.csv', index=False)

#Display max, min, and negative values
print('Shape', physio_renhep_i0.shape)
print('MIN\n', physio_renhep_i0.min())
print('MAX\n', physio_renhep_i0.max())
print('NEG\n', (physio_renhep_i0 < 0).sum().sort_values(ascending=False))
print('NA\n', (physio_renhep_i0.isna()).sum().sort_values(ascending=False))

### Metabolic

Use Instance 0

In [None]:
# Metabolic
physio_metabolic_i0 = physio_metabolic[['eid'] + physio_metabolic.filter(regex=r'0\.\d$').columns.tolist()]
physio_metabolic_i0.columns = physio_metabolic_i0.columns.str.replace('-0.0', '')
# Calculate proportion of missing values
print('% missing')
((physio_metabolic_i0.isna().sum() / len(physio_metabolic_i0)).round(2)*100).sort_values(ascending=False)

% missing


Rheumatoid factor               92.0
Oestradiol                      85.0
Lipoprotein A                   25.0
Testosterone                    15.0
Glucose                         15.0
SHBG                            15.0
Apolipoprotein A                15.0
HDL cholesterol                 14.0
Calcium (Field ID: 30680)       14.0
Vitamin D (Field ID: 30890)     11.0
Cholesterol                      7.0
Triglycerides                    7.0
IGF-1                            7.0
LDL direct                       7.0
Glycated haemoglobin (HbA1c)     7.0
Apolipoprotein B                 7.0
eid                              0.0
dtype: float64

In [None]:
# # Drop NA & save
physio_metabolic_i0 = physio_metabolic_i0.rename(columns={'Calcium (Field ID: 30680)': 'Calcium', 'Vitamin D (Field ID: 30890)': 'Vitamin D'})
physio_metabolic_i0 = physio_metabolic_i0.drop(columns = ['Rheumatoid factor', 'Oestradiol']).dropna(axis=0).reset_index(drop=True)
physio_metabolic_i0.to_csv('/UK_BB/brainbody/body/data/metabolic_vars_i0_nona.csv', index=False)
physio_metabolic_i0.to_csv('/UK_BB/brainbody/body/data/metabolic.csv', index=False)

#Display max, min, and negative values
print('Shape', physio_metabolic_i0.shape)
print('MIN\n', physio_metabolic_i0.min())
print('MAX\n', physio_metabolic_i0.max())
print('NEG\n', (physio_metabolic_i0 < 0).sum().sort_values(ascending=False))
print('NA\n', (physio_metabolic_i0.isna()).sum().sort_values(ascending=False))

### Cardiopulmonary

Instances 0-3: use Instance 2 because it has more observations

In [None]:
# Extract Instance 2
physio_cardiopulmonary_i2 = physio_cardiopulmonary[['eid'] + physio_cardiopulmonary.filter(regex=r'2\.\d$').columns.tolist()].copy()
# Calculate proportion of missing values
print('% missing')
((physio_cardiopulmonary_i2.isna().sum() / len(physio_cardiopulmonary_i2)).round(2)*100).sort_values(ascending=False)

% missing


Forced vital capacity (FVC)-2.2                    94.0
Forced expiratory volume in 1-second (FEV1)-2.2    94.0
Peak expiratory flow (PEF)-2.2                     94.0
Forced vital capacity (FVC)-2.0                    91.0
Forced expiratory volume in 1-second (FEV1)-2.1    91.0
Peak expiratory flow (PEF)-2.0                     91.0
Forced vital capacity (FVC)-2.1                    91.0
Forced expiratory volume in 1-second (FEV1)-2.0    91.0
Peak expiratory flow (PEF)-2.1                     91.0
Diastolic blood pressure, automated reading-2.0    88.0
Pulse rate, automated reading-2.0                  88.0
Pulse rate, automated reading-2.1                  88.0
Systolic blood pressure, automated reading-2.0     88.0
Diastolic blood pressure, automated reading-2.1    88.0
Systolic blood pressure, automated reading-2.1     88.0
eid                                                 0.0
dtype: float64

Average measures tested more than once at the same visit: diastolic and systolic blood pressure, pulse rate

In [None]:
# Cardio: diastolic and systolic blood pressure, pulse rate
physio_cardiopulmonary_i2['Diastolic blood pressure'] = (physio_cardiopulmonary_i2['Diastolic blood pressure, automated reading-2.0'] + physio_cardiopulmonary_i2['Diastolic blood pressure, automated reading-2.1']) / 2
physio_cardiopulmonary_i2['Systolic blood pressure'] = (physio_cardiopulmonary_i2['Systolic blood pressure, automated reading-2.0'] + physio_cardiopulmonary_i2['Systolic blood pressure, automated reading-2.1']) / 2
physio_cardiopulmonary_i2['Pulse rate'] = (physio_cardiopulmonary_i2['Pulse rate, automated reading-2.0'] + physio_cardiopulmonary_i2['Pulse rate, automated reading-2.1']) / 2

Select the best performance among multiple repeated tests at the same visit: forced vital capacity (FVC), FEV1 and peak expiratory flow (PEF)

In [None]:
# Pulmonary: forced vital capacity (FVC), FEV1 and peak expiratory flow (PEF)
physio_cardiopulmonary_i2['Forced vital capacity (FVC)'] = physio_cardiopulmonary_i2[['Forced vital capacity (FVC)-2.0', 'Forced vital capacity (FVC)-2.1', 'Forced vital capacity (FVC)-2.2']].max(axis=1)
physio_cardiopulmonary_i2['Forced expiratory volume (FEV1)'] = physio_cardiopulmonary_i2[['Forced expiratory volume in 1-second (FEV1)-2.0', 'Forced expiratory volume in 1-second (FEV1)-2.1', 'Forced expiratory volume in 1-second (FEV1)-2.2']].max(axis=1)
physio_cardiopulmonary_i2['Peak expiratory flow (PEF)'] = physio_cardiopulmonary_i2[['Peak expiratory flow (PEF)-2.0', 'Peak expiratory flow (PEF)-2.1', 'Peak expiratory flow (PEF)-2.2']].max(axis=1)
# Calculate FEV1-FVC ratio
physio_cardiopulmonary_i2['FEV1:FVC'] = physio_cardiopulmonary_i2['Forced expiratory volume (FEV1)'] / physio_cardiopulmonary_i2['Forced vital capacity (FVC)']
physio_cardiopulmonary_i2['FEV1:FVC'] = physio_cardiopulmonary_i2['FEV1:FVC'].round(3)

In [None]:
# Drop redundant columns, NA & save
physio_cardiopulmonary_i2 = physio_cardiopulmonary_i2.drop(columns=[ 'Pulse rate, automated reading-2.0',
'Pulse rate, automated reading-2.1',
'Forced vital capacity (FVC)-2.0',
'Forced vital capacity (FVC)-2.1',
'Forced vital capacity (FVC)-2.2',
'Forced expiratory volume in 1-second (FEV1)-2.0',
'Forced expiratory volume in 1-second (FEV1)-2.1',
'Forced expiratory volume in 1-second (FEV1)-2.2',
'Peak expiratory flow (PEF)-2.0',
'Peak expiratory flow (PEF)-2.1',
'Peak expiratory flow (PEF)-2.2',
'Diastolic blood pressure, automated reading-2.0',
'Diastolic blood pressure, automated reading-2.1',
'Systolic blood pressure, automated reading-2.0',
'Systolic blood pressure, automated reading-2.1']).dropna(axis=0).reset_index(drop=True)

physio_cardiopulmonary_i2.columns = physio_cardiopulmonary_i2.columns.str.replace('-0.0', '')
physio_cardiopulmonary_i2.to_csv('/UK_BB/brainbody/body/data/cardiopulmonary_vars_i2_nona.csv', index=False)
physio_cardiopulmonary_i2.to_csv('/UK_BB/brainbody/body/data/cardiopulmonary.csv', index=False)

#Display max, min, and negative values
print('Shape', physio_cardiopulmonary_i2.shape)
print('MIN\n', physio_cardiopulmonary_i2.min())
print('MAX\n', physio_cardiopulmonary_i2.max())
print('NEG\n', (physio_cardiopulmonary_i2 < 0).sum().sort_values(ascending=False))
print('NA\n', (physio_cardiopulmonary_i2.isna()).sum().sort_values(ascending=False))

### Musculoskeletal

Instances 0-3: use Instance 2 because it has more observations

In [None]:
# Pick variables of interest
musculoskeletal = pd.read_csv('/UK_BB/brainbody/body/data/musculoskeletal_vars.csv')
print(musculoskeletal.shape)

# Remove Alkaline phosphatase and Phosphate because they were grouped with hepatic/renal redundant column
musculoskeletal = musculoskeletal.drop(columns=['Alkaline phosphatase-0.0','Phosphate-0.0'])

#  Pick variables that have only baseline data
bone_densitometry_i0 = musculoskeletal[['eid', 'Speed of sound through heel-0.0','Heel Broadband ultrasound attenuation, direct entry-0.0',
'Heel quantitative ultrasound index (QUI), direct entry-0.0','Heel bone mineral density (BMD) T-score, automated-0.0']]
# Select Instance 2
musculoskeletal_i2 = musculoskeletal[['eid'] + musculoskeletal.filter(regex=r'2\.\d$').columns.tolist()].copy()

# Remove instance number from column names
musculoskeletal_i2.columns = musculoskeletal_i2.columns.str.replace(r'-2.*', '', regex=True)

# Rename columns
musculoskeletal_i2 = musculoskeletal_i2.rename(columns={'Body mass index (BMI) (Field ID: 21001)': 'Body mass index (BMI)', 'Weight (Field ID: 21002)':'Weight'})

(502356, 69)


Get derivatives: average left and right for:
- Handgrip strength (average)
- Waist-hip circumference ratio
- Heel bone mineral density
- Ankle spacing width

In [None]:
# Get derivatives: average left and right 
dataframes = [musculoskeletal_i2]

for df in dataframes:
    # Handgrip strength (average)
    df['Handgrip strength'] = (df['Hand grip strength (left)'] + df['Hand grip strength (right)']) / 2
    # Waist-hip circumference ratio
    df['Waist-hip circumference ratio'] = (df['Waist circumference'] + df['Hip circumference']) / 2
    # Heel bone mineral density (average)
    df['Heel bone mineral density'] = (df['Heel bone mineral density (BMD) (left)'] + df['Heel bone mineral density (BMD) (right)']) / 2
    # Ankle spacing width (average)
    df['Ankle spacing width'] = (df['Ankle spacing width (left)'] + df['Ankle spacing width (right)']) / 2

    df.drop(columns=['Hand grip strength (left)', 'Hand grip strength (right)',
                     'Ankle spacing width (left)', 'Ankle spacing width (right)',
                     'Heel bone mineral density (BMD) (left)', 
                     'Heel bone mineral density (BMD) (right)'], inplace=True)
    print(df.shape)

(502356, 14)


In [None]:
# Drop NA & save
musculoskeletal_i2 = musculoskeletal_i2.dropna(axis=0).reset_index(drop=True).round(3)
musculoskeletal_i2.to_csv('/UK_BB/brainbody/body/data/musculoskeletal_vars_i2_nona.csv', index=False)
musculoskeletal_i2.to_csv('/UK_BB/brainbody/body/data/musculoskeletal.csv', index=False)

#Display max, min, and negative values
print('Shape', musculoskeletal_i2.shape)
print('MIN\n', musculoskeletal_i2.min())
print('MAX\n', musculoskeletal_i2.max())
print('NEG\n', (musculoskeletal_i2 < 0).sum().sort_values(ascending=False))
print('NA\n', (musculoskeletal_i2.isna()).sum().sort_values(ascending=False))

### Bone densitometry of heel

Use baseline data 

In [None]:
# Drop NA & save
bone_densitometry_i0.columns = bone_densitometry_i0.columns.str.replace(r'-0.*', '', regex=True)
bone_densitometry_i0 = bone_densitometry_i0.dropna(axis=0).reset_index(drop=True).round(3)
bone_densitometry_i0.to_csv('/UK_BB/brainbody/body/data/bone_densitometry_vars_i0_nona.csv', index=False)
bone_densitometry_i0.to_csv('/UK_BB/brainbody/body/data/bone_densitometry.csv', index=False)

print('Shape', bone_densitometry_i0.shape)
print('MIN\n', bone_densitometry_i0.min())
print('MAX\n', bone_densitometry_i0.max())
print('NEG\n', (bone_densitometry_i0 < 0).sum().sort_values(ascending=False))
print('NA\n', (bone_densitometry_i0.isna()).sum().sort_values(ascending=False))

## Cardiovascular: heart MRI, ECG, and ultrasound

Instances 0(2)-3: use Instance 2 because it has more observations


In [None]:
# Filter Instance 2
cardiovascular = pd.read_csv('/UK_BB/brainbody/body/data/cardiovascular_vars.csv')
print(cardiovascular.shape)
cardiovascular_i2 = cardiovascular[['eid'] + cardiovascular.filter(regex=r'-2\.\d$').columns.tolist()].copy()
# Drop non-numeric columns
cardiovascular_i2 = cardiovascular_i2.drop(columns=['Pulse wave pressure versus time response curve-2.0'])

(502356, 937)


Check each group of variables separately

In [None]:
# Group variables: Instance 2
pwa_i2 = [
    'Augmentation index for PWA-2.0',
    'Augmentation index for PWA-2.1',
    'Augmentation index for PWA-2.2',
    'Augmentation index for PWA-2.3',
    'Augmentation index for PWA-2.4',
    'Cardiac index during PWA-2.0',
    'Cardiac index during PWA-2.1',
    'Cardiac index during PWA-2.2',
    'Cardiac index during PWA-2.3',
    'Cardiac index during PWA-2.4',
    'Cardiac output during PWA-2.0',
    'Cardiac output during PWA-2.1',
    'Cardiac output during PWA-2.2',
    'Cardiac output during PWA-2.3',
    'Cardiac output during PWA-2.4',
    'Central augmentation pressure during PWA-2.0',
    'Central augmentation pressure during PWA-2.1',
    'Central augmentation pressure during PWA-2.2',
    'Central augmentation pressure during PWA-2.3',
    'Central augmentation pressure during PWA-2.4',
    'Central pulse pressure during PWA-2.0',
    'Central pulse pressure during PWA-2.1',
    'Central pulse pressure during PWA-2.2',
    'Central pulse pressure during PWA-2.3',
    'Central pulse pressure during PWA-2.4',
    'Central systolic blood pressure during PWA-2.0',
    'Central systolic blood pressure during PWA-2.1',
    'Central systolic blood pressure during PWA-2.2',
    'Central systolic blood pressure during PWA-2.3',
    'Central systolic blood pressure during PWA-2.4',
    'Diastolic brachial blood pressure during PWA-2.0',
    'Diastolic brachial blood pressure during PWA-2.1',
    'Diastolic brachial blood pressure during PWA-2.2',
    'Diastolic brachial blood pressure during PWA-2.3',
    'Diastolic brachial blood pressure during PWA-2.4',
    'Diastolic brachial blood pressure-2.0',
    'End systolic pressure during PWA-2.0',
    'End systolic pressure during PWA-2.1',
    'End systolic pressure during PWA-2.2',
    'End systolic pressure during PWA-2.3',
    'End systolic pressure during PWA-2.4',
    'End systolic pressure index during PWA-2.0',
    'End systolic pressure index during PWA-2.1',
    'End systolic pressure index during PWA-2.2',
    'End systolic pressure index during PWA-2.3',
    'End systolic pressure index during PWA-2.4',
    'Heart rate during PWA-2.0',
    'Heart rate during PWA-2.1',
    'Heart rate during PWA-2.2',
    'Heart rate during PWA-2.3',
    'Heart rate during PWA-2.4',
    'Mean arterial pressure during PWA-2.0',
    'Mean arterial pressure during PWA-2.1',
    'Mean arterial pressure during PWA-2.2',
    'Mean arterial pressure during PWA-2.3',
    'Mean arterial pressure during PWA-2.4',
    'Number of beats in waveform average for PWA-2.0',
    'Number of beats in waveform average for PWA-2.1',
    'Number of beats in waveform average for PWA-2.2',
    'Number of beats in waveform average for PWA-2.3',
    'Number of beats in waveform average for PWA-2.4',
    'Peripheral pulse pressure during PWA-2.0',
    'Peripheral pulse pressure during PWA-2.1',
    'Peripheral pulse pressure during PWA-2.2',
    'Peripheral pulse pressure during PWA-2.3',
    'Peripheral pulse pressure during PWA-2.4',
    'Stroke volume during PWA-2.0',
    'Stroke volume during PWA-2.1',
    'Stroke volume during PWA-2.2',
    'Stroke volume during PWA-2.3',
    'Stroke volume during PWA-2.4',
    'Systolic brachial blood pressure during PWA-2.0',
    'Systolic brachial blood pressure during PWA-2.1',
    'Systolic brachial blood pressure during PWA-2.2',
    'Systolic brachial blood pressure during PWA-2.3',
    'Systolic brachial blood pressure during PWA-2.4',
    'Systolic brachial blood pressure-2.0',
    'Total peripheral resistance during PWA-2.0',
    'Total peripheral resistance during PWA-2.1',
    'Total peripheral resistance during PWA-2.2',
    'Total peripheral resistance during PWA-2.3',
    'Total peripheral resistance during PWA-2.4']
heart_mri_i2 = [
    'LV ejection fraction-2.0',
    'Body surface area-2.0',
    'Average heart rate-2.0',
    'Cardiac index-2.0',
    'LV end diastolic volume-2.0',
    'LV end systolic volume-2.0',
    'Cardiac output-2.0',
    'LV stroke volume-2.0']
carotid_ultrasound_i2 = [
    'Mean carotid IMT (intima-medial thickness) at 120 degrees -2.0',
    'Maximum carotid IMT (intima-medial thickness) at 120 degrees -2.0',
    'Minimum carotid IMT (intima-medial thickness) at 120 degrees -2.0',
    'Mean carotid IMT (intima-medial thickness) at 240 degrees -2.0',
    'Maximum carotid IMT (intima-medial thickness) at 240 degrees -2.0',
    'Minimum carotid IMT (intima-medial thickness) at 240 degrees -2.0',
    'Maximum carotid IMT (intima-medial thickness) at 150 degrees -2.0',
    'Mean carotid IMT (intima-medial thickness) at 150 degrees -2.0',
    'Minimum carotid IMT (intima-medial thickness) at 150 degrees -2.0',
    'Maximum carotid IMT (intima-medial thickness) at 210 degrees -2.0',
    'Mean carotid IMT (intima-medial thickness) at 210 degrees -2.0',
    'Minimum carotid IMT (intima-medial thickness) at 210 degrees -2.0']
arterial_stiffness_i2 = [
    'Pulse wave Arterial Stiffness index-2.0',
    'Pulse wave peak to peak time-2.0',
    'Pulse rate-2.0',
    'Pulse wave reflection index-2.0']

### PWA

In [None]:
# Display missing values in PWA
with pd.option_context('display.max_rows', None):
    display(print((cardiovascular_i2[pwa_i2].isna().sum() / len(cardiovascular_i2)).round(2)*100))

Augmentation index for PWA-2.0                       87.0
Augmentation index for PWA-2.1                       88.0
Augmentation index for PWA-2.2                      100.0
Augmentation index for PWA-2.3                      100.0
Augmentation index for PWA-2.4                      100.0
Cardiac index during PWA-2.0                         89.0
Cardiac index during PWA-2.1                         89.0
Cardiac index during PWA-2.2                        100.0
Cardiac index during PWA-2.3                        100.0
Cardiac index during PWA-2.4                        100.0
Cardiac output during PWA-2.0                        88.0
Cardiac output during PWA-2.1                        89.0
Cardiac output during PWA-2.2                       100.0
Cardiac output during PWA-2.3                       100.0
Cardiac output during PWA-2.4                       100.0
Central augmentation pressure during PWA-2.0         87.0
Central augmentation pressure during PWA-2.1         88.0
Central augmen

None

Array 0 in PWA has the lowest proportion of missing values, therefore arrays 1-4 are to be excluded

In [None]:
# Select only first array for PWA
pwa_i2_array0 = [col for col in cardiovascular_i2[pwa_i2] if col.endswith('.0') or col == 'eid']
#Display max, min, and negative values
print('Shape', cardiovascular_i2[pwa_i2_array0].shape)
print('MIN\n', cardiovascular_i2[pwa_i2_array0].min())
print('MAX\n', cardiovascular_i2[pwa_i2_array0].max())
print('NEG\n', (cardiovascular_i2[pwa_i2_array0] < 0).sum().sort_values(ascending=False))
print('NA\n', (cardiovascular_i2[pwa_i2_array0].isna()).sum().sort_values(ascending=False))

In [None]:
# Drop NA & save
pwa_i2 = cardiovascular_i2[['eid'] + pwa_i2_array0].dropna(axis=0).reset_index(drop=True) 
# Remove instance number from column names
pwa_i2.columns = pwa_i2.columns.str.replace(r'-2.*', '', regex=True)
#cardiovascular_i2.columns = [re.sub(r'-2\.\d+$', '', col) for col in cardiovascular_i2_nona.columns]
print(pwa_i2.shape)

# Save
pwa_i2.to_csv('/UK_BB/brainbody/body/data/pwa_vars_i2_nona.csv', index=False)
pwa_i2.to_csv('/UK_BB/brainbody/body/data/pwa.csv', index=False)

(56457, 19)


### Heart MRI

In [None]:
# Display missing values in Heart MRI
with pd.option_context('display.max_rows', None):
    display(print((cardiovascular_i2[heart_mri_i2].isna().sum() / len(cardiovascular_i2)).round(2)*100))

print('Shape', cardiovascular_i2[heart_mri_i2].shape)
print('MIN\n', cardiovascular_i2[heart_mri_i2].min())
print('MAX\n', cardiovascular_i2[heart_mri_i2].max())
print('NEG\n', (cardiovascular_i2[heart_mri_i2] < 0).sum().sort_values(ascending=False))
print('NA\n', (cardiovascular_i2[heart_mri_i2].isna()).sum().sort_values(ascending=False))

In [None]:
# Drop NA & save
heart_mri_i2 = cardiovascular_i2[['eid'] + heart_mri_i2].dropna(axis=0).reset_index(drop=True) 
# Remove instance number from column names
heart_mri_i2.columns = heart_mri_i2.columns.str.replace(r'-2.*', '', regex=True)
print(heart_mri_i2.shape)

# Save
heart_mri_i2.to_csv('/UK_BB/brainbody/body/data/heart_mri_vars_i2_nona.csv', index=False)
heart_mri_i2.to_csv('/UK_BB/brainbody/body/data/heart_mri.csv', index=False)

(39607, 9)


### Carotid ultrasound

In [None]:
# Display max, min, and negative values: Carotid ultrasound
print('Shape', cardiovascular_i2[carotid_ultrasound_i2].shape)
print('MIN\n', cardiovascular_i2[carotid_ultrasound_i2].min())
print('MAX\n', cardiovascular_i2[carotid_ultrasound_i2].max())
print('NEG\n', (cardiovascular_i2[carotid_ultrasound_i2] < 0).sum().sort_values(ascending=False))
print('NA\n', (cardiovascular_i2[carotid_ultrasound_i2].isna()).sum().sort_values(ascending=False))

In [None]:
# Drop NA
carotid_ultrasound_i2 = cardiovascular_i2[['eid'] + carotid_ultrasound_i2].dropna(axis=0).reset_index(drop=True) 
# Remove instance number from column names
carotid_ultrasound_i2.columns = carotid_ultrasound_i2.columns.str.replace(r'-2.*', '', regex=True)
print(carotid_ultrasound_i2.shape)

# Save
carotid_ultrasound_i2.to_csv('/UK_BB/brainbody/body/data/carotid_ultrasound_vars_i2_nona.csv', index=False)
carotid_ultrasound_i2.to_csv('/UK_BB/brainbody/body/data/carotid_ultrasound.csv', index=False)

(49110, 13)


### Arterial stiffness

Can be negative

In [None]:
# Display max, min, and negative values: Arterial stiffness
print('Shape', cardiovascular_i2[arterial_stiffness_i2].shape)
print('MIN\n', cardiovascular_i2[arterial_stiffness_i2].min())
print('MAX\n', cardiovascular_i2[arterial_stiffness_i2].max())
print('NEG\n', (cardiovascular_i2[arterial_stiffness_i2] < 0).sum().sort_values(ascending=False))
print('NA\n', (cardiovascular_i2[arterial_stiffness_i2].isna()).sum().sort_values(ascending=False))

In [None]:
# Drop NA
arterial_stiffness_i2 = cardiovascular_i2[['eid'] + arterial_stiffness_i2].dropna(axis=0).reset_index(drop=True) 
# Remove instance number from column names
arterial_stiffness_i2.columns = arterial_stiffness_i2.columns.str.replace(r'-2.*', '', regex=True)
print(arterial_stiffness_i2.shape)

# Save
arterial_stiffness_i2.to_csv('/UK_BB/brainbody/body/data/arterial_stiffness_vars_i2_nona.csv', index=False)
arterial_stiffness_i2.to_csv('/UK_BB/brainbody/body/data/arterial_stiffness.csv', index=False)

(62388, 5)


### EEG at rest

In [None]:
ecg_rest_i2 = cardiovascular[['eid',
'PQ interval-2.0',
'PP interval-2.0',
'QTC interval-2.0',
'QT interval-2.0',
'QRS num-2.0',
'RR interval-2.0',
'P duration-2.0',
'QRS duration-2.0',
'Ventricular rate-2.0']]

#Display max, min, and negative values
print('Shape', ecg_rest_i2.shape)
print('MIN\n', ecg_rest_i2.min())
print('MAX\n', ecg_rest_i2.max())
print('NEG\n', (ecg_rest_i2 < 0).sum().sort_values(ascending=False))
print('NA\n', (ecg_rest_i2.isna()).sum().sort_values(ascending=False))

ecg_rest_i2.columns = ecg_rest_i2.columns.str.replace('-2.0', '')

# Drop NA & save
ecg_rest_i2 = ecg_rest_i2.dropna(axis=0).reset_index(drop=True).round(3)
print(ecg_rest_i2.shape)
ecg_rest_i2.to_csv('/UK_BB/brainbody/body/data/ecg_rest_vars_i2_nona.csv', index=False)
ecg_rest_i2.to_csv('/UK_BB/brainbody/body/data/eeg_rest.csv', index=False)

# Hearing

In [None]:
# Create directories & save
base_path = f'/media/hcs-sci-psy-narun/UK_BB/brainbody'
hearing_path = os.path.join(base_path, 'hearing-vision')
data_path = os.path.join(hearing_path, 'data')
os.makedirs(base_path, exist_ok=True)
os.makedirs(hearing_path, exist_ok=True)
os.makedirs(data_path, exist_ok=True)

In [None]:
df_hearing = ukbiobank.utils.utils.loadCsv(ukbio=ukb, fields=['eid',
20019, #Speech-reception-threshold (SRT) estimate (left)
20021 #Speech-reception-threshold (SRT) estimate (right)
])
df_hearing_names = ukbiobank.utils.utils.fieldIdsToNames(ukbio=ukb, df=df_hearing)
df_hearing_names.to_csv(os.path.join(data_path, 'hearing_vars.csv'), index=False)

In [None]:
print('% missing')
with pd.option_context('display.max_rows', None):
    display(((df_hearing_names.isna().sum() / len(df_hearing_names)).round(2)*100).sort_values(ascending=False))

% missing


Speech-reception-threshold (SRT) estimate (left)-3.0     99.0
Speech-reception-threshold (SRT) estimate (right)-3.0    99.0
Speech-reception-threshold (SRT) estimate (left)-1.0     96.0
Speech-reception-threshold (SRT) estimate (right)-1.0    96.0
Speech-reception-threshold (SRT) estimate (left)-2.0     86.0
Speech-reception-threshold (SRT) estimate (right)-2.0    86.0
Speech-reception-threshold (SRT) estimate (left)-0.0     68.0
Speech-reception-threshold (SRT) estimate (right)-0.0    68.0
eid                                                       0.0
dtype: float64

In [None]:
# Filter Instance 2
hearing_i2 = df_hearing_names[['eid'] + df_hearing_names.filter(regex=r'2\.\d$').columns.tolist()]
hearing_i2.columns = hearing_i2.columns.str.replace('-2.0', '')
hearing_i2.to_csv(os.path.join(data_path, 'hearing_i2.csv'), index=False)
with pd.option_context('display.max_rows', None):
    display(((hearing_i2.isna().sum() / len(hearing_i2)).round(2)*100).sort_values(ascending=False))

Speech-reception-threshold (SRT) estimate (left)     86.0
Speech-reception-threshold (SRT) estimate (right)    86.0
eid                                                   0.0
dtype: float64

In [None]:
# Count missing values, negative values, min, and max
print('MIN\n', hearing_i2.min())
print('MAX\n', hearing_i2.max())
print('NEG\n', (hearing_i2 < 0).sum().sort_values(ascending=False))

Here negative values are [ok](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20019)
- Maximum: 8
- Minimum: -11.25

In [None]:
# Drop NAs
hearing_i2_nona = hearing_i2.dropna(axis=0).reset_index(drop=True)

print('MIN\n', hearing_i2_nona.min())
print('NEG\n', (hearing_i2_nona < 0).sum().sort_values(ascending=False))
print('NA\n', (hearing_i2_nona.isna()).sum().sort_values(ascending=False))
print('SHAPE\n', hearing_i2_nona.shape)

hearing_i2_nona.to_csv(os.path.join(data_path, 'hearing_vars_i2_nona.csv'), index=False)
hearing_i2_nona.to_csv(os.path.join(data_path, 'hearing.csv'), index=False)