In [None]:
import csv
import os
import random
import pickle
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import sklearn
import warnings
import skbold
from skbold.preproc import ConfoundRegressor
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import ukbiobank.utils.utils
from ukbiobank.utils import loadCsv
from ukbiobank.utils import addFields
from ukbiobank.utils.utils import fieldIdsToNames

# Define ConfoundRegressor

In [None]:
# Define ConfoundRegressor: skbold
def confound_regressor_skbold(features_train, features_test, confounds_train, confounds_test):
    """Standardise features and confounds, then deconfound with skbold's ConfoundRegressor.

    Both scalers are fitted on the training split only and then applied to the
    test split. The ConfoundRegressor is constructed with the train and test
    rows stacked (train first), fitted on the scaled training features and
    applied to the scaled test features.

    Parameters
    ----------
    features_train, features_test : pandas.DataFrame
        Feature tables; their ``.columns`` are used to label the scaled arrays.
    confounds_train, confounds_test : pandas.DataFrame
        Confound tables for the same splits. Presumably row-aligned with the
        feature tables -- nothing in this function checks that.

    Returns
    -------
    tuple
        ``(features_train_corrected, features_test_corrected,
        features_train_scaled, features_test_scaled, scaler_features)``.
    """
    def _stack_train_test(train_values, test_values, train_frame, test_frame):
        # Re-attach the column labels, then concatenate row-wise (train first).
        labelled = [
            pd.DataFrame(train_values, columns=train_frame.columns),
            pd.DataFrame(test_values, columns=test_frame.columns),
        ]
        return np.array(pd.concat(labelled, axis=0))

    # Feature scaling: statistics come from the training split only.
    scaler_features = StandardScaler()
    features_train_scaled = scaler_features.fit_transform(features_train)
    features_test_scaled = scaler_features.transform(features_test)

    # Confound scaling, same train-only convention.
    scaler_confounds = StandardScaler()
    confounds_train_scaled = scaler_confounds.fit_transform(confounds_train)
    confounds_test_scaled = scaler_confounds.transform(confounds_test)

    # Full (train + test) arrays handed to the ConfoundRegressor constructor.
    all_features = _stack_train_test(
        features_train_scaled, features_test_scaled, features_train, features_test
    )
    all_confounds = _stack_train_test(
        confounds_train_scaled, confounds_test_scaled, confounds_train, confounds_test
    )

    deconfounder = ConfoundRegressor(confound=all_confounds, X=all_features)
    train_corrected = deconfounder.fit_transform(features_train_scaled)
    test_corrected = deconfounder.transform(features_test_scaled)

    return (
        train_corrected,
        test_corrected,
        features_train_scaled,
        features_test_scaled,
        scaler_features,
    )

In [None]:
# Define ConfoundRegressor: Linear Model
def confound_regressor(features_train, features_test, confounds_train, confounds_test):
    """Regress confounds out of the features with an ordinary linear model.

    Features and confounds are each standardised with a scaler fitted on the
    training split. A LinearRegression is then fitted to predict the scaled
    training features from the scaled training confounds, and its predictions
    are subtracted from the scaled features of both splits.

    Returns
    -------
    tuple
        ``(train_residuals, test_residuals, train_scaled, test_scaled,
        scaler_features)`` where the first four entries are arrays and the
        last is the fitted feature scaler.
    """
    from sklearn.linear_model import LinearRegression

    # Standardise the features (fit on train, apply to test).
    scaler_features = StandardScaler()
    train_scaled = scaler_features.fit_transform(features_train)
    test_scaled = scaler_features.transform(features_test)

    # Standardise the confounds the same way.
    scaler_confounds = StandardScaler()
    conf_train_scaled = scaler_confounds.fit_transform(confounds_train)
    conf_test_scaled = scaler_confounds.transform(confounds_test)

    # Confound -> feature mapping is learned from the training split only.
    confound_model = LinearRegression().fit(conf_train_scaled, train_scaled)

    # Residuals = scaled features minus the part predicted from the confounds.
    train_residuals = train_scaled - confound_model.predict(conf_train_scaled)
    test_residuals = test_scaled - confound_model.predict(conf_test_scaled)

    return train_residuals, test_residuals, train_scaled, test_scaled, scaler_features

# Get data: T2w MRI

In [None]:
# Path of the main UK Biobank CSV that the loaders below read from.
csv_path = '/ukbbdata/ukb.csv'
# Handle that is passed as `ukbio=` to the ukbiobank.utils helpers in later cells.
ukb = ukbiobank.ukbio(ukb_csv=csv_path)

### T2


- 24486	Total volume of deep white matter hyperintensities
- 24485	Total volume of peri-ventricular white matter hyperintensities
- 25781	Total volume of white matter hyperintensities (from T1 and T2_FLAIR images)

#### Confounds:

- 25926	Intensity scaling for T2_FLAIR
- 25736	Discrepancy between T2 FLAIR brain image and T1 brain image


In [None]:
# White matter hyperintensity fields requested from the ukbiobank loader (instance 2).
t2_field_ids = [
    'eid',
    24486,  # Total volume of deep white matter hyperintensities - T2
    24485,  # Total volume of peri-ventricular white matter hyperintensities - T2
    25781,  # Total volume of white matter hyperintensities (from T1 and T2_FLAIR images) - T2
]
df_add_t2 = ukbiobank.utils.utils.loadCsv(ukbio=ukb, fields=t2_field_ids, instance=2)

# Replace the numeric field ids in the column labels with their names.
add_t2_names = ukbiobank.utils.utils.fieldIdsToNames(ukbio=ukb, df=df_add_t2)

Load the columns that could not be loaded with the ukbiobank `loadCsv` helper by reading them directly with pandas (`usecols`).

In [None]:
# Raw UKB column -> readable name. A mapping is used instead of a positional
# `.columns = [...]` assignment because `read_csv(usecols=...)` returns the
# columns in *file* order, not in the order they are listed; positional naming
# would swap the deep / peri-ventricular labels whenever 24485 precedes 24486
# in the CSV.
t2_raw_to_name = {
    'eid': 'eid',
    '24486-2.0': 'Total volume of deep white matter hyperintensities from T2',
    '24485-2.0': 'Total volume of peri-ventricular white matter hyperintensities from T2',
    '25781-2.0': 'Total volume of white matter hyperintensities (from T1 and T2_FLAIR images)',
}
t2_raw_cols = list(t2_raw_to_name)
t2_names_path = '/ML_DATASETS/Brain/T1/additional_(t1-t2-fMRI)-T2_names_nona.csv'

# Select in the listed order, then rename by label.
add_t2 = pd.read_csv('/ukbbdata/ukb.csv', usecols=t2_raw_cols)[t2_raw_cols].rename(columns=t2_raw_to_name)

# Keep complete rows only, save them, and reload the saved copy.
add_t2_nona = add_t2.dropna(axis=0).reset_index(drop=True)
add_t2_nona.to_csv(t2_names_path, index=False)
add_t2 = pd.read_csv(t2_names_path)

Load confounds for T2

- Head size: 25000
- Site: 54
- Acquisition date: 53
- STRUCT MOTION: 24419
- Discrepancy between T1 brain image and standard-space brain template (linearly-aligned): 25731
- Discrepancy between T1 brain image and standard-space brain template (nonlinearly-aligned): 25732

In [None]:
# Confound fields requested from the ukbiobank loader (instance 2).
df_add_t2_conf = ukbiobank.utils.utils.loadCsv(ukbio=ukb, fields=['eid',
25926,  # Intensity scaling for T2_FLAIR
25736,  # Discrepancy between T2 FLAIR brain image and T1 brain image
25000,  # Head size
54,     # Site
53,     # Acquisition date
24419], # STRUCT MOTION
instance=2)
add_t2_conf_names = ukbiobank.utils.utils.fieldIdsToNames(ukbio=ukb, df=df_add_t2_conf)

# Keep complete rows only.
add_t2_conf_names_nona = add_t2_conf_names.dropna(axis=0)

# Strip the instance suffix from the column labels. `regex=False` pins a
# literal match: with the pre-2.0 pandas default (regex=True) the '.' in
# '-2.0' would match any character.
add_t2_conf_names_nona.columns = add_t2_conf_names_nona.columns.str.replace('-2.0', '', regex=False)
add_t2_conf_names_nona.to_csv(r'/ML_DATASETS/Brain/T1/additional_(t1-t2-fMRI)-T2_CONF_RAW.csv', index=False)
add_t2_conf_names_nona

Add 'Structural motion'

In [None]:
# Read the structural-motion field (24419, instance 2) straight from the raw CSV.
motion_raw_cols = ['eid', '24419-2.0']
add_t2_conf = pd.read_csv('/ukbbdata/ukb.csv', usecols=motion_raw_cols)
add_t2_conf.columns = ['eid', 'Struct.motion']

# Drop participants without a motion value and renumber the rows.
add_t2_conf_nona = add_t2_conf.dropna(axis=0).reset_index(drop=True)

# Attach the motion column to the other T2 confounds by participant id.
add_t2_conf_full = add_t2_conf_names_nona.merge(add_t2_conf_nona, on='eid')
add_t2_conf_full.to_csv(
    '/ML_DATASETS/Brain/T1/additional_(t1-t2-fMRI)-T2_CONF_RAW_FULL.csv',
    index=False,
)
add_t2_conf_full

Convert date & site

In [None]:
# Convert the acquisition date to an integer timestamp, dummy-encode the site,
# save the resulting confound table and reload it as `t2_conf`.
import datetime

t2_conf_raw_path = '/ML_DATASETS/Brain/T1/additional_(t1-t2-fMRI)-T2_CONF_RAW_FULL.csv'
# One path for both the write and the read-back below: previously the table was
# written here but reloaded from a different ('/media/hcs-sci-psy-narun/...')
# location, so `t2_conf` was not guaranteed to be the file just produced.
t2_conf_dummy_path = '/ML_DATASETS/Brain/t1_t2_tfmri/add_t2_conf_unix_dummy.csv'
date_col = 'Date of attending assessment centre'

# Convert date
add_t2_conf_full = pd.read_csv(t2_conf_raw_path)
add_t2_conf_unix = add_t2_conf_full.copy()
add_t2_conf_unix[date_col] = pd.to_datetime(add_t2_conf_unix[date_col], format="%Y-%m-%d")  #"%m/%d/%Y")
# NOTE(review): datetime.datetime.timestamp interprets naive datetimes as local
# time, so these values presumably depend on the machine's timezone -- confirm
# whether that matters for this confound.
add_t2_conf_unix[date_col] = add_t2_conf_unix[date_col].apply(datetime.datetime.timestamp)
# Truncate the float timestamps to whole seconds
add_t2_conf_unix[date_col] = add_t2_conf_unix[date_col].apply(int)
print(add_t2_conf_unix[date_col])

# Dummy encode site
add_t2_conf_unix_dummy = pd.get_dummies(add_t2_conf_unix, columns=['UK Biobank assessment centre'], dtype=int)
# Remove columns that are not used as T2 confounds.
add_t2_conf_unix_dummy = add_t2_conf_unix_dummy.drop(columns=['T2-FLAIR used (in addition to T1) to run FreeSurfer', 'Discrepancy between tfMRI brain image and T1 brain image'])
add_t2_conf_unix_dummy.to_csv(t2_conf_dummy_path, index=False)
t2_conf = pd.read_csv(t2_conf_dummy_path)
t2_conf

# Get data: whole-brain T1w MRI

In [None]:
# Whole-brain T1 features and their confound table (both prepared elsewhere).
t1_features_path = '/ML_DATASETS/Brain/T1/additional_(t1-t2-fMRI)-T1_names_nona.csv'
t1_conf_path = '/ML_DATASETS/Brain/T1/struct_conf_full_dummy.csv'

add_t1 = pd.read_csv(t1_features_path)
add_t1_conf = pd.read_csv(t1_conf_path)

# Keep a copy of the T1 confounds next to the PLS inputs.
add_t1_conf.to_csv('/PLS/brain/additional/orig/add_t1_conf.csv', index=False)

# Show the available T1 feature names.
add_t1.columns.tolist()

In [None]:
# Join T1 and T2 features on participant id (pandas' default join keeps ids present in both).
t1_t2_combined = pd.merge(add_t1, add_t2, on='eid')
t1_t2_combined

In [None]:
# T2-specific confounds (plus the id) that get appended to the T1 confound table.
t2_conf_cols = [
    'Intensity scaling for T2_FLAIR',
    'Discrepancy between T2 FLAIR brain image and T1 brain image',
    'eid',
]
t1_conf_saved = pd.read_csv('/PLS/brain/additional/orig/add_t1_conf.csv')
t1_t2_conf_combined = pd.merge(t1_conf_saved, t2_conf[t2_conf_cols], on='eid')
t1_t2_conf_combined.columns

# PLS on uncategorized (whole-brain) T1w and T2w MRI data combined

In [None]:
# PLS on combined whole-brain T1w + T2w features, one model per outer fold.
#
# For every fold this cell:
#   1. restricts brain features and confounds to participants that appear in
#      the fold's g-factor train / test files,
#   2. brings the g-factor rows into the SAME row order as the brain rows,
#   3. regresses the confounds out of the features (`confound_regressor`),
#   4. tunes `n_components` of a PLSRegression with a 10-fold grid search,
#   5. saves the intermediate tables, scaler, model, predictions and metrics.
warnings.simplefilter(action='ignore', category=FutureWarning)

############## 1
print('Started: Uploading')
seed = 42
file = t1_t2_combined.copy()
confound = t1_t2_conf_combined.copy()

folds = ["0", "1", "2", "3", "4"]
pls_result = {}

# Match confounds to MRI
print('Started: Match confounds to brain data')
conf_to_brain_match = pd.merge(confound, file['eid'], on='eid')
brain_to_conf_match = pd.merge(conf_to_brain_match['eid'], file, on='eid')

for fold in folds:
    fold_dir = f'/PLS/brain/additional/fold_{fold}'

    # Upload g-factor with ID
    g_train_full = pd.read_csv(f'/PLS/g_factor/g_train_with_id_fold_{fold}.csv')
    g_test_full = pd.read_csv(f'/PLS/g_factor/g_test_with_id_fold_{fold}.csv')

    # Brain data to cognitive data (one merge per split; ids and features are
    # taken from the same merged frame).
    brain_train_with_id = pd.merge(brain_to_conf_match, g_train_full['eid'], on='eid')
    brain_test_with_id = pd.merge(brain_to_conf_match, g_test_full['eid'], on='eid')
    brain_train, brain_train_id = brain_train_with_id.drop(columns=['eid']), brain_train_with_id['eid']
    brain_test, brain_test_id = brain_test_with_id.drop(columns=['eid']), brain_test_with_id['eid']

    brain_train.to_csv(f'{fold_dir}/suppl/T1_T2_whole_brain_train_fold_{fold}.csv', index=False)
    brain_test.to_csv(f'{fold_dir}/suppl/T1_T2_whole_brain_test_fold_{fold}.csv', index=False)
    brain_train_id.to_csv(f'{fold_dir}/suppl/T1_T2_whole_brain_train_id_fold_{fold}.csv', index=False)
    brain_test_id.to_csv(f'{fold_dir}/suppl/T1_T2_whole_brain_test_id_fold_{fold}.csv', index=False)

    ############## 2
    print(f'Matching confounds to T1_T2_whole_brain fold {fold}')

    # Match confounds to MRI
    brain_conf_train = pd.merge(conf_to_brain_match, brain_train_id, on='eid').drop(columns=['eid'])
    brain_conf_test = pd.merge(conf_to_brain_match, brain_test_id, on='eid').drop(columns=['eid'])
    brain_conf_train.to_csv(f'{fold_dir}/suppl/T1_T2_whole_brain_conf_train_fold_{fold}.csv', index=False)
    brain_conf_test.to_csv(f'{fold_dir}/suppl/T1_T2_whole_brain_conf_test_fold_{fold}.csv', index=False)

    ############## 3
    print(f'Matching g-factor to T1_T2_whole_brain')

    # Match g-factor back to MRI. The brain ids go on the LEFT because an inner
    # merge keeps the row order of its left operand: this guarantees the
    # targets are in the same row order as the feature/confound rows. With the
    # g-factor table on the left the targets followed the g-file order, which
    # silently misaligns X and y if that order differs from the brain tables.
    g_train_matched = pd.merge(brain_train_id, g_train_full, on='eid')
    g_test_matched = pd.merge(brain_test_id, g_test_full, on='eid')
    g_train, g_train_id = g_train_matched.drop(columns=['eid']), g_train_matched['eid']
    g_test, g_test_id = g_test_matched.drop(columns=['eid']), g_test_matched['eid']

    # Fail loudly if features and targets do not line up row by row
    # (e.g. because of duplicated ids in one of the tables).
    if not (np.array_equal(np.array(brain_train_id), np.array(g_train_id))
            and np.array_equal(np.array(brain_test_id), np.array(g_test_id))):
        raise ValueError(f'Brain and g-factor rows are not aligned in fold {fold}')

    g_train.to_csv(f'{fold_dir}/suppl/g_train_T1_T2_whole_brain_matched_fold_{fold}.csv', index=False)
    g_test.to_csv(f'{fold_dir}/suppl/g_test_T1_T2_whole_brain_matched_fold_{fold}.csv', index=False)

    ############## 4
    print(f'Applying ConfoundRegressor to MRI data fold {fold}')

    # Apply ConfoundRegressor
    features_corr_train, features_corr_test, features_scaled_train, features_scaled_test, scaler_features = confound_regressor(brain_train, brain_test, brain_conf_train, brain_conf_test)
    pd.DataFrame(features_corr_train, columns = brain_train.columns).to_csv(f'{fold_dir}/suppl/T1_T2_whole_brain_train_corr_{fold}.csv', index=False)
    pd.DataFrame(features_corr_test, columns = brain_train.columns).to_csv(f'{fold_dir}/suppl/T1_T2_whole_brain_test_corr_{fold}.csv', index=False)
    pd.DataFrame(features_scaled_train, columns = brain_train.columns).to_csv(f'{fold_dir}/scaling/T1_T2_whole_brain_train_scaled_{fold}.csv', index=False)
    pd.DataFrame(features_scaled_test, columns = brain_train.columns).to_csv(f'{fold_dir}/scaling/T1_T2_whole_brain_test_scaled_{fold}.csv', index=False)

    with open(f'{fold_dir}/scaling/scaler_features_T1_T2_whole_brain_fold_{fold}.pkl', "wb") as f:
        pickle.dump(scaler_features, f)

    # Initiate and run PLS: try every number of components from 1 to n_features.
    parameters = {'n_components': range(1, features_corr_train.shape[1]+1, 1)}
    pls = PLSRegression()
    model = GridSearchCV(pls, parameters, scoring = 'neg_mean_absolute_error', cv=KFold(10, shuffle = True, random_state=seed), verbose=4)

    print("Fitting PLS")
    model.fit(features_corr_train, np.array(g_train))

    print(f'Model parameters for fold {fold}:', model.cv_results_['params'])
    print(f'Mean test score for fold {fold}:', model.cv_results_['mean_test_score'])
    print(f'Rank test score for fold {fold}:', model.cv_results_['rank_test_score'])
    print(model)

    print(f'Saving PLS model for T1_T2_whole_brain fold {fold}')
    with open(f'{fold_dir}/models/pkl/T1_T2_whole_brain_model_fold_{fold}.pkl', "wb") as f:
        pickle.dump(model, f)

    print(f'Best params in fold {fold} = ', model.best_params_)
    print(f'Best score (neg_mean_absolute_error) in fold {fold} = ', model.best_score_)

    # Predict g-factor
    print(f'Predicting & saving g_test for T1_T2_whole_brain fold {fold}')
    g_pred_test = model.predict(np.array(features_corr_test))
    pd.DataFrame(g_pred_test, columns=['g predicted test']).to_csv(f'{fold_dir}/g_pred/T1_T2_whole_brain_g_pred_test_fold_{fold}.csv')
    pd.concat([g_test_id.astype(int), pd.DataFrame(g_pred_test, columns=['g predicted test'])], axis=1).to_csv(f'{fold_dir}/g_pred/T1_T2_whole_brain_g_pred_test_id_fold_{fold}.csv')

    print(f'Predicting & saving g_train for T1_T2_whole_brain fold {fold}')
    g_pred_train = model.predict(np.array(features_corr_train))
    pd.DataFrame(g_pred_train, columns=['g predicted train']).to_csv(f'{fold_dir}/g_pred/T1_T2_whole_brain_g_pred_train_fold_{fold}.csv')
    pd.concat([g_train_id.astype(int), pd.DataFrame(g_pred_train, columns=['g predicted train'])], axis=1).to_csv(f'{fold_dir}/g_pred/T1_T2_whole_brain_g_pred_train_id_fold_{fold}.csv')

    # Test-set metrics, computed once and reused for printing and both result files.
    y_true = np.array(g_test)[:,0]
    y_pred = g_pred_test[:,0]
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    pearson = pearsonr(y_true, y_pred)
    corr, pval = pearson

    print(f"Fold = {fold}")
    print("----------")
    print("MSE = ", mse)
    print("MAE = ", mae)
    print("R2 = ", r2)
    print("Pearson's r = ", pearson)
    print("----------")

    # Header-less one-row summary; the summary cell relies on this column
    # order and on the textual form of the 'Pearson r' entry.
    pls_result = {
        'fold': fold,
        'modality': 'T1_T2_whole_brain',
        'n_components': model.best_params_,
        'MSE': mse,
        'MAE': mae,
        'R2': r2,
        'Pearson r': pearson,
    }

    # 'w' instead of 'a': the file is specific to this fold and modality, and
    # appending would add a duplicate row on every re-run of the cell, which
    # the summary cell would then average as if it were an extra fold.
    with open(f'{fold_dir}/models/csv/T1_T2_whole_brain_fold_{fold}_PLS_result.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=pls_result.keys())
        writer.writerow(pls_result)

    pls_result.clear()

    # Long-format result table for this fold.
    pd.DataFrame(['T1_T2', fold, corr, pval, r2, mse, model.best_params_], index=['Modality', 'Fold', 'Correlation', 'P-value', 'R2', 'MSE', 'n components'], columns=['Values']).to_csv(f'{fold_dir}/models/csv/T1_T2_whole_brain_fold_{fold}_full_result.csv')

## Display and average results across folds

In [None]:
# Collect the one-row, header-less PLS summaries written for each fold.
five_folds = []
folds = ["0", "1", "2", "3", "4"]
for fold in folds:
    # Named `fold_result` so the `pls` estimator from the modelling cell is not overwritten.
    fold_result = pd.read_csv(f'/PLS/brain/additional/fold_{fold}/models/csv/T1_T2_whole_brain_fold_{fold}_PLS_result.csv', header=None)
    fold_result.columns = ['Fold', 'Modality', 'n components', 'MSE', 'MAE', 'R2', 'Pearson r']
    five_folds.append(fold_result)

# Concatenate once, after the loop, with a fresh unique index so the column
# assignment below cannot be affected by duplicated row labels.
five_folds_all_modalities = pd.concat(five_folds, ignore_index=True)

# The 'Pearson r' cell holds the text form of scipy's result. Strip the wrapper
# so only "r, p" remains; besides 'PearsonRResult(statistic=..., pvalue=...)'
# this also accepts 'np.float64(...)'-wrapped numbers and a plain '(r, p)' tuple.
five_folds_all_modalities['Pearson r'] = five_folds_all_modalities['Pearson r'].astype(str).str.replace(r'PearsonRResult\(statistic=|pvalue=|np\.float64\(|[()]', '', regex=True)
five_folds_all_modalities[['Pearson r', 'p-value']] = five_folds_all_modalities['Pearson r'].str.split(',', expand=True).astype(float).round(decimals=3)
five_folds_all_modalities = five_folds_all_modalities.round(decimals=3)
#five_folds_all_modalities.to_csv('/PLS/brain/dti/pls_5_folds_all_modalities.csv', index=False)
with pd.option_context('display.max_rows', None):
    display(five_folds_all_modalities)

In [None]:
# Average the test metrics across folds, separately for each modality.
metric_cols = ['R2', 'Pearson r', 'Modality', 'MSE', 'MAE']
five_folds_all_modalities_mean = five_folds_all_modalities[metric_cols]
five_folds_all_modalities_mean = (
    five_folds_all_modalities_mean
    .groupby(['Modality'])
    .mean()
    .round(3)
    .reset_index()
)
five_folds_all_modalities_mean