In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os as os
import sklearn

from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb

In [None]:
## Axes style passed to seaborn: white background, light grey grid drawn below the data,
## all four spines visible and tick marks switched off on every side.
## ('lines.solid_capstyle' is deliberately not overridden here.)
style_rc = {
    # colours
    'axes.facecolor': 'white',
    'axes.edgecolor': '.15',
    'axes.grid': True,
    'axes.axisbelow': True,
    'axes.labelcolor': '.15',
    'figure.facecolor': 'white',
    'grid.color': '.8',
    'grid.linestyle': '-',
    'text.color': '.15',
    'xtick.color': '.15',
    'ytick.color': '.15',
    'xtick.direction': 'out',
    'ytick.direction': 'out',
    # patches and images
    'patch.edgecolor': 'w',
    'patch.force_edgecolor': True,
    'image.cmap': 'rocket',
    # fonts
    'font.family': ['sans-serif'],
    'font.sans-serif': [
        'Arial',
        'DejaVu Sans',
        'Liberation Sans',
        'Bitstream Vera Sans',
        'sans-serif',
    ],
    # tick marks and spines
    'xtick.bottom': False,
    'xtick.top': False,
    'ytick.left': False,
    'ytick.right': False,
    'axes.spines.left': True,
    'axes.spines.bottom': True,
    'axes.spines.right': True,
    'axes.spines.top': True,
}
sns.set_style(style_rc)

## Plotting context passed to seaborn: font size and line/tick dimensions.
context_rc = {
    # text sizes
    'font.size': 17.0,
    'axes.labelsize': 'medium',
    'axes.titlesize': 'large',
    'xtick.labelsize': 'medium',
    'ytick.labelsize': 'medium',
    'legend.fontsize': 'medium',
    # line widths and marker sizes
    'axes.linewidth': 0.8,
    'grid.linewidth': 0.8,
    'lines.linewidth': 1.5,
    'lines.markersize': 6.0,
    'patch.linewidth': 1.0,
    # tick geometry
    'xtick.major.width': 0.8,
    'ytick.major.width': 0.8,
    'xtick.minor.width': 0.6,
    'ytick.minor.width': 0.6,
    'xtick.major.size': 3.5,
    'ytick.major.size': 3.5,
    'xtick.minor.size': 2.0,
    'ytick.minor.size': 2.0,
    'legend.title_fontsize': None,
}
sns.set_context(context_rc)

In [None]:
## MRI data from the Philadelphia Neurodevelopmental Cohort (PNC) study, training data for age prediction.
## The path is assembled with os.path.join: a hard-coded backslash only resolves on Windows.
X = pd.read_csv(os.path.join('processedData', 'PNCtraining.csv'))
## Map every original column name to its 1-based position as a string ('1', '2', ...)
new_column_names = {col: str(i + 1) for i, col in enumerate(X.columns)}
## Rename by reassignment instead of inplace=True so the cell does not mutate a frame in place
X = X.rename(columns=new_column_names)

In [None]:
## Use only female subjects for the age prediction model (column '235' holds the sex code).
## .copy() makes Xfem an independent frame: it is modified further down, and modifying a
## boolean-mask slice of X triggers pandas' SettingWithCopyWarning.
Xfem = X[X['235'] == 'F'].copy()

In [None]:
## Load age data and reduce to female subjects.
## The path is assembled with os.path.join: a hard-coded backslash only resolves on Windows.
PNClabels = pd.read_csv(os.path.join('processedData', 'PNClabels.csv'))
## Rows are matched to Xfem by index label.
## NOTE(review): this assumes PNClabels.csv lists the subjects in the same row order as
## PNCtraining.csv - confirm.
PNClabelsFem = PNClabels[PNClabels.index.isin(Xfem.index)]

In [None]:
## Remove the sex column from the MRI feature df.
## Reassigning the result (instead of inplace=True) avoids mutating a slice of X in place,
## which raises pandas' SettingWithCopyWarning.
Xfem = Xfem.drop(columns='235')
## Prediction target: age at scan of the female subjects
yfem = PNClabelsFem['ageAtScan']

In [None]:
## Split the female PNC data into a training part and a held-out part.
## The split size is train_test_split's default; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    Xfem,
    yfem,
    random_state=0,
)

In [None]:
## Print the largest and then the smallest age at scan of the female subjects, each divided by 12
age_at_scan = PNClabelsFem['ageAtScan']
for extreme in (age_at_scan.max(), age_at_scan.min()):
    print(extreme/12)

## Parameter tuning with grid search

In [None]:
## Candidate values per hyperparameter; every combination is evaluated (subsample has one value only)
hyperparameter_grid = dict(
    max_depth=[3, 6, 9],
    max_leaves=[0, 2, 5, 10],
    subsample=[0.5],
    learning_rate=[0.001, 0.01, 0.1, 0.5, 1, 3],
    min_child_weight=[1, 10, 100],
    n_estimators=[100, 500, 1000],
)

## Regressor with library defaults; the grid search sets the tuned parameters
xgb_model = xgb.XGBRegressor()

## Exhaustive search with 5-fold cross-validation on the training split
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid=hyperparameter_grid,
    cv=5,
    verbose=3,
)
clf.fit(X_train, y_train)

## Best cross-validated score and the parameter combination that achieved it
print(clf.best_score_)
print(clf.best_params_)

## Using best parameters in model and testing it with holdout data

In [None]:
## Training split wrapped as a labelled DMatrix for the native xgboost API
dtrain = xgb.DMatrix(data=X_train, label=y_train)

In [None]:
## Held-out split wrapped as a labelled DMatrix for the native xgboost API
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
## Fixed hyperparameters for the final model (presumably the grid-search winners - confirm
## against the printed best_params_). xgboost is already imported at the top of the notebook.
param = {'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'learning_rate': 0.01,
    'max_depth': 6,
    'max_leaves': 0,
    'min_child_weight': 10,
    'subsample': 0.5,
    'seed': 123,
    'eval_metric': 'rmse'}

## Upper bound on the number of boosting rounds; early stopping may end training sooner
num_round = 1000

## Early stopping must not watch the held-out test split: that split is used further down to
## report RMSE and correlation, so choosing the stopping point on it leaks into the evaluation.
## A validation part is therefore split off the training data and used for monitoring instead.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
dfit = xgb.DMatrix(X_fit, label=y_fit)
dval = xgb.DMatrix(X_val, label=y_val)

## Both sets are logged every round; 'eval' is listed last so that it drives early stopping
evallist = [(dfit, 'train'), (dval, 'eval')]

## Stop once the validation RMSE has not improved for 10 consecutive rounds
bst = xgb.train(param, dfit, num_round, evallist, early_stopping_rounds=10, verbose_eval=True)

In [None]:
## Persist the trained booster to disk in the working directory
bst.save_model(fname='femaleOnly.model')

In [None]:
## Load ABCD test data that was used for menarche classification.
## Paths are assembled with os.path.join: hard-coded backslashes only resolve on Windows.
Test = pd.read_csv(os.path.join('processedData', 'TestAgePredFeaturesHarmonised.csv'))
## Sort by subject; the result is reassigned rather than sorted in place
Test = Test.sort_values(by='subjectkey')
holdout_subs = Test['subjectkey']

## Load dataframe containing demographic data
MRIwithAgeSex = pd.read_csv(os.path.join('processedData', 'processedMRIDataMenarcheSubsONLYSMRI.csv'))
## Reduce the MRI data containing sex and age data to only the relevant subjects
MRIred = MRIwithAgeSex[MRIwithAgeSex['subjectkey'].isin(holdout_subs)]
## Extract the info on sex and age from that dataframe
AgeSex = MRIred[['sex_M1','interview_age_M1','subjectkey']].copy()

## Attach sex and age to the feature rows. AgeSex only contains subjects that are present in
## Test (isin filter above), so a left join states the intent: keep exactly the subjects of
## Test, in their sorted order.
## NOTE(review): subjects missing from the demographic file end up with NaN age/sex, and a
## subject listed twice there would duplicate its feature row - verify the input files.
Test = pd.merge(Test, AgeSex, how='left', on='subjectkey')

In [None]:
## Feature matrix for age prediction: everything except the subject id, sex and age columns
ABCD_X = Test.drop(columns=['subjectkey', 'sex_M1', 'interview_age_M1'])

## Replace each column name by its 1-based position as a string ('1', '2', ...).
## NOTE(review): this presumes the ABCD feature columns come in the same order as the
## feature columns the model was trained on - confirm.
new_column_names = {name: str(pos) for pos, name in enumerate(ABCD_X.columns, start=1)}
ABCD_X = ABCD_X.rename(columns=new_column_names)

## Ground-truth age taken from the merged demographic column
ABCD_y = Test['interview_age_M1']

## Unlabelled DMatrix; it is only used for prediction
dABCD = xgb.DMatrix(data=ABCD_X)

In [None]:
## Apply the trained booster to the ABCD feature matrix
ABCD_predictedAge = bst.predict(data=dABCD)

In [None]:
## Permutation test: the labels of the training data are shuffled, a model is trained on the
## shuffled data and applied to the ABCD holdout data. The resulting (random) mean absolute
## errors are compared with the error of the model that was trained on the real labels.
param = {'objective': 'reg:squarederror',
        'booster': 'gbtree',
        'learning_rate': 0.01,
        'max_depth': 6,
        'max_leaves': 0,
        'min_child_weight': 10,
        'subsample': 0.5,
        'seed': 123,
        'eval_metric': 'rmse'}

from sklearn.metrics import mean_absolute_error
from numpy import mean

## Observed MAE of the real model on the ABCD data, divided by 12
observed_accuracyTestData = mean_absolute_error(ABCD_y, ABCD_predictedAge)/12

## Number of permutations and a list to collect the permuted MAEs
n_permutations = 100
permuted_accuraciesTestData = []

## Seeded generator so that the shuffles (and therefore the p-value) are reproducible
perm_rng = np.random.default_rng(123)

## Boosting rounds per permuted model. A separate name is used so the `num_round` of the
## training cell is not overwritten.
## NOTE(review): the real model is trained with up to 1000 rounds plus early stopping, the
## permuted models with only 10 rounds - confirm that this null distribution is intended.
perm_num_round = 10

## Use the same parameters as in the original model
labels_train = np.asarray(y_train)
for _ in range(n_permutations):
    ## Shuffled-label training matrix. It gets its own name: reusing `dtrain` would replace the
    ## real training matrix with shuffled labels for every cell that is (re-)run afterwards.
    dtrain_perm = xgb.DMatrix(X_train, label = perm_rng.permutation(labels_train))
    ## train the model on random data
    rand = xgb.train(param, dtrain_perm, perm_num_round, verbose_eval=True)

    ## use the model to predict age in the ABCD data
    predsRandom = rand.predict(dABCD)

    ## calculate the MAE of the permuted model and append it to the list
    permuted_accuracy = mean_absolute_error(ABCD_y, predsRandom)/12
    permuted_accuraciesTestData.append(permuted_accuracy)


## p-value: share of permuted models whose error is as low as or lower than the observed error
## (with the usual +1 correction). The list is converted to an array first, because comparing a
## plain list with a Python float raises TypeError; the elementwise comparison only worked while
## the observed value happened to be a NumPy scalar.
permuted_errors = np.asarray(permuted_accuraciesTestData)
p_valuePermTest = (np.sum(permuted_errors <= observed_accuracyTestData) + 1) / (n_permutations + 1)

## Note: the printed "Accuracy" is the mean absolute error
print(f"Observed Accuracy: {observed_accuracyTestData}")
print(mean(permuted_accuraciesTestData))
print(f"P-value: {p_valuePermTest}")

In [None]:
## Histogram of the permuted MAEs with the observed MAE marked as a red vertical line
fig, ax = plt.subplots(constrained_layout = True)

## ax is passed explicitly so the histogram is drawn on the axes that are annotated below
sns.histplot(permuted_accuraciesTestData, bins=15, color='#A4B7D6', ax=ax)

ax.axvline(observed_accuracyTestData, color="red")

ax.set_xlabel("Mean absolute error (years)")

## The path is assembled with os.path.join: a hard-coded backslash only resolves on Windows
fig.savefig(os.path.join('Plots', 'permutationTest_AgePred1902.pdf'), dpi = 1000)

In [None]:
## Rebuild the held-out matrix for prediction. The label is kept: an unlabelled `dtest` would
## replace the labelled one from the earlier cell and break the training cell (which evaluates
## on `dtest`) if it is re-run afterwards. Labels do not affect prediction.
dtest = xgb.DMatrix(X_test, label = y_test)

In [None]:
## Predicted age for the held-out PNC subjects
y_predict_age = bst.predict(data=dtest)

In [None]:
## pearsonr is imported from the public scipy.stats namespace; scipy.stats.stats is a
## deprecated private module path
from scipy.stats import pearsonr

In [None]:
## Pearson correlation between predicted and true age on the held-out PNC split
pearsonr(x=y_predict_age, y=y_test)

In [None]:
## RMSE on the held-out PNC split. The square root is taken explicitly because the
## `squared=False` argument of mean_squared_error has been removed from newer scikit-learn
## releases; np.sqrt(MSE) works with every version.
rmse = np.sqrt(mean_squared_error(y_test, y_predict_age))
## Printed value is the RMSE divided by 12
print(f"RMSE of the base model: {rmse/12:.3f}")

In [None]:
## Predicted vs. true age on the held-out PNC split, both divided by 12 before plotting
true_age_scaled = y_test/12
predicted_age_scaled = y_predict_age/12
sn = sns.regplot(x=true_age_scaled, y=predicted_age_scaled)
sn.set_xlabel('Age at Scan')
sn.set_ylabel('Predicted Age')

In [None]:
## Brain age gap on the held-out PNC split: predicted age minus age at scan
BAG = np.subtract(y_predict_age, y_test)

In [None]:
## Distribution of the brain age gap on the held-out PNC split (gap divided by 12)
hist = sns.histplot(data=BAG/12)
hist.set_xlabel('Brain Age Gap')

In [None]:
## Pearson correlation between predicted age and interview age in the ABCD data
pearsonr(x=ABCD_predictedAge, y=ABCD_y)

In [None]:
## Scatter plot with fitted regression line: interview age (x) against predicted age (y) for
## the ABCD subjects. Unlike the PNC plot, the values are not divided by 12 here.
sns.regplot(x = ABCD_y, y = ABCD_predictedAge)

In [None]:
## RMSE of the predicted age against the interview age in the ABCD data. The square root is
## taken explicitly because the `squared=False` argument of mean_squared_error has been
## removed from newer scikit-learn releases; np.sqrt(MSE) works with every version.
rmse = np.sqrt(mean_squared_error(ABCD_y, ABCD_predictedAge))

## Printed in the unit of the age column (not divided by 12)
print(f"RMSE of the base model: {rmse:.3f}")

In [None]:
## Same RMSE as computed in the previous cell, divided by 12 before printing.
## NOTE(review): the label is identical to the unscaled print, so the two outputs can only be
## told apart by their position in the notebook.
print(f"RMSE of the base model: {rmse/12:.3f}")

In [None]:
## Absolute difference between predicted age and interview age per ABCD subject
absoluteErrors = abs(ABCD_predictedAge - ABCD_y)

## Mean absolute error: sum of the errors divided by their count, then divided by 12
MAE = sum(absoluteErrors) / len(absoluteErrors) / 12

MAE

In [None]:
## Store the prediction next to the demographic data
Test['predictedAge'] = ABCD_predictedAge
## Brain age gap per subject: predicted age minus interview age
Test['BAG'] = Test['predictedAge'].sub(Test['interview_age_M1'])

In [None]:
## Distribution of the brain age gap (predicted age minus interview age) in the ABCD subjects.
## Unlike the PNC histogram, the gap is not divided by 12 here.
hist = sns.histplot(Test['BAG'])
hist.set_xlabel('BAG')
## The model in this notebook is trained on female subjects only, with the sex column removed
## from the features, so the former title ("when sex was included as a feature") was wrong.
hist.set_title('Brain Age Gap Distribution in ABCD Menarche Subjects (female-only model)')

In [None]:
## Class probabilities from the menarche classification.
## The path is assembled with os.path.join: a hard-coded backslash only resolves on Windows.
classprobsMenarche = pd.read_csv(os.path.join('processedData', 'classprobabilitesDFfinal_independently_harmonizedScanMatchedAgePredFeaturesShrink071102.csv'))

In [None]:
## Combine the menarche class probabilities with the per-subject brain age gap;
## only subjects present in both frames are kept
BAGandMenarche = pd.merge(classprobsMenarche, Test, how='inner', on='subjectkey')

## Brain age gap (x) against the 'prob post' class probability (y)
sns.regplot(data=BAGandMenarche, x='BAG', y='prob post')

In [None]:
## Write the merged frame (including its index) to disk.
## The path is assembled with os.path.join: a hard-coded backslash only resolves on Windows.
BAGandMenarche.to_csv(os.path.join('processedData', 'BAGandMenarche.csv'))