In [None]:
import pandas as pd
import numpy as np
import shap
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from utils import bayescv, cv_scores, plot_permutation_importance

In [None]:
# Notebook-wide display settings: three decimals for floats, seaborn whitegrid theme.
pd.set_option('display.float_format', '{:.3f}'.format)
sns.set_theme(style='whitegrid')

# Run configuration read by the cells below.
thres = 1.03          # cut-off on 'fnc_ber_com_composite' (rows <= thres vs. rows > thres)
random_state = 42     # seed handed to the splits, estimators and utils helpers
n_iter = 30           # forwarded to bayescv as its third positional argument
zscore = False        # NOTE(review): not referenced in the visible cells — confirm it is still needed
bayes = True          # NOTE(review): not referenced in the visible cells — confirm it is still needed

## Load BF2 data

In [None]:
# Read the BF2 table and recode the numeric APOE genotype codes as string labels.
load_df = pd.read_csv('csv/BF2_FS.csv')
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form operates on an intermediate
# Series, raises a chained-assignment warning in pandas 2.x and leaves
# load_df untouched under Copy-on-Write.
load_df['APOE'] = load_df['APOE'].replace(
    {22.: '22', 23.: '23', 24.: '24', 33.: '33', 34.: '34', 44.: '44'}
)
# Keep complete rows only (any missing value drops the row).
load_df = load_df.dropna(how='any')
load_df.info()

In [None]:
# Build the modelling table in one chain: complete rows only, fresh 0..n-1 index,
# 'CSF Abnormal Ratio' removed, non-numeric columns one-hot encoded, everything
# cast to float64.
select_df = (
    load_df
    .dropna(how='any')
    .reset_index(drop=True)
    .drop(columns=['CSF Abnormal Ratio'])
    .pipe(pd.get_dummies)
    .astype('float64')
)
select_df.info()

In [None]:
# Partition rows at the composite threshold, then hold out 20% of each side
# separately; the remaining 80% of both sides form the train/validation pool.
neg_idx = select_df['fnc_ber_com_composite'] <= thres
pos_idx = ~neg_idx
neg_df, pos_df = select_df[neg_idx], select_df[pos_idx]
neg_tv_df, neg_test_df = train_test_split(neg_df, random_state=random_state, test_size=0.2)
pos_tv_df, pos_test_df = train_test_split(pos_df, random_state=random_state, test_size=0.2)
tv_df = pd.concat([neg_tv_df, pos_tv_df])

In [None]:
# Scatter of the two P-tau217 columns on the train/validation pool; the x values
# are multiplied by 400 before plotting.
# NOTE(review): the feature lists in the final feature-selection cell spell these
# labels with a lower-case 'p' ('Plasma %p-tau217'); column lookup is
# case-sensitive, so confirm the spelling against the CSV header.
fig, ax = plt.subplots()
ax.scatter(
    tv_df['Plasma P-tau217'].to_numpy() * 400,
    tv_df['Plasma %P-tau217'].to_numpy(),
)
ax.set_xlabel('[Plasma P-tau217] * 400')
ax.set_ylabel('[Plasma %P-tau217]')
ax.set_title('Plasma %P-tau217 against Plasma P-tau217')
plt.show()

In [None]:
# Annotated heatmap of the pairwise column correlations of the train/validation pool.
corr_matrix = tv_df.corr()
fig, ax = plt.subplots(figsize=(20, 14))
sns.heatmap(
    corr_matrix,
    ax=ax,
    annot=True,
    annot_kws={"size": 8},
    fmt=".2f",
)
plt.title('Pearson correlation matrix')
plt.show()

### Initial model selection (Section 3.2.1)

In [None]:
# Baseline comparison of ten regressors with fixed hyper-parameters.
# Every estimator is handed to cv_scores together with the train/validation pool
# (5 is presumably the number of folds — confirm in utils.cv_scores) and the
# returned frames are stacked into model_selection_df.
cls = 'model_selection'
model_list = [
    SVR(kernel='poly'),
    Ridge(alpha=0.5, random_state=random_state),
    AdaBoostRegressor(n_estimators=450, random_state=random_state),
    KNeighborsRegressor(n_neighbors=10),
    GradientBoostingRegressor(max_depth=2, n_estimators=50, random_state=random_state),
    DecisionTreeRegressor(max_depth=3, random_state=random_state),
    BaggingRegressor(n_estimators=100, random_state=random_state),
    XGBRegressor(max_depth=2, n_estimators=50, eta=0.09, random_state=random_state),
    ExtraTreesRegressor(max_depth=6, random_state=random_state),
    RandomForestRegressor(max_depth=5, random_state=random_state),
]
# Collect the per-model frames and concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies every earlier row on each iteration and
# starts from an empty frame that contributes nothing to the result.
cv_frames = []
for model in model_list:
    cv_scores_df = cv_scores(tv_df, 5, cls, None, thres, random_state, model)
    cv_frames.append(cv_scores_df)
model_selection_df = pd.concat(cv_frames)

In [None]:
# sp table 4 (presumably Supplementary Table 4 — confirm): per-model mean of the
# cross-validation columns, ordered by validation R2, best first.
(
    model_selection_df
    .groupby('Model')
    .mean()
    .sort_values(by='Val_R2', ascending=False)
)

### Initial feature selection (Section 3.2.2)

In [None]:
# Pick the estimator family used for the initial feature selection.
# `cls` is also passed on to bayescv / cv_scores in the following cells.
cls = 'extratree'
if cls == 'extratree':
    model = ExtraTreesRegressor()
elif cls == 'gradientboost':
    model = GradientBoostingRegressor()
else:
    # Fail loudly: without this branch an unrecognised `cls` would leave `model`
    # bound to whatever an earlier cell assigned (e.g. the last estimator of the
    # model-selection loop) and the search would silently tune the wrong model.
    raise ValueError(f"Unsupported cls {cls!r}; expected 'extratree' or 'gradientboost'")

In [None]:
# Target is the composite column; every other column of the pool is a feature.
y_train = tv_df['fnc_ber_com_composite']
X_train = tv_df.drop(columns=['fnc_ber_com_composite'])

In [None]:
# Hyper-parameter search via utils.bayescv on the training features/target.
# The next cell reads `opt.best_params_`, so the returned object is expected to
# expose that attribute. n_iter is presumably the number of search iterations —
# confirm in utils.bayescv.
opt = bayescv(X_train, y_train, n_iter, model, random_state=random_state, cls=cls)

In [None]:
# Use the hyper-parameters found by the search when it has been run in this
# kernel; otherwise fall back to a fixed parameter set.
try:
    best_param = dict(opt.best_params_)
except (NameError, AttributeError):
    # NameError: the search cell was not executed, so `opt` does not exist.
    # AttributeError: `opt` exists but exposes no `best_params_`.
    # Anything else (e.g. an interrupt) is no longer swallowed silently.
    best_param = {
        'max_depth': 7,
        'min_samples_leaf': 2,
        'min_samples_split': 5,
        'n_estimators': 250,
    }
best_param

In [None]:
# Cross-validate the chosen estimator family with the selected hyper-parameters
# (no explicit model is passed here, unlike in the model-selection cell) and
# display the column-wise mean of the returned frame.
cv_scores_df = cv_scores(tv_df, 5, cls, best_param, thres, random_state)
cv_scores_df.mean()

In [None]:
# Refit the chosen estimator family on the whole train/validation pool with the
# selected hyper-parameters and the notebook seed.
if cls == 'extratree':
    best_model = ExtraTreesRegressor(**best_param, random_state=random_state)
elif cls == 'gradientboost':
    best_model = GradientBoostingRegressor(**best_param, random_state=random_state)
else:
    # Fail loudly: otherwise `best_model` is either undefined or a stale object
    # from an earlier run, and the fit below would train the wrong estimator.
    raise ValueError(f"Unsupported cls {cls!r}; expected 'extratree' or 'gradientboost'")
best_model.fit(X_train, y_train)

In [None]:
# Inputs for the SHAP analysis: the pool ordered by ascending target value with
# a fresh index, split into target / features, plus the fitted model's
# predictions for those rows.
shap_df = (
    tv_df
    .sort_values(by='fnc_ber_com_composite')
    .reset_index(drop=True)
)
y_shap = shap_df['fnc_ber_com_composite']
X_shap = shap_df.drop(columns=['fnc_ber_com_composite'])
y_pred_shap = best_model.predict(X_shap)

In [None]:
# Side-by-side comparison of the fitted model's built-in feature importances
# (left panel) and the permutation importances drawn by
# utils.plot_permutation_importance (right panel), both on the training data.
mdi_importances = pd.Series(
    best_model.feature_importances_,
    index=X_train.columns,
)
# NOTE(review): the two arrays below are not used in the visible cells — confirm
# whether anything further down still needs them.
tree_importance_sorted_idx = np.argsort(best_model.feature_importances_)
tree_indices = np.arange(len(best_model.feature_importances_)) + 0.5

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 8))

# Left: built-in importances, smallest at the bottom.
mdi_importances.sort_values().plot.barh(ax=ax1)
ax1.set_xlabel("Gini importance")

# Right: permutation importances.
# NOTE(review): best_model is a regressor, so the score being permuted is
# presumably not accuracy — confirm the metric used inside the helper.
plot_permutation_importance(best_model, X_train, y_train, ax2, random_state)
ax2.set_xlabel("Decrease in accuracy score")

fig.suptitle(
    "Impurity-based vs. permutation importances on multicollinear features (train set)"
)
_ = fig.tight_layout()

In [None]:
# SHAP summary plot for the fitted tree model on the target-sorted rows (X_shap).
shap.initjs()
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_shap)
# The title is set through pyplot before summary_plot is called, so it relies on
# summary_plot drawing onto the current figure/axes — keep this call order.
plt.title('SHAP values')
shap.summary_plot(shap_values, X_shap, max_display=150, show=True, plot_size=(10,10))
# NOTE(review): summary_plot is already called with show=True, so this
# plt.show() is presumably redundant — confirm before removing.
plt.show()
# Disabled export of the figure. NOTE(review): `crop` is not defined in the
# visible cells, so these lines would need adapting before being re-enabled.
# savefig_name = crop[8:-7] + 'RF_SHAP_impact.png'
# plt.savefig(savefig_name,format='png')

### Final feature selection (Section 3.2.3)

In [None]:
# Candidate feature combinations compared in the final feature-selection loop.
# Each inner list is used to select columns from load_df by exact label, and the
# loop numbers the combinations FC = 1, 2, ... in list order (the commented-out
# entries are skipped and do not consume a number).
# NOTE(review): the scatter-plot cell indexes 'Plasma P-tau217' and
# 'Plasma %P-tau217' with a capital 'P', while the labels here use a lower-case
# 'p'. Column lookup is case-sensitive, so confirm the spelling against the CSV
# header — one of the two spellings will raise a KeyError.
feature_list = [
    ['Plasma %p-tau217', 'CSF AB42/AB40', 'Age', 'APOE'],
    # ['Plasma p-tau217', 'CSF AB42/AB40', 'Age', 'APOE'],
    # ['CSF p-tau217', 'CSF AB42/AB40', 'Age', 'APOE'],
    ['Plasma %p-tau217', 'Age', 'APOE'],
    ['CSF AB42/AB40', 'Age', 'APOE'],
    ['Age', 'APOE'],
    ['Plasma %p-tau217', 'Plasma AB42/AB40', 'Age', 'APOE'],
    ['Plasma AB42/AB40', 'Age', 'APOE'],
    ['Plasma %p-tau217', 'CSF AB42/AB40', 'Age', 'APOE', 'Plasma %p-tau181', 'Plasma %p-tau205', 'Plasma p-tau231'],
    ['CSF p-tau217', 'CSF AB42/AB40', 'Age', 'APOE', 'Plasma %p-tau181', 'Plasma %p-tau205', 'Plasma p-tau231'],
    ['CSF AB42/AB40', 'Age', 'APOE', 'Plasma %p-tau181', 'Plasma %p-tau205', 'Plasma p-tau231'],
    ['Plasma %p-tau217', 'CSF AB42/AB40', 'Age', 'APOE', 'ADAS', 'Cognitive status'],
]

In [None]:
# Compare the candidate feature combinations: for every seed and every entry of
# feature_list, rebuild the modelling table from load_df, redo the
# threshold-wise 80/20 split, tune the estimator with bayescv and record the
# mean of the cv_scores output as one row of table_df.
#
# NOTE(review): `cls` and `n_iter` are rebound here on purpose for this
# experiment, and select_df / tv_df / X_train / y_train / model / opt /
# best_param / cv_scores_df are overwritten on every iteration, so cells above
# this one see different values if they are re-run afterwards.
cls = 'gradientboost'
n_iter = 5
fc_rows = []
# The loop variable is deliberately NOT called `random_state`: reusing that name
# overwrote the notebook-level seed (42) with the last loop value (0).
for seed in tqdm(range(1)):
    # FC is the 1-based position of the combination in feature_list.
    for idx, features in enumerate(feature_list, start=1):
        select_df = load_df[features + ['fnc_ber_com_composite']]
        select_df = pd.get_dummies(select_df).astype('float64')
        if cls == 'extratree':
            model = ExtraTreesRegressor()
        elif cls == 'gradientboost':
            model = GradientBoostingRegressor()
        # Split at the composite threshold and hold out 20% of each side.
        neg_idx = select_df['fnc_ber_com_composite'] <= thres
        pos_idx = ~neg_idx
        neg_df = select_df[neg_idx]
        pos_df = select_df[pos_idx]
        neg_tv_df, neg_test_df = train_test_split(neg_df, test_size=0.2, random_state=seed)
        pos_tv_df, pos_test_df = train_test_split(pos_df, test_size=0.2, random_state=seed)
        tv_df = pd.concat([neg_tv_df, pos_tv_df])

        X_train = tv_df.drop(['fnc_ber_com_composite'], axis=1)
        y_train = tv_df['fnc_ber_com_composite']

        opt = bayescv(X_train, y_train, n_iter, model, random_state=seed, cls=cls)
        best_param = dict(opt.best_params_)
        ## cross-validation
        cv_scores_df = cv_scores(tv_df, 5, cls, best_param, thres, seed)
        # One-row frame: seed and FC id followed by the mean CV columns.
        row_df = pd.concat(
            [pd.DataFrame([seed, idx], index=['random_state', 'FC']), cv_scores_df.mean()]
        ).T
        fc_rows.append(row_df)

# Stack all rows once instead of re-concatenating inside the loop.
table_df = pd.concat(fc_rows, axis=0)
table_df['random_state'] = table_df['random_state'].astype(int)
table_df['FC'] = table_df['FC'].astype(int)
table_df = table_df.drop(['kfold'], axis=1)

In [None]:
# Renumber the stacked per-combination rows 0..n-1 (the old index is discarded)
# and display the resulting table.
result_df = table_df.reset_index(drop=True)
result_df

In [None]:
# result_df.to_csv('sp_figure_10_gbdt.csv')

In [None]:
# Box plot of validation R2 per feature combination; the palette starts at the
# fourth colour of seaborn's 'tab10'.
sns.set_theme(style='whitegrid', palette=sns.color_palette('tab10')[3:])
fig = plt.figure(figsize=(8, 6))
ax = sns.boxplot(
    data=result_df,
    x='FC',
    y='Val_R2',
    width=0.5,
    whis=1.5,
    linewidth=1.,
)
# Alternative metric, kept disabled:
# sns.boxplot(data=result_df,x='FC',y='Val_MAPE',width=0.5,whis=1.5,linewidth = 1.)
ax.set_title('R squared scores derived from GBDT trained with different feature combinations')
ax.set_xlabel('Feature combination')
ax.set_ylabel('R squared')
plt.show()