### Librerías

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, accuracy_score, recall_score, f1_score, precision_score, confusion_matrix, ConfusionMatrixDisplay

import lightgbm
import xgboost

import os
from datetime import date
import pickle
import warnings
# Silence every FutureWarning for the rest of the notebook
warnings.simplefilter(action='ignore', category=FutureWarning)
# Turn off pandas' chained-assignment (SettingWithCopy) warning
pd.options.mode.chained_assignment = None

# Optional global plot styling, currently disabled
# sns.set(style='whitegrid', rc={"grid.linewidth": 0.5})
# font = {'family' : 'arial',
#         'weight' : 'normal',
#         'size'   : 22}
# plt.rc('font', **font)
plt.rcParams['figure.autolayout']= True # save the figures without clipping the axis titles

In [2]:
os.getcwd()

'/home/marcelo/GitRepos/Tesis/code'

In [3]:
# Results folders for this run: <results_dir>/<today>/mean_res
now = str(date.today())

# Root locations (desktop machine; the ssh alternatives are kept for reference)
parent_dir = "/home/marcelo/GitRepos/Tesis/" # desktop
results_dir = "/home/marcelo/GitRepos/Tesis/pred_results/" #desktop
# parent_dir = "/home/mm/Tesis/" # ssh
# results_dir = "/home/mm/Tesis/pred_results/" # ssh

path = os.path.join(results_dir, now)
path_res = os.path.join(path, 'mean_res')
# path_results = os.path.join(path, 'graphs2')

# Create the dated folder first, then its 'mean_res' child, reporting each outcome
for new_dir in (path, path_res):
    try:
        os.makedirs(new_dir)
    except FileExistsError:
        print('Directory already exsist!')
    else:
        print('Directory created!')


Directory created!
Directory created!


In [4]:
# One results sub-folder per model family, created under `path_res`
model_names = ('RandomForest', 'SVM_rbf', 'SVM_poly', 'LightGBM', 'XGBoost')
for model_name in model_names:
    path_res2 = os.path.join(path_res, model_name)
    try:
        os.makedirs(path_res2)
    except FileExistsError:
        print('Directory already exsist!')
    else:
        print('Directory created!')

Directory created!
Directory created!
Directory created!
Directory created!
Directory created!


In [5]:
# Folder holding the input files
data_path ='../data/' # Desktop
# data_path ='/home/mm/Tesis/data/' # ssh

# Integer codes for the nationality and country-of-residence columns
nat_dictionary = {'Argentina':1, 'Chilena':2, 'Colombiana': 3, 'Mexicana':4, 'PERUANA':5, 'Española':6, 'Alemana':7}
res_dictionary = {'Argentina':1, 'Chile':2, 'Colombia': 3, 'Mexico':4, 'Peru':5, 'Panamá':6}
excluded_nationalities = ['Española', 'Alemana']

df = pd.read_csv(data_path+'clean_data.csv')
# Keep only the rows that have a value in `mmse_vs`
# (the original note records a shape of 1613 x 48 at this point)
df = df.dropna(subset='mmse_vs')
df = df.drop(columns=['aod', 'yod', 'laterality', 'income_sources'])
df['years_education'] = df['years_education'].astype('float')

# Drop the Spanish and German participants, then encode nationality as integers
df = df.loc[~df['nationality'].isin(excluded_nationalities)]
df['nationality'] = df['nationality'].replace(nat_dictionary)

# Same integer encoding for the country of residence
df['country_of_residence'] = df['country_of_residence'].replace(res_dictionary)


In [6]:
# Save the data frame that will be modelled
df.to_csv("../data/data_to_model.csv", index=False) # Desktop
# df.to_csv("/home/mm/Tesis/data/data_to_model.csv", index=False) # ssh

# Print the column overview; the next expression (columns that still have
# missing values) is evaluated mid-cell, so its result is not displayed
df.info()
df.isna().sum()[df.isna().sum()>0]
# Group the rows by diagnosis for the mean imputation
grouped = df.groupby('diagnosis')

def impute_mean(group):
    """Return `group` with its missing entries replaced by the mean of `group`."""
    group_mean = group.mean()
    return group.fillna(group_mean)

# Columns that still contain at least one missing value
nan_cols = df.columns[df.isna().any()].tolist()

# Impute every such column with the mean of the participant's own diagnosis group.
# Calling `transform(impute_mean)` on the whole column, as before, filled each NaN
# with the overall column mean and never used the diagnosis grouping.
# NOTE(review): this imputation runs on the full data set, before the train/test
# split, so held-out rows contribute to the imputed values.
for col in nan_cols:
    df[col] = grouped[col].transform(impute_mean)
    # A diagnosis group with no observed value for `col` has a NaN mean; fall back
    # to the overall column mean so that no missing value survives the imputation.
    df[col] = df[col].fillna(df[col].mean())

# The encoded country columns are not used as model features
df = df.drop(['nationality', 'country_of_residence'], axis=1)
# Displayed as the cell output: expected to be empty once the imputation is done
df.isna().sum()[df.isna().sum()>0]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1611 entries, 0 to 1930
Data columns (total 44 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   site                  1611 non-null   object 
 1   id                    1611 non-null   object 
 2   diagnosis             1611 non-null   object 
 3   year_birth            1582 non-null   float64
 4   sex                   1611 non-null   float64
 5   years_education       1605 non-null   float64
 6   moca_total            620 non-null    float64
 7   aceiii_total          489 non-null    float64
 8   mmse_total            1288 non-null   float64
 9   ifs_total_score       680 non-null    float64
 10  mini_sea_total        421 non-null    float64
 11  barthel_total         383 non-null    float64
 12  pfeffer_total         443 non-null    float64
 13  cdr_sumofboxes        792 non-null    float64
 14  cdr_global            936 non-null    float64
 15  npi_total            

Series([], dtype: int64)

In [7]:
#### Libraries
# change the working directory (presumably so the project modules imported below are found)
os.chdir("/home/marcelo/GitRepos/Tesis/code") # desktop
# os.chdir("/home/mm/Tesis/code") # ssh

import py_funs
import stats_funs
import ml_hparams_clf as bhs # Bayes search
import ml_bootstrap_clf as bc # boostraping classifiers
import ml_plotting

In [8]:
####   Hold out 20% of the rows as a test set, stratified by diagnosis and site
# Stratification key: one stratum per diagnosis/site combination
df['strata'] = df.diagnosis + " / " + df.site

n_test = int(df.shape[0]*.20)
test = (
    py_funs.stratified_sample(df, ['strata'], size=n_test, seed=123, keep_index=True)
    .set_index('index')
    .rename_axis(None)
)
# Everything that was not sampled into the test set is training data
train = df.drop(index=test.index)

# Save both partitions while they still carry the identifier columns
train.to_csv("../data/train.csv", index=False) # desktop
test.to_csv("../data/test.csv", index=False) # desktop
# train.to_csv("/home/mm/Tesis/train.csv", index=False) # shh
# test.to_csv("/home/mm/Tesis/data/test.csv", index=False) #ssh

# Identifier / helper columns are not model features
id_cols = ['site','id','strata']
train = train.drop(columns=id_cols)
test = test.drop(columns=id_cols)

### AD vs FTD subset of the training partition, target encoded as AD=0 / FTD=1
data = train.query("diagnosis != 'CN'")
data['diagnosis'] = data['diagnosis'].replace({'AD':0, 'FTD':1})
train.shape

(1290, 40)

In [9]:
### Load Hyperparams
# hparams_path = "/home/mm/Tesis/pred_results/2023-07-14/mean_hparams/" # ssh

hparams_path = "/home/marcelo/GitRepos/Tesis/pred_results/2023-07-15/mean_hparams/" # Desktop

# Columns read from every search-result file in addition to the model parameters
_score_cols = ['params', 'mean_test_score', 'mean_train_score']


def _top_hparams(file_name, param_cols):
    """Read one search-result CSV and return its five best candidates.

    Only rows whose mean test score is below the mean train score are kept;
    they are ordered by mean test score, highest first.
    """
    search_res = pd.read_csv(hparams_path + file_name, usecols=param_cols + _score_cols)
    test_below_train = search_res['mean_test_score'] < search_res['mean_train_score']
    return search_res[test_below_train].sort_values('mean_test_score', ascending=False).head()


rf_hp = _top_hparams("RF_hparams.csv",
                     ['param_criterion', 'param_max_depth', 'param_max_features',
                      'param_min_samples_leaf', 'param_min_samples_split', 'param_n_estimators'])

rbf_hp = _top_hparams("SVC_RBF_hparams2.csv", ['param_C', 'param_gamma'])

poly_hp = _top_hparams("SVC_Poly_hparams.csv",
                       ['param_C', 'param_coef0', 'param_degree', 'param_gamma'])

lgbm_hp = _top_hparams("lgbm_hparams.csv",
                       ['param_learning_rate', 'param_max_depth', 'param_n_estimators',
                        'param_num_leaves', 'param_reg_alpha', 'param_reg_lambda'])

xgb_hp = _top_hparams("xgb_hparams.csv",
                      ['param_booster', 'param_learning_rate', 'param_max_bin',
                       'param_max_depth', 'param_max_leaves', 'param_n_estimators',
                       'param_reg_alpha', 'param_reg_lambda', 'param_tree_method'])

In [10]:
## Models Hparams
def _best(hp_table, column):
    """Return `column` from the first row of `hp_table`, i.e. its top-ranked candidate.

    The hyper-parameter tables are filtered and sorted by mean test score without
    resetting the index, so the best candidate is the first row *by position*.
    Label-based `.loc[0, column]` would instead return the row that was first in
    the CSV file, or raise KeyError when that row was filtered out.
    """
    return hp_table[column].iloc[0]

# RF
rf_params = {
    "n_estimators": _best(rf_hp, 'param_n_estimators'),
    "criterion": _best(rf_hp, 'param_criterion'),
    "max_depth": _best(rf_hp, 'param_max_depth'),
    "min_samples_split": _best(rf_hp, 'param_min_samples_split'),
    "min_samples_leaf": _best(rf_hp, 'param_min_samples_leaf'),
    "max_features": _best(rf_hp, 'param_max_features'),
    "class_weight":"balanced", "verbose":0, "n_jobs":-1}
#RBF
rbf_params = {
    'C': _best(rbf_hp, "param_C"),
    'gamma': _best(rbf_hp, "param_gamma"),
    "kernel":"rbf","class_weight":"balanced", "verbose":0}
#Poly
poly_params = {
    'C': _best(poly_hp, "param_C"),
    'gamma': _best(poly_hp, "param_gamma"),
    'coef0': _best(poly_hp, "param_coef0"),
    'degree': _best(poly_hp, 'param_degree'),
    "kernel":"poly","class_weight":"balanced", "verbose":0}
#LGBM
lgbm_params = {
    'num_leaves': _best(lgbm_hp, 'param_num_leaves'),
    'max_depth': _best(lgbm_hp, 'param_max_depth'),
    'learning_rate': _best(lgbm_hp, 'param_learning_rate'),
    'n_estimators': _best(lgbm_hp, 'param_n_estimators'),
    'reg_alpha': _best(lgbm_hp, 'param_reg_alpha'),
    'reg_lambda': _best(lgbm_hp, 'param_reg_lambda'),
    'subsample':1.0,
    'subsample_freq':-1,
    'objective':'binary',
    'n_jobs': -1,
    'verbose':-1}

#XGBOOST
xgb_params = {
    'booster': _best(xgb_hp, 'param_booster'),
    'tree_method': _best(xgb_hp, 'param_tree_method'),
    'max_leaves': _best(xgb_hp, 'param_max_leaves'),
    'max_depth': _best(xgb_hp, 'param_max_depth'),
    'max_bin': _best(xgb_hp, 'param_max_bin'),
    'learning_rate': _best(xgb_hp, 'param_learning_rate'),
    'n_estimators': _best(xgb_hp, 'param_n_estimators'),
    'reg_alpha': _best(xgb_hp, 'param_reg_alpha'),
    'reg_lambda': _best(xgb_hp, 'param_reg_lambda'),
    'gamma': 0.005,
    'subsample':1.0,
    'enable_categorical':True, # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
    'n_jobs': -1,
    'verbosity':0,
    'eval_metric':'auc',
    'objective':'binary:logistic',
    'use_label_encoder':None}

### RF Model

##### Bootstrap

In [11]:
# Bootstrap evaluation of the random forest on the AD-vs-FTD training subset
# (`data`, target column 'diagnosis') with the tuned `rf_params`:
# 5000 iterations, test_size 0.2, scaler option 'MM', chance_model=True.
# NOTE(review): the stored output of this cell ends in KeyboardInterrupt at
# iteration 710/5000, so in that run the seven result names were never assigned.
Model_res, CI_df, Model_res_r, ROC_AUC, CM, ROC_chance, empirical_p_value = bc.Bootstrap_tree_classifier(
    data, 'diagnosis', RandomForestClassifier(), scaler='MM', params=rf_params, test_size=.2, n_iter=5000, chance_model=True)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=14, max_features=14, min_samples_leaf=0.01,
                       min_samples_split=0.01, n_estimators=500, n_jobs=-1)


Iterating:  14%|██▍              | 710/5000 [11:25<1:09:04,  1.04it/s]


KeyboardInterrupt: 

In [None]:
path_res

In [None]:
Model_res.to_csv(path_res+ "/RandomForest/Model_res.csv", index=False)
Model_res.mean()

In [None]:
print("%.4f" % Model_res['ROC_AUC'].mean())


In [None]:
CI_df.to_csv(path_res+ "/RandomForest/CI_df.csv", index=False)
CI_df

In [None]:
Model_res_r.sort_values('Importance_mean', ascending=False).to_csv(path_res+ "/RandomForest/Model_res_r.csv", index=False)
Model_res_r.sort_values('Importance_mean', ascending=False).head(10)

In [None]:
ml_plotting.plot_custom_roc_curve(ROC_AUC, ROC_chance, fname=path_res+ "/RandomForest/ROC_AUC_curve.png")
# Fig save?


In [None]:
ml_plotting.plot_boostrap_CM(CM, labels=['AD', 'FTD'],fname=path_res+ "/RandomForest/CM.png")

##### Permutation Feature importance

In [None]:
rf_fimp = bc.permutation_FIMP( data, 'diagnosis', RandomForestClassifier(), params=rf_params, test_size=.3, n_repeats=1000, random_state=123)

In [None]:
rf_fimp.sort_values('mean', ascending=False).to_csv(path_res+ "/RandomForest/rf_PFI.csv")
rf_fimp.sort_values('mean', ascending=False).head()

In [None]:
# ml_plotting.plot_permutation_FIMP(rf_fimp, names='variables', importances_col='Importance_mean', std_col='Importance_std')

In [None]:
Model_res_r.sort_values('Importance_mean', ascending=False).head()

##### Unseen data

In [None]:
# The identifier columns were already removed when `train` and `test` were built:
# train = train.drop(['site','id','strata','nationality', 'country_of_residence'], axis=1)
# test = test.drop(['site','id','strata', 'nationality', 'country_of_residence'], axis=1)

binary_target = {'AD':0, 'FTD':1}

### Training partition without the CN group, target encoded as AD=0 / FTD=1
train_data = train.query("diagnosis != 'CN'").assign(
    diagnosis=lambda frame: frame['diagnosis'].replace(binary_target))

### Same filtering and encoding for the held-out partition
unseen_data = test.query("diagnosis != 'CN'").assign(
    diagnosis=lambda frame: frame['diagnosis'].replace(binary_target))
train_data.shape, unseen_data.shape

In [None]:
X_train, y_train = train_data.drop('diagnosis', axis=1), train_data[['diagnosis']]
X_test, y_test = unseen_data.drop('diagnosis', axis=1), unseen_data[['diagnosis']]

In [None]:
# predict & scores
# Fit the tuned random forest on the training subset and score the held-out data.
# random_state pins the forest so the unseen-data metrics are reproducible;
# a random_state present in rf_params takes precedence.
rfc = RandomForestClassifier(**{'random_state': 123, **rf_params})
# y_train is a one-column DataFrame; flatten it to the 1-D target sklearn expects
rfc.fit(X_train, np.ravel(y_train))
y_pred = rfc.predict(X_test)
# ROC AUC needs a continuous score (here the probability of class 1 = FTD).
# Hard 0/1 predictions collapse the ROC curve to a single operating point.
y_score = rfc.predict_proba(X_test)[:, 1]
roc = np.round(roc_auc_score(y_test, y_score), 4)
# The threshold-based metrics keep using the hard predictions
acc = np.round(accuracy_score(y_test, y_pred), 4)
rec = np.round(recall_score(y_test, y_pred), 4)
f1 = np.round(f1_score(y_test, y_pred), 4)
prec = np.round(precision_score(y_test, y_pred), 4)
# Integer counts: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)

In [None]:
unseen_df = pd.DataFrame([roc, acc, rec, f1, prec, cm]).T
unseen_df.columns= ['ROC AUC', 'Accuracy', 'Recall', 'F1', 'Precision', 'CM']
unseen_df.to_csv(path_res+ "/RandomForest/unseen.csv")

In [None]:
print(f"ROC score: {roc}\nAccuracy: {acc}\nRecall: {rec}\nF1: {f1}\nPrecision:{prec}")

In [None]:
print(f"Unseen Data ROC score: \t\t{roc}\nBootstrping test ROC score: \t{Model_res['ROC_AUC'].mean():.4f}")

In [None]:
# Repackage the 2x2 confusion matrix as the dict passed to plot_boostrap_CM.
# NOTE(review): sklearn's confusion_matrix puts true labels on rows and predicted
# labels on columns, so cm[0,0] counts AD (label 0) predicted as AD. Naming that
# cell 'TP' treats AD as the positive class, whereas recall/precision/F1 for this
# model use label 1 (FTD) as positive by default. Confirm which convention
# plot_boostrap_CM expects.
unseen_cm = { 'TP':cm[0,0], 'FN':cm[0,1],'FP':cm[1,0],'TN':cm[1,1]}
ml_plotting.plot_boostrap_CM(unseen_cm, labels=['AD', 'FTD'])

### SVM - RBF

##### Bootstrap

In [None]:
rbf_Model_res, rbf_CI_df, rbf_Model_res_r, rbf_ROC_AUC, rbf_CM, rbf_ROC_chance, rbf_empirical_p_value = bc.Bootstrap_SVC_classifier(
    data, 'diagnosis', SVC(), params=rbf_params, test_size=.2, scaler='MM', n_iter=5000, chance_model=True)

In [None]:
rbf_Model_res.to_csv(path_res+ "/SVM_rbf/rbf_Model_res.csv", index=False)
rbf_Model_res.mean()

In [None]:
rbf_CI_df.to_csv(path_res+ "/SVM_rbf/rbf_CI_df.csv", index=False)
rbf_CI_df

In [None]:
rbf_Model_res_r

In [None]:
ml_plotting.plot_custom_roc_curve(rbf_ROC_AUC, rbf_ROC_chance, fname=path_res+ "/SVM_rbf/rbf_ROC_AUC_curve.png")

In [None]:
ml_plotting.plot_boostrap_CM(rbf_CM, labels=['AD', 'FTD'],fname=path_res+ "/SVM_rbf/rbf_CM.png")

#### Permutation Feature importance

In [None]:
rbf_fimp = bc.permutation_FIMP( data, 'diagnosis', SVC(), params=rbf_params, test_size=.3, n_repeats=1000, random_state=123)

In [None]:
rbf_fimp.sort_values('mean', ascending=False).to_csv(path_res+ "/SVM_rbf/rbf_PFI.csv")
rbf_fimp.sort_values('mean', ascending=False)

#### Unseen data

In [None]:
### Split into AD and FTD
train_data = train.query("diagnosis != 'CN'")
train_data['diagnosis'] = train_data['diagnosis'].replace({'AD':0, 'FTD':1})

In [None]:
### Split into AD and FTD
unseen_data = test.query("diagnosis != 'CN'")
unseen_data['diagnosis'] = unseen_data['diagnosis'].replace({'AD':0, 'FTD':1})
train_data.shape, unseen_data.shape

X_train, y_train = train_data.drop('diagnosis', axis=1), train_data[['diagnosis']]
X_test, y_test = unseen_data.drop('diagnosis', axis=1), unseen_data[['diagnosis']]

In [None]:
#Predict
# Rescale the features to [0, 1] with statistics learned on the training data only.
# NOTE(review): assumes scaler='MM' in the bootstrap call means MinMaxScaler, so the
# tuned C / gamma refer to min-max scaled features. Confirm in ml_bootstrap_clf.
mm_scaler = MinMaxScaler().fit(X_train)
X_train_mm = mm_scaler.transform(X_train)
X_test_mm = mm_scaler.transform(X_test)

svc_rbf = SVC(**rbf_params)
# y_train is a one-column DataFrame; flatten it to the 1-D target sklearn expects
svc_rbf.fit(X_train_mm, np.ravel(y_train))
y_pred = svc_rbf.predict(X_test_mm)
# ROC AUC needs a continuous score: use the signed distance to the separating
# surface (positive values point to class 1 = FTD), not the hard 0/1 predictions.
y_score = svc_rbf.decision_function(X_test_mm)
roc = np.round(roc_auc_score(y_test, y_score), 4)
# The threshold-based metrics keep using the hard predictions
acc = np.round(accuracy_score(y_test, y_pred), 4)
rec = np.round(recall_score(y_test, y_pred), 4)
f1 = np.round(f1_score(y_test, y_pred), 4)
prec = np.round(precision_score(y_test, y_pred), 4)
# Integer counts: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)

In [None]:
unseen_df = pd.DataFrame([roc, acc, rec, f1, prec, cm]).T
unseen_df.columns= ['ROC AUC', 'Accuracy', 'Recall', 'F1', 'Precision', 'CM']
unseen_df.to_csv(path_res+ "/SVM_rbf/rbf_unseen.csv")

In [None]:
print(f"ROC score: {roc}\nAccuracy: {acc}\nRecall: {rec}\nF1: {f1}\nPrecision:{prec}")
print(f"Unseen Data ROC score: \t\t{roc}\nBootstrping test ROC score: \t{rbf_Model_res['ROC_AUC'].mean():.4f}")

### SVM - Poly

#### Bootstrap

In [None]:
p_Model_res, p_CI_df, p_Model_res_r, p_ROC_AUC, p_CM, p_ROC_chance, p_empirical_p_value = bc.Bootstrap_SVC_classifier(
    data, 'diagnosis', SVC(), params=poly_params, test_size=.2, scaler='MM', n_iter=5000, chance_model=True)

In [None]:
p_Model_res.to_csv(path_res+ "/SVM_poly/poly_Model_res.csv", index=False)
p_Model_res.mean()

In [None]:
p_CI_df.to_csv(path_res+ "/SVM_poly/poly_CI_df.csv", index=False)
p_CI_df.mean()

In [None]:
ml_plotting.plot_custom_roc_curve(p_ROC_AUC, p_ROC_chance, fname=path_res+ "/SVM_poly/poly_ROC_AUC_curve.png")

In [None]:
ml_plotting.plot_boostrap_CM(p_CM, labels=['AD', 'FTD'],fname=path_res+ "/SVM_poly/poly_CM.png")

#### Permutation Feature importance

In [None]:
p_fimp = bc.permutation_FIMP( data, 'diagnosis', SVC(), params=poly_params, test_size=.3, n_repeats=1000, random_state=123)
p_fimp.sort_values('mean', ascending=False).to_csv(path_res+ "/SVM_poly/poly_PFI.csv")
p_fimp.sort_values('mean', ascending=False)

#### Unseen data

In [None]:
### Split into AD and FTD
train_data = train.query("diagnosis != 'CN'")
train_data['diagnosis'] = train_data['diagnosis'].replace({'AD':0, 'FTD':1})

In [None]:
### Split into AD and FTD
unseen_data = test.query("diagnosis != 'CN'")
unseen_data['diagnosis'] = unseen_data['diagnosis'].replace({'AD':0, 'FTD':1})
train_data.shape, unseen_data.shape

In [None]:
X_train, y_train = train_data.drop('diagnosis', axis=1), train_data[['diagnosis']]
X_test, y_test = unseen_data.drop('diagnosis', axis=1), unseen_data[['diagnosis']]

In [None]:
# Rescale the features to [0, 1] with statistics learned on the training data only.
# NOTE(review): assumes scaler='MM' in the bootstrap call means MinMaxScaler, so the
# tuned kernel parameters refer to min-max scaled features. Confirm in ml_bootstrap_clf.
mm_scaler = MinMaxScaler().fit(X_train)
X_train_mm = mm_scaler.transform(X_train)
X_test_mm = mm_scaler.transform(X_test)

poly = SVC(**poly_params)
# y_train is a one-column DataFrame; flatten it to the 1-D target sklearn expects
poly.fit(X_train_mm, np.ravel(y_train))
y_pred = poly.predict(X_test_mm)
# ROC AUC needs a continuous score: use the signed distance to the separating
# surface (positive values point to class 1 = FTD), not the hard 0/1 predictions.
y_score = poly.decision_function(X_test_mm)
roc = np.round(roc_auc_score(y_test, y_score), 4)
# The threshold-based metrics keep using the hard predictions
acc = np.round(accuracy_score(y_test, y_pred), 4)
rec = np.round(recall_score(y_test, y_pred), 4)
f1 = np.round(f1_score(y_test, y_pred), 4)
prec = np.round(precision_score(y_test, y_pred), 4)
# Integer counts: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)

In [None]:
# One-row table with the unseen-data metrics of the polynomial SVM
unseen_df = pd.DataFrame([roc, acc, rec, f1, prec, cm]).T
unseen_df.columns= ['ROC AUC', 'Accuracy', 'Recall', 'F1', 'Precision', 'CM']
# Saved under its own name: the previous target "poly_PFI.csv" is the file that
# holds the permutation feature importances, which this write was overwriting.
unseen_df.to_csv(path_res+ "/SVM_poly/poly_unseen.csv")

In [None]:
print(f"ROC score: {roc}\nAccuracy: {acc}\nRecall: {rec}\nF1: {f1}\nPrecision:{prec}")
print(f"Unseen Data ROC score: \t\t{roc}\nBootstrping test ROC score: \t{p_Model_res['ROC_AUC'].mean():.4f}")

In [None]:
unseen_cm = { 'TP':cm[0,0], 'FN':cm[0,1],'FP':cm[1,0],'TN':cm[1,1]}
ml_plotting.plot_boostrap_CM(unseen_cm, labels=['AD', 'FTD'])

### LightGBM

#### Bootstrap

In [None]:
lgb_Model_res, lgb_CI_df, lgb_Model_res_r, lgb_ROC_AUC, lgbCM, lgb_ROC_chance, lgb_empirical_p_value, _ = bc.Bootstrap_LGBM_classifier(
    data, 'diagnosis',  params=lgbm_params, test_size=.2, scaler='MM', n_iter=5000, chance_model=True)

In [None]:
lgb_Model_res.to_csv(path_res+ "/LightGBM/lgb_Model_res.csv", index=False)
lgb_Model_res.mean()

In [None]:
lgb_CI_df.to_csv(path_res+ "/LightGBM/lgb_CI_df.csv", index=False)
lgb_CI_df

In [None]:
# Feature importances from the bootstrap, most important first.
# Saved under their own name: the previous target "lgb_CI_df.csv" is the file that
# holds the confidence-interval table, which this write was overwriting.
lgb_importances = lgb_Model_res_r.sort_values('Importance_mean', ascending=False)
lgb_importances.to_csv(path_res+"/LightGBM/lgb_Model_res_r.csv", index=False)
lgb_importances.head(10)

In [None]:
ml_plotting.plot_custom_roc_curve(lgb_ROC_AUC, lgb_ROC_chance, fname=path_res+ "/LightGBM/lgb_ROC.png")

In [None]:
ml_plotting.plot_boostrap_CM(lgbCM, labels=['AD', 'FTD'],fname=path_res+ "/LightGBM/lgb_CM.png")

#### Permutation Feature importance

In [None]:
lgb_fimp = bc.permutation_FIMP( data, 'diagnosis', lightgbm.LGBMClassifier(), params=lgbm_params, test_size=.3, n_repeats=1000, random_state=123)
lgb_fimp.sort_values('mean', ascending=False).to_csv(path_res+ "/LightGBM/lgb_PFI.csv")
lgb_fimp.sort_values('mean', ascending=False).head()

#### Unseen data

In [None]:
### Split into AD and FTD
train_data = train.query("diagnosis != 'CN'")
train_data['diagnosis'] = train_data['diagnosis'].replace({'AD':0, 'FTD':1})

### Split into AD and FTD
unseen_data = test.query("diagnosis != 'CN'")
unseen_data['diagnosis'] = unseen_data['diagnosis'].replace({'AD':0, 'FTD':1})
train_data.shape, unseen_data.shape

In [None]:
X_train, y_train = train_data.drop('diagnosis', axis=1), train_data[['diagnosis']]
X_test, y_test = unseen_data.drop('diagnosis', axis=1), unseen_data[['diagnosis']]

In [None]:
# Fit the tuned LightGBM classifier on the training subset and score the held-out data
lgb = lightgbm.LGBMClassifier(**lgbm_params)
# y_train is a one-column DataFrame; flatten it to a 1-D target
lgb.fit(X_train, np.ravel(y_train))
y_pred = lgb.predict(X_test)
# ROC AUC needs a continuous score (here the probability of class 1 = FTD).
# Hard 0/1 predictions collapse the ROC curve to a single operating point.
y_score = lgb.predict_proba(X_test)[:, 1]
roc = np.round(roc_auc_score(y_test, y_score), 4)
# The threshold-based metrics keep using the hard predictions
acc = np.round(accuracy_score(y_test, y_pred), 4)
rec = np.round(recall_score(y_test, y_pred), 4)
f1 = np.round(f1_score(y_test, y_pred), 4)
prec = np.round(precision_score(y_test, y_pred), 4)
# Integer counts: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)

In [None]:
# One-row table with the unseen-data metrics of the LightGBM model
unseen_df = pd.DataFrame([roc, acc, rec, f1, prec, cm]).T
unseen_df.columns= ['ROC AUC', 'Accuracy', 'Recall', 'F1', 'Precision', 'CM']
# Saved under its own name: the previous target "lgb_PFI.csv" is the file that
# holds the permutation feature importances, which this write was overwriting.
unseen_df.to_csv(path_res+ "/LightGBM/lgb_unseen.csv")

In [None]:
print(f"ROC score: {roc}\nAccuracy: {acc}\nRecall: {rec}\nF1: {f1}\nPrecision:{prec}")
print(f"Unseen Data ROC score: \t\t{roc}\nBootstrping test ROC score: \t{lgb_Model_res['ROC_AUC'].mean():.4f}")

In [None]:
unseen_cm = { 'TP':cm[0,0], 'FN':cm[0,1],'FP':cm[1,0],'TN':cm[1,1]}
ml_plotting.plot_boostrap_CM(unseen_cm, labels=['AD', 'FTD'])

### XGBoost

#### Bootstrap

In [None]:
xgb_Model_res, xgb_CI_df, xgb_Model_res_r, xgb_ROC_AUC, xgbCM, xgb_ROC_chance, xgb_empirical_p_value, _ = bc.Bootstrap_XGB_classifier(
    data, 'diagnosis',  params=xgb_params, test_size=.2, scaler='MM', n_iter=5000, chance_model=True)

In [None]:
xgb_Model_res.to_csv(path_res+ "/XGBoost/xgb_Model_res.csv", index=False)
xgb_Model_res.mean()

In [None]:
xgb_Model_res_r.sort_values('Importance_gain_mean', ascending=False).to_csv(path_res+"/XGBoost/xgb_Model_res_r.csv", index=False)
xgb_Model_res_r.sort_values('Importance_f_score_mean', ascending=False).head(10)

In [None]:
xgb_CI_df.to_csv(path_res+ "/XGBoost/xgb_CI_df.csv", index=False)
xgb_CI_df

In [None]:
ml_plotting.plot_custom_roc_curve(xgb_ROC_AUC, xgb_ROC_chance, fname=path_res+ "/XGBoost/xgb_ROC.png")

In [None]:
ml_plotting.plot_boostrap_CM(xgbCM, labels=['AD', 'FTD'],fname=path_res+ "/XGBoost/xgb_CM.png")

#### Permutation Feature importance

In [None]:
xgb_fimp = bc.permutation_FIMP( data, 'diagnosis', xgboost.XGBClassifier(), params=xgb_params, test_size=.3, n_repeats=1000, random_state=123)
xgb_fimp.sort_values('mean', ascending=False).to_csv(path_res+ "/XGBoost/xgb_PFI.csv")
xgb_fimp.sort_values('mean', ascending=False)

#### Unseen data

In [None]:
### Split into AD and FTD
train_data = train.query("diagnosis != 'CN'")
train_data['diagnosis'] = train_data['diagnosis'].replace({'AD':0, 'FTD':1})
### Split into AD and FTD
unseen_data = test.query("diagnosis != 'CN'")
unseen_data['diagnosis'] = unseen_data['diagnosis'].replace({'AD':0, 'FTD':1})
train_data.shape, unseen_data.shape

In [None]:
X_train, y_train = train_data.drop('diagnosis', axis=1), train_data[['diagnosis']]
X_test, y_test = unseen_data.drop('diagnosis', axis=1), unseen_data[['diagnosis']]

In [None]:
# Fit the tuned XGBoost classifier on the training subset and score the held-out data
xgb = xgboost.XGBClassifier(**xgb_params)
# y_train is a one-column DataFrame; flatten it to a 1-D target
xgb.fit(X_train, np.ravel(y_train))
y_pred = xgb.predict(X_test)
# ROC AUC needs a continuous score (here the probability of class 1 = FTD).
# Hard 0/1 predictions collapse the ROC curve to a single operating point.
y_score = xgb.predict_proba(X_test)[:, 1]
roc = np.round(roc_auc_score(y_test, y_score), 4)
# The threshold-based metrics keep using the hard predictions
acc = np.round(accuracy_score(y_test, y_pred), 4)
rec = np.round(recall_score(y_test, y_pred), 4)
f1 = np.round(f1_score(y_test, y_pred), 4)
prec = np.round(precision_score(y_test, y_pred), 4)
# Integer counts: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)

In [None]:
# One-row table with the unseen-data metrics of the XGBoost model
unseen_df = pd.DataFrame([roc, acc, rec, f1, prec, cm]).T
unseen_df.columns= ['ROC AUC', 'Accuracy', 'Recall', 'F1', 'Precision', 'CM']
# Saved under its own name: the previous target "xgb_PFI.csv" is the file that
# holds the permutation feature importances, which this write was overwriting.
unseen_df.to_csv(path_res+ "/XGBoost/xgb_unseen.csv")

In [None]:
print(f"ROC score: {roc}\nAccuracy: {acc}\nRecall: {rec}\nF1: {f1}\nPrecision:{prec}")
print(f"Unseen Data ROC score: \t\t{roc}\nBootstrping test ROC score: \t{xgb_Model_res['ROC_AUC'].mean():.4f}")

In [None]:
unseen_cm = { 'TP':cm[0,0], 'FN':cm[0,1],'FP':cm[1,0],'TN':cm[1,1]}
ml_plotting.plot_boostrap_CM(unseen_cm, labels=['AD', 'FTD'])