## Prerequisites

### Import libraries

In [330]:
# Import libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
from sklearn.model_selection import train_test_split
# from google.colab import output

from pandas import DatetimeIndex as dt
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
# from google.colab import files
import IPython
from IPython.display import HTML, display, clear_output 
# from google.colab import drive
import sys

# hyper-parameters optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# metrics
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as TP_rate                          
from sklearn.metrics import roc_auc_score as roc_auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score as recall
from sklearn.metrics import average_precision_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer,fbeta_score
from sklearn.model_selection import StratifiedKFold


# classifiers
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostRegressor, RandomForestRegressor, GradientBoostingClassifier, StackingClassifier, VotingClassifier #
from sklearn.tree import DecisionTreeClassifier     #
from sklearn.svm import SVC                                    # both linear and radial classification
from sklearn.neighbors import KNeighborsClassifier             # k=3
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import catboost
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingClassifier

# statistics
from scipy.stats import shapiro
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu

# imputations
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.base import clone

# feature selection
from sklearn.feature_selection import chi2, mutual_info_classif, f_classif, SelectKBest, RFE, RFECV, SequentialFeatureSelector
from scipy.stats import kendalltau, spearmanr
from sklearn.linear_model import Lasso
from sklearn.model_selection import StratifiedKFold
from mrmr import mrmr_classif

# to conver string to dict
import ast

# Interpretability
 # !pip install interpret
from interpret.blackbox import LimeTabular
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
set_visualize_provider(InlineProvider())
from interpret import show

import lime
import lime.lime_tabular
from __future__ import print_function

# ignore warnings when graphs are plotted
import warnings
warnings.filterwarnings('ignore')

### Data import

In [331]:
# links to datasets

# lancet

link_train_lancet = './HSE project/Preprocessed Data/lancet dataset/train_abc_lancet.xlsx'
link_test_lancet  = './HSE project/Preprocessed Data/lancet dataset/test_abc_lancet.xlsx'

# death
link_train_death_a   = './HSE project/Preprocessed Data/cardiovascular death/train_a.xlsx'
link_test_death_a    = './HSE project/Preprocessed Data/cardiovascular death/test_a.xlsx'
link_train_death_b   = './HSE project/Preprocessed Data/cardiovascular death/train_b.xlsx'
link_test_death_b    = './HSE project/Preprocessed Data/cardiovascular death/test_b.xlsx'
link_train_death_c   = './HSE project/Preprocessed Data/cardiovascular death/train_c.xlsx'
link_test_death_c    = './HSE project/Preprocessed Data/cardiovascular death/test_c.xlsx'
link_train_death_abc = './HSE project/Preprocessed Data/cardiovascular death/train_abc.xlsx'
link_test_death_abc  = './HSE project/Preprocessed Data/cardiovascular death/test_abc.xlsx'

# combined
link_train_combined_a   = './HSE project/Preprocessed Data/combined/train_a.xlsx'
link_test_combined_a    = './HSE project/Preprocessed Data/combined/test_a.xlsx'
link_train_combined_b   = './HSE project/Preprocessed Data/combined/train_b.xlsx'
link_test_combined_b    = './HSE project/Preprocessed Data/combined/test_b.xlsx'
link_train_combined_c   = './HSE project/Preprocessed Data/combined/train_c.xlsx'
link_test_combined_c    = './HSE project/Preprocessed Data/combined/test_c.xlsx'
link_train_combined_abc = './HSE project/Preprocessed Data/combined/train_abc.xlsx'
link_test_combined_abc  = './HSE project/Preprocessed Data/combined/test_abc.xlsx'

# revascularization
link_train_revascularization_a   = './HSE project/Preprocessed Data/revascularization/train_a.xlsx'
link_test_revascularization_a    = './HSE project/Preprocessed Data/revascularization/test_a.xlsx'
link_train_revascularization_b   = './HSE project/Preprocessed Data/revascularization/train_b.xlsx'
link_test_revascularization_b    = './HSE project/Preprocessed Data/revascularization/test_b.xlsx'
link_train_revascularization_c   = './HSE project/Preprocessed Data/revascularization/train_c.xlsx'
link_test_revascularization_c    = './HSE project/Preprocessed Data/revascularization/test_c.xlsx'
link_train_revascularization_abc = './HSE project/Preprocessed Data/revascularization/train_abc.xlsx'
link_test_revascularization_abc  = './HSE project/Preprocessed Data/revascularization/test_abc.xlsx'

### Tuning of hyper-parameters

#### Grids of hyper-parameters

In [332]:
# hyper-parameters for gridsearchCV

# 1. Logistic regression
parameters_LR_model = dict(
                          C = [0.001, 0.01, 0.1, 1.],  # defeult
                          tol = [1.e-4],
                          penalty = ['l2', 'none'], #'elasticnet', 'l1', 
                          # njobs = [-1],
                          dual = [False],
                          fit_intercept = [False],
                          # intercept_scaling =
                          class_weight = ['balanced', None],
                          random_state = [10],
                          # solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                          max_iter = [10000],
                          multi_class = ['auto'],
                          verbose = [0],
                          warm_start = [True]
                          # l1_ratio
                          )

# 2. Random Forest
parameters_random_forest_model = dict(
                  n_estimators = [int(x) for x in np.linspace(start = 50, stop = 400, num = 10)],
                  criterion = ['gini'],
                  max_depth = [*[int(x) for x in np.linspace(2, 10, num = 5)]],
                  min_samples_split = [2,4],  
                  # min_samples_leaf = [1,2],  
                  min_weight_fraction_leaf = [0.0],
                  max_features = ['sqrt'],  
                  max_leaf_nodes = [None],
                  min_impurity_decrease = [0.],
                  bootstrap = [True],
                  oob_score = [False],
                  n_jobs = [-1],
                  random_state = [10],
                  verbose = [0],
                  warm_start = [True],
                  class_weight = ['balanced', 'balanced_subsample', None],
                  # ccp_alpha = 
                  max_samples = [None]  # maybe =0.1 here for getting almost independent samples for trees
                  )

# 3. k-NN
parameters_knn = dict(
                      n_neighbors = [int(x) for x in np.linspace(start = 1, stop = 7, num = 7)],
                      weights = ['uniform', 'distance'],
                      algorithm = ['ball_tree', 'kd_tree', 'brute'], 
                      leaf_size = [15, 30, 60],
                      p = [3], 
                      metric = ['chebyshev', 'minkowski', 'euclidean', 'manhattan'],
                      # metric_params = 
                      n_jobs = [-1]
                      )

# 4. SVM
parameters_svm = dict(
                  C = [int(x) for x in np.linspace(start = 1, stop = 25, num = 7)],
                  kernel = ['rbf', 'linear', 'poly', 'sigmoid'],
                  degree = [3, 4, 5],
                  gamma = ['scale', 'auto'],
                  coef0 = [0.0],
                  shrinking = [True, False],
                  probability = [True],
                  tol = [1.e-3], 
                  cache_size = [200],
                  class_weight = ['balanced', None],
                  verbose = [False],
                  max_iter = [1.e6],  # мб поставить конечные итерации, как в Logistic Regression
                  # decision_function_shape = [],
                  # break_ties = [],
                  random_state = [10]
                  )

# 5. CatBoost
catboost_parameters = {'depth': [4,6,8,10],  # larger depth is preferable
              'learning_rate': [0.1,0.2,0.3],
              # 'l2_leaf_reg': [0,3,6,1],
              }
c_boost_params = {'eval_metric' : 'F1', # 'F1' my_f2_scorer, 'F'
                  # 'beta' : 2,
                  'verbose' : False,
                  'early_stopping_rounds' : 100,
                  #cat_features=cat_features,
                  'task_type' : "CPU",
                  'iterations' : 500,
                  'random_seed' : 10}

#### Tuning functions

In [333]:
def tuning(score, catboost_score, cross_validation, path, logistic_regression, knn, random_forest, svm, catboost):
    # score = my_f2_scorer(), 'f1', 'accuracy', 'precision', 'recall', 'roc_auc'
    # catboost_score = 'F1' 'F:beta=2'
    if logistic_regression:
        logistic_regression_tuning(score, cross_validation, path)     

    if knn:
        knn_tuning(score, cross_validation, path)     
    if random_forest:
        random_forest_tuning(score, cross_validation, path)     
    if svm:
        svm_tuning(score, cross_validation, path)     
    if catboost:
        catboost_tuning(catboost_score, cross_validation, path)     

In [334]:
def logistic_regression_tuning(score, cross_validation, path):

    # LogisticRegression: 
    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

    LR_model = LogisticRegression(random_state=10)

    # calibrate hyper-parameters: perform gridsearch with cross-validation
    clf = GridSearchCV(
                      estimator = LR_model, 
                      param_grid = parameters_LR_model,
                      scoring = score,    
                      #  refit = my_f2_scorer,
                      cv = cross_validation,
                      n_jobs = -1
                      )              
    %time clf.fit(X_train, y_train)
    LR_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(LR_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(LR_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(LR_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}LogisticRegression_optimisation.xlsx')

In [335]:
def knn_tuning(score, cross_validation, path):

    # KNeighborsClassifier: 
    # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

    knn_model = KNeighborsClassifier()

    # calibrate hyper-parameters: perform gridsearch with cross-validation = 5 
    clf = GridSearchCV(
                      estimator=knn_model, 
                      param_grid=parameters_knn,
                      scoring=score,
                      #  refit=my_f2_scorer,
                      cv=cross_validation,
                      n_jobs=-1
                      )              
    %time clf.fit(X_train, y_train)
    knn_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(knn_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(knn_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(knn_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}knn_optimisation.xlsx')

In [336]:
def random_forest_tuning(score, cross_validation, path):
      
    # RandomForestClassifier: 
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

    random_forest_model = RandomForestClassifier(random_state=10)

    # calibrate hyper-parameters: perform gridsearch with cross-validation = 5 
    clf = GridSearchCV(
                      estimator=random_forest_model, 
                      param_grid=parameters_random_forest_model,
                      scoring=score,  
                      #  refit=my_f2_scorer,
                      cv=cross_validation,
                      n_jobs=-1
                      )              
    %time clf.fit(X_train, y_train)
    random_forest_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(random_forest_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(random_forest_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(random_forest_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}randomforest_optimisation.xlsx')

In [337]:
def svm_tuning(score, cross_validation, path):
      
    # SVM_model
    # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC 

    SVM_model = SVC()

    # calibrate hyper-parameters: perform gridsearch with cross-validation = 5 
    clf = GridSearchCV(
                      estimator=SVM_model, 
                      param_grid=parameters_svm,
                      scoring=score,  
                      # refit=score[0],
                      cv=cross_validation,
                      n_jobs=-1
                      )              
    %time clf.fit(X_train, y_train)
    SVM_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(SVM_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(SVM_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(SVM_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}svm_optimisation.xlsx')
    # files.download("/content/svm_optimisation.xlsx")

In [338]:
def catboost_tuning(catboost_score, cross_validation, path):
    # Catboost
    # tuning: https://catboost.ai/en/docs/concepts/parameter-tuning


    # calibrate hyper-parameters: perform gridsearch with cross-validation = 5 

    np.random.seed(10)
    catboost = CatBoostClassifier(
                                eval_metric=catboost_score,
                                verbose=False,
                                early_stopping_rounds=100,
                                #cat_features=cat_features,
                                task_type="CPU",
                                iterations = 500,
                                random_seed=10)


    grid_res = catboost.grid_search(catboost_parameters,
                                    X_train,
                                    y_train,
                                    cv=cross_validation,
                                    search_by_train_test_split=True,
                                    calc_cv_statistics=True,
                                    refit=True,
                                    shuffle=True,
                                    partition_random_seed=10,
                                    verbose=True,
                                    stratified=True)

    # save optimisation parameters
    cv_results = pd.DataFrame(grid_res['cv_results'])
    cv_results['params'] = 0
    cv_results['params'][0:3] = str(grid_res['params'])


    # add roc_auc fCV values
    cv_results['roc_auc'] = str(cross_val_score(catboost, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    cv_results['roc_auc_mean'] = np.mean(cross_val_score(catboost, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    cv_results['roc_auc_std'] = np.std(cross_val_score(catboost, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))

    cv_results.to_excel(f'{path}catboost_optimisation.xlsx')

#### Function: Optimised table of metrics 

In [339]:
def optimised_metrics_table(model_name):
    optimised_metrics = []
    # optimised_metrics.append(mcc(y_test, forecast))        #.round(3)                          # MCC
    optimised_metrics.append(f1(y_test, forecast).round(3))                                    # F1
    optimised_metrics.append(f2_func(y_test, forecast).round(3))                               # F2
    optimised_metrics.append(accuracy(y_test, forecast).round(3))                              # Accuracy
    optimised_metrics.append(TP_rate(y_test, forecast).round(3))                               # TP rate
    optimised_metrics.append(recall(y_test, forecast).round(3))                                # TN rate
    precision, recall_, thresholds = precision_recall_curve(y_test, forecast_proba)                  # ------
    optimised_metrics.append(auc(recall_, precision).round(3))                                 # PR AUC
    optimised_metrics.append(roc_auc(y_test, forecast_proba).round(3))                               # ROC AUC
    optimised_metrics.append(confusion_matrix(y_test, forecast).ravel()[0])                    # number of true negative
    optimised_metrics.append(confusion_matrix(y_test, forecast).ravel()[1])                    # number of false positive
    optimised_metrics.append(confusion_matrix(y_test, forecast).ravel()[2])                    # number of false negative
    optimised_metrics.append(confusion_matrix(y_test, forecast).ravel()[3])                    # number of true positive

    optimised_metrics = pd.DataFrame(optimised_metrics, columns=[model_name])
    # add rows names
    optimised_metrics.index = [
                              #  "MCC",
                               "F1", "F2",
                               "Accuracy",
                               "Precision",
                               "Recall",
                               "PR_AUC",
                               "ROC_AUC",
                               "TN", "FP", "FN", "TP"
                                ]
    optimised_metrics = optimised_metrics.T
    
    return optimised_metrics

#### Function metric_table - visualise model scores

In [340]:
'''
This function provides scores for gridsearch F1-score and metrics for test dataset
'''

def metric_table(path):  #, X_train=X_train, y_train=y_train

    # read gridsearch tables
    randomforest_optimisation = pd.read_excel(f'{path}randomforest_optimisation.xlsx', header=[0]) #/content/  ./imp_feat
    svm_optimisation = pd.read_excel(f'{path}svm_optimisation.xlsx', header=[0])
    knn_optimisation = pd.read_excel(f'{path}knn_optimisation.xlsx', header=[0])
    LogisticRegression_optimisation = pd.read_excel(f'{path}LogisticRegression_optimisation.xlsx', header=[0])
    catboost_optimisation = pd.read_excel(f'{path}catboost_optimisation.xlsx', header=[0])

    params = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
    params = ast.literal_eval(params[0])
    random_forest_model = RandomForestClassifier(**params)
    # 
    params = svm_optimisation[svm_optimisation['rank_test_score']==1][["params"]].iloc[0]
    params = ast.literal_eval(params[0])
    SVM_model = SVC(**params)
    # 
    # params = nn_optimisation[nn_optimisation['rank_test_score']==1][["params"]].iloc[0]
    # params = ast.literal_eval(params[0])
    # newral_network_model = MLPClassifier(**params)
    # 
    params = knn_optimisation[knn_optimisation['rank_test_score']==1][["params"]].iloc[0]
    params = ast.literal_eval(params[0])
    knn_model = KNeighborsClassifier(**params)
    # 
    params = LogisticRegression_optimisation[LogisticRegression_optimisation['rank_test_score']==1][["params"]].iloc[0]
    params = ast.literal_eval(params[0])
    LR_model = LogisticRegression(**params)
    # 
    params = catboost_optimisation['params'][0]
    params = ast.literal_eval(params)
    catboost_model = CatBoostClassifier(**c_boost_params, **params)

    models = [
    random_forest_model,
    SVM_model,
    # newral_network_model,
    LR_model,
    knn_model,
    catboost_model
    ]

    mcc_score, f1_score,f2_score, accuracy_score, TP_rate_score, recall_score, auc_precision_recall, roc_auc_score= [], [], [], [], [], [], [], []
    tn, fp, fn, tp = [], [], [], []

    for model in models:
        model.fit(X_train, y_train)
        forecast = model.predict(X_test)
        forecast_proba = model.predict_proba(X_test)

        # mcc_score.append(mcc(y_test, forecast))                                   # MCC
        f1_score.append(f1(y_test, forecast))                                       # F1
        f2_score.append(f2_func(y_test, forecast))                                  # F1
        accuracy_score.append(accuracy(y_test, forecast))                           # Accuracy  
        TP_rate_score.append(TP_rate(y_test, forecast))                             # TP rate   tp / (tp + fp)
        recall_score.append(recall(y_test, forecast))                               # TN rate
        auc_precision_recall.append(average_precision_score(y_test, forecast_proba[:,1]))      # PR AUC
        roc_auc_score.append(roc_auc(y_test, forecast_proba[:,1]))                       # ROC AUC
        tn.append(confusion_matrix(y_test, forecast).ravel()[0])                  # number of true negative
        fp.append(confusion_matrix(y_test, forecast).ravel()[1])                  # number of false positive
        fn.append(confusion_matrix(y_test, forecast).ravel()[2])                  # number of false negative
        tp.append(confusion_matrix(y_test, forecast).ravel()[3])                  # number of true positive

    # create matrix table 
    metrics_table = pd.DataFrame(columns=pd.MultiIndex.from_product([["F2, train set, cv=5"],["mean", 'std']]))
    # metrics_table[("Scores on the test set","MCC")] = mcc_score
    metrics_table[("Scores on the test set","F1")] = f1_score
    metrics_table[("Scores on the test set","F2")] = f2_score
    metrics_table[("Scores on the test set","Accuracy")] = accuracy_score
    metrics_table[("Scores on the test set","Precision")] = TP_rate_score
    metrics_table[("Scores on the test set","Recall")] = recall_score
    metrics_table[("Scores on the test set","PR_AUC")] = auc_precision_recall
    metrics_table[("Scores on the test set","ROC_AUC")] = roc_auc_score
    metrics_table[("Confusion matrix","TN")] = tn
    metrics_table[("Confusion matrix","FP")] = fp
    metrics_table[("Confusion matrix","FN")] = fn
    metrics_table[("Confusion matrix","TP")] = tp

    # modify the rows names
    metrics_table.index = [
                "Random Forest",
                "SVM",
                # "Multi-layer Perceptron",
                "Logistic Regression",
                "KNN",
                "CatBoost"
                ]



    # add cross validated F2 scores on the train set
    mean = []
    std = []
    mean_test_f1,std_test_f1 = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    mean.append(mean_test_f1); std.append(std_test_f1)
    mean_test_f1,std_test_f1 = svm_optimisation[svm_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    mean.append(mean_test_f1); std.append(std_test_f1)
    # mean_test_f1,std_test_f1 = nn_optimisation[nn_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    # mean.append(mean_test_f1); std.append(std_test_f1)
    mean_test_f1,std_test_f1 = LogisticRegression_optimisation[LogisticRegression_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    mean.append(mean_test_f1); std.append(std_test_f1)
    mean_test_f1,std_test_f1 = knn_optimisation[knn_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    mean.append(mean_test_f1); std.append(std_test_f1)
    mean_test_f1,std_test_f1 = catboost_optimisation[['test-F:beta=2-mean', 'test-F:beta=2-std']].iloc[catboost_optimisation.shape[0]-1]
    mean.append(mean_test_f1); std.append(std_test_f1)
    

    metrics_table[("F2, train set, cv=5","mean")] = mean
    metrics_table[("F2, train set, cv=5","std")] = std

    # add cross validated F2 scores on the train set
    mean_roc_auc = []
    std_roc_auc = []
    mean_test_roc_auc,std_test_roc_auc = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)
    mean_test_roc_auc,std_test_roc_auc = svm_optimisation[svm_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)
    # mean_test_roc_auc,std_test_roc_auc = nn_optimisation[nn_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    # mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)
    mean_test_roc_auc,std_test_roc_auc = LogisticRegression_optimisation[LogisticRegression_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)
    mean_test_roc_auc,std_test_roc_auc = knn_optimisation[knn_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)
    mean_test_roc_auc,std_test_roc_auc = catboost_optimisation[["roc_auc_mean","roc_auc_std"]].iloc[catboost_optimisation.shape[0]-1]
    mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)

    metrics_table[("ROC_AUC, train set, cv=5","mean")] = mean_roc_auc
    metrics_table[("ROC_AUC, train set, cv=5","std")] = std_roc_auc

    return metrics_table

#### Define $F_2$ metric

In [341]:
def f2_func(y_true, y_pred):
    f2_score = fbeta_score(y_true, y_pred, beta=2.)
    return f2_score

def my_f2_scorer():
    return make_scorer(f2_func)

### Feature selection

In [342]:
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# clf = RandomForestClassifier(max_depth=2, random_state=0)
# clf.fit(X_train, y_train)


# sfs1 = SFS(clf, 
#            k_features=10, 
#            forward=True, 
#            floating=False, 
#            verbose=2,
#            scoring='roc_auc',
#            cv=5,
#            n_jobs=-1)

# sfs1 = sfs1.fit(X_train, y_train)
# sfs1.subsets_

#### random_forest_importances

In [343]:
def random_forest_importances(path, n_features = 20, biomarkers=True, save=True):
    feature_importances = pd.DataFrame()

    randomforest_optimisation = pd.read_excel(f'{path}randomforest_optimisation.xlsx', header=[0])


    # feature_importances.columns = ['Feature']
    params = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
    params = ast.literal_eval(params[0])
    random_forest_model = RandomForestClassifier(**params)
    random_forest_model.fit(X_train, y_train)
    feature_importances["RandomForest"] = pd.Series(random_forest_model.feature_importances_)

    if biomarkers:
        feats = list(map(lambda x: x, list(X_train.columns)))
    else:
        feats = list(map(lambda x: x[1], list(X_train.columns)))

    feature_importances.index = feats

    feature_importances = feature_importances.sort_values("RandomForest", ascending=False)
    feature_importances['RandomForest'] = feature_importances['RandomForest']/feature_importances['RandomForest'][0]
    print()



    fig = px.bar(
        x='RandomForest',
                 data_frame=feature_importances['RandomForest'][:n_features][::-1],
                 y=feature_importances.index[:n_features][::-1])
    # figure size
    fig.update_layout(
        autosize=False,
        width=1000,
        height=450,) 
    fig.update_xaxes(title='Relative importance')
    fig.update_yaxes(title='')

    fig.update_layout(
                  # xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False},
                  # bargap=0.30,
                  # bargroupgap=0.3,
                  # legend=dict(orientation="v", title='Datasets'), 
                  title=dict(text='Feature importance', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

    fig.show(renderer='colab')


    if save:
        name = path.split("/")[-3]
        fig.write_image(f"{results_path}importance {name}.pdf", engine="kaleido")


    return feature_importances

#### Function: upload_models(x_data, y_data, path, model_list)

In [344]:
def upload_models(x_data, y_data, path, model_list):
    'Return list of trained models'

    models = []
    model_names = []

    if model_list['SVM']:
        svm_optimisation = pd.read_excel(f'{path}svm_optimisation.xlsx', header=[0])
        params = svm_optimisation[svm_optimisation['rank_test_score']==1][["params"]].iloc[0]
        params = ast.literal_eval(params[0])
        SVM_model = SVC(**params)
        SVM_model.fit(X_train, y_train)
        models.append(SVM_model)
        model_names.append('SVM')

    if model_list['Logistic']:
        LogisticRegression_optimisation = pd.read_excel(f'{path}LogisticRegression_optimisation.xlsx', header=[0])
        params = LogisticRegression_optimisation[LogisticRegression_optimisation['rank_test_score']==1][["params"]].iloc[0]
        params = ast.literal_eval(params[0])
        LR_model = LogisticRegression(**params)
        LR_model.fit(X_train, y_train)
        models.append(LR_model)
        model_names.append('Logistic')

    if model_list['RandomForest']:
        randomforest_optimisation = pd.read_excel(f'{path}randomforest_optimisation.xlsx', header=[0])
        params = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
        params = ast.literal_eval(params[0])
        random_forest_model = RandomForestClassifier(**params)
        random_forest_model.fit(X_train, y_train)
        models.append(random_forest_model)
        model_names.append('RandomForest')

    if model_list['KNN']:
        knn_optimisation = pd.read_excel(f'{path}knn_optimisation.xlsx', header=[0])
        params = knn_optimisation[knn_optimisation['rank_test_score']==1][["params"]].iloc[0]
        params = ast.literal_eval(params[0])
        knn_model = KNeighborsClassifier(**params)
        knn_model.fit(X_train, y_train)
        models.append(knn_model)
        model_names.append('KNN')
        # https://catboost.ai/en/docs/concepts/fstr
    
    if model_list['Catboost']:
        catboost_optimisation = pd.read_excel(f'{path}catboost_optimisation.xlsx', header=[0])
        params = catboost_optimisation['params'][0]
        params = ast.literal_eval(params)
        catboost_model = CatBoostClassifier(**c_boost_params, **params)
        catboost_model.fit(X_train, y_train)
        models.append(catboost_model)
        model_names.append('Catboost')
        # https://catboost.ai/en/docs/concepts/fstr



    return models, model_names

#### Function: feature_selection(dataset, x_data, y_data, path)

In [345]:
# def feature_selection(x_data, y_data, path):
#     'Return dataset with ranged selected features'

#     # get list of all column names and continuous column names
#     '___________________________________________________________________________'
#     all_cols = list(x_data.columns)
#     continuous_cols = [col for col in x_data.columns if (len((x_data[col].unique())) >= 7)]
#     feature_selection_dataset = pd.DataFrame(columns=pd.MultiIndex.from_product([["LASSO"],["coef"]]))


#     # LASSO
#     '___________________________________________________________________________'
#     search = GridSearchCV(Lasso(),
#                           {'alpha':np.linspace(0.1, 1, num=10)**2}, #  np.linspace(0.1, 1, num=10)**2 np.arange(0.1,10,0.1)
#                           cv = 5, 
#                           scoring=my_f2_scorer(),  #"neg_mean_squared_error" my_f2_scorer() 'f1'
#                           verbose=0
#                           )

#     search.fit(X_train, y_train)
#     feature_selection_dataset['LASSO', 'coef'] = np.abs(search.best_estimator_.coef_)
#     # feature_selection_dataset['LASSO', 'coef'][feature_selection_dataset['LASSO', 'coef']>0] = 1
#     print("Calculated LASSO")


#     # get all trained models
#     '___________________________________________________________________________'
#     models, model_names = upload_models(x_data = x_data,
#                                         y_data = y_data,
#                                         path = path, 
#                                         model_list = {'SVM': True, 
#                                                       'Logistic': True, 
#                                                       'RandomForest': True, 
#                                                       'KNN': True, 
#                                                       'Catboost': True})


#     # Sequencial feature selection
#     '___________________________________________________________________________'
    
#     sfs1 = SFS_xtend(knn, 
#                   k_features=20, 
#                   forward=True, 
#                   floating=True, 
#                   verbose=2,
#                   direction = 'forward',
#                   cv = StratifiedKFold(5),
#                   scoring=my_f2_scorer(),
#                   n_jobs=-1)

#     sfs1 = sfs1.fit(X, y)

#     for number in [0,1]:
#         sfs = SequentialFeatureSelector(estimator = models[number],
#                                         n_features_to_select=None,
#                                         cv = StratifiedKFold(5),
#                                         scoring = my_f2_scorer(), 
#                                         direction = 'backward',
#                                         n_jobs=-1
#                                         )
#         sfs.fit(X_train, y_train)
#         feature_selection_dataset['SFS', model_names[number]] = sfs.get_support()*1
#     print("Calculated SFS")
#     # # Recursive feature elimination with cross validation  - плохо отбирает для RandomForest, Catboost
#     # '___________________________________________________________________________'
#     # # models with feature importance do not have to perform SFS.    
#     # for number in [1,2,4]:
#     #     rfecv = RFECV(estimator = models[number],
#     #                                     # n_features_to_select=None,
#     #                                     cv = StratifiedKFold(5),
#     #                                     scoring = my_f2_scorer(),
#     #                                     n_jobs=-1
#     #                                     )
#     #     rfecv.fit(X_train, y_train)
#     #     feature_selection_dataset['RFECV', model_names[number]] = rfecv.get_support()*1
#     # print("Calculated RFECV")

#     # model importances
#     '___________________________________________________________________________'
#     feature_selection_dataset['Importances', 'RandomForest'] = models[2].feature_importances_
#     feature_selection_dataset['Importances', 'CatBoost'] = models[4].feature_importances_
#     feature_selection_dataset['Importances', 'Logistic'] = np.abs(models[1].coef_[0])

#     # # Drop-Column Importance
#     # '___________________________________________________________________________'
#     # """get score via Drop-Column Importance for models"""
#     # for number in range(5):
#     #     # clone the model to have the exact same specification as the one initially trained
#     #     model_clone = clone(models[number])
#     #     # set random_state for comparability
#     #     model_clone.random_state = 37
#     #     # training and scoring the benchmark model
#     #     model_clone.fit(X_train, y_train)

#     #     # benchmark_score = model_clone.score(X_train, y_train)
#     #     y_pred = model_clone.predict(X_test)
#     #     benchmark_score = f2_func(y_test, y_pred)

#     #     # list for storing feature importances
#     #     importances = []
        
#     #     # iterating over all columns and storing feature importance (difference between benchmark and new model)
#     #     for col in X_train.columns:
#     #         model_clone = clone(models[number])
#     #         model_clone.random_state = random_state
#     #         model_clone.fit(X_train.drop(col, axis = 1), y_train)
#     #         # drop_col_score = model_clone.score(X_train.drop(col, axis = 1), y_train)
#     #         y_pred = model_clone.predict(X_test.drop(col, axis = 1))
#     #         drop_col_score = f2_func(y_test, y_pred)
#     #         importances.append(benchmark_score - drop_col_score)
        
#     #     feature_selection_dataset[('Drop-Column Importance', model_names[number])] = importances
    

#     # Set column names as index
#     '___________________________________________________________________________'
#     feature_selection_dataset.index = all_cols

#     # # MRMR
#     # '___________________________________________________________________________'
#     # feature_selection_dataset[('MRMR', '')] = 0
#     # selected_features = mrmr_classif(X=X_train, y=y_train, K=40)
#     # feature_selection_dataset.loc[selected_features][('MRMR', '')] = 1


#     # Unsupervised selection with Pearson correlation coefs
#     '___________________________________________________________________________'
#     # correlation_matrix = dataset[continuous_cols].corr( method='pearson').abs()
#     # correlation_matrix  = pd.DataFrame(correlation_matrix)
#     # # iteratively remove features that have correlation > 0.95
#     # i=0
#     # j=0
#     # cols = correlation_matrix.shape[1]
#     # rows = correlation_matrix.shape[0]

#     # while i < cols: 
#     #     while j < rows:
#     #         if correlation_matrix.iloc[j,i]>0.95 and correlation_matrix.iloc[j,i]!=1:
#     #             correlation_matrix.drop(index=correlation_matrix.index[j], inplace=True)
#     #             correlation_matrix.drop(columns=correlation_matrix.columns[j], inplace=True)
#     #         else:
#     #             j+=1
#     #         rows = correlation_matrix.shape[0]
#     #     i+=1
#     #     j=0
#     #     cols = correlation_matrix.shape[1]

#     # removed_after_unsupervised = list(set(continuous_cols) - set(correlation_matrix.columns))


#     # Process data and download dataset
#     '___________________________________________________________________________'
#     # drop columns from unsupervised selection
#     # feature_selection_dataset.drop(index = removed_after_unsupervised, inplace=True)

#     # rank columns
#     feature_selection_dataset['sum'] = feature_selection_dataset.apply((lambda x: x.iloc[:6].sum()), axis=1)
#     feature_selection_dataset.sort_values('sum', inplace=True, ascending=True)

#     feature_selection_dataset.to_excel(f'{path}feature_selection_dataset.xlsx')

#     return feature_selection_dataset

# Lancet paper

Dataset ABC - all-cause death

##### Subset

In [519]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_lancet, header=[0], index_col=0, usecols=list(range(15)))
y_train = pd.read_excel(link_train_lancet, header=[0], index_col=0, usecols=[0,15])
X_test = pd.read_excel(link_test_lancet, header=[0], index_col=0, usecols=list(range(15)))
y_test = pd.read_excel(link_test_lancet, header=[0], index_col=0, usecols=[0,15])

# create path for saving results
optimisation_path = './HSE project/Optimisation data/lancet/ABC death/'
results_path = './HSE project/Graphics/lancet/'

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (450, 14)
y_train shape:	 (450, 1)
X_test shape:	 (105, 14)
y_test shape:	 (105, 1)


##### Hyper-parameter optimisation

In [520]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

clear_output()

##### Metrics tables

In [521]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("ROC_AUC, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.798,0.085,0.472,0.535,0.638,0.395,0.586,0.585,0.74,50,26,12,17,0.848,0.068
SVM,0.83,0.027,0.5,0.704,0.467,0.337,0.966,0.506,0.734,21,55,1,28,0.774,0.079
Logistic Regression,0.762,0.1,0.492,0.526,0.686,0.444,0.552,0.64,0.764,56,20,13,16,0.813,0.085
KNN,0.867,0.055,0.567,0.617,0.724,0.5,0.655,0.517,0.749,57,19,10,19,0.866,0.07
CatBoost,0.797,0.15,0.519,0.496,0.752,0.56,0.483,0.657,0.778,65,11,15,14,0.878,0.087


In [522]:
# what metric to evaluate
metric = ['F2','Precision','Recall','ROC_AUC']

table = pd.read_excel(f'{optimisation_path}metrics_table.xlsx', header=[0,1], index_col=[0]) 
datasets = pd.DataFrame(table.loc[:, ('Scores on the test set', metric)].values.round(3),columns=['F2','Precision','Recall','ROC AUC'])

models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']

# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# add error whiskers from gridsearchCV
# if 1:
#     # if metric == 'F2':
#     datasets_mean = pd.DataFrame()
#     datasets_mean[('F2, train set, cv=5', 'mean')] = list(table.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3)) 
#     datasets_mean[('F2, train set, cv=5', 'std')] = list(table.loc[:, ('F2, train set, cv=5', 'std')].values.round(3)) 
#     # datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3)) 
#     fig.add_traces([go.Box(name=column, x=models, 
#                            y=datasets_mean.iloc[column, ('F2, train set, cv=5', 'mean')], 
#                            marker=dict(color="black"), 
#                            showlegend = False) for column in table.index])
#     fig.update_traces(
#     selector=dict(type="box"), # update only boxes
#     boxpoints="all", # show points
#     pointpos=0, # centered
#     jitter=0, # no jitter
#     line_color="rgba(255,255,255,0)", # hide box lines
#     fillcolor="rgba(255,255,255,0)", # hide box fill
    
#     )
    # fig.update_layout(boxmode="group",)

    # fig.add_traces([go.Bar(name=column, x=models, 
    #                        y=datasets_mean[column], 
    #                        xaxis="x2",  
    #                        error_y=dict(type='data',  
    #                                     array=datasets_std[column], 
    #                                     color="rgba(0,0,0,1)",
    #                                     thickness=1), 
    #                        marker=dict(opacity=0,
    #                                   #  color="rgba(255,255,255,0)"
    #                                    ), 
    #                        showlegend = False) for column in datasets.columns])  
    


# Change the bar mode
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Metric values', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group', 
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Metrics', y=0.5), 
                  title=dict(text=f'Test metrics: "lancet" subset (ABC)', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
if 1:
    fig.add_shape(type='line',
                  x0=-0.5,
                  y0=0.5,
                  x1=4.5,
                  y1=0.5,
                  line=dict(color='black',  width=2, dash='dot'),
                  xref='x',
                  yref='y',
                  layer='below')   
    
# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,)

fig.show(renderer='colab')

In [523]:
fig.write_image(f"{results_path}metrics.pdf", engine="kaleido")
# fig.write_image(f"{results_path}/metrics.jpeg", engine="kaleido")

##### Feature selection

In [524]:
# Random Forest feature importances
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
feature_importances = pd.DataFrame()

randomforest_optimisation = pd.read_excel(f'{optimisation_path}randomforest_optimisation.xlsx', header=[0])


# feature_importances.columns = ['Feature']
params = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
params = ast.literal_eval(params[0])
random_forest_model = RandomForestClassifier(**params)
random_forest_model.fit(X_train, y_train)
feature_importances["RandomForest"] = pd.Series(random_forest_model.feature_importances_)

feats = list(map(lambda x: eval(x)[1], list(X_train.columns)))
feature_importances.index = feats

feature_importances = feature_importances.sort_values("RandomForest", ascending=False)
feature_importances['RandomForest'] = feature_importances['RandomForest']/feature_importances['RandomForest'][0]

feature_importances.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=["RandomForest"])\
                   .set_precision(3)

Unnamed: 0,RandomForest
МФА,1.0
Возраст,0.892
"Гемоглобин, г/л",0.77
СКФ EPI,0.475
ГБ,0.381
ФВ ЛЖ,0.226
"Хсобщ, ммоль/л",0.204
Cегмент ST,0.203
СД,0.031
пост-стент,0.012


In [525]:
data_from_paper = pd.DataFrame()
# https://ars.els-cdn.com/content/image/1-s2.0-S0140673620325198-mmc1.pdf
data_from_paper['Paper']= [0.77, 0.49, 1, 0.22, 0.1, 0.7, 0.1, 0.22, 0.09,  0.24, 0.09, 0.12, 0.11, 0.28]
data_from_paper.index = ['Возраст', 'СКФ EPI', 'ФВ ЛЖ', 'МФА', 'Хсобщ, ммоль/л', 'Гемоглобин, г/л', 'пост-ИМ','пост-стент','ГБ',  'СД', 'пост-ОНМК',  'Cегмент ST','Пол', 'пост-ВЧ-кровоизлияние']

In [526]:
result = pd.concat([feature_importances, data_from_paper], axis=1) 
result

Unnamed: 0,RandomForest,Paper
МФА,1.0,0.22
Возраст,0.892054,0.77
"Гемоглобин, г/л",0.769786,0.7
СКФ EPI,0.475432,0.49
ГБ,0.380586,0.09
ФВ ЛЖ,0.226214,1.0
"Хсобщ, ммоль/л",0.204344,0.1
Cегмент ST,0.202585,0.12
СД,0.031247,0.24
пост-стент,0.012231,0.22


In [527]:

result['RandomForest'] = result['RandomForest']/result['RandomForest'][0]
result.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=["RandomForest", "Paper"])\
                   .set_precision(3)

Unnamed: 0,RandomForest,Paper
МФА,1.0,0.22
Возраст,0.892,0.77
"Гемоглобин, г/л",0.77,0.7
СКФ EPI,0.475,0.49
ГБ,0.381,0.09
ФВ ЛЖ,0.226,1.0
"Хсобщ, ммоль/л",0.204,0.1
Cегмент ST,0.203,0.12
СД,0.031,0.24
пост-стент,0.012,0.22


In [528]:
# what metric to evaluate
# metric = ['Эксперимент','Статья']

result.columns = ['RandomForest', 'Published paper']

features=result.index
# create the graph
fig = go.Figure(data=[go.Bar(name=i, x=list(result.index), 
                             y=result[i]) for i in result.columns ])

# Change the bar mode
fig.update_xaxes(title='Features')
fig.update_yaxes(title='Relative importance', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group', 
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Data', y=0.5), 
                  title=dict(text=f'Feature importance: comparison with lancet paper', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,)
  
fig.show(renderer='colab')

In [529]:
fig.write_image(f"{results_path}importance.pdf", engine="kaleido")
# fig.write_image(f"{results_path}importance.jpeg", engine="kaleido")

##### Top features

# Canadian paper

Dataset ABC

In [501]:
results_path =      './HSE project/Graphics/canadian/'

##### Subset

Features from [paper](https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5)

In [502]:
# choose the same columns as in reference table

canadian_features = [
    # 1. Target
      # - in reference: "if the patient died or survived before the end of the follow-up period, that was 130 days on average"
      # - in our case: Heart-desease-death, follow-up period - 4-155 months
    # ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть'),
    # 2. Serum creatinine, mg/dL
    ('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Креатинин, мкмоль/л'),
    # 3. Ejection fraction
    ('ИСХОДНАЯ ЭХОКГ', 'ФВ ЛЖ'),
    # 4. Age
    ('АНТРОПОФИЗИОМЕТРИЯ', 'Возраст'),
    # 7. Sex
    ('АНТРОПОФИЗИОМЕТРИЯ', 'Пол'),
    # 8. Anaemia
    ('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Анемия, степень'),
    # 9. High blood pressure
    ('АНТРОПОФИЗИОМЕТРИЯ', 'систол. АД'),
    # 10. Smoking
    # ('ПСИХОСОЦИАЛЬНЫЕ ФАКТОРЫ','Курение')
    # 12. Diabetes
    ('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'СД'),
    # (),
    # 5. Creatinine phosphokinase
    # data_12_columns['Creatinine phosphokinase'] = 
    # 6. Serum sodium
    # data_12_columns['Serum sodium']
    # 8. Platelets
    # data_12_columns['Platelets'] = 
]

canadian_2_features = [('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Креатинин, мкмоль/л'), ('ИСХОДНАЯ ЭХОКГ', 'ФВ ЛЖ')]

canadian_target = ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть')

In [503]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_abc, header=[0], index_col=0, usecols=list(range(57)))
y_train = pd.read_excel(link_train_death_abc, header=[0], index_col=0, usecols=[0, 57])
X_test =  pd.read_excel(link_test_death_abc,  header=[0], index_col=0, usecols=list(range(57)))
y_test =  pd.read_excel(link_test_death_abc,  header=[0], index_col=0, usecols=[0, 57])
X_train.columns = [eval(col) for col in X_train.columns]
X_test.columns =  [eval(col) for col in X_test.columns]
X_train = X_train[canadian_features]
X_test =  X_test[canadian_features]

# create path for saving results
optimisation_path = './HSE project/Optimisation data/canadian/ABC dataset/8 features/'


# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (438, 7)
y_train shape:	 (438, 1)
X_test shape:	 (104, 7)
y_test shape:	 (104, 1)


##### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

##### Metrics tables

In [505]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.714,0.08,0.491,0.519,0.721,0.452,0.538,0.49,0.726,61,17,12,14,0.783,0.046
SVM,0.697,0.073,0.419,0.464,0.654,0.361,0.5,0.44,0.643,55,23,13,13,0.742,0.066
Logistic Regression,0.659,0.05,0.556,0.667,0.692,0.435,0.769,0.468,0.715,52,26,6,20,0.676,0.064
KNN,0.769,0.078,0.418,0.483,0.625,0.341,0.538,0.44,0.671,51,27,12,14,0.785,0.054
CatBoost,0.712,0.091,0.557,0.612,0.74,0.486,0.654,0.601,0.743,60,18,9,17,0.797,0.056


In [506]:
# what metric to evaluate
metric = ['F2','Precision','Recall','ROC_AUC']


table = pd.read_excel(f'{optimisation_path}metrics_table.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
# datasets[list(datasets.columns)] = 
datasets = pd.DataFrame(table.loc[:, ('Scores on the test set', metric)].values.round(3),columns=['F2','Precision','Recall','ROC AUC'])
# if metric == 'F2':
#     datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3)) 
#     datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3)) 

# list of models
# standart models
models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
# ensemble models
# models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# add error whiskers from gridsearchCV
if metric == 'F2':
    fig.add_traces([go.Box(name=column, x=models, 
                           y=datasets_mean[column], 
                          #  xaxis="x1",  
                           
                           marker=dict(color="black"), 
                           showlegend = False) for column in datasets.columns])
    fig.update_traces(
    selector=dict(type="box"), # update only boxes
    boxpoints="all", # show points
    pointpos=0, # centered
    jitter=0, # no jitter
    line_color="rgba(255,255,255,0)", # hide box lines
    fillcolor="rgba(255,255,255,0)", # hide box fill
    
    )
    fig.update_layout(boxmode="group",)

    fig.add_traces([go.Bar(name=column, x=models, 
                           y=datasets_mean[column], 
                           xaxis="x2",  
                           error_y=dict(type='data',  
                                        array=datasets_std[column], 
                                        color="rgba(0,0,0,1)",
                                        thickness=1), 
                           marker=dict(opacity=0,
                                      #  color="rgba(255,255,255,0)"
                                       ), 
                           showlegend = False) for column in datasets.columns])  
    


# Change the bar mode
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Metric values', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group', 
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Metrics', y=0.5), 
                  title=dict(text=f'Test metrics: "canadian" subset (ABC)', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
if 1:
    fig.add_shape(type='line',
                  x0=-0.5,
                  y0=0.5,
                  x1=4.5,
                  y1=0.5,
                  line=dict(color='black',  width=2, dash='dot'),
                  xref='x',
                  yref='y',
                  layer='below')   
    
# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,)

fig.show(renderer='colab')

In [507]:
fig.write_image(f"{results_path}metrics.pdf", engine="kaleido")
# fig.write_image(f"{results_path}metrics.jpeg", engine="kaleido")

##### Feature selection

In [508]:
# Эксперимент feature importances
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
feature_importances = pd.DataFrame()

randomforest_optimisation = pd.read_excel(f'{optimisation_path}randomforest_optimisation.xlsx', header=[0])

params = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
params = ast.literal_eval(params[0])
random_forest_model = RandomForestClassifier(**params)
random_forest_model.fit(X_train, y_train)


feature_importances["Эксперимент"] = pd.Series(random_forest_model.feature_importances_)
# Перепечатал из статьи
feature_importances['Эксперимент'] = feature_importances['Эксперимент']/feature_importances['Эксперимент'][0]
# feature_importances["Logistic regression"] = pd.Series(LR_model.coef_[0]).abs()
# feature_importances["SVM"] = pd.Series(SVM_model.coef_[0]).abs()


feats = list(map(lambda x: x[1], list(X_train.columns)))
feature_importances.index = feats

feature_importances = feature_importances.sort_values("Эксперимент", ascending=False)
feature_importances['Эксперимент'] = feature_importances['Эксперимент']/feature_importances['Эксперимент'][0]

feature_importances.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=["Эксперимент"], axis=0)\
                   .set_precision(3)

Unnamed: 0,Эксперимент
ФВ ЛЖ,1.0
"Креатинин, мкмоль/л",0.895
Возраст,0.876
систол. АД,0.745
"Анемия, степень",0.328
Пол,0.133
СД,0.087


In [509]:
data_from_paper = pd.DataFrame()
data_from_paper['Статья'] = [11.84,  10.71, 8.58,  1.06, 1.13, 1.02, 1.12] # 
data_from_paper['Статья'] = data_from_paper['Статья']/data_from_paper['Статья'][0]
data_from_paper.index = ['Креатинин, мкмоль/л', 'ФВ ЛЖ', 'Возраст', 'Анемия, степень',   'систол. АД', 'СД', 'Пол']

In [510]:
feature_importances = pd.concat([feature_importances, data_from_paper], axis=1) 
feature_importances.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=["Эксперимент", "Статья"], axis=0)\
                   .set_precision(3)

Unnamed: 0,Эксперимент,Статья
ФВ ЛЖ,1.0,0.905
"Креатинин, мкмоль/л",0.895,1.0
Возраст,0.876,0.725
систол. АД,0.745,0.095
"Анемия, степень",0.328,0.09
Пол,0.133,0.095
СД,0.087,0.086


In [511]:
# what metric to evaluate
metric = ['Эксперимент','Статья']
feature_importances.columns = ['RandomForest', 'Published paper']

feats = [str(feature[1]) for feature in feature_importances.index]
features=feature_importances.index

# create the graph
fig = go.Figure(data=[go.Bar(name=i, x=list(feature_importances.index), 
                             y=feature_importances[i]) for i in feature_importances.columns ])

# Change the bar mode
fig.update_xaxes(title='Features')
fig.update_yaxes(title='Relative importance', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group', 
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Data', y=0.5), 
                  title=dict(text=f'Feature importance: comparison with canadian paper', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)  

# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,)
  
fig.show(renderer='colab')

In [512]:
fig.write_image(f"{results_path}importance.pdf", engine="kaleido")
# fig.write_image(f"{results_path}importance.jpeg", engine="kaleido")

##### Two features

##### Subset

In [513]:
X_train = X_train[canadian_2_features]
X_test =  X_test[canadian_2_features]

# create path for saving results
optimisation_path = './HSE project/Optimisation data/canadian/ABC dataset/2 features/'
# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (438, 2)
y_train shape:	 (438, 1)
X_test shape:	 (104, 2)
y_test shape:	 (104, 1)


In [514]:
fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(x=X_train['ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Креатинин, мкмоль/л'], 
                         y=X_train[('ИСХОДНАЯ ЭХОКГ', 'ФВ ЛЖ')],
                         marker_color=y_train["('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть')"],
                         mode='markers',
                         text= y_train["('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть')"]
                    # mode='markers',
                    # name='markers')
))

fig.update_traces(marker_size=10, selector=dict(type='scatter'))

fig.update_layout(
                  height=800, 
                  font_family="'Nunito', sans-serif",
                  # title={'text': "Correlations between sales and other columns", 'y':0.97, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'},
                  # showlegend=True, 
                  margin=dict(l=40, r=10, t=60, b=60),
                  xaxis_range=[-5,10]

                  )

# fig = px.scatter(X_train, x=['ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Креатинин, мкмоль/л'], y=['ИСХОДНАЯ ЭХОКГ', 'ФВ ЛЖ'], color=['target', ''],
#                 title="Automatic Labels Based on X_train Frame Column Names")
    
# figure size
fig.update_layout(
    autosize=False,
    width=1000,
    height=450,)
# fig.layout.template = 'plotly_dark'
fig.show(renderer='colab')

##### Hyper-parameter optimisation

In [515]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

##### Metrics tables

In [518]:
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.655,0.088,0.232,0.272,0.49,0.186,0.308,0.294,0.448,43,35,18,8,0.674,0.09
SVM,0.679,0.274,0.4,0.625,0.25,0.25,1.0,0.263,0.453,0,78,0,26,0.463,0.075
Logistic Regression,0.627,0.063,0.369,0.42,0.606,0.308,0.462,0.353,0.612,51,27,14,12,0.595,0.076
KNN,0.667,0.096,0.261,0.306,0.51,0.209,0.346,0.227,0.418,44,34,17,9,0.69,0.085
CatBoost,0.623,0.088,0.247,0.298,0.471,0.191,0.346,0.247,0.416,40,38,17,9,0.653,0.08


# **Target**: Death from heart desease

In [346]:
target_column = ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть')
results_path = './HSE project/Graphics/cardiovascular death/'

## Biomarkers A

### Subset
### Split into train and test

In [347]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_a, header=[0], usecols=list(range(61,147)))
y_train = pd.read_excel(link_train_death_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_death_a, header=[0], usecols=list(range(61,147)))
y_test  = pd.read_excel(link_test_death_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers A/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (150, 86)
y_train shape:	 (150, 1)
X_test shape:	 (49, 86)
y_test shape:	 (49, 1)


### Hyper-parameter optimisation

In [348]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics table

In [349]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.69,0.053,0.389,0.365,0.551,0.438,0.35,0.528,0.571,20,9,13,7,0.693,0.033
SVM,0.674,0.136,0.596,0.654,0.612,0.519,0.7,0.46,0.568,16,13,6,14,0.571,0.113
Logistic Regression,0.69,0.121,0.558,0.583,0.612,0.522,0.6,0.635,0.634,18,11,8,12,0.649,0.107
KNN,0.513,0.155,0.5,0.469,0.633,0.562,0.45,0.581,0.624,22,7,11,9,0.492,0.127
CatBoost,0.573,0.141,0.5,0.5,0.592,0.5,0.5,0.482,0.545,19,10,10,10,0.65,0.042


In [350]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [351]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers A/biomarkers top features/"

### Hyper-parameter optimisation

In [352]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [353]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.78,0.084,0.513,0.505,0.612,0.526,0.5,0.52,0.6,20,9,10,10,0.742,0.047
SVM,0.647,0.155,0.478,0.519,0.51,0.423,0.55,0.473,0.546,14,15,9,11,0.672,0.062
Logistic Regression,0.761,0.067,0.553,0.607,0.571,0.481,0.65,0.553,0.584,15,14,7,13,0.749,0.074
KNN,0.636,0.111,0.372,0.388,0.449,0.348,0.4,0.372,0.422,14,15,12,8,0.689,0.066
CatBoost,0.686,0.136,0.474,0.459,0.592,0.5,0.45,0.533,0.607,20,9,11,9,0.716,0.051


## Clinical features + Biomarkers A

#### Subset
#### Split into train and test

In [354]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_a, header=[0], usecols=list(range(1,147)))
y_train = pd.read_excel(link_train_death_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_death_a, header=[0], usecols=list(range(1,147)))
y_test  = pd.read_excel(link_test_death_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (150, 146)
y_train shape:	 (150, 1)
X_test shape:	 (49, 146)
y_test shape:	 (49, 1)


### Hyper-parameter optimisation

In [355]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [356]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.711,0.113,0.529,0.479,0.673,0.643,0.45,0.665,0.702,24,5,11,9,0.74,0.047
SVM,0.714,0.021,0.744,0.777,0.776,0.696,0.8,0.689,0.769,22,7,4,16,0.702,0.06
Logistic Regression,0.721,0.122,0.65,0.65,0.714,0.65,0.65,0.726,0.719,22,7,7,13,0.682,0.102
KNN,0.583,0.169,0.647,0.585,0.755,0.786,0.55,0.649,0.728,26,3,9,11,0.512,0.087
CatBoost,0.647,0.071,0.545,0.484,0.694,0.692,0.45,0.632,0.659,25,4,11,9,0.724,0.056


In [357]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [358]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [359]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [360]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.743,0.08,0.545,0.484,0.694,0.692,0.45,0.678,0.691,25,4,11,9,0.782,0.063
SVM,0.76,0.062,0.513,0.505,0.612,0.526,0.5,0.642,0.648,20,9,10,10,0.8,0.074
Logistic Regression,0.814,0.049,0.558,0.583,0.612,0.522,0.6,0.65,0.702,18,11,8,12,0.781,0.05
KNN,0.719,0.077,0.541,0.515,0.653,0.588,0.5,0.644,0.686,22,7,10,10,0.768,0.067
CatBoost,0.668,0.098,0.485,0.43,0.653,0.615,0.4,0.633,0.626,24,5,12,8,0.749,0.048


## Clinical features A

#### Subset
#### Split into train and test

In [361]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_a, header=[0], usecols=list(range(1,61)))
y_train = pd.read_excel(link_train_death_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_death_a, header=[0], usecols=list(range(1,61)))
y_test  = pd.read_excel(link_test_death_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical A/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (150, 60)
y_train shape:	 (150, 1)
X_test shape:	 (49, 60)
y_test shape:	 (49, 1)


### Hyper-parameter optimisation

In [362]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [363]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.696,0.043,0.606,0.538,0.735,0.769,0.5,0.769,0.766,26,3,10,10,0.727,0.045
SVM,0.739,0.047,0.619,0.637,0.673,0.591,0.65,0.703,0.717,20,9,7,13,0.734,0.037
Logistic Regression,0.793,0.057,0.654,0.759,0.633,0.531,0.85,0.758,0.752,14,15,3,17,0.726,0.057
KNN,0.628,0.121,0.65,0.65,0.714,0.65,0.65,0.565,0.704,22,7,7,13,0.653,0.058
CatBoost,0.648,0.02,0.6,0.6,0.673,0.6,0.6,0.734,0.745,21,8,8,12,0.692,0.043


In [364]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [365]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical A/top features/"

### Hyper-parameter optimisation

In [366]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [367]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.737,0.097,0.611,0.573,0.714,0.688,0.55,0.761,0.76,24,5,9,11,0.736,0.06
SVM,0.673,0.057,0.5,0.435,0.673,0.667,0.4,0.706,0.753,25,4,12,8,0.728,0.042
Logistic Regression,0.794,0.051,0.59,0.744,0.49,0.439,0.9,0.731,0.731,6,23,2,18,0.732,0.042
KNN,0.643,0.041,0.563,0.489,0.714,0.75,0.45,0.676,0.723,26,3,11,9,0.717,0.041
CatBoost,0.61,0.082,0.595,0.567,0.694,0.647,0.55,0.658,0.688,23,6,9,11,0.708,0.039


## Biomarkers B

### Subset
### Split into train and test

In [368]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_b, header=[0], usecols=list(range(72,78)))
y_train = pd.read_excel(link_train_death_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_death_b, header=[0], usecols=list(range(72,78)))
y_test  = pd.read_excel(link_test_death_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers B/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (94, 6)
y_train shape:	 (94, 1)
X_test shape:	 (30, 6)
y_test shape:	 (30, 1)


### Hyper-parameter optimisation

In [369]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [370]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.84,0.062,0.182,0.172,0.7,0.2,0.167,0.252,0.562,20,4,5,1,0.809,0.027
SVM,0.773,0.087,0.0,0.0,0.567,0.0,0.0,0.217,0.514,17,7,6,0,0.738,0.095
Logistic Regression,0.589,0.081,0.455,0.625,0.6,0.312,0.833,0.326,0.625,13,11,1,5,0.605,0.125
KNN,0.863,0.055,0.267,0.303,0.633,0.222,0.333,0.222,0.503,17,7,4,2,0.818,0.065
CatBoost,0.815,0.072,0.154,0.161,0.633,0.143,0.167,0.224,0.486,18,6,5,1,0.795,0.034


In [371]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

## Clinical features + Biomarkers B

#### Subset
#### Split into train and test

In [372]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_b, header=[0], usecols=list(range(1,78)))
y_train = pd.read_excel(link_train_death_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_death_b, header=[0], usecols=list(range(1,78)))
y_test  = pd.read_excel(link_test_death_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (94, 77)
y_train shape:	 (94, 1)
X_test shape:	 (30, 77)
y_test shape:	 (30, 1)


### Hyper-parameter optimisation

In [373]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [374]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.965,0.031,0.5,0.385,0.867,1.0,0.333,0.6,0.715,24,0,4,2,0.991,0.013
SVM,0.914,0.09,0.222,0.185,0.767,0.333,0.167,0.42,0.594,22,2,5,1,0.951,0.064
Logistic Regression,0.933,0.038,0.571,0.625,0.8,0.5,0.667,0.735,0.806,20,4,2,4,0.935,0.076
KNN,0.971,0.021,0.333,0.333,0.733,0.333,0.333,0.25,0.569,20,4,4,2,0.926,0.056
CatBoost,0.924,0.083,0.444,0.37,0.833,0.667,0.333,0.567,0.701,23,1,4,2,0.982,0.019


In [375]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

#### subset

In [376]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [377]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [378]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.931,0.033,0.333,0.333,0.733,0.333,0.333,0.548,0.715,20,4,4,2,0.976,0.018
SVM,0.934,0.072,0.5,0.588,0.733,0.4,0.667,0.153,0.229,18,6,2,4,0.927,0.058
Logistic Regression,0.853,0.114,0.4,0.455,0.7,0.333,0.5,0.527,0.674,18,6,3,3,0.776,0.085
KNN,0.947,0.036,0.286,0.312,0.667,0.25,0.333,0.217,0.542,18,6,4,2,0.893,0.05
CatBoost,0.911,0.03,0.4,0.357,0.8,0.5,0.333,0.539,0.812,22,2,4,2,0.969,0.024


## Clinical features B

#### Subset
#### Split into train and test

In [379]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_b, header=[0], usecols=list(range(1,72)))
y_train = pd.read_excel(link_train_death_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_death_b, header=[0], usecols=list(range(1,72)))
y_test  = pd.read_excel(link_test_death_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical B/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (94, 71)
y_train shape:	 (94, 1)
X_test shape:	 (30, 71)
y_test shape:	 (30, 1)


### Hyper-parameter optimisation

In [380]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [381]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.991,0.017,0.667,0.556,0.9,1.0,0.5,0.671,0.771,24,0,3,3,1.0,0.0
SVM,0.983,0.024,0.25,0.192,0.8,0.5,0.167,0.33,0.628,23,1,5,1,0.995,0.01
Logistic Regression,0.941,0.038,0.571,0.625,0.8,0.5,0.667,0.772,0.826,20,4,2,4,0.964,0.045
KNN,0.971,0.028,0.364,0.345,0.767,0.4,0.333,0.269,0.566,21,3,4,2,0.927,0.072
CatBoost,0.939,0.051,0.222,0.185,0.767,0.333,0.167,0.4,0.632,22,2,5,1,0.98,0.013


In [382]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [383]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [384]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [385]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.951,0.044,0.5,0.385,0.867,1.0,0.333,0.531,0.674,24,0,4,2,0.978,0.018
SVM,0.983,0.016,0.182,0.172,0.7,0.2,0.167,0.321,0.639,20,4,5,1,0.998,0.004
Logistic Regression,0.902,0.072,0.375,0.441,0.667,0.3,0.5,0.25,0.573,17,7,3,3,0.879,0.041
KNN,0.975,0.009,0.5,0.588,0.733,0.4,0.667,0.333,0.708,18,6,2,4,0.936,0.023
CatBoost,0.935,0.043,0.2,0.179,0.733,0.25,0.167,0.406,0.646,21,3,5,1,0.986,0.013


## Biomarkers C

### Subset
### Split into train and test

In [386]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_c, header=[0], usecols=list(range(101,106)))
y_train = pd.read_excel(link_train_death_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_death_c, header=[0], usecols=list(range(101,106)))
y_test  = pd.read_excel(link_test_death_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers C/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (160, 5)
y_train shape:	 (160, 1)
X_test shape:	 (43, 5)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [387]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [388]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.98,0.023,0.5,0.714,0.953,0.333,1.0,1.0,1.0,40,2,0,1,0.993,0.014
SVM,0.976,0.011,0.0,0.0,0.884,0.0,0.0,0.036,0.357,38,4,1,0,0.984,0.021
Logistic Regression,0.585,0.072,0.095,0.208,0.558,0.05,1.0,0.111,0.81,23,19,0,1,0.641,0.079
KNN,0.971,0.02,0.0,0.0,0.884,0.0,0.0,0.023,0.452,38,4,1,0,0.931,0.05
CatBoost,0.938,0.057,0.222,0.417,0.837,0.125,1.0,0.5,0.976,35,7,0,1,0.991,0.017


In [389]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

## Clinical features + Biomarkers C

#### Subset
#### Split into train and test

In [390]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_c, header=[0], usecols=list(range(1,106)))
y_train = pd.read_excel(link_train_death_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_death_c, header=[0], usecols=list(range(1,106)))
y_test  = pd.read_excel(link_test_death_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (160, 105)
y_train shape:	 (160, 1)
X_test shape:	 (43, 105)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [391]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [392]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.99,0.02,0.0,0.0,0.977,0.0,0.0,0.143,0.857,42,0,1,0,1.0,0.0
SVM,0.988,0.015,0.0,0.0,0.93,0.0,0.0,0.045,0.5,40,2,1,0,0.995,0.009
Logistic Regression,0.978,0.023,0.0,0.0,0.953,0.0,0.0,0.05,0.548,41,1,1,0,0.985,0.016
KNN,0.978,0.016,0.0,0.0,0.93,0.0,0.0,0.023,0.405,40,2,1,0,0.944,0.041
CatBoost,0.98,0.019,0.4,0.625,0.93,0.25,1.0,0.25,0.929,39,3,0,1,0.999,0.002


In [393]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### subset

In [394]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/top features/"

### Hyper-parameter optimisation

In [395]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [396]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.99,0.02,0.0,0.0,0.93,0.0,0.0,0.143,0.857,40,2,1,0,1.0,0.0
SVM,0.99,0.02,0.0,0.0,0.977,0.0,0.0,0.059,0.619,42,0,1,0,0.999,0.002
Logistic Regression,0.998,0.005,0.0,0.0,0.837,0.0,0.0,0.125,0.833,36,6,1,0,1.0,0.0
KNN,0.998,0.005,0.0,0.0,0.977,0.0,0.0,0.023,0.476,42,0,1,0,0.994,0.013
CatBoost,0.995,0.007,0.0,0.0,0.907,0.0,0.0,0.2,0.905,39,3,1,0,1.0,0.0


## Clinical features C

#### Subset
#### Split into train and test

In [397]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_c, header=[0], usecols=list(range(1,101)))
y_train = pd.read_excel(link_train_death_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_death_c, header=[0], usecols=list(range(1,101)))
y_test  = pd.read_excel(link_test_death_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical C/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (160, 100)
y_train shape:	 (160, 1)
X_test shape:	 (43, 100)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [398]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [399]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.987,0.02,0.0,0.0,0.977,0.0,0.0,0.048,0.524,42,0,1,0,0.998,0.003
SVM,0.985,0.02,0.0,0.0,0.977,0.0,0.0,0.045,0.5,42,0,1,0,1.0,0.0
Logistic Regression,0.983,0.019,0.0,0.0,0.953,0.0,0.0,0.045,0.5,41,1,1,0,0.988,0.013
KNN,0.978,0.017,0.0,0.0,0.977,0.0,0.0,0.023,0.452,42,0,1,0,0.944,0.046
CatBoost,0.985,0.021,0.0,0.0,0.907,0.0,0.0,0.167,0.881,39,3,1,0,0.999,0.002


In [400]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### subset

In [401]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical C/top features/"

### Hyper-parameter optimisation

In [402]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [403]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.987,0.02,0.0,0.0,0.953,0.0,0.0,0.167,0.881,41,1,1,0,1.0,0.0
SVM,0.988,0.025,0.0,0.0,0.953,0.0,0.0,0.048,0.524,41,1,1,0,0.998,0.003
Logistic Regression,0.995,0.01,0.4,0.625,0.93,0.25,1.0,0.25,0.929,39,3,0,1,0.998,0.003
KNN,0.998,0.005,0.0,0.0,0.977,0.0,0.0,0.023,0.5,42,0,1,0,0.994,0.013
CatBoost,0.995,0.007,0.0,0.0,0.907,0.0,0.0,0.25,0.929,39,3,1,0,1.0,0.0


## Clinical features A-B-C

#### Subset
#### Split into train and test

In [404]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_abc, header=[0], usecols=list(range(1,57)))
y_train = pd.read_excel(link_train_death_abc, header=[0], usecols=[57])
X_test  = pd.read_excel(link_test_death_abc, header=[0], usecols=list(range(1,57)))
y_test  = pd.read_excel(link_test_death_abc, header=[0], usecols=[57])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical ABC/all clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (438, 56)
y_train shape:	 (438, 1)
X_test shape:	 (104, 56)
y_test shape:	 (104, 1)


### Hyper-parameter optimisation

In [405]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [406]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.884,0.084,0.655,0.677,0.817,0.621,0.692,0.71,0.839,67,11,8,18,0.935,0.046
SVM,0.875,0.082,0.694,0.669,0.856,0.739,0.654,0.286,0.538,72,6,9,17,0.896,0.054
Logistic Regression,0.83,0.019,0.688,0.775,0.808,0.579,0.846,0.735,0.854,62,16,4,22,0.824,0.049
KNN,0.911,0.044,0.586,0.625,0.769,0.531,0.654,0.434,0.731,63,15,9,17,0.834,0.064
CatBoost,0.836,0.138,0.615,0.615,0.808,0.615,0.615,0.707,0.831,68,10,10,16,0.926,0.068


In [407]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

In [408]:
%%time 
df = feature_selection(
                              # dataset = clinical_and_biomarkers_b,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

NameError: ignored

### Top feature optimisation and metrics

#### subset

In [409]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical ABC/top features/"

### Hyper-parameter optimisation

In [410]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [411]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.863,0.097,0.714,0.746,0.846,0.667,0.769,0.699,0.84,68,10,6,20,0.925,0.054
SVM,0.859,0.095,0.48,0.469,0.75,0.5,0.462,0.537,0.763,66,12,14,12,0.891,0.059
Logistic Regression,0.831,0.023,0.636,0.729,0.769,0.525,0.808,0.729,0.866,59,19,5,21,0.793,0.042
KNN,0.871,0.074,0.548,0.607,0.731,0.472,0.654,0.724,0.801,59,19,9,17,0.895,0.077
CatBoost,0.842,0.132,0.571,0.597,0.769,0.533,0.615,0.658,0.807,64,14,10,16,0.905,0.069


## Results

### Scores of models

In [417]:
# what metric to evaluate
metrics = ['F2', 'ROC_AUC']

for metric in metrics:

    # list of paths
    paths = [ 
            './HSE project/Optimisation data/cardiovascular death/Biomarkers A/all biomarkers/',
            './HSE project/Optimisation data/cardiovascular death/Biomarkers B/all biomarkers/',
            './HSE project/Optimisation data/cardiovascular death/Biomarkers C/all biomarkers/',
            './HSE project/Optimisation data/cardiovascular death/Clinical ABC/all clinical/',
            './HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/all biomarkers and clinical/',
            './HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/all biomarkers and clinical/',
            './HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/all biomarkers and clinical/',
            './HSE project/Optimisation data/cardiovascular death/Clinical A/all biomarkers and clinical/',
            './HSE project/Optimisation data/cardiovascular death/Clinical B/all biomarkers and clinical/',
            './HSE project/Optimisation data/cardiovascular death/Clinical C/all biomarkers and clinical/',
            ]

    # create datframe for scores
    datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
    datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
    datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])

    # get dataframe with with scores of models from different datasets
    for i in range(len(paths)):
        table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
        datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) 
        if metric == 'F2':
            datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3)) 
            datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2, train set, cv=5', 'std')].values.round(3)) 
        if metric == 'ROC_AUC':
            datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('ROC_AUC, train set, cv=5', 'mean')].values.round(3)) 
            datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('ROC_AUC, train set, cv=5', 'std')].values.round(3)) 

    # list of models
    # standart models
    models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
    # ensemble models
    # models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

    # create the graph
    fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

    # add error whiskers from gridsearchCV
    if True:
        fig.add_traces([go.Box(name=column, x=models, 
                              y=datasets_mean[column], 
                              #  xaxis="x1",  
                              
                              marker=dict(color="black"), 
                              showlegend = False) for column in datasets.columns])
        fig.update_traces(
        selector=dict(type="box"), # update only boxes
        boxpoints="all", # show points
        pointpos=0, # centered
        jitter=0, # no jitter
        line_color="rgba(255,255,255,0)", # hide box lines
        fillcolor="rgba(255,255,255,0)", # hide box fill
        
        )
        fig.update_layout(boxmode="group",)

        fig.add_traces([go.Bar(name=column, x=models, 
                              y=datasets_mean[column], 
                              xaxis="x2",  
                              error_y=dict(type='data',  
                                            array=datasets_std[column], 
                                            color="rgba(0,0,0,1)",
                                            thickness=1), 
                              marker=dict(opacity=0,
                                          #  color="rgba(255,255,255,0)"
                                          ), 
                              showlegend = False) for column in datasets.columns])  
        


    # Change the bar mode
    fig.update_xaxes(title='Models')
    fig.update_yaxes(title='Score', range=[0., 1.0])
    fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
    fig.update_layout(barmode='group', 
                      bargap=0.30,
                      bargroupgap=0.3,
                      legend=dict(orientation="v", title='Datasets'), 
                      title=dict(text=f'{metric} values', x=0.5,),
                      margin=dict(l=60, r=20, t=60, b=40),)

    # add dotted line for ROC AUC = 0.5
    if metric == 'ROC_AUC':
        fig.add_shape(type='line',
                        x0=-0.5,
                        y0=0.5,
                        x1=4.5,
                        y1=0.5,
                        line=dict(color='firebrick',  width=2, dash='dot'),
                        xref='x',
                        yref='y')  

    # figure size
    fig.update_layout(
        autosize=False,
        width=1300,
        height=450,) 

    fig.show(renderer='colab')
    fig.write_image(f"{results_path}{metric}.pdf", engine="kaleido")

### Compare with Top 10

In [413]:
# # what metric to evaluate
# # ROC_AUC F1 F2
# metric = 'F2'

# # list of paths
# paths = [
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers A/all biomarkers/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers B/all biomarkers/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers C/all biomarkers/',
#         './HSE project/Optimisation data/cardiovascular death/Clinical ABC/all clinical/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/all biomarkers and clinical/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/all biomarkers and clinical/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/all biomarkers and clinical/'
#         ]

# # list of paths of top 10
# paths_top = [
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers A/biomarkers top features/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers B/biomarkers top features/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers C/biomarkers top features/',
#         './HSE project/Optimisation data/cardiovascular death/Clinical ABC/top features/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/top features/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/top features/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/top features/'
#         ]

# # create datframe for scores
# datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
# datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
# datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])

# # get dataframe with with scores of models from different datasets
# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) 
#     table_top = pd.read_excel(f'{paths_top[i]}metrics_table.xlsx', header=[0,1], index_col=[0])
#     datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) + \
#                                     list(table_top.loc[:, ('Scores on the test set', metric)].values.round(3))
#     if metric == 'F2':
#         datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3)) + list(table_top.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3))
#         datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2, train set, cv=5', 'std')].values.round(3)) + list(table_top.loc[:, ('F2, train set, cv=5', 'std')].values.round(3))

# # list of models
# # standart models
# models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost'] + ['RandomForest top 10', 'SVM top 10', 'Logistic Regression top 10', 'KNN top 10', 'CatBoost top 10']


# # create the graph
# fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# # add error whiskers from gridsearchCV
# if metric == 'F2':
#     fig.add_traces([go.Box(name=column, x=models, 
#                            y=datasets_mean[column], 
#                           #  xaxis="x1",  
                           
#                            marker=dict(color="black"), 
#                            showlegend = False) for column in datasets.columns])
#     fig.update_traces(
#     selector=dict(type="box"), # update only boxes
#     boxpoints="all", # show points
#     pointpos=0, # centered
#     jitter=0, # no jitter
#     line_color="rgba(255,255,255,0)", # hide box lines
#     fillcolor="rgba(255,255,255,0)", # hide box fill
    
#     )
#     fig.update_layout(boxmode="group",)

#     fig.add_traces([go.Bar(name=column, x=models, 
#                            y=datasets_mean[column], 
#                            xaxis="x2",  
#                            error_y=dict(type='data',  
#                                         array=datasets_std[column], 
#                                         color="rgba(0,0,0,1)",
#                                         thickness=1), 
#                            marker=dict(opacity=0,
#                                       #  color="rgba(255,255,255,0)"
#                                        ), 
#                            showlegend = False) for column in datasets.columns])  
    
# # Change the bar mode
# fig.update_xaxes(title='Models')
# fig.update_yaxes(title='Score', range=[0., 1.0])
# fig.update_layout(barmode='group', 
#                   xaxis2={"overlaying": "x", "range": [-0.525, 9.525], "showticklabels": False},
#                   bargap=0.30,
#                   bargroupgap=0.3,
#                   legend=dict(orientation="v", title='Datasets'), 
#                   title=dict(text=f'{metric} score', x=0.5,),
#                   margin=dict(l=60, r=20, t=60, b=40),)

# # add dotted line for ROC AUC = 0.5
# if metric == 'ROC_AUC':
#     fig.add_shape(type='line',
#                     x0=-0.5,
#                     y0=0.5,
#                     x1=9.5,
#                     y1=0.5,
#                     line=dict(color='firebrick',  width=2, dash='dot'),
#                     xref='x',
#                     yref='y')   
# # figure size
# fig.update_layout(
#     autosize=False,
#     width=1300,
#     height=450,) 

# fig.show(renderer='colab')

In [414]:
# fig.write_image(f"{results_path}top_metrics.pdf", engine="kaleido")

### Feature selection

In [415]:
# features = pd.DataFrame(columns=['features', 
#                                 #  'biomarkers A',
#                                 #  'biomarkers B',
#                                 #  'biomarkers C',
#                                  'Clinical+biomarkers ABC',
#                                  'Clinical+biomarkers A',
#                                  'Clinical+biomarkers B',
#                                 #  'Clinical+biomarkers C',
#                                  ])
# # list of paths
# paths = [
#         # './HSE project/Optimisation data/cardiovascular death/Biomarkers A/all biomarkers/',
#         # './HSE project/Optimisation data/cardiovascular death/Biomarkers B/all biomarkers/',
#         # './HSE project/Optimisation data/cardiovascular death/Biomarkers C/all biomarkers/',
#         './HSE project/Optimisation data/cardiovascular death/Clinical ABC/all clinical/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/all biomarkers and clinical/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/all biomarkers and clinical/',
#         # './HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/all biomarkers and clinical/'
#         ]

# # get dataframe with with scores of models from different datasets
# top_features = []
# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     # if i < 3: 
#     #     top_features = top_features+list(str(col) for col in table.index[:10])
#     # else:    
#     top_features = top_features+list(eval(col)[1] for col in table.index[:10])

# features['features'] = list(set(top_features))
# features.index = list(set(top_features))
# features.fillna(0, inplace=True)

# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     # if i < 3: 
#     #     features.loc[list(str(col) for col in table.index[:10]), features.columns[i+1]] = 1
#     # else:    
#         # top_features = top_features+list(eval(col) for col in table.index[:10])   
#     features.loc[list(eval(col)[1] for col in table.index[:10]), features.columns[i+1]] = 1 

# features['features'] = features.iloc[:,1:].apply((lambda x: x.sum()), axis=1)
# features.sort_values(ascending=False,  inplace=True, by=("features"))
# features.columns = ['sum'] + list(features.columns[1:])
# features.to_excel('./HSE project/Optimisation data/cardiovascular death/feature_selection.xlsx')
# features

# **Target**: Revascularization

In [418]:
target_column = ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Повторная реваскуляризация')
results_path = './HSE project/Graphics/revascularization/'

## Biomarkers A

### Subset
### Split into train and test

In [419]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=list(range(61,147)))
y_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=list(range(61,147)))
y_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers A/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (122, 86)
y_train shape:	 (122, 1)
X_test shape:	 (33, 86)
y_test shape:	 (33, 1)


### Hyper-parameter optimisation

In [420]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics table

In [421]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.828,0.079,0.4,0.455,0.545,0.333,0.5,0.358,0.578,13,10,5,5,0.831,0.124
SVM,0.833,0.007,0.465,0.685,0.303,0.303,1.0,0.275,0.324,0,23,0,10,0.492,0.062
Logistic Regression,0.827,0.098,0.538,0.625,0.636,0.438,0.7,0.563,0.757,14,9,3,7,0.676,0.192
KNN,0.847,0.107,0.5,0.603,0.576,0.389,0.7,0.397,0.63,12,11,3,7,0.718,0.181
CatBoost,0.779,0.174,0.5,0.603,0.576,0.389,0.7,0.423,0.665,12,11,3,7,0.797,0.165


In [422]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [423]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers A/biomarkers top features/"

### Hyper-parameter optimisation

In [424]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [425]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.85,0.095,0.519,0.614,0.606,0.412,0.7,0.412,0.657,13,10,3,7,0.86,0.107
SVM,0.839,0.044,0.444,0.606,0.394,0.308,0.8,0.293,0.435,5,18,2,8,0.759,0.105
Logistic Regression,0.807,0.079,0.571,0.769,0.545,0.4,1.0,0.533,0.791,8,15,0,10,0.68,0.132
KNN,0.855,0.051,0.533,0.667,0.576,0.4,0.8,0.408,0.663,11,12,2,8,0.745,0.148
CatBoost,0.81,0.147,0.545,0.714,0.545,0.391,0.9,0.346,0.596,9,14,1,9,0.848,0.117


## Clinical features + Biomarkers A

#### Subset
#### Split into train and test

In [426]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=list(range(1,147)))
y_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=list(range(1,147)))
y_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (122, 146)
y_train shape:	 (122, 1)
X_test shape:	 (33, 146)
y_test shape:	 (33, 1)


### Hyper-parameter optimisation

In [427]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [428]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.782,0.146,0.414,0.508,0.485,0.316,0.6,0.359,0.6,10,13,4,6,0.826,0.15
SVM,0.816,0.04,0.465,0.685,0.303,0.303,1.0,0.219,0.235,0,23,0,10,0.67,0.108
Logistic Regression,0.754,0.155,0.538,0.625,0.636,0.438,0.7,0.492,0.739,14,9,3,7,0.734,0.181
KNN,0.847,0.066,0.375,0.484,0.394,0.273,0.6,0.294,0.478,7,16,4,6,0.767,0.141
CatBoost,0.747,0.182,0.516,0.656,0.545,0.381,0.8,0.479,0.639,10,13,2,8,0.809,0.138


In [429]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [430]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [431]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [432]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.855,0.105,0.435,0.472,0.606,0.385,0.5,0.38,0.63,15,8,5,5,0.874,0.099
SVM,0.864,0.136,0.4,0.455,0.545,0.333,0.5,0.295,0.487,13,10,5,5,0.85,0.15
Logistic Regression,0.784,0.105,0.519,0.614,0.606,0.412,0.7,0.539,0.7,13,10,3,7,0.777,0.122
KNN,0.871,0.064,0.519,0.614,0.606,0.412,0.7,0.379,0.633,13,10,3,7,0.714,0.098
CatBoost,0.839,0.155,0.48,0.545,0.606,0.4,0.6,0.38,0.639,14,9,4,6,0.866,0.102


## Clinical features A

#### Subset
#### Split into train and test

In [433]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=list(range(1,61)))
y_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=list(range(1,61)))
y_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical A/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (122, 60)
y_train shape:	 (122, 1)
X_test shape:	 (33, 60)
y_test shape:	 (33, 1)


### Hyper-parameter optimisation

In [434]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=False, 
       knn=False, 
       random_forest=False, 
       svm=False, 
       catboost=True );  clear_output()

### Metrics tables

In [435]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.715,0.171,0.296,0.351,0.424,0.235,0.4,0.325,0.465,10,13,6,4,0.725,0.178
SVM,0.764,0.113,0.296,0.351,0.424,0.235,0.4,0.479,0.613,10,13,6,4,0.658,0.063
Logistic Regression,0.587,0.169,0.296,0.351,0.424,0.235,0.4,0.348,0.439,10,13,6,4,0.619,0.144
KNN,0.768,0.101,0.333,0.417,0.394,0.25,0.5,0.288,0.439,8,15,5,5,0.727,0.122
CatBoost,0.719,0.187,0.4,0.455,0.545,0.333,0.5,0.351,0.561,13,10,5,5,0.659,0.222


In [436]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [437]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [438]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [439]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.783,0.155,0.207,0.254,0.303,0.158,0.3,0.275,0.335,7,16,7,3,0.773,0.187
SVM,0.843,0.082,0.4,0.5,0.455,0.3,0.6,0.419,0.604,9,14,4,6,0.788,0.085
Logistic Regression,0.64,0.159,0.286,0.345,0.394,0.222,0.4,0.272,0.374,9,14,6,4,0.699,0.132
KNN,0.823,0.077,0.323,0.41,0.364,0.238,0.5,0.271,0.402,7,16,5,5,0.729,0.079
CatBoost,0.791,0.179,0.345,0.424,0.424,0.263,0.5,0.307,0.448,9,14,5,5,0.799,0.16


## Biomarkers B

### Subset
### Split into train and test

In [440]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=list(range(72,78)))
y_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=list(range(72,78)))
y_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers B/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 6)
y_train shape:	 (86, 1)
X_test shape:	 (29, 6)
y_test shape:	 (29, 1)


### Hyper-parameter optimisation

In [441]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [442]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.8,0.097,0.182,0.192,0.69,0.167,0.2,0.238,0.592,19,5,4,1,0.835,0.051
SVM,0.856,0.017,0.242,0.417,0.138,0.143,0.8,0.136,0.217,0,24,1,4,0.76,0.081
Logistic Regression,0.797,0.055,0.19,0.278,0.414,0.125,0.4,0.201,0.417,10,14,3,2,0.7,0.059
KNN,0.858,0.04,0.118,0.156,0.483,0.083,0.2,0.15,0.342,13,11,4,1,0.792,0.066
CatBoost,0.717,0.096,0.333,0.37,0.724,0.286,0.4,0.224,0.508,19,5,3,2,0.828,0.07


In [443]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

## Clinical features + Biomarkers B

#### Subset
#### Split into train and test

In [444]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=list(range(1,78)))
y_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=list(range(1,78)))
y_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 77)
y_train shape:	 (86, 1)
X_test shape:	 (29, 77)
y_test shape:	 (29, 1)


### Hyper-parameter optimisation

In [445]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [446]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.937,0.081,0.286,0.227,0.828,0.5,0.2,0.324,0.7,23,1,4,1,0.969,0.043
SVM,0.924,0.032,0.182,0.192,0.69,0.167,0.2,0.149,0.35,19,5,4,1,0.867,0.088
Logistic Regression,0.883,0.063,0.25,0.385,0.379,0.158,0.6,0.346,0.458,8,16,2,3,0.844,0.082
KNN,0.921,0.034,0.462,0.536,0.759,0.375,0.6,0.294,0.696,19,5,2,3,0.828,0.077
CatBoost,0.878,0.109,0.4,0.4,0.793,0.4,0.4,0.347,0.758,21,3,3,2,0.948,0.037


In [447]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [448]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [449]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [450]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.946,0.022,0.0,0.0,0.586,0.0,0.0,0.21,0.567,17,7,5,0,0.956,0.044
SVM,0.964,0.038,0.0,0.0,0.621,0.0,0.0,0.146,0.308,18,6,5,0,0.992,0.007
Logistic Regression,0.917,0.036,0.0,0.0,0.414,0.0,0.0,0.129,0.208,12,12,5,0,0.906,0.061
KNN,0.964,0.023,0.118,0.156,0.483,0.083,0.2,0.155,0.371,13,11,4,1,0.906,0.059
CatBoost,0.914,0.069,0.182,0.192,0.69,0.167,0.2,0.268,0.667,19,5,4,1,0.937,0.063


## Clinical features B

#### Subset
#### Split into train and test

In [451]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=list(range(1,72)))
y_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=list(range(1,72)))
y_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical B/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 71)
y_train shape:	 (86, 1)
X_test shape:	 (29, 71)
y_test shape:	 (29, 1)


### Hyper-parameter optimisation

In [452]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [453]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.943,0.055,0.25,0.217,0.793,0.333,0.2,0.455,0.758,22,2,4,1,0.961,0.035
SVM,0.927,0.06,0.0,0.0,0.69,0.0,0.0,0.213,0.558,20,4,5,0,0.967,0.033
Logistic Regression,0.872,0.025,0.385,0.61,0.448,0.238,1.0,0.281,0.667,8,16,0,5,0.839,0.114
KNN,0.934,0.032,0.167,0.185,0.655,0.143,0.2,0.164,0.475,18,6,4,1,0.826,0.077
CatBoost,0.891,0.115,0.333,0.37,0.724,0.286,0.4,0.327,0.75,19,5,3,2,0.943,0.053


In [454]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [455]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [456]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [457]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.933,0.038,0.2,0.2,0.724,0.2,0.2,0.273,0.683,20,4,4,1,0.958,0.035
SVM,0.977,0.014,0.0,0.0,0.517,0.0,0.0,0.151,0.342,15,9,5,0,0.986,0.028
Logistic Regression,0.934,0.03,0.0,0.0,0.379,0.0,0.0,0.13,0.217,11,13,5,0,0.836,0.111
KNN,0.96,0.025,0.143,0.172,0.586,0.111,0.2,0.151,0.4,16,8,4,1,0.893,0.07
CatBoost,0.911,0.078,0.2,0.2,0.724,0.2,0.2,0.242,0.625,20,4,4,1,0.937,0.062


## Biomarkers C

### Subset
### Split into train and test

In [458]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=list(range(101,106)))
y_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=list(range(101,106)))
y_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers C/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (146, 5)
y_train shape:	 (146, 1)
X_test shape:	 (43, 5)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [459]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [460]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.893,0.055,0.222,0.208,0.837,0.25,0.2,0.21,0.532,35,3,4,1,0.906,0.079
SVM,0.897,0.038,0.2,0.286,0.628,0.133,0.4,0.171,0.558,25,13,3,2,0.843,0.026
Logistic Regression,0.624,0.048,0.235,0.312,0.698,0.167,0.4,0.232,0.705,28,10,3,2,0.724,0.106
KNN,0.875,0.061,0.308,0.357,0.791,0.25,0.4,0.238,0.713,32,6,3,2,0.889,0.05
CatBoost,0.907,0.016,0.0,0.0,0.814,0.0,0.0,0.145,0.532,35,3,5,0,0.903,0.069


In [461]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

## Clinical features + Biomarkers C

#### Subset
#### Split into train and test

In [462]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=list(range(1,106)))
y_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=list(range(1,106)))
y_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (146, 105)
y_train shape:	 (146, 1)
X_test shape:	 (43, 105)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [463]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [464]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.995,0.006,0.0,0.0,0.86,0.0,0.0,0.352,0.758,37,1,5,0,1.0,0.0
SVM,0.956,0.037,0.333,0.37,0.814,0.286,0.4,0.405,0.626,33,5,3,2,0.917,0.099
Logistic Regression,0.92,0.068,0.4,0.5,0.791,0.3,0.6,0.318,0.7,31,7,2,3,0.924,0.061
KNN,0.965,0.018,0.154,0.179,0.744,0.125,0.2,0.116,0.492,31,7,4,1,0.912,0.044
CatBoost,0.965,0.032,0.444,0.417,0.884,0.5,0.4,0.497,0.726,36,2,3,2,0.995,0.011


In [465]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [466]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/top features/"

### Hyper-parameter optimisation

In [467]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [468]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.995,0.007,0.0,0.0,0.814,0.0,0.0,0.303,0.8,35,3,5,0,1.0,0.0
SVM,0.978,0.016,0.25,0.217,0.86,0.333,0.2,0.18,0.526,36,2,4,1,0.991,0.015
Logistic Regression,0.913,0.116,0.154,0.179,0.744,0.125,0.2,0.155,0.516,31,7,4,1,0.868,0.083
KNN,0.963,0.021,0.167,0.185,0.767,0.143,0.2,0.193,0.7,32,6,4,1,0.904,0.054
CatBoost,0.978,0.026,0.429,0.517,0.814,0.333,0.6,0.351,0.758,32,6,2,3,0.992,0.016


## Clinical features C

#### Subset
#### Split into train and test

In [469]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=list(range(1,101)))
y_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=list(range(1,101)))
y_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical C/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (146, 100)
y_train shape:	 (146, 1)
X_test shape:	 (43, 100)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [470]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [471]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.995,0.007,0.0,0.0,0.86,0.0,0.0,0.435,0.821,37,1,5,0,1.0,0.0
SVM,0.95,0.06,0.222,0.208,0.837,0.25,0.2,0.302,0.679,35,3,4,1,0.987,0.023
Logistic Regression,0.935,0.067,0.429,0.517,0.814,0.333,0.6,0.243,0.695,32,6,2,3,0.919,0.083
KNN,0.968,0.024,0.167,0.185,0.767,0.143,0.2,0.114,0.416,32,6,4,1,0.918,0.061
CatBoost,0.97,0.024,0.222,0.208,0.837,0.25,0.2,0.384,0.742,35,3,4,1,0.996,0.007


In [472]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [473]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/top features/"

### Hyper-parameter optimisation

In [474]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [475]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.987,0.027,0.25,0.217,0.86,0.333,0.2,0.357,0.779,36,2,4,1,0.999,0.002
SVM,0.981,0.02,0.444,0.417,0.884,0.5,0.4,0.44,0.653,36,2,3,2,0.997,0.004
Logistic Regression,0.911,0.139,0.286,0.345,0.767,0.222,0.4,0.236,0.595,31,7,3,2,0.913,0.049
KNN,0.969,0.024,0.235,0.312,0.698,0.167,0.4,0.143,0.587,28,10,3,2,0.916,0.07
CatBoost,0.973,0.028,0.333,0.37,0.814,0.286,0.4,0.45,0.768,33,5,3,2,0.991,0.018


## Clinical features A-B-C

#### Subset
#### Split into train and test

In [476]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_abc, header=[0], usecols=list(range(1,57)))
y_train = pd.read_excel(link_train_revascularization_abc, header=[0], usecols=[57])
X_test  = pd.read_excel(link_test_revascularization_abc, header=[0], usecols=list(range(1,57)))
y_test  = pd.read_excel(link_test_revascularization_abc, header=[0], usecols=[57])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical ABC/all clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (390, 56)
y_train shape:	 (390, 1)
X_test shape:	 (87, 56)
y_test shape:	 (87, 1)


### Hyper-parameter optimisation

In [477]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [478]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.845,0.097,0.229,0.211,0.69,0.267,0.2,0.34,0.636,56,11,16,4,0.91,0.069
SVM,0.904,0.056,0.372,0.388,0.69,0.348,0.4,0.221,0.475,52,15,12,8,0.869,0.086
Logistic Regression,0.703,0.127,0.36,0.409,0.632,0.3,0.45,0.3,0.569,46,21,11,9,0.77,0.071
KNN,0.901,0.025,0.375,0.417,0.655,0.321,0.45,0.271,0.583,48,19,11,9,0.782,0.024
CatBoost,0.865,0.108,0.25,0.217,0.724,0.333,0.2,0.386,0.635,59,8,16,4,0.937,0.058


In [479]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

#### subset

In [480]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical ABC/top features/"

### Hyper-parameter optimisation

In [481]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [482]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.845,0.081,0.4,0.4,0.724,0.4,0.4,0.422,0.682,55,12,12,8,0.906,0.058
SVM,0.893,0.038,0.393,0.474,0.609,0.306,0.55,0.359,0.633,42,25,9,11,0.78,0.048
Logistic Regression,0.657,0.072,0.408,0.459,0.667,0.345,0.5,0.368,0.595,48,19,10,10,0.706,0.061
KNN,0.911,0.027,0.353,0.405,0.621,0.29,0.45,0.257,0.561,45,22,11,9,0.828,0.031
CatBoost,0.843,0.137,0.378,0.361,0.736,0.412,0.35,0.44,0.675,57,10,13,7,0.914,0.065


## Results

### Scores of models

In [483]:
# # what metric to evaluate
# # F2 F1 ROC_AUC
# metrics = ['F2', 'ROC_AUC']

# for metric in metrics:
#     # list of paths
#     paths = [
#             './HSE project/Optimisation data/revascularization/Biomarkers A/all biomarkers/',
#             './HSE project/Optimisation data/revascularization/Biomarkers B/all biomarkers/',
#             './HSE project/Optimisation data/revascularization/Biomarkers C/all biomarkers/',
#             './HSE project/Optimisation data/revascularization/Clinical ABC/all clinical/',
#             './HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
#             './HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
#             './HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/',
#             './HSE project/Optimisation data/revascularization/Clinical A/all biomarkers and clinical/',
#             './HSE project/Optimisation data/revascularization/Clinical B/all biomarkers and clinical/',
#             './HSE project/Optimisation data/revascularization/Clinical C/all biomarkers and clinical/',
#             ]

#     # create datframe for scores
#     datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
#     datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
#     datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])

#     # get dataframe with with scores of models from different datasets
#     for i in range(len(paths)):
#         table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#         datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) 
#         if metric == 'F2':
#             datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3)) 
#             datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2, train set, cv=5', 'std')].values.round(3)) 
#         if metric == 'ROC_AUC':
#             datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('ROC_AUC, train set, cv=5', 'mean')].values.round(3)) 
#             datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('ROC_AUC, train set, cv=5', 'std')].values.round(3)) 

#     # list of models
#     # standart models
#     models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
#     # ensemble models
#     # models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

#     # create the graph
#     fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns])
        
#     # add error whiskers from gridsearchCV
#     if True:
#         fig.add_traces([go.Box(name=column, x=models, 
#                               y=datasets_mean[column], 
#                               #  xaxis="x1",  
                              
#                               marker=dict(color="black"), 
#                               showlegend = False) for column in datasets.columns])
#         fig.update_traces(
#         selector=dict(type="box"), # update only boxes
#         boxpoints="all", # show points
#         pointpos=0, # centered
#         jitter=0, # no jitter
#         line_color="rgba(255,255,255,0)", # hide box lines
#         fillcolor="rgba(255,255,255,0)", # hide box fill
        
#         )
#         fig.update_layout(boxmode="group",)

#         fig.add_traces([go.Bar(name=column, x=models, 
#                               y=datasets_mean[column], 
#                               xaxis="x2",  
#                               error_y=dict(type='data',  
#                                             array=datasets_std[column], 
#                                             color="rgba(0,0,0,1)",
#                                             thickness=1), 
#                               marker=dict(opacity=0,
#                                           #  color="rgba(255,255,255,0)"
#                                           ), 
#                               showlegend = False) for column in datasets.columns])  
        

#     fig.update_xaxes(title='Models')
#     # Change the bar mode
#     fig.update_layout(barmode='group', 
#                       xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False},
#                       bargap=0.30,
#                       bargroupgap=0.3,
#                       legend=dict(orientation="v", title='Datasets'), 
#                       title=dict(text=f'{metric} score', x=0.5,),
#                       margin=dict(l=60, r=20, t=60, b=40),)

#     fig.update_yaxes(title='Score', range=[0., 1.0])


#     # add dotted line for ROC AUC = 0.5
#     if metric == 'ROC_AUC':
#         fig.add_shape(type='line',
#                         x0=-0.5,
#                         y0=0.5,
#                         x1=4.5,
#                         y1=0.5,
#                         line=dict(color='firebrick',  width=2, dash='dot'),
#                         xref='x',
#                         yref='y')   
#     # figure size
#     fig.update_layout(
#         autosize=False,
#         width=1300,
#         height=450,) 
#     # fig.write_image(f"{results_path}metrics.pdf", engine="kaleido")
#     fig.show(renderer='colab')
#     fig.write_image(f"{results_path}{metric}.pdf", engine="kaleido")

### Compare with Top 10

In [484]:
# # what metric to evaluate
# # ROC_AUC F1 F2
# metric = 'ROC_AUC'

# # list of paths
# paths = [
#         './HSE project/Optimisation data/revascularization/Biomarkers A/all biomarkers/',
#         './HSE project/Optimisation data/revascularization/Biomarkers B/all biomarkers/',
#         './HSE project/Optimisation data/revascularization/Biomarkers C/all biomarkers/',
#         './HSE project/Optimisation data/revascularization/Clinical ABC/all clinical/',
#         './HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
#         './HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
#         './HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/'
#         ]

# # list of paths of top 10
# paths_top = [
#         './HSE project/Optimisation data/revascularization/Biomarkers A/biomarkers top features/',
#         './HSE project/Optimisation data/revascularization/Biomarkers B/biomarkers top features/',
#         './HSE project/Optimisation data/revascularization/Biomarkers C/biomarkers top features/',
#         './HSE project/Optimisation data/revascularization/Clinical ABC/top features/',
#         './HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/top features/',
#         './HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/top features/',
#         './HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/top features/'
#         ]

# # create datframe for scores
# datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
# datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
# datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])

# # get dataframe with with scores of models from different datasets
# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) 
#     table_top = pd.read_excel(f'{paths_top[i]}metrics_table.xlsx', header=[0,1], index_col=[0])
#     datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) + \
#                                     list(table_top.loc[:, ('Scores on the test set', metric)].values.round(3))
#     if metric == 'F2':
#         datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3)) + list(table_top.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3))
#         datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3)) + list(table_top.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3))


# # list of models
# # standart models
# models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost'] + ['RandomForest top 10', 'SVM top 10', 'Logistic Regression top 10', 'KNN top 10', 'CatBoost top 10']


# # create the graph
# fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# # add error whiskers from gridsearchCV
# if metric == 'F2':
#     fig.add_traces([go.Box(name=column, x=models, 
#                            y=datasets_mean[column], 
#                           #  xaxis="x1",  
                           
#                            marker=dict(color="black"), 
#                            showlegend = False) for column in datasets.columns])
#     fig.update_traces(
#     selector=dict(type="box"), # update only boxes
#     boxpoints="all", # show points
#     pointpos=0, # centered
#     jitter=0, # no jitter
#     line_color="rgba(255,255,255,0)", # hide box lines
#     fillcolor="rgba(255,255,255,0)", # hide box fill
    
#     )
#     fig.update_layout(boxmode="group",)

#     fig.add_traces([go.Bar(name=column, x=models, 
#                            y=datasets_mean[column], 
#                            xaxis="x2",  
#                            error_y=dict(type='data',  
#                                         array=datasets_std[column], 
#                                         color="rgba(0,0,0,1)",
#                                         thickness=1), 
#                            marker=dict(opacity=0,
#                                       #  color="rgba(255,255,255,0)"
#                                        ), 
#                            showlegend = False) for column in datasets.columns])  
    
# # Change the bar mode
# fig.update_xaxes(title='Models')
# fig.update_yaxes(title='Score', range=[0., 1.0])
# fig.update_layout(barmode='group', 
#                   xaxis2={"overlaying": "x", "range": [-0.525, 9.525], "showticklabels": False},
#                   bargap=0.30,
#                   bargroupgap=0.3,
#                   legend=dict(orientation="v", title='Datasets'), 
#                   title=dict(text=f'{metric} score', x=0.5,),
#                   margin=dict(l=60, r=20, t=60, b=40),)

# # add dotted line for ROC AUC = 0.5
# if metric == 'ROC_AUC':
#     fig.add_shape(type='line',
#                     x0=-0.5,
#                     y0=0.5,
#                     x1=9.5,
#                     y1=0.5,
#                     line=dict(color='firebrick',  width=2, dash='dot'),
#                     xref='x',
#                     yref='y')   
# # figure size
# fig.update_layout(
#     autosize=False,
#     width=1300,
#     height=450,) 
# fig.show(renderer='colab')
# fig.write_image(f"{results_path}top_metrics.pdf", engine="kaleido")

### Feature selection

In [485]:
# features = pd.DataFrame(columns=['features', 
#                                  'biomarkers A',
#                                 #  'biomarkers B',
#                                 #  'biomarkers C',
#                                  'Clinical+biomarkers ABC',
#                                  'Clinical+biomarkers A',
#                                 #  'Clinical+biomarkers B',
#                                 #  'Clinical+biomarkers C',
#                                  ])
# # list of paths
# paths = [
#         './HSE project/Optimisation data/revascularization/Biomarkers A/all biomarkers/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers B/all biomarkers/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers C/all biomarkers/',
#         './HSE project/Optimisation data/revascularization/Clinical ABC/all clinical/',
#         './HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/'
#         ]

# # get dataframe with with scores of models from different datasets
# top_features = []
# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     if i < 1: 
#         top_features = top_features+list(str(col) for col in table.index[:10])
#     else:    
#         top_features = top_features+list(eval(col)[1] for col in table.index[:10])

# features['features'] = list(set(top_features))
# features.index = list(set(top_features))
# features.fillna(0, inplace=True)

# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     if i < 1: 
#         features.loc[list(str(col) for col in table.index[:10]), features.columns[i+1]] = 1
#     else:    
#         # top_features = top_features+list(eval(col) for col in table.index[:10])   
#         features.loc[list(eval(col)[1] for col in table.index[:10]), features.columns[i+1]] = 1 

# features['features'] = features.iloc[:,1:].apply((lambda x: x.sum()), axis=1)
# features.sort_values(ascending=False,  inplace=True, by=("features"))
# features.columns = ['sum'] + list(features.columns[1:])
# features.to_excel('./HSE project/Optimisation data/revascularization/feature_selection.xlsx')
# features

# **Target**: Combined

In [293]:
results_path = './HSE project/Graphics/combined/'

## Biomarkers A

### Subset
### Split into train and test

In [294]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_a, header=[0], usecols=list(range(61,147)))
y_train = pd.read_excel(link_train_combined_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=list(range(61,147)))
y_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers A/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (192, 86)
y_train shape:	 (192, 1)
X_test shape:	 (50, 86)
y_test shape:	 (50, 1)


### Hyper-parameter optimisation

In [68]:
tuning(
       score='f1', #my_f2_scorer() 'f1'
       catboost_score='F:beta=1', #'F:beta=2' 'F:beta=1'
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics table

In [295]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.791,0.032,0.645,0.699,0.56,0.571,0.741,0.626,0.575,8,15,7,20,0.865,0.113
SVM,0.741,0.081,0.656,0.704,0.58,0.588,0.741,0.634,0.641,9,14,7,20,0.825,0.089
Logistic Regression,0.691,0.064,0.724,0.755,0.68,0.677,0.778,0.647,0.667,13,10,6,21,0.67,0.094
KNN,0.57,0.073,0.383,0.352,0.42,0.45,0.333,0.51,0.428,12,11,18,9,0.682,0.021
CatBoost,0.715,0.104,0.576,0.607,0.5,0.531,0.63,0.602,0.593,8,15,10,17,0.828,0.146


In [296]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [71]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers A/biomarkers top features/"

### Hyper-parameter optimisation

In [72]:
tuning(
       score='f1', #my_f2_scorer()
       catboost_score='F:beta=1', #'F:beta=2'
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [73]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
# metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.815,0.069,0.576,0.607,0.5,0.531,0.63,0.548,0.507,8,15,10,17,0.883,0.086
SVM,0.809,0.058,0.655,0.683,0.6,0.613,0.704,0.661,0.655,11,12,8,19,0.85,0.062
Logistic Regression,0.658,0.083,0.667,0.688,0.62,0.633,0.704,0.742,0.66,12,11,8,19,0.678,0.043
KNN,0.619,0.094,0.449,0.423,0.46,0.5,0.407,0.524,0.465,12,11,16,11,0.666,0.055
CatBoost,0.773,0.093,0.59,0.634,0.5,0.529,0.667,0.55,0.514,7,16,9,18,0.858,0.121


## Clinical features + Biomarkers A

#### Subset
#### Split into train and test

In [297]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_a, header=[0], usecols=list(range(1,147)))
y_train = pd.read_excel(link_train_combined_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=list(range(1,147)))
y_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers A + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (192, 146)
y_train shape:	 (192, 1)
X_test shape:	 (50, 146)
y_test shape:	 (50, 1)


### Hyper-parameter optimisation

In [75]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [298]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.815,0.054,0.746,0.786,0.7,0.688,0.815,0.657,0.671,13,10,5,22,0.86,0.118
SVM,0.74,0.061,0.721,0.775,0.66,0.647,0.815,0.672,0.663,11,12,5,22,0.859,0.08
Logistic Regression,0.674,0.023,0.772,0.797,0.74,0.733,0.815,0.717,0.746,15,8,5,22,0.712,0.066
KNN,0.626,0.097,0.372,0.323,0.46,0.5,0.296,0.528,0.474,15,8,19,8,0.708,0.06
CatBoost,0.726,0.094,0.69,0.719,0.64,0.645,0.741,0.707,0.688,12,11,7,20,0.867,0.109


In [299]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [78]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [79]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [80]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.829,0.067,0.689,0.739,0.62,0.618,0.778,0.589,0.58,10,13,6,21,0.898,0.079
SVM,0.762,0.036,0.61,0.643,0.54,0.562,0.667,0.694,0.586,9,14,9,18,0.867,0.068
Logistic Regression,0.693,0.059,0.667,0.688,0.62,0.633,0.704,0.637,0.659,12,11,8,19,0.69,0.052
KNN,0.612,0.073,0.52,0.496,0.52,0.565,0.481,0.552,0.523,13,10,14,13,0.687,0.035
CatBoost,0.794,0.108,0.654,0.639,0.64,0.68,0.63,0.641,0.668,15,8,10,17,0.9,0.076


## Clinical features A

#### Subset
#### Split into train and test

In [300]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_a, header=[0], usecols=list(range(1,61)))
y_train = pd.read_excel(link_train_combined_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=list(range(1,61)))
y_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Clinical A/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (192, 60)
y_train shape:	 (192, 1)
X_test shape:	 (50, 60)
y_test shape:	 (50, 1)


### Hyper-parameter optimisation

In [82]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [301]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.756,0.041,0.712,0.75,0.66,0.656,0.778,0.669,0.647,12,11,6,21,0.822,0.092
SVM,0.705,0.054,0.618,0.625,0.58,0.607,0.63,0.614,0.638,12,11,10,17,0.759,0.09
Logistic Regression,0.685,0.031,0.63,0.63,0.6,0.63,0.63,0.697,0.673,13,10,10,17,0.732,0.101
KNN,0.668,0.054,0.571,0.584,0.52,0.552,0.593,0.547,0.514,10,13,11,16,0.704,0.081
CatBoost,0.702,0.092,0.702,0.725,0.66,0.667,0.741,0.665,0.663,13,10,7,20,0.793,0.123


In [302]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [85]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [86]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [87]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.758,0.069,0.655,0.683,0.6,0.613,0.704,0.58,0.575,11,12,8,19,0.851,0.071
SVM,0.707,0.082,0.655,0.662,0.62,0.643,0.667,0.627,0.62,13,10,9,18,0.794,0.11
Logistic Regression,0.625,0.068,0.612,0.577,0.62,0.682,0.556,0.65,0.612,16,7,12,15,0.663,0.065
KNN,0.676,0.065,0.44,0.42,0.44,0.478,0.407,0.515,0.443,11,12,16,11,0.725,0.076
CatBoost,0.731,0.092,0.607,0.62,0.56,0.586,0.63,0.626,0.573,11,12,10,17,0.81,0.089


## Biomarkers B

### Subset
### Split into train and test

In [303]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_b, header=[0], usecols=list(range(72,78)))
y_train = pd.read_excel(link_train_combined_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=list(range(72,78)))
y_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers B/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 6)
y_train shape:	 (86, 1)
X_test shape:	 (32, 6)
y_test shape:	 (32, 1)


### Hyper-parameter optimisation

In [89]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [304]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.769,0.026,0.286,0.244,0.688,0.4,0.222,0.295,0.396,20,3,7,2,0.772,0.095
SVM,0.669,0.082,0.462,0.566,0.562,0.353,0.667,0.366,0.662,12,11,3,6,0.64,0.168
Logistic Regression,0.7,0.034,0.381,0.417,0.594,0.333,0.444,0.407,0.662,15,8,5,4,0.652,0.057
KNN,0.721,0.055,0.353,0.341,0.656,0.375,0.333,0.325,0.449,18,5,6,3,0.769,0.079
CatBoost,0.728,0.135,0.222,0.222,0.562,0.222,0.222,0.347,0.333,16,7,7,2,0.738,0.062


In [305]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

## Clinical features + Biomarkers B

#### Subset
#### Split into train and test

In [306]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_b, header=[0], usecols=list(range(1,78)))
y_train = pd.read_excel(link_train_combined_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=list(range(1,78)))
y_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers B + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 77)
y_train shape:	 (86, 1)
X_test shape:	 (32, 77)
y_test shape:	 (32, 1)


### Hyper-parameter optimisation

In [93]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [307]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.909,0.027,0.0,0.0,0.594,0.0,0.0,0.268,0.444,19,4,9,0,0.951,0.031
SVM,0.887,0.036,0.143,0.122,0.625,0.2,0.111,0.335,0.527,19,4,8,1,0.933,0.041
Logistic Regression,0.818,0.078,0.3,0.319,0.562,0.273,0.333,0.256,0.411,15,8,6,3,0.86,0.09
KNN,0.878,0.066,0.125,0.116,0.562,0.143,0.111,0.266,0.425,17,6,8,1,0.863,0.073
CatBoost,0.86,0.063,0.267,0.238,0.656,0.333,0.222,0.358,0.556,19,4,7,2,0.947,0.039


In [308]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [96]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [97]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [98]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.917,0.071,0.143,0.122,0.625,0.2,0.111,0.27,0.435,19,4,8,1,0.964,0.046
SVM,0.865,0.08,0.0,0.0,0.5,0.0,0.0,0.215,0.285,16,7,9,0,0.846,0.089
Logistic Regression,0.762,0.045,0.0,0.0,0.438,0.0,0.0,0.243,0.377,14,9,9,0,0.827,0.039
KNN,0.854,0.094,0.118,0.114,0.531,0.125,0.111,0.243,0.365,16,7,8,1,0.851,0.106
CatBoost,0.89,0.044,0.25,0.233,0.625,0.286,0.222,0.304,0.44,18,5,7,2,0.94,0.034


## Clinical features B

#### Subset
#### Split into train and test

In [309]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_b, header=[0], usecols=list(range(1,72)))
y_train = pd.read_excel(link_train_combined_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=list(range(1,72)))
y_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Clinical B/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 71)
y_train shape:	 (86, 1)
X_test shape:	 (32, 71)
y_test shape:	 (32, 1)


### Hyper-parameter optimisation

In [100]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [310]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.846,0.094,0.235,0.227,0.594,0.25,0.222,0.292,0.481,17,6,7,2,0.927,0.062
SVM,0.882,0.036,0.286,0.244,0.688,0.4,0.222,0.283,0.444,20,3,7,2,0.956,0.036
Logistic Regression,0.814,0.063,0.111,0.111,0.5,0.111,0.111,0.248,0.401,15,8,8,1,0.861,0.106
KNN,0.881,0.04,0.118,0.114,0.531,0.125,0.111,0.287,0.5,16,7,8,1,0.888,0.04
CatBoost,0.819,0.065,0.2,0.213,0.5,0.182,0.222,0.34,0.406,14,9,7,2,0.93,0.02


In [311]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [103]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [104]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [105]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.852,0.086,0.235,0.227,0.594,0.25,0.222,0.276,0.449,17,6,7,2,0.899,0.063
SVM,0.832,0.08,0.4,0.426,0.625,0.364,0.444,0.359,0.57,16,7,5,4,0.869,0.076
Logistic Regression,0.735,0.031,0.3,0.319,0.562,0.273,0.333,0.308,0.483,15,8,6,3,0.78,0.057
KNN,0.862,0.055,0.4,0.426,0.625,0.364,0.444,0.318,0.57,16,7,5,4,0.836,0.069
CatBoost,0.827,0.08,0.105,0.109,0.469,0.1,0.111,0.237,0.367,14,9,8,1,0.907,0.024


## Biomarkers C

### Subset
### Split into train and test

In [312]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_c, header=[0], usecols=list(range(101,106)))
y_train = pd.read_excel(link_train_combined_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=list(range(101,106)))
y_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers C/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (120, 5)
y_train shape:	 (120, 1)
X_test shape:	 (43, 5)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [313]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.725,0.068,0.545,0.612,0.767,0.462,0.667,0.431,0.725,27,7,3,6,0.726,0.083
SVM,0.724,0.061,0.435,0.5,0.698,0.357,0.556,0.42,0.686,25,9,4,5,0.693,0.117
Logistic Regression,0.677,0.064,0.452,0.603,0.605,0.318,0.778,0.684,0.817,19,15,2,7,0.704,0.1
KNN,0.709,0.047,0.273,0.306,0.628,0.231,0.333,0.216,0.52,24,10,6,3,0.683,0.111
CatBoost,0.719,0.113,0.381,0.417,0.698,0.333,0.444,0.255,0.595,26,8,5,4,0.785,0.101


In [314]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

## Clinical features + Biomarkers C

#### Subset
#### Split into train and test

In [315]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_c, header=[0], usecols=list(range(1,106)))
y_train = pd.read_excel(link_train_combined_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=list(range(1,106)))
y_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers C + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (120, 105)
y_train shape:	 (120, 1)
X_test shape:	 (43, 105)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [111]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [316]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.871,0.105,0.375,0.349,0.767,0.429,0.333,0.523,0.735,30,4,6,3,0.926,0.095
SVM,0.877,0.086,0.632,0.652,0.837,0.6,0.667,0.645,0.789,30,4,3,6,0.958,0.043
Logistic Regression,0.881,0.084,0.522,0.6,0.744,0.429,0.667,0.445,0.752,26,8,3,6,0.932,0.052
KNN,0.872,0.041,0.357,0.455,0.581,0.263,0.556,0.239,0.572,20,14,4,5,0.85,0.057
CatBoost,0.868,0.101,0.375,0.349,0.767,0.429,0.333,0.53,0.765,30,4,6,3,0.931,0.062


In [317]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [114]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers C + Clinical/top features/"

### Hyper-parameter optimisation

In [115]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [116]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.898,0.049,0.353,0.341,0.744,0.375,0.333,0.406,0.708,29,5,6,3,0.929,0.074
SVM,0.942,0.057,0.4,0.357,0.791,0.5,0.333,0.435,0.701,31,3,6,3,0.979,0.019
Logistic Regression,0.846,0.099,0.348,0.4,0.651,0.286,0.444,0.427,0.637,24,10,5,4,0.914,0.052
KNN,0.897,0.037,0.455,0.51,0.721,0.385,0.556,0.307,0.66,26,8,4,5,0.883,0.049
CatBoost,0.887,0.045,0.471,0.455,0.791,0.5,0.444,0.427,0.709,30,4,5,4,0.943,0.038


## Clinical features C

#### Subset
#### Split into train and test

In [318]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_c, header=[0], usecols=list(range(1,101)))
y_train = pd.read_excel(link_train_combined_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=list(range(1,101)))
y_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Clinical C/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (120, 100)
y_train shape:	 (120, 1)
X_test shape:	 (43, 100)
y_test shape:	 (43, 1)


In [319]:
X_train.isna().sum().sum()

0

### Hyper-parameter optimisation

In [128]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [320]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.857,0.119,0.444,0.444,0.767,0.444,0.444,0.385,0.686,29,5,5,4,0.924,0.096
SVM,0.885,0.086,0.5,0.532,0.767,0.455,0.556,0.568,0.74,28,6,4,5,0.957,0.05
Logistic Regression,0.838,0.131,0.476,0.521,0.744,0.417,0.556,0.418,0.742,27,7,4,5,0.915,0.07
KNN,0.853,0.035,0.24,0.288,0.558,0.188,0.333,0.202,0.475,21,13,6,3,0.825,0.049
CatBoost,0.83,0.121,0.476,0.521,0.744,0.417,0.556,0.531,0.758,27,7,4,5,0.914,0.103


In [321]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [131]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/combined/Clinical C/top features/"

### Hyper-parameter optimisation

In [132]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [133]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.874,0.1,0.333,0.333,0.721,0.333,0.333,0.444,0.699,28,6,6,3,0.919,0.091
SVM,0.95,0.061,0.571,0.625,0.791,0.5,0.667,0.481,0.755,28,6,3,6,0.981,0.024
Logistic Regression,0.857,0.117,0.429,0.545,0.628,0.316,0.667,0.399,0.647,21,13,3,6,0.91,0.073
KNN,0.9,0.063,0.348,0.4,0.651,0.286,0.444,0.243,0.575,24,10,5,4,0.883,0.081
CatBoost,0.834,0.099,0.381,0.417,0.698,0.333,0.444,0.488,0.752,26,8,5,4,0.926,0.09


## Clinical features A-B-C

#### Subset
#### Split into train and test

In [322]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_abc, header=[0], usecols=list(range(1,57)))
y_train = pd.read_excel(link_train_combined_abc, header=[0], usecols=[57])
X_test  = pd.read_excel(link_test_combined_abc, header=[0], usecols=list(range(1,57)))
y_test  = pd.read_excel(link_test_combined_abc, header=[0], usecols=[57])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Clinical ABC/all clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (358, 56)
y_train shape:	 (358, 1)
X_test shape:	 (106, 56)
y_test shape:	 (106, 1)


### Hyper-parameter optimisation

In [135]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [323]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.716,0.079,0.652,0.636,0.698,0.682,0.625,0.686,0.761,44,14,18,30,0.739,0.083
SVM,0.692,0.083,0.517,0.494,0.594,0.561,0.479,0.45,0.497,40,18,25,23,0.684,0.054
Logistic Regression,0.65,0.083,0.562,0.536,0.632,0.61,0.521,0.635,0.67,42,16,23,25,0.691,0.053
KNN,0.715,0.078,0.612,0.62,0.642,0.6,0.625,0.569,0.642,38,20,18,30,0.692,0.121
CatBoost,0.708,0.079,0.617,0.609,0.66,0.63,0.604,0.618,0.707,41,17,19,29,0.765,0.105


In [324]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

#### subset

In [138]:
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

optimisation_path = "./HSE project/Optimisation data/combined/Clinical ABC/top features/"

### Hyper-parameter optimisation

In [139]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [140]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.737,0.057,0.659,0.638,0.708,0.698,0.625,0.702,0.744,45,13,18,30,0.773,0.059
SVM,0.683,0.045,0.592,0.599,0.623,0.58,0.604,0.431,0.446,37,21,19,29,0.675,0.055
Logistic Regression,0.661,0.046,0.538,0.527,0.594,0.556,0.521,0.604,0.654,38,20,23,25,0.661,0.058
KNN,0.704,0.052,0.612,0.62,0.642,0.6,0.625,0.591,0.658,38,20,18,30,0.718,0.089
CatBoost,0.696,0.082,0.584,0.558,0.651,0.634,0.542,0.592,0.688,43,15,22,26,0.761,0.074


## Results

### Scores of models

In [161]:
# what metric to evaluate
# F2 F1 ROC_AUC
metrics = ['F2', 'ROC_AUC']

for metric in metrics:
    # list of paths
    paths = [
            './HSE project/Optimisation data/combined/Biomarkers A/all biomarkers/',
            './HSE project/Optimisation data/combined/Biomarkers B/all biomarkers/',
            './HSE project/Optimisation data/combined/Biomarkers C/all biomarkers/',
            './HSE project/Optimisation data/combined/Clinical ABC/all clinical/',
            './HSE project/Optimisation data/combined/Biomarkers A + Clinical/all biomarkers and clinical/',
            './HSE project/Optimisation data/combined/Biomarkers B + Clinical/all biomarkers and clinical/',
            './HSE project/Optimisation data/combined/Biomarkers C + Clinical/all biomarkers and clinical/',
            './HSE project/Optimisation data/combined/Clinical A/all biomarkers and clinical/',
            './HSE project/Optimisation data/combined/Clinical B/all biomarkers and clinical/',
            './HSE project/Optimisation data/combined/Clinical C/all biomarkers and clinical/',
            ]

    # create datframe for scores
    datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
    datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
    datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])

    # get dataframe with with scores of models from different datasets
    for i in range(len(paths)):
        table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
        datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) 
        if metric == 'F2':
            datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3)) 
            datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2, train set, cv=5', 'std')].values.round(3)) 
        if metric == 'ROC_AUC':
            datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('ROC_AUC, train set, cv=5', 'mean')].values.round(3)) 
            datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('ROC_AUC, train set, cv=5', 'std')].values.round(3)) 

    # list of models
    # standart models
    models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
    # ensemble models
    # models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

    # create the graph
    fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns])
        
    # add error whiskers from gridsearchCV
    if True:
        fig.add_traces([go.Box(name=column, x=models, 
                              y=datasets_mean[column], 
                              #  xaxis="x1",  
                              
                              marker=dict(color="black"), 
                              showlegend = False) for column in datasets.columns])
        fig.update_traces(
        selector=dict(type="box"), # update only boxes
        boxpoints="all", # show points
        pointpos=0, # centered
        jitter=0, # no jitter
        line_color="rgba(255,255,255,0)", # hide box lines
        fillcolor="rgba(255,255,255,0)", # hide box fill
        
        )
        fig.update_layout(boxmode="group",)

        fig.add_traces([go.Bar(name=column, x=models, 
                              y=datasets_mean[column], 
                              xaxis="x2",  
                              error_y=dict(type='data',  
                                            array=datasets_std[column], 
                                            color="rgba(0,0,0,1)",
                                            thickness=1), 
                              marker=dict(opacity=0,
                                          #  color="rgba(255,255,255,0)"
                                          ), 
                              showlegend = False) for column in datasets.columns])  
        

    fig.update_xaxes(title='Models')
    # Change the bar mode
    fig.update_layout(barmode='group', 
                      xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False},
                      bargap=0.30,
                      bargroupgap=0.3,
                      legend=dict(orientation="v", title='Datasets'), 
                      title=dict(text=f'{metric} score', x=0.5,),
                      margin=dict(l=60, r=20, t=60, b=40),)

    fig.update_yaxes(title='Score', range=[0., 1.0])


    # add dotted line for ROC AUC = 0.5
    if metric == 'ROC_AUC':
        fig.add_shape(type='line',
                        x0=-0.5,
                        y0=0.5,
                        x1=4.5,
                        y1=0.5,
                        line=dict(color='firebrick',  width=2, dash='dot'),
                        xref='x',
                        yref='y')   
    # figure size
    fig.update_layout(
        autosize=False,
        width=1300,
        height=450,) 
    fig.show(renderer='colab')
    fig.write_image(f"{results_path}{metric}.pdf", engine="kaleido")

### Compare with Top 10

In [159]:
# what metric to evaluate
# ROC_AUC F1 F2
metric = 'ROC_AUC'

# list of paths
paths = [
        './HSE project/Optimisation data/combined/Biomarkers A/all biomarkers/',
        './HSE project/Optimisation data/combined/Biomarkers B/all biomarkers/',
        './HSE project/Optimisation data/combined/Biomarkers C/all biomarkers/',
        './HSE project/Optimisation data/combined/Clinical ABC/all clinical/',
        './HSE project/Optimisation data/combined/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/combined/Biomarkers B + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/combined/Biomarkers C + Clinical/all biomarkers and clinical/'
        ]

# list of paths of top 10
paths_top = [
        './HSE project/Optimisation data/combined/Biomarkers A/biomarkers top features/',
        './HSE project/Optimisation data/combined/Biomarkers B/biomarkers top features/',
        './HSE project/Optimisation data/combined/Biomarkers C/biomarkers top features/',
        './HSE project/Optimisation data/combined/Clinical ABC/top features/',
        './HSE project/Optimisation data/combined/Biomarkers A + Clinical/top features/',
        './HSE project/Optimisation data/combined/Biomarkers B + Clinical/top features/',
        './HSE project/Optimisation data/combined/Biomarkers C + Clinical/top features/'
        ]

# create datframe for scores
datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])

# get dataframe with with scores of models from different datasets
for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) 
    table_top = pd.read_excel(f'{paths_top[i]}metrics_table.xlsx', header=[0,1], index_col=[0])
    datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) + \
                                    list(table_top.loc[:, ('Scores on the test set', metric)].values.round(3))
    if metric == 'F2':
        datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3)) + list(table_top.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3))
        datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2, train set, cv=5', 'std')].values.round(3)) + list(table_top.loc[:, ('F2, train set, cv=5', 'std')].values.round(3))


# list of models
# standart models
models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost'] + ['RandomForest top 10', 'SVM top 10', 'Logistic Regression top 10', 'KNN top 10', 'CatBoost top 10']


# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# add error whiskers from gridsearchCV
if metric == 'F2':
    fig.add_traces([go.Box(name=column, x=models, 
                           y=datasets_mean[column], 
                          #  xaxis="x1",  
                           
                           marker=dict(color="black"), 
                           showlegend = False) for column in datasets.columns])
    fig.update_traces(
    selector=dict(type="box"), # update only boxes
    boxpoints="all", # show points
    pointpos=0, # centered
    jitter=0, # no jitter
    line_color="rgba(255,255,255,0)", # hide box lines
    fillcolor="rgba(255,255,255,0)", # hide box fill
    
    )
    fig.update_layout(boxmode="group",)

    fig.add_traces([go.Bar(name=column, x=models, 
                           y=datasets_mean[column], 
                           xaxis="x2",  
                           error_y=dict(type='data',  
                                        array=datasets_std[column], 
                                        color="rgba(0,0,0,1)",
                                        thickness=1), 
                           marker=dict(opacity=0,
                                      #  color="rgba(255,255,255,0)"
                                       ), 
                           showlegend = False) for column in datasets.columns])  
    
# Change the bar mode
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Score', range=[0., 1.0])
fig.update_layout(barmode='group', 
                  xaxis2={"overlaying": "x", "range": [-0.525, 9.525], "showticklabels": False},
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Datasets'), 
                  title=dict(text=f'{metric} score', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
if metric == 'ROC_AUC':
    fig.add_shape(type='line',
                    x0=-0.5,
                    y0=0.5,
                    x1=9.5,
                    y1=0.5,
                    line=dict(color='firebrick',  width=2, dash='dot'),
                    xref='x',
                    yref='y')   
# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,) 
fig.show(renderer='colab')
fig.write_image(f"{results_path}top_{metric}.pdf", engine="kaleido")

### Feature selection

In [None]:
# features = pd.DataFrame(columns=['features', 
#                                  'biomarkers A',
#                                 #  'biomarkers B',
#                                 #  'biomarkers C',
#                                  'Clinical+biomarkers ABC',
#                                  'Clinical+biomarkers A',
#                                 #  'Clinical+biomarkers B',
#                                 #  'Clinical+biomarkers C',
#                                  ])
# # list of paths
# paths = [
#         './HSE project/Optimisation data/revascularization/Biomarkers A/all biomarkers/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers B/all biomarkers/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers C/all biomarkers/',
#         './HSE project/Optimisation data/revascularization/Clinical ABC/all clinical/',
#         './HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/'
#         ]

# # get dataframe with with scores of models from different datasets
# top_features = []
# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     if i < 1: 
#         top_features = top_features+list(str(col) for col in table.index[:10])
#     else:    
#         top_features = top_features+list(eval(col)[1] for col in table.index[:10])

# features['features'] = list(set(top_features))
# features.index = list(set(top_features))
# features.fillna(0, inplace=True)

# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     if i < 1: 
#         features.loc[list(str(col) for col in table.index[:10]), features.columns[i+1]] = 1
#     else:    
#         # top_features = top_features+list(eval(col) for col in table.index[:10])   
#         features.loc[list(eval(col)[1] for col in table.index[:10]), features.columns[i+1]] = 1 

# features['features'] = features.iloc[:,1:].apply((lambda x: x.sum()), axis=1)
# features.sort_values(ascending=False,  inplace=True, by=("features"))
# features.columns = ['sum'] + list(features.columns[1:])
# features.to_excel('./HSE project/Optimisation data/revascularization/feature_selection.xlsx')
# features

# References

- [Guidelines and quality criteria for artificial intelligence-based prediction models in healthcare: a scoping review](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8748878/pdf/41746_2021_Article_549.pdf)

- ✅ [Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone](https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5#citeas)

- ✅ [Machine learning-based prediction of adverse events
following an acute coronary syndrome (PRAISE): a modelling
study of pooled datasets](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(20)32519-8/fulltext)

- ✅ [Critical appraisal of artificial intelligence-based prediction models for cardiovascular disease](https://watermark.silverchair.com/ehac238.pdf?token=AQECAHi208BE49Ooan9kkhW_Ercy7Dm3ZL_9Cf3qfKAc485ysgAAAtQwggLQBgkqhkiG9w0BBwagggLBMIICvQIBADCCArYGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMSvd0U0E66pd-sc_vAgEQgIICh_0OOn38okwwjvtHKZZRS6iesoJ0VuLm_qXiAJTeVb_83xAfB7oneCMsGdP7SkYUZPlcO3UtImKRROpfFzoAH87-TvQu04QMni8-YL47A9k13em0EMsLU86rv0fjaSmxgG-hPnAe7eRJaEDf1ckm-YBNx65aPTx1UC8yW3YO0gDra3ROrfsyl2UariiUse8hZ5S-I2WvFx0gic__qBLni02hEetj0dt-mInD7DxKqGuk28AuNOCDlF9Q1Tfj7oSyk6_1aNHJJ9XklpOJgzsKn-j4yusaYkapojnZzcNzBGcx6tTWYDn-YFcevxsYSc_uKlSUl40oTPl5Gwp-gAyxaLx9bFRuCDA6bxfPsNjgLQR0Eo4QxBuMD5h8FR6H6hEkZ1heaEpiWvZHqwTbEMddl1L1EgD2w-L-ng1YHbegVuZLa-Noll9OWfYSsVZf330LvUYMnTSu3FxrJ72voWUNhS3xzpTvkaeTqIkQgRU5Q75TfoKpMWfefufVgDshQhRM0ww1qRImd34Faql0RyBAKOPXG_HaucEkyXb60GCd6-0yjP5Mjbq-TML0Y9pnKIvmf9wXcTw-DJTcMT97fzWbp_psY70J02wEjvHPxfkOyEl9TiA08sI24GqKHAZuSU_M5R2dGN5W7qGuN_A-TbFKvO3FyMDOgV89BtJXHk8wVYpR-f2uppZydQydht_KTHlkV8hbYf0StZGbCXLb-fk38yZ6rerF9dTXfT6PtrYdlBYrVW65ZRn1HbxhoA0LBI0f5z8gpiqQjnyxSzrX-e9FYtOfOPu-i-IfGTLMFELowQ3IXkTup2Ee1dvT0sosTfoC5Q6x6d8nubiZFtw_SLYg21vF1XH2Gw9d)

- [Interpretation of machine learning predictions for patient outcomes in electronic health records](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7153071/pdf/3200408.pdf)
- [Minimum sample size for external validation of a clinical prediction model with a binary outcome](https://pubmed.ncbi.nlm.nih.gov/34031906/)
- [Machine learning of clinical variables and coronary artery calcium scoring for the prediction of obstructive coronary artery disease on coronary computed tomography angiography: analysis from the CONFIRM registry](https://pubmed.ncbi.nlm.nih.gov/31513271/)
- [Reflection on modern methods: when worlds collide-prediction, machine learning and causal inference](https://pubmed.ncbi.nlm.nih.gov/31298274/)
- General Cardiovascular Risk Profile for Use in Primary Care

### Feature importance

- ✅ [Feature Importance May Be Lying To You](https://towardsdatascience.com/feature-importance-may-be-lying-to-you-3247cafa7ee7)
- [Different Measures of Feature Importance Behave Differently](https://hippocampus-garden.com/feature_importance/)
- [Explaining Feature Importance by example of a Random Forest](https://towardsdatascience.com/explaining-feature-importance-by-example-of-a-random-forest-d9166011959e)
- [Interpret Logistic Regression Coefficients (For Beginners)](https://quantifyinghealth.com/interpret-logistic-regression-coefficients/)

- [FAQ: HOW DO I INTERPRET ODDS RATIOS IN LOGISTIC REGRESSION?](https://stats.oarc.ucla.edu/other/mult-pkg/faq/general/faq-how-do-i-interpret-odds-ratios-in-logistic-regression/#:~:text=A%20logistic%20regression%20model%20allows,relationship%20with%20the%20predictor%20variables.)

### Imputation
I decided to use kNN imputation b/c it was easy to implement using sklearn package and it was much better than other simpler imputers. However, there are also two types of complex imputers that might be reasonable to use (MICE and datawig)
- [6 Different Ways to Compensate for Missing Values In a Dataset (Data Imputation with examples)](https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779)
- [sklearn](https://scikit-learn.org/stable/modules/impute.html)
- [kNN Imputation for Missing Values in Machine Learning](https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/)


### Understanding model predictions


In paper "Critical appraisal of artificial intelligence-based prediction models for cardiovascular disease" it was mentioned the use of LIME and SHAP
  
##### LIME 
- [“Why Should I Trust You?” Explaining the Predictions of Any Classifier - paper about LIME](https://arxiv.org/pdf/1602.04938.pdf)

- [Understanding model predictions with LIME](https://towardsdatascience.com/understanding-model-predictions-with-lime-a582fdff3a3b)
- [Understanding how LIME explains predictions](https://towardsdatascience.com/understanding-how-lime-explains-predictions-d404e5d1829c)
- ✅ [How to explain ML models and feature importance with LIME?](https://analyticsindiamag.com/how-to-explain-ml-models-and-feature-importance-with-lime/)
- ✅ [Local Interpretable Model-Agnostic Explanations (LIME): An Introduction](https://www.oreilly.com/content/introduction-to-local-interpretable-model-agnostic-explanations-lime/)
- ✅ [Explanations (LIME)](https://ema.drwhy.ai/LIME.html)  
##### SHAP   
- ✅ [SHAP Values Explained Exactly How You Wished Someone Explained to You](https://towardsdatascience.com/shap-explained-the-way-i-wish-someone-explained-it-to-me-ab81cc69ef30)
-[Using SHAP Values to Explain How Your Machine Learning Model Works](https://towardsdatascience.com/using-shap-values-to-explain-how-your-machine-learning-model-works-732b3f40e137)
-[I have to find out what approximations they use to calculate Shapley values for all features, considering $2^n$ complexity]()
-[How to define fairness to detect and prevent discriminatory outcomes in Machine Learning](https://towardsdatascience.com/how-to-define-fairness-to-detect-and-prevent-discriminatory-outcomes-in-machine-learning-ef23fd408ef2)

### Fairness
- ✅ [A Tutorial on Fairness in Machine Learning](https://towardsdatascience.com/a-tutorial-on-fairness-in-machine-learning-3ff8ba1040cb)
- [sklego documentation](https://scikit-lego.readthedocs.io/en/latest/fairness.html)
- ✅ [Equality and fairness measures in classification models](https://www.auditingalgorithms.net/EqualityAndFairness.html)
- [Fairness Definitions Explained - должна быть понятная и полезная статья](http://fairware.cs.umass.edu/papers/Verma.pdf)
- [CS 294: Fairness in Machine Learning](https://fairmlclass.github.io/)
- []()

### Feature seletion
- ✅ [Deep-dive on ML techniques for feature selection in Python - Part 1](https://towardsdatascience.com/deep-dive-on-ml-techniques-for-feature-selection-in-python-part-1-3574269d5c69)
- ✅ [Deep-dive on ML techniques for feature selection in Python - Part 2](https://towardsdatascience.com/deep-dive-on-ml-techniques-for-feature-selection-in-python-part-2-c258f8a2ac43)
- ✅ [Deep-dive on ML techniques for feature selection in Python - Part 3](https://towardsdatascience.com/deep-dive-on-ml-techniques-for-feature-selection-in-python-part-3-de2a7593247f)
- ✅ [How to Choose a Feature Selection Method For Machine Learning](https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/)
- ✅ [Understanding ANOVA-F for feature selection in Python](https://datascience.stackexchange.com/questions/74465/how-to-understand-anova-f-for-feature-selection-in-python-sklearn-selectkbest-w#answer-74486)
- [sklearn: Feature selection](https://scikit-learn.org/stable/modules/feature_selection.html)
- ✅ [What are variable importance rankings useful for?](https://stats.stackexchange.com/questions/202277/what-are-variable-importance-rankings-useful-for#question-header)
- ✅ [feature importance is a slippery concept](https://stats.stackexchange.com/questions/202221/for-linear-classifiers-do-larger-coefficients-imply-more-important-features/202853#answer-202853)
- ✅ [Why lasso for feature selection?](https://stats.stackexchange.com/questions/367155/why-lasso-for-feature-selection#question-header)
- [Boruta SHAP: A Tool for Feature Selection Every Data Scientist Should Know](https://towardsdatascience.com/boruta-shap-an-amazing-tool-for-feature-selection-every-data-scientist-should-know-33a5f01285c0#:~:text=The%20idea%20of%20the%20Boruta,importance%20of%20the%20shadow%20features.)
- ✅ [Intuitions on L1 and L2 Regularisation](https://towardsdatascience.com/intuitions-on-l1-and-l2-regularisation-235f2db4c261)
- [L0 Norm, L1 Norm, L2 Norm & L-Infinity Norm](https://montjoile.medium.com/l0-norm-l1-norm-l2-norm-l-infinity-norm-7a7d18a4f40c#:~:text=L1%20Norm%20is%20the%20sum,the%20vector%20are%20weighted%20equally.)
- []()
- []()

### Advanced predictions
- [Ensemble methods: bagging, boosting and stacking](https://towardsdatascience.com/ensemble-methods-bagging-boosting-and-stacking-c9214a10a205)
- [sklearn: Ensemble methods](https://scikit-learn.org/stable/modules/ensemble.html)
- [A Deep Dive into Stacking Ensemble Machine Learning — Part I](https://towardsdatascience.com/a-deep-dive-into-stacking-ensemble-machine-learning-part-i-10476b2ade3)
- [Cтекинг (Stacking) и блендинг (Blending)](https://dyakonov.org/2017/03/10/c%D1%82%D0%B5%D0%BA%D0%B8%D0%BD%D0%B3-stacking-%D0%B8-%D0%B1%D0%BB%D0%B5%D0%BD%D0%B4%D0%B8%D0%BD%D0%B3-blending/)
- []()
- []()


### Clustering
- [Overview of Clustering Algorithms](https://towardsdatascience.com/overview-of-clustering-algorithms-27e979e3724d)
- []()
- []()