## Prerequisites

### Import libraries

In [1]:
# Import libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
from sklearn.model_selection import train_test_split
# from google.colab import output

from pandas import DatetimeIndex as dt
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
# from google.colab import files
import IPython
from IPython.display import HTML, display, clear_output 
# from google.colab import drive
import sys

# hyper-parameters optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# metrics
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as TP_rate                          
from sklearn.metrics import roc_auc_score as roc_auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score as recall
from sklearn.metrics import average_precision_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer,fbeta_score
from sklearn.model_selection import StratifiedKFold


# classifiers
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostRegressor, RandomForestRegressor, GradientBoostingClassifier, StackingClassifier, VotingClassifier #
from sklearn.tree import DecisionTreeClassifier     #
from sklearn.svm import SVC                                    # both linear and radial classification
from sklearn.neighbors import KNeighborsClassifier             # k=3
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import catboost
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingClassifier

# statistics
from scipy.stats import shapiro
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu

# imputations
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.base import clone

# feature selection
from sklearn.feature_selection import chi2, mutual_info_classif, f_classif, SelectKBest, RFE, RFECV, SequentialFeatureSelector
from scipy.stats import kendalltau, spearmanr
from sklearn.linear_model import Lasso
from sklearn.model_selection import StratifiedKFold
from mrmr import mrmr_classif

# to convert a string to a dict
import ast

# Interpretability
 # !pip install interpret
from interpret.blackbox import LimeTabular
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
set_visualize_provider(InlineProvider())
from interpret import show

import lime
import lime.lime_tabular
from __future__ import print_function

# ignore warnings when graphs are plotted
import warnings
warnings.filterwarnings('ignore')

### Data import

In [2]:
# Relative paths to the preprocessed train/test Excel workbooks.
# Every path lives under './HSE project/Preprocessed Data/' and is grouped below
# by the sub-folder it points into.
# NOTE(review): the a / b / c / abc suffixes presumably denote different feature
# subsets of the same cohort - confirm against the preprocessing notebook.

# 'lancet dataset' folder (only an 'abc' train/test pair)

link_train_lancet = './HSE project/Preprocessed Data/lancet dataset/train_abc_lancet.xlsx'
link_test_lancet  = './HSE project/Preprocessed Data/lancet dataset/test_abc_lancet.xlsx'

# 'cardiovascular death' folder
link_train_death_a   = './HSE project/Preprocessed Data/cardiovascular death/train_a.xlsx'
link_test_death_a    = './HSE project/Preprocessed Data/cardiovascular death/test_a.xlsx'
link_train_death_b   = './HSE project/Preprocessed Data/cardiovascular death/train_b.xlsx'
link_test_death_b    = './HSE project/Preprocessed Data/cardiovascular death/test_b.xlsx'
link_train_death_c   = './HSE project/Preprocessed Data/cardiovascular death/train_c.xlsx'
link_test_death_c    = './HSE project/Preprocessed Data/cardiovascular death/test_c.xlsx'
link_train_death_abc = './HSE project/Preprocessed Data/cardiovascular death/train_abc.xlsx'
link_test_death_abc  = './HSE project/Preprocessed Data/cardiovascular death/test_abc.xlsx'

# 'combined' folder
link_train_combined_a   = './HSE project/Preprocessed Data/combined/train_a.xlsx'
link_test_combined_a    = './HSE project/Preprocessed Data/combined/test_a.xlsx'
link_train_combined_b   = './HSE project/Preprocessed Data/combined/train_b.xlsx'
link_test_combined_b    = './HSE project/Preprocessed Data/combined/test_b.xlsx'
link_train_combined_c   = './HSE project/Preprocessed Data/combined/train_c.xlsx'
link_test_combined_c    = './HSE project/Preprocessed Data/combined/test_c.xlsx'
link_train_combined_abc = './HSE project/Preprocessed Data/combined/train_abc.xlsx'
link_test_combined_abc  = './HSE project/Preprocessed Data/combined/test_abc.xlsx'

# 'revascularization' folder
link_train_revascularization_a   = './HSE project/Preprocessed Data/revascularization/train_a.xlsx'
link_test_revascularization_a    = './HSE project/Preprocessed Data/revascularization/test_a.xlsx'
link_train_revascularization_b   = './HSE project/Preprocessed Data/revascularization/train_b.xlsx'
link_test_revascularization_b    = './HSE project/Preprocessed Data/revascularization/test_b.xlsx'
link_train_revascularization_c   = './HSE project/Preprocessed Data/revascularization/train_c.xlsx'
link_test_revascularization_c    = './HSE project/Preprocessed Data/revascularization/test_c.xlsx'
link_train_revascularization_abc = './HSE project/Preprocessed Data/revascularization/train_abc.xlsx'
link_test_revascularization_abc  = './HSE project/Preprocessed Data/revascularization/test_abc.xlsx'

### Tuning of hyper-parameters

#### Grids of hyper-parameters

In [3]:
# Hyper-parameter grids for the *_tuning functions.
# A one-element list pins the parameter to that value; a longer list is searched.

# 1. Logistic regression
parameters_LR_model = dict(
    C=[0.001, 0.01, 0.1, 1.],  # 1.0 is the scikit-learn default
    tol=[1.e-4],
    # NOTE(review): recent scikit-learn releases spell "no penalty" as None
    # instead of the string 'none' - confirm against the installed version.
    penalty=['l2', 'none'],  # 'elasticnet', 'l1',
    # n_jobs = [-1],
    dual=[False],
    fit_intercept=[False],
    # intercept_scaling =
    class_weight=['balanced', None],
    random_state=[10],
    # solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    max_iter=[10000],
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn - confirm.
    multi_class=['auto'],
    verbose=[0],
    warm_start=[True],
    # l1_ratio
)

# 2. Random Forest
parameters_random_forest_model = dict(
    n_estimators=[int(x) for x in np.linspace(start=50, stop=400, num=10)],
    criterion=['gini'],
    max_depth=[int(x) for x in np.linspace(2, 10, num=5)],
    min_samples_split=[2, 4],
    # min_samples_leaf = [1,2],
    min_weight_fraction_leaf=[0.0],
    max_features=['sqrt'],
    max_leaf_nodes=[None],
    min_impurity_decrease=[0.],
    bootstrap=[True],
    oob_score=[False],
    n_jobs=[-1],
    random_state=[10],
    verbose=[0],
    warm_start=[True],
    class_weight=['balanced', 'balanced_subsample', None],
    # ccp_alpha =
    max_samples=[None],  # maybe =0.1 here for getting almost independent samples for trees
)

# 3. k-NN
parameters_knn = dict(
    n_neighbors=[int(x) for x in np.linspace(start=1, stop=7, num=7)],
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    leaf_size=[15, 30, 60],
    p=[3],
    metric=['chebyshev', 'minkowski', 'euclidean', 'manhattan'],
    # metric_params =
    n_jobs=[-1],
)

# 4. SVM
parameters_svm = dict(
    C=[int(x) for x in np.linspace(start=1, stop=25, num=7)],
    kernel=['rbf', 'linear', 'poly', 'sigmoid'],
    degree=[3, 4, 5],
    gamma=['scale', 'auto'],
    coef0=[0.0],
    shrinking=[True, False],
    probability=[True],
    tol=[1.e-3],
    cache_size=[200],
    class_weight=['balanced', None],
    verbose=[False],
    # Must be an integer: SVC validates `max_iter` as an int, so the float
    # literal 1.e6 is rejected by scikit-learn's parameter validation.
    # (Idea: maybe use a smaller finite iteration cap, as in Logistic Regression.)
    max_iter=[1_000_000],
    # decision_function_shape = [],
    # break_ties = [],
    random_state=[10],
)

# 5. CatBoost
# Grid searched by CatBoostClassifier.grid_search.
catboost_parameters = {
    'depth': [4, 6, 8, 10],  # larger depth is preferable
    'learning_rate': [0.1, 0.2, 0.3],
    # 'l2_leaf_reg': [0,3,6,1],
}
# Fixed constructor arguments used when a tuned CatBoost model is rebuilt.
c_boost_params = {
    'eval_metric': 'F1',  # 'F1' my_f2_scorer, 'F'
    # 'beta' : 2,
    'verbose': False,
    'early_stopping_rounds': 100,
    # cat_features=cat_features,
    'task_type': "CPU",
    'iterations': 500,
    'random_seed': 10,
}

#### Tuning functions

In [4]:
def tuning(score, catboost_score, cross_validation, path, logistic_regression, knn, random_forest, svm, catboost):
    """Run the hyper-parameter search of every model whose flag is truthy.

    Parameters
    ----------
    score
        Scoring passed on to the scikit-learn based searches,
        e.g. my_f2_scorer(), 'f1', 'accuracy', 'precision', 'recall', 'roc_auc'.
    catboost_score
        Metric passed on to the CatBoost search, e.g. 'F1' or 'F:beta=2'.
    cross_validation
        Cross-validation setting forwarded unchanged to each search.
    path
        Output prefix forwarded unchanged to each search.
    logistic_regression, knn, random_forest, svm, catboost
        Switches; a truthy value runs the corresponding ``*_tuning`` function.
    """
    # (switch, deferred call) pairs in execution order. The lambdas postpone the
    # lookup of each *_tuning function until its switch is actually on.
    searches = (
        (logistic_regression, lambda: logistic_regression_tuning(score, cross_validation, path)),
        (knn, lambda: knn_tuning(score, cross_validation, path)),
        (random_forest, lambda: random_forest_tuning(score, cross_validation, path)),
        (svm, lambda: svm_tuning(score, cross_validation, path)),
        (catboost, lambda: catboost_tuning(catboost_score, cross_validation, path)),
    )
    for enabled, run_search in searches:
        if enabled:
            run_search()

In [5]:
def logistic_regression_tuning(score, cross_validation, path):

    # LogisticRegression: 
    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

    LR_model = LogisticRegression(random_state=10)

    # calibrate hyper-parameters: perform gridsearch with cross-validation
    clf = GridSearchCV(
                      estimator = LR_model, 
                      param_grid = parameters_LR_model,
                      scoring = score,    
                      #  refit = my_f2_scorer,
                      cv = cross_validation,
                      n_jobs = -1
                      )              
    %time clf.fit(X_train, y_train)
    LR_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(LR_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(LR_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(LR_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}LogisticRegression_optimisation.xlsx')

In [6]:
def knn_tuning(score, cross_validation, path):

    # KNeighborsClassifier: 
    # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

    knn_model = KNeighborsClassifier()

    # calibrate hyper-parameters: perform gridsearch with cross-validation = 5 
    clf = GridSearchCV(
                      estimator=knn_model, 
                      param_grid=parameters_knn,
                      scoring=score,
                      #  refit=my_f2_scorer,
                      cv=cross_validation,
                      n_jobs=-1
                      )              
    %time clf.fit(X_train, y_train)
    knn_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(knn_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(knn_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(knn_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}knn_optimisation.xlsx')

In [7]:
def random_forest_tuning(score, cross_validation, path):
      
    # RandomForestClassifier: 
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

    random_forest_model = RandomForestClassifier(random_state=10)

    # calibrate hyper-parameters: perform gridsearch with cross-validation = 5 
    clf = GridSearchCV(
                      estimator=random_forest_model, 
                      param_grid=parameters_random_forest_model,
                      scoring=score,  
                      #  refit=my_f2_scorer,
                      cv=cross_validation,
                      n_jobs=-1
                      )              
    %time clf.fit(X_train, y_train)
    random_forest_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(random_forest_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(random_forest_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(random_forest_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}randomforest_optimisation.xlsx')

In [8]:
def svm_tuning(score, cross_validation, path):
      
    # SVM_model
    # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC 

    SVM_model = SVC()

    # calibrate hyper-parameters: perform gridsearch with cross-validation = 5 
    clf = GridSearchCV(
                      estimator=SVM_model, 
                      param_grid=parameters_svm,
                      scoring=score,  
                      # refit=score[0],
                      cv=cross_validation,
                      n_jobs=-1
                      )              
    %time clf.fit(X_train, y_train)
    SVM_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(SVM_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(SVM_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(SVM_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}svm_optimisation.xlsx')
    # files.download("/content/svm_optimisation.xlsx")

In [9]:
def catboost_tuning(catboost_score, cross_validation, path):
    """Grid-search CatBoostClassifier and save the CV statistics to Excel.

    Runs ``CatBoostClassifier.grid_search`` over ``catboost_parameters`` on the
    notebook-level ``X_train`` / ``y_train``, then scores the classifier with a
    5-fold stratified ROC-AUC cross-validation and stores those scores in every
    row of the results table.

    Parameters
    ----------
    catboost_score
        Forwarded to CatBoostClassifier ``eval_metric``.
    cross_validation
        Forwarded to ``grid_search`` as ``cv``.
    path
        Prefix prepended to the output file name.

    Returns
    -------
    None
        The table is written to f'{path}catboost_optimisation.xlsx'.

    Side effects
    ------------
    Re-seeds NumPy's global random generator with 10.
    """
    # Catboost
    # tuning: https://catboost.ai/en/docs/concepts/parameter-tuning

    np.random.seed(10)
    # Named `catboost_model` so the imported `catboost` module is not shadowed.
    catboost_model = CatBoostClassifier(
        eval_metric=catboost_score,
        verbose=False,
        early_stopping_rounds=100,
        # cat_features=cat_features,
        task_type="CPU",
        iterations=500,
        random_seed=10)

    grid_res = catboost_model.grid_search(catboost_parameters,
                                          X_train,
                                          y_train,
                                          cv=cross_validation,
                                          search_by_train_test_split=True,
                                          calc_cv_statistics=True,
                                          refit=True,
                                          shuffle=True,
                                          partition_random_seed=10,
                                          verbose=True,
                                          stratified=True)

    # save optimisation parameters
    cv_results = pd.DataFrame(grid_res['cv_results'])
    # 'params' holds the best parameter dict (as a string) in the first three
    # rows and 0 elsewhere. The column is created with object dtype and written
    # through a single .iloc call: chained `frame[col][0:3] = ...` assignment
    # does not reach the frame under pandas copy-on-write.
    cv_results['params'] = pd.Series(0, index=cv_results.index, dtype=object)
    cv_results.iloc[0:3, cv_results.columns.get_loc('params')] = str(grid_res['params'])

    # Run the ROC-AUC cross-validation once and reuse the fold scores for the
    # raw / mean / std columns.
    roc_auc_folds = cross_val_score(catboost_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc')
    cv_results['roc_auc'] = str(roc_auc_folds)
    cv_results['roc_auc_mean'] = np.mean(roc_auc_folds)
    cv_results['roc_auc_std'] = np.std(roc_auc_folds)

    cv_results.to_excel(f'{path}catboost_optimisation.xlsx')

#### Function: Optimised table of metrics 

In [10]:
def optimised_metrics_table(model_name):
    """Build a one-row table of test-set metrics labelled with ``model_name``.

    Reads three notebook-level variables rather than taking them as arguments:
    ``y_test``, ``forecast`` and ``forecast_proba``.
    NOTE(review): ``forecast`` is presumably the predicted labels and
    ``forecast_proba`` the positive-class scores - confirm at the call site.

    Parameters
    ----------
    model_name
        Label used for the single row of the returned table.

    Returns
    -------
    pandas.DataFrame
        One row (``model_name``) with columns F1, F2, Accuracy, Precision,
        Recall, PR_AUC, ROC_AUC, TN, FP, FN, TP. Scores are rounded to three
        decimals.
    """
    precision, recall_, _thresholds = precision_recall_curve(y_test, forecast_proba)
    # Flattened confusion matrix, computed once: [TN, FP, FN, TP] for binary labels.
    cells = confusion_matrix(y_test, forecast).ravel()

    # Built-in round() works for NumPy scalars and plain Python floats alike,
    # whereas `.round(3)` exists only on the former.
    optimised_metrics = [
        # round(mcc(y_test, forecast), 3),               # MCC
        round(f1(y_test, forecast), 3),                  # F1
        round(f2_func(y_test, forecast), 3),             # F2
        round(accuracy(y_test, forecast), 3),            # Accuracy
        round(TP_rate(y_test, forecast), 3),             # Precision (TP_rate aliases precision_score)
        round(recall(y_test, forecast), 3),              # Recall
        round(auc(recall_, precision), 3),               # PR AUC (area under the PR curve)
        round(roc_auc(y_test, forecast_proba), 3),       # ROC AUC
        cells[0],                                        # number of true negatives
        cells[1],                                        # number of false positives
        cells[2],                                        # number of false negatives
        cells[3],                                        # number of true positives
    ]

    optimised_metrics = pd.DataFrame(optimised_metrics, columns=[model_name])
    # add row names
    optimised_metrics.index = [
        #  "MCC",
        "F1", "F2",
        "Accuracy",
        "Precision",
        "Recall",
        "PR_AUC",
        "ROC_AUC",
        "TN", "FP", "FN", "TP"
    ]
    # transpose so the model becomes the row and the metrics the columns
    optimised_metrics = optimised_metrics.T

    return optimised_metrics

#### Function metric_table - visualise model scores

In [11]:
def metric_table(path):  #, X_train=X_train, y_train=y_train
    """Summarise the tuned models: cross-validated train scores and test metrics.

    For Random Forest, SVM, Logistic Regression, KNN and CatBoost the function
    reads the ``*_optimisation.xlsx`` workbook stored under ``path``, rebuilds
    the model from the saved best parameters, fits it on the notebook-level
    ``X_train`` / ``y_train`` and evaluates it on ``X_test`` / ``y_test``.

    Parameters
    ----------
    path
        Prefix under which the optimisation workbooks were saved.

    Returns
    -------
    pandas.DataFrame
        One row per model with two-level columns:
        "F2, train set, cv=5" (mean, std), "Scores on the test set"
        (F1, F2, Accuracy, Precision, Recall, PR_AUC, ROC_AUC),
        "Confusion matrix" (TN, FP, FN, TP) and
        "ROC_AUC, train set, cv=5" (mean, std).
    """

    def best_row(table, columns):
        # First row ranked 1 by the grid search, restricted to `columns`.
        return table[table['rank_test_score'] == 1][columns].iloc[0]

    # (row label, estimator class, grid-search table); this order fixes the row
    # order of the result, with CatBoost appended last.
    grid_searched = [
        ("Random Forest", RandomForestClassifier,
         pd.read_excel(f'{path}randomforest_optimisation.xlsx', header=[0])),
        ("SVM", SVC,
         pd.read_excel(f'{path}svm_optimisation.xlsx', header=[0])),
        # ("Multi-layer Perceptron", MLPClassifier, nn_optimisation),
        ("Logistic Regression", LogisticRegression,
         pd.read_excel(f'{path}LogisticRegression_optimisation.xlsx', header=[0])),
        ("KNN", KNeighborsClassifier,
         pd.read_excel(f'{path}knn_optimisation.xlsx', header=[0])),
    ]
    catboost_optimisation = pd.read_excel(f'{path}catboost_optimisation.xlsx', header=[0])

    # Rebuild every model from its stored parameter dict (saved as a string).
    # `.iloc[0]` is used because the one-element Series is indexed by the label
    # 'params', so `[0]` would be a deprecated positional lookup.
    models = []
    for _, estimator_cls, table in grid_searched:
        best_params = ast.literal_eval(best_row(table, ["params"]).iloc[0])
        models.append(estimator_cls(**best_params))
    catboost_best_params = ast.literal_eval(catboost_optimisation['params'].iloc[0])
    models.append(CatBoostClassifier(**c_boost_params, **catboost_best_params))

    test_scores = {name: [] for name in
                   ("F1", "F2", "Accuracy", "Precision", "Recall", "PR_AUC", "ROC_AUC")}
    confusion_cells = {name: [] for name in ("TN", "FP", "FN", "TP")}

    for model in models:
        model.fit(X_train, y_train)
        forecast = model.predict(X_test)
        positive_proba = model.predict_proba(X_test)[:, 1]

        test_scores["F1"].append(f1(y_test, forecast))
        test_scores["F2"].append(f2_func(y_test, forecast))
        test_scores["Accuracy"].append(accuracy(y_test, forecast))
        test_scores["Precision"].append(TP_rate(y_test, forecast))      # tp / (tp + fp)
        test_scores["Recall"].append(recall(y_test, forecast))
        test_scores["PR_AUC"].append(average_precision_score(y_test, positive_proba))
        test_scores["ROC_AUC"].append(roc_auc(y_test, positive_proba))

        # Flattened confusion matrix, computed once per model: [TN, FP, FN, TP].
        cells = confusion_matrix(y_test, forecast).ravel()
        for position, name in enumerate(confusion_cells):
            confusion_cells[name].append(cells[position])

    # Create the table. The train-set F2 columns come first and are filled in
    # once the cross-validation scores have been collected below.
    metrics_table = pd.DataFrame(columns=pd.MultiIndex.from_product([["F2, train set, cv=5"], ["mean", 'std']]))
    for name, values in test_scores.items():
        metrics_table[("Scores on the test set", name)] = values
    for name, values in confusion_cells.items():
        metrics_table[("Confusion matrix", name)] = values

    # modify the row names
    metrics_table.index = [label for label, _, _ in grid_searched] + ["CatBoost"]

    # Cross-validated scores on the train set, taken from the saved tables.
    mean, std = [], []
    mean_roc_auc, std_roc_auc = [], []
    for _, _, table in grid_searched:
        mean_test_score, std_test_score = best_row(table, ["mean_test_score", "std_test_score"])
        mean.append(mean_test_score); std.append(std_test_score)
        mean_test_roc_auc, std_test_roc_auc = best_row(table, ["roc_auc_mean", "roc_auc_std"])
        mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)

    # CatBoost: statistics are read from the last row of its table.
    # NOTE(review): the 'test-F:beta=1-*' column names presumably depend on the
    # metric used in catboost_tuning and look like F1 rather than F2 - verify.
    mean_test_score, std_test_score = catboost_optimisation[['test-F:beta=1-mean', 'test-F:beta=1-std']].iloc[-1]
    mean.append(mean_test_score); std.append(std_test_score)
    mean_test_roc_auc, std_test_roc_auc = catboost_optimisation[["roc_auc_mean", "roc_auc_std"]].iloc[-1]
    mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)

    metrics_table[("F2, train set, cv=5", "mean")] = mean
    metrics_table[("F2, train set, cv=5", "std")] = std
    metrics_table[("ROC_AUC, train set, cv=5", "mean")] = mean_roc_auc
    metrics_table[("ROC_AUC, train set, cv=5", "std")] = std_roc_auc

    return metrics_table

#### Define $F_2$ metric

In [12]:
def f2_func(y_true, y_pred):
    """Return the F-beta score of ``y_pred`` against ``y_true`` with beta = 2."""
    return fbeta_score(y_true, y_pred, beta=2.)

def my_f2_scorer():
    """Return ``f2_func`` wrapped by ``make_scorer``."""
    f2_scorer = make_scorer(f2_func)
    return f2_scorer

### Feature selection

In [13]:
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# clf = RandomForestClassifier(max_depth=2, random_state=0)
# clf.fit(X_train, y_train)


# sfs1 = SFS(clf, 
#            k_features=10, 
#            forward=True, 
#            floating=False, 
#            verbose=2,
#            scoring='roc_auc',
#            cv=5,
#            n_jobs=-1)

# sfs1 = sfs1.fit(X_train, y_train)
# sfs1.subsets_

#### random_forest_importances

In [14]:
def random_forest_importances(path, n_features = 20, biomarkers=True, save=True):
    """Plot and return the relative feature importances of the tuned Random Forest.

    The best parameters are read from f'{path}randomforest_optimisation.xlsx',
    the forest is refitted on the notebook-level ``X_train`` / ``y_train`` and
    its importances are scaled so that the most important feature equals 1.

    Parameters
    ----------
    path
        Prefix under which the Random Forest optimisation workbook was saved.
    n_features
        Number of top features shown in the bar chart.
    biomarkers
        If truthy, the column labels of ``X_train`` are used as feature names;
        otherwise the element at position 1 of each column label is used.
    save
        If truthy, the figure is also written as a PDF under the notebook-level
        ``results_path`` (not defined in this function).

    Returns
    -------
    pandas.DataFrame
        A single 'RandomForest' column, indexed by feature name and sorted in
        descending order of relative importance.
    """
    randomforest_optimisation = pd.read_excel(f'{path}randomforest_optimisation.xlsx', header=[0])

    # Rebuild the best model. `.iloc[0]` is used because the one-element Series
    # is indexed by the label 'params', so `[0]` would be a deprecated
    # positional lookup.
    best = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
    params = ast.literal_eval(best.iloc[0])
    random_forest_model = RandomForestClassifier(**params)
    random_forest_model.fit(X_train, y_train)

    if biomarkers:
        feats = list(X_train.columns)
    else:
        feats = [column[1] for column in X_train.columns]

    feature_importances = pd.DataFrame({"RandomForest": pd.Series(random_forest_model.feature_importances_)})
    feature_importances.index = feats

    feature_importances = feature_importances.sort_values("RandomForest", ascending=False)
    # After the descending sort the first row is the largest importance; it is
    # selected by position because the index holds feature names.
    largest = feature_importances['RandomForest'].iloc[0]
    feature_importances['RandomForest'] = feature_importances['RandomForest'] / largest

    # Top `n_features`, reversed so the most important bar is drawn at the top.
    top = feature_importances['RandomForest'].iloc[:n_features][::-1]
    fig = px.bar(
        x='RandomForest',
        data_frame=top,
        y=top.index)
    # figure size
    fig.update_layout(
        autosize=False,
        width=1000,
        height=450,)
    fig.update_xaxes(title='Relative importance')
    fig.update_yaxes(title='')

    fig.update_layout(
        # xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False},
        # bargap=0.30,
        # bargroupgap=0.3,
        # legend=dict(orientation="v", title='Datasets'),
        title=dict(text='Feature importance', x=0.5,),
        margin=dict(l=60, r=20, t=60, b=40),)

    fig.show(renderer='colab')

    if save:
        # The third path component from the end names the output file.
        # NOTE(review): assumes `path` contains at least three '/'-separated parts.
        name = path.split("/")[-3]
        fig.write_image(f"{results_path}importance {name}.pdf", engine="kaleido")

    return feature_importances

#### Function: upload_models(x_data, y_data, path, model_list)

In [15]:
def _read_best_params(path, file_name, ranked=True):
    """Return the hyper-parameter dict stored in the ``params`` column of an optimisation workbook."""
    optimisation_table = pd.read_excel(f'{path}{file_name}', header=[0])
    if ranked:
        # keep only the configurations the search ranked first
        optimisation_table = optimisation_table[optimisation_table['rank_test_score'] == 1]
    # the dict is stored as text; literal_eval parses it without executing code
    return ast.literal_eval(optimisation_table['params'].iloc[0])


def upload_models(x_data, y_data, path, model_list):
    """Rebuild the selected classifiers from their saved best parameters and fit them.

    Parameters
    ----------
    x_data : feature matrix the models are fitted on.
    y_data : target values the models are fitted on.
    path : str
        Prefix (including the trailing separator) of the folder holding the
        ``*_optimisation.xlsx`` workbooks.
    model_list : dict
        Maps ``'SVM'``, ``'Logistic'``, ``'RandomForest'``, ``'KNN'`` and
        ``'Catboost'`` to a truthy/falsy flag; a missing key counts as False.

    Returns
    -------
    tuple(list, list)
        ``(models, model_names)``: the fitted estimators and their keys, always
        in the order SVM, Logistic, RandomForest, KNN, Catboost.
    """
    # (key in model_list, workbook, filter on rank_test_score == 1, constructor).
    # Constructors are wrapped in lambdas so a class is only looked up when its
    # model is actually requested.
    builders = (
        ('SVM', 'svm_optimisation.xlsx', True, lambda p: SVC(**p)),
        ('Logistic', 'LogisticRegression_optimisation.xlsx', True, lambda p: LogisticRegression(**p)),
        ('RandomForest', 'randomforest_optimisation.xlsx', True, lambda p: RandomForestClassifier(**p)),
        ('KNN', 'knn_optimisation.xlsx', True, lambda p: KNeighborsClassifier(**p)),
        # the CatBoost workbook is read from its first row, without the rank filter,
        # and the notebook-level c_boost_params are merged in
        # https://catboost.ai/en/docs/concepts/fstr
        ('Catboost', 'catboost_optimisation.xlsx', False,
         lambda p: CatBoostClassifier(**c_boost_params, **p)),
    )

    models = []
    model_names = []
    for key, file_name, ranked, build in builders:
        if not model_list.get(key, False):
            continue
        model = build(_read_best_params(path, file_name, ranked))
        # fit on the data handed in by the caller rather than on notebook globals
        model.fit(x_data, y_data)
        models.append(model)
        model_names.append(key)

    return models, model_names

#### Function: feature_selection(x_data, y_data, path)

In [16]:
# def feature_selection(x_data, y_data, path):
#     'Return dataset with ranged selected features'

#     # get list of all column names and continuous column names
#     '___________________________________________________________________________'
#     all_cols = list(x_data.columns)
#     continuous_cols = [col for col in x_data.columns if (len((x_data[col].unique())) >= 7)]
#     feature_selection_dataset = pd.DataFrame(columns=pd.MultiIndex.from_product([["LASSO"],["coef"]]))


#     # LASSO
#     '___________________________________________________________________________'
#     search = GridSearchCV(Lasso(),
#                           {'alpha':np.linspace(0.1, 1, num=10)**2}, #  np.linspace(0.1, 1, num=10)**2 np.arange(0.1,10,0.1)
#                           cv = 5, 
#                           scoring=my_f2_scorer(),  #"neg_mean_squared_error" my_f2_scorer() 'f1'
#                           verbose=0
#                           )

#     search.fit(X_train, y_train)
#     feature_selection_dataset['LASSO', 'coef'] = np.abs(search.best_estimator_.coef_)
#     # feature_selection_dataset['LASSO', 'coef'][feature_selection_dataset['LASSO', 'coef']>0] = 1
#     print("Calculated LASSO")


#     # get all trained models
#     '___________________________________________________________________________'
#     models, model_names = upload_models(x_data = x_data,
#                                         y_data = y_data,
#                                         path = path, 
#                                         model_list = {'SVM': True, 
#                                                       'Logistic': True, 
#                                                       'RandomForest': True, 
#                                                       'KNN': True, 
#                                                       'Catboost': True})


#     # Sequential feature selection
#     '___________________________________________________________________________'
    
#     sfs1 = SFS_xtend(knn, 
#                   k_features=20, 
#                   forward=True, 
#                   floating=True, 
#                   verbose=2,
#                   direction = 'forward',
#                   cv = StratifiedKFold(5),
#                   scoring=my_f2_scorer(),
#                   n_jobs=-1)

#     sfs1 = sfs1.fit(X, y)

#     for number in [0,1]:
#         sfs = SequentialFeatureSelector(estimator = models[number],
#                                         n_features_to_select=None,
#                                         cv = StratifiedKFold(5),
#                                         scoring = my_f2_scorer(), 
#                                         direction = 'backward',
#                                         n_jobs=-1
#                                         )
#         sfs.fit(X_train, y_train)
#         feature_selection_dataset['SFS', model_names[number]] = sfs.get_support()*1
#     print("Calculated SFS")
#     # # Recursive feature elimination with cross validation - selects features poorly for RandomForest, Catboost
#     # '___________________________________________________________________________'
#     # # models with feature importance do not have to perform SFS.    
#     # for number in [1,2,4]:
#     #     rfecv = RFECV(estimator = models[number],
#     #                                     # n_features_to_select=None,
#     #                                     cv = StratifiedKFold(5),
#     #                                     scoring = my_f2_scorer(),
#     #                                     n_jobs=-1
#     #                                     )
#     #     rfecv.fit(X_train, y_train)
#     #     feature_selection_dataset['RFECV', model_names[number]] = rfecv.get_support()*1
#     # print("Calculated RFECV")

#     # model importances
#     '___________________________________________________________________________'
#     feature_selection_dataset['Importances', 'RandomForest'] = models[2].feature_importances_
#     feature_selection_dataset['Importances', 'CatBoost'] = models[4].feature_importances_
#     feature_selection_dataset['Importances', 'Logistic'] = np.abs(models[1].coef_[0])

#     # # Drop-Column Importance
#     # '___________________________________________________________________________'
#     # """get score via Drop-Column Importance for models"""
#     # for number in range(5):
#     #     # clone the model to have the exact same specification as the one initially trained
#     #     model_clone = clone(models[number])
#     #     # set random_state for comparability
#     #     model_clone.random_state = 37
#     #     # training and scoring the benchmark model
#     #     model_clone.fit(X_train, y_train)

#     #     # benchmark_score = model_clone.score(X_train, y_train)
#     #     y_pred = model_clone.predict(X_test)
#     #     benchmark_score = f2_func(y_test, y_pred)

#     #     # list for storing feature importances
#     #     importances = []
        
#     #     # iterating over all columns and storing feature importance (difference between benchmark and new model)
#     #     for col in X_train.columns:
#     #         model_clone = clone(models[number])
#     #         model_clone.random_state = random_state
#     #         model_clone.fit(X_train.drop(col, axis = 1), y_train)
#     #         # drop_col_score = model_clone.score(X_train.drop(col, axis = 1), y_train)
#     #         y_pred = model_clone.predict(X_test.drop(col, axis = 1))
#     #         drop_col_score = f2_func(y_test, y_pred)
#     #         importances.append(benchmark_score - drop_col_score)
        
#     #     feature_selection_dataset[('Drop-Column Importance', model_names[number])] = importances
    

#     # Set column names as index
#     '___________________________________________________________________________'
#     feature_selection_dataset.index = all_cols

#     # # MRMR
#     # '___________________________________________________________________________'
#     # feature_selection_dataset[('MRMR', '')] = 0
#     # selected_features = mrmr_classif(X=X_train, y=y_train, K=40)
#     # feature_selection_dataset.loc[selected_features][('MRMR', '')] = 1


#     # Unsupervised selection with Pearson correlation coefs
#     '___________________________________________________________________________'
#     # correlation_matrix = dataset[continuous_cols].corr( method='pearson').abs()
#     # correlation_matrix  = pd.DataFrame(correlation_matrix)
#     # # iteratively remove features that have correlation > 0.95
#     # i=0
#     # j=0
#     # cols = correlation_matrix.shape[1]
#     # rows = correlation_matrix.shape[0]

#     # while i < cols: 
#     #     while j < rows:
#     #         if correlation_matrix.iloc[j,i]>0.95 and correlation_matrix.iloc[j,i]!=1:
#     #             correlation_matrix.drop(index=correlation_matrix.index[j], inplace=True)
#     #             correlation_matrix.drop(columns=correlation_matrix.columns[j], inplace=True)
#     #         else:
#     #             j+=1
#     #         rows = correlation_matrix.shape[0]
#     #     i+=1
#     #     j=0
#     #     cols = correlation_matrix.shape[1]

#     # removed_after_unsupervised = list(set(continuous_cols) - set(correlation_matrix.columns))


#     # Process data and download dataset
#     '___________________________________________________________________________'
#     # drop columns from unsupervised selection
#     # feature_selection_dataset.drop(index = removed_after_unsupervised, inplace=True)

#     # rank columns
#     feature_selection_dataset['sum'] = feature_selection_dataset.apply((lambda x: x.iloc[:6].sum()), axis=1)
#     feature_selection_dataset.sort_values('sum', inplace=True, ascending=True)

#     feature_selection_dataset.to_excel(f'{path}feature_selection_dataset.xlsx')

#     return feature_selection_dataset

# Lancet paper

Dataset ABC - all-cause death

##### Subset

In [None]:
# Read the "lancet" train and test workbooks (their locations come from the
# link_* variables). Column 0 is the row index, columns 1-14 are read as the
# features and column 15 as the target.
feature_columns = list(range(15))
target_columns = [0, 15]

X_train = pd.read_excel(link_train_lancet, header=[0], index_col=0, usecols=feature_columns)
y_train = pd.read_excel(link_train_lancet, header=[0], index_col=0, usecols=target_columns)
X_test = pd.read_excel(link_test_lancet, header=[0], index_col=0, usecols=feature_columns)
y_test = pd.read_excel(link_test_lancet, header=[0], index_col=0, usecols=target_columns)

# folders used by the following cells for optimisation tables and figures
optimisation_path = './HSE project/Optimisation data/lancet/ABC death/'
results_path = './HSE project/Graphics/lancet/'

# report the shape of each subset
for label, subset in (
    ('X_train shape:\t', X_train),
    ('y_train shape:\t', y_train),
    ('X_test shape:\t', X_test),
    ('y_test shape:\t', y_test),
):
    print(label, subset.shape)

X_train shape:	 (314, 14)
y_train shape:	 (314, 1)
X_test shape:	 (105, 14)
y_test shape:	 (105, 1)


##### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search for all five classifiers with the F2 scorer
# and 5-fold stratified cross-validation. tuning() receives optimisation_path;
# presumably it stores its result tables there, since later cells read
# *_optimisation.xlsx from that folder.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)

# wipe whatever the search printed from the cell output
clear_output()

##### Metrics tables

In [None]:
# Build the metrics table from the optimisation results under optimisation_path
# and save a copy next to them; the chart cell reads it back from that file.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# columns that get a colour gradient in the rendered table
highlighted_columns = [
    ("Scores on the test set", "F2"),
    ("F2, train set, cv=5", "mean"),
    ("ROC_AUC, train set, cv=5", "mean"),
    ("Scores on the test set", "F1"),
    ("Scores on the test set", "ROC_AUC"),
]

# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is its documented replacement.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=highlighted_columns)
    .format(precision=3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.586,0.108,0.571,0.6,0.743,0.529,0.621,0.673,0.815,60,16,11,18,0.732,0.085
SVM,0.67,0.053,0.469,0.497,0.676,0.429,0.517,0.575,0.733,56,20,14,15,0.748,0.04
Logistic Regression,0.648,0.051,0.588,0.645,0.733,0.513,0.69,0.72,0.839,57,19,9,20,0.762,0.048
KNN,0.431,0.06,0.588,0.543,0.8,0.682,0.517,0.486,0.713,69,7,14,15,0.609,0.044
CatBoost,0.4,0.087,0.478,0.414,0.771,0.647,0.379,0.636,0.78,70,6,18,11,0.674,0.042


In [None]:
# Test-set metrics to plot (second-level labels under 'Scores on the test set').
metric = ['F2','Precision','Recall','ROC_AUC']

table = pd.read_excel(f'{optimisation_path}metrics_table.xlsx', header=[0,1], index_col=[0])
test_scores = table.loc[:, ('Scores on the test set', metric)].values.round(3)
datasets = pd.DataFrame(test_scores, columns=['F2','Precision','Recall','ROC AUC'])

# x-axis labels; they are matched to the table rows by position only, so they
# are assumed to follow the row order of the metrics table
models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']

# one bar trace per metric, grouped by model
fig = go.Figure()
for column in datasets.columns:
    fig.add_trace(go.Bar(name=column, x=models, y=datasets[column]))

fig.update_xaxes(title='Models')
fig.update_yaxes(title='Metric values', range=[0., 1.0])

# dotted reference line at ROC AUC = 0.5, drawn below the bars
fig.add_shape(
    type='line',
    x0=-0.5, y0=0.5,
    x1=4.5, y1=0.5,
    line=dict(color='black',  width=2, dash='dot'),
    xref='x', yref='y',
    layer='below',
)

# bar grouping, legend, title and figure size in a single layout update
fig.update_layout(
    xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False},
    barmode='group',
    bargap=0.30,
    bargroupgap=0.3,
    legend=dict(orientation="v", title='Metrics', y=0.5),
    title=dict(text='Test metrics: "lancet" subset (ABC)', x=0.5,),
    margin=dict(l=60, r=20, t=60, b=40),
    autosize=False,
    width=1300,
    height=450,
)

fig.show(renderer='colab')

In [None]:
# Export the current figure as metrics.pdf under results_path (kaleido engine).
fig.write_image(f"{results_path}metrics.pdf", engine="kaleido")
# fig.write_image(f"{results_path}/metrics.jpeg", engine="kaleido")

##### Feature selection

In [None]:
# Random Forest feature importances
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

# hyper-parameters of the best-ranked random forest found by the search
randomforest_optimisation = pd.read_excel(f'{optimisation_path}randomforest_optimisation.xlsx', header=[0])
best_rows = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1]
# .iloc[0]: explicit positional access instead of the deprecated integer
# fallback on a label-indexed Series
params = ast.literal_eval(best_rows['params'].iloc[0])

random_forest_model = RandomForestClassifier(**params)
random_forest_model.fit(X_train, y_train)

# The column headers are strings that evaluate to indexable literals
# (presumably "(group, name)" tuples); element 1 is used as the row label.
# ast.literal_eval replaces eval so text read from the workbook is only
# parsed as a literal and never executed.
feats = [ast.literal_eval(column)[1] for column in X_train.columns]
feature_importances = pd.DataFrame(
    {"RandomForest": random_forest_model.feature_importances_},
    index=feats,
)

# sort descending, then divide by the top value so the strongest feature equals 1
feature_importances = feature_importances.sort_values("RandomForest", ascending=False)
feature_importances['RandomForest'] = feature_importances['RandomForest']/feature_importances['RandomForest'].iloc[0]

# Styler.set_precision was removed in pandas 2.0; format(precision=...) replaces it
(
    feature_importances.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=["RandomForest"])
    .format(precision=3)
)

Unnamed: 0,RandomForest
"Гемоглобин, г/л",1.0
ФВ ЛЖ,0.72
Возраст,0.658
СКФ EPI,0.602
МФА,0.562
"Хсобщ, ммоль/л",0.274
ГБ,0.143
СД,0.113
пост-ИМ,0.103
пост-ОНМК,0.081


In [None]:
# Feature importances taken from the reference paper, keyed by feature name.
# https://ars.els-cdn.com/content/image/1-s2.0-S0140673620325198-mmc1.pdf
paper_importances = {
    'Возраст': 0.77,
    'СКФ EPI': 0.49,
    'ФВ ЛЖ': 1,
    'МФА': 0.22,
    'Хсобщ, ммоль/л': 0.1,
    'Гемоглобин, г/л': 0.7,
    'пост-ИМ': 0.1,
    'пост-стент': 0.22,
    'ГБ': 0.09,
    'СД': 0.24,
    'пост-ОНМК': 0.09,
    'Cегмент ST': 0.12,
    'Пол': 0.11,
    'пост-ВЧ-кровоизлияние': 0.28,
}

# single-column frame indexed by feature name, ready to be joined to the experiment's table
data_from_paper = pd.DataFrame(
    {'Paper': list(paper_importances.values())},
    index=list(paper_importances.keys()),
)

In [None]:
# Put the experiment's importances and the paper's values side by side;
# concat with axis=1 aligns the two frames on their feature-name index.
result = pd.concat([feature_importances, data_from_paper], axis=1) 
result

Unnamed: 0,RandomForest,Paper
"Гемоглобин, г/л",1.0,0.7
ФВ ЛЖ,0.720207,1.0
Возраст,0.657958,0.77
СКФ EPI,0.601937,0.49
МФА,0.561925,0.22
"Хсобщ, ммоль/л",0.273933,0.1
ГБ,0.143326,0.09
СД,0.112582,0.24
пост-ИМ,0.102661,0.1
пост-ОНМК,0.080781,0.09


In [None]:

# Rescale so the first row equals 1. .iloc[0] selects by position; plain [0]
# on this name-indexed Series relies on a deprecated integer fallback.
result['RandomForest'] = result['RandomForest']/result['RandomForest'].iloc[0]

# Styler.set_precision was removed in pandas 2.0; format(precision=...) replaces it
(
    result.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=["RandomForest", "Paper"])
    .format(precision=3)
)

Unnamed: 0,RandomForest,Paper
"Гемоглобин, г/л",1.0,0.7
ФВ ЛЖ,0.72,1.0
Возраст,0.658,0.77
СКФ EPI,0.602,0.49
МФА,0.562,0.22
"Хсобщ, ммоль/л",0.274,0.1
ГБ,0.143,0.09
СД,0.113,0.24
пост-ИМ,0.103,0.1
пост-ОНМК,0.081,0.09


In [None]:
# legend labels for the two sources being compared
result.columns = ['RandomForest', 'Published paper']

features = result.index

# one bar trace per source, grouped by feature
fig = go.Figure()
for source in result.columns:
    fig.add_trace(go.Bar(name=source, x=list(result.index), y=result[source]))

fig.update_xaxes(title='Features')
fig.update_yaxes(title='Relative importance', range=[0., 1.0])

# bar grouping, legend, title and figure size in a single layout update
fig.update_layout(
    xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False},
    barmode='group',
    bargap=0.30,
    bargroupgap=0.3,
    legend=dict(orientation="v", title='Data', y=0.5),
    title=dict(text='Feature importance: comparison with lancet paper', x=0.5,),
    margin=dict(l=60, r=20, t=60, b=40),
    autosize=False,
    width=1300,
    height=450,
)

fig.show(renderer='colab')

In [None]:
# Export the current figure as importance.pdf under results_path (kaleido engine).
fig.write_image(f"{results_path}importance.pdf", engine="kaleido")
# fig.write_image(f"{results_path}importance.jpeg", engine="kaleido")

##### Top features

# Canadian paper

Dataset ABC

In [None]:
# Folder prefix used by the write_image cells of this section when saving figures.
results_path =      './HSE project/Graphics/canadian/'

##### Subset

Features from [paper](https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5)

In [None]:
# Columns of our dataset chosen to mirror the features of the reference table.
# Each entry is a (column group, column name) pair; the numbers in the comments
# are the author's numbering of the reference features.

canadian_features = [
    # 1. Target (kept separately in canadian_target, not part of the feature list)
      # - in reference: "if the patient died or survived before the end of the follow-up period, that was 130 days on average"
      # - in our case: heart-disease death, follow-up period 4-155 months
    # ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть'),
    # 2. Serum creatinine, mg/dL in the reference (our column is labelled in мкмоль/л)
    ('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Креатинин, мкмоль/л'),
    # 3. Ejection fraction
    ('ИСХОДНАЯ ЭХОКГ', 'ФВ ЛЖ'),
    # 4. Age
    ('АНТРОПОФИЗИОМЕТРИЯ', 'Возраст'),
    # 7. Sex
    ('АНТРОПОФИЗИОМЕТРИЯ', 'Пол'),
    # 8. Anaemia
    ('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Анемия, степень'),
    # 9. High blood pressure
    ('АНТРОПОФИЗИОМЕТРИЯ', 'систол. АД'),
    # 10. Smoking (not selected)
    # ('ПСИХОСОЦИАЛЬНЫЕ ФАКТОРЫ','Курение')
    # 12. Diabetes
    ('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'СД'),
    # (),
    # Reference features with no column selected here:
    # 5. Creatinine phosphokinase
    # data_12_columns['Creatinine phosphokinase'] = 
    # 6. Serum sodium
    # data_12_columns['Serum sodium']
    # 8. Platelets
    # data_12_columns['Platelets'] = 
]

# Reduced subset: serum creatinine and ejection fraction only.
canadian_2_features = [('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Креатинин, мкмоль/л'), ('ИСХОДНАЯ ЭХОКГ', 'ФВ ЛЖ')]

# Target column (see item 1 above).
canadian_target = ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть')

In [None]:
# Read the ABC train and test workbooks (their locations come from the link_*
# variables). Column 0 is the row index, columns 1-56 are read as the features
# and column 57 as the target.
X_train = pd.read_excel(link_train_death_abc, header=[0], index_col=0, usecols=list(range(57)))
y_train = pd.read_excel(link_train_death_abc, header=[0], index_col=0, usecols=[0, 57])
X_test =  pd.read_excel(link_test_death_abc,  header=[0], index_col=0, usecols=list(range(57)))
y_test =  pd.read_excel(link_test_death_abc,  header=[0], index_col=0, usecols=[0, 57])

# The feature headers are stored as text and are turned back into the
# (group, name) pairs that canadian_features selects on. ast.literal_eval
# replaces eval here: it only accepts Python literals, so header text coming
# from the workbook is parsed but never executed.
X_train.columns = [ast.literal_eval(col) for col in X_train.columns]
X_test.columns =  [ast.literal_eval(col) for col in X_test.columns]

# keep only the features shared with the reference paper
X_train = X_train[canadian_features]
X_test =  X_test[canadian_features]

# folder for the optimisation tables of this feature set
optimisation_path = './HSE project/Optimisation data/canadian/ABC dataset/8 features/'

# report the shape of each subset
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (309, 7)
y_train shape:	 (309, 1)
X_test shape:	 (104, 7)
y_test shape:	 (104, 1)


##### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search for all five classifiers with the F2 scorer
# and 5-fold stratified cross-validation, passing optimisation_path as the
# target folder argument.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)

# wipe whatever the search printed from the cell output
clear_output()

##### Metrics tables

In [None]:
# Build the metrics table from the optimisation results under optimisation_path
# and save a copy next to them; the chart cell reads it back from that file.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# columns that get a colour gradient in the rendered table
highlighted_columns = [
    ("Scores on the test set", "F2"),
    ("F2, train set, cv=5", "mean"),
    ("Scores on the test set", "F1"),
    ("Scores on the test set", "ROC_AUC"),
]

# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is its documented replacement.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=highlighted_columns)
    .format(precision=3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.576,0.154,0.545,0.564,0.76,0.517,0.577,0.578,0.777,64,14,11,15,0.731,0.083
SVM,0.551,0.095,0.491,0.519,0.721,0.452,0.538,0.468,0.758,61,17,12,14,0.716,0.07
Logistic Regression,0.652,0.03,0.59,0.737,0.692,0.442,0.885,0.476,0.762,49,29,3,23,0.695,0.067
KNN,0.365,0.064,0.353,0.349,0.683,0.36,0.346,0.288,0.571,62,16,17,9,0.577,0.024
CatBoost,0.319,0.076,0.578,0.528,0.817,0.684,0.5,0.592,0.767,72,6,13,13,0.655,0.066


In [None]:
# Test-set metrics to plot (second-level labels under 'Scores on the test set').
metric = ['F2','Precision','Recall','ROC_AUC']

# standard models are read from metrics_table.xlsx
# (the original note names advanced_models_metrics as the table for the ensemble models)
table = pd.read_excel(f'{optimisation_path}metrics_table.xlsx', header=[0,1], index_col=[0])
datasets = pd.DataFrame(table.loc[:, ('Scores on the test set', metric)].values.round(3),columns=['F2','Precision','Recall','ROC AUC'])

# x-axis labels for the standard models; they are matched to the table rows by
# position only, so they are assumed to follow the row order of the metrics table
models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
# ensemble models
# models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

# one bar trace per metric, grouped by model
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# NOTE: the cross-validation whisker overlay that used to follow was guarded by
# `metric == 'F2'`. `metric` is a list, so the guard was never true, and its body
# used datasets_mean / datasets_std, which this cell never defines. The
# unreachable branch has been removed; the rendered figure is unchanged.

fig.update_xaxes(title='Models')
fig.update_yaxes(title='Metric values', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group',
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Metrics', y=0.5),
                  title=dict(text='Test metrics: "canadian" subset (ABC)', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# dotted reference line at ROC AUC = 0.5, drawn below the bars
fig.add_shape(type='line',
              x0=-0.5,
              y0=0.5,
              x1=4.5,
              y1=0.5,
              line=dict(color='black',  width=2, dash='dot'),
              xref='x',
              yref='y',
              layer='below')

# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,)

fig.show(renderer='colab')

In [None]:
# Export the current figure as metrics.pdf under results_path (kaleido engine).
fig.write_image(f"{results_path}metrics.pdf", engine="kaleido")
# fig.write_image(f"{results_path}metrics.jpeg", engine="kaleido")

##### Feature selection

In [None]:
# Random forest feature importances for the experiment ("Эксперимент" column)
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

# hyper-parameters of the best-ranked random forest found by the search
randomforest_optimisation = pd.read_excel(f'{optimisation_path}randomforest_optimisation.xlsx', header=[0])
best_rows = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1]
# .iloc[0]: explicit positional access instead of the deprecated integer
# fallback on a label-indexed Series
params = ast.literal_eval(best_rows['params'].iloc[0])

random_forest_model = RandomForestClassifier(**params)
random_forest_model.fit(X_train, y_train)

# element 1 of each column label is used as the row label
feats = [column[1] for column in X_train.columns]
feature_importances = pd.DataFrame(
    {"Эксперимент": random_forest_model.feature_importances_},
    index=feats,
)

# Normalise once, after sorting, so the strongest feature equals 1. The earlier
# extra division by the first feature's importance was redundant and produced
# inf/NaN whenever that importance was 0.
feature_importances = feature_importances.sort_values("Эксперимент", ascending=False)
feature_importances['Эксперимент'] = feature_importances['Эксперимент']/feature_importances['Эксперимент'].iloc[0]

# Styler.set_precision was removed in pandas 2.0; format(precision=...) replaces it
(
    feature_importances.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=["Эксперимент"], axis=0)
    .format(precision=3)
)

Unnamed: 0,Эксперимент
"Анемия, степень",1.0
ФВ ЛЖ,0.834
"Креатинин, мкмоль/л",0.554
Возраст,0.415
систол. АД,0.335
СД,0.143
Пол,0.012


In [None]:
# Feature scores taken from the paper, in the order of paper_features below.
paper_scores = pd.Series([11.84,  10.71, 8.58,  1.06, 1.13, 1.02, 1.12])
paper_features = ['Креатинин, мкмоль/л', 'ФВ ЛЖ', 'Возраст', 'Анемия, степень',   'систол. АД', 'СД', 'Пол']

# express every score relative to the first entry, then label the rows by feature name
data_from_paper = pd.DataFrame({'Статья': paper_scores / paper_scores.iloc[0]})
data_from_paper.index = paper_features

In [None]:
# Append the paper's scores as a second column; concat with axis=1 aligns the
# two frames on their feature-name index.
feature_importances = pd.concat([feature_importances, data_from_paper], axis=1)

# Styler.set_precision was removed in pandas 2.0; format(precision=...) replaces it
(
    feature_importances.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=["Эксперимент", "Статья"], axis=0)
    .format(precision=3)
)

Unnamed: 0,Эксперимент,Статья
"Анемия, степень",1.0,0.09
ФВ ЛЖ,0.834,0.905
"Креатинин, мкмоль/л",0.554,1.0
Возраст,0.415,0.725
систол. АД,0.335,0.095
СД,0.143,0.086
Пол,0.012,0.095


In [None]:
# legend labels for the two sources being compared
feature_importances.columns = ['RandomForest', 'Published paper']

# NOTE: the former helper variables were dropped. `feats` took element [1] of
# every index entry, but the index holds feature-name strings, so it collected
# their second characters; `feats`, `features` and the `metric` overwrite were
# not used by the chart.

# one bar trace per source, grouped by feature
fig = go.Figure(data=[go.Bar(name=source, x=list(feature_importances.index),
                             y=feature_importances[source]) for source in feature_importances.columns ])

fig.update_xaxes(title='Features')
fig.update_yaxes(title='Relative importance', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group',
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Data', y=0.5),
                  title=dict(text='Feature importance: comparison with canadian paper', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,)

fig.show(renderer='colab')

In [None]:
# Export the current figure as a PDF under `results_path`, rendered with the kaleido engine.
importance_pdf = f"{results_path}importance.pdf"
fig.write_image(importance_pdf, engine="kaleido")
# JPEG alternative: fig.write_image(f"{results_path}importance.jpeg", engine="kaleido")

##### Two features

##### Subset

In [None]:
# Restrict both subsets to the columns listed in `canadian_2_features`.
X_train, X_test = X_train[canadian_2_features], X_test[canadian_2_features]

# create path for saving results
optimisation_path = './HSE project/Optimisation data/canadian/ABC dataset/2 features/'

# print subsets parameters
for caption, subset in (
    ('X_train shape:\t', X_train),
    ('y_train shape:\t', y_train),
    ('X_test shape:\t', X_test),
    ('y_test shape:\t', y_test),
):
    print(caption, subset.shape)

X_train shape:	 (309, 2)
y_train shape:	 (309, 1)
X_test shape:	 (104, 2)
y_test shape:	 (104, 1)


In [None]:
# Scatter of the two selected training features against each other
# (x: 'Креатинин, мкмоль/л', y: 'ФВ ЛЖ'); every marker is coloured and
# hover-labelled with the value of the outcome column of y_train.
outcome = y_train["('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть')"]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=X_train[('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Креатинин, мкмоль/л')],
        y=X_train[('ИСХОДНАЯ ЭХОКГ', 'ФВ ЛЖ')],
        mode='markers',
        marker=dict(color=outcome, size=10),
        text=outcome,
    )
)

# Single layout pass. The height is 450 because that is the value the original cell
# ended with (its final update_layout overrode the earlier height=800).
fig.update_layout(
    autosize=False,
    width=1000,
    height=450,
    font_family="'Nunito', sans-serif",
    margin=dict(l=40, r=10, t=60, b=60),
    xaxis_range=[-5, 10],
)
fig.show(renderer='colab')

##### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score and a 5-fold StratifiedKFold splitter.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

##### Metrics tables

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.418,0.079,0.346,0.346,0.673,0.346,0.346,0.365,0.571,61,17,17,9,0.617,0.073
SVM,0.443,0.083,0.4,0.414,0.683,0.379,0.423,0.374,0.629,60,18,15,11,0.625,0.059
Logistic Regression,0.582,0.066,0.395,0.487,0.558,0.3,0.577,0.335,0.592,43,35,11,15,0.624,0.054
KNN,0.314,0.124,0.296,0.303,0.635,0.286,0.308,0.261,0.526,58,20,18,8,0.539,0.058
CatBoost,0.313,0.045,0.24,0.234,0.635,0.25,0.231,0.248,0.464,60,18,20,6,0.566,0.043


# **Target**: Death from heart disease

In [99]:
# Two-part label of the outcome column studied in this part of the notebook.
target_column = (
    "КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ",
    "Сердечно-сосудистая смерть",
)
# Directory prefix under which figures for this target are written.
results_path = "./HSE project/Graphics/cardiovascular death/"

## Biomarkers A

### Subset
### Split into train and test

In [None]:
# download Dataset from Github repo and read as excel file

# Each workbook is read once (the original cell downloaded and parsed every file twice,
# once for X and once for y) and then split by position:
# columns 61..146 are the features, column 147 is the target.
selected_columns = list(range(61, 148))
train_sheet = pd.read_excel(link_train_death_a, header=[0], usecols=selected_columns)
test_sheet  = pd.read_excel(link_test_death_a, header=[0], usecols=selected_columns)

# The target is the last selected column; y stays a one-column DataFrame as before.
# .copy() keeps the subsets independent of the sheet frames.
X_train, y_train = train_sheet.iloc[:, :-1].copy(), train_sheet.iloc[:, [-1]].copy()
X_test,  y_test  = test_sheet.iloc[:, :-1].copy(),  test_sheet.iloc[:, [-1]].copy()

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers A/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (147, 86)
y_train shape:	 (147, 1)
X_test shape:	 (49, 86)
y_test shape:	 (49, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score and a 5-fold StratifiedKFold splitter.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

### Metrics table

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.655,0.119,0.389,0.365,0.551,0.438,0.35,0.512,0.509,20,9,13,7,0.669,0.069
SVM,0.663,0.331,0.58,0.775,0.408,0.408,1.0,0.463,0.471,0,29,0,20,0.542,0.088
Logistic Regression,0.664,0.118,0.605,0.631,0.653,0.565,0.65,0.64,0.648,19,10,7,13,0.631,0.102
KNN,0.443,0.131,0.387,0.33,0.612,0.545,0.3,0.568,0.642,24,5,14,6,0.444,0.069
CatBoost,0.559,0.048,0.488,0.495,0.571,0.476,0.5,0.481,0.548,18,11,10,10,0.633,0.062


In [None]:
# Random-forest feature importances for the results stored under `optimisation_path`
# (random_forest_importances() is defined elsewhere in the notebook).
# NOTE(review): the result is bound to `_`, the name IPython also uses for the last cell
# output; the subset cell further down reads `_.index[:20]`, so nothing else may
# reassign `_` in between. A descriptive name would be safer.
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [None]:
# Keep only the columns named by the first 20 index entries of the importances held in `_`
# (presumably ordered by importance — confirm in random_forest_importances).
top_features = _.index[:20]
X_train = X_train[top_features]
X_test = X_test[top_features]

# results of the reduced-feature run go to their own directory
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers A/biomarkers top features/"

### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score and a 5-fold StratifiedKFold splitter.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

### Metrics

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.712,0.1,0.432,0.412,0.571,0.471,0.4,0.553,0.569,20,9,12,8,0.718,0.097
SVM,0.665,0.095,0.474,0.459,0.592,0.5,0.45,0.529,0.556,20,9,11,9,0.694,0.063
Logistic Regression,0.771,0.045,0.542,0.602,0.551,0.464,0.65,0.589,0.624,14,15,7,13,0.765,0.073
KNN,0.631,0.155,0.41,0.404,0.531,0.421,0.4,0.409,0.491,18,11,12,8,0.706,0.086
CatBoost,0.592,0.133,0.3,0.3,0.429,0.3,0.3,0.454,0.474,15,14,14,6,0.672,0.062


## Clinical features + Biomarkers A

#### Subset
#### Split into train and test

In [None]:
# download Dataset from Github repo and read as excel file

# Each workbook is read once (the original cell downloaded and parsed every file twice,
# once for X and once for y) and then split by position:
# columns 1..146 are the features, column 147 is the target.
selected_columns = list(range(1, 148))
train_sheet = pd.read_excel(link_train_death_a, header=[0], usecols=selected_columns)
test_sheet  = pd.read_excel(link_test_death_a, header=[0], usecols=selected_columns)

# The target is the last selected column; y stays a one-column DataFrame as before.
# .copy() keeps the subsets independent of the sheet frames.
X_train, y_train = train_sheet.iloc[:, :-1].copy(), train_sheet.iloc[:, [-1]].copy()
X_test,  y_test  = test_sheet.iloc[:, :-1].copy(),  test_sheet.iloc[:, [-1]].copy()

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (147, 146)
y_train shape:	 (147, 1)
X_test shape:	 (49, 146)
y_test shape:	 (49, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score and a 5-fold StratifiedKFold splitter.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

### Metrics tables

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.69,0.085,0.571,0.526,0.694,0.667,0.5,0.685,0.743,24,5,10,10,0.734,0.062
SVM,0.678,0.027,0.744,0.777,0.776,0.696,0.8,0.692,0.769,22,7,4,16,0.659,0.059
Logistic Regression,0.705,0.135,0.65,0.65,0.714,0.65,0.65,0.739,0.724,22,7,7,13,0.665,0.101
KNN,0.524,0.1,0.667,0.625,0.755,0.75,0.6,0.644,0.669,25,4,8,12,0.501,0.045
CatBoost,0.562,0.072,0.375,0.326,0.592,0.5,0.3,0.565,0.584,23,6,14,6,0.686,0.095


In [None]:
# Random-forest feature importances for the results stored under `optimisation_path`
# (random_forest_importances() is defined elsewhere in the notebook).
# NOTE(review): the result is bound to `_`, the name IPython also uses for the last cell
# output; the subset cell further down reads `_.index[:20]`, so nothing else may
# reassign `_` in between. A descriptive name would be safer.
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [None]:
# Keep only the columns named by the first 20 index entries of the importances held in `_`
# (presumably ordered by importance — confirm in random_forest_importances).
top_features = _.index[:20]
X_train = X_train[top_features]
X_test = X_test[top_features]

# results of the reduced-feature run go to their own directory
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score and a 5-fold StratifiedKFold splitter.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

### Metrics

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.726,0.089,0.629,0.579,0.735,0.733,0.55,0.721,0.753,25,4,9,11,0.755,0.087
SVM,0.776,0.044,0.545,0.577,0.592,0.5,0.6,0.65,0.672,17,12,8,12,0.787,0.08
Logistic Regression,0.838,0.046,0.605,0.631,0.653,0.565,0.65,0.611,0.686,19,10,7,13,0.795,0.056
KNN,0.638,0.059,0.55,0.55,0.633,0.55,0.55,0.58,0.679,20,9,9,11,0.779,0.071
CatBoost,0.657,0.142,0.632,0.612,0.714,0.667,0.6,0.685,0.709,23,6,8,12,0.746,0.111


## Clinical features A

#### Subset
#### Split into train and test

In [None]:
# download Dataset from Github repo and read as excel file

# Each workbook is read once (the original cell downloaded and parsed every file twice,
# once for X and once for y) and then split by position:
# columns 1..60 are the features, column 147 is the target.
selected_columns = list(range(1, 61)) + [147]
train_sheet = pd.read_excel(link_train_death_a, header=[0], usecols=selected_columns)
test_sheet  = pd.read_excel(link_test_death_a, header=[0], usecols=selected_columns)

# The target is the last selected column; y stays a one-column DataFrame as before.
# .copy() keeps the subsets independent of the sheet frames.
X_train, y_train = train_sheet.iloc[:, :-1].copy(), train_sheet.iloc[:, [-1]].copy()
X_test,  y_test  = test_sheet.iloc[:, :-1].copy(),  test_sheet.iloc[:, [-1]].copy()

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical A/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (147, 60)
y_train shape:	 (147, 1)
X_test shape:	 (49, 60)
y_test shape:	 (49, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score and a 5-fold StratifiedKFold splitter.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

### Metrics tables

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.681,0.107,0.563,0.489,0.714,0.75,0.45,0.789,0.783,26,3,11,9,0.715,0.066
SVM,0.749,0.034,0.634,0.644,0.694,0.619,0.65,0.705,0.717,21,8,7,13,0.699,0.052
Logistic Regression,0.789,0.053,0.654,0.759,0.633,0.531,0.85,0.759,0.759,14,15,3,17,0.727,0.063
KNN,0.577,0.028,0.514,0.474,0.653,0.6,0.45,0.551,0.674,23,6,11,9,0.611,0.045
CatBoost,0.53,0.07,0.615,0.606,0.694,0.632,0.6,0.749,0.75,22,7,8,12,0.637,0.067


In [None]:
# Random-forest feature importances for the results stored under `optimisation_path`
# (random_forest_importances() is defined elsewhere in the notebook).
# NOTE(review): the result is bound to `_`, the name IPython also uses for the last cell
# output; the subset cell further down reads `_.index[:20]`, so nothing else may
# reassign `_` in between. A descriptive name would be safer.
# NOTE(review): biomarkers=True although this section loads the clinical columns only — confirm.
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [None]:
# Keep only the columns named by the first 20 index entries of the importances held in `_`
# (presumably ordered by importance — confirm in random_forest_importances).
top_features = _.index[:20]
X_train = X_train[top_features]
X_test = X_test[top_features]

# results of the reduced-feature run go to their own directory
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical A/top features/"

### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score and a 5-fold StratifiedKFold splitter.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

### Metrics

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.686,0.147,0.611,0.573,0.714,0.688,0.55,0.763,0.755,24,5,9,11,0.724,0.082
SVM,0.667,0.072,0.5,0.529,0.551,0.458,0.55,0.568,0.608,16,13,9,11,0.674,0.094
Logistic Regression,0.742,0.097,0.591,0.625,0.633,0.542,0.65,0.749,0.738,18,11,7,13,0.736,0.049
KNN,0.584,0.111,0.444,0.417,0.592,0.5,0.4,0.598,0.635,21,8,12,8,0.691,0.046
CatBoost,0.626,0.146,0.529,0.479,0.673,0.643,0.45,0.73,0.707,24,5,11,9,0.681,0.064


## Biomarkers B

### Subset
### Split into train and test

In [None]:
# download Dataset from Github repo and read as excel file

# Each workbook is read once (the original cell downloaded and parsed every file twice,
# once for X and once for y) and then split by position:
# columns 72..77 are the features, column 78 is the target.
selected_columns = list(range(72, 79))
train_sheet = pd.read_excel(link_train_death_b, header=[0], usecols=selected_columns)
test_sheet  = pd.read_excel(link_test_death_b, header=[0], usecols=selected_columns)

# The target is the last selected column; y stays a one-column DataFrame as before.
# .copy() keeps the subsets independent of the sheet frames.
X_train, y_train = train_sheet.iloc[:, :-1].copy(), train_sheet.iloc[:, [-1]].copy()
X_test,  y_test  = test_sheet.iloc[:, :-1].copy(),  test_sheet.iloc[:, [-1]].copy()

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers B/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (58, 6)
y_train shape:	 (58, 1)
X_test shape:	 (30, 6)
y_test shape:	 (30, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score.
# NOTE(review): cross_validation is the plain integer 4 here, whereas most other sections
# pass StratifiedKFold(5) and the metrics table columns are labelled "cv=5" — confirm that
# the smaller fold count is deliberate.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=4,
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

### Metrics tables

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)
# NOTE(review): the column labels used below say "cv=5", but this section's tuning cell
# passes cross_validation=4 — confirm what metric_table actually reports here.

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.0,0.0,0.286,0.2,0.833,1.0,0.167,0.644,0.799,24,0,5,1,0.231,0.129
SVM,0.468,0.285,0.211,0.27,0.5,0.154,0.333,0.285,0.486,13,11,4,2,0.603,0.097
Logistic Regression,0.198,0.228,0.4,0.526,0.6,0.286,0.667,0.316,0.597,14,10,2,4,0.426,0.246
KNN,0.199,0.211,0.2,0.179,0.733,0.25,0.167,0.208,0.521,21,3,5,1,0.483,0.12
CatBoost,0.0,0.0,0.0,0.0,0.667,0.0,0.0,0.292,0.618,20,4,6,0,0.256,0.146


In [None]:
# Random-forest feature importances for the results stored under `optimisation_path`
# (random_forest_importances() is defined elsewhere in the notebook).
# NOTE(review): n_features=20 is requested although this section loads only 6 feature
# columns — confirm the helper copes with fewer features than requested.
# NOTE(review): the result is bound to `_`, the name IPython also uses for the last cell
# output; a descriptive name would be safer.
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

## Clinical features + Biomarkers B

#### Subset
#### Split into train and test

In [None]:
# download Dataset from Github repo and read as excel file

# Each workbook is read once (the original cell downloaded and parsed every file twice,
# once for X and once for y) and then split by position:
# columns 1..77 are the features, column 78 is the target.
selected_columns = list(range(1, 79))
train_sheet = pd.read_excel(link_train_death_b, header=[0], usecols=selected_columns)
test_sheet  = pd.read_excel(link_test_death_b, header=[0], usecols=selected_columns)

# The target is the last selected column; y stays a one-column DataFrame as before.
# .copy() keeps the subsets independent of the sheet frames.
X_train, y_train = train_sheet.iloc[:, :-1].copy(), train_sheet.iloc[:, [-1]].copy()
X_test,  y_test  = test_sheet.iloc[:, :-1].copy(),  test_sheet.iloc[:, [-1]].copy()

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (58, 77)
y_train shape:	 (58, 1)
X_test shape:	 (30, 77)
y_test shape:	 (30, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score and a 5-fold StratifiedKFold splitter.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

### Metrics tables

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.399,0.209,0.6,0.536,0.867,0.75,0.5,0.655,0.722,23,1,3,3,0.706,0.193
SVM,0.568,0.161,0.556,0.694,0.733,0.417,0.833,0.853,0.889,17,7,1,5,0.675,0.207
Logistic Regression,0.495,0.118,0.4,0.581,0.5,0.263,0.833,0.8,0.868,10,14,1,5,0.628,0.128
KNN,0.2,0.4,0.182,0.172,0.7,0.2,0.167,0.2,0.5,20,4,5,1,0.559,0.227
CatBoost,0.311,0.285,0.6,0.536,0.867,0.75,0.5,0.648,0.819,23,1,3,3,0.711,0.206


In [None]:
# Random-forest feature importances for the results stored under `optimisation_path`
# (random_forest_importances() is defined elsewhere in the notebook).
# NOTE(review): the result is bound to `_`, the name IPython also uses for the last cell
# output; the subset cell further down reads `_.index[:20]`, so nothing else may
# reassign `_` in between. A descriptive name would be safer.
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

#### subset

In [None]:
# Keep only the columns named by the first 20 index entries of the importances held in `_`
# (presumably ordered by importance — confirm in random_forest_importances).
top_features = _.index[:20]
X_train = X_train[top_features]
X_test = X_test[top_features]

# results of the reduced-feature run go to their own directory
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score and a 5-fold StratifiedKFold splitter.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

### Metrics

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.399,0.209,0.545,0.517,0.833,0.6,0.5,0.637,0.722,22,2,3,3,0.709,0.244
SVM,0.459,0.291,0.462,0.484,0.767,0.429,0.5,0.47,0.701,20,4,3,3,0.731,0.239
Logistic Regression,0.536,0.303,0.222,0.185,0.767,0.333,0.167,0.521,0.799,22,2,5,1,0.636,0.255
KNN,0.268,0.222,0.6,0.536,0.867,0.75,0.5,0.475,0.729,23,1,3,3,0.603,0.086
CatBoost,0.322,0.295,0.6,0.536,0.867,0.75,0.5,0.64,0.708,23,1,3,3,0.74,0.235


## Clinical features B

#### Subset
#### Split into train and test

In [None]:
# download Dataset from Github repo and read as excel file

# Each workbook is read once (the original cell downloaded and parsed every file twice,
# once for X and once for y) and then split by position:
# columns 1..71 are the features, column 78 is the target.
selected_columns = list(range(1, 72)) + [78]
train_sheet = pd.read_excel(link_train_death_b, header=[0], usecols=selected_columns)
test_sheet  = pd.read_excel(link_test_death_b, header=[0], usecols=selected_columns)

# The target is the last selected column; y stays a one-column DataFrame as before.
# .copy() keeps the subsets independent of the sheet frames.
X_train, y_train = train_sheet.iloc[:, :-1].copy(), train_sheet.iloc[:, [-1]].copy()
X_test,  y_test  = test_sheet.iloc[:, :-1].copy(),  test_sheet.iloc[:, [-1]].copy()

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical B/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (58, 71)
y_train shape:	 (58, 1)
X_test shape:	 (30, 71)
y_test shape:	 (30, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score and a 5-fold StratifiedKFold splitter.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

### Metrics tables

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.299,0.252,0.545,0.517,0.833,0.6,0.5,0.573,0.792,22,2,3,3,0.72,0.24
SVM,0.534,0.294,0.4,0.526,0.6,0.286,0.667,0.546,0.743,14,10,2,4,0.679,0.215
Logistic Regression,0.511,0.141,0.4,0.625,0.4,0.25,1.0,0.788,0.847,6,18,0,6,0.727,0.192
KNN,0.188,0.237,0.182,0.172,0.7,0.2,0.167,0.2,0.5,20,4,5,1,0.53,0.157
CatBoost,0.2,0.274,0.6,0.536,0.867,0.75,0.5,0.678,0.681,23,1,3,3,0.727,0.099


In [None]:
# Random-forest feature importances for the results stored under `optimisation_path`
# (random_forest_importances() is defined elsewhere in the notebook).
# NOTE(review): the result is bound to `_`, the name IPython also uses for the last cell
# output; the subset cell further down reads `_.index[:20]`, so nothing else may
# reassign `_` in between. A descriptive name would be safer.
# NOTE(review): biomarkers=True although this section loads the clinical columns only — confirm.
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [None]:
# Keep only the columns named by the first 20 index entries of the importances held in `_`
# (presumably ordered by importance — confirm in random_forest_importances).
top_features = _.index[:20]
X_train = X_train[top_features]
X_test = X_test[top_features]

# Fixed: this cell used to point at ".../Biomarkers B + Clinical/top features/", the
# directory of the previous section, so the clinical-only run overwrote that section's
# results. It now follows the "Clinical A/top features/" pattern.
# NOTE(review): results previously saved from this section sit in the old directory;
# re-run the tuning cell below (and create the new directory if tuning() does not).
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical B/top features/"

### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter search with all five model flags switched on, using
# my_f2_scorer() as the score and a 5-fold StratifiedKFold splitter.
# NOTE(review): tuning() is defined elsewhere in the notebook; it presumably picks up the
# current X_train / y_train globals and stores its results under `path` — confirm.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
clear_output()  # clear whatever the search printed into the cell output

### Metrics

In [None]:
# Build the metrics table for `optimisation_path` and keep an Excel copy next to the results.
# (metric_table() is defined elsewhere in the notebook.)

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0; the explicit
# float formatter below renders the same 3-decimal output on old and new pandas alike.
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .format(lambda value: f'{value:.3f}' if pd.api.types.is_float(value) else value)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.499,0.322,0.545,0.517,0.833,0.6,0.5,0.699,0.806,22,2,3,3,0.787,0.179
SVM,0.517,0.27,0.133,0.152,0.567,0.111,0.167,0.211,0.5,16,8,5,1,0.601,0.113
Logistic Regression,0.548,0.139,0.4,0.581,0.5,0.263,0.833,0.792,0.861,10,14,1,5,0.72,0.185
KNN,0.311,0.255,0.5,0.385,0.867,1.0,0.333,0.523,0.764,24,0,4,2,0.638,0.123
CatBoost,0.2,0.274,0.667,0.556,0.9,1.0,0.5,0.792,0.868,24,0,3,3,0.624,0.187


## Biomarkers C

### Subset
### Split into train and test

In [None]:
# download Dataset from Github repo and read as excel file

# Each workbook is read once (the original cell downloaded and parsed every file twice,
# once for X and once for y) and then split by position:
# columns 101..105 are the features, column 106 is the target.
selected_columns = list(range(101, 107))
train_sheet = pd.read_excel(link_train_death_c, header=[0], usecols=selected_columns)
test_sheet  = pd.read_excel(link_test_death_c, header=[0], usecols=selected_columns)

# The target is the last selected column; y stays a one-column DataFrame as before.
# .copy() keeps the subsets independent of the sheet frames.
X_train, y_train = train_sheet.iloc[:, :-1].copy(), train_sheet.iloc[:, [-1]].copy()
X_test,  y_test  = test_sheet.iloc[:, :-1].copy(),  test_sheet.iloc[:, [-1]].copy()

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers C/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 5)
y_train shape:	 (86, 1)
X_test shape:	 (43, 5)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
# Run `tuning` with F2 scoring (CatBoost gets the matching 'F:beta=2' metric) and all five model flags on.
# NOTE(review): this call passes cross_validation=4, while the other sections pass StratifiedKFold(5)
# and the metrics table headers read "cv=5" -- confirm that 4 folds is intended here.
tuning(score=my_f2_scorer(),
       catboost_score='F:beta=2',
       cross_validation=4,
       path=optimisation_path,
       logistic_regression=True,
       knn=True,
       random_forest=True,
       svm=True,
       catboost=True)

# Remove whatever the tuning run wrote to the cell output.
clear_output()

### Metrics tables

In [None]:
# Build the metrics table for the current optimisation folder and store it there as an Excel file.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Show the table: centred cells, a coolwarm gradient on the key score columns, floats with 3 decimals.
# `Styler.format(precision=3)` (pandas >= 1.3) replaces `Styler.set_precision(3)`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])
    .format(precision=3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.208,0.361,0.4,0.625,0.93,0.25,1.0,1.0,1.0,39,3,0,1,0.719,0.168
SVM,0.495,0.2,0.0,0.0,0.698,0.0,0.0,0.031,0.262,30,12,1,0,0.856,0.092
Logistic Regression,0.281,0.174,0.074,0.167,0.419,0.038,1.0,0.167,0.881,17,25,0,1,0.519,0.193
KNN,0.25,0.433,0.0,0.0,0.93,0.0,0.0,0.023,0.333,40,2,1,0,0.703,0.206
CatBoost,0.0,0.0,0.0,0.0,0.86,0.0,0.0,0.167,0.881,37,5,1,0,0.8,0.155


In [None]:
# Call random_forest_importances for the current folder (20 features requested); the return value is kept in `_`.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

## Clinical features + Biomarkers C

#### Subset
#### Split into train and test

In [None]:
# Read the pre-split train and test workbooks (per the original note, they come from the GitHub repo).
# Features are the columns at 0-based positions 1-105; the target is the column at position 106.
X_train = pd.read_excel(
    link_train_death_c, header=[0], usecols=list(range(1,106)))
y_train = pd.read_excel(
    link_train_death_c, header=[0], usecols=[106])
X_test = pd.read_excel(
    link_test_death_c, header=[0], usecols=list(range(1,106)))
y_test = pd.read_excel(
    link_test_death_c, header=[0], usecols=[106])

# Folder where the results of this section are saved.
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/all biomarkers and clinical/"

# Report the shape of each subset, one per line.
print('\n'.join(
    f'{label} shape:\t {subset.shape}'
    for label, subset in (('X_train', X_train), ('y_train', y_train),
                          ('X_test', X_test), ('y_test', y_test))
))

X_train shape:	 (86, 105)
y_train shape:	 (86, 1)
X_test shape:	 (43, 105)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
# Run `tuning` with F2 scoring (CatBoost gets the matching 'F:beta=2' metric),
# stratified 5-fold cross-validation and all five model flags on.
tuning(score=my_f2_scorer(),
       catboost_score='F:beta=2',
       cross_validation=StratifiedKFold(5),
       path=optimisation_path,
       logistic_regression=True,
       knn=True,
       random_forest=True,
       svm=True,
       catboost=True)

# Remove whatever the tuning run wrote to the cell output.
clear_output()

### Metrics tables

In [None]:
# Build the metrics table for the current optimisation folder and store it there as an Excel file.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Show the table: centred cells, a coolwarm gradient on the key score columns, floats with 3 decimals.
# `Styler.format(precision=3)` (pandas >= 1.3) replaces `Styler.set_precision(3)`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])
    .format(precision=3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.0,0.0,0.0,0.0,0.977,0.0,0.0,0.25,0.929,42,0,1,0,0.863,0.187
SVM,0.427,0.221,0.0,0.0,0.721,0.0,0.0,0.056,0.595,31,11,1,0,0.769,0.108
Logistic Regression,0.329,0.167,0.0,0.0,0.628,0.0,0.0,0.04,0.429,27,15,1,0,0.769,0.139
KNN,0.4,0.49,0.0,0.0,0.977,0.0,0.0,0.023,0.44,42,0,1,0,0.775,0.24
CatBoost,0.167,0.373,0.0,0.0,0.93,0.0,0.0,0.167,0.881,40,2,1,0,0.863,0.133


In [None]:
# Call random_forest_importances for the current folder (20 features requested).
# The return value is kept in `_`; the following "subset" cell reads `_.index`.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

### subset

In [None]:
# Keep only the columns named by the first 20 index labels of `_`
# (the value returned by random_forest_importances -- presumably sorted by importance; TODO confirm).
X_train = X_train.loc[:, _.index[:20]]
X_test = X_test.loc[:, _.index[:20]]

# Results for the reduced feature set go to their own folder.
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
# Run `tuning` with F2 scoring (CatBoost gets the matching 'F:beta=2' metric),
# stratified 5-fold cross-validation and all five model flags on.
tuning(score=my_f2_scorer(),
       catboost_score='F:beta=2',
       cross_validation=StratifiedKFold(5),
       path=optimisation_path,
       logistic_regression=True,
       knn=True,
       random_forest=True,
       svm=True,
       catboost=True)

# Remove whatever the tuning run wrote to the cell output.
clear_output()

### Metrics

In [None]:
# Build the metrics table for the current optimisation folder and store it there as an Excel file.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Show the table: centred cells, a coolwarm gradient on the key score columns, floats with 3 decimals.
# `Styler.format(precision=3)` (pandas >= 1.3) replaces `Styler.set_precision(3)`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])
    .format(precision=3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.311,0.406,0.0,0.0,0.977,0.0,0.0,0.167,0.881,42,0,1,0,0.975,0.031
SVM,0.675,0.081,0.0,0.0,0.767,0.0,0.0,0.083,0.738,33,9,1,0,0.894,0.037
Logistic Regression,0.508,0.083,0.125,0.263,0.674,0.067,1.0,0.25,0.929,28,14,0,1,0.919,0.047
KNN,0.6,0.49,0.0,0.0,0.907,0.0,0.0,0.023,0.44,39,3,1,0,0.834,0.21
CatBoost,0.167,0.373,0.0,0.0,0.907,0.0,0.0,0.143,0.857,39,3,1,0,0.875,0.068


## Clinical features C

#### Subset
#### Split into train and test

In [None]:
# Read the pre-split train and test workbooks (per the original note, they come from the GitHub repo).
# Features are the columns at 0-based positions 1-100; the target is the column at position 106.
X_train = pd.read_excel(
    link_train_death_c, header=[0], usecols=list(range(1,101)))
y_train = pd.read_excel(
    link_train_death_c, header=[0], usecols=[106])
X_test = pd.read_excel(
    link_test_death_c, header=[0], usecols=list(range(1,101)))
y_test = pd.read_excel(
    link_test_death_c, header=[0], usecols=[106])

# Folder where the results of this section are saved.
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical C/all biomarkers and clinical/"

# Report the shape of each subset, one per line.
print('\n'.join(
    f'{label} shape:\t {subset.shape}'
    for label, subset in (('X_train', X_train), ('y_train', y_train),
                          ('X_test', X_test), ('y_test', y_test))
))

X_train shape:	 (86, 100)
y_train shape:	 (86, 1)
X_test shape:	 (43, 100)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
# Run `tuning` with F2 scoring (CatBoost gets the matching 'F:beta=2' metric),
# stratified 5-fold cross-validation and all five model flags on.
tuning(score=my_f2_scorer(),
       catboost_score='F:beta=2',
       cross_validation=StratifiedKFold(5),
       path=optimisation_path,
       logistic_regression=True,
       knn=True,
       random_forest=True,
       svm=True,
       catboost=True)

# Remove whatever the tuning run wrote to the cell output.
clear_output()

### Metrics tables

In [None]:
# Build the metrics table for the current optimisation folder and store it there as an Excel file.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Show the table: centred cells, a coolwarm gradient on the key score columns, floats with 3 decimals.
# `Styler.format(precision=3)` (pandas >= 1.3) replaces `Styler.set_precision(3)`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])
    .format(precision=3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.0,0.0,0.0,0.0,0.977,0.0,0.0,0.1,0.786,42,0,1,0,0.912,0.094
SVM,0.427,0.221,0.0,0.0,0.744,0.0,0.0,0.056,0.595,32,10,1,0,0.781,0.093
Logistic Regression,0.334,0.169,0.0,0.0,0.628,0.0,0.0,0.04,0.429,27,15,1,0,0.794,0.121
KNN,0.4,0.49,0.0,0.0,0.977,0.0,0.0,0.023,0.464,42,0,1,0,0.775,0.24
CatBoost,0.167,0.373,0.0,0.0,0.953,0.0,0.0,0.2,0.905,41,1,1,0,0.9,0.075


In [None]:
# Call random_forest_importances for the current folder (20 features requested).
# The return value is kept in `_`; the following "subset" cell reads `_.index`.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

### subset

In [None]:
# Keep only the columns named by the first 20 index labels of `_`
# (the value returned by random_forest_importances -- presumably sorted by importance; TODO confirm).
X_train = X_train.loc[:, _.index[:20]]
X_test = X_test.loc[:, _.index[:20]]

# Results for the reduced feature set go to their own folder.
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical C/top features/"

### Hyper-parameter optimisation

In [None]:
# Run `tuning` with F2 scoring (CatBoost gets the matching 'F:beta=2' metric),
# stratified 5-fold cross-validation and all five model flags on.
tuning(score=my_f2_scorer(),
       catboost_score='F:beta=2',
       cross_validation=StratifiedKFold(5),
       path=optimisation_path,
       logistic_regression=True,
       knn=True,
       random_forest=True,
       svm=True,
       catboost=True)

# Remove whatever the tuning run wrote to the cell output.
clear_output()

### Metrics

In [None]:
# Build the metrics table for the current optimisation folder and store it there as an Excel file.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Show the table: centred cells, a coolwarm gradient on the key score columns, floats with 3 decimals.
# `Styler.format(precision=3)` (pandas >= 1.3) replaces `Styler.set_precision(3)`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])
    .format(precision=3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.4,0.49,0.0,0.0,0.93,0.0,0.0,0.2,0.905,40,2,1,0,0.988,0.025
SVM,0.678,0.376,0.0,0.0,0.907,0.0,0.0,0.056,0.595,39,3,1,0,0.975,0.05
Logistic Regression,0.554,0.279,0.0,0.0,0.767,0.0,0.0,0.083,0.738,33,9,1,0,0.719,0.362
KNN,0.6,0.49,0.0,0.0,0.93,0.0,0.0,0.023,0.476,40,2,1,0,0.794,0.253
CatBoost,0.0,0.0,0.0,0.0,0.907,0.0,0.0,0.2,0.905,39,3,1,0,0.869,0.041


## Clinical features A-B-C

#### Subset
#### Split into train and test

In [None]:
# Read the pre-split train and test workbooks (per the original note, they come from the GitHub repo).
# Features are the columns at 0-based positions 1-56; the target is the column at position 57.
X_train = pd.read_excel(
    link_train_death_abc, header=[0], usecols=list(range(1,57)))
y_train = pd.read_excel(
    link_train_death_abc, header=[0], usecols=[57])
X_test = pd.read_excel(
    link_test_death_abc, header=[0], usecols=list(range(1,57)))
y_test = pd.read_excel(
    link_test_death_abc, header=[0], usecols=[57])

# Folder where the results of this section are saved.
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical ABC/all clinical/"

# Report the shape of each subset, one per line.
print('\n'.join(
    f'{label} shape:\t {subset.shape}'
    for label, subset in (('X_train', X_train), ('y_train', y_train),
                          ('X_test', X_test), ('y_test', y_test))
))

X_train shape:	 (309, 56)
y_train shape:	 (309, 1)
X_test shape:	 (104, 56)
y_test shape:	 (104, 1)


### Hyper-parameter optimisation

In [None]:
# Run `tuning` with F2 scoring (CatBoost gets the matching 'F:beta=2' metric),
# stratified 5-fold cross-validation and all five model flags on.
tuning(score=my_f2_scorer(),
       catboost_score='F:beta=2',
       cross_validation=StratifiedKFold(5),
       path=optimisation_path,
       logistic_regression=True,
       knn=True,
       random_forest=True,
       svm=True,
       catboost=True)

# Remove whatever the tuning run wrote to the cell output.
clear_output()

### Metrics tables

In [None]:
# Build the metrics table for the current optimisation folder and store it there as an Excel file.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Show the table: centred cells, a coolwarm gradient on the key score columns, floats with 3 decimals.
# `Styler.format(precision=3)` (pandas >= 1.3) replaces `Styler.set_precision(3)`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])
    .format(precision=3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.577,0.098,0.667,0.704,0.817,0.613,0.731,0.749,0.868,66,12,7,19,0.777,0.069
SVM,0.657,0.105,0.689,0.755,0.817,0.6,0.808,0.752,0.876,64,14,5,21,0.794,0.024
Logistic Regression,0.729,0.035,0.63,0.762,0.74,0.489,0.885,0.721,0.861,54,24,3,23,0.775,0.053
KNN,0.444,0.174,0.578,0.528,0.817,0.684,0.5,0.571,0.789,72,6,13,13,0.718,0.088
CatBoost,0.474,0.181,0.609,0.565,0.827,0.7,0.538,0.734,0.858,72,6,12,14,0.802,0.07


In [None]:
# Call random_forest_importances for the current folder (20 features requested).
# The return value is kept in `_`; the later "subset" cell reads `_.index`.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

In [None]:
%%time 
df = feature_selection(
                              # dataset = clinical_and_biomarkers_b,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

NameError: ignored

### Top feature optimisation and metrics

#### subset

In [None]:
# Keep only the columns named by the first 20 index labels of `_`
# (the value returned by random_forest_importances -- presumably sorted by importance; TODO confirm).
X_train = X_train.loc[:, _.index[:20]]
X_test = X_test.loc[:, _.index[:20]]

# Results for the reduced feature set go to their own folder.
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical ABC/top features/"

### Hyper-parameter optimisation

In [None]:
# Run `tuning` with F2 scoring (CatBoost gets the matching 'F:beta=2' metric),
# stratified 5-fold cross-validation and all five model flags on.
tuning(score=my_f2_scorer(),
       catboost_score='F:beta=2',
       cross_validation=StratifiedKFold(5),
       path=optimisation_path,
       logistic_regression=True,
       knn=True,
       random_forest=True,
       svm=True,
       catboost=True)

# Remove whatever the tuning run wrote to the cell output.
clear_output()

### Metrics

In [None]:
# Build the metrics table for the current optimisation folder and store it there as an Excel file.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Show the table: centred cells, a coolwarm gradient on the key score columns, floats with 3 decimals.
# `Styler.format(precision=3)` (pandas >= 1.3) replaces `Styler.set_precision(3)`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])
    .format(precision=3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.579,0.132,0.679,0.709,0.827,0.633,0.731,0.695,0.816,67,11,7,19,0.769,0.079
SVM,0.659,0.106,0.679,0.709,0.827,0.633,0.731,0.745,0.851,67,11,7,19,0.785,0.067
Logistic Regression,0.693,0.047,0.579,0.714,0.692,0.44,0.846,0.688,0.835,50,28,4,22,0.778,0.062
KNN,0.479,0.199,0.35,0.297,0.75,0.5,0.269,0.454,0.774,71,7,19,7,0.754,0.086
CatBoost,0.49,0.198,0.565,0.524,0.808,0.65,0.5,0.684,0.809,71,7,13,13,0.805,0.064


## Results

### Scores of models

In [None]:
# Metrics to chart: one grouped-bar figure (and one PDF) is produced per entry.
metrics = ['F2', 'ROC_AUC']

# Folders that each hold a metrics_table.xlsx. The list does not depend on the metric,
# so it is built once instead of on every loop iteration.
paths = [
        './HSE project/Optimisation data/cardiovascular death/Biomarkers A/all biomarkers/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers B/all biomarkers/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers C/all biomarkers/',
        './HSE project/Optimisation data/cardiovascular death/Clinical ABC/all clinical/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Clinical A/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Clinical B/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Clinical C/all biomarkers and clinical/',
        ]

# Legend label of each entry of `paths`, in the same order.
# NOTE(review): the 4th label reads 'Clinical+biomarkers ABC' while its folder is
# 'Clinical ABC/all clinical/' -- confirm the label is the intended one.
dataset_names = ['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C']

# Top-level column group of metrics_table.xlsx holding the cross-validation mean/std of each metric.
cv_groups = {'F2': 'F2, train set, cv=5', 'ROC_AUC': 'ROC_AUC, train set, cv=5'}

# x-axis categories (standard models); the order matches the rows of the metrics tables shown in this notebook
models = ['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
# ensemble models
# models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

for metric in metrics:

    # one column per dataset: test-set score, cross-validation mean and cross-validation std
    datasets = pd.DataFrame(columns=dataset_names)
    datasets_mean = pd.DataFrame(columns=dataset_names)
    datasets_std = pd.DataFrame(columns=dataset_names)

    # collect the scores of every model from each dataset's saved metrics table
    for i in range(len(paths)):
        table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standard: metrics_table
        datasets[dataset_names[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3))
        if metric in cv_groups:
            datasets_mean[dataset_names[i]] = list(table.loc[:, (cv_groups[metric], 'mean')].values.round(3))
            datasets_std[dataset_names[i]] = list(table.loc[:, (cv_groups[metric], 'std')].values.round(3))

    # bars: test-set score of every model, one bar per dataset
    fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns])

    # black points: cross-validation mean, drawn as box traces whose box outline and fill are transparent
    fig.add_traces([go.Box(name=column,
                           x=models,
                           y=datasets_mean[column],
                           marker=dict(color="black"),
                           showlegend=False) for column in datasets.columns])
    fig.update_traces(
        selector=dict(type="box"),           # update only boxes
        boxpoints="all",                     # show points
        pointpos=0,                          # centered
        jitter=0,                            # no jitter
        line_color="rgba(255,255,255,0)",    # hide box lines
        fillcolor="rgba(255,255,255,0)",     # hide box fill
    )
    fig.update_layout(boxmode="group")

    # whiskers: fully transparent bars on the overlaid x2 axis that carry the CV std as error bars
    fig.add_traces([go.Bar(name=column,
                           x=models,
                           y=datasets_mean[column],
                           xaxis="x2",
                           error_y=dict(type='data',
                                        array=datasets_std[column],
                                        color="rgba(0,0,0,1)",
                                        thickness=1),
                           marker=dict(opacity=0),
                           showlegend=False) for column in datasets.columns])

    # axes, grouping and titles
    fig.update_xaxes(title='Models')
    fig.update_yaxes(title='Score', range=[0., 1.0])
    fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False},
                      barmode='group',
                      bargap=0.30,
                      bargroupgap=0.3,
                      legend=dict(orientation="v", title='Datasets'),
                      title=dict(text=f'{metric} values', x=0.5,),
                      margin=dict(l=60, r=20, t=60, b=40),)

    # add dotted line for ROC AUC = 0.5
    if metric == 'ROC_AUC':
        fig.add_shape(type='line',
                      x0=-0.5,
                      y0=0.5,
                      x1=4.5,
                      y1=0.5,
                      line=dict(color='firebrick',  width=2, dash='dot'),
                      xref='x',
                      yref='y')

    # figure size
    fig.update_layout(
        autosize=False,
        width=1300,
        height=450,)

    # display in the notebook and export as PDF next to the other figures
    fig.show(renderer='colab')
    fig.write_image(f"{results_path}{metric}.pdf", engine="kaleido")

### Compare with Top 20

In [101]:
# Metric to chart; the original note lists ROC_AUC, F1 and F2 as the options.
metric = 'ROC_AUC'

# Folders with the metrics tables of the models trained on all features
paths = [
        # './HSE project/Optimisation data/cardiovascular death/Biomarkers A/all biomarkers/',
        # './HSE project/Optimisation data/cardiovascular death/Biomarkers B/all biomarkers/',
        # './HSE project/Optimisation data/cardiovascular death/Biomarkers C/all biomarkers/',
        './HSE project/Optimisation data/cardiovascular death/Clinical ABC/all clinical/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/all biomarkers and clinical/'
        ]

# Folders with the metrics tables of the models trained on the top 20 features (same order as `paths`)
paths_top = [
        # './HSE project/Optimisation data/cardiovascular death/Biomarkers A/biomarkers top features/',
        # './HSE project/Optimisation data/cardiovascular death/Biomarkers B/biomarkers top features/',
        # './HSE project/Optimisation data/cardiovascular death/Biomarkers C/biomarkers top features/',
        './HSE project/Optimisation data/cardiovascular death/Clinical ABC/top features/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/top features/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/top features/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/top features/'
        ]

# Legend label of each entry of `paths` (the 'biomarkers A/B/C' datasets are disabled above)
dataset_names = ['Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C']

# One column per dataset. Each column stacks the all-features rows on top of the top-20 rows.
datasets = pd.DataFrame(columns=dataset_names)
datasets_mean = pd.DataFrame(columns=dataset_names)
datasets_std = pd.DataFrame(columns=dataset_names)

# collect the scores of every model from both tables of each dataset
for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0])
    table_top = pd.read_excel(f'{paths_top[i]}metrics_table.xlsx', header=[0,1], index_col=[0])
    datasets[dataset_names[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) + \
                                 list(table_top.loc[:, ('Scores on the test set', metric)].values.round(3))
    # CV mean/std are only gathered for F2. They are not drawn in this cell: the whisker
    # overlay used in the "Scores of models" cell is not applied here.
    if metric == 'F2':
        datasets_mean[dataset_names[i]] = list(table.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3)) + list(table_top.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3))
        datasets_std[dataset_names[i]] = list(table.loc[:, ('F2, train set, cv=5', 'std')].values.round(3)) + list(table_top.loc[:, ('F2, train set, cv=5', 'std')].values.round(3))

# x-axis categories (standard models); 'KNN' and 'KNN top 20' are left out of this chart
models = ['RandomForest', 'SVM', 'Logistic Regression',  'CatBoost'] + ['RandomForest top 20', 'SVM top 20', 'Logistic Regression top 20',  'CatBoost top 20']

# Row positions of `datasets` that match `models`: positions 3 and 8 (the KNN rows of the two
# stacked tables) are skipped. `.iloc` makes the selection explicitly positional instead of
# relying on the default integer index labels.
kept_rows = [0, 1, 2, 4, 5, 6, 7, 9]

# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column].iloc[kept_rows]) for column in datasets.columns])

# axes, grouping and titles
# (xaxis2 only matters for traces placed on "x2"; no trace in this cell uses it)
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Score', range=[0., 1.0])
fig.update_layout(barmode='group',
                  xaxis2={"overlaying": "x", "range": [-0.525, 9.525], "showticklabels": False},
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Datasets'),
                  title=dict(text=f'{metric} score', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
if metric == 'ROC_AUC':
    fig.add_shape(type='line',
                  x0=-0.5,
                  y0=0.5,
                  x1=7.5,
                  y1=0.5,
                  line=dict(color='firebrick',  width=2, dash='dot'),
                  xref='x',
                  yref='y')

# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,)

# display in the notebook and export as PDF next to the other figures
fig.show(renderer='colab')
fig.write_image(f"{results_path}top_{metric}.pdf", engine="kaleido")

### Feature selection

In [None]:
# features = pd.DataFrame(columns=['features', 
#                                 #  'biomarkers A',
#                                 #  'biomarkers B',
#                                 #  'biomarkers C',
#                                  'Clinical+biomarkers ABC',
#                                  'Clinical+biomarkers A',
#                                  'Clinical+biomarkers B',
#                                 #  'Clinical+biomarkers C',
#                                  ])
# # list of paths
# paths = [
#         # './HSE project/Optimisation data/cardiovascular death/Biomarkers A/all biomarkers/',
#         # './HSE project/Optimisation data/cardiovascular death/Biomarkers B/all biomarkers/',
#         # './HSE project/Optimisation data/cardiovascular death/Biomarkers C/all biomarkers/',
#         './HSE project/Optimisation data/cardiovascular death/Clinical ABC/all clinical/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/all biomarkers and clinical/',
#         './HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/all biomarkers and clinical/',
#         # './HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/all biomarkers and clinical/'
#         ]

# # get dataframe with scores of models from different datasets
# top_features = []
# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     # if i < 3: 
#     #     top_features = top_features+list(str(col) for col in table.index[:10])
#     # else:    
#     top_features = top_features+list(eval(col)[1] for col in table.index[:10])

# features['features'] = list(set(top_features))
# features.index = list(set(top_features))
# features.fillna(0, inplace=True)

# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     # if i < 3: 
#     #     features.loc[list(str(col) for col in table.index[:10]), features.columns[i+1]] = 1
#     # else:    
#         # top_features = top_features+list(eval(col) for col in table.index[:10])   
#     features.loc[list(eval(col)[1] for col in table.index[:10]), features.columns[i+1]] = 1 

# features['features'] = features.iloc[:,1:].apply((lambda x: x.sum()), axis=1)
# features.sort_values(ascending=False,  inplace=True, by=("features"))
# features.columns = ['sum'] + list(features.columns[1:])
# features.to_excel('./HSE project/Optimisation data/cardiovascular death/feature_selection.xlsx')
# features

# **Target**: Revascularization

In [102]:
# Folder that receives the figures of the revascularization part (used as the prefix of the exported PDFs).
results_path = './HSE project/Graphics/revascularization/'

# Two-part key of the target; in English roughly ("final observation outcomes", "repeat revascularization").
# Presumably a (group header, column name) pair of the source table -- TODO confirm.
target_column = (
    'КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ',
    'Повторная реваскуляризация',
)

## Biomarkers A

### Subset
### Split into train and test

In [None]:
# Read the pre-split train and test workbooks (per the original note, they come from the GitHub repo).
# Features are the columns at 0-based positions 61-146; the target is the column at position 147.
X_train = pd.read_excel(
    link_train_revascularization_a, header=[0], usecols=list(range(61,147)))
y_train = pd.read_excel(
    link_train_revascularization_a, header=[0], usecols=[147])
X_test = pd.read_excel(
    link_test_revascularization_a, header=[0], usecols=list(range(61,147)))
y_test = pd.read_excel(
    link_test_revascularization_a, header=[0], usecols=[147])

# Folder where the results of this section are saved.
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers A/all biomarkers/"

# Report the shape of each subset, one per line.
print('\n'.join(
    f'{label} shape:\t {subset.shape}'
    for label, subset in (('X_train', X_train), ('y_train', y_train),
                          ('X_test', X_test), ('y_test', y_test))
))

X_train shape:	 (98, 86)
y_train shape:	 (98, 1)
X_test shape:	 (33, 86)
y_test shape:	 (33, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter optimisation helper for all five model families,
# using the F2-based scorer and a 5-fold stratified split.
# No data is passed in, so `tuning` presumably reads X_train / y_train from the
# notebook globals and writes under `optimisation_path` — verify in its definition.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics table

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.343,0.175,0.462,0.536,0.576,0.375,0.6,0.33,0.557,13,10,4,6,0.445,0.145
SVM,0.694,0.127,0.465,0.685,0.303,0.303,1.0,0.252,0.352,0,23,0,10,0.456,0.051
Logistic Regression,0.552,0.107,0.741,0.877,0.788,0.588,1.0,0.658,0.848,16,7,0,10,0.476,0.124
KNN,0.484,0.07,0.5,0.5,0.697,0.5,0.5,0.402,0.641,18,5,5,5,0.574,0.081
CatBoost,0.225,0.081,0.211,0.204,0.545,0.222,0.2,0.358,0.587,16,7,8,2,0.42,0.094


In [None]:
# Call `random_forest_importances` for the results under `optimisation_path`
# (presumably ranks/plots the top 20 features — verify in its definition).
# The result is kept in `_`; the top-feature cell below selects columns via `_.index`.
# NOTE(review): `_` is also IPython's last-output name; a descriptive variable would be safer.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

### Top feature optimisation and metrics

### Subset

In [None]:
# Restrict both splits to the first 20 labels of `_.index`
# (`_` was assigned by the random_forest_importances cell above).
X_train, X_test = X_train[_.index[:20]], X_test[_.index[:20]]

# Results for the reduced feature set go under their own prefix.
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers A/biomarkers top features/"

### Hyper-parameter optimisation

In [None]:
# Repeat the hyper-parameter optimisation on the reduced feature set:
# all five model families, F2-based scorer, 5-fold stratified split.
# `tuning` presumably reads X_train / y_train from the notebook globals — verify.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.526,0.158,0.552,0.678,0.606,0.421,0.8,0.539,0.713,12,11,2,8,0.627,0.126
SVM,0.72,0.033,0.476,0.694,0.333,0.312,1.0,0.232,0.278,1,22,0,10,0.551,0.061
Logistic Regression,0.545,0.158,0.462,0.536,0.576,0.375,0.6,0.382,0.622,13,10,4,6,0.556,0.148
KNN,0.564,0.05,0.417,0.463,0.576,0.357,0.5,0.444,0.648,14,9,5,5,0.611,0.087
CatBoost,0.453,0.18,0.615,0.714,0.697,0.5,0.8,0.67,0.8,15,8,2,8,0.531,0.102


## Clinical features + Biomarkers A

#### Subset
#### Split into train and test

In [None]:
# Read the separate train and test workbooks behind the `_a` links.
# Column positions 1-146 form the feature matrix (clinical + biomarker columns,
# going by the section title), position 147 the target.
X_train = pd.read_excel(
    link_train_revascularization_a,
    header=[0],
    usecols=list(range(1,147)),
)
y_train = pd.read_excel(
    link_train_revascularization_a,
    header=[0],
    usecols=[147],
)
X_test = pd.read_excel(
    link_test_revascularization_a,
    header=[0],
    usecols=list(range(1,147)),
)
y_test = pd.read_excel(
    link_test_revascularization_a,
    header=[0],
    usecols=[147],
)

# Directory prefix used by the following cells for this subset's results.
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/"

# Sanity check: show the dimensions of every split.
print(
    f'X_train shape:\t {X_train.shape}',
    f'y_train shape:\t {y_train.shape}',
    f'X_test shape:\t {X_test.shape}',
    f'y_test shape:\t {y_test.shape}',
    sep='\n',
)

X_train shape:	 (98, 146)
y_train shape:	 (98, 1)
X_test shape:	 (33, 146)
y_test shape:	 (33, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter optimisation helper for all five model families,
# using the F2-based scorer and a 5-fold stratified split.
# No data is passed in, so `tuning` presumably reads X_train / y_train from the
# notebook globals and writes under `optimisation_path` — verify in its definition.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics tables

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.277,0.153,0.364,0.385,0.576,0.333,0.4,0.333,0.574,15,8,6,4,0.435,0.123
SVM,0.666,0.182,0.465,0.685,0.303,0.303,1.0,0.244,0.33,0,23,0,10,0.483,0.042
Logistic Regression,0.427,0.093,0.522,0.566,0.667,0.462,0.6,0.546,0.73,16,7,4,6,0.495,0.134
KNN,0.447,0.199,0.333,0.312,0.636,0.375,0.3,0.468,0.663,18,5,7,3,0.639,0.129
CatBoost,0.29,0.103,0.417,0.463,0.576,0.357,0.5,0.543,0.63,14,9,5,5,0.487,0.105


In [None]:
# Call `random_forest_importances` for the results under `optimisation_path`
# (presumably ranks/plots the top 20 features — verify in its definition).
# The result is kept in `_`; the top-feature cell below selects columns via `_.index`.
# NOTE(review): `_` is also IPython's last-output name; a descriptive variable would be safer.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

### Top feature optimisation and metrics

### Subset

In [None]:
# Restrict both splits to the first 20 labels of `_.index`
# (`_` was assigned by the random_forest_importances cell above).
X_train, X_test = X_train[_.index[:20]], X_test[_.index[:20]]

# Results for the reduced feature set go under their own prefix.
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
# Repeat the hyper-parameter optimisation on the reduced feature set:
# all five model families, F2-based scorer, 5-fold stratified split.
# `tuning` presumably reads X_train / y_train from the notebook globals — verify.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.556,0.11,0.5,0.603,0.576,0.389,0.7,0.366,0.604,12,11,3,7,0.645,0.084
SVM,0.682,0.112,0.462,0.652,0.364,0.31,0.9,0.39,0.587,3,20,1,9,0.549,0.23
Logistic Regression,0.589,0.168,0.538,0.625,0.636,0.438,0.7,0.464,0.709,14,9,3,7,0.615,0.219
KNN,0.559,0.139,0.429,0.517,0.515,0.333,0.6,0.331,0.522,11,12,4,6,0.583,0.063
CatBoost,0.38,0.174,0.091,0.096,0.394,0.083,0.1,0.296,0.491,12,11,9,1,0.566,0.11


## Clinical features A

#### Subset
#### Split into train and test

In [None]:
# Read the separate train and test workbooks behind the `_a` links.
# Column positions 1-60 form the feature matrix (the clinical columns, going by
# the section title), position 147 the target.
X_train = pd.read_excel(
    link_train_revascularization_a,
    header=[0],
    usecols=list(range(1,61)),
)
y_train = pd.read_excel(
    link_train_revascularization_a,
    header=[0],
    usecols=[147],
)
X_test = pd.read_excel(
    link_test_revascularization_a,
    header=[0],
    usecols=list(range(1,61)),
)
y_test = pd.read_excel(
    link_test_revascularization_a,
    header=[0],
    usecols=[147],
)

# Directory prefix used by the following cells for this subset's results.
# NOTE(review): the folder is called "all biomarkers and clinical" although only
# columns 1-60 are loaded here — confirm the naming is intended.
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical A/all biomarkers and clinical/"

# Sanity check: show the dimensions of every split.
print(
    f'X_train shape:\t {X_train.shape}',
    f'y_train shape:\t {y_train.shape}',
    f'X_test shape:\t {X_test.shape}',
    f'y_test shape:\t {y_test.shape}',
    sep='\n',
)

X_train shape:	 (98, 60)
y_train shape:	 (98, 1)
X_test shape:	 (33, 60)
y_test shape:	 (33, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter optimisation helper for all five model families,
# using the F2-based scorer and a 5-fold stratified split.
# No data is passed in, so `tuning` presumably reads X_train / y_train from the
# notebook globals and writes under `optimisation_path` — verify in its definition.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics tables

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.248,0.228,0.353,0.319,0.667,0.429,0.3,0.382,0.596,19,4,7,3,0.464,0.104
SVM,0.548,0.072,0.5,0.682,0.455,0.346,0.9,0.353,0.57,6,17,1,9,0.532,0.027
Logistic Regression,0.445,0.157,0.231,0.268,0.394,0.188,0.3,0.347,0.439,10,13,7,3,0.556,0.133
KNN,0.48,0.116,0.222,0.208,0.576,0.25,0.2,0.292,0.47,17,6,8,2,0.557,0.089
CatBoost,0.257,0.1,0.125,0.109,0.576,0.167,0.1,0.244,0.296,18,5,9,1,0.493,0.039


In [None]:
# Call `random_forest_importances` for the results under `optimisation_path`
# (presumably ranks/plots the top 20 features — verify in its definition).
# The result is kept in `_`; the top-feature cell below selects columns via `_.index`.
# NOTE(review): biomarkers=True is passed although this section loads only
# columns 1-60 (the clinical block) — confirm the flag is intended here.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

### Top feature optimisation and metrics

### Subset

In [None]:
import os  # local import: only used to create the results directory below

# Restrict both splits to the first 20 labels of `_.index`
# (`_` was assigned by the random_forest_importances cell above).
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

# Fix: this cell used to point at "Biomarkers A + Clinical/top features/", the
# prefix already used by the "Clinical features + Biomarkers A" top-feature run,
# so both runs wrote their metrics_table.xlsx to the same file. The
# clinical-only top-feature results now get their own directory.
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical A/top features/"
# The directory is new, so make sure it exists before anything is written to it.
os.makedirs(optimisation_path, exist_ok=True)

### Hyper-parameter optimisation

In [None]:
# Repeat the hyper-parameter optimisation on the reduced feature set:
# all five model families, F2-based scorer, 5-fold stratified split.
# `tuning` presumably reads X_train / y_train from the notebook globals — verify.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.35,0.128,0.154,0.116,0.667,0.333,0.1,0.298,0.361,21,2,9,1,0.566,0.075
SVM,0.522,0.137,0.286,0.345,0.394,0.222,0.4,0.443,0.639,9,14,6,4,0.5,0.116
Logistic Regression,0.522,0.142,0.296,0.351,0.424,0.235,0.4,0.249,0.335,10,13,6,4,0.578,0.127
KNN,0.412,0.114,0.261,0.283,0.485,0.231,0.3,0.281,0.433,13,10,7,3,0.57,0.086
CatBoost,0.302,0.176,0.235,0.213,0.606,0.286,0.2,0.301,0.404,18,5,8,2,0.583,0.076


## Biomarkers B

### Subset
### Split into train and test

In [None]:
# Read the separate train and test workbooks behind the `_b` links.
# Column positions 72-77 form the feature matrix, position 78 the target
# (presumably 72-77 are the biomarker columns — verify against the workbook).
X_train = pd.read_excel(
    link_train_revascularization_b,
    header=[0],
    usecols=list(range(72,78)),
)
y_train = pd.read_excel(
    link_train_revascularization_b,
    header=[0],
    usecols=[78],
)
X_test = pd.read_excel(
    link_test_revascularization_b,
    header=[0],
    usecols=list(range(72,78)),
)
y_test = pd.read_excel(
    link_test_revascularization_b,
    header=[0],
    usecols=[78],
)

# Directory prefix used by the following cells for this subset's results.
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers B/all biomarkers/"

# Sanity check: show the dimensions of every split.
print(
    f'X_train shape:\t {X_train.shape}',
    f'y_train shape:\t {y_train.shape}',
    f'X_test shape:\t {X_test.shape}',
    f'y_test shape:\t {y_test.shape}',
    sep='\n',
)

X_train shape:	 (56, 6)
y_train shape:	 (56, 1)
X_test shape:	 (29, 6)
y_test shape:	 (29, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter optimisation helper for all five model families,
# using the F2-based scorer.
# NOTE(review): cross_validation is the integer 4 here, whereas the other
# sections pass StratifiedKFold(5) and the metrics table header still reads
# "cv=5" — confirm the fold count is intended and the label is not misleading.
# `tuning` presumably reads X_train / y_train from the notebook globals — verify.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=4,
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics tables

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.17,0.173,0.286,0.227,0.828,0.5,0.2,0.282,0.592,23,1,4,1,0.452,0.074
SVM,0.63,0.067,0.242,0.417,0.138,0.143,0.8,0.328,0.383,0,24,1,4,0.588,0.159
Logistic Regression,0.545,0.063,0.2,0.286,0.448,0.133,0.4,0.319,0.35,11,13,3,2,0.547,0.068
KNN,0.083,0.144,0.0,0.0,0.69,0.0,0.0,0.194,0.5,20,4,5,0,0.301,0.165
CatBoost,0.0,0.0,0.222,0.208,0.759,0.25,0.2,0.509,0.792,21,3,4,1,0.415,0.075


In [None]:
# Call `random_forest_importances` for the results under `optimisation_path`
# (presumably ranks/plots feature importances — verify in its definition).
# NOTE(review): only 6 feature columns were loaded for this subset, so
# n_features=20 exceeds what is available — confirm the helper copes with that.
# The result is stored in `_`; it is reassigned before any later cell reads it.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

## Clinical features + Biomarkers B

#### Subset
#### Split into train and test

In [None]:
# Read the separate train and test workbooks behind the `_b` links.
# Column positions 1-77 form the feature matrix (clinical + biomarker columns,
# going by the section title), position 78 the target.
X_train = pd.read_excel(
    link_train_revascularization_b,
    header=[0],
    usecols=list(range(1,78)),
)
y_train = pd.read_excel(
    link_train_revascularization_b,
    header=[0],
    usecols=[78],
)
X_test = pd.read_excel(
    link_test_revascularization_b,
    header=[0],
    usecols=list(range(1,78)),
)
y_test = pd.read_excel(
    link_test_revascularization_b,
    header=[0],
    usecols=[78],
)

# Directory prefix used by the following cells for this subset's results.
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/"

# Sanity check: show the dimensions of every split.
print(
    f'X_train shape:\t {X_train.shape}',
    f'y_train shape:\t {y_train.shape}',
    f'X_test shape:\t {X_test.shape}',
    f'y_test shape:\t {y_test.shape}',
    sep='\n',
)

X_train shape:	 (56, 77)
y_train shape:	 (56, 1)
X_test shape:	 (29, 77)
y_test shape:	 (29, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter optimisation helper for all five model families,
# using the F2-based scorer and a 5-fold stratified split.
# No data is passed in, so `tuning` presumably reads X_train / y_train from the
# notebook globals and writes under `optimisation_path` — verify in its definition.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics tables

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.077,0.154,0.0,0.0,0.759,0.0,0.0,0.238,0.55,22,2,5,0,0.657,0.113
SVM,0.62,0.066,0.303,0.521,0.207,0.179,1.0,0.331,0.4,1,23,0,5,0.476,0.16
Logistic Regression,0.419,0.25,0.276,0.455,0.276,0.167,0.8,0.35,0.475,4,20,1,4,0.555,0.159
KNN,0.305,0.275,0.222,0.208,0.759,0.25,0.2,0.256,0.683,21,3,4,1,0.581,0.206
CatBoost,0.077,0.172,0.222,0.208,0.759,0.25,0.2,0.423,0.708,21,3,4,1,0.688,0.053


In [None]:
# Call `random_forest_importances` for the results under `optimisation_path`
# (presumably ranks/plots the top 20 features — verify in its definition).
# The result is kept in `_`; the top-feature cell below selects columns via `_.index`.
# NOTE(review): `_` is also IPython's last-output name; a descriptive variable would be safer.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

### Top feature optimisation and metrics

### Subset

In [None]:
# Restrict both splits to the first 20 labels of `_.index`
# (`_` was assigned by the random_forest_importances cell above).
X_train, X_test = X_train[_.index[:20]], X_test[_.index[:20]]

# Results for the reduced feature set go under their own prefix.
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
# Repeat the hyper-parameter optimisation on the reduced feature set:
# all five model families, F2-based scorer, 5-fold stratified split.
# `tuning` presumably reads X_train / y_train from the notebook globals — verify.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.448,0.286,0.0,0.0,0.759,0.0,0.0,0.252,0.625,22,2,5,0,0.712,0.107
SVM,0.676,0.148,0.471,0.625,0.69,0.333,0.8,0.395,0.758,16,8,1,4,0.694,0.055
Logistic Regression,0.7,0.193,0.435,0.658,0.552,0.278,1.0,0.379,0.808,11,13,0,5,0.708,0.108
KNN,0.468,0.104,0.462,0.536,0.759,0.375,0.6,0.294,0.696,19,5,2,3,0.662,0.062
CatBoost,0.384,0.243,0.222,0.208,0.759,0.25,0.2,0.436,0.65,21,3,4,1,0.743,0.11


## Clinical features B

#### Subset
#### Split into train and test

In [None]:
# Read the separate train and test workbooks behind the `_b` links.
# Column positions 1-71 form the feature matrix (the clinical columns, going by
# the section title), position 78 the target.
X_train = pd.read_excel(
    link_train_revascularization_b,
    header=[0],
    usecols=list(range(1,72)),
)
y_train = pd.read_excel(
    link_train_revascularization_b,
    header=[0],
    usecols=[78],
)
X_test = pd.read_excel(
    link_test_revascularization_b,
    header=[0],
    usecols=list(range(1,72)),
)
y_test = pd.read_excel(
    link_test_revascularization_b,
    header=[0],
    usecols=[78],
)

# Directory prefix used by the following cells for this subset's results.
# NOTE(review): the folder is called "all biomarkers and clinical" although only
# columns 1-71 are loaded here — confirm the naming is intended.
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical B/all biomarkers and clinical/"

# Sanity check: show the dimensions of every split.
print(
    f'X_train shape:\t {X_train.shape}',
    f'y_train shape:\t {y_train.shape}',
    f'X_test shape:\t {X_test.shape}',
    f'y_test shape:\t {y_test.shape}',
    sep='\n',
)

X_train shape:	 (56, 71)
y_train shape:	 (56, 1)
X_test shape:	 (29, 71)
y_test shape:	 (29, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter optimisation helper for all five model families,
# using the F2-based scorer and a 5-fold stratified split.
# No data is passed in, so `tuning` presumably reads X_train / y_train from the
# notebook globals and writes under `optimisation_path` — verify in its definition.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics tables

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.183,0.232,0.222,0.208,0.759,0.25,0.2,0.457,0.733,21,3,4,1,0.605,0.132
SVM,0.631,0.17,0.261,0.395,0.414,0.167,0.6,0.194,0.517,9,15,2,3,0.594,0.072
Logistic Regression,0.421,0.142,0.323,0.543,0.276,0.192,1.0,0.297,0.7,3,21,0,5,0.449,0.072
KNN,0.448,0.243,0.154,0.179,0.621,0.125,0.2,0.237,0.475,17,7,4,1,0.666,0.153
CatBoost,0.148,0.203,0.25,0.217,0.793,0.333,0.2,0.433,0.692,22,2,4,1,0.553,0.087


In [None]:
# Call `random_forest_importances` for the results under `optimisation_path`
# (presumably ranks/plots the top 20 features — verify in its definition).
# The result is kept in `_`; the top-feature cell below selects columns via `_.index`.
# NOTE(review): biomarkers=True is passed although this section loads only
# columns 1-71 (the clinical block) — confirm the flag is intended here.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

### Top feature optimisation and metrics

### Subset

In [None]:
import os  # local import: only used to create the results directory below

# Restrict both splits to the first 20 labels of `_.index`
# (`_` was assigned by the random_forest_importances cell above).
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

# Fix: this cell used to point at "Biomarkers B + Clinical/top features/", the
# prefix already used by the "Clinical features + Biomarkers B" top-feature run,
# so both runs wrote their metrics_table.xlsx to the same file. The
# clinical-only top-feature results now get their own directory.
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical B/top features/"
# The directory is new, so make sure it exists before anything is written to it.
os.makedirs(optimisation_path, exist_ok=True)

### Hyper-parameter optimisation

In [None]:
# Repeat the hyper-parameter optimisation on the reduced feature set:
# all five model families, F2-based scorer, 5-fold stratified split.
# `tuning` presumably reads X_train / y_train from the notebook globals — verify.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.459,0.309,0.222,0.208,0.759,0.25,0.2,0.276,0.642,21,3,4,1,0.709,0.147
SVM,0.52,0.294,0.4,0.5,0.69,0.3,0.6,0.26,0.642,17,7,2,3,0.619,0.146
Logistic Regression,0.665,0.076,0.37,0.595,0.414,0.227,1.0,0.258,0.667,7,17,0,5,0.628,0.054
KNN,0.238,0.328,0.333,0.37,0.724,0.286,0.4,0.281,0.733,19,5,3,2,0.733,0.139
CatBoost,0.414,0.078,0.571,0.455,0.897,1.0,0.4,0.557,0.717,24,0,3,2,0.61,0.084


## Biomarkers C

### Subset
### Split into train and test

In [None]:
# Read the separate train and test workbooks behind the `_c` links.
# Column positions 101-105 form the feature matrix, position 106 the target
# (presumably 101-105 are the biomarker columns — verify against the workbook).
X_train = pd.read_excel(
    link_train_revascularization_c,
    header=[0],
    usecols=list(range(101,106)),
)
y_train = pd.read_excel(
    link_train_revascularization_c,
    header=[0],
    usecols=[106],
)
X_test = pd.read_excel(
    link_test_revascularization_c,
    header=[0],
    usecols=list(range(101,106)),
)
y_test = pd.read_excel(
    link_test_revascularization_c,
    header=[0],
    usecols=[106],
)

# Directory prefix used by the following cells for this subset's results.
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers C/all biomarkers/"

# Sanity check: show the dimensions of every split.
print(
    f'X_train shape:\t {X_train.shape}',
    f'y_train shape:\t {y_train.shape}',
    f'X_test shape:\t {X_test.shape}',
    f'y_test shape:\t {y_test.shape}',
    sep='\n',
)

X_train shape:	 (86, 5)
y_train shape:	 (86, 1)
X_test shape:	 (43, 5)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter optimisation helper for all five model families,
# using the F2-based scorer.
# NOTE(review): cross_validation is the integer 4 here, whereas the other
# sections pass StratifiedKFold(5) and the metrics table header still reads
# "cv=5" — confirm the fold count is intended and the label is not misleading.
# `tuning` presumably reads X_train / y_train from the notebook globals — verify.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=4,
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics tables

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.245,0.275,0.25,0.217,0.86,0.333,0.2,0.333,0.611,36,2,4,1,0.583,0.233
SVM,0.534,0.064,0.182,0.27,0.581,0.118,0.4,0.116,0.421,23,15,3,2,0.589,0.134
Logistic Regression,0.504,0.097,0.4,0.571,0.721,0.267,0.8,0.256,0.763,27,11,1,4,0.583,0.08
KNN,0.443,0.115,0.2,0.2,0.814,0.2,0.2,0.133,0.547,34,4,4,1,0.639,0.071
CatBoost,0.074,0.147,0.0,0.0,0.86,0.0,0.0,0.252,0.663,37,1,5,0,0.704,0.201


In [None]:
# Call `random_forest_importances` for the results under `optimisation_path`
# (presumably ranks/plots feature importances — verify in its definition).
# NOTE(review): only 5 feature columns were loaded for this subset, so
# n_features=20 exceeds what is available — confirm the helper copes with that.
# The result is stored in `_`; it is reassigned before any later cell reads it.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

## Clinical features + Biomarkers C

#### Subset
#### Split into train and test

In [None]:
# Read the separate train and test workbooks behind the `_c` links.
# Column positions 1-105 form the feature matrix (clinical + biomarker columns,
# going by the section title), position 106 the target.
X_train = pd.read_excel(
    link_train_revascularization_c,
    header=[0],
    usecols=list(range(1,106)),
)
y_train = pd.read_excel(
    link_train_revascularization_c,
    header=[0],
    usecols=[106],
)
X_test = pd.read_excel(
    link_test_revascularization_c,
    header=[0],
    usecols=list(range(1,106)),
)
y_test = pd.read_excel(
    link_test_revascularization_c,
    header=[0],
    usecols=[106],
)

# Directory prefix used by the following cells for this subset's results.
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/"

# Sanity check: show the dimensions of every split.
print(
    f'X_train shape:\t {X_train.shape}',
    f'y_train shape:\t {y_train.shape}',
    f'X_test shape:\t {X_test.shape}',
    f'y_test shape:\t {y_test.shape}',
    sep='\n',
)

X_train shape:	 (86, 105)
y_train shape:	 (86, 1)
X_test shape:	 (43, 105)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
# Run the hyper-parameter optimisation helper for all five model families,
# using the F2-based scorer and a 5-fold stratified split.
# No data is passed in, so `tuning` presumably reads X_train / y_train from the
# notebook globals and writes under `optimisation_path` — verify in its definition.
tuning(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe whatever the search printed so the cell stays compact.
clear_output()

### Metrics tables

In [None]:
# Build the summary table via `metric_table` for the current
# `optimisation_path` and save a spreadsheet copy under the same prefix.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display: centred headers and cells, a coolwarm gradient on the key
# F2 / F1 / ROC_AUC columns, floats shown with three decimals.
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed
# in 2.0 — switch to .format(precision=3) once the pandas version is confirmed.
(
    metrics_table.style
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set","F2"),
            ("F2, train set, cv=5","mean"),
            ("Scores on the test set","F1"),
            ("Scores on the test set","ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.111,0.222,0.0,0.0,0.86,0.0,0.0,0.4,0.805,37,1,5,0,0.578,0.088
SVM,0.391,0.111,0.385,0.61,0.628,0.238,1.0,0.29,0.763,22,16,0,5,0.626,0.055
Logistic Regression,0.244,0.138,0.4,0.571,0.721,0.267,0.8,0.347,0.826,27,11,1,4,0.453,0.088
KNN,0.221,0.303,0.286,0.227,0.884,0.5,0.2,0.193,0.587,37,1,4,1,0.572,0.173
CatBoost,0.0,0.0,0.0,0.0,0.814,0.0,0.0,0.194,0.695,35,3,5,0,0.522,0.087


In [None]:
# The result is kept in `_`; the "subset" cell below uses `_.index[:20]` to pick features.
_ = random_forest_importances(path=optimisation_path, n_features=20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [None]:
# Keep only the columns named by the first 20 index entries of the table
# returned by random_forest_importances (held in `_`).
top20_features = _.index[:20]
X_train = X_train[top20_features]
X_test = X_test[top20_features]

# the top-feature run is stored in its own folder
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
# Tune all five model families on the reduced feature set (F2 scorer, 5-fold stratified CV).
tuning_options = dict(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
tuning(**tuning_options)
clear_output()  # wipe whatever the tuning run printed

### Metrics

In [None]:
# Build the metrics table for the current `optimisation_path` and save it as
# metrics_table.xlsx in the same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# columns that receive the colour gradient
gradient_columns = [
    ("Scores on the test set", "F2"),
    ("F2, train set, cv=5", "mean"),
    ("Scores on the test set", "F1"),
    ("Scores on the test set", "ROC_AUC"),
]
header_style = [dict(selector='th', props=[('text-align', 'center')])]

# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed in 2.0;
# newer pandas needs .format(precision=3) instead.
# The styled table is the last expression, so it is rendered as the cell output.
(
    metrics_table.style
    .set_table_styles(header_style)
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=gradient_columns)
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.188,0.237,0.444,0.417,0.884,0.5,0.4,0.469,0.863,36,2,3,2,0.753,0.107
SVM,0.634,0.218,0.32,0.5,0.605,0.2,0.8,0.216,0.679,22,16,1,4,0.767,0.111
Logistic Regression,0.61,0.127,0.333,0.513,0.628,0.211,0.8,0.314,0.758,23,15,1,4,0.792,0.065
KNN,0.171,0.215,0.2,0.2,0.814,0.2,0.2,0.204,0.718,34,4,4,1,0.733,0.122
CatBoost,0.0,0.0,0.0,0.0,0.791,0.0,0.0,0.182,0.6,34,4,5,0,0.715,0.103


## Clinical features C

#### Subset
#### Split into train and test

In [None]:
# Read the train / test workbooks (the link_* variables are defined earlier in the notebook).
# Only spreadsheet columns 1-100 (0-based positions) are used as predictors here;
# the target is still read from column 106.
predictor_columns = list(range(1, 101))
target_column = [106]

X_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=predictor_columns)
y_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=target_column)
X_test = pd.read_excel(link_test_revascularization_c, header=[0], usecols=predictor_columns)
y_test = pd.read_excel(link_test_revascularization_c, header=[0], usecols=target_column)

# path handed to tuning / metric_table in the cells below
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical C/all biomarkers and clinical/"

# report the size of every subset
for subset_name, subset in (('X_train', X_train), ('y_train', y_train), ('X_test', X_test), ('y_test', y_test)):
    print(f'{subset_name} shape:\t', subset.shape)

X_train shape:	 (86, 100)
y_train shape:	 (86, 1)
X_test shape:	 (43, 100)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
# Tune all five model families with the F2 scorer on a 5-fold stratified CV.
tuning_options = dict(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
tuning(**tuning_options)
clear_output()  # wipe whatever the tuning run printed

### Metrics tables

In [None]:
# Build the metrics table for the current `optimisation_path` and save it as
# metrics_table.xlsx in the same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# columns that receive the colour gradient
gradient_columns = [
    ("Scores on the test set", "F2"),
    ("F2, train set, cv=5", "mean"),
    ("Scores on the test set", "F1"),
    ("Scores on the test set", "ROC_AUC"),
]
header_style = [dict(selector='th', props=[('text-align', 'center')])]

# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed in 2.0;
# newer pandas needs .format(precision=3) instead.
# The styled table is the last expression, so it is rendered as the cell output.
(
    metrics_table.style
    .set_table_styles(header_style)
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=gradient_columns)
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.111,0.222,0.0,0.0,0.86,0.0,0.0,0.271,0.7,37,1,5,0,0.501,0.151
SVM,0.359,0.207,0.3,0.429,0.674,0.2,0.6,0.242,0.763,26,12,2,3,0.571,0.089
Logistic Regression,0.206,0.17,0.533,0.667,0.837,0.4,0.8,0.27,0.753,32,6,1,4,0.501,0.075
KNN,0.133,0.267,0.0,0.0,0.86,0.0,0.0,0.116,0.487,37,1,5,0,0.526,0.157
CatBoost,0.0,0.0,0.0,0.0,0.791,0.0,0.0,0.189,0.679,34,4,5,0,0.458,0.175


In [None]:
# The result is kept in `_`; the "subset" cell below uses `_.index[:20]` to pick features.
# NOTE(review): biomarkers=True although this subset was loaded from the clinical columns only —
# confirm what the flag controls.
_ = random_forest_importances(path=optimisation_path, n_features=20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [None]:
from pathlib import Path

# Keep only the columns named by the first 20 index entries of the table
# returned by random_forest_importances (held in `_`).
top20_features = _.index[:20]
X_train = X_train[top20_features]
X_test = X_test[top20_features]

# Clinical C gets its own results folder. Reusing
# ".../Biomarkers C + Clinical/top features/" here would overwrite that run's
# metrics_table.xlsx, which the top-feature comparison figure reads.
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical C/top features/"
Path(optimisation_path).mkdir(parents=True, exist_ok=True)  # the folder may not exist yet

### Hyper-parameter optimisation

In [None]:
# Tune all five model families on the reduced feature set (F2 scorer, 5-fold stratified CV).
tuning_options = dict(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
tuning(**tuning_options)
clear_output()  # wipe whatever the tuning run printed

### Metrics

In [None]:
# Build the metrics table for the current `optimisation_path` and save it as
# metrics_table.xlsx in the same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# columns that receive the colour gradient
gradient_columns = [
    ("Scores on the test set", "F2"),
    ("F2, train set, cv=5", "mean"),
    ("Scores on the test set", "F1"),
    ("Scores on the test set", "ROC_AUC"),
]
header_style = [dict(selector='th', props=[('text-align', 'center')])]

# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed in 2.0;
# newer pandas needs .format(precision=3) instead.
# The styled table is the last expression, so it is rendered as the cell output.
(
    metrics_table.style
    .set_table_styles(header_style)
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=gradient_columns)
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.111,0.222,0.286,0.227,0.884,0.5,0.2,0.431,0.832,37,1,4,1,0.65,0.166
SVM,0.6,0.135,0.308,0.488,0.581,0.19,0.8,0.221,0.679,21,17,1,4,0.781,0.116
Logistic Regression,0.475,0.285,0.286,0.417,0.651,0.188,0.6,0.219,0.647,25,13,2,3,0.727,0.05
KNN,0.306,0.307,0.4,0.5,0.791,0.3,0.6,0.227,0.708,31,7,2,3,0.619,0.174
CatBoost,0.0,0.0,0.0,0.0,0.767,0.0,0.0,0.222,0.705,33,5,5,0,0.529,0.088


## Clinical features A-B-C

#### Subset
#### Split into train and test

In [None]:
# Read the train / test workbooks (the link_* variables are defined earlier in the notebook).
# Predictors are spreadsheet columns 1-56 (0-based positions), the target is column 57.
predictor_columns = list(range(1, 57))
target_column = [57]

X_train = pd.read_excel(link_train_revascularization_abc, header=[0], usecols=predictor_columns)
y_train = pd.read_excel(link_train_revascularization_abc, header=[0], usecols=target_column)
X_test = pd.read_excel(link_test_revascularization_abc, header=[0], usecols=predictor_columns)
y_test = pd.read_excel(link_test_revascularization_abc, header=[0], usecols=target_column)

# path handed to tuning / metric_table in the cells below
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical ABC/all clinical/"

# report the size of every subset
for subset_name, subset in (('X_train', X_train), ('y_train', y_train), ('X_test', X_test), ('y_test', y_test)):
    print(f'{subset_name} shape:\t', subset.shape)

X_train shape:	 (258, 56)
y_train shape:	 (258, 1)
X_test shape:	 (87, 56)
y_test shape:	 (87, 1)


### Hyper-parameter optimisation

In [None]:
# Tune all five model families with the F2 scorer on a 5-fold stratified CV.
tuning_options = dict(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
tuning(**tuning_options)
clear_output()  # wipe whatever the tuning run printed

### Metrics tables

In [None]:
# Build the metrics table for the current `optimisation_path` and save it as
# metrics_table.xlsx in the same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# columns that receive the colour gradient
gradient_columns = [
    ("Scores on the test set", "F2"),
    ("F2, train set, cv=5", "mean"),
    ("Scores on the test set", "F1"),
    ("Scores on the test set", "ROC_AUC"),
]
header_style = [dict(selector='th', props=[('text-align', 'center')])]

# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed in 2.0;
# newer pandas needs .format(precision=3) instead.
# The styled table is the last expression, so it is rendered as the cell output.
(
    metrics_table.style
    .set_table_styles(header_style)
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=gradient_columns)
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.34,0.148,0.431,0.495,0.667,0.355,0.55,0.394,0.657,47,20,9,11,0.594,0.072
SVM,0.52,0.074,0.444,0.569,0.598,0.326,0.7,0.336,0.626,38,29,6,14,0.593,0.058
Logistic Regression,0.481,0.087,0.424,0.556,0.563,0.304,0.7,0.334,0.614,35,32,6,14,0.596,0.056
KNN,0.317,0.071,0.256,0.253,0.667,0.263,0.25,0.238,0.521,53,14,15,5,0.546,0.049
CatBoost,0.056,0.082,0.0,0.0,0.759,0.0,0.0,0.301,0.59,66,1,20,0,0.523,0.094


In [None]:
# The result is kept in `_`; the "subset" cell below uses `_.index[:20]` to pick features.
# NOTE(review): biomarkers=True although this subset is stored under "all clinical" —
# confirm what the flag controls.
_ = random_forest_importances(path=optimisation_path, n_features=20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

#### subset

In [None]:
# Keep only the columns named by the first 20 index entries of the table
# returned by random_forest_importances (held in `_`).
top20_features = _.index[:20]
X_train = X_train[top20_features]
X_test = X_test[top20_features]

# the top-feature run is stored in its own folder
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical ABC/top features/"

### Hyper-parameter optimisation

In [None]:
# Tune all five model families on the reduced feature set (F2 scorer, 5-fold stratified CV).
tuning_options = dict(
    score=my_f2_scorer(),
    catboost_score='F:beta=2',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
tuning(**tuning_options)
clear_output()  # wipe whatever the tuning run printed

### Metrics

In [None]:
# Build the metrics table for the current `optimisation_path` and save it as
# metrics_table.xlsx in the same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# columns that receive the colour gradient
gradient_columns = [
    ("Scores on the test set", "F2"),
    ("F2, train set, cv=5", "mean"),
    ("Scores on the test set", "F1"),
    ("Scores on the test set", "ROC_AUC"),
]
header_style = [dict(selector='th', props=[('text-align', 'center')])]

# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed in 2.0;
# newer pandas needs .format(precision=3) instead.
# The styled table is the last expression, so it is rendered as the cell output.
(
    metrics_table.style
    .set_table_styles(header_style)
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=gradient_columns)
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.467,0.113,0.467,0.583,0.632,0.35,0.7,0.339,0.653,41,26,6,14,0.618,0.068
SVM,0.618,0.016,0.377,0.602,0.241,0.233,1.0,0.259,0.508,1,66,0,20,0.569,0.088
Logistic Regression,0.47,0.086,0.4,0.5,0.586,0.3,0.6,0.295,0.587,39,28,8,12,0.616,0.048
KNN,0.299,0.123,0.343,0.316,0.736,0.4,0.3,0.281,0.583,58,9,14,6,0.546,0.064
CatBoost,0.069,0.111,0.154,0.116,0.747,0.333,0.1,0.281,0.526,63,4,18,2,0.497,0.124


## Results

### Scores of models

In [None]:
# One grouped bar chart per metric: the test-set score of every model on every dataset,
# overlaid with the cross-validation mean (black dot) and std (error whisker).
# Each figure is shown and saved to f"{results_path}{metric}.pdf".

# what metric to evaluate
# F2 F1 ROC_AUC
metrics = ['F2', 'ROC_AUC']

# folders holding each dataset's metrics_table.xlsx, in legend order
paths = [
    './HSE project/Optimisation data/revascularization/Biomarkers A/all biomarkers/',
    './HSE project/Optimisation data/revascularization/Biomarkers B/all biomarkers/',
    './HSE project/Optimisation data/revascularization/Biomarkers C/all biomarkers/',
    './HSE project/Optimisation data/revascularization/Clinical ABC/all clinical/',
    './HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
    './HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
    './HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/',
    './HSE project/Optimisation data/revascularization/Clinical A/all biomarkers and clinical/',
    './HSE project/Optimisation data/revascularization/Clinical B/all biomarkers and clinical/',
    './HSE project/Optimisation data/revascularization/Clinical C/all biomarkers and clinical/',
]

# legend labels, one per entry of `paths`
# NOTE(review): 'Clinical+biomarkers ABC' is read from 'Clinical ABC/all clinical/' — confirm the label.
dataset_labels = [
    'biomarkers A', 'biomarkers B', 'biomarkers C',
    'Clinical+biomarkers ABC',
    'Clinical+biomarkers A', 'Clinical+biomarkers B', 'Clinical+biomarkers C',
    'Clinical A', 'Clinical B', 'Clinical C',
]

# header group of metrics_table.xlsx that holds the CV mean / std of each metric;
# a metric without an entry here (e.g. F1) gets empty CV overlays
cv_groups = {
    'F2': 'F2, train set, cv=5',
    'ROC_AUC': 'ROC_AUC, train set, cv=5',
}

# list of models (x-axis categories; assumed to follow the row order of the metrics tables)
# standard models
models = ['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
# ensemble models
# models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

# The workbooks do not depend on the metric, so each one is read only once.
# (ensemble results live in advanced_models_metrics, standard ones in metrics_table)
tables = [
    pd.read_excel(f'{path}metrics_table.xlsx', header=[0, 1], index_col=[0])
    for path in paths
]

for metric in metrics:
    # one column per dataset, one row per model
    datasets = pd.DataFrame(columns=dataset_labels)
    datasets_mean = pd.DataFrame(columns=dataset_labels)
    datasets_std = pd.DataFrame(columns=dataset_labels)

    cv_group = cv_groups.get(metric)
    for label, table in zip(dataset_labels, tables):
        datasets[label] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3))
        if cv_group is not None:
            datasets_mean[label] = list(table.loc[:, (cv_group, 'mean')].values.round(3))
            datasets_std[label] = list(table.loc[:, (cv_group, 'std')].values.round(3))

    # bars: score on the test set
    fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns])

    # CV mean: box traces with invisible box and whiskers, so only the point is drawn
    fig.add_traces([go.Box(name=column, x=models,
                           y=datasets_mean[column],
                           marker=dict(color="black"),
                           showlegend=False) for column in datasets.columns])
    fig.update_traces(
        selector=dict(type="box"),          # update only boxes
        boxpoints="all",                    # show points
        pointpos=0,                         # centered
        jitter=0,                           # no jitter
        line_color="rgba(255,255,255,0)",   # hide box lines
        fillcolor="rgba(255,255,255,0)",    # hide box fill
    )
    fig.update_layout(boxmode="group",)

    # CV std: fully transparent bars on the overlaying axis x2 that only carry the error whiskers
    fig.add_traces([go.Bar(name=column, x=models,
                           y=datasets_mean[column],
                           xaxis="x2",
                           error_y=dict(type='data',
                                        array=datasets_std[column],
                                        color="rgba(0,0,0,1)",
                                        thickness=1),
                           marker=dict(opacity=0),
                           showlegend=False) for column in datasets.columns])

    fig.update_xaxes(title='Models')
    # Change the bar mode
    fig.update_layout(barmode='group',
                      xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False},
                      bargap=0.30,
                      bargroupgap=0.3,
                      legend=dict(orientation="v", title='Datasets'),
                      title=dict(text=f'{metric} score', x=0.5,),
                      margin=dict(l=60, r=20, t=60, b=40),)

    fig.update_yaxes(title='Score', range=[0., 1.0])

    # add dotted line for ROC AUC = 0.5
    if metric == 'ROC_AUC':
        fig.add_shape(type='line',
                      x0=-0.5,
                      y0=0.5,
                      x1=4.5,
                      y1=0.5,
                      line=dict(color='firebrick', width=2, dash='dot'),
                      xref='x',
                      yref='y')
    # figure size
    fig.update_layout(
        autosize=False,
        width=1300,
        height=450,)
    fig.show(renderer='colab')
    fig.write_image(f"{results_path}{metric}.pdf", engine="kaleido")

### Compare with Top 20

In [105]:
# Grouped bar chart comparing each model's test-set score on the full feature set
# with its score on the top-20 feature subset, for the datasets that include clinical columns.

# what metric to evaluate: 'ROC_AUC', 'F1' or 'F2'
metric = 'F2'

# folders of the full-feature runs (the biomarker-only datasets are left out of this figure)
paths = [
    './HSE project/Optimisation data/revascularization/Clinical ABC/all clinical/',
    './HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
    './HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
    './HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/',
]

# folders of the matching top-20 runs, in the same order as `paths`
paths_top = [
    './HSE project/Optimisation data/revascularization/Clinical ABC/top features/',
    './HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/top features/',
    './HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/top features/',
    './HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/top features/',
]

# one column per dataset; rows are the full-feature models followed by the top-20 models
dataset_labels = ['Clinical+biomarkers ABC', 'Clinical+biomarkers A', 'Clinical+biomarkers B', 'Clinical+biomarkers C']
datasets = pd.DataFrame(columns=dataset_labels)
datasets_mean = pd.DataFrame(columns=dataset_labels)
datasets_std = pd.DataFrame(columns=dataset_labels)

test_key = ('Scores on the test set', metric)
for label, path_all, path_top in zip(dataset_labels, paths, paths_top):
    table = pd.read_excel(f'{path_all}metrics_table.xlsx', header=[0, 1], index_col=[0])
    table_top = pd.read_excel(f'{path_top}metrics_table.xlsx', header=[0, 1], index_col=[0])
    both_tables = (table, table_top)

    datasets[label] = [score for t in both_tables for score in t.loc[:, test_key].values.round(3)]
    if metric == 'F2':
        datasets_mean[label] = [m for t in both_tables
                                for m in t.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3)]
        datasets_std[label] = [s for t in both_tables
                               for s in t.loc[:, ('F2, train set, cv=5', 'std')].values.round(3)]

# x-axis categories: standard models first, then their top-20 counterparts (KNN is not plotted)
models = ['RandomForest', 'SVM', 'Logistic Regression', 'CatBoost']
models = models + ['RandomForest top 20', 'SVM top 20', 'Logistic Regression top 20', 'CatBoost top 20']

# rows of `datasets` that are plotted; 3 and 8 (the KNN rows in the tables above) are skipped
kept_rows = [0, 1, 2, 4, 5, 6, 7, 9]

# create the graph
bars = []
for column in datasets.columns:
    bars.append(go.Bar(name=column, x=models, y=datasets[column][kept_rows]))
fig = go.Figure(data=bars)

# The CV mean/std overlay is not drawn in this figure; the xaxis2 settings below are kept
# although no trace references that axis.
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Score', range=[0., 1.0])
fig.update_layout(
    barmode='group',
    xaxis2={"overlaying": "x", "range": [-0.525, 9.525], "showticklabels": False},
    bargap=0.30,
    bargroupgap=0.3,
    legend=dict(orientation="v", title='Datasets'),
    title=dict(text=f'{metric} score', x=0.5,),
    margin=dict(l=60, r=20, t=60, b=40),
)

# add dotted line for ROC AUC = 0.5
if metric == 'ROC_AUC':
    fig.add_shape(
        type='line',
        x0=-0.5, y0=0.5,
        x1=7.5, y1=0.5,
        line=dict(color='firebrick', width=2, dash='dot'),
        xref='x',
        yref='y',
    )

# figure size
fig.update_layout(autosize=False, width=1300, height=450)
fig.show(renderer='colab')
fig.write_image(f"{results_path}top_{metric}.pdf", engine="kaleido")

### Feature selection

In [None]:
# features = pd.DataFrame(columns=['features', 
#                                  'biomarkers A',
#                                 #  'biomarkers B',
#                                 #  'biomarkers C',
#                                  'Clinical+biomarkers ABC',
#                                  'Clinical+biomarkers A',
#                                 #  'Clinical+biomarkers B',
#                                 #  'Clinical+biomarkers C',
#                                  ])
# # list of paths
# paths = [
#         './HSE project/Optimisation data/revascularization/Biomarkers A/all biomarkers/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers B/all biomarkers/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers C/all biomarkers/',
#         './HSE project/Optimisation data/revascularization/Clinical ABC/all clinical/',
#         './HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/'
#         ]

# # get dataframe with with scores of models from different datasets
# top_features = []
# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     if i < 1: 
#         top_features = top_features+list(str(col) for col in table.index[:10])
#     else:    
#         top_features = top_features+list(eval(col)[1] for col in table.index[:10])

# features['features'] = list(set(top_features))
# features.index = list(set(top_features))
# features.fillna(0, inplace=True)

# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     if i < 1: 
#         features.loc[list(str(col) for col in table.index[:10]), features.columns[i+1]] = 1
#     else:    
#         # top_features = top_features+list(eval(col) for col in table.index[:10])   
#         features.loc[list(eval(col)[1] for col in table.index[:10]), features.columns[i+1]] = 1 

# features['features'] = features.iloc[:,1:].apply((lambda x: x.sum()), axis=1)
# features.sort_values(ascending=False,  inplace=True, by=("features"))
# features.columns = ['sum'] + list(features.columns[1:])
# features.to_excel('./HSE project/Optimisation data/revascularization/feature_selection.xlsx')
# features

# **Target**: Combined

In [106]:
# prefix used by the fig.write_image calls when saving figures
results_path = "./HSE project/Graphics/combined/"

## Biomarkers A

### Subset
### Split into train and test

In [43]:
# Read the train / test workbooks (the link_* variables are defined earlier in the notebook).
# Predictors are spreadsheet columns 61-146 (0-based positions), the target is column 147.
predictor_columns = list(range(61, 147))
target_column = [147]

X_train = pd.read_excel(link_train_combined_a, header=[0], usecols=predictor_columns)
y_train = pd.read_excel(link_train_combined_a, header=[0], usecols=target_column)
X_test = pd.read_excel(link_test_combined_a, header=[0], usecols=predictor_columns)
y_test = pd.read_excel(link_test_combined_a, header=[0], usecols=target_column)

# path handed to tuning / metric_table in the cells below
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers A/all biomarkers/"

# report the size of every subset
for subset_name, subset in (('X_train', X_train), ('y_train', y_train), ('X_test', X_test), ('y_test', y_test)):
    print(f'{subset_name} shape:\t', subset.shape)

X_train shape:	 (150, 86)
y_train shape:	 (150, 1)
X_test shape:	 (50, 86)
y_test shape:	 (50, 1)


### Hyper-parameter optimisation

In [None]:
# Tune all five model families on F1 with a 5-fold stratified CV
# (the F2 variant would be score=my_f2_scorer(), catboost_score='F:beta=2').
# NOTE(review): the metrics table below is still headed "F2, train set, cv=5" —
# confirm which scorer those columns report.
tuning_options = dict(
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
tuning(**tuning_options)
clear_output()  # wipe whatever the tuning run printed

### Metrics table

In [44]:
# Build the metrics table for the current `optimisation_path` and save it as
# metrics_table.xlsx in the same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# columns that receive the colour gradient
gradient_columns = [
    ("Scores on the test set", "F2"),
    ("F2, train set, cv=5", "mean"),
    ("Scores on the test set", "F1"),
    ("Scores on the test set", "ROC_AUC"),
]
header_style = [dict(selector='th', props=[('text-align', 'center')])]

# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed in 2.0;
# newer pandas needs .format(precision=3) instead.
# The styled table is the last expression, so it is rendered as the cell output.
(
    metrics_table.style
    .set_table_styles(header_style)
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=gradient_columns)
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.792,0.022,0.712,0.844,0.58,0.565,0.963,0.66,0.61,3,20,1,26,0.442,0.133
SVM,0.784,0.016,0.701,0.854,0.54,0.54,1.0,0.632,0.548,0,23,0,27,0.47,0.056
Logistic Regression,0.725,0.058,0.69,0.719,0.64,0.645,0.741,0.638,0.655,12,11,7,20,0.511,0.071
KNN,0.694,0.035,0.743,0.861,0.64,0.605,0.963,0.562,0.524,6,17,1,26,0.439,0.104
CatBoost,0.714,0.045,0.708,0.788,0.62,0.605,0.852,0.676,0.671,8,15,4,23,0.551,0.101


In [45]:
# The result is kept in `_`; the "subset" cell below uses `_.index[:20]` to pick features.
_ = random_forest_importances(path=optimisation_path, n_features=20, biomarkers=True)




### Feature selection

### Top feature optimisation and metrics

### subset

In [46]:
# Keep only the columns named by the first 20 index entries of the table
# returned by random_forest_importances (held in `_`).
top20_features = _.index[:20]
X_train = X_train[top20_features]
X_test = X_test[top20_features]

# the top-feature run is stored in its own folder
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers A/biomarkers top features/"

### Hyper-parameter optimisation

In [None]:
# Tune all five model families on the reduced feature set, optimising F1 with a 5-fold
# stratified CV (the F2 variant would be score=my_f2_scorer(), catboost_score='F:beta=2').
# NOTE(review): the metrics table below is still headed "F2, train set, cv=5" —
# confirm which scorer those columns report.
tuning_options = dict(
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    path=optimisation_path,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
tuning(**tuning_options)
clear_output()  # wipe whatever the tuning run printed

### Metrics

In [47]:
# Build the metrics table for the current `optimisation_path` and save it as
# metrics_table.xlsx in the same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# columns that receive the colour gradient
gradient_columns = [
    ("Scores on the test set", "F2"),
    ("F2, train set, cv=5", "mean"),
    ("Scores on the test set", "F1"),
    ("Scores on the test set", "ROC_AUC"),
]
header_style = [dict(selector='th', props=[('text-align', 'center')])]

# NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed in 2.0;
# newer pandas needs .format(precision=3) instead.
# The styled table is the last expression, so it is rendered as the cell output.
(
    metrics_table.style
    .set_table_styles(header_style)
    .set_properties(**{'text-align': 'center'})
    .background_gradient(cmap='coolwarm', subset=gradient_columns)
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.8,0.02,0.647,0.738,0.52,0.537,0.815,0.564,0.536,4,19,5,22,0.671,0.08
SVM,0.777,0.015,0.711,0.86,0.56,0.551,1.0,0.657,0.628,1,22,0,27,0.652,0.167
Logistic Regression,0.68,0.017,0.656,0.704,0.58,0.588,0.741,0.603,0.586,9,14,7,20,0.557,0.076
KNN,0.705,0.038,0.667,0.709,0.6,0.606,0.741,0.597,0.592,10,13,7,20,0.546,0.039
CatBoost,0.734,0.037,0.657,0.743,0.54,0.55,0.815,0.57,0.509,5,18,5,22,0.621,0.038


## Clinical features + Biomarkers A

#### Subset
#### Split into train and test

In [None]:
# Experiment: clinical features + biomarkers A.
# Features come from spreadsheet columns 1-146 and the target from column 147
# (0-based positions given to `usecols`); column 0 is not loaded.
# NOTE(review): each workbook is parsed twice (once for X, once for y).
X_train = pd.read_excel(link_train_combined_a, usecols=list(range(1, 147)), header=[0])
y_train = pd.read_excel(link_train_combined_a, usecols=[147], header=[0])
X_test = pd.read_excel(link_test_combined_a, usecols=list(range(1, 147)), header=[0])
y_test = pd.read_excel(link_test_combined_a, usecols=[147], header=[0])

# Results folder used by the following tuning / metrics cells.
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers A + Clinical/all biomarkers and clinical/"

# Report the dimensions of the four frames.
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics tables

In [None]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

In [None]:
# Call random_forest_importances for the experiment stored at optimisation_path,
# asking for 20 features. The result is bound to `_` because the top-feature
# subset cell that follows selects columns through `_.index`.
# NOTE(review): `_` is also IPython's last-output name; a dedicated variable
# would be safer, but the subset cell depends on this exact name.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)

### Feature selection

### Top feature optimisation and metrics

### Subset

In [None]:
# Keep only the columns named by the first 20 entries of `_.index`, in both splits.
# `_` is the value assigned by the preceding random_forest_importances cell
# (presumably indexed by feature name in ranked order - confirm).
# This rebinds X_train / X_test to the reduced frames.
X_train = X_train.loc[:, _.index[:20]]
X_test = X_test.loc[:, _.index[:20]]

# Results folder used by the following tuning / metrics cells.
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics

In [None]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

## Clinical features A

#### Subset
#### Split into train and test

In [None]:
# Experiment: clinical features A.
# Features come from spreadsheet columns 1-60 of the combined A workbooks and
# the target from column 147 (0-based positions given to `usecols`).
# NOTE(review): each workbook is parsed twice (once for X, once for y).
X_train = pd.read_excel(link_train_combined_a, usecols=list(range(1, 61)), header=[0])
y_train = pd.read_excel(link_train_combined_a, usecols=[147], header=[0])
X_test = pd.read_excel(link_test_combined_a, usecols=list(range(1, 61)), header=[0])
y_test = pd.read_excel(link_test_combined_a, usecols=[147], header=[0])

# Results folder used by the following tuning / metrics cells. The folder is
# named "all biomarkers and clinical" although only columns 1-60 are loaded;
# the Results section reads from this exact path, so the name is kept.
optimisation_path = "./HSE project/Optimisation data/combined/Clinical A/all biomarkers and clinical/"

# Report the dimensions of the four frames.
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics tables

In [None]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

In [None]:
# Call random_forest_importances for the experiment stored at optimisation_path,
# asking for 20 features. The result is bound to `_` because the top-feature
# subset cell that follows selects columns through `_.index`.
# NOTE(review): biomarkers=True is passed although this experiment loads the
# clinical columns only - confirm what the flag controls.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)

### Feature selection

### Top feature optimisation and metrics

### Subset

In [None]:
import os

# Keep only the columns named by the first 20 entries of `_.index`, in both splits.
# `_` is the value assigned by the preceding random_forest_importances cell
# (presumably indexed by feature name in ranked order - confirm).
# This rebinds X_train / X_test to the reduced frames.
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

# Results folder for the Clinical A top-feature run.
# FIX: this used to point at "Biomarkers A + Clinical/top features/", the folder
# already used by the Clinical + Biomarkers A experiment, so this run overwrote
# that experiment's results. It now follows the "Clinical C/top features/" pattern.
optimisation_path = "./HSE project/Optimisation data/combined/Clinical A/top features/"

# The corrected folder may not exist yet; create it so the tuning / metrics
# cells can write into it.
os.makedirs(optimisation_path, exist_ok=True)

### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics

In [None]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

## Biomarkers B

### Subset
### Split into train and test

In [None]:
# Experiment: biomarkers B.
# Features come from spreadsheet columns 72-77 and the target from column 78
# (0-based positions given to `usecols`).
# NOTE(review): each workbook is parsed twice (once for X, once for y).
X_train = pd.read_excel(link_train_combined_b, usecols=list(range(72, 78)), header=[0])
y_train = pd.read_excel(link_train_combined_b, usecols=[78], header=[0])
X_test = pd.read_excel(link_test_combined_b, usecols=list(range(72, 78)), header=[0])
y_test = pd.read_excel(link_test_combined_b, usecols=[78], header=[0])

# Results folder used by the following tuning / metrics cells.
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers B/all biomarkers/"

# Report the dimensions of the four frames.
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument) and all
# five model flags enabled.
# NOTE(review): cross_validation is the integer 4 here, not the StratifiedKFold(5)
# object used by most other experiments, while the metrics table headers in this
# notebook read "cv=5" - confirm that 4 is intended.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=4,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics tables

In [None]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

In [None]:
# Call random_forest_importances for the experiment stored at optimisation_path
# and bind the result to `_`.
# NOTE(review): this experiment loads only six feature columns (usecols 72-77),
# so n_features=20 exceeds what is available - confirm the helper tolerates that.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)

### Feature selection

## Clinical features + Biomarkers B

#### Subset
#### Split into train and test

In [76]:
# Experiment: clinical features + biomarkers B.
# Features come from spreadsheet columns 1-77 and the target from column 78
# (0-based positions given to `usecols`); column 0 is not loaded.
# NOTE(review): each workbook is parsed twice (once for X, once for y).
X_train = pd.read_excel(link_train_combined_b, usecols=list(range(1, 78)), header=[0])
y_train = pd.read_excel(link_train_combined_b, usecols=[78], header=[0])
X_test = pd.read_excel(link_test_combined_b, usecols=list(range(1, 78)), header=[0])
y_test = pd.read_excel(link_test_combined_b, usecols=[78], header=[0])

# Results folder used by the following tuning / metrics cells.
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers B + Clinical/all biomarkers and clinical/"

# Report the dimensions of the four frames.
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (62, 77)
y_train shape:	 (62, 1)
X_test shape:	 (32, 77)
y_test shape:	 (32, 1)


### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics tables

In [None]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

In [None]:
# Call random_forest_importances for the experiment stored at optimisation_path,
# asking for 20 features. The result is bound to `_` because the top-feature
# subset cell that follows selects columns through `_.index`.
# NOTE(review): `_` is also IPython's last-output name; a dedicated variable
# would be safer, but the subset cell depends on this exact name.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)

### Feature selection

### Top feature optimisation and metrics

### Subset

In [None]:
# Keep only the columns named by the first 20 entries of `_.index`, in both splits.
# `_` is the value assigned by the preceding random_forest_importances cell
# (presumably indexed by feature name in ranked order - confirm).
# This rebinds X_train / X_test to the reduced frames.
X_train = X_train.loc[:, _.index[:20]]
X_test = X_test.loc[:, _.index[:20]]

# Results folder used by the following tuning / metrics cells.
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics

In [None]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

## Clinical features B

#### Subset
#### Split into train and test

In [None]:
# Experiment: clinical features B.
# Features come from spreadsheet columns 1-71 of the combined B workbooks and
# the target from column 78 (0-based positions given to `usecols`).
# NOTE(review): each workbook is parsed twice (once for X, once for y).
X_train = pd.read_excel(link_train_combined_b, usecols=list(range(1, 72)), header=[0])
y_train = pd.read_excel(link_train_combined_b, usecols=[78], header=[0])
X_test = pd.read_excel(link_test_combined_b, usecols=list(range(1, 72)), header=[0])
y_test = pd.read_excel(link_test_combined_b, usecols=[78], header=[0])

# Results folder used by the following tuning / metrics cells. The folder is
# named "all biomarkers and clinical" although only columns 1-71 are loaded;
# the Results section reads from this exact path, so the name is kept.
optimisation_path = "./HSE project/Optimisation data/combined/Clinical B/all biomarkers and clinical/"

# Report the dimensions of the four frames.
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics tables

In [None]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

In [None]:
# Call random_forest_importances for the experiment stored at optimisation_path,
# asking for 20 features. The result is bound to `_` because the top-feature
# subset cell that follows selects columns through `_.index`.
# NOTE(review): biomarkers=True is passed although this experiment loads the
# clinical columns only - confirm what the flag controls.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)

### Feature selection

### Top feature optimisation and metrics

### Subset

In [None]:
import os

# Keep only the columns named by the first 20 entries of `_.index`, in both splits.
# `_` is the value assigned by the preceding random_forest_importances cell
# (presumably indexed by feature name in ranked order - confirm).
# This rebinds X_train / X_test to the reduced frames.
X_train = X_train[_.index[:20]]
X_test = X_test[_.index[:20]]

# Results folder for the Clinical B top-feature run.
# FIX: this used to point at "Biomarkers B + Clinical/top features/", the folder
# already used by the Clinical + Biomarkers B experiment, so this run overwrote
# that experiment's results. It now follows the "Clinical C/top features/" pattern.
optimisation_path = "./HSE project/Optimisation data/combined/Clinical B/top features/"

# The corrected folder may not exist yet; create it so the tuning / metrics
# cells can write into it.
os.makedirs(optimisation_path, exist_ok=True)

### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics

In [None]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

## Biomarkers C

### Subset
### Split into train and test

In [18]:
# Experiment: biomarkers C.
# Features come from spreadsheet columns 101-105 and the target from column 106
# (0-based positions given to `usecols`).
# NOTE(review): each workbook is parsed twice (once for X, once for y).
X_train = pd.read_excel(link_train_combined_c, usecols=list(range(101, 106)), header=[0])
y_train = pd.read_excel(link_train_combined_c, usecols=[106], header=[0])
X_test = pd.read_excel(link_test_combined_c, usecols=list(range(101, 106)), header=[0])
y_test = pd.read_excel(link_test_combined_c, usecols=[106], header=[0])

# Results folder used by the following tuning / metrics cells.
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers C/all biomarkers/"

# Report the dimensions of the four frames.
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 5)
y_train shape:	 (86, 1)
X_test shape:	 (43, 5)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument) and all
# five model flags enabled.
# NOTE(review): cross_validation is the integer 4 here, not the StratifiedKFold(5)
# object used by most other experiments, while the metrics table headers in this
# notebook read "cv=5" - confirm that 4 is intended.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=4,
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics tables

In [19]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.427,0.145,0.522,0.6,0.744,0.429,0.667,0.565,0.755,26,8,3,6,0.522,0.111
SVM,0.525,0.117,0.32,0.385,0.605,0.25,0.444,0.27,0.562,22,12,5,4,0.667,0.15
Logistic Regression,0.561,0.071,0.432,0.625,0.512,0.286,0.889,0.687,0.824,14,20,1,8,0.641,0.081
KNN,0.479,0.087,0.25,0.294,0.581,0.2,0.333,0.206,0.49,22,12,6,3,0.63,0.068
CatBoost,0.302,0.071,0.118,0.114,0.651,0.125,0.111,0.264,0.614,27,7,8,1,0.591,0.051


In [20]:
# Call random_forest_importances for the experiment stored at optimisation_path
# and bind the result to `_`.
# NOTE(review): this experiment loads only five feature columns (usecols 101-105),
# so n_features=20 exceeds what is available - confirm the helper tolerates that.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

## Clinical features + Biomarkers C

#### Subset
#### Split into train and test

In [None]:
# Experiment: clinical features + biomarkers C.
# Features come from spreadsheet columns 1-105 and the target from column 106
# (0-based positions given to `usecols`); column 0 is not loaded.
# NOTE(review): each workbook is parsed twice (once for X, once for y).
X_train = pd.read_excel(link_train_combined_c, usecols=list(range(1, 106)), header=[0])
y_train = pd.read_excel(link_train_combined_c, usecols=[106], header=[0])
X_test = pd.read_excel(link_test_combined_c, usecols=list(range(1, 106)), header=[0])
y_test = pd.read_excel(link_test_combined_c, usecols=[106], header=[0])

# Results folder used by the following tuning / metrics cells.
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers C + Clinical/all biomarkers and clinical/"

# Report the dimensions of the four frames.
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics tables

In [None]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

In [None]:
# Call random_forest_importances for the experiment stored at optimisation_path,
# asking for 20 features. The result is bound to `_` because the top-feature
# subset cell that follows selects columns through `_.index`.
# NOTE(review): `_` is also IPython's last-output name; a dedicated variable
# would be safer, but the subset cell depends on this exact name.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)

### Feature selection

### Top feature optimisation and metrics

### Subset

In [None]:
# Keep only the columns named by the first 20 entries of `_.index`, in both splits.
# `_` is the value assigned by the preceding random_forest_importances cell
# (presumably indexed by feature name in ranked order - confirm).
# This rebinds X_train / X_test to the reduced frames.
X_train = X_train.loc[:, _.index[:20]]
X_test = X_test.loc[:, _.index[:20]]

# Results folder used by the following tuning / metrics cells.
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers C + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics

In [None]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

## Clinical features C

#### Subset
#### Split into train and test

In [21]:
# Experiment: clinical features C.
# Features come from spreadsheet columns 1-100 of the combined C workbooks and
# the target from column 106 (0-based positions given to `usecols`).
# NOTE(review): each workbook is parsed twice (once for X, once for y).
X_train = pd.read_excel(link_train_combined_c, usecols=list(range(1, 101)), header=[0])
y_train = pd.read_excel(link_train_combined_c, usecols=[106], header=[0])
X_test = pd.read_excel(link_test_combined_c, usecols=list(range(1, 101)), header=[0])
y_test = pd.read_excel(link_test_combined_c, usecols=[106], header=[0])

# Results folder used by the following tuning / metrics cells. The folder is
# named "all biomarkers and clinical" although only columns 1-100 are loaded;
# the Results section reads from this exact path, so the name is kept.
optimisation_path = "./HSE project/Optimisation data/combined/Clinical C/all biomarkers and clinical/"

# Report the dimensions of the four frames.
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 100)
y_train shape:	 (86, 1)
X_test shape:	 (43, 100)
y_test shape:	 (43, 1)


In [22]:
# Sanity check: total count of missing cells across all training feature columns.
X_train.isnull().sum().sum()

0

### Hyper-parameter optimisation

In [None]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics tables

In [23]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.557,0.205,0.353,0.341,0.744,0.375,0.333,0.404,0.732,29,5,6,3,0.754,0.109
SVM,0.602,0.062,0.387,0.517,0.558,0.273,0.667,0.287,0.614,18,16,3,6,0.768,0.042
Logistic Regression,0.472,0.256,0.455,0.51,0.721,0.385,0.556,0.41,0.735,26,8,4,5,0.686,0.162
KNN,0.334,0.249,0.421,0.435,0.744,0.4,0.444,0.294,0.634,28,6,5,4,0.578,0.134
CatBoost,0.512,0.183,0.381,0.417,0.698,0.333,0.444,0.548,0.66,26,8,5,4,0.705,0.126


In [24]:
# Call random_forest_importances for the experiment stored at optimisation_path,
# asking for 20 features. The result is bound to `_` because the top-feature
# subset cell that follows selects columns through `_.index`.
# NOTE(review): biomarkers=True is passed although this experiment loads the
# clinical columns only - confirm what the flag controls.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

### Top feature optimisation and metrics

### Subset

In [27]:
# Keep only the columns named by the first 20 entries of `_.index`, in both splits.
# `_` is the value assigned by the preceding random_forest_importances cell
# (presumably indexed by feature name in ranked order - confirm).
# This rebinds X_train / X_test to the reduced frames.
X_train = X_train.loc[:, _.index[:20]]
X_test = X_test.loc[:, _.index[:20]]

# Results folder used by the following tuning / metrics cells.
optimisation_path = "./HSE project/Optimisation data/combined/Clinical C/top features/"

### Hyper-parameter optimisation

In [28]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics

In [29]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.609,0.136,0.375,0.349,0.767,0.429,0.333,0.365,0.683,30,4,6,3,0.817,0.117
SVM,0.628,0.076,0.424,0.583,0.558,0.292,0.778,0.373,0.719,17,17,2,7,0.785,0.068
Logistic Regression,0.569,0.059,0.364,0.5,0.512,0.25,0.667,0.4,0.618,16,18,3,6,0.728,0.066
KNN,0.533,0.183,0.381,0.417,0.698,0.333,0.444,0.356,0.708,26,8,5,4,0.745,0.129
CatBoost,0.496,0.167,0.421,0.435,0.744,0.4,0.444,0.375,0.725,28,6,5,4,0.746,0.14


## Clinical features A-B-C

#### Subset
#### Split into train and test

In [30]:
# Experiment: clinical features A-B-C.
# Features come from spreadsheet columns 1-56 and the target from column 57
# (0-based positions given to `usecols`); column 0 is not loaded.
# NOTE(review): each workbook is parsed twice (once for X, once for y).
X_train = pd.read_excel(link_train_combined_abc, usecols=list(range(1, 57)), header=[0])
y_train = pd.read_excel(link_train_combined_abc, usecols=[57], header=[0])
X_test = pd.read_excel(link_test_combined_abc, usecols=list(range(1, 57)), header=[0])
y_test = pd.read_excel(link_test_combined_abc, usecols=[57], header=[0])

# Results folder used by the following tuning / metrics cells.
optimisation_path = "./HSE project/Optimisation data/combined/Clinical ABC/all clinical/"

# Report the dimensions of the four frames.
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (317, 56)
y_train shape:	 (317, 1)
X_test shape:	 (106, 56)
y_test shape:	 (106, 1)


### Hyper-parameter optimisation

In [31]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics tables

In [32]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.579,0.086,0.615,0.596,0.67,0.651,0.583,0.708,0.742,43,15,20,28,0.655,0.06
SVM,0.598,0.088,0.653,0.649,0.689,0.66,0.646,0.669,0.723,42,16,17,31,0.666,0.053
Logistic Regression,0.603,0.083,0.604,0.604,0.642,0.604,0.604,0.638,0.676,39,19,19,29,0.672,0.057
KNN,0.557,0.058,0.568,0.565,0.613,0.574,0.562,0.583,0.648,38,20,21,27,0.618,0.051
CatBoost,0.523,0.092,0.595,0.548,0.679,0.694,0.521,0.68,0.737,47,11,23,25,0.631,0.025


In [33]:
# Call random_forest_importances for the experiment stored at optimisation_path,
# asking for 20 features. The result is bound to `_` because the top-feature
# subset cell that follows selects columns through `_.index`.
# NOTE(review): biomarkers=True is passed although this experiment's results
# folder is "Clinical ABC/all clinical/" - confirm what the flag controls.
_ = random_forest_importances(
    path=optimisation_path,
    n_features=20,
    biomarkers=True,
)




### Feature selection

### Top feature optimisation and metrics

#### Subset

In [34]:
# Keep only the columns named by the first 20 entries of `_.index`, in both splits.
# `_` is the value assigned by the preceding random_forest_importances cell
# (presumably indexed by feature name in ranked order - confirm).
# This rebinds X_train / X_test to the reduced frames.
X_train = X_train.loc[:, _.index[:20]]
X_test = X_test.loc[:, _.index[:20]]

# Results folder used by the following tuning / metrics cells.
optimisation_path = "./HSE project/Optimisation data/combined/Clinical ABC/top features/"

### Hyper-parameter optimisation

In [35]:
# Run the tuning helper on the current train split with score='f1'
# ('F:beta=1' is the value handed to the CatBoost-specific argument), a
# StratifiedKFold(5) splitter and all five model flags enabled.
# NOTE(review): tuning() is defined elsewhere; presumably it picks up the global
# X_train / y_train and stores its results under `path` - confirm.
tuning(
    path=optimisation_path,
    score='f1',
    catboost_score='F:beta=1',
    cross_validation=StratifiedKFold(5),
    logistic_regression=True,
    knn=True,
    random_forest=True,
    svm=True,
    catboost=True,
)
# Wipe the cell output once the call has returned.
clear_output()

### Metrics

In [36]:
# Build the metrics table for the current results folder (metric_table is
# defined elsewhere; presumably it reads what tuning() saved there - confirm)
# and store an Excel copy in that same folder.
metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# Display the table: centred headers and cells, a coolwarm gradient on four
# score columns, three decimals.
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas replace it with .format(precision=3).
(
    metrics_table.style
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'center')]}])
    .set_properties(**{'text-align': 'center'})
    .background_gradient(
        cmap='coolwarm',
        subset=[
            ("Scores on the test set", "F2"),
            ("F2, train set, cv=5", "mean"),
            ("Scores on the test set", "F1"),
            ("Scores on the test set", "ROC_AUC"),
        ],
    )
    .set_precision(3)
)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.595,0.06,0.674,0.644,0.726,0.732,0.625,0.732,0.777,47,11,18,30,0.681,0.048
SVM,0.582,0.1,0.632,0.628,0.67,0.638,0.625,0.691,0.729,41,17,18,30,0.652,0.061
Logistic Regression,0.607,0.069,0.68,0.709,0.689,0.636,0.729,0.653,0.714,38,20,13,35,0.652,0.071
KNN,0.552,0.053,0.452,0.417,0.566,0.528,0.396,0.503,0.59,41,17,29,19,0.61,0.075
CatBoost,0.529,0.083,0.598,0.563,0.67,0.667,0.542,0.644,0.709,45,13,22,26,0.643,0.023


## Results

### Scores of models

In [37]:
# Metrics to plot, one figure per entry (values noted by the author: F2, F1, ROC_AUC).
metrics = ['F2', 'ROC_AUC']

# Metrics whose '<metric>, train set, cv=5' mean/std columns are read for the whiskers.
# Any other metric is plotted with empty whisker traces.
cv_metrics = ('F2', 'ROC_AUC')

# Result folders, one per dataset; `metrics_table.xlsx` is read from each of them.
paths = [
        './HSE project/Optimisation data/combined/Biomarkers A/all biomarkers/',
        './HSE project/Optimisation data/combined/Biomarkers B/all biomarkers/',
        './HSE project/Optimisation data/combined/Biomarkers C/all biomarkers/',
        './HSE project/Optimisation data/combined/Clinical ABC/all clinical/',
        './HSE project/Optimisation data/combined/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/combined/Biomarkers B + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/combined/Biomarkers C + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/combined/Clinical A/all biomarkers and clinical/',
        './HSE project/Optimisation data/combined/Clinical B/all biomarkers and clinical/',
        './HSE project/Optimisation data/combined/Clinical C/all biomarkers and clinical/',
        ]

# Legend labels, paired with `paths` by position.
# NOTE(review): some labels do not obviously match their folder (e.g. 'Clinical+biomarkers ABC'
# is read from '.../Clinical ABC/all clinical/') -- confirm the pairing.
dataset_names = ['biomarkers A', 'biomarkers B', 'biomarkers C',
                 'Clinical+biomarkers ABC',
                 'Clinical+biomarkers A', 'Clinical+biomarkers B', 'Clinical+biomarkers C',
                 'Clinical A', 'Clinical B', 'Clinical C']
assert len(dataset_names) == len(paths), "every result folder needs exactly one legend label"

# Model names shown on the x axis.
# NOTE(review): assumed to follow the row order of every metrics table -- confirm.
# standard models
models = ['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
# ensemble models
# models = ['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

# Load every workbook once; the same tables serve all metrics.
# (ensemble runs: advanced_models_metrics, standard runs: metrics_table)
tables = [pd.read_excel(f'{path}metrics_table.xlsx', header=[0, 1], index_col=[0]) for path in paths]

# Geometry derived from the number of models so the hidden overlay axis and the
# reference line stay aligned with the bars if the model list changes.
# NOTE(review): 0.515 (rather than 0.5) is presumably a manual alignment tweak -- confirm.
x_pad = 0.515
last_x = len(models) - 1


def _collect_scores(group, field):
    """Return a frame with one column per dataset holding table[(group, field)] rounded to 3 decimals."""
    return pd.DataFrame({name: list(table.loc[:, (group, field)].values.round(3))
                         for name, table in zip(dataset_names, tables)})


for metric in metrics:
    # Test-set scores (bar heights) and cross-validation statistics (whiskers).
    datasets = _collect_scores('Scores on the test set', metric)
    if metric in cv_metrics:
        cv_group = f'{metric}, train set, cv=5'
        datasets_mean = _collect_scores(cv_group, 'mean')
        datasets_std = _collect_scores(cv_group, 'std')
    else:
        datasets_mean = pd.DataFrame(columns=dataset_names)
        datasets_std = pd.DataFrame(columns=dataset_names)

    # Grouped bars: one bar per dataset within each model group.
    fig = go.Figure(data=[go.Bar(name=name, x=models, y=datasets[name]) for name in dataset_names])

    # CV means drawn as points: box traces whose box is made fully transparent,
    # so only the (centred, un-jittered) points remain visible.
    fig.add_traces([go.Box(name=name, x=models,
                           y=datasets_mean[name],
                           marker=dict(color="black"),
                           showlegend=False) for name in dataset_names])
    fig.update_traces(
        selector=dict(type="box"),         # update only boxes
        boxpoints="all",                   # show points
        pointpos=0,                        # centered
        jitter=0,                          # no jitter
        line_color="rgba(255,255,255,0)",  # hide box lines
        fillcolor="rgba(255,255,255,0)",   # hide box fill
    )

    # CV standard deviation drawn as error bars on invisible bars placed on a second,
    # overlaid x axis.
    fig.add_traces([go.Bar(name=name, x=models,
                           y=datasets_mean[name],
                           xaxis="x2",
                           error_y=dict(type='data',
                                        array=datasets_std[name],
                                        color="rgba(0,0,0,1)",
                                        thickness=1),
                           marker=dict(opacity=0),
                           showlegend=False) for name in dataset_names])

    # Title the x axis before `xaxis2` is added to the layout, so only the primary axis gets it.
    fig.update_xaxes(title='Models')
    fig.update_layout(barmode='group',
                      boxmode='group',
                      xaxis2={"overlaying": "x", "range": [-x_pad, last_x + x_pad], "showticklabels": False},
                      bargap=0.30,
                      bargroupgap=0.3,
                      legend=dict(orientation="v", title='Datasets'),
                      title=dict(text=f'{metric} score', x=0.5),
                      margin=dict(l=60, r=20, t=60, b=40),
                      # figure size
                      autosize=False,
                      width=1300,
                      height=450)
    fig.update_yaxes(title='Score', range=[0., 1.0])

    # Dotted reference line at ROC AUC = 0.5, spanning all model groups.
    if metric == 'ROC_AUC':
        fig.add_shape(type='line',
                      x0=-0.5,
                      y0=0.5,
                      x1=last_x + 0.5,
                      y1=0.5,
                      line=dict(color='firebrick', width=2, dash='dot'),
                      xref='x',
                      yref='y')

    fig.show(renderer='colab')
    # `results_path` is defined outside this cell.
    fig.write_image(f"{results_path}{metric}.pdf", engine="kaleido")

### Compare with Top 10

In [108]:
# Metric shown in this comparison figure (values noted by the author: ROC_AUC, F1, F2).
metric = 'F2'

# Result folders of the runs on the full feature sets.
paths = [
        # './HSE project/Optimisation data/combined/Biomarkers A/all biomarkers/',
        # './HSE project/Optimisation data/combined/Biomarkers B/all biomarkers/',
        # './HSE project/Optimisation data/combined/Biomarkers C/all biomarkers/',
        './HSE project/Optimisation data/combined/Clinical ABC/all clinical/',
        './HSE project/Optimisation data/combined/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/combined/Biomarkers B + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/combined/Biomarkers C + Clinical/all biomarkers and clinical/'
        ]

# Result folders of the "top features" runs (labelled 'top 20' in the plot), same order as `paths`.
paths_top = [
        # './HSE project/Optimisation data/combined/Biomarkers A/biomarkers top features/',
        # './HSE project/Optimisation data/combined/Biomarkers B/biomarkers top features/',
        # './HSE project/Optimisation data/combined/Biomarkers C/biomarkers top features/',
        './HSE project/Optimisation data/combined/Clinical ABC/top features/',
        './HSE project/Optimisation data/combined/Biomarkers A + Clinical/top features/',
        './HSE project/Optimisation data/combined/Biomarkers B + Clinical/top features/',
        './HSE project/Optimisation data/combined/Biomarkers C + Clinical/top features/'
        ]

# Legend labels, paired with `paths` / `paths_top` by position
# ('biomarkers A', 'biomarkers B', 'biomarkers C' are currently left out).
dataset_names = ['Clinical+biomarkers ABC',
                 'Clinical+biomarkers A',
                 'Clinical+biomarkers B',
                 'Clinical+biomarkers C']
datasets = pd.DataFrame(columns=dataset_names)
datasets_mean = pd.DataFrame(columns=dataset_names)
datasets_std = pd.DataFrame(columns=dataset_names)


def _rounded_column(frame, key):
    """Return column `key` of `frame` as a list of values rounded to 3 decimals."""
    return list(frame.loc[:, key].values.round(3))


# Each dataset column stacks the full-feature rows first and the top-feature rows after them.
test_key = ('Scores on the test set', metric)
for name, full_dir, top_dir in zip(dataset_names, paths, paths_top):
    table = pd.read_excel(f'{full_dir}metrics_table.xlsx', header=[0,1], index_col=[0])
    table_top = pd.read_excel(f'{top_dir}metrics_table.xlsx', header=[0,1], index_col=[0])
    datasets[name] = _rounded_column(table, test_key) + _rounded_column(table_top, test_key)
    # Cross-validation statistics are only collected for F2 in this cell.
    if metric == 'F2':
        mean_key = ('F2, train set, cv=5', 'mean')
        std_key = ('F2, train set, cv=5', 'std')
        datasets_mean[name] = _rounded_column(table, mean_key) + _rounded_column(table_top, mean_key)
        datasets_std[name] = _rounded_column(table, std_key) + _rounded_column(table_top, std_key)

# Models on the x axis; 'KNN' and 'KNN top 20' are excluded from this figure.
full_feature_models = ['RandomForest', 'SVM', 'Logistic Regression', 'CatBoost']
top_feature_models = ['RandomForest top 20', 'SVM top 20', 'Logistic Regression top 20',  'CatBoost top 20']
models = full_feature_models + top_feature_models

# Rows of `datasets` to plot: every row except 3 and 8.
# NOTE(review): presumably rows 3 and 8 are the KNN entries of the two stacked tables -- confirm.
rows_to_plot = [0,1,2,4,5,6,7,9]

# Grouped bars: one bar per dataset within each model group.
fig = go.Figure(data=[go.Bar(name=name, x=models, y=datasets[name][rows_to_plot])
                      for name in datasets.columns])

# The cross-validation whisker overlay (transparent box traces for the CV mean plus invisible
# bars carrying error bars on `xaxis2`) is disabled in this cell; the live version of that
# pattern is in the "Scores of models" cell. The `xaxis2` layout entry below is kept as is.

# Title the x axis before `xaxis2` is added to the layout, so only the primary axis gets it.
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Score', range=[0., 1.0])
fig.update_layout(barmode='group',
                  xaxis2={"overlaying": "x", "range": [-0.525, 9.525], "showticklabels": False},
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Datasets'),
                  title=dict(text=f'{metric} score', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),
                  # figure size
                  autosize=False,
                  width=1300,
                  height=450,)

# Dotted reference line at ROC AUC = 0.5, drawn beneath the bars.
if metric == 'ROC_AUC':
    fig.add_shape(type='line',
                  x0=-0.5,
                  y0=0.5,
                  x1=7.5,
                  y1=0.5,
                  line=dict(color='black',  width=2, dash='dot'),
                  xref='x',
                  yref='y',
                  layer='below')

fig.show(renderer='colab')
# `results_path` is defined outside this cell.
fig.write_image(f"{results_path}top_{metric}.pdf", engine="kaleido")

### Feature selection

In [None]:
# NOTE: this whole cell is disabled -- every line below is commented out.
# When enabled, it reads `feature_selection_dataset.xlsx` from each folder in `paths`, sorts
# the rows by the ("Importances", "RandomForest") column in descending order and takes the
# first 10 index labels per dataset. It then marks those features with 1 in the matching
# column of `features`, sums the marks per feature, sorts by that sum and writes the result
# to `feature_selection.xlsx`.
# NOTE(review): for every path after the first, index labels are passed through `eval(...)`;
# if this cell is re-enabled, consider `ast.literal_eval` instead of evaluating workbook text.
# features = pd.DataFrame(columns=['features', 
#                                  'biomarkers A',
#                                 #  'biomarkers B',
#                                 #  'biomarkers C',
#                                  'Clinical+biomarkers ABC',
#                                  'Clinical+biomarkers A',
#                                 #  'Clinical+biomarkers B',
#                                 #  'Clinical+biomarkers C',
#                                  ])
# # list of paths
# paths = [
#         './HSE project/Optimisation data/revascularization/Biomarkers A/all biomarkers/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers B/all biomarkers/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers C/all biomarkers/',
#         './HSE project/Optimisation data/revascularization/Clinical ABC/all clinical/',
#         './HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
#         # './HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/'
#         ]

# # get dataframe with with scores of models from different datasets
# top_features = []
# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     if i < 1: 
#         top_features = top_features+list(str(col) for col in table.index[:10])
#     else:    
#         top_features = top_features+list(eval(col)[1] for col in table.index[:10])

# features['features'] = list(set(top_features))
# features.index = list(set(top_features))
# features.fillna(0, inplace=True)

# for i in range(len(paths)):
#     table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
#     table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
#     if i < 1: 
#         features.loc[list(str(col) for col in table.index[:10]), features.columns[i+1]] = 1
#     else:    
#         # top_features = top_features+list(eval(col) for col in table.index[:10])   
#         features.loc[list(eval(col)[1] for col in table.index[:10]), features.columns[i+1]] = 1 

# features['features'] = features.iloc[:,1:].apply((lambda x: x.sum()), axis=1)
# features.sort_values(ascending=False,  inplace=True, by=("features"))
# features.columns = ['sum'] + list(features.columns[1:])
# features.to_excel('./HSE project/Optimisation data/revascularization/feature_selection.xlsx')
# features

# References

- [Guidelines and quality criteria for artificial intelligence-based prediction models in healthcare: a scoping review](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8748878/pdf/41746_2021_Article_549.pdf)

- ✅ [Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone](https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5#citeas)

- ✅ [Machine learning-based prediction of adverse events
following an acute coronary syndrome (PRAISE): a modelling
study of pooled datasets](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(20)32519-8/fulltext)

- ✅ [Critical appraisal of artificial intelligence-based prediction models for cardiovascular disease](https://watermark.silverchair.com/ehac238.pdf?token=AQECAHi208BE49Ooan9kkhW_Ercy7Dm3ZL_9Cf3qfKAc485ysgAAAtQwggLQBgkqhkiG9w0BBwagggLBMIICvQIBADCCArYGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMSvd0U0E66pd-sc_vAgEQgIICh_0OOn38okwwjvtHKZZRS6iesoJ0VuLm_qXiAJTeVb_83xAfB7oneCMsGdP7SkYUZPlcO3UtImKRROpfFzoAH87-TvQu04QMni8-YL47A9k13em0EMsLU86rv0fjaSmxgG-hPnAe7eRJaEDf1ckm-YBNx65aPTx1UC8yW3YO0gDra3ROrfsyl2UariiUse8hZ5S-I2WvFx0gic__qBLni02hEetj0dt-mInD7DxKqGuk28AuNOCDlF9Q1Tfj7oSyk6_1aNHJJ9XklpOJgzsKn-j4yusaYkapojnZzcNzBGcx6tTWYDn-YFcevxsYSc_uKlSUl40oTPl5Gwp-gAyxaLx9bFRuCDA6bxfPsNjgLQR0Eo4QxBuMD5h8FR6H6hEkZ1heaEpiWvZHqwTbEMddl1L1EgD2w-L-ng1YHbegVuZLa-Noll9OWfYSsVZf330LvUYMnTSu3FxrJ72voWUNhS3xzpTvkaeTqIkQgRU5Q75TfoKpMWfefufVgDshQhRM0ww1qRImd34Faql0RyBAKOPXG_HaucEkyXb60GCd6-0yjP5Mjbq-TML0Y9pnKIvmf9wXcTw-DJTcMT97fzWbp_psY70J02wEjvHPxfkOyEl9TiA08sI24GqKHAZuSU_M5R2dGN5W7qGuN_A-TbFKvO3FyMDOgV89BtJXHk8wVYpR-f2uppZydQydht_KTHlkV8hbYf0StZGbCXLb-fk38yZ6rerF9dTXfT6PtrYdlBYrVW65ZRn1HbxhoA0LBI0f5z8gpiqQjnyxSzrX-e9FYtOfOPu-i-IfGTLMFELowQ3IXkTup2Ee1dvT0sosTfoC5Q6x6d8nubiZFtw_SLYg21vF1XH2Gw9d)

- [Interpretation of machine learning predictions for patient outcomes in electronic health records](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7153071/pdf/3200408.pdf)
- [Minimum sample size for external validation of a clinical prediction model with a binary outcome](https://pubmed.ncbi.nlm.nih.gov/34031906/)
- [Machine learning of clinical variables and coronary artery calcium scoring for the prediction of obstructive coronary artery disease on coronary computed tomography angiography: analysis from the CONFIRM registry](https://pubmed.ncbi.nlm.nih.gov/31513271/)
- [Reflection on modern methods: when worlds collide-prediction, machine learning and causal inference](https://pubmed.ncbi.nlm.nih.gov/31298274/)
- General Cardiovascular Risk Profile for Use in Primary Care

### Feature importance

- ✅ [Feature Importance May Be Lying To You](https://towardsdatascience.com/feature-importance-may-be-lying-to-you-3247cafa7ee7)
- [Different Measures of Feature Importance Behave Differently](https://hippocampus-garden.com/feature_importance/)
- [Explaining Feature Importance by example of a Random Forest](https://towardsdatascience.com/explaining-feature-importance-by-example-of-a-random-forest-d9166011959e)
- [Interpret Logistic Regression Coefficients (For Beginners)](https://quantifyinghealth.com/interpret-logistic-regression-coefficients/)

- [FAQ: HOW DO I INTERPRET ODDS RATIOS IN LOGISTIC REGRESSION?](https://stats.oarc.ucla.edu/other/mult-pkg/faq/general/faq-how-do-i-interpret-odds-ratios-in-logistic-regression/#:~:text=A%20logistic%20regression%20model%20allows,relationship%20with%20the%20predictor%20variables.)

### Imputation
I decided to use kNN imputation because it was easy to implement with the sklearn package and performed much better than the simpler imputers. However, there are also two more complex imputers that might be reasonable to use (MICE and datawig).
- [6 Different Ways to Compensate for Missing Values In a Dataset (Data Imputation with examples)](https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779)
- [sklearn](https://scikit-learn.org/stable/modules/impute.html)
- [kNN Imputation for Missing Values in Machine Learning](https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/)


### Understanding model predictions


The paper "Critical appraisal of artificial intelligence-based prediction models for cardiovascular disease" mentions the use of LIME and SHAP.
  
##### LIME 
- [“Why Should I Trust You?” Explaining the Predictions of Any Classifier - paper about LIME](https://arxiv.org/pdf/1602.04938.pdf)

- [Understanding model predictions with LIME](https://towardsdatascience.com/understanding-model-predictions-with-lime-a582fdff3a3b)
- [Understanding how LIME explains predictions](https://towardsdatascience.com/understanding-how-lime-explains-predictions-d404e5d1829c)
- ✅ [How to explain ML models and feature importance with LIME?](https://analyticsindiamag.com/how-to-explain-ml-models-and-feature-importance-with-lime/)
- ✅ [Local Interpretable Model-Agnostic Explanations (LIME): An Introduction](https://www.oreilly.com/content/introduction-to-local-interpretable-model-agnostic-explanations-lime/)
- ✅ [Explanations (LIME)](https://ema.drwhy.ai/LIME.html)  
##### SHAP   
- ✅ [SHAP Values Explained Exactly How You Wished Someone Explained to You](https://towardsdatascience.com/shap-explained-the-way-i-wish-someone-explained-it-to-me-ab81cc69ef30)
- [Using SHAP Values to Explain How Your Machine Learning Model Works](https://towardsdatascience.com/using-shap-values-to-explain-how-your-machine-learning-model-works-732b3f40e137)
- TODO: find out which approximations are used to calculate Shapley values for all features, considering the $2^n$ complexity
- [How to define fairness to detect and prevent discriminatory outcomes in Machine Learning](https://towardsdatascience.com/how-to-define-fairness-to-detect-and-prevent-discriminatory-outcomes-in-machine-learning-ef23fd408ef2)

### Fairness
- ✅ [A Tutorial on Fairness in Machine Learning](https://towardsdatascience.com/a-tutorial-on-fairness-in-machine-learning-3ff8ba1040cb)
- [sklego documentation](https://scikit-lego.readthedocs.io/en/latest/fairness.html)
- ✅ [Equality and fairness measures in classification models](https://www.auditingalgorithms.net/EqualityAndFairness.html)
- [Fairness Definitions Explained - should be a clear and useful article](http://fairware.cs.umass.edu/papers/Verma.pdf)
- [CS 294: Fairness in Machine Learning](https://fairmlclass.github.io/)
- []()

### Feature selection
- ✅ [Deep-dive on ML techniques for feature selection in Python - Part 1](https://towardsdatascience.com/deep-dive-on-ml-techniques-for-feature-selection-in-python-part-1-3574269d5c69)
- ✅ [Deep-dive on ML techniques for feature selection in Python - Part 2](https://towardsdatascience.com/deep-dive-on-ml-techniques-for-feature-selection-in-python-part-2-c258f8a2ac43)
- ✅ [Deep-dive on ML techniques for feature selection in Python - Part 3](https://towardsdatascience.com/deep-dive-on-ml-techniques-for-feature-selection-in-python-part-3-de2a7593247f)
- ✅ [How to Choose a Feature Selection Method For Machine Learning](https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/)
- ✅ [Understanding ANOVA-F for feature selection in Python](https://datascience.stackexchange.com/questions/74465/how-to-understand-anova-f-for-feature-selection-in-python-sklearn-selectkbest-w#answer-74486)
- [sklearn: Feature selection](https://scikit-learn.org/stable/modules/feature_selection.html)
- ✅ [What are variable importance rankings useful for?](https://stats.stackexchange.com/questions/202277/what-are-variable-importance-rankings-useful-for#question-header)
- ✅ [feature importance is a slippery concept](https://stats.stackexchange.com/questions/202221/for-linear-classifiers-do-larger-coefficients-imply-more-important-features/202853#answer-202853)
- ✅ [Why lasso for feature selection?](https://stats.stackexchange.com/questions/367155/why-lasso-for-feature-selection#question-header)
- [Boruta SHAP: A Tool for Feature Selection Every Data Scientist Should Know](https://towardsdatascience.com/boruta-shap-an-amazing-tool-for-feature-selection-every-data-scientist-should-know-33a5f01285c0#:~:text=The%20idea%20of%20the%20Boruta,importance%20of%20the%20shadow%20features.)
- ✅ [Intuitions on L1 and L2 Regularisation](https://towardsdatascience.com/intuitions-on-l1-and-l2-regularisation-235f2db4c261)
- [L0 Norm, L1 Norm, L2 Norm & L-Infinity Norm](https://montjoile.medium.com/l0-norm-l1-norm-l2-norm-l-infinity-norm-7a7d18a4f40c#:~:text=L1%20Norm%20is%20the%20sum,the%20vector%20are%20weighted%20equally.)
- []()
- []()

### Advanced predictions
- [Ensemble methods: bagging, boosting and stacking](https://towardsdatascience.com/ensemble-methods-bagging-boosting-and-stacking-c9214a10a205)
- [sklearn: Ensemble methods](https://scikit-learn.org/stable/modules/ensemble.html)
- [A Deep Dive into Stacking Ensemble Machine Learning — Part I](https://towardsdatascience.com/a-deep-dive-into-stacking-ensemble-machine-learning-part-i-10476b2ade3)
- [Cтекинг (Stacking) и блендинг (Blending)](https://dyakonov.org/2017/03/10/c%D1%82%D0%B5%D0%BA%D0%B8%D0%BD%D0%B3-stacking-%D0%B8-%D0%B1%D0%BB%D0%B5%D0%BD%D0%B4%D0%B8%D0%BD%D0%B3-blending/)
- []()
- []()


### Clustering
- [Overview of Clustering Algorithms](https://towardsdatascience.com/overview-of-clustering-algorithms-27e979e3724d)
- []()
- []()