## Prerequisites

### Import libraries

In [1]:
# Import libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
from sklearn.model_selection import train_test_split
# from google.colab import output

from pandas import DatetimeIndex as dt
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
# from google.colab import files
import IPython
from IPython.display import HTML, display, clear_output 
# from google.colab import drive
import sys

# hyper-parameters optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# metrics
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as TP_rate                          
from sklearn.metrics import roc_auc_score as roc_auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score as recall
from sklearn.metrics import average_precision_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer,fbeta_score
from sklearn.model_selection import StratifiedKFold


# classifiers
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostRegressor, RandomForestRegressor, GradientBoostingClassifier, StackingClassifier, VotingClassifier #
from sklearn.tree import DecisionTreeClassifier     #
from sklearn.svm import SVC                                    # both linear and radial classification
from sklearn.neighbors import KNeighborsClassifier             # k=3
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import catboost
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingClassifier

# statistics
from scipy.stats import shapiro
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu

# imputations
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.base import clone

# feature selection
from sklearn.feature_selection import chi2, mutual_info_classif, f_classif, SelectKBest, RFE, RFECV, SequentialFeatureSelector
from scipy.stats import kendalltau, spearmanr
from sklearn.linear_model import Lasso
from sklearn.model_selection import StratifiedKFold
from mrmr import mrmr_classif

# to conver string to dict
import ast

# Interpretability
 # !pip install interpret
from interpret.blackbox import LimeTabular
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
set_visualize_provider(InlineProvider())
from interpret import show

import lime
import lime.lime_tabular
from __future__ import print_function

# ignore warnings when graphs are plotted
import warnings
warnings.filterwarnings('ignore')

### Data import

In [2]:
# links to datasets

# lancet
link_train_lancet = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/lancet%20dataset/train_abc_lancet.xlsx?raw=true'
link_test_lancet  = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/lancet%20dataset/test_abc_lancet.xlsx?raw=true'

# death
link_train_death_a   = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/cardiovascular%20death/train_a.xlsx?raw=true'
link_test_death_a    = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/cardiovascular%20death/test_a.xlsx?raw=true'
link_train_death_b   = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/cardiovascular%20death/train_b.xlsx?raw=true'
link_test_death_b    = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/cardiovascular%20death/test_b.xlsx?raw=true'
link_train_death_c   = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/cardiovascular%20death/train_c.xlsx?raw=true'
link_test_death_c    = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/cardiovascular%20death/test_c.xlsx?raw=true'
link_train_death_abc = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/cardiovascular%20death/train_abc.xlsx?raw=true'
link_test_death_abc  = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/cardiovascular%20death/test_abc.xlsx?raw=true'

# combined
link_train_combined_a   = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/combined/train_a.xlsx?raw=true'
link_test_combined_a    = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/combined/test_a.xlsx?raw=true'
link_train_combined_b   = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/combined/train_b.xlsx?raw=true'
link_test_combined_b    = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/combined/test_b.xlsx?raw=true'
link_train_combined_c   = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/combined/train_c.xlsx?raw=true'
link_test_combined_c    = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/combined/test_c.xlsx?raw=true'
link_train_combined_abc = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/combined/train_abc.xlsx?raw=true'
link_test_combined_abc  = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/combined/test_abc.xlsx?raw=true'

# revascularization
link_train_revascularization_a   = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/revascularization/train_a.xlsx?raw=true'
link_test_revascularization_a    = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/revascularization/test_a.xlsx?raw=true'
link_train_revascularization_b   = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/revascularization/train_b.xlsx?raw=true'
link_test_revascularization_b    = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/revascularization/test_b.xlsx?raw=true'
link_train_revascularization_c   = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/revascularization/train_c.xlsx?raw=true'
link_test_revascularization_c    = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/revascularization/test_c.xlsx?raw=true'
link_train_revascularization_abc = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/revascularization/train_abc.xlsx?raw=true'
link_test_revascularization_abc  = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Preprocessed%20Data/revascularization/test_abc.xlsx?raw=true'

### Tuning of hyper-parameters

#### Grids of hyper-parameters

In [3]:
# hyper-parameters for gridsearchCV

# 1. Logistic regression
parameters_LR_model = dict(
                          C = [0.001, 0.01, 0.1, 1.],  # defeult
                          tol = [1.e-4],
                          penalty = ['elasticnet', 'l1', 'l2', 'none'],
                          # njobs = [-1],
                          dual = [False],
                          fit_intercept = [False],
                          # intercept_scaling =
                          class_weight = ['balanced', None],
                          random_state = [10],
                          solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                          max_iter = [10000],
                          multi_class = ['auto'],
                          verbose = [0],
                          warm_start = [True]
                          # l1_ratio
                          )

# 2. Random Forest
parameters_random_forest_model = dict(
                  n_estimators = [int(x) for x in np.linspace(start = 50, stop = 400, num = 25)],
                  criterion = ['gini'],
                  max_depth = [*[int(x) for x in np.linspace(2, 10, num = 5)]],
                  min_samples_split = [2,4],  
                  # min_samples_leaf = [1,2],  
                  min_weight_fraction_leaf = [0.0],
                  max_features = ['sqrt'],  
                  max_leaf_nodes = [None],
                  min_impurity_decrease = [0.],
                  bootstrap = [True],
                  oob_score = [False],
                  n_jobs = [-1],
                  random_state = [10],
                  verbose = [0],
                  warm_start = [True],
                  class_weight = ['balanced', 'balanced_subsample', None],
                  # ccp_alpha = 
                  max_samples = [None]  # maybe =0.1 here for getting almost independent samples for trees
                  )

# 3. k-NN
parameters_knn = dict(
                      n_neighbors = [int(x) for x in np.linspace(start = 1, stop = 7, num = 7)],
                      weights = ['uniform', 'distance'],
                      algorithm = ['ball_tree', 'kd_tree', 'brute'], 
                      leaf_size = [15, 30, 60],
                      p = [3], 
                      metric = ['chebyshev', 'minkowski', 'euclidean', 'manhattan'],
                      # metric_params = 
                      n_jobs = [-1]
                      )

# 4. SVM
parameters_svm = dict(
                  C = [int(x) for x in np.linspace(start = 1, stop = 25, num = 20)],
                  kernel = ['rbf', 'linear', 'poly', 'sigmoid'],
                  degree = [3, 4, 5],
                  gamma = ['scale', 'auto'],
                  coef0 = [0.0],
                  shrinking = [True, False],
                  probability = [True],
                  tol = [1.e-3], 
                  cache_size = [200],
                  class_weight = ['balanced', None],
                  verbose = [False],
                  max_iter = [1.e6],  # мб поставить конечные итерации, как в Logistic Regression
                  # decision_function_shape = [],
                  # break_ties = [],
                  random_state = [10]
                  )

# 5. CatBoost
catboost_parameters = {'depth': [4,6,8,10],  # larger depth is preferable
              'learning_rate': [0.1,0.2,0.3],
              'l2_leaf_reg': [0,3,6,1],
              }
c_boost_params = {'eval_metric' : 'F1', # 'F1' my_f2_scorer, 'F'
                  # 'beta' : 2,
                  'verbose' : False,
                  'early_stopping_rounds' : 100,
                  #cat_features=cat_features,
                  'task_type' : "CPU",
                  'iterations' : 500,
                  'random_seed' : 10}

#### Tuning functions

In [4]:
def tuning(score, catboost_score, cross_validation, path, logistic_regression, knn, random_forest, svm, catboost):
    # score = my_f2_scorer(), 'f1', 'accuracy', 'precision', 'recall', 'roc_auc'
    # catboost_score = 'F1' 'F:beta=2'
    if logistic_regression:
        logistic_regression_tuning(score, cross_validation, path)     

    if knn:
        knn_tuning(score, cross_validation, path)     
    if random_forest:
        random_forest_tuning(score, cross_validation, path)     
    if svm:
        svm_tuning(score, cross_validation, path)     
    if catboost:
        catboost_tuning(catboost_score, cross_validation, path)     

In [5]:
def logistic_regression_tuning(score, cross_validation, path):

    # LogisticRegression: 
    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

    LR_model = LogisticRegression(random_state=10)

    # calibrate hyper-parameters: perform gridsearch with cross-validation
    clf = GridSearchCV(
                      estimator = LR_model, 
                      param_grid = parameters_LR_model,
                      scoring = score,    
                      #  refit = my_f2_scorer,
                      cv = cross_validation,
                      n_jobs = -1
                      )              
    %time clf.fit(X_train, y_train)
    LR_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(LR_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(LR_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(LR_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}LogisticRegression_optimisation.xlsx')

In [6]:
def knn_tuning(score, cross_validation, path):

    # KNeighborsClassifier: 
    # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

    knn_model = KNeighborsClassifier()

    # calibrate hyper-parameters: perform gridsearch with cross-validation = 5 
    clf = GridSearchCV(
                      estimator=knn_model, 
                      param_grid=parameters_knn,
                      scoring=score,
                      #  refit=my_f2_scorer,
                      cv=cross_validation,
                      n_jobs=-1
                      )              
    %time clf.fit(X_train, y_train)
    knn_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(knn_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(knn_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(knn_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}knn_optimisation.xlsx')

In [7]:
def random_forest_tuning(score, cross_validation, path):
      
    # RandomForestClassifier: 
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

    random_forest_model = RandomForestClassifier(random_state=10)

    # calibrate hyper-parameters: perform gridsearch with cross-validation = 5 
    clf = GridSearchCV(
                      estimator=random_forest_model, 
                      param_grid=parameters_random_forest_model,
                      scoring=score,  
                      #  refit=my_f2_scorer,
                      cv=cross_validation,
                      n_jobs=-1
                      )              
    %time clf.fit(X_train, y_train)
    random_forest_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(random_forest_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(random_forest_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(random_forest_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}randomforest_optimisation.xlsx')

In [8]:
def svm_tuning(score, cross_validation, path):
      
    # SVM_model
    # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC 

    SVM_model = SVC()

    # calibrate hyper-parameters: perform gridsearch with cross-validation = 5 
    clf = GridSearchCV(
                      estimator=SVM_model, 
                      param_grid=parameters_svm,
                      scoring=score,  
                      # refit=score[0],
                      cv=cross_validation,
                      n_jobs=-1
                      )              
    %time clf.fit(X_train, y_train)
    SVM_model = clf.best_estimator_

    # save optimisation parameters
    optimisation_table = pd.DataFrame(clf.cv_results_)

    # add roc_auc fCV values
    optimisation_table['roc_auc'] = str(cross_val_score(SVM_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_mean'] = np.mean(cross_val_score(SVM_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table['roc_auc_std'] = np.std(cross_val_score(SVM_model, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    optimisation_table.to_excel(f'{path}svm_optimisation.xlsx')
    # files.download("/content/svm_optimisation.xlsx")

In [9]:
def catboost_tuning(catboost_score, cross_validation, path):
    # Catboost
    # tuning: https://catboost.ai/en/docs/concepts/parameter-tuning


    # calibrate hyper-parameters: perform gridsearch with cross-validation = 5 

    np.random.seed(10)
    catboost = CatBoostClassifier(
                                eval_metric=catboost_score,
                                verbose=False,
                                early_stopping_rounds=100,
                                #cat_features=cat_features,
                                task_type="CPU",
                                iterations = 500,
                                random_seed=10)


    grid_res = catboost.grid_search(catboost_parameters,
                                    X_train,
                                    y_train,
                                    cv=cross_validation,
                                    search_by_train_test_split=True,
                                    calc_cv_statistics=True,
                                    refit=True,
                                    shuffle=True,
                                    partition_random_seed=10,
                                    verbose=True,
                                    stratified=True)

    # save optimisation parameters
    cv_results = pd.DataFrame(grid_res['cv_results'])
    cv_results['params'] = 0
    cv_results['params'][0:3] = grid_res['params']


    # add roc_auc fCV values
    cv_results['roc_auc'] = str(cross_val_score(catboost, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    cv_results['roc_auc_mean'] = np.mean(cross_val_score(catboost, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))
    cv_results['roc_auc_std'] = np.std(cross_val_score(catboost, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc'))

    cv_results.to_excel(f'{path}catboost_optimisation.xlsx')

#### Function: Optimised table of metrics 

In [10]:
def optimised_metrics_table(model_name):
    optimised_metrics = []
    # optimised_metrics.append(mcc(y_test, forecast))        #.round(3)                          # MCC
    optimised_metrics.append(f1(y_test, forecast).round(3))                                    # F1
    optimised_metrics.append(f2_func(y_test, forecast).round(3))                               # F2
    optimised_metrics.append(accuracy(y_test, forecast).round(3))                              # Accuracy
    optimised_metrics.append(TP_rate(y_test, forecast).round(3))                               # TP rate
    optimised_metrics.append(recall(y_test, forecast).round(3))                                # TN rate
    precision, recall_, thresholds = precision_recall_curve(y_test, forecast_proba)                  # ------
    optimised_metrics.append(auc(recall_, precision).round(3))                                 # PR AUC
    optimised_metrics.append(roc_auc(y_test, forecast_proba).round(3))                               # ROC AUC
    optimised_metrics.append(confusion_matrix(y_test, forecast).ravel()[0])                    # number of true negative
    optimised_metrics.append(confusion_matrix(y_test, forecast).ravel()[1])                    # number of false positive
    optimised_metrics.append(confusion_matrix(y_test, forecast).ravel()[2])                    # number of false negative
    optimised_metrics.append(confusion_matrix(y_test, forecast).ravel()[3])                    # number of true positive

    optimised_metrics = pd.DataFrame(optimised_metrics, columns=[model_name])
    # add rows names
    optimised_metrics.index = [
                              #  "MCC",
                               "F1", "F2",
                               "Accuracy",
                               "Precision",
                               "Recall",
                               "PR_AUC",
                               "ROC_AUC",
                               "TN", "FP", "FN", "TP"
                                ]
    optimised_metrics = optimised_metrics.T
    
    return optimised_metrics

#### Function metric_table - visualise model scores

In [11]:
'''
This function provides scores for gridsearch F1-score and metrics for test dataset
'''

def metric_table(path):  #, X_train=X_train, y_train=y_train

    # read gridsearch tables
    randomforest_optimisation = pd.read_excel(f'{path}randomforest_optimisation.xlsx', header=[0]) #/content/  ./imp_feat
    svm_optimisation = pd.read_excel(f'{path}svm_optimisation.xlsx', header=[0])
    knn_optimisation = pd.read_excel(f'{path}knn_optimisation.xlsx', header=[0])
    LogisticRegression_optimisation = pd.read_excel(f'{path}LogisticRegression_optimisation.xlsx', header=[0])
    catboost_optimisation = pd.read_excel(f'{path}catboost_optimisation.xlsx', header=[0])

    params = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
    params = ast.literal_eval(params[0])
    random_forest_model = RandomForestClassifier(**params)
    # 
    params = svm_optimisation[svm_optimisation['rank_test_score']==1][["params"]].iloc[0]
    params = ast.literal_eval(params[0])
    SVM_model = SVC(**params)
    # 
    # params = nn_optimisation[nn_optimisation['rank_test_score']==1][["params"]].iloc[0]
    # params = ast.literal_eval(params[0])
    # newral_network_model = MLPClassifier(**params)
    # 
    params = knn_optimisation[knn_optimisation['rank_test_score']==1][["params"]].iloc[0]
    params = ast.literal_eval(params[0])
    knn_model = KNeighborsClassifier(**params)
    # 
    params = LogisticRegression_optimisation[LogisticRegression_optimisation['rank_test_score']==1][["params"]].iloc[0]
    params = ast.literal_eval(params[0])
    LR_model = LogisticRegression(**params)
    # 
    params = catboost_optimisation['params'][0]
    params = ast.literal_eval(params)
    catboost_model = CatBoostClassifier(**c_boost_params, **params)

    models = [
    random_forest_model,
    SVM_model,
    # newral_network_model,
    LR_model,
    knn_model,
    catboost_model
    ]

    mcc_score, f1_score,f2_score, accuracy_score, TP_rate_score, recall_score, auc_precision_recall, roc_auc_score= [], [], [], [], [], [], [], []
    tn, fp, fn, tp = [], [], [], []

    for model in models:
        model.fit(X_train, y_train)
        forecast = model.predict(X_test)
        forecast_proba = model.predict_proba(X_test)

        # mcc_score.append(mcc(y_test, forecast))                                   # MCC
        f1_score.append(f1(y_test, forecast))                                       # F1
        f2_score.append(f2_func(y_test, forecast))                                  # F1
        accuracy_score.append(accuracy(y_test, forecast))                           # Accuracy  
        TP_rate_score.append(TP_rate(y_test, forecast))                             # TP rate   tp / (tp + fp)
        recall_score.append(recall(y_test, forecast))                               # TN rate
        auc_precision_recall.append(average_precision_score(y_test, forecast_proba[:,1]))      # PR AUC
        roc_auc_score.append(roc_auc(y_test, forecast_proba[:,1]))                       # ROC AUC
        tn.append(confusion_matrix(y_test, forecast).ravel()[0])                  # number of true negative
        fp.append(confusion_matrix(y_test, forecast).ravel()[1])                  # number of false positive
        fn.append(confusion_matrix(y_test, forecast).ravel()[2])                  # number of false negative
        tp.append(confusion_matrix(y_test, forecast).ravel()[3])                  # number of true positive

    # create matrix table 
    metrics_table = pd.DataFrame(columns=pd.MultiIndex.from_product([["F2, train set, cv=5"],["mean", 'std']]))
    # metrics_table[("Scores on the test set","MCC")] = mcc_score
    metrics_table[("Scores on the test set","F1")] = f1_score
    metrics_table[("Scores on the test set","F2")] = f2_score
    metrics_table[("Scores on the test set","Accuracy")] = accuracy_score
    metrics_table[("Scores on the test set","Precision")] = TP_rate_score
    metrics_table[("Scores on the test set","Recall")] = recall_score
    metrics_table[("Scores on the test set","PR_AUC")] = auc_precision_recall
    metrics_table[("Scores on the test set","ROC_AUC")] = roc_auc_score
    metrics_table[("Confusion matrix","TN")] = tn
    metrics_table[("Confusion matrix","FP")] = fp
    metrics_table[("Confusion matrix","FN")] = fn
    metrics_table[("Confusion matrix","TP")] = tp

    # modify the rows names
    metrics_table.index = [
                "Random Forest",
                "SVM",
                # "Multi-layer Perceptron",
                "Logistic Regression",
                "KNN",
                "CatBoost"
                ]



    # add cross validated F2 scores on the train set
    mean = []
    std = []
    mean_test_f1,std_test_f1 = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    mean.append(mean_test_f1); std.append(std_test_f1)
    mean_test_f1,std_test_f1 = svm_optimisation[svm_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    mean.append(mean_test_f1); std.append(std_test_f1)
    # mean_test_f1,std_test_f1 = nn_optimisation[nn_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    # mean.append(mean_test_f1); std.append(std_test_f1)
    mean_test_f1,std_test_f1 = LogisticRegression_optimisation[LogisticRegression_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    mean.append(mean_test_f1); std.append(std_test_f1)
    mean_test_f1,std_test_f1 = knn_optimisation[knn_optimisation['rank_test_score']==1][["mean_test_score","std_test_score"]].iloc[0]
    mean.append(mean_test_f1); std.append(std_test_f1)
    mean_test_f1,std_test_f1 = catboost_optimisation[['test-F:beta=2-mean', 'test-F:beta=2-std']].iloc[catboost_optimisation.shape[0]-1]
    mean.append(mean_test_f1); std.append(std_test_f1)
    

    metrics_table[("F2, train set, cv=5","mean")] = mean
    metrics_table[("F2, train set, cv=5","std")] = std

    # add cross validated F2 scores on the train set
    mean_roc_auc = []
    std_roc_auc = []
    mean_test_roc_auc,std_test_roc_auc = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)
    mean_test_roc_auc,std_test_roc_auc = svm_optimisation[svm_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)
    # mean_test_roc_auc,std_test_roc_auc = nn_optimisation[nn_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    # mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)
    mean_test_roc_auc,std_test_roc_auc = LogisticRegression_optimisation[LogisticRegression_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)
    mean_test_roc_auc,std_test_roc_auc = knn_optimisation[knn_optimisation['rank_test_score']==1][["roc_auc_mean","roc_auc_std"]].iloc[0]
    mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)
    mean_test_roc_auc,std_test_roc_auc = catboost_optimisation[['test-F:beta=2-mean', 'test-F:beta=2-std']].iloc[catboost_optimisation.shape[0]-1]
    mean_roc_auc.append(mean_test_roc_auc); std_roc_auc.append(std_test_roc_auc)

    metrics_table[("ROC_AUC, train set, cv=5","mean")] = mean_roc_auc
    metrics_table[("ROC_AUC, train set, cv=5","std")] = std_roc_auc

    return metrics_table

#### Define $F_2$ metric

In [12]:
def f2_func(y_true, y_pred):
    f2_score = fbeta_score(y_true, y_pred, beta=2.)
    return f2_score

def my_f2_scorer():
    return make_scorer(f2_func)

### Feature selection

In [None]:
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# clf = RandomForestClassifier(max_depth=2, random_state=0)
# clf.fit(X_train, y_train)


# sfs1 = SFS(clf, 
#            k_features=10, 
#            forward=True, 
#            floating=False, 
#            verbose=2,
#            scoring='roc_auc',
#            cv=5,
#            n_jobs=-1)

# sfs1 = sfs1.fit(X_train, y_train)
# sfs1.subsets_

#### random_forest_importances

In [None]:
def random_forest_importances(path, n_features = 10, biomarkers=False):
    feature_importances = pd.DataFrame()

    randomforest_optimisation = pd.read_excel(f'{path}randomforest_optimisation.xlsx', header=[0])


    # feature_importances.columns = ['Feature']
    params = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
    params = ast.literal_eval(params[0])
    random_forest_model = RandomForestClassifier(**params)
    random_forest_model.fit(X_train, y_train)
    feature_importances["RandomForest"] = pd.Series(random_forest_model.feature_importances_)

    if biomarkers:
        feats = list(map(lambda x: x, list(X_train.columns)))
    else:
        feats = list(map(lambda x: x[1], list(X_train.columns)))

    feature_importances.index = feats

    feature_importances = feature_importances.sort_values("RandomForest", ascending=False)
    feature_importances['RandomForest'] = feature_importances['RandomForest']/feature_importances['RandomForest'][0]
    print()
    plt.barh(feature_importances.index[:n_features][::-1], feature_importances['RandomForest'][:n_features][::-1])
    plt.show()

    return feature_importances

#### Function: upload_models(x_data, y_data, path, model_list)

In [None]:
def upload_models(x_data, y_data, path, model_list):
    'Return list of trained models'

    models = []
    model_names = []

    if model_list['SVM']:
        svm_optimisation = pd.read_excel(f'{path}svm_optimisation.xlsx', header=[0])
        params = svm_optimisation[svm_optimisation['rank_test_score']==1][["params"]].iloc[0]
        params = ast.literal_eval(params[0])
        SVM_model = SVC(**params)
        SVM_model.fit(X_train, y_train)
        models.append(SVM_model)
        model_names.append('SVM')

    if model_list['Logistic']:
        LogisticRegression_optimisation = pd.read_excel(f'{path}LogisticRegression_optimisation.xlsx', header=[0])
        params = LogisticRegression_optimisation[LogisticRegression_optimisation['rank_test_score']==1][["params"]].iloc[0]
        params = ast.literal_eval(params[0])
        LR_model = LogisticRegression(**params)
        LR_model.fit(X_train, y_train)
        models.append(LR_model)
        model_names.append('Logistic')

    if model_list['RandomForest']:
        randomforest_optimisation = pd.read_excel(f'{path}randomforest_optimisation.xlsx', header=[0])
        params = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
        params = ast.literal_eval(params[0])
        random_forest_model = RandomForestClassifier(**params)
        random_forest_model.fit(X_train, y_train)
        models.append(random_forest_model)
        model_names.append('RandomForest')

    if model_list['KNN']:
        knn_optimisation = pd.read_excel(f'{path}knn_optimisation.xlsx', header=[0])
        params = knn_optimisation[knn_optimisation['rank_test_score']==1][["params"]].iloc[0]
        params = ast.literal_eval(params[0])
        knn_model = KNeighborsClassifier(**params)
        knn_model.fit(X_train, y_train)
        models.append(knn_model)
        model_names.append('KNN')
        # https://catboost.ai/en/docs/concepts/fstr
    
    if model_list['Catboost']:
        catboost_optimisation = pd.read_excel(f'{path}catboost_optimisation.xlsx', header=[0])
        params = catboost_optimisation['params'][0]
        params = ast.literal_eval(params)
        catboost_model = CatBoostClassifier(**c_boost_params, **params)
        catboost_model.fit(X_train, y_train)
        models.append(catboost_model)
        model_names.append('Catboost')
        # https://catboost.ai/en/docs/concepts/fstr



    return models, model_names

#### Function: feature_selection(dataset, x_data, y_data, path)

In [None]:
def feature_selection(x_data, y_data, path):
    'Return dataset with ranged selected features'

    # get list of all column names and continuous column names
    '___________________________________________________________________________'
    all_cols = list(x_data.columns)
    continuous_cols = [col for col in x_data.columns if (len((x_data[col].unique())) >= 7)]
    feature_selection_dataset = pd.DataFrame(columns=pd.MultiIndex.from_product([["LASSO"],["coef"]]))


    # LASSO
    '___________________________________________________________________________'
    search = GridSearchCV(Lasso(),
                          {'alpha':np.linspace(0.1, 1, num=10)**2}, #  np.linspace(0.1, 1, num=10)**2 np.arange(0.1,10,0.1)
                          cv = 5, 
                          scoring=my_f2_scorer(),  #"neg_mean_squared_error" my_f2_scorer() 'f1'
                          verbose=0
                          )

    search.fit(X_train, y_train)
    feature_selection_dataset['LASSO', 'coef'] = np.abs(search.best_estimator_.coef_)
    # feature_selection_dataset['LASSO', 'coef'][feature_selection_dataset['LASSO', 'coef']>0] = 1
    print("Calculated LASSO")


    # get all trained models
    '___________________________________________________________________________'
    models, model_names = upload_models(x_data = x_data,
                                        y_data = y_data,
                                        path = path, 
                                        model_list = {'SVM': True, 
                                                      'Logistic': True, 
                                                      'RandomForest': True, 
                                                      'KNN': True, 
                                                      'Catboost': True})


    # Sequencial feature selection
    '___________________________________________________________________________'
    
    sfs1 = SFS_xtend(knn, 
                  k_features=20, 
                  forward=True, 
                  floating=True, 
                  verbose=2,
                  direction = 'forward',
                  cv = StratifiedKFold(5),
                  scoring=my_f2_scorer(),
                  n_jobs=-1)

    sfs1 = sfs1.fit(X, y)

    for number in [0,1]:
        sfs = SequentialFeatureSelector(estimator = models[number],
                                        n_features_to_select=None,
                                        cv = StratifiedKFold(5),
                                        scoring = my_f2_scorer(), 
                                        direction = 'backward',
                                        n_jobs=-1
                                        )
        sfs.fit(X_train, y_train)
        feature_selection_dataset['SFS', model_names[number]] = sfs.get_support()*1
    print("Calculated SFS")
    # # Recursive feature elimination with cross validation  - плохо отбирает для RandomForest, Catboost
    # '___________________________________________________________________________'
    # # models with feature importance do not have to perform SFS.    
    # for number in [1,2,4]:
    #     rfecv = RFECV(estimator = models[number],
    #                                     # n_features_to_select=None,
    #                                     cv = StratifiedKFold(5),
    #                                     scoring = my_f2_scorer(),
    #                                     n_jobs=-1
    #                                     )
    #     rfecv.fit(X_train, y_train)
    #     feature_selection_dataset['RFECV', model_names[number]] = rfecv.get_support()*1
    # print("Calculated RFECV")

    # model importances
    '___________________________________________________________________________'
    feature_selection_dataset['Importances', 'RandomForest'] = models[2].feature_importances_
    feature_selection_dataset['Importances', 'CatBoost'] = models[4].feature_importances_
    feature_selection_dataset['Importances', 'Logistic'] = np.abs(models[1].coef_[0])

    # # Drop-Column Importance
    # '___________________________________________________________________________'
    # """get score via Drop-Column Importance for models"""
    # for number in range(5):
    #     # clone the model to have the exact same specification as the one initially trained
    #     model_clone = clone(models[number])
    #     # set random_state for comparability
    #     model_clone.random_state = 37
    #     # training and scoring the benchmark model
    #     model_clone.fit(X_train, y_train)

    #     # benchmark_score = model_clone.score(X_train, y_train)
    #     y_pred = model_clone.predict(X_test)
    #     benchmark_score = f2_func(y_test, y_pred)

    #     # list for storing feature importances
    #     importances = []
        
    #     # iterating over all columns and storing feature importance (difference between benchmark and new model)
    #     for col in X_train.columns:
    #         model_clone = clone(models[number])
    #         model_clone.random_state = random_state
    #         model_clone.fit(X_train.drop(col, axis = 1), y_train)
    #         # drop_col_score = model_clone.score(X_train.drop(col, axis = 1), y_train)
    #         y_pred = model_clone.predict(X_test.drop(col, axis = 1))
    #         drop_col_score = f2_func(y_test, y_pred)
    #         importances.append(benchmark_score - drop_col_score)
        
    #     feature_selection_dataset[('Drop-Column Importance', model_names[number])] = importances
    

    # Set column names as index
    '___________________________________________________________________________'
    feature_selection_dataset.index = all_cols

    # # MRMR
    # '___________________________________________________________________________'
    # feature_selection_dataset[('MRMR', '')] = 0
    # selected_features = mrmr_classif(X=X_train, y=y_train, K=40)
    # feature_selection_dataset.loc[selected_features][('MRMR', '')] = 1


    # Unsupervised selection with Pearson correlation coefs
    '___________________________________________________________________________'
    # correlation_matrix = dataset[continuous_cols].corr( method='pearson').abs()
    # correlation_matrix  = pd.DataFrame(correlation_matrix)
    # # iteratively remove features that have correlation > 0.95
    # i=0
    # j=0
    # cols = correlation_matrix.shape[1]
    # rows = correlation_matrix.shape[0]

    # while i < cols: 
    #     while j < rows:
    #         if correlation_matrix.iloc[j,i]>0.95 and correlation_matrix.iloc[j,i]!=1:
    #             correlation_matrix.drop(index=correlation_matrix.index[j], inplace=True)
    #             correlation_matrix.drop(columns=correlation_matrix.columns[j], inplace=True)
    #         else:
    #             j+=1
    #         rows = correlation_matrix.shape[0]
    #     i+=1
    #     j=0
    #     cols = correlation_matrix.shape[1]

    # removed_after_unsupervised = list(set(continuous_cols) - set(correlation_matrix.columns))


    # Process data and download dataset
    '___________________________________________________________________________'
    # drop columns from unsupervised selection
    # feature_selection_dataset.drop(index = removed_after_unsupervised, inplace=True)

    # rank columns
    feature_selection_dataset['sum'] = feature_selection_dataset.apply((lambda x: x.iloc[:6].sum()), axis=1)
    feature_selection_dataset.sort_values('sum', inplace=True, ascending=True)

    feature_selection_dataset.to_excel(f'{path}feature_selection_dataset.xlsx')

    return feature_selection_dataset

# Lancet paper

Dataset ABC - all-cause death

##### Subset

In [13]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_lancet, header=[0], index_col=0, usecols=list(range(15)))
y_train = pd.read_excel(link_train_lancet, header=[0], index_col=0, usecols=[0,15])
X_test = pd.read_excel(link_test_lancet, header=[0], index_col=0, usecols=list(range(15)))
y_test = pd.read_excel(link_test_lancet, header=[0], index_col=0, usecols=[0,15])

# create path for saving results
optimisation_path = './HSE project/Optimisation data/lancet/ABC death/'
results_path = './HSE project/Graphics/lancet/'

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (450, 14)
y_train shape:	 (450, 1)
X_test shape:	 (105, 14)
y_test shape:	 (105, 1)


##### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

clear_output()

##### Metrics tables

In [14]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2, train set, cv=5","mean"), ("ROC_AUC, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

Unnamed: 0_level_0,"F2, train set, cv=5","F2, train set, cv=5",Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Scores on the test set,Confusion matrix,Confusion matrix,Confusion matrix,Confusion matrix,"ROC_AUC, train set, cv=5","ROC_AUC, train set, cv=5"
Unnamed: 0_level_1,mean,std,F1,F2,Accuracy,Precision,Recall,PR_AUC,ROC_AUC,TN,FP,FN,TP,mean,std
Random Forest,0.806,0.098,0.486,0.541,0.657,0.415,0.586,0.585,0.747,52,24,12,17,0.852,0.073
SVM,0.83,0.027,0.5,0.704,0.467,0.337,0.966,0.506,0.734,21,55,1,28,0.774,0.079
Logistic Regression,0.762,0.104,0.492,0.526,0.686,0.444,0.552,0.643,0.776,56,20,13,16,0.814,0.086
KNN,0.867,0.055,0.567,0.617,0.724,0.5,0.655,0.517,0.749,57,19,10,19,0.866,0.07
CatBoost,0.797,0.15,0.519,0.496,0.752,0.56,0.483,0.657,0.778,65,11,15,14,0.797,0.15


In [15]:
# what metric to evaluate
metric = ['F2','Precision','Recall','ROC_AUC']

table = pd.read_excel(f'{optimisation_path}metrics_table.xlsx', header=[0,1], index_col=[0]) 
datasets = pd.DataFrame(table.loc[:, ('Scores on the test set', metric)].values.round(3),columns=['F2','Precision','Recall','ROC AUC'])

models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']

# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# add error whiskers from gridsearchCV
# if 1:
#     # if metric == 'F2':
#     datasets_mean = pd.DataFrame()
#     datasets_mean[('F2, train set, cv=5', 'mean')] = list(table.loc[:, ('F2, train set, cv=5', 'mean')].values.round(3)) 
#     datasets_mean[('F2, train set, cv=5', 'std')] = list(table.loc[:, ('F2, train set, cv=5', 'std')].values.round(3)) 
#     # datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3)) 
#     fig.add_traces([go.Box(name=column, x=models, 
#                            y=datasets_mean.iloc[column, ('F2, train set, cv=5', 'mean')], 
#                            marker=dict(color="black"), 
#                            showlegend = False) for column in table.index])
#     fig.update_traces(
#     selector=dict(type="box"), # update only boxes
#     boxpoints="all", # show points
#     pointpos=0, # centered
#     jitter=0, # no jitter
#     line_color="rgba(255,255,255,0)", # hide box lines
#     fillcolor="rgba(255,255,255,0)", # hide box fill
    
#     )
    # fig.update_layout(boxmode="group",)

    # fig.add_traces([go.Bar(name=column, x=models, 
    #                        y=datasets_mean[column], 
    #                        xaxis="x2",  
    #                        error_y=dict(type='data',  
    #                                     array=datasets_std[column], 
    #                                     color="rgba(0,0,0,1)",
    #                                     thickness=1), 
    #                        marker=dict(opacity=0,
    #                                   #  color="rgba(255,255,255,0)"
    #                                    ), 
    #                        showlegend = False) for column in datasets.columns])  
    


# Change the bar mode
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Metric values', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group', 
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Metrics', y=0.5), 
                  title=dict(text=f'Test metrics: "lancet" subset (ABC)', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
if 1:
    fig.add_shape(type='line',
                  x0=-0.5,
                  y0=0.5,
                  x1=4.5,
                  y1=0.5,
                  line=dict(color='black',  width=2, dash='dot'),
                  xref='x',
                  yref='y',
                  layer='below')   
    
# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,)

fig.show(renderer='colab')

In [16]:
fig.write_image(f"{results_path}metrics.pdf", engine="kaleido")
fig.write_image(f"{results_path}/metrics.jpeg", engine="kaleido")

##### Feature selection

In [17]:
# Random Forest feature importances
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
feature_importances = pd.DataFrame()

randomforest_optimisation = pd.read_excel(f'{optimisation_path}randomforest_optimisation.xlsx', header=[0])


# feature_importances.columns = ['Feature']
params = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
params = ast.literal_eval(params[0])
random_forest_model = RandomForestClassifier(**params)
random_forest_model.fit(X_train, y_train)
feature_importances["RandomForest"] = pd.Series(random_forest_model.feature_importances_)

feats = list(map(lambda x: eval(x)[1], list(X_train.columns)))
feature_importances.index = feats

feature_importances = feature_importances.sort_values("RandomForest", ascending=False)
feature_importances['RandomForest'] = feature_importances['RandomForest']/feature_importances['RandomForest'][0]

feature_importances.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=["RandomForest"])\
                   .set_precision(3)

Unnamed: 0,RandomForest
МФА,1.0
Возраст,0.909
"Гемоглобин, г/л",0.786
СКФ EPI,0.415
ГБ,0.37
"Хсобщ, ммоль/л",0.249
Cегмент ST,0.232
ФВ ЛЖ,0.203
СД,0.025
Пол,0.014


In [18]:
data_from_paper = pd.DataFrame()
# https://ars.els-cdn.com/content/image/1-s2.0-S0140673620325198-mmc1.pdf
data_from_paper['Paper']= [0.77, 0.49, 1, 0.22, 0.1, 0.7, 0.1, 0.22, 0.09,  0.24, 0.09, 0.12, 0.11, 0.28]
data_from_paper.index = ['Возраст', 'СКФ EPI', 'ФВ ЛЖ', 'МФА', 'Хсобщ, ммоль/л', 'Гемоглобин, г/л', 'пост-ИМ','пост-стент','ГБ',  'СД', 'пост-ОНМК',  'Cегмент ST','Пол', 'пост-ВЧ-кровоизлияние']

In [19]:
result = pd.concat([feature_importances, data_from_paper], axis=1) 
result

Unnamed: 0,RandomForest,Paper
МФА,1.0,0.22
Возраст,0.909144,0.77
"Гемоглобин, г/л",0.785624,0.7
СКФ EPI,0.414865,0.49
ГБ,0.369983,0.09
"Хсобщ, ммоль/л",0.249126,0.1
Cегмент ST,0.23212,0.12
ФВ ЛЖ,0.202931,1.0
СД,0.025074,0.24
Пол,0.013807,0.11


In [20]:

result['RandomForest'] = result['RandomForest']/result['RandomForest'][0]
result.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=["RandomForest", "Paper"])\
                   .set_precision(3)

Unnamed: 0,RandomForest,Paper
МФА,1.0,0.22
Возраст,0.909,0.77
"Гемоглобин, г/л",0.786,0.7
СКФ EPI,0.415,0.49
ГБ,0.37,0.09
"Хсобщ, ммоль/л",0.249,0.1
Cегмент ST,0.232,0.12
ФВ ЛЖ,0.203,1.0
СД,0.025,0.24
Пол,0.014,0.11


In [21]:
# what metric to evaluate
# metric = ['Эксперимент','Статья']

result.columns = ['RandomForest', 'Published paper']

features=result.index
# create the graph
fig = go.Figure(data=[go.Bar(name=i, x=list(result.index), 
                             y=result[i]) for i in result.columns ])

# Change the bar mode
fig.update_xaxes(title='Features')
fig.update_yaxes(title='Relative importance', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group', 
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Data', y=0.5), 
                  title=dict(text=f'Feature importance: comparison with lancet paper', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,)
  
fig.show(renderer='colab')

In [22]:
fig.write_image(f"{results_path}importance.pdf", engine="kaleido")
fig.write_image(f"{results_path}importance.jpeg", engine="kaleido")

##### Top features

##### Subset

In [None]:
optimisation_path = './HSE project/f2 optimised models/Lancet features/ABC death top features/'

In [None]:
# top_features
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
# divide into taget and features
dataset = data_12_columns.copy()
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]
target = data_12_columns.target

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

##### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True )

clear_output()

##### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

# Canadian paper

Dataset ABC

##### Subset

Features from [paper](https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5)

In [23]:
# choose the same columns as in reference table

canadian_features = [
    # 1. Target
      # - in reference: "if the patient died or survived before the end of the follow-up period, that was 130 days on average"
      # - in our case: Heart-desease-death, follow-up period - 4-155 months
    # ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть'),
    # 2. Serum creatinine, mg/dL
    ('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Креатинин, мкмоль/л'),
    # 3. Ejection fraction
    ('ИСХОДНАЯ ЭХОКГ', 'ФВ ЛЖ'),
    # 4. Age
    ('АНТРОПОФИЗИОМЕТРИЯ', 'Возраст'),
    # 7. Sex
    ('АНТРОПОФИЗИОМЕТРИЯ', 'Пол'),
    # 8. Anaemia
    ('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Анемия, степень'),
    # 9. High blood pressure
    ('АНТРОПОФИЗИОМЕТРИЯ', 'систол. АД'),
    # 10. Smoking
    # ('ПСИХОСОЦИАЛЬНЫЕ ФАКТОРЫ','Курение')
    # 12. Diabetes
    ('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'СД'),
    # (),
    # 5. Creatinine phosphokinase
    # data_12_columns['Creatinine phosphokinase'] = 
    # 6. Serum sodium
    # data_12_columns['Serum sodium']
    # 8. Platelets
    # data_12_columns['Platelets'] = 
]

canadian_2_features = [('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Креатинин, мкмоль/л'), ('ИСХОДНАЯ ЭХОКГ', 'ФВ ЛЖ')]

canadian_target = ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть')

In [24]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_abc, header=[0], index_col=0, usecols=list(range(57)))
y_train = pd.read_excel(link_train_death_abc, header=[0], index_col=0, usecols=[0, 57])
X_test =  pd.read_excel(link_test_death_abc,  header=[0], index_col=0, usecols=list(range(57)))
y_test =  pd.read_excel(link_test_death_abc,  header=[0], index_col=0, usecols=[0, 57])
X_train.columns = [eval(col) for col in X_train.columns]
X_test.columns =  [eval(col) for col in X_test.columns]
X_train = X_train[canadian_features]
X_test =  X_test[canadian_features]

# create path for saving results
optimisation_path = './HSE project/Optimisation data/canadian/ABC dataset/8 features/'
results_path =      './HSE project/Graphics/canadian/'

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (438, 7)
y_train shape:	 (438, 1)
X_test shape:	 (104, 7)
y_test shape:	 (104, 1)


##### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

##### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [26]:
# what metric to evaluate
metric = ['F2','Precision','Recall','ROC_AUC']


table = pd.read_excel(f'{optimisation_path}metrics_table.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
# datasets[list(datasets.columns)] = 
datasets = pd.DataFrame(table.loc[:, ('Scores on the test set', metric)].values.round(3),columns=['F2','Precision','Recall','ROC AUC'])
# if metric == 'F2':
#     datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3)) 
#     datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3)) 

# list of models
# standart models
models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
# ensemble models
# models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# add error whiskers from gridsearchCV
if metric == 'F2':
    fig.add_traces([go.Box(name=column, x=models, 
                           y=datasets_mean[column], 
                          #  xaxis="x1",  
                           
                           marker=dict(color="black"), 
                           showlegend = False) for column in datasets.columns])
    fig.update_traces(
    selector=dict(type="box"), # update only boxes
    boxpoints="all", # show points
    pointpos=0, # centered
    jitter=0, # no jitter
    line_color="rgba(255,255,255,0)", # hide box lines
    fillcolor="rgba(255,255,255,0)", # hide box fill
    
    )
    fig.update_layout(boxmode="group",)

    fig.add_traces([go.Bar(name=column, x=models, 
                           y=datasets_mean[column], 
                           xaxis="x2",  
                           error_y=dict(type='data',  
                                        array=datasets_std[column], 
                                        color="rgba(0,0,0,1)",
                                        thickness=1), 
                           marker=dict(opacity=0,
                                      #  color="rgba(255,255,255,0)"
                                       ), 
                           showlegend = False) for column in datasets.columns])  
    


# Change the bar mode
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Metric values', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group', 
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Metrics', y=0.5), 
                  title=dict(text=f'Test metrics: "canadian" subset (ABC)', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
if 1:
    fig.add_shape(type='line',
                  x0=-0.5,
                  y0=0.5,
                  x1=4.5,
                  y1=0.5,
                  line=dict(color='black',  width=2, dash='dot'),
                  xref='x',
                  yref='y',
                  layer='below')   
    
# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,)

fig.show(renderer='colab')

In [None]:
fig.write_image(f"{results_path}metrics.pdf", engine="kaleido")
fig.write_image(f"{results_path}metrics.jpeg", engine="kaleido")

##### Feature selection

In [None]:
# Эксперимент feature importances
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
feature_importances = pd.DataFrame()

randomforest_optimisation = pd.read_excel(f'{optimisation_path}randomforest_optimisation.xlsx', header=[0])

params = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
params = ast.literal_eval(params[0])
random_forest_model = RandomForestClassifier(**params)
random_forest_model.fit(X_train, y_train)


feature_importances["Эксперимент"] = pd.Series(random_forest_model.feature_importances_)
# Перепечатал из статьи
feature_importances['Эксперимент'] = feature_importances['Эксперимент']/feature_importances['Эксперимент'][0]
# feature_importances["Logistic regression"] = pd.Series(LR_model.coef_[0]).abs()
# feature_importances["SVM"] = pd.Series(SVM_model.coef_[0]).abs()


feats = list(map(lambda x: x[1], list(X_train.columns)))
feature_importances.index = feats

feature_importances = feature_importances.sort_values("Эксперимент", ascending=False)
feature_importances['Эксперимент'] = feature_importances['Эксперимент']/feature_importances['Эксперимент'][0]

feature_importances.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=["Эксперимент"], axis=0)\
                   .set_precision(3)

In [None]:
data_from_paper = pd.DataFrame()
data_from_paper['Статья'] = [11.84,  10.71, 8.58,  1.06, 1.13, 1.02, 1.12] # 
data_from_paper['Статья'] = data_from_paper['Статья']/data_from_paper['Статья'][0]
data_from_paper.index = ['Креатинин, мкмоль/л', 'ФВ ЛЖ', 'Возраст', 'Анемия, степень',   'систол. АД', 'СД', 'Пол']

In [None]:
feature_importances = pd.concat([feature_importances, data_from_paper], axis=1) 
feature_importances.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=["Эксперимент", "Статья"], axis=0)\
                   .set_precision(3)

In [None]:
# what metric to evaluate
metric = ['Эксперимент','Статья']
feature_importances.columns = ['RandomForest', 'Published paper']

feats = [str(feature[1]) for feature in feature_importances.index]
features=feature_importances.index

# create the graph
fig = go.Figure(data=[go.Bar(name=i, x=list(feature_importances.index), 
                             y=feature_importances[i]) for i in feature_importances.columns ])

# Change the bar mode
fig.update_xaxes(title='Features')
fig.update_yaxes(title='Relative importance', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group', 
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Data', y=0.5), 
                  title=dict(text=f'Feature importance: comparison with canadian paper', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)  

# figure size
fig.update_layout(
    autosize=False,
    width=1300,
    height=450,)
  
fig.show(renderer='colab')

In [None]:
fig.write_image(f"{results_path}importance.pdf", engine="kaleido")
fig.write_image(f"{results_path}importance.jpeg", engine="kaleido")

##### Two features

##### Subset

In [None]:
# choose the same columns as in reference table
data_2_columns = pd.DataFrame(data_abc[clinical_abc_columns][canadian_2_features])
data_2_columns['target', 'target'] = data_abc[canadian_target] 

data_2_columns.head(3)

In [None]:
# data has NAs
((data_12_columns == -1).sum())

In [None]:
# Drop these columns from the dataframe
data_2_columns = data_2_columns.replace(-1,np.nan)
data_2_columns.dropna(axis=0, how='any', inplace=True)
data_2_columns.target.value_counts()

In [None]:
data_2_columns.columns

In [None]:
fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(x=data_abc['ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Креатинин, мкмоль/л'], 
                         y=data_abc[('ИСХОДНАЯ ЭХОКГ', 'ФВ ЛЖ')],
                         marker_color=data_abc[('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть')],
                         mode='markers',
                         text= data_abc[('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть')]
                    # mode='markers',
                    # name='markers')
))

fig.update_traces(marker_size=10, selector=dict(type='scatter'))

fig.update_layout(
                  height=800, 
                  font_family="'Nunito', sans-serif",
                  # title={'text': "Correlations between sales and other columns", 'y':0.97, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'},
                  showlegend=True, 
                  margin=dict(l=40, r=10, t=60, b=60),
                  xaxis_range=[-5,10]

                  )

# fig = px.scatter(data_abc, x=['ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Креатинин, мкмоль/л'], y=['ИСХОДНАЯ ЭХОКГ', 'ФВ ЛЖ'], color=['target', ''],
#                 title="Automatic Labels Based on data_abc Frame Column Names")
fig.layout.template = 'plotly_dark'
fig.show(renderer='colab')

In [None]:
# divide into taget and features
dataset = data_2_columns[data_2_columns.columns.drop('target')]
target = data_2_columns.target

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
path_canadian_2_features_ABC = './HSE project/f2 optimised models/Canadian features/ABC dataset/2 features/'

##### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=path_canadian_2_features_ABC, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

##### Metrics tables

In [None]:
metrics_table = metric_table(path=path_canadian_2_features_ABC)

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

##### Feature selection

In [None]:
# Random Forest feature importances
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
feature_importances = pd.DataFrame()

randomforest_optimisation = pd.read_excel(f'{path_canadian_2_features_ABC}randomforest_optimisation.xlsx', header=[0])
svm_optimisation = pd.read_excel(f'{path_canadian_2_features_ABC}svm_optimisation.xlsx', header=[0])
LogisticRegression_optimisation = pd.read_excel(f'{path_canadian_2_features_ABC}LogisticRegression_optimisation.xlsx', header=[0])

# feature_importances.columns = ['Feature']
params = randomforest_optimisation[randomforest_optimisation['rank_test_score']==1][["params"]].iloc[0]
params = ast.literal_eval(params[0])
random_forest_model = RandomForestClassifier(**params)
random_forest_model.fit(X_train, y_train)
# 
# params = svm_optimisation[svm_optimisation['rank_test_score']==1][["params"]].iloc[0]
# params = ast.literal_eval(params[0])
# SVM_model = SVC(**params)
# 
params = LogisticRegression_optimisation[LogisticRegression_optimisation['rank_test_score']==1][["params"]].iloc[0]
params = ast.literal_eval(params[0])
LR_model = LogisticRegression(**params)
LR_model.fit(X_train, y_train)

feature_importances["Random Forest"] = pd.Series(random_forest_model.feature_importances_)
feature_importances["Logistic regression"] = pd.Series(LR_model.coef_[0]).abs()
# feature_importances["SVM"] = pd.Series(SVM_model.coef_[0]).abs()


feature_importances.index = list(X_train.columns)

feature_importances = feature_importances.sort_values("Random Forest", ascending=False)

feature_importances.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=["Random Forest", "Logistic regression"])\
                   .set_precision(3)

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = path_canadian_2_features_ABC)
df[df['sum'] >= 5]

# **Target**: Death from heart desease

In [None]:
target_column = ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть')
results_path = './HSE project/Graphics/cardiovascular death/'

## Biomarkers A

### Subset
### Split into train and test

In [None]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_a, header=[0], usecols=list(range(61,147)))
y_train = pd.read_excel(link_train_death_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_death_a, header=[0], usecols=list(range(61,147)))
y_test  = pd.read_excel(link_test_death_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers A/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics table

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)

### Feature selection

### Top feature optimisation and metrics

### subset

In [None]:
# top_features
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(col for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = biomarkers_a.copy()
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]
target = biomarkers_a.target

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/f2 optimised models/Biomarkers A/biomarkers top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features + Biomarkers A

#### Subset
#### Split into train and test

In [None]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_a, header=[0], usecols=list(range(1,147)))
y_train = pd.read_excel(link_train_death_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_death_a, header=[0], usecols=list(range(1,147)))
y_test  = pd.read_excel(link_test_death_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_a.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/f2 optimised models/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features A

#### Subset
#### Split into train and test

In [258]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_a, header=[0], usecols=list(range(1,61)))
y_train = pd.read_excel(link_train_death_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_death_a, header=[0], usecols=list(range(1,61)))
y_test  = pd.read_excel(link_test_death_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical A/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (150, 60)
y_train shape:	 (150, 1)
X_test shape:	 (49, 60)
y_test shape:	 (49, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
# %%time 
# df = feature_selection( 
#                         x_data = X_train,
#                         y_data = y_train,
#                         path = optimisation_path)
# df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_a.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/f2 optimised models/Clinical A/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Biomarkers B

### Subset
### Split into train and test

In [266]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_b, header=[0], usecols=list(range(72,78)))
y_train = pd.read_excel(link_train_death_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_death_b, header=[0], usecols=list(range(72,78)))
y_test  = pd.read_excel(link_test_death_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers B/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (94, 6)
y_train shape:	 (94, 1)
X_test shape:	 (30, 6)
y_test shape:	 (30, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

In [None]:
# top_features
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# top_features = list(eval(col) for col in dataset.index[:10])
# top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

## Clinical features + Biomarkers B

#### Subset
#### Split into train and test

In [271]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_b, header=[0], usecols=list(range(1,78)))
y_train = pd.read_excel(link_train_death_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_death_b, header=[0], usecols=list(range(1,78)))
y_test  = pd.read_excel(link_test_death_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (94, 77)
y_train shape:	 (94, 1)
X_test shape:	 (30, 77)
y_test shape:	 (30, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              # dataset = clinical_and_biomarkers_b,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

### Top feature optimisation and metrics

#### subset

In [None]:
# dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=0, index_col=[0])
# top_features = [eval(col) for col in dataset[dataset['sum']>=4].index]
# top_features 

In [None]:
# top_features
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_b.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.75,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/f2 optimised models/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features B

#### Subset
#### Split into train and test

In [270]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_b, header=[0], usecols=list(range(1,72)))
y_train = pd.read_excel(link_train_death_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_death_b, header=[0], usecols=list(range(1,72)))
y_test  = pd.read_excel(link_test_death_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical B/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (94, 71)
y_train shape:	 (94, 1)
X_test shape:	 (30, 71)
y_test shape:	 (30, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              # dataset = clinical_and_biomarkers_b,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

### Top feature optimisation and metrics

### subset

In [None]:
# dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=0, index_col=[0])
# top_features = [eval(col) for col in dataset[dataset['sum']>=4].index]
# top_features 

In [None]:
# top_features
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_b.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.75,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/f2 optimised models/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Biomarkers C

### Subset
### Split into train and test

In [281]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_c, header=[0], usecols=list(range(101,106)))
y_train = pd.read_excel(link_train_death_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_death_c, header=[0], usecols=list(range(101,106)))
y_test  = pd.read_excel(link_test_death_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers C/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (160, 5)
y_train shape:	 (160, 1)
X_test shape:	 (43, 5)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              # dataset = biomarkers_b,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

## Clinical features + Biomarkers C

#### Subset
#### Split into train and test

In [280]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_c, header=[0], usecols=list(range(1,106)))
y_train = pd.read_excel(link_train_death_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_death_c, header=[0], usecols=list(range(1,106)))
y_test  = pd.read_excel(link_test_death_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (160, 105)
y_train shape:	 (160, 1)
X_test shape:	 (43, 105)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              # dataset = clinical_and_biomarkers_b,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

## Clinical features C

#### Subset
#### Split into train and test

In [None]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_c, header=[0], usecols=list(range(1,101)))
y_train = pd.read_excel(link_train_death_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_death_c, header=[0], usecols=list(range(1,101)))
y_test  = pd.read_excel(link_test_death_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical C/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
# %%time 
# df = feature_selection(
#                               # dataset = clinical_and_biomarkers_b,
#                               x_data = X_train,
#                               y_data = y_train,
#                               path = optimisation_path)
# df.tail(40)

## Clinical features A-B-C

#### Subset
#### Split into train and test

In [None]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_death_c, header=[0], usecols=list(range(1,57)))
y_train = pd.read_excel(link_train_death_c, header=[0], usecols=[57])
X_test  = pd.read_excel(link_test_death_c, header=[0], usecols=list(range(1,57)))
y_test  = pd.read_excel(link_test_death_c, header=[0], usecols=[57])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/cardiovascular death/Clinical ABC/all clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              # dataset = clinical_and_biomarkers_b,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

### Top feature optimisation and metrics

#### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
[str(eval(feature)[1]) for feature in dataset.index[:10]]

In [None]:
dataset['Importances', 'RandomForest']

In [None]:
# what metric to evaluate
metric = ['Эксперимент','Статья']

dataset['Importances', 'RandomForest'] = dataset['Importances', 'RandomForest']/dataset['Importances', 'RandomForest'].max()
feats = [str(eval(feature)[1]) for feature in dataset.index[:15]]
features=dataset.index
# ensemble models
# models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

# create the graph
fig = go.Figure(data=[go.Bar(name=i, x=feats, 
                             y=dataset['Importances', 'RandomForest'])])

# add error whiskers from gridsearchCV
# i

# Change the bar mode
fig.update_xaxes(title='Модели')
fig.update_yaxes(title='', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group', 
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="h", title='Данные'), 
                  title=dict(text=f'Значения метрик для выбранных моделей', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
# if 1:
#     fig.add_shape(type='line',
#                     x0=-0.5,
#                     y0=0.5,
#                     x1=4.5,
#                     y1=0.5,
#                     line=dict(color='firebrick',  width=2, dash='dot'),
#                     xref='x',
#                     yref='y')   

fig.show(renderer='colab')

In [None]:
# divide into taget and features
dataset = clinical_abc.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/f2 optimised models/Clinical ABC/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Results

### Scores of models

In [283]:
# what metric to evaluate
metric = 'ROC_AUC'

# list of paths
paths = [ 
        './HSE project/Optimisation data/cardiovascular death/Biomarkers A/all biomarkers/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers B/all biomarkers/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers C/all biomarkers/',
        './HSE project/Optimisation data/cardiovascular death/Clinical ABC/all clinical/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers B + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Biomarkers C + Clinical/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Clinical A/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Clinical B/all biomarkers and clinical/',
        './HSE project/Optimisation data/cardiovascular death/Clinical C/all biomarkers and clinical/',
        ]

# create datframe for scores
datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])

# get dataframe with with scores of models from different datasets
for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
    datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) 
    if metric == 'F2':
        datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3)) 
        datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3)) 

# list of models
# standart models
models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
# ensemble models
# models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# add error whiskers from gridsearchCV
if metric == 'F2':
    fig.add_traces([go.Box(name=column, x=models, 
                           y=datasets_mean[column], 
                          #  xaxis="x1",  
                           
                           marker=dict(color="black"), 
                           showlegend = False) for column in datasets.columns])
    fig.update_traces(
    selector=dict(type="box"), # update only boxes
    boxpoints="all", # show points
    pointpos=0, # centered
    jitter=0, # no jitter
    line_color="rgba(255,255,255,0)", # hide box lines
    fillcolor="rgba(255,255,255,0)", # hide box fill
    
    )
    fig.update_layout(boxmode="group",)

    fig.add_traces([go.Bar(name=column, x=models, 
                           y=datasets_mean[column], 
                           xaxis="x2",  
                           error_y=dict(type='data',  
                                        array=datasets_std[column], 
                                        color="rgba(0,0,0,1)",
                                        thickness=1), 
                           marker=dict(opacity=0,
                                      #  color="rgba(255,255,255,0)"
                                       ), 
                           showlegend = False) for column in datasets.columns])  
    


# Change the bar mode
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Score', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group', 
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Datasets'), 
                  title=dict(text=f'{metric} values', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
if metric == 'ROC_AUC':
    fig.add_shape(type='line',
                    x0=-0.5,
                    y0=0.5,
                    x1=4.5,
                    y1=0.5,
                    line=dict(color='firebrick',  width=2, dash='dot'),
                    xref='x',
                    yref='y')   

fig.show(renderer='colab')

### Compare with Top 10

In [None]:
# what metric to evaluate
# ROC_AUC F1 F2
metric = 'F2'

# list of paths
paths = [
        './HSE project/f2 optimised models/Biomarkers A/all biomarkers/',
        './HSE project/f2 optimised models/Biomarkers B/all biomarkers/',
        './HSE project/f2 optimised models/Biomarkers C/all biomarkers/',
        './HSE project/f2 optimised models/Clinical ABC/all clinical/',
        './HSE project/f2 optimised models/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/f2 optimised models/Biomarkers B + Clinical/all biomarkers and clinical/',
        './HSE project/f2 optimised models/Biomarkers C + Clinical/all biomarkers and clinical/'
        ]

# list of paths of top 10
paths_top = [
        './HSE project/f2 optimised models/Biomarkers A/biomarkers top features/',
        './HSE project/f2 optimised models/Biomarkers B/biomarkers top features/',
        './HSE project/f2 optimised models/Biomarkers C/biomarkers top features/',
        './HSE project/f2 optimised models/Clinical ABC/top features/',
        './HSE project/f2 optimised models/Biomarkers A + Clinical/top features/',
        './HSE project/f2 optimised models/Biomarkers B + Clinical/top features/',
        './HSE project/f2 optimised models/Biomarkers C + Clinical/top features/'
        ]

# create datframe for scores
datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])

# get dataframe with with scores of models from different datasets
for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) 
    table_top = pd.read_excel(f'{paths_top[i]}metrics_table.xlsx', header=[0,1], index_col=[0])
    datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) + \
                                    list(table_top.loc[:, ('Scores on the test set', metric)].values.round(3))
    if metric == 'F2':
        datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3)) + list(table_top.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3))
        datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3)) + list(table_top.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3))

# list of models
# standart models
models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost'] + ['RandomForest top 10', 'SVM top 10', 'Logistic Regression top 10', 'KNN top 10', 'CatBoost top 10']


# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# add error whiskers from gridsearchCV
if metric == 'F2':
    fig.add_traces([go.Box(name=column, x=models, 
                           y=datasets_mean[column], 
                          #  xaxis="x1",  
                           
                           marker=dict(color="black"), 
                           showlegend = False) for column in datasets.columns])
    fig.update_traces(
    selector=dict(type="box"), # update only boxes
    boxpoints="all", # show points
    pointpos=0, # centered
    jitter=0, # no jitter
    line_color="rgba(255,255,255,0)", # hide box lines
    fillcolor="rgba(255,255,255,0)", # hide box fill
    
    )
    fig.update_layout(boxmode="group",)

    fig.add_traces([go.Bar(name=column, x=models, 
                           y=datasets_mean[column], 
                           xaxis="x2",  
                           error_y=dict(type='data',  
                                        array=datasets_std[column], 
                                        color="rgba(0,0,0,1)",
                                        thickness=1), 
                           marker=dict(opacity=0,
                                      #  color="rgba(255,255,255,0)"
                                       ), 
                           showlegend = False) for column in datasets.columns])  
    
# Change the bar mode
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Score', range=[0., 1.0])
fig.update_layout(barmode='group', 
                  xaxis2={"overlaying": "x", "range": [-0.525, 9.525], "showticklabels": False},
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Datasets'), 
                  title=dict(text=f'{metric} score', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
if metric == 'ROC_AUC':
    fig.add_shape(type='line',
                    x0=-0.5,
                    y0=0.5,
                    x1=9.5,
                    y1=0.5,
                    line=dict(color='firebrick',  width=2, dash='dot'),
                    xref='x',
                    yref='y')   

fig.show(renderer='colab')

### Feature selection

In [None]:
features = pd.DataFrame(columns=['features', 
                                #  'biomarkers A',
                                #  'biomarkers B',
                                #  'biomarkers C',
                                 'Clinical+biomarkers ABC',
                                 'Clinical+biomarkers A',
                                 'Clinical+biomarkers B',
                                #  'Clinical+biomarkers C',
                                 ])
# list of paths
paths = [
        # './HSE project/f2 optimised models/Biomarkers A/all biomarkers/',
        # './HSE project/f2 optimised models/Biomarkers B/all biomarkers/',
        # './HSE project/f2 optimised models/Biomarkers C/all biomarkers/',
        './HSE project/f2 optimised models/Clinical ABC/all clinical/',
        './HSE project/f2 optimised models/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/f2 optimised models/Biomarkers B + Clinical/all biomarkers and clinical/',
        # './HSE project/f2 optimised models/Biomarkers C + Clinical/all biomarkers and clinical/'
        ]

# get dataframe with with scores of models from different datasets
top_features = []
for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
    table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
    # if i < 3: 
    #     top_features = top_features+list(str(col) for col in table.index[:10])
    # else:    
    top_features = top_features+list(eval(col)[1] for col in table.index[:10])

features['features'] = list(set(top_features))
features.index = list(set(top_features))
features.fillna(0, inplace=True)

for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
    table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
    # if i < 3: 
    #     features.loc[list(str(col) for col in table.index[:10]), features.columns[i+1]] = 1
    # else:    
        # top_features = top_features+list(eval(col) for col in table.index[:10])   
    features.loc[list(eval(col)[1] for col in table.index[:10]), features.columns[i+1]] = 1 

features['features'] = features.iloc[:,1:].apply((lambda x: x.sum()), axis=1)
features.sort_values(ascending=False,  inplace=True, by=("features"))
features.columns = ['sum'] + list(features.columns[1:])
features.to_excel('./HSE project/f2 optimised models/feature_selection.xlsx')
features

# **Target**: Revascularization

In [None]:
target_column = ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Повторная реваскуляризация') revascularization

## Biomarkers A

### Subset
### Split into train and test

In [27]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=list(range(61,147)))
y_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=list(range(61,147)))
y_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers A/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (122, 86)
y_train shape:	 (122, 1)
X_test shape:	 (33, 86)
y_test shape:	 (33, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics table

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                              # dataset = biomarkers_a,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
# top_features
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(col for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = biomarkers_a.copy()
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]
target = biomarkers_a.target

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/revascularization/Biomarkers A/biomarkers top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')

# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features + Biomarkers A

#### Subset
#### Split into train and test

In [29]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=list(range(1,147)))
y_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=list(range(1,147)))
y_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (122, 146)
y_train shape:	 (122, 1)
X_test shape:	 (33, 146)
y_test shape:	 (33, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_a.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/revascularization/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features A

#### Subset
#### Split into train and test

In [31]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=list(range(1,61)))
y_train = pd.read_excel(link_train_revascularization_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=list(range(1,61)))
y_test  = pd.read_excel(link_test_revascularization_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical A/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (122, 60)
y_train shape:	 (122, 1)
X_test shape:	 (33, 60)
y_test shape:	 (33, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_a.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/revascularization/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Biomarkers B

### Subset
### Split into train and test

In [33]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=list(range(72,78)))
y_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=list(range(72,78)))
y_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers B/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 6)
y_train shape:	 (86, 1)
X_test shape:	 (29, 6)
y_test shape:	 (29, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

In [None]:
# top_features
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# top_features = list(eval(col) for col in dataset.index[:10])
# top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

## Clinical features + Biomarkers B

#### Subset
#### Split into train and test

In [35]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=list(range(1,78)))
y_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=list(range(1,78)))
y_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 77)
y_train shape:	 (86, 1)
X_test shape:	 (29, 77)
y_test shape:	 (29, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_b.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/revascularization/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features B

#### Subset
#### Split into train and test

In [37]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=list(range(1,72)))
y_train = pd.read_excel(link_train_revascularization_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=list(range(1,72)))
y_test  = pd.read_excel(link_test_revascularization_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical B/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 71)
y_train shape:	 (86, 1)
X_test shape:	 (29, 71)
y_test shape:	 (29, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_b.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/revascularization/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Biomarkers C

### Subset
### Split into train and test

In [39]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=list(range(101,106)))
y_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=list(range(101,106)))
y_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers C/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (146, 5)
y_train shape:	 (146, 1)
X_test shape:	 (43, 5)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              # dataset = biomarkers_b,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

## Clinical features + Biomarkers C

#### Subset
#### Split into train and test

In [41]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=list(range(1,106)))
y_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=list(range(1,106)))
y_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (146, 105)
y_train shape:	 (146, 1)
X_test shape:	 (43, 105)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_c.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path_top_features = "./HSE project/revascularization/Biomarkers C + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path_top_features, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path_top_features)
metrics_table.to_excel(f'{optimisation_path_top_features}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features C

#### Subset
#### Split into train and test

In [43]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=list(range(1,101)))
y_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=list(range(1,101)))
y_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical C/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (146, 100)
y_train shape:	 (146, 1)
X_test shape:	 (43, 100)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_c.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path_top_features = "./HSE project/revascularization/Biomarkers C + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path_top_features, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path_top_features)
metrics_table.to_excel(f'{optimisation_path_top_features}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features A-B-C

#### Subset
#### Split into train and test

In [45]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=list(range(1,57)))
y_train = pd.read_excel(link_train_revascularization_c, header=[0], usecols=[57])
X_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=list(range(1,57)))
y_test  = pd.read_excel(link_test_revascularization_c, header=[0], usecols=[57])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/revascularization/Clinical ABC/all clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (146, 56)
y_train shape:	 (146, 1)
X_test shape:	 (43, 56)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              # dataset = clinical_and_biomarkers_b,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

### Top feature optimisation and metrics

#### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# what metric to evaluate
metric = ['Эксперимент','Статья']

dataset['Importances', 'RandomForest'] = dataset['Importances', 'RandomForest']/dataset['Importances', 'RandomForest'].max()
feats = [str(eval(feature)[1]) for feature in dataset.index[:15]]
features=dataset.index
# ensemble models
# models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

# create the graph
fig = go.Figure(data=[go.Bar(name=i, x=feats, 
                             y=dataset['Importances', 'RandomForest'])])

# add error whiskers from gridsearchCV
# i

# Change the bar mode
fig.update_xaxes(title='Модели')
fig.update_yaxes(title='', range=[0., 1.0])
fig.update_layout(xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False})
fig.update_layout(barmode='group', 
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="h", title='Данные'), 
                  title=dict(text=f'Значения метрик для выбранных моделей', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
# if 1:
#     fig.add_shape(type='line',
#                     x0=-0.5,
#                     y0=0.5,
#                     x1=4.5,
#                     y1=0.5,
#                     line=dict(color='firebrick',  width=2, dash='dot'),
#                     xref='x',
#                     yref='y')   

fig.show(renderer='colab')

In [None]:
# divide into taget and features
dataset = clinical_abc.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/revascularization/Clinical ABC/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score=my_f2_scorer(), 
       catboost_score='F:beta=2', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Results

### Scores of models

In [None]:
# what metric to evaluate
# F2 F1 ROC_AUC
metric = 'ROC_AUC'
# list of paths
paths = [
        './HSE project/revascularization/Biomarkers A/all biomarkers/',
        './HSE project/revascularization/Biomarkers B/all biomarkers/',
        './HSE project/revascularization/Biomarkers C/all biomarkers/',
        './HSE project/revascularization/Clinical ABC/all clinical/',
        './HSE project/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
        './HSE project/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/',
        './HSE project/revascularization/Clinical A/all biomarkers and clinical/',
        './HSE project/revascularization/Clinical B/all biomarkers and clinical/',
        './HSE project/revascularization/Clinical C/all biomarkers and clinical/',
        ]

# create datframe for scores
datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])

# get dataframe with with scores of models from different datasets
for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
    datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) 
    if metric == 'F2':
        datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3)) 
        datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3)) 

# list of models
# standart models
models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
# ensemble models
# models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns])
    
# add error whiskers from gridsearchCV
if metric == 'F2':
    fig.add_traces([go.Box(name=column, x=models, 
                           y=datasets_mean[column], 
                          #  xaxis="x1",  
                           
                           marker=dict(color="black"), 
                           showlegend = False) for column in datasets.columns])
    fig.update_traces(
    selector=dict(type="box"), # update only boxes
    boxpoints="all", # show points
    pointpos=0, # centered
    jitter=0, # no jitter
    line_color="rgba(255,255,255,0)", # hide box lines
    fillcolor="rgba(255,255,255,0)", # hide box fill
    
    )
    fig.update_layout(boxmode="group",)

    fig.add_traces([go.Bar(name=column, x=models, 
                           y=datasets_mean[column], 
                           xaxis="x2",  
                           error_y=dict(type='data',  
                                        array=datasets_std[column], 
                                        color="rgba(0,0,0,1)",
                                        thickness=1), 
                           marker=dict(opacity=0,
                                      #  color="rgba(255,255,255,0)"
                                       ), 
                           showlegend = False) for column in datasets.columns])  
    

fig.update_xaxes(title='Models')
# Change the bar mode
fig.update_layout(barmode='group', 
                  xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False},
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Datasets'), 
                  title=dict(text=f'{metric} score', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

fig.update_yaxes(title='Score', range=[0., 1.0])


# add dotted line for ROC AUC = 0.5
if metric == 'ROC_AUC':
    fig.add_shape(type='line',
                    x0=-0.5,
                    y0=0.5,
                    x1=4.5,
                    y1=0.5,
                    line=dict(color='firebrick',  width=2, dash='dot'),
                    xref='x',
                    yref='y')   

fig.show(renderer='colab')

### Compare with Top 10

In [None]:
# what metric to evaluate
# ROC_AUC F1 F2
metric = 'ROC_AUC'

# list of paths
paths = [
        './HSE project/revascularization/Biomarkers A/all biomarkers/',
        './HSE project/revascularization/Biomarkers B/all biomarkers/',
        './HSE project/revascularization/Biomarkers C/all biomarkers/',
        './HSE project/revascularization/Clinical ABC/all clinical/',
        './HSE project/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
        './HSE project/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/'
        ]

# list of paths of top 10
paths_top = [
        './HSE project/revascularization/Biomarkers A/biomarkers top features/',
        './HSE project/revascularization/Biomarkers B/biomarkers top features/',
        './HSE project/revascularization/Biomarkers C/biomarkers top features/',
        './HSE project/revascularization/Clinical ABC/top features/',
        './HSE project/revascularization/Biomarkers A + Clinical/top features/',
        './HSE project/revascularization/Biomarkers B + Clinical/top features/',
        './HSE project/revascularization/Biomarkers C + Clinical/top features/'
        ]

# create datframe for scores
datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])

# get dataframe with with scores of models from different datasets
for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) 
    table_top = pd.read_excel(f'{paths_top[i]}metrics_table.xlsx', header=[0,1], index_col=[0])
    datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) + \
                                    list(table_top.loc[:, ('Scores on the test set', metric)].values.round(3))
    if metric == 'F2':
        datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3)) + list(table_top.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3))
        datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3)) + list(table_top.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3))


# list of models
# standart models
models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost'] + ['RandomForest top 10', 'SVM top 10', 'Logistic Regression top 10', 'KNN top 10', 'CatBoost top 10']


# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# add error whiskers from gridsearchCV
if metric == 'F2':
    fig.add_traces([go.Box(name=column, x=models, 
                           y=datasets_mean[column], 
                          #  xaxis="x1",  
                           
                           marker=dict(color="black"), 
                           showlegend = False) for column in datasets.columns])
    fig.update_traces(
    selector=dict(type="box"), # update only boxes
    boxpoints="all", # show points
    pointpos=0, # centered
    jitter=0, # no jitter
    line_color="rgba(255,255,255,0)", # hide box lines
    fillcolor="rgba(255,255,255,0)", # hide box fill
    
    )
    fig.update_layout(boxmode="group",)

    fig.add_traces([go.Bar(name=column, x=models, 
                           y=datasets_mean[column], 
                           xaxis="x2",  
                           error_y=dict(type='data',  
                                        array=datasets_std[column], 
                                        color="rgba(0,0,0,1)",
                                        thickness=1), 
                           marker=dict(opacity=0,
                                      #  color="rgba(255,255,255,0)"
                                       ), 
                           showlegend = False) for column in datasets.columns])  
    
# Change the bar mode
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Score', range=[0., 1.0])
fig.update_layout(barmode='group', 
                  xaxis2={"overlaying": "x", "range": [-0.525, 9.525], "showticklabels": False},
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Datasets'), 
                  title=dict(text=f'{metric} score', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
if metric == 'ROC_AUC':
    fig.add_shape(type='line',
                    x0=-0.5,
                    y0=0.5,
                    x1=9.5,
                    y1=0.5,
                    line=dict(color='firebrick',  width=2, dash='dot'),
                    xref='x',
                    yref='y')   

fig.show(renderer='colab')

### Feature selection

In [None]:
features = pd.DataFrame(columns=['features', 
                                 'biomarkers A',
                                #  'biomarkers B',
                                #  'biomarkers C',
                                 'Clinical+biomarkers ABC',
                                 'Clinical+biomarkers A',
                                #  'Clinical+biomarkers B',
                                #  'Clinical+biomarkers C',
                                 ])
# list of paths
paths = [
        './HSE project/revascularization/Biomarkers A/all biomarkers/',
        # './HSE project/revascularization/Biomarkers B/all biomarkers/',
        # './HSE project/revascularization/Biomarkers C/all biomarkers/',
        './HSE project/revascularization/Clinical ABC/all clinical/',
        './HSE project/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
        # './HSE project/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
        # './HSE project/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/'
        ]

# get dataframe with with scores of models from different datasets
top_features = []
for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
    table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
    if i < 1: 
        top_features = top_features+list(str(col) for col in table.index[:10])
    else:    
        top_features = top_features+list(eval(col)[1] for col in table.index[:10])

features['features'] = list(set(top_features))
features.index = list(set(top_features))
features.fillna(0, inplace=True)

for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
    table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
    if i < 1: 
        features.loc[list(str(col) for col in table.index[:10]), features.columns[i+1]] = 1
    else:    
        # top_features = top_features+list(eval(col) for col in table.index[:10])   
        features.loc[list(eval(col)[1] for col in table.index[:10]), features.columns[i+1]] = 1 

features['features'] = features.iloc[:,1:].apply((lambda x: x.sum()), axis=1)
features.sort_values(ascending=False,  inplace=True, by=("features"))
features.columns = ['sum'] + list(features.columns[1:])
features.to_excel('./HSE project/revascularization/feature_selection.xlsx')
features

# **Target**: Combined

## Biomarkers A

### Subset
### Split into train and test

In [28]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_a, header=[0], usecols=list(range(61,147)))
y_train = pd.read_excel(link_train_combined_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=list(range(61,147)))
y_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers A/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (192, 86)
y_train shape:	 (192, 1)
X_test shape:	 (50, 86)
y_test shape:	 (50, 1)


In [None]:
print((100*(target.value_counts()[1]) / (target.value_counts().sum())).round(2), '% of positive target')

In [None]:
optimisation_path = "./HSE project/combined/Biomarkers A/all biomarkers/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', #my_f2_scorer() 'f1'
       catboost_score='F:beta=1', #'F:beta=2' 'F:beta=1'
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics table

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                              # dataset = biomarkers_a,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
# top_features
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(col for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = biomarkers_a.copy()
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]
target = biomarkers_a.target

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/combined/Biomarkers A/biomarkers top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', #my_f2_scorer()
       catboost_score='F:beta=1', #'F:beta=2'
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
# metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features + Biomarkers A

#### Subset
#### Split into train and test

In [30]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_a, header=[0], usecols=list(range(1,147)))
y_train = pd.read_excel(link_train_combined_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=list(range(1,147)))
y_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers A + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (192, 146)
y_train shape:	 (192, 1)
X_test shape:	 (50, 146)
y_test shape:	 (50, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_a.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/combined/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features A

#### Subset
#### Split into train and test

In [32]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_a, header=[0], usecols=list(range(1,61)))
y_train = pd.read_excel(link_train_combined_a, header=[0], usecols=[147])
X_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=list(range(1,61)))
y_test  = pd.read_excel(link_test_combined_a, header=[0], usecols=[147])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Clinical A/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (192, 60)
y_train shape:	 (192, 1)
X_test shape:	 (50, 60)
y_test shape:	 (50, 1)


In [None]:
optimisation_path = "./HSE project/combined/Clinical A/all biomarkers and clinical/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_a.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/combined/Biomarkers A + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Biomarkers B

### Subset
### Split into train and test

In [47]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_b, header=[0], usecols=list(range(72,78)))
y_train = pd.read_excel(link_train_combined_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=list(range(72,78)))
y_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers B/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 6)
y_train shape:	 (86, 1)
X_test shape:	 (32, 6)
y_test shape:	 (32, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

In [None]:
# top_features
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# top_features = list(eval(col) for col in dataset.index[:10])
# top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

## Clinical features + Biomarkers B

#### Subset
#### Split into train and test

In [36]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_b, header=[0], usecols=list(range(1,78)))
y_train = pd.read_excel(link_train_combined_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=list(range(1,78)))
y_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers B + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 77)
y_train shape:	 (86, 1)
X_test shape:	 (32, 77)
y_test shape:	 (32, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_b.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/combined/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features B

#### Subset
#### Split into train and test

In [38]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_b, header=[0], usecols=list(range(1,72)))
y_train = pd.read_excel(link_train_combined_b, header=[0], usecols=[78])
X_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=list(range(1,72)))
y_test  = pd.read_excel(link_test_combined_b, header=[0], usecols=[78])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Clinical B/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (86, 71)
y_train shape:	 (86, 1)
X_test shape:	 (32, 71)
y_test shape:	 (32, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_b.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/combined/Biomarkers B + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Biomarkers C

### Subset
### Split into train and test

In [40]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_c, header=[0], usecols=list(range(101,106)))
y_train = pd.read_excel(link_train_combined_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=list(range(101,106)))
y_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers C/all biomarkers/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (120, 5)
y_train shape:	 (120, 1)
X_test shape:	 (43, 5)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=4, 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=True)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              # dataset = biomarkers_b,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

## Clinical features + Biomarkers C

#### Subset
#### Split into train and test

In [42]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_c, header=[0], usecols=list(range(1,106)))
y_train = pd.read_excel(link_train_combined_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=list(range(1,106)))
y_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Biomarkers C + Clinical/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (120, 105)
y_train shape:	 (120, 1)
X_test shape:	 (43, 105)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_c.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path_top_features = "./HSE project/combined/Biomarkers C + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path_top_features, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path_top_features)
metrics_table.to_excel(f'{optimisation_path_top_features}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features C

#### Subset
#### Split into train and test

In [44]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_c, header=[0], usecols=list(range(1,101)))
y_train = pd.read_excel(link_train_combined_c, header=[0], usecols=[106])
X_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=list(range(1,101)))
y_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=[106])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Clinical C/all biomarkers and clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (120, 100)
y_train shape:	 (120, 1)
X_test shape:	 (43, 100)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection( 
                        x_data = X_train,
                        y_data = y_train,
                        path = optimisation_path)
df[df['sum'] >= 5]

In [None]:
df = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1])
df.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
# visualise the table
df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Importances","RandomForest"), ("Importances","CatBoost"), ("Importances","Logistic")])\
                   .set_precision(3)

### Top feature optimisation and metrics

### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_and_biomarkers_c.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path_top_features = "./HSE project/combined/Biomarkers C + Clinical/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path_top_features, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path_top_features)
metrics_table.to_excel(f'{optimisation_path_top_features}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Clinical features A-B-C

#### Subset
#### Split into train and test

In [46]:
# download Dataset from Github repo and read as excel file

# download train and test subsets
X_train = pd.read_excel(link_train_combined_c, header=[0], usecols=list(range(1,57)))
y_train = pd.read_excel(link_train_combined_c, header=[0], usecols=[57])
X_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=list(range(1,57)))
y_test  = pd.read_excel(link_test_combined_c, header=[0], usecols=[57])

# create path for saving results
optimisation_path = "./HSE project/Optimisation data/combined/Clinical ABC/all clinical/"

# print subsets parameters
print('X_train shape:\t', X_train.shape)
print('y_train shape:\t', y_train.shape)
print('X_test shape:\t', X_test.shape)
print('y_test shape:\t', y_test.shape)

X_train shape:	 (120, 56)
y_train shape:	 (120, 1)
X_test shape:	 (43, 56)
y_test shape:	 (43, 1)


### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics tables

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

In [None]:
_ = random_forest_importances(path = optimisation_path, n_features = 20, biomarkers=False)

### Feature selection

In [None]:
%%time 
df = feature_selection(
                              # dataset = clinical_and_biomarkers_b,
                              x_data = X_train,
                              y_data = y_train,
                              path = optimisation_path)
df.tail(40)

### Top feature optimisation and metrics

#### subset

In [None]:
dataset = pd.read_excel(f'{optimisation_path}feature_selection_dataset.xlsx', header=[0,1], index_col=[0])
dataset.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
top_features = list(eval(col) for col in dataset.index[:10])
top_features

In [None]:
plt.barh(dataset.index[:20], dataset[("Importances","RandomForest")][:20])
plt.show()

In [None]:
# divide into taget and features
dataset = clinical_abc.copy()
target = dataset.target
dataset.drop(columns='target', inplace=True)
dataset = dataset[top_features]

# make 4 subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(
                                                    dataset,
                                                    target,
                                                    train_size = 0.8,
                                                    random_state=20,
                                                    shuffle=True
                                                    )

In [None]:
dataset.head(3)

In [None]:
optimisation_path = "./HSE project/combined/Clinical ABC/top features/"

### Hyper-parameter optimisation

In [None]:
tuning(
       score='f1', 
       catboost_score='F:beta=1', 
       cross_validation=StratifiedKFold(5), 
       path=optimisation_path, 
       logistic_regression=True, 
       knn=True, 
       random_forest=True, 
       svm=True, 
       catboost=True );  clear_output()

### Metrics

In [None]:
# download optimisation tables and execute the code

metrics_table = metric_table(path=optimisation_path)
metrics_table.to_excel(f'{optimisation_path}metrics_table.xlsx')
# visualise the table
metrics_table.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])\
                   .set_properties(**{'text-align': 'center'})\
                   .background_gradient(cmap='coolwarm', subset=[("Scores on the test set","F2"), ("F2 score, train set, cv=5","mean"), ("Scores on the test set","F1"), ("Scores on the test set","ROC_AUC")])\
                   .set_precision(3)

## Results

### Scores of models

In [None]:
# what metric to evaluate
# F2 F1 ROC_AUC
metric = 'F2'
# list of paths
paths = [
        './HSE project/combined/Biomarkers A/all biomarkers/',
        './HSE project/combined/Biomarkers B/all biomarkers/',
        './HSE project/combined/Biomarkers C/all biomarkers/',
        './HSE project/combined/Clinical ABC/all clinical/',
        './HSE project/combined/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/combined/Biomarkers B + Clinical/all biomarkers and clinical/',
        './HSE project/combined/Biomarkers C + Clinical/all biomarkers and clinical/',
        './HSE project/combined/Clinical A/all biomarkers and clinical/',
        './HSE project/combined/Clinical B/all biomarkers and clinical/',
        './HSE project/combined/Clinical C/all biomarkers and clinical/',
        ]

# create datframe for scores
datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])
datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C', 'Clinical A','Clinical B','Clinical C'])

# get dataframe with with scores of models from different datasets
for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
    datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) 
    if metric == 'F2':
        datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3)) 
        datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3)) 

# list of models
# standart models
models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost']
# ensemble models
# models=['Hard voting', 'Soft voting', 'Stacking', 'Bagging', 'adaBoosting']

# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns])
    
# add error whiskers from gridsearchCV
if metric == 'F2':
    fig.add_traces([go.Box(name=column, x=models, 
                           y=datasets_mean[column], 
                          #  xaxis="x1",  
                           
                           marker=dict(color="black"), 
                           showlegend = False) for column in datasets.columns])
    fig.update_traces(
    selector=dict(type="box"), # update only boxes
    boxpoints="all", # show points
    pointpos=0, # centered
    jitter=0, # no jitter
    line_color="rgba(255,255,255,0)", # hide box lines
    fillcolor="rgba(255,255,255,0)", # hide box fill
    
    )
    fig.update_layout(boxmode="group",)

    fig.add_traces([go.Bar(name=column, x=models, 
                           y=datasets_mean[column], 
                           xaxis="x2",  
                           error_y=dict(type='data',  
                                        array=datasets_std[column], 
                                        color="rgba(0,0,0,1)",
                                        thickness=1), 
                           marker=dict(opacity=0,
                                      #  color="rgba(255,255,255,0)"
                                       ), 
                           showlegend = False) for column in datasets.columns])  
    

fig.update_xaxes(title='Models')
# Change the bar mode
fig.update_layout(barmode='group', 
                  xaxis2={"overlaying": "x", "range": [-0.515, 4.515], "showticklabels": False},
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Datasets'), 
                  title=dict(text=f'{metric} score', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

fig.update_yaxes(title='Score', range=[0., 1.0])


# add dotted line for ROC AUC = 0.5
if metric == 'ROC_AUC':
    fig.add_shape(type='line',
                    x0=-0.5,
                    y0=0.5,
                    x1=4.5,
                    y1=0.5,
                    line=dict(color='firebrick',  width=2, dash='dot'),
                    xref='x',
                    yref='y')   

fig.show(renderer='colab')

### Compare with Top 10

In [None]:
# what metric to evaluate
# ROC_AUC F1 F2
metric = 'F2'

# list of paths
paths = [
        './HSE project/combined/Biomarkers A/all biomarkers/',
        './HSE project/combined/Biomarkers B/all biomarkers/',
        # './HSE project/combined/Biomarkers C/all biomarkers/',
        './HSE project/combined/Clinical ABC/all clinical/',
        './HSE project/combined/Biomarkers A + Clinical/all biomarkers and clinical/',
        './HSE project/combined/Biomarkers B + Clinical/all biomarkers and clinical/',
        './HSE project/combined/Biomarkers C + Clinical/all biomarkers and clinical/'
        ]

# list of paths of top 10
paths_top = [
        './HSE project/combined/Biomarkers A/biomarkers top features/',
        './HSE project/combined/Biomarkers B/biomarkers top features/',
        # './HSE project/combined/Biomarkers C/biomarkers top features/',
        './HSE project/combined/Clinical ABC/top features/',
        './HSE project/combined/Biomarkers A + Clinical/top features/',
        './HSE project/combined/Biomarkers B + Clinical/top features/',
        './HSE project/combined/Biomarkers C + Clinical/top features/'
        ]

# create datframe for scores
datasets = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
datasets_mean = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])
datasets_std = pd.DataFrame(columns=['biomarkers A','biomarkers B','biomarkers C','Clinical+biomarkers ABC','Clinical+biomarkers A','Clinical+biomarkers B','Clinical+biomarkers C'])

# get dataframe with with scores of models from different datasets
for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}metrics_table.xlsx', header=[0,1], index_col=[0]) 
    table_top = pd.read_excel(f'{paths_top[i]}metrics_table.xlsx', header=[0,1], index_col=[0])
    datasets[datasets.columns[i]] = list(table.loc[:, ('Scores on the test set', metric)].values.round(3)) + \
                                    list(table_top.loc[:, ('Scores on the test set', metric)].values.round(3))
    if metric == 'F2':
        datasets_mean[datasets_mean.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3)) + list(table_top.loc[:, ('F2 score, train set, cv=5', 'mean')].values.round(3))
        datasets_std[datasets_std.columns[i]] = list(table.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3)) + list(table_top.loc[:, ('F2 score, train set, cv=5', 'std')].values.round(3))


# list of models
# standart models
models=['RandomForest', 'SVM', 'Logistic Regression', 'KNN', 'CatBoost'] + ['RandomForest top 10', 'SVM top 10', 'Logistic Regression top 10', 'KNN top 10', 'CatBoost top 10']


# create the graph
fig = go.Figure(data=[go.Bar(name=column, x=models, y=datasets[column]) for column in datasets.columns ])

# add error whiskers from gridsearchCV
if metric == 'F2':
    fig.add_traces([go.Box(name=column, x=models, 
                           y=datasets_mean[column], 
                          #  xaxis="x1",  
                           
                           marker=dict(color="black"), 
                           showlegend = False) for column in datasets.columns])
    fig.update_traces(
    selector=dict(type="box"), # update only boxes
    boxpoints="all", # show points
    pointpos=0, # centered
    jitter=0, # no jitter
    line_color="rgba(255,255,255,0)", # hide box lines
    fillcolor="rgba(255,255,255,0)", # hide box fill
    
    )
    fig.update_layout(boxmode="group",)

    fig.add_traces([go.Bar(name=column, x=models, 
                           y=datasets_mean[column], 
                           xaxis="x2",  
                           error_y=dict(type='data',  
                                        array=datasets_std[column], 
                                        color="rgba(0,0,0,1)",
                                        thickness=1), 
                           marker=dict(opacity=0,
                                      #  color="rgba(255,255,255,0)"
                                       ), 
                           showlegend = False) for column in datasets.columns])  
    
# Change the bar mode
fig.update_xaxes(title='Models')
fig.update_yaxes(title='Score', range=[0., 1.0])
fig.update_layout(barmode='group', 
                  xaxis2={"overlaying": "x", "range": [-0.525, 9.525], "showticklabels": False},
                  bargap=0.30,
                  bargroupgap=0.3,
                  legend=dict(orientation="v", title='Datasets'), 
                  title=dict(text=f'{metric} score', x=0.5,),
                  margin=dict(l=60, r=20, t=60, b=40),)

# add dotted line for ROC AUC = 0.5
if metric == 'ROC_AUC':
    fig.add_shape(type='line',
                    x0=-0.5,
                    y0=0.5,
                    x1=9.5,
                    y1=0.5,
                    line=dict(color='firebrick',  width=2, dash='dot'),
                    xref='x',
                    yref='y')   

fig.show(renderer='colab')

### Feature selection

In [None]:
features = pd.DataFrame(columns=['features', 
                                 'biomarkers A',
                                #  'biomarkers B',
                                #  'biomarkers C',
                                 'Clinical+biomarkers ABC',
                                 'Clinical+biomarkers A',
                                #  'Clinical+biomarkers B',
                                #  'Clinical+biomarkers C',
                                 ])
# list of paths
paths = [
        './HSE project/revascularization/Biomarkers A/all biomarkers/',
        # './HSE project/revascularization/Biomarkers B/all biomarkers/',
        # './HSE project/revascularization/Biomarkers C/all biomarkers/',
        './HSE project/revascularization/Clinical ABC/all clinical/',
        './HSE project/revascularization/Biomarkers A + Clinical/all biomarkers and clinical/',
        # './HSE project/revascularization/Biomarkers B + Clinical/all biomarkers and clinical/',
        # './HSE project/revascularization/Biomarkers C + Clinical/all biomarkers and clinical/'
        ]

# get dataframe with with scores of models from different datasets
top_features = []
for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
    table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
    if i < 1: 
        top_features = top_features+list(str(col) for col in table.index[:10])
    else:    
        top_features = top_features+list(eval(col)[1] for col in table.index[:10])

features['features'] = list(set(top_features))
features.index = list(set(top_features))
features.fillna(0, inplace=True)

for i in range(len(paths)):
    table = pd.read_excel(f'{paths[i]}feature_selection_dataset.xlsx', header=[0,1], index_col=[0]) #ensemble: advanced_models_metrics, standart: metrics_table
    table.sort_values(by=("Importances","RandomForest"), axis=0, ascending=False, inplace=True)
    if i < 1: 
        features.loc[list(str(col) for col in table.index[:10]), features.columns[i+1]] = 1
    else:    
        # top_features = top_features+list(eval(col) for col in table.index[:10])   
        features.loc[list(eval(col)[1] for col in table.index[:10]), features.columns[i+1]] = 1 

features['features'] = features.iloc[:,1:].apply((lambda x: x.sum()), axis=1)
features.sort_values(ascending=False,  inplace=True, by=("features"))
features.columns = ['sum'] + list(features.columns[1:])
features.to_excel('./HSE project/revascularization/feature_selection.xlsx')
features

# References

- [Guidelines and quality criteria for artificial intelligence-based prediction models in healthcare: a scoping review](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8748878/pdf/41746_2021_Article_549.pdf)

- ✅ [Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone](https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5#citeas)

- ✅ [Machine learning-based prediction of adverse events
following an acute coronary syndrome (PRAISE): a modelling
study of pooled datasets](https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(20)32519-8/fulltext)

- ✅ [Critical appraisal of artificial intelligence-based prediction models for cardiovascular disease](https://watermark.silverchair.com/ehac238.pdf?token=AQECAHi208BE49Ooan9kkhW_Ercy7Dm3ZL_9Cf3qfKAc485ysgAAAtQwggLQBgkqhkiG9w0BBwagggLBMIICvQIBADCCArYGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMSvd0U0E66pd-sc_vAgEQgIICh_0OOn38okwwjvtHKZZRS6iesoJ0VuLm_qXiAJTeVb_83xAfB7oneCMsGdP7SkYUZPlcO3UtImKRROpfFzoAH87-TvQu04QMni8-YL47A9k13em0EMsLU86rv0fjaSmxgG-hPnAe7eRJaEDf1ckm-YBNx65aPTx1UC8yW3YO0gDra3ROrfsyl2UariiUse8hZ5S-I2WvFx0gic__qBLni02hEetj0dt-mInD7DxKqGuk28AuNOCDlF9Q1Tfj7oSyk6_1aNHJJ9XklpOJgzsKn-j4yusaYkapojnZzcNzBGcx6tTWYDn-YFcevxsYSc_uKlSUl40oTPl5Gwp-gAyxaLx9bFRuCDA6bxfPsNjgLQR0Eo4QxBuMD5h8FR6H6hEkZ1heaEpiWvZHqwTbEMddl1L1EgD2w-L-ng1YHbegVuZLa-Noll9OWfYSsVZf330LvUYMnTSu3FxrJ72voWUNhS3xzpTvkaeTqIkQgRU5Q75TfoKpMWfefufVgDshQhRM0ww1qRImd34Faql0RyBAKOPXG_HaucEkyXb60GCd6-0yjP5Mjbq-TML0Y9pnKIvmf9wXcTw-DJTcMT97fzWbp_psY70J02wEjvHPxfkOyEl9TiA08sI24GqKHAZuSU_M5R2dGN5W7qGuN_A-TbFKvO3FyMDOgV89BtJXHk8wVYpR-f2uppZydQydht_KTHlkV8hbYf0StZGbCXLb-fk38yZ6rerF9dTXfT6PtrYdlBYrVW65ZRn1HbxhoA0LBI0f5z8gpiqQjnyxSzrX-e9FYtOfOPu-i-IfGTLMFELowQ3IXkTup2Ee1dvT0sosTfoC5Q6x6d8nubiZFtw_SLYg21vF1XH2Gw9d)

- [Interpretation of machine learning predictions for patient outcomes in electronic health records](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7153071/pdf/3200408.pdf)
- [Minimum sample size for external validation of a clinical prediction model with a binary outcome](https://pubmed.ncbi.nlm.nih.gov/34031906/)
- [Machine learning of clinical variables and coronary artery calcium scoring for the prediction of obstructive coronary artery disease on coronary computed tomography angiography: analysis from the CONFIRM registry](https://pubmed.ncbi.nlm.nih.gov/31513271/)
- [Reflection on modern methods: when worlds collide-prediction, machine learning and causal inference](https://pubmed.ncbi.nlm.nih.gov/31298274/)
- General Cardiovascular Risk Profile for Use in Primary Care

### Feature importance

- ✅ [Feature Importance May Be Lying To You](https://towardsdatascience.com/feature-importance-may-be-lying-to-you-3247cafa7ee7)
- [Different Measures of Feature Importance Behave Differently](https://hippocampus-garden.com/feature_importance/)
- [Explaining Feature Importance by example of a Random Forest](https://towardsdatascience.com/explaining-feature-importance-by-example-of-a-random-forest-d9166011959e)
- [Interpret Logistic Regression Coefficients (For Beginners)](https://quantifyinghealth.com/interpret-logistic-regression-coefficients/)

- [FAQ: HOW DO I INTERPRET ODDS RATIOS IN LOGISTIC REGRESSION?](https://stats.oarc.ucla.edu/other/mult-pkg/faq/general/faq-how-do-i-interpret-odds-ratios-in-logistic-regression/#:~:text=A%20logistic%20regression%20model%20allows,relationship%20with%20the%20predictor%20variables.)

### Imputation
I decided to use kNN imputation b/c it was easy to implement using sklearn package and it was much better than other simpler imputers. However, there are also two types of complex imputers that might be reasonable to use (MICE and datawig)
- [6 Different Ways to Compensate for Missing Values In a Dataset (Data Imputation with examples)](https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779)
- [sklearn](https://scikit-learn.org/stable/modules/impute.html)
- [kNN Imputation for Missing Values in Machine Learning](https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/)


### Understanding model predictions


In paper "Critical appraisal of artificial intelligence-based prediction models for cardiovascular disease" it was mentioned the use of LIME and SHAP
  
##### LIME 
- [“Why Should I Trust You?” Explaining the Predictions of Any Classifier - paper about LIME](https://arxiv.org/pdf/1602.04938.pdf)

- [Understanding model predictions with LIME](https://towardsdatascience.com/understanding-model-predictions-with-lime-a582fdff3a3b)
- [Understanding how LIME explains predictions](https://towardsdatascience.com/understanding-how-lime-explains-predictions-d404e5d1829c)
- ✅ [How to explain ML models and feature importance with LIME?](https://analyticsindiamag.com/how-to-explain-ml-models-and-feature-importance-with-lime/)
- ✅ [Local Interpretable Model-Agnostic Explanations (LIME): An Introduction](https://www.oreilly.com/content/introduction-to-local-interpretable-model-agnostic-explanations-lime/)
- ✅ [Explanations (LIME)](https://ema.drwhy.ai/LIME.html)  
##### SHAP   
- ✅ [SHAP Values Explained Exactly How You Wished Someone Explained to You](https://towardsdatascience.com/shap-explained-the-way-i-wish-someone-explained-it-to-me-ab81cc69ef30)
-[Using SHAP Values to Explain How Your Machine Learning Model Works](https://towardsdatascience.com/using-shap-values-to-explain-how-your-machine-learning-model-works-732b3f40e137)
-[I have to find out what approximations they use to calculate Shapley values for all features, considering $2^n$ complexity]()
-[How to define fairness to detect and prevent discriminatory outcomes in Machine Learning](https://towardsdatascience.com/how-to-define-fairness-to-detect-and-prevent-discriminatory-outcomes-in-machine-learning-ef23fd408ef2)

### Fairness
- ✅ [A Tutorial on Fairness in Machine Learning](https://towardsdatascience.com/a-tutorial-on-fairness-in-machine-learning-3ff8ba1040cb)
- [sklego documentation](https://scikit-lego.readthedocs.io/en/latest/fairness.html)
- ✅ [Equality and fairness measures in classification models](https://www.auditingalgorithms.net/EqualityAndFairness.html)
- [Fairness Definitions Explained - должна быть понятная и полезная статья](http://fairware.cs.umass.edu/papers/Verma.pdf)
- [CS 294: Fairness in Machine Learning](https://fairmlclass.github.io/)
- []()

### Feature seletion
- ✅ [Deep-dive on ML techniques for feature selection in Python - Part 1](https://towardsdatascience.com/deep-dive-on-ml-techniques-for-feature-selection-in-python-part-1-3574269d5c69)
- ✅ [Deep-dive on ML techniques for feature selection in Python - Part 2](https://towardsdatascience.com/deep-dive-on-ml-techniques-for-feature-selection-in-python-part-2-c258f8a2ac43)
- ✅ [Deep-dive on ML techniques for feature selection in Python - Part 3](https://towardsdatascience.com/deep-dive-on-ml-techniques-for-feature-selection-in-python-part-3-de2a7593247f)
- ✅ [How to Choose a Feature Selection Method For Machine Learning](https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/)
- ✅ [Understanding ANOVA-F for feature selection in Python](https://datascience.stackexchange.com/questions/74465/how-to-understand-anova-f-for-feature-selection-in-python-sklearn-selectkbest-w#answer-74486)
- [sklearn: Feature selection](https://scikit-learn.org/stable/modules/feature_selection.html)
- ✅ [What are variable importance rankings useful for?](https://stats.stackexchange.com/questions/202277/what-are-variable-importance-rankings-useful-for#question-header)
- ✅ [feature importance is a slippery concept](https://stats.stackexchange.com/questions/202221/for-linear-classifiers-do-larger-coefficients-imply-more-important-features/202853#answer-202853)
- ✅ [Why lasso for feature selection?](https://stats.stackexchange.com/questions/367155/why-lasso-for-feature-selection#question-header)
- [Boruta SHAP: A Tool for Feature Selection Every Data Scientist Should Know](https://towardsdatascience.com/boruta-shap-an-amazing-tool-for-feature-selection-every-data-scientist-should-know-33a5f01285c0#:~:text=The%20idea%20of%20the%20Boruta,importance%20of%20the%20shadow%20features.)
- ✅ [Intuitions on L1 and L2 Regularisation](https://towardsdatascience.com/intuitions-on-l1-and-l2-regularisation-235f2db4c261)
- [L0 Norm, L1 Norm, L2 Norm & L-Infinity Norm](https://montjoile.medium.com/l0-norm-l1-norm-l2-norm-l-infinity-norm-7a7d18a4f40c#:~:text=L1%20Norm%20is%20the%20sum,the%20vector%20are%20weighted%20equally.)
- []()
- []()

### Advanced predictions
- [Ensemble methods: bagging, boosting and stacking](https://towardsdatascience.com/ensemble-methods-bagging-boosting-and-stacking-c9214a10a205)
- [sklearn: Ensemble methods](https://scikit-learn.org/stable/modules/ensemble.html)
- [A Deep Dive into Stacking Ensemble Machine Learning — Part I](https://towardsdatascience.com/a-deep-dive-into-stacking-ensemble-machine-learning-part-i-10476b2ade3)
- [Cтекинг (Stacking) и блендинг (Blending)](https://dyakonov.org/2017/03/10/c%D1%82%D0%B5%D0%BA%D0%B8%D0%BD%D0%B3-stacking-%D0%B8-%D0%B1%D0%BB%D0%B5%D0%BD%D0%B4%D0%B8%D0%BD%D0%B3-blending/)
- []()
- []()


### Clustering
- [Overview of Clustering Algorithms](https://towardsdatascience.com/overview-of-clustering-algorithms-27e979e3724d)
- []()
- []()