## Prerequisites

### Import libraries

In [5]:
# Import libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
from sklearn.model_selection import train_test_split
# from google.colab import output

from pandas import DatetimeIndex as dt
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
# from google.colab import files
import IPython
from IPython.display import HTML, display, clear_output 
# from google.colab import drive
import sys

# hyper-parameters optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# metrics
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as TP_rate                          
from sklearn.metrics import roc_auc_score as roc_auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score as recall
from sklearn.metrics import average_precision_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer,fbeta_score
from sklearn.model_selection import StratifiedKFold


# classifiers
from sklearn.ensemble import RandomForestClassifier
# BaggingClassifier, AdaBoostRegressor, RandomForestRegressor, GradientBoostingClassifier, StackingClassifier, VotingClassifier #
# from sklearn.tree import DecisionTreeClassifier     #
from sklearn.svm import SVC                                    # both linear and radial classification
from sklearn.neighbors import KNeighborsClassifier             # k=3
# from sklearn.naive_bayes import GaussianNB
# from sklearn.neural_network import MLPClassifier
# from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import catboost
from catboost import CatBoostClassifier
# from imblearn.over_sampling import SMOTE
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# from mlxtend.classifier import StackingClassifier

# statistics
# from scipy.stats import shapiro
# from scipy.stats import chi2_contingency
# from scipy.stats import mannwhitneyu

# imputations
# explicitly require this experimental feature
# from sklearn.experimental import enable_iterative_imputer  # noqa
# # now you can import normally from sklearn.impute
# from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.base import clone

# feature selection
# from sklearn.feature_selection import chi2, mutual_info_classif, f_classif, SelectKBest, RFE, RFECV, SequentialFeatureSelector
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# from scipy.stats import kendalltau, spearmanr
# from sklearn.linear_model import Lasso
from sklearn.model_selection import StratifiedKFold
# from mrmr import mrmr_classif
# from yellowbrick.features import RFECV as RFECV_visual
# import shap

# from tqdm.auto import tqdm

# to convert a string to a dict
import ast

# Interpretability
#  # !pip install interpret
# from interpret.blackbox import LimeTabular
# from interpret import set_visualize_provider
# from interpret.provider import InlineProvider
# set_visualize_provider(InlineProvider())
# from interpret import show

# import lime
# import lime.lime_tabular
# from __future__ import print_function
import pickle

# ignore warnings when graphs are plotted
import warnings
warnings.filterwarnings('ignore')

## Preselected features and preprocessed data

In [6]:
def _load_feature_tuples(filename):
    """Read a feature-list workbook from ``../Raw data`` and return its rows as tuples.

    The workbook is read with its first column as the index and its first row
    as the header, so each returned tuple holds the remaining cells of one row.
    """
    frame = pd.read_excel('../Raw data/' + filename, index_col=0, header=0)
    # Go through .tolist() for every file so all lists are built the same way
    # (plain Python objects inside the tuples).
    return [tuple(row) for row in frame.values.tolist()]


clinical_features = _load_feature_tuples('Clinical features.xlsx')

biomarkers_a = _load_feature_tuples('biomarkers_a.xlsx')
biomarkers_b = _load_feature_tuples('biomarkers_b.xlsx')
biomarkers_c = _load_feature_tuples('biomarkers_c.xlsx')
targets = _load_feature_tuples('targets_features.xlsx')

continuous = _load_feature_tuples('continuous_features.xlsx')
# all biomarkers were continuous except 'БСЖК' in data_b

categorical = _load_feature_tuples('categorical_features.xlsx')
# includes the 'БСЖК' biomarker from data_b

In [16]:
# Import the preprocessed (imputed) train/test splits.
# The directory is kept in one place so it can be changed without touching
# every read call.
IMPUTED_DATA_DIR = '../Preprocessed data/Combined target/Imputed data'


def _read_imputed(name):
    """Load ``<IMPUTED_DATA_DIR>/<name>.xlsx`` using the first column as index and the first row as header."""
    return pd.read_excel(f'{IMPUTED_DATA_DIR}/{name}.xlsx', index_col=0, header=[0])


test_a = _read_imputed('test_a')
train_a = _read_imputed('train_a')
test_b = _read_imputed('test_b')
train_b = _read_imputed('train_b')
test_c = _read_imputed('test_c')
train_c = _read_imputed('train_c')
test_d = _read_imputed('test_d')
train_d = _read_imputed('train_d')
test_abcd = _read_imputed('test_abcd')
train_abcd = _read_imputed('train_abcd')

### Define $MCC$ metric

# Statistics

In [30]:
# Stack the continuous-feature columns of the combined train and test splits
# (train rows first). Column labels are the string form of each feature tuple.
_continuous_columns = [str(feature) for feature in continuous]
data = pd.concat([train_abcd[_continuous_columns], test_abcd[_continuous_columns]])

In [71]:
def visual_correlation_matrix(column_names, data, title='Pearson correlation matrix',
                              threshold=0.5, width=1100, height=500):
    """Display a correlation matrix as an annotated Plotly heatmap.

    Parameters
    ----------
    column_names : sequence
        Labels used for both the x and the y axis of the heatmap.
    data : object exposing ``.values`` (e.g. a pandas DataFrame)
        Matrix of coefficients. It is passed to the heatmap as ``z`` and its
        ``.values`` rows are used to build the per-cell text labels.
    title : str, optional
        Figure title, centred at the top.
    threshold : float, optional
        Only cells whose value is strictly greater than this get a numeric
        label (rounded to 3 decimals); all other cells are left blank.
    width, height : int, optional
        Figure size in pixels.

    Returns
    -------
    None
        The figure is shown with ``fig.show()`` and not returned.
    """
    # Builtin round() works for both NumPy scalars and plain Python floats.
    cell_labels = [
        [round(value, 3) if value > threshold else '' for value in row]
        for row in data.values
    ]

    heatmap = go.Heatmap(
        z=data,
        x=column_names,
        y=column_names,
        texttemplate="%{text}",
        text=cell_labels,
    )
    fig = go.Figure(data=[heatmap])
    fig.update_layout(
        title={'text': title, 'y': 0.99, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
        autosize=False,
        width=width,
        height=height,
        margin=dict(l=15, r=50, b=15, t=40),
        xaxis=dict(visible=True),
        yaxis=dict(visible=True),
        hoverlabel=dict(
            bgcolor="white",
            font_size=12,
            font_family="Rockwell",
        ),
    )
    fig.show()

In [58]:
# Absolute value of the pairwise Pearson correlation between the columns of `data`.
correlation_matrix = pd.DataFrame(data.corr(method='pearson').abs())
# correlation_matrix.style.background_gradient(cmap='coolwarm').set_precision(3)

In [72]:
# The column labels of `data` are string representations of tuples; element [1]
# of each tuple is used as the axis label. The labels originate from an Excel
# file, so they are parsed with ast.literal_eval (literals only) rather than
# eval, which would execute arbitrary expressions.
visual_correlation_matrix(column_names=[ast.literal_eval(label)[1] for label in data.columns],
                          data=correlation_matrix,
                          title='Pearson correlation matrix for clinical features')

# References