### Import libraries

In [46]:
# %%capture
# !pip install catboost
# !pip install mrmr_selection
# !pip install imblearn
# !pip install mlxtend

In [120]:
# Import libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
from sklearn.model_selection import train_test_split
# from google.colab import output

from pandas import DatetimeIndex as dt
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
# from google.colab import files
import IPython
from IPython.display import HTML, display
# from google.colab import drive
import sys

# hyper-parameters optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# metrics
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as TP_rate                          
from sklearn.metrics import roc_auc_score as roc_auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score as recall
from sklearn.metrics import average_precision_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer,fbeta_score

# classifiers
# NOTE(review): StackingClassifier is imported here from sklearn.ensemble and again from
# mlxtend.classifier at the end of this group; the later import wins, so the name
# `StackingClassifier` refers to the mlxtend class for the rest of the notebook.
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostRegressor, RandomForestRegressor, GradientBoostingClassifier, StackingClassifier, VotingClassifier #
from sklearn.tree import DecisionTreeClassifier     #
from sklearn.svm import SVC                                    # both linear and radial classification
from sklearn.neighbors import KNeighborsClassifier             # k=3
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import catboost
from catboost import CatBoostClassifier
# SMOTE is imported a second time in the "imputations" group below (harmless duplicate)
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier
# duplicate of the DecisionTreeClassifier import above (harmless)
from sklearn.tree import DecisionTreeClassifier
# shadows the sklearn.ensemble.StackingClassifier imported at the top of this group
from mlxtend.classifier import StackingClassifier

# statistics
from scipy.stats import shapiro
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu

# imputations
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.base import clone
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

# feature selection
from sklearn.feature_selection import chi2, mutual_info_classif, f_classif, SelectKBest, RFE, RFECV, SequentialFeatureSelector
from scipy.stats import kendalltau, spearmanr
from sklearn.linear_model import Lasso
from sklearn.model_selection import StratifiedKFold
from mrmr import mrmr_classif

# to convert a string to a dict
import ast

# Interpretability
 # !pip install interpret
from interpret.blackbox import LimeTabular
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
set_visualize_provider(InlineProvider())
from interpret import show

import lime
import lime.lime_tabular
from __future__ import print_function

# ignore warnings when graphs are plotted
import warnings
warnings.filterwarnings('ignore')

In [48]:
# %%capture
# !pip install ipython-autotime

# %load_ext autotime

### Data transformation

Email containing the data: [here](https://mail.yandex.ru/?win=176&clid=1985545-207&uid=112725799#message/179862510118127643) (link into a personal mailbox; it opens only for the mailbox owner)
- [Difference between Standard scaler and MinMaxScaler](https://stackoverflow.com/questions/51237635/difference-between-standard-scaler-and-minmaxscaler)
- [sklearn: Compare the effect of different scalers on data with outliers](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html)
- [How to Use StandardScaler and MinMaxScaler Transforms in Python](https://machinelearningmastery.com/standardscaler-and-minmaxscaler-transforms-in-python/)

RobustScaler is less sensitive to outliers, so it might perform better.  
However, for variables that are not normally distributed, normalisation methods (e.g. MinMaxScaler) are recommended.

#### Dataset A

In [96]:
# Dataset A is an Excel workbook stored in the project's GitHub repository.
link_a = 'https://github.com/KonstantinBurkin/personalized-medicine/blob/master/Data/cardio_a_updated.xlsx?raw=true'

# The first two sheet rows form the (group, feature) column MultiIndex;
# the first sheet column is used as the row index.
data_a = pd.read_excel(io=link_a, header=[0, 1], index_col=0)
print('data_a raw shape: ', data_a.shape)

data_a raw shape:  (263, 252)


In [97]:
# Correcting data_a


# Encode every missing cell (NaN, or a cell that holds a single space) with the -1 sentinel
data_a = data_a.fillna(-1)
data_a = data_a.replace(' ',-1)

# Modify the 'Пол' feature: convert all strings to lower case, then encode 'м' as 0 and 'ж' as 1
data_a['АНТРОПОФИЗИОМЕТРИЯ', 'Пол'] = data_a['АНТРОПОФИЗИОМЕТРИЯ', 'Пол'].str.lower().replace(['м','ж'],[0,1])

# Correct the date typos in the rows labelled 154 and 129.
# .loc writes straight into data_a. The chained form data_a[col][label] = value first
# extracts a Series and assigns into that, which pandas does not guarantee to propagate
# back to the frame (and never does under Copy-on-Write).
date_col = ('АНТРОПОФИЗИОМЕТРИЯ', 'Дата госпитализации')
data_a.loc[154, date_col] = '2013-03-22 00:00:00'
data_a.loc[129, date_col] = '2013-01-17 00:00:00'
data_a[date_col] = pd.to_datetime(data_a[date_col], format='%Y-%m-%d %H:%M:%S')

# Harmonise the spelling variants of the stage labels and encode them as integers.
# (No value is turned into -1 here; the -1 sentinel simply passes through unchanged.)
#
# 'Стадия ХБП по KDOQI /ERA', step 1: map every observed spelling variant (upper/lower
# case, apparently mixing Cyrillic and Latin look-alike letters) to one canonical label.
data_a[('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Стадия ХБП по KDOQI /ERA')] = \
data_a[('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Стадия ХБП по KDOQI /ERA')].replace(
    ["С2", "С1", "с2", "с1", "С3а", "с3а", "C2", "с3б", "C1", "С3б", "С4", "С3", "3А", "с4", "с5", "С3b", "ОПН!"], \
    ["c2", "c1", "c2", "c1", "c3a", "c3a", "c2", "c3b", "c1", "c3b", "c4", "c3", "c3a", "c4", "c5", "c3b", "опн!"] )
# Step 2: encode the canonical labels (and the raw numeric entries 0 and 2) as integers:
#   c2->0, c1->1, c3a->2, c3b->3, c4->4, c3->5, 0->6, 2->7, c5->8, опн!->9
# NOTE(review): the codes are category labels, not an ordering by stage (c2 is coded below c1).
data_a[('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Стадия ХБП по KDOQI /ERA')] = \
data_a[('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Стадия ХБП по KDOQI /ERA')].replace(
    ['c2', 'c1', 'c3a', 'c3b', 'c4', 'c3', 0, 2, 'c5', 'опн!'], [0,  1,  2,  3,  4,  5,  6,  7,  8,  9] ).astype(int)
# ---------------
# 'Стадия по KDOQI/ERA', step 1: same normalisation of spelling variants (-1 and 0 are kept as they are).
data_a[('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Стадия по KDOQI/ERA')] = \
data_a[('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Стадия по KDOQI/ERA')].replace(
    [-1, 'С2', 'с2', 'С1', 0, 'c2', 'с1', 'С3а', 'C2', 'C1', 'С4', 'с3б', 'С3b', 'С3А', 'с4', 'C3б', 'C3а', 'С3б'],
    [-1, 'c2', 'c2', 'c1', 0, 'c2', 'c1', 'c3a', 'c2', 'c1', 'c4', 'c3b', 'c3b', 'c3a', 'c4', 'c3b', 'c3a', 'c3b'] )
# Step 2: integer codes for this column: -1->-1, 0->0, c2->1, c1->2, c3a->3, c3b->4, c4->5
# NOTE(review): this coding scheme differs from the one used for 'Стадия ХБП по KDOQI /ERA' above.
data_a[('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Стадия по KDOQI/ERA')] = \
data_a[('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Стадия по KDOQI/ERA')].replace(
    [-1, 'c2', 'c1', 'c3a', 'c3b', 0, 'c4'], [-1, 1, 2, 3, 4, 0, 5] ).astype(int)
# ---------------
# 'НК': numeric entries -1, 0, 1, 2 keep their value; '2а'/'2А' -> 3 and '2Б'/'2б' -> 4
data_a[('Хроническая сердечная недостаточность', 'НК')] = \
data_a[('Хроническая сердечная недостаточность', 'НК')].replace([-1, 1, '2а', 0, '2А', '2Б', 2, '2б'], [-1, 1, 3, 0, 3, 4, 2, 4]).astype(int)
# ---------------
# Shorten the column label so that it matches the name used for this outcome in the other datasets' cells
data_a = data_a.rename({'Повторная реваскуляризация (ЧКВ/АКШ)': 'Повторная реваскуляризация'}, axis=1)

# Binarise the repeat-revascularisation outcome: a procedure name or a date string counts as 1.
# The result is assigned back to the column: calling .replace(..., inplace=True) on the
# Series returned by data_a[col] modifies an intermediate object, which pandas does not
# guarantee to propagate to data_a (and never does under Copy-on-Write).
revasc_col = ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Повторная реваскуляризация')
data_a[revasc_col] = data_a[revasc_col].replace(to_replace=['ЧКВ',
                           'АКШ',
                           '1899-12-29 00:00:00',
                           'ЧКВ ',
                           'АКШ ',
                           '2018-07-30 00:00:00',
                           '2019-04-15 00:00:00',
                           '2020-08-30 00:00:00'],
               value=1)
# The string '0' becomes the integer 0
data_a[revasc_col] = data_a[revasc_col].replace(to_replace=['0'], value=0)

# In these two laboratory columns a value of 0 is re-encoded as the missing sentinel -1.
# The columns were picked by hand; candidates were searched among the first 90 columns
# as those with more than 12 distinct values and at least one 0.
for lab_col in [('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'ЛПНП, ммоль/л'),
                ('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Билирубин, мкмоль/л')]:
    data_a[lab_col] = data_a[lab_col].replace([0],[-1])

In [99]:
# Find columns


# Set aside a one-column copy of 'Хсобщ, ммоль/л' before sparse columns are dropped;
# it is attached to Dataset A again when the Lancet dataset is built.
cholesterol_key = ('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Хсобщ, ммоль/л')
hyperlipidemia = pd.DataFrame(columns=pd.MultiIndex.from_tuples([cholesterol_key]))
hyperlipidemia[cholesterol_key] = data_a[cholesterol_key]


# Among the features obtained at first discharge and the biomarkers, drop every column
# in which more than threshold=20% of the rows hold the missing sentinel -1.
# The positional slices below refer to the column layout BEFORE the drop.
threshold = 0.2
max_missing = threshold*data_a.shape[0]
columns_with_useful_data = [*data_a.columns[:2], *data_a.columns[3:77], *data_a.columns[148:237]]
cols_with_NAs = []
for col in columns_with_useful_data:
    if (data_a[col] == -1).sum() > max_missing:
        cols_with_NAs.append(col)
data_a = data_a.drop(columns=cols_with_NAs)


# Collect the columns of interest (positions now refer to the layout AFTER the drop)
# and split them by cardinality: 7 or more distinct values -> continuous, fewer -> categorical.
clinical_and_biomarkers = [*data_a.columns[:2], *data_a.columns[3:61], *data_a.columns[132:218]]
continuous_cols, categorical = [], []
for col in clinical_and_biomarkers:
    n_distinct = len(data_a[col].unique())
    if n_distinct >= 7:
        continuous_cols.append(col)
    else:
        categorical.append(col)
clinical_and_biomarkers_and_hyperlipidemia = clinical_and_biomarkers + [cholesterol_key]

In [100]:
# The sparse columns have already been dropped at this point, so this is no longer the raw shape
print('data_a shape after dropping sparse columns: ', data_a.shape)

data_a raw shape:  (263, 233)


In [84]:
# NOTE(review): whole-dataset imputation of Dataset A is disabled below. As a result
# data_a keeps its -1 sentinels after this cell; the "Cardiovascular death" section
# instead fits the imputer on the training split only.

# # Make imputation

# # replace -1 values with NA
# data_a[clinical_and_biomarkers] = data_a[clinical_and_biomarkers].replace(-1, np.nan)

# # Impute NAs with IterativeImputer (estimator - RandomForestRegressor)
# impute_estimator = RandomForestRegressor(n_estimators=50,
#                                          max_depth=5,
#                                          n_jobs=-1,
#                                          random_state=0)
# imputer = IterativeImputer(random_state=0,
#                            estimator=impute_estimator, 
#                            max_iter=25)
# data_a[clinical_and_biomarkers] = imputer.fit_transform(data_a[clinical_and_biomarkers])

# # Round values for categorical data - so that there will be no new categories
# data_a[categorical] = data_a[categorical].values.round()

# Turn the -1 sentinel back into NaN in the column that was set aside for the Lancet dataset
hyperlipidemia = hyperlipidemia.replace(-1, np.nan)

In [101]:
# For subsets

# Column groups of data_a by position: [0:2] and [3:61] hold the clinical features,
# [132:218] the biomarkers; the combined list is clinical first, biomarkers second.
clinical_a_columns = [*data_a.columns[:2], *data_a.columns[3:61]]
biomarkers_a_columns = [*data_a.columns[132:218]]
clinical_and_biomarkers_a_columns = clinical_a_columns + biomarkers_a_columns

In [102]:
# Preview: first two rows of the biomarker subset
data_a.head(2)[biomarkers_a_columns]

Unnamed: 0_level_0,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А
№ п/п,"TnI-1, нг/мл","TnI-2, нг/мл","TnI-3, нг/мл","TnI-4, нг/мл","CKMB-1, нг/мл","CKMB-2, нг/мл","CKMB-3, нг/мл","CKMB-4, нг/мл","MG-1, нг/мл","MG-2, нг/мл",...,"АЧТВ-3, с","АЧТВ-4, с","АТ3-1, %","АТ3-2, %","АТ3-3, %","АТ3-4, %","FW-1,%","FW-2,%","FW-3,%","FW-4,%"
1,7.56,13.0,2.31,-1.0,21.5,3.14,6.71,-1.0,460.6,164.2,...,31.2,-1.0,100.72,86.59,116.31,-1.0,120.0,120.0,293,-1.0
2,5.17,10.3,0.304,0.66,28.2,0.5,3.85,7.5,421.3,56.21,...,34.0,36.1,100.55,101.63,127.76,99.58,120.0,115.99,90,94.0


In [87]:
# Preview: first two rows of the clinical + biomarker subset
data_a.head(2)[clinical_and_biomarkers_a_columns]

Unnamed: 0_level_0,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,ХАРАКТЕРИСТИКА ОИМ,ХАРАКТЕРИСТИКА ОИМ,...,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А,БИОМАРКЕРЫ БЛОК А
№ п/п,Пол,Возраст,Рост,Вес,ИМТ,S тела,систол. АД,ЧСС,Давность болевого синдрома,Cегмент ST,...,"АЧТВ-3, с","АЧТВ-4, с","АТ3-1, %","АТ3-2, %","АТ3-3, %","АТ3-4, %","FW-1,%","FW-2,%","FW-3,%","FW-4,%"
1,0,75,1.64,80,29.7442,1.88,190,90,2,1,...,31.2,-1.0,100.72,86.59,116.31,-1.0,120.0,120.0,293,-1.0
2,0,49,1.76,130,41.967975,2.41,140,100,2,1,...,34.0,36.1,100.55,101.63,127.76,99.58,120.0,115.99,90,94.0


In [88]:
# Preview: first two rows of the clinical subset
data_a.head(2)[clinical_a_columns]

Unnamed: 0_level_0,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,АНТРОПОФИЗИОМЕТРИЯ,ХАРАКТЕРИСТИКА ОИМ,ХАРАКТЕРИСТИКА ОИМ,...,ИСХОДНАЯ ЭХОКГ,ИСХОДНАЯ ЭХОКГ,ИСХОДНАЯ ЭХОКГ,ИСХОДНАЯ ЭХОКГ,ИСХОДНАЯ ЭХОКГ,ИСХОДНАЯ ЭХОКГ,ИСХОДНАЯ ЭХОКГ,ИСХОДНАЯ ЭХОКГ,ИСХОДНАЯ ЭХОКГ,ИСХОДНАЯ ЭХОКГ
№ п/п,Пол,Возраст,Рост,Вес,ИМТ,S тела,систол. АД,ЧСС,Давность болевого синдрома,Cегмент ST,...,ФВ ЛЖ,ТМЖП ЛЖ,ТЗС ЛЖ,ММ ЛЖ,иММ ЛЖ,ЛП,РМК,Аневризма ЛЖ,Тромбоз ЛЖ,ИНЛС ЛЖ
1,0,75,1.64,80,29.7442,1.88,190,90,2,1,...,55.147059,1.3,1.2,271.5,144.4,4.2,1,0,0,0.0
2,0,49,1.76,130,41.967975,2.41,140,100,2,1,...,54.304636,1.4,1.13,301.6,125.1,4.1,1,0,0,1.125


#### Dataset B

In [None]:
# Read cardio_b_updated.xlsx; the first two sheet rows form the (group, feature) column MultiIndex
data_b = pd.read_excel('./HSE project/Data/cardio_b_updated.xlsx', header=[0, 1])
# The running-number column is not a feature -> remove it
data_b = data_b.drop(columns=('Unnamed: 0_level_0','№ п/п'))
# Encode every missing cell (NaN, or a cell that holds a single space) with the -1 sentinel
data_b = data_b.fillna(-1).replace(' ', -1)
# Modify the 'Пол' feature: convert all strings to lower case, then encode 'м' as 0 and 'ж' as 1
sex_col = ('АНТРОПОФИЗИОМЕТРИЯ', 'Пол')
data_b[sex_col] = data_b[sex_col].str.lower().replace(['м','ж'],[0,1])


# Replace typos / non-numeric entries and encode the affected columns numerically
# An ambiguous '1-2' entry is treated as missing (-1)
data_b[('ИСХОДНАЯ ЭХОКГ', 'РМК')] = data_b[('ИСХОДНАЯ ЭХОКГ', 'РМК')].replace('1-2', -1).astype(int)
# ---------------
# Range-like entries get their own integer codes 4, 5 and 6
data_b[('КОРОНАРОАНГИОГРАФИЯ И РЕВАСКУЛЯРИЗАЦИЯ МИОКАРДА', 'TIMI в ИЗА')] = \
data_b[('КОРОНАРОАНГИОГРАФИЯ И РЕВАСКУЛЯРИЗАЦИЯ МИОКАРДА', 'TIMI в ИЗА')].replace(['0-I', '0-1', '1-2'], [4, 5, 6]).astype(int)
# ---------------
# 'Стадия ХБП по KDOQI /ERA': same normalisation and the same integer codes as in Dataset A,
# because the datasets are concatenated later and a code must mean the same stage in each of them.
# Previously "3А" was kept as a separate label '3a' (code 8), which shifted 'c5' and 'опн!'
# to 9 and 10, whereas Dataset A maps "3А" to 'c3a' and uses 8 and 9 for 'c5' and 'опн!'.
kdoqi_col = ('СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ', 'Стадия ХБП по KDOQI /ERA')
data_b[kdoqi_col] = data_b[kdoqi_col].replace(
    ["С2", "С1", "с2", "с1", "С3а", "с3а", "C2", "с3б", "C1", "С3б", "С4", "С3", "3А", "с4", "с5", "С3b", "ОПН!"],
    ["c2", "c1", "c2", "c1", "c3a", "c3a", "c2", "c3b", "c1", "c3b", "c4", "c3", "c3a", "c4", "c5", "c3b", "опн!"] )
# Codes: c2->0, c1->1, c3a->2, c3b->3, c4->4, c3->5, 0->6, 2->7, c5->8, опн!->9.
# A literal '3a', should one occur in the raw data, is coded like 'c3a'.
data_b[kdoqi_col] = data_b[kdoqi_col].replace(
    ['c2', 'c1', 'c3a', '3a', 'c3b', 'c4', 'c3', 0, 2, 'c5', 'опн!'], [0,  1,  2,  2,  3,  4,  5,  6,  7,  8,  9] ).astype(int)
# ---------------
# Free-text entries that cannot be read as a single number are treated as missing (-1)
data_b[('ЭХОКГ (ИСХОД)', 'ТМЖП ЛЖ')] = \
data_b[('ЭХОКГ (ИСХОД)', 'ТМЖП ЛЖ')].replace(['1,1-1,0-0,8', '1,0 - 1,5', '1,3; 0,8'], -1).astype(float)
# ---------------
data_b[('ПОВТОРНАЯ ЭХОКГ', 'ЛП')] = \
data_b[('ПОВТОРНАЯ ЭХОКГ', 'ЛП')].replace(['С2'], -1).astype(float)
# ---------------
data_b[('ПОВТОРНАЯ ЭХОКГ', 'ТМЖП ЛЖ')] = \
data_b[('ПОВТОРНАЯ ЭХОКГ', 'ТМЖП ЛЖ')].replace(['1,1-1.0-0,7'], -1).astype(float)
# ---------------
# Procedure names and the listed dates become 1.
# NOTE(review): unlike in Dataset A this replacement runs over EVERY column of data_b, so a
# matching string or timestamp in any other column is turned into 1 as well - confirm this is intended.
data_b = data_b.replace(to_replace=['ЧКВ',
                           'АКШ',
                           '1899-12-29 00:00:00',
                           'ЧКВ ',
                           'АКШ ',
                           pd.to_datetime('2018-07-30 00:00:00', format='%Y-%m-%d %H:%M:%S'),
                           pd.to_datetime('2019-04-15 00:00:00', format='%Y-%m-%d %H:%M:%S'),
                           pd.to_datetime('2020-08-30 00:00:00', format='%Y-%m-%d %H:%M:%S'),
                           ],
               value=1)
# The string '0' becomes the integer 0. The result is assigned back because an inplace
# replace on the Series returned by data_b[col] is not guaranteed to reach data_b.
revasc_col = ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Повторная реваскуляризация')
data_b[revasc_col] = data_b[revasc_col].replace(to_replace=['0'], value=0)

# In this hand-picked column a value of 0 is re-encoded as the missing sentinel -1.
# Candidates were searched among the first 90 columns as those with more than
# 12 distinct values and at least one 0.
data_b[('ИСХОДНАЯ ЭХОКГ', 'ИНЛС ЛЖ')] = data_b[('ИСХОДНАЯ ЭХОКГ', 'ИНЛС ЛЖ')].replace([0],[-1])


# Among the features obtained at first discharge and the biomarkers (columns [0:2] and [3:84]),
# drop every column in which more than 20% of the rows hold the missing sentinel -1
max_missing = 0.2*data_b.shape[0]
drop_candidates = [*data_b.columns[:2], *data_b.columns[3:84]]
cols_with_NAs = []
for col in drop_candidates:
    if (data_b[col] == -1).sum() > max_missing:
        cols_with_NAs.append(col)
data_b = data_b.drop(columns=cols_with_NAs)

# Columns of interest (positions refer to the layout after the drop), split by cardinality:
# 7 or more distinct values -> continuous, fewer -> categorical
clinical_and_biomarkers = [*data_b.columns[:2], *data_b.columns[3:72], *data_b.columns[214:]]
continuous_cols, categorical = [], []
for col in clinical_and_biomarkers:
    if len(data_b[col].unique()) >= 7:
        continuous_cols.append(col)
    else:
        categorical.append(col)

# The imputer fills NaN, so the -1 sentinel is converted back first
data_b[clinical_and_biomarkers] = data_b[clinical_and_biomarkers].replace(-1, np.nan)

# Impute NAs with IterativeImputer (estimator - RandomForestRegressor).
# NOTE(review): the imputer is fitted on all rows of data_b, i.e. before any train/test split.
impute_estimator = RandomForestRegressor(n_estimators=50, max_depth=5, n_jobs=-1, random_state=0)
imputer = IterativeImputer(estimator=impute_estimator, max_iter=25, random_state=0)
data_b[clinical_and_biomarkers] = imputer.fit_transform(data_b[clinical_and_biomarkers])

# Imputed categorical values are rounded so that no new categories appear
data_b[categorical] = data_b[categorical].values.round()

In [None]:
# For subsets
# Column groups of data_b by position: [0:2] and [3:72] hold the clinical features,
# everything from position 214 on the biomarkers
clinical_b_columns = [*data_b.columns[:2], *data_b.columns[3:72]]
biomarkers_b_columns = [*data_b.columns[214:]]
clinical_and_biomarkers_b_columns = clinical_b_columns + biomarkers_b_columns

#### Dataset C

In [None]:
# Read the Dataset C workbook ('Блок С обезлич.xlsx'); the first two sheet rows form the column MultiIndex
data_c = pd.read_excel('./HSE project/Data/Блок С обезлич.xlsx',header=[0,1])
# drop index column
data_c = data_c.drop(columns=('Unnamed: 0_level_0','№ п/п'))
# replace NAs (NaN, or a cell that holds a single space) with -1
data_c = data_c.fillna(-1)
data_c = data_c.replace(' ',-1)
# data_c.columns[(data_c.dtypes == object).values]
# Modify the 'Пол' feature: here it is stored as 1 / 2 and is re-encoded as 0 / 1
data_c['АНТРОПОФИЗИОМЕТРИЯ', 'Пол'] = data_c['АНТРОПОФИЗИОМЕТРИЯ', 'Пол'].replace([1, 2],[0,1])

# Re-encode the psychological scales: raw integer scores are binned into ordinal classes.
# Each entry is (raw scores, class code); the bins of a scale are applied in the listed order.
# The -1 sentinel is in none of the ranges and therefore stays -1.
score_bins = {
    'Ситуативная тревожность Спилберга (баллы)': [(range(31), 0), (range(31, 46), 1), (range(46,100), 2)],
    'Личная тревожность Спилберга (баллы)': [(range(31), 0), (range(31, 46), 1), (range(46,100), 2)],
    'Опросник депрессии Бека (баллы)': [(range(10), 0), (range(10, 20), 1), (range(20,100), 2)],
    'Шкала AUDIT (баллы)': [(range(16), 0), (range(16, 100), 1)],
}
for scale_name, bins in score_bins.items():
    scale_col = ('ПСИХОСОЦИАЛЬНЫЕ ФАКТОРЫ', scale_name)
    for raw_scores, class_code in bins:
        # Assign the result back: calling .replace(..., inplace=True) on the Series returned
        # by data_c[col] modifies an intermediate object, which pandas does not guarantee to
        # propagate to data_c (and never does under Copy-on-Write).
        data_c[scale_col] = data_c[scale_col].replace(list(raw_scores), class_code)

# Keep the hospitalisation dates in a separate Series before the date columns are removed
hospitalisation_col = ('АНТРОПОФИЗИОМЕТРИЯ', 'Дата госпитализации')
date_c = pd.to_datetime(data_c[hospitalisation_col], format='%Y-%m-%d %H:%M:%S')
data_c = data_c.drop(columns=[('АНТРОПОФИЗИОМЕТРИЯ', 'Дата рождения'), hospitalisation_col])
# Remove the two therapy column groups as a whole
data_c = data_c.drop(columns=['ПЛАНОВАЯ ТЕРАПИЯ ПРИ ВЫПИСКЕ', 'ГОСПИТАЛЬНАЯ ТЕРАПИЯ'])
# cols with features from first discharge (first 153 columns) and biomarkers (positions 419-423)
cols_of_interest = [*data_c.columns[:153], *data_c.columns[419:424]]

# Among the first-discharge features (the last 5 entries, the biomarkers, are exempt),
# find every column in which more than 20% of the rows hold the missing sentinel -1
# and remove it from the list of columns of interest
max_missing = 0.2*data_c.shape[0]
cols_with_NAs = []
for col in cols_of_interest[:-5]:
    if (data_c[col] == -1).sum() > max_missing:
        cols_with_NAs.append(col)
sparse_cols = set(cols_with_NAs)
cols_of_interest = [col for col in cols_of_interest if col not in sparse_cols]

# Split by cardinality: 7 or more distinct values -> continuous, fewer -> categorical
continuous_cols, categorical = [], []
for col in cols_of_interest:
    if len(data_c[col].unique()) >= 7:
        continuous_cols.append(col)
    else:
        categorical.append(col)

# The imputer fills NaN, so the -1 sentinel is converted back first
data_c[cols_of_interest] = data_c[cols_of_interest].replace(-1, np.nan)


# Impute NAs with IterativeImputer (estimator - RandomForestRegressor).
# NOTE(review): the imputer is fitted on all rows of data_c, i.e. before any train/test split.
impute_estimator = RandomForestRegressor(n_estimators=50, max_depth=5, n_jobs=-1, random_state=0)
imputer = IterativeImputer(estimator=impute_estimator, max_iter=25, random_state=0)
data_c[cols_of_interest] = imputer.fit_transform(data_c[cols_of_interest])

# Imputed categorical values are rounded so that no new categories appear
data_c[categorical] = data_c[categorical].values.round()

In [None]:
# For subsets
# The last five entries of cols_of_interest are the biomarkers, everything before them is clinical
clinical_c_columns, biomarkers_c_columns = cols_of_interest[:-5], cols_of_interest[-5:]
clinical_and_biomarkers_c_columns = clinical_c_columns + biomarkers_c_columns

#### Combined Dataset ABC

In [None]:
# Stack the three datasets row-wise and keep only the columns without any NaN;
# a column that is absent from one of the datasets is NaN for its rows and is therefore dropped.
# NOTE(review): data_b and data_c were imputed above, while data_a still encodes missing
# values as -1 at this point, so the combined frame mixes imputed values with -1 sentinels.
data_abc = pd.concat([data_a, data_b, data_c], axis=0).dropna(axis=1)

# The first 56 remaining columns are used as the clinical feature set
cols_of_interest = list(data_abc.columns[:56])

In [None]:
# For subsets
# Alias (same list object) of the 56 columns selected from data_abc in the previous cell;
# cols_of_interest itself is rebound to a new list in the Lancet section.
clinical_abc_columns = cols_of_interest

#### Lancet Dataset ABC

In [None]:
cholesterol_key = ('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Хсобщ, ммоль/л')

# Dataset A plus the 'Хсобщ, ммоль/л' column that was set aside before the sparse columns were dropped
data_a_and_hyperlipidemia = data_a.copy()
data_a_and_hyperlipidemia[cholesterol_key] = hyperlipidemia[cholesterol_key]

# Columns to impute; the extra column is added only if it is not already part of the list
lancet_a_columns = list(clinical_and_biomarkers_a_columns)
if cholesterol_key not in lancet_a_columns:
    lancet_a_columns.append(cholesterol_key)

# data_a still encodes missing values as -1 (its whole-dataset imputation cell is disabled),
# but IterativeImputer only fills NaN. Without this conversion only the extra column would be
# imputed and every -1 would enter the model and the final dataset as if it were a real measurement.
data_a_and_hyperlipidemia[lancet_a_columns] = \
    data_a_and_hyperlipidemia[lancet_a_columns].replace(-1, np.nan)

# Same cardinality rule as for Datasets B and C: fewer than 7 distinct values -> categorical.
# Kept in a local name so that the notebook-level `categorical` list is not overwritten.
lancet_a_categorical = [col for col in lancet_a_columns
                        if len(data_a_and_hyperlipidemia[col].unique()) < 7]

# Impute NAs with IterativeImputer (estimator - RandomForestRegressor)
impute_estimator = RandomForestRegressor( n_estimators=50,
                                          max_depth=5,
                                          n_jobs=-1,
                                          random_state=0)
imputer=  IterativeImputer(
                           random_state=0,
                           estimator=impute_estimator,
                           max_iter=25)

data_a_and_hyperlipidemia[lancet_a_columns] = \
    imputer.fit_transform(data_a_and_hyperlipidemia[lancet_a_columns])

# Imputed categorical values are rounded so that no new categories appear
data_a_and_hyperlipidemia[lancet_a_categorical] = \
    data_a_and_hyperlipidemia[lancet_a_categorical].values.round()

# Stack the three datasets row-wise and keep only the columns without any NaN
data_abc_lancet = pd.concat([data_a_and_hyperlipidemia, data_b, data_c], axis=0)
data_abc_lancet.dropna(axis=1, inplace=True)

# First 56 remaining columns plus the extra column (not listed twice if it is already among them)
cols_of_interest = list(data_abc_lancet.columns)[:56]
if cholesterol_key not in cols_of_interest:
    cols_of_interest.append(cholesterol_key)

In [None]:
# For subsets
# Alias (same list object) of the column list built for data_abc_lancet in the previous cell
clinical_abc_columns_lancet = cols_of_interest

### Cardiovascular death

#### Dataset A

In [121]:
# Seed passed to the train/test split, the imputer (and its estimator) and SMOTENC in this section
random_state = 20
# (group, name) key of the outcome column modelled in this section
target = ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть')
# Output folder; file names are appended to it directly, so the trailing slash is required
path = "./HSE project/Preprocessed Data/cardiovascular death/"

In [196]:
# Add the target column and drop the patients whose outcome is unknown (-1)
features = [*clinical_and_biomarkers_a_columns, target]
dataframe = data_a[features].copy()
dataframe[target] = dataframe[target].replace(-1, np.nan)
# Feature gaps are still encoded as -1 here, so this removes the rows with an unknown
# outcome (and any row that already holds a real NaN in some feature)
dataframe = dataframe.dropna(axis=0, how='any')
# Only now are the remaining -1 sentinels turned into NaN, for the imputer
dataframe = dataframe.replace(-1, np.nan)

In [197]:
# Divide the dataset into train (75%) and test (25%) parts.
# The minority class is oversampled with SMOTENC further down, i.e. the outcome is treated
# as imbalanced. The split is therefore stratified on the target, so that both parts keep
# the class ratio of the full dataset instead of depending on the luck of the shuffle.
X_train, X_test, y_train, y_test = \
train_test_split(dataframe[clinical_and_biomarkers_a_columns],
                 dataframe[target],
                 test_size=0.25,
                 random_state=random_state,
                 shuffle=True,
                 stratify=dataframe[target])

In [198]:
# Make imputation

# IterativeImputer with a RandomForestRegressor estimator, fitted on the training part only
# and then applied unchanged to the test part
impute_estimator = RandomForestRegressor(n_estimators=50, max_depth=5, n_jobs=-1, random_state=random_state)
imputer = IterativeImputer(estimator=impute_estimator, max_iter=25, random_state=random_state)
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Positions of the categorical features (every column except the last one, the target).
# NOTE(review): the cut-off here is "<= 7 distinct values", while the other cells of this
# notebook use "< 7" for categorical and ">= 7" for continuous - confirm which one is intended.
feature_columns = dataframe.columns[:-1]
categorical = [dataframe.columns.get_loc(col) for col in feature_columns if len(dataframe[col].unique()) <= 7]

# Imputed categorical values are rounded (in place) so that no new categories appear
for split_part in (X_train, X_test):
    split_part[:, categorical] = split_part[:, categorical].round()

In [199]:
# Oversample the minority class of the training part with SMOTENC.
# Categorical features are given by position: every feature column (all but the last,
# the target) with at most 7 distinct values in `dataframe`.
categorical_positions = [
    position
    for position, col in enumerate(dataframe.columns[:-1])
    if len(dataframe[col].unique()) <= 7
]
smote = SMOTENC(categorical_features=categorical_positions,
                sampling_strategy='minority',
                n_jobs=-1,
                random_state=random_state)
X_sm, y_sm = smote.fit_resample(X_train, y_train)

In [202]:
# Rebuild a labelled frame from the resampled training arrays and append the target column
train_imputed = pd.DataFrame(X_sm, columns=clinical_and_biomarkers_a_columns)
train_imputed[target] = y_sm

# Write the resampled training set next to the other preprocessed files
train_file = f'{path}train_a.xlsx'
train_imputed.to_excel(train_file)


#### Dataset B

#### Dataset C

### Scaling

In [None]:
# Define scaler (one object, re-fitted for every dataset in turn)
scaler = RobustScaler()

# (frame, candidate columns) pairs in the order A, B, C, ABC, Lancet
frames_to_scale = [
    (data_a, clinical_and_biomarkers_a_columns),
    (data_b, clinical_and_biomarkers_b_columns),
    (data_c, clinical_and_biomarkers_c_columns),
    (data_abc, clinical_abc_columns),
    (data_abc_lancet, clinical_abc_columns_lancet),
]

# Within each frame only the continuous columns (7 or more distinct values) are scaled;
# the scaled values overwrite the originals in place.
# NOTE(review): data_a still contains -1 sentinels when this cell runs in notebook order,
# so they take part in the scaling like ordinary values.
for frame, candidate_cols in frames_to_scale:
    continuous_cols = [col for col in candidate_cols if len(frame[col].unique()) >= 7]
    frame[continuous_cols] = scaler.fit_transform(frame[continuous_cols])