## Import libraries

In [1]:
# Import custom classes
from Preprocessing import Preprocessing

# Import libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# ignore warnings when graphs are plotted
# NOTE(review): filterwarnings('ignore') is called without a category, so it
# silences every warning for the rest of the session, not only plotting ones;
# pandas warnings raised by the cleaning cells below are hidden as well.
import warnings
warnings.filterwarnings('ignore')

## Preselected features

In [2]:
def _load_feature_list(file_name, raw_dir='../Raw data/'):
    """Return the rows of a feature-list workbook as a list of tuples.

    The workbook is read with its first column as the index and its first
    row as the header; every remaining row becomes one tuple. The tuples are
    used further down to select the two-level columns of the cohort frames.

    Parameters
    ----------
    file_name : str
        Workbook name inside ``raw_dir``.
    raw_dir : str
        Folder prefix (including the trailing slash) holding the workbooks.

    Returns
    -------
    list of tuple
        One tuple per workbook row.
    """
    table = pd.read_excel(raw_dir + file_name, index_col=0, header=0)
    return [tuple(row) for row in table.values.tolist()]


# Clinical features selected from every cohort
clinical_features = _load_feature_list('Clinical features.xlsx')

# Biomarker panels of cohorts A, B and C, and the target columns
biomarkers_a = _load_feature_list('biomarkers_a.xlsx')
biomarkers_b = _load_feature_list('biomarkers_b.xlsx')
biomarkers_c = _load_feature_list('biomarkers_c.xlsx')
targets = _load_feature_list('targets_features.xlsx')

continuous = _load_feature_list('continuous_features.xlsx')
# all biomarkers were continuous except 'БСЖК' in data_b

categorical = _load_feature_list('categorical_features.xlsx')
# include 'БСЖК' biomarkers from data_b

## Data editing

In [3]:
# Workbook paths of the four cohorts (a-d); they differ only in the suffix
link_a, link_b, link_c, link_d = (
    '../Raw data/cardio_' + cohort + '.xlsx' for cohort in 'abcd'
)

#### Dataset A

In [4]:
# Read Dataset A from its local workbook; the two header rows become
# two-level column labels
selected_cols_a = clinical_features + biomarkers_a + targets[:5]
data_a = pd.read_excel(link_a, header=[0, 1], index_col=0)[selected_cols_a]

# Only targets[:5] are selected from the workbook; the remaining target
# columns are created here and filled with the -1 placeholder
data_a[targets[5:]] = -1
print('data_a raw shape: ', data_a.shape)

data_a raw shape:  (263, 110)


In [5]:
# Correcting data_a

# Missing cells and single-space cells are both encoded with the -1 placeholder
data_a = data_a.fillna(-1)
data_a = data_a.replace(' ', -1)

# Encode 'Пол': lower-case the labels, then map 'м' -> 0 and 'ж' -> 1
sex_col = ('АНТРОПОФИЗИОМЕТРИЯ', 'Пол')
data_a[sex_col] = data_a[sex_col].str.lower().replace(['м', 'ж'], [0, 1])

# Binarise 'Повторная реваскуляризация': every listed procedure label or date
# string becomes 1 and the string '0' becomes 0.
# The result is assigned back to the frame: calling replace(inplace=True) on a
# column selection is a chained in-place operation, which acts on a temporary
# object and leaves data_a unchanged when pandas Copy-on-Write is active.
revasc_col = ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Повторная реваскуляризация')
revasc_positive = ['ЧКВ',
                   'АКШ',
                   '1899-12-29 00:00:00',
                   'ЧКВ ',
                   'АКШ ',
                   '2018-07-30 00:00:00',
                   '2019-04-15 00:00:00',
                   '2020-08-30 00:00:00']
data_a[revasc_col] = (
    data_a[revasc_col]
    .replace(to_replace=revasc_positive, value=1)
    .replace(to_replace=['0'], value=0)
)

In [6]:
# Find columns

# Flag the clinical features and A-panel biomarkers in which the -1
# placeholder fills more than threshold (20%) of the rows.
# The flagged columns are only listed here; they are not dropped.

threshold = 0.2
candidate_cols_a = clinical_features + biomarkers_a
max_missing_rows_a = threshold*data_a.shape[0]
cols_with_NAs = [col for col in candidate_cols_a
                 if (data_a[col] == -1).sum() > max_missing_rows_a]
cols_with_NAs

[('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'Хсобщ, ммоль/л'),
 ('ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ', 'ТГц, ммоль/л')]

In [7]:
# Percentage of -1 placeholders in each flagged column; the denominator is
# taken from the frame itself so it stays correct if the row count changes
(data_a[cols_with_NAs] == -1).sum() / data_a.shape[0] * 100

                         № п/п         
ЛАБОРАТОРНЫЕ ПОКАЗАТЕЛИ  Хсобщ, ммоль/л    25.095057
                         ТГц, ммоль/л      25.095057
dtype: float64

#### Dataset B

In [8]:
# Read Dataset B from its local workbook; the two header rows become
# two-level column labels
selected_cols_b = clinical_features + biomarkers_b + targets
data_b = pd.read_excel(link_b, header=[0, 1], index_col=0)[selected_cols_b]
print('data_b raw shape: ', data_b.shape)

data_b raw shape:  (109, 50)


In [9]:
# Correcting data_b

# Missing cells and single-space cells are both encoded with the -1 placeholder
data_b = data_b.fillna(-1).replace(' ', -1)

# Encode 'Пол': lower-case the labels, then map 'м' -> 0 and 'ж' -> 1
sex_col_b = ('АНТРОПОФИЗИОМЕТРИЯ', 'Пол')
data_b[sex_col_b] = data_b[sex_col_b].str.lower().replace(['м', 'ж'], [0, 1])

# Across the whole frame, turn the listed procedure labels, the
# '1899-12-29 00:00:00' string and three timestamps into 1
revasc_timestamps_b = [
    pd.to_datetime(stamp, format='%Y-%m-%d %H:%M:%S')
    for stamp in ('2018-07-30 00:00:00',
                  '2019-04-15 00:00:00',
                  '2020-08-30 00:00:00')
]
revasc_labels_b = ['ЧКВ', 'АКШ', '1899-12-29 00:00:00', 'ЧКВ ', 'АКШ ']
data_b = data_b.replace(to_replace=revasc_labels_b + revasc_timestamps_b,
                        value=1)

In [10]:
# Find columns

# Flag the clinical features and B-panel biomarkers in which the -1
# placeholder fills more than 20% of the rows
candidate_cols_b = clinical_features + biomarkers_b
max_missing_rows_b = 0.2*data_b.shape[0]
cols_with_NAs = [col for col in candidate_cols_b
                 if (data_b[col] == -1).sum() > max_missing_rows_b]

In [11]:
# Percentage of -1 placeholders in each flagged column; the denominator is
# taken from the frame itself so it stays correct if the row count changes
(data_b[cols_with_NAs] == -1).sum() / data_b.shape[0] * 100

ПАСПОРТНЫЕ ДАННЫЕ ПАЦИЕНТА  № п/п           
БИОМАРКЕРЫ БЛОК В           БСЖК-2              21.100917
                            hsТnT-2, пг/мл.1    22.935780
                            MG-2, нг/мл         23.853211
dtype: float64

#### Dataset C

In [12]:
# Read Dataset C from its local workbook; the two header rows become
# two-level column labels
selected_cols_c = clinical_features + biomarkers_c + targets
data_c = pd.read_excel(link_c, header=[0, 1], index_col=0)[selected_cols_c]
print('data_c raw shape: ', data_c.shape)

data_c raw shape:  (129, 49)


In [13]:
# Correcting data_c

# Missing cells and single-space cells are both encoded with the -1 placeholder
data_c = data_c.fillna(-1).replace(' ', -1)

# Recode 'Пол' from the numeric codes 1 / 2 to 0 / 1
sex_col_c = ('АНТРОПОФИЗИОМЕТРИЯ', 'Пол')
data_c[sex_col_c] = data_c[sex_col_c].replace({1: 0, 2: 1})

In [14]:
# Find columns

# Flag the clinical features and C-panel biomarkers in which the -1
# placeholder fills more than 20% of the rows
candidate_cols_c = clinical_features + biomarkers_c
max_missing_rows_c = 0.2*data_c.shape[0]
cols_with_NAs = [col for col in candidate_cols_c
                 if (data_c[col] == -1).sum() > max_missing_rows_c]

In [15]:
# Percentage of -1 placeholders in each flagged column; the denominator is
# taken from the frame itself so it stays correct if the row count changes
(data_c[cols_with_NAs] == -1).sum() / data_c.shape[0] * 100

            № п/п             
БИОМАРКЕРЫ  Галектин-3 (нг/мл)    26.356589
            MMP-9 (нг/мл)         26.356589
            ST2 (нг/мл)           26.356589
            PCSK9 (нг/мл)         26.356589
            Копептин (нг/мл)      26.356589
dtype: float64

#### Dataset D

In [16]:
# Read Dataset D from its local workbook; the two header rows become
# two-level column labels. Only clinical features and targets are selected.
selected_cols_d = clinical_features + targets
data_d = pd.read_excel(link_d, header=[0, 1], index_col=0)[selected_cols_d]
print('data_d raw shape: ', data_d.shape)

data_d raw shape:  (113, 44)


In [17]:
# Correcting data_d

# Missing cells and single-space cells are both encoded with the -1 placeholder
data_d = data_d.fillna(-1).replace(' ', -1)

# Recode 'Пол' from the numeric codes 1 / 2 to 0 / 1
sex_col_d = ('АНТРОПОФИЗИОМЕТРИЯ', 'Пол')
data_d[sex_col_d] = data_d[sex_col_d].replace({1: 0, 2: 1})

In [18]:
# Flag the clinical features in which the -1 placeholder fills more than 20%
# of the rows of cohort D
max_missing_rows_d = 0.2*data_d.shape[0]
cols_with_NAs = [col for col in clinical_features
                 if (data_d[col] == -1).sum() > max_missing_rows_d]
cols_with_NAs

[]

#### Combined Dataset ABCD

In [19]:
# Stack the clinical features and targets of all four cohorts row-wise,
# then drop every column that still contains a NaN after concatenation
shared_cols = clinical_features + targets
data_abcd = pd.concat(
    [cohort[shared_cols] for cohort in (data_a, data_b, data_c, data_d)],
    axis=0,
).dropna(axis=1)

print(data_abcd.shape)

(614, 44)


In [20]:
# Flag every column of the combined frame in which the -1 placeholder fills
# more than 20% of the rows
max_missing_rows_abcd = 0.2*data_abcd.shape[0]
cols_with_NAs = [col for col in data_abcd.columns
                 if (data_abcd[col] == -1).sum() > max_missing_rows_abcd]
cols_with_NAs

[('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Сердечно-сосудистая смерть'),
 ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Реинфаркт'),
 ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'Повторная реваскуляризация'),
 ('КОНЕЧНЫЕ ИСХОДЫ НАБЛЮДЕНИЯ', 'ОНМК'),
 ('КОНТРОЛЬНЫЕ КЛИНИЧЕСКИЕ ИСХОДЫ', 'Сердечно-сосудистая смерть'),
 ('КОНТРОЛЬНЫЕ КЛИНИЧЕСКИЕ ИСХОДЫ', 'Реинфаркт'),
 ('КОНТРОЛЬНЫЕ КЛИНИЧЕСКИЕ ИСХОДЫ', 'Повторная реваскуляризация'),
 ('КОНТРОЛЬНЫЕ КЛИНИЧЕСКИЕ ИСХОДЫ', 'ОНМК')]

In [21]:
# Percentage of -1 placeholders in every column of the combined frame; the
# denominator is taken from the frame itself so it stays correct if the row
# count changes
(data_abcd == -1).sum() / data_abcd.shape[0] * 100

                                       № п/п                     
АНТРОПОФИЗИОМЕТРИЯ                     Пол                            0.000000
                                       Возраст                        0.000000
                                       ИМТ                            0.000000
                                       систол. АД                     0.000000
                                       ЧСС                            0.000000
ХАРАКТЕРИСТИКА ОИМ                     Давность болевого синдрома     0.651466
                                       Cегмент ST                     0.651466
                                       Передняя стенка ЛЖ             6.351792
                                       Боковая стенка ЛЖ              6.351792
                                       Класс Killip                   0.977199
                                       Риск GRACE, баллы              0.814332
СОПУТСТВУЮЩИЕ ЗАБОЛЕВАНИЯ И СОСТОЯНИЯ  пост-ИМ                   

## Preprocessing

In [22]:
# Instance of the project-local Preprocessing class imported at the top of the
# notebook; its process() and get_combined_target_column() methods are called
# in the cells below
preprocessing = Preprocessing()

In [23]:
# Folder handed to every preprocessing.process(...) call below as `path`
results_path = '../Preprocessed data/Combined target/Non-imputed data/'

#### Dataset A

In [24]:
# Preprocess cohort A: clinical features plus the A biomarker panel, with
# every A biomarker listed among the continuous columns
features_a = clinical_features + biomarkers_a
continuous_a = continuous + biomarkers_a

_, _ = preprocessing.process(
    data=data_a,
    name='a',
    target=('target', 'combined'),
    dataset_features=features_a,
    continuous_cols=continuous_a,
    test_size=0.25,
    path=results_path,
    save_before_split=False,
    download=False,
)

Train shape:	 (142, 102)
Train target:
 1.0    72
0.0    70
Name: (target, combined), dtype: int64


Test shape:	 (48, 102)
Test target:
 1.0    25
0.0    23
Name: (target, combined), dtype: int64

In [25]:
# Class counts of the combined target computed from cohort A
combined_target_a = preprocessing.get_combined_target_column(data_a)
combined_target_a.value_counts()

1.0    97
0.0    93
dtype: int64

#### Dataset B

In [26]:
# Preprocess cohort B: clinical features plus the B biomarker panel.
# The first two biomarkers_b entries are left out of the continuous columns —
# presumably the 'БСЖК' markers noted as non-continuous where the feature
# lists are loaded; TODO confirm.
features_b = clinical_features + biomarkers_b
continuous_b = continuous + biomarkers_b[2:]

_, _ = preprocessing.process(
    data=data_b,
    name='b',
    target=('target', 'combined'),
    dataset_features=features_b,
    continuous_cols=continuous_b,
    test_size=0.25,
    path=results_path,
    save_before_split=False,
    download=False,
)

Train shape:	 (67, 42)
Train target:
 0.0    51
1.0    16
Name: (target, combined), dtype: int64


Test shape:	 (23, 42)
Test target:
 0.0    17
1.0     6
Name: (target, combined), dtype: int64

In [27]:
# Class counts of the combined target computed from cohort B
combined_target_b = preprocessing.get_combined_target_column(data_b)
combined_target_b.value_counts()

0.0    68
1.0    22
dtype: int64

#### Dataset C

In [28]:
# Preprocess cohort C: clinical features plus the C biomarker panel, with
# every C biomarker listed among the continuous columns
features_c = clinical_features + biomarkers_c
continuous_c = continuous + biomarkers_c

_, _ = preprocessing.process(
    data=data_c,
    name='c',
    target=('target', 'combined'),
    dataset_features=features_c,
    continuous_cols=continuous_c,
    test_size=0.25,
    path=results_path,
    save_before_split=False,
    download=False,
)

Train shape:	 (96, 41)
Train target:
 0.0    83
1.0    13
Name: (target, combined), dtype: int64


Test shape:	 (32, 41)
Test target:
 0.0    27
1.0     5
Name: (target, combined), dtype: int64

In [29]:
# Class counts of the combined target computed from cohort C
combined_target_c = preprocessing.get_combined_target_column(data_c)
combined_target_c.value_counts()

0.0    110
1.0     18
dtype: int64

#### Dataset D

In [30]:
# Preprocess cohort D, which is run with the clinical features only
process_options_d = dict(
    data=data_d,
    name='d',
    target=('target', 'combined'),
    dataset_features=clinical_features,
    continuous_cols=continuous,
    test_size=0.25,
    path=results_path,
    save_before_split=False,
    download=False,
)
_, _ = preprocessing.process(**process_options_d)

Train shape:	 (57, 36)
Train target:
 0.0    33
1.0    24
Name: (target, combined), dtype: int64


Test shape:	 (19, 36)
Test target:
 0.0    11
1.0     8
Name: (target, combined), dtype: int64

In [31]:
# Class counts of the combined target computed from cohort D
combined_target_d = preprocessing.get_combined_target_column(data_d)
combined_target_d.value_counts()

0.0    44
1.0    32
dtype: int64

#### Combined Dataset ABCD

In [32]:
# Preprocess the combined ABCD frame, which is run with the clinical
# features only
process_options_abcd = dict(
    data=data_abcd,
    name='abcd',
    target=('target', 'combined'),
    dataset_features=clinical_features,
    continuous_cols=continuous,
    test_size=0.25,
    path=results_path,
    save_before_split=False,
    download=False,
)
_, _ = preprocessing.process(**process_options_abcd)

Train shape:	 (363, 36)
Train target:
 0.0    236
1.0    127
Name: (target, combined), dtype: int64


Test shape:	 (121, 36)
Test target:
 0.0    79
1.0    42
Name: (target, combined), dtype: int64

In [33]:
# Class counts of the combined target computed from the combined ABCD frame
combined_target_abcd = preprocessing.get_combined_target_column(data_abcd)
combined_target_abcd.value_counts()

0.0    315
1.0    169
dtype: int64

## References and info