## Import libraries

In [1]:
# Import custom classes
from Imputing import Imputing

# Import libraries

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import sklearn
import plotly


# metrics
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as TP_rate                          
from sklearn.metrics import roc_auc_score as roc_auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score as recall
from sklearn.metrics import average_precision_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer,fbeta_score

# classifiers
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostRegressor, RandomForestRegressor, GradientBoostingClassifier, StackingClassifier, VotingClassifier #
from sklearn.tree import DecisionTreeClassifier     #
from sklearn.svm import SVC                                    # both linear and radial classification
from sklearn.neighbors import KNeighborsClassifier             # k=3
from sklearn.linear_model import LogisticRegression
# import catboost
# from catboost import CatBoostClassifier


# imputations
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.base import clone

import ast

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): the filter is global, so it also hides any warnings raised by
# the imputation cells below (e.g. convergence warnings); no plotting is
# visible in this part of the notebook — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

## Preselected features and preprocessed data

In [2]:
def _read_feature_tuples(path):
    """Return the rows of the Excel sheet at ``path`` as a list of tuples.

    The first column is used as the index and the first row as the header, so
    neither ends up in the returned tuples. ``.tolist()`` converts every cell
    to a plain Python object before the tuples are built, so all feature
    lists are constructed the same way.
    """
    rows = pd.read_excel(path, index_col=0, header=0).values.tolist()
    return [tuple(row) for row in rows]


# Feature lists; they are concatenated and passed as `dataset_features` /
# `categorical_cols` to imputation.process(...) further down.
# presumably each tuple is a two-level column label of the data frames — verify
clinical_features = _read_feature_tuples('../Raw data/Clinical features.xlsx')

biomarkers_a = _read_feature_tuples('../Raw data/biomarkers_a.xlsx')
biomarkers_b = _read_feature_tuples('../Raw data/biomarkers_b.xlsx')
biomarkers_c = _read_feature_tuples('../Raw data/biomarkers_c.xlsx')
targets = _read_feature_tuples('../Raw data/targets_features.xlsx')

continuous = _read_feature_tuples('../Raw data/continuous_features.xlsx')
# all biomarkers were continuous except 'БСЖК' in data_b

categorical = _read_feature_tuples('../Raw data/categorical_features.xlsx')
# include 'БСЖК' biomarkers from data_b

In [3]:
# Load the non-imputed train/test split of every dataset (a, b, c, d and the
# combined abcd set).
def _load_split(split, dataset):
    """Read one non-imputed workbook, e.g. ``_load_split('train', 'a')``.

    The first column becomes the index and the first two rows form a
    two-level column header.
    """
    workbook = f'../Preprocessed data/Combined target/Non-imputed data/{split}_{dataset}.xlsx'
    return pd.read_excel(workbook, index_col=0, header=[0, 1])


test_a = _load_split('test', 'a')
train_a = _load_split('train', 'a')
test_b = _load_split('test', 'b')
train_b = _load_split('train', 'b')
test_c = _load_split('test', 'c')
train_c = _load_split('train', 'c')
test_d = _load_split('test', 'd')
train_d = _load_split('train', 'd')
test_abcd = _load_split('test', 'abcd')
train_abcd = _load_split('train', 'abcd')

## Imputing

In [4]:
# Instance of the project-local Imputing class (imported from the Imputing
# module); its process(...) method is called once per dataset below.
imputation = Imputing()

In [5]:
# Impute NAs with two IterativeImputer instances, both seeded with 20 and
# limited to 25 imputation rounds.
# NOTE(review): no Random Forest regressor is configured here — the
# non-categorical imputer keeps scikit-learn's default estimator and the
# categorical one uses LogisticRegression; confirm this is intended.
_imputer_settings = dict(random_state=20, max_iter=25)

# Passed as `noncat_imputer` to imputation.process(...).
noncat_imputer = IterativeImputer(**_imputer_settings)

# Classifier used as the per-feature model of the categorical imputer.
impute_estimator = LogisticRegression(random_state=20)

# Passed as `cat_imputer` to imputation.process(...).
cat_imputer = IterativeImputer(
    estimator=impute_estimator,
    skip_complete=True,
    **_imputer_settings,
)

In [6]:
# Directory handed to every imputation.process(...) call below as `path`.
# presumably where imputed data is written when download=True — verify in Imputing
results_path = '../Preprocessed data/Combined target/Imputed data/'

#### Dataset A

In [7]:
# Dataset A: clinical features plus the panel-A biomarkers.
# Both returned values are discarded; only the printed summary is kept.
_, _ = imputation.process(
    data=dict(train=train_a, test=test_a),
    noncat_imputer=noncat_imputer,
    cat_imputer=cat_imputer,
    target=('target', 'combined'),
    path=results_path,
    dataset_features=clinical_features + biomarkers_a,
    download=False,
    name='a',
    categorical_cols=categorical,
)

Train shape:	 (142, 102)
Train target:
 1    72
0    70
Name: (target, combined), dtype: int64


Test shape:	 (48, 102)
Test target:
 1    25
0    23
Name: (target, combined), dtype: int64

#### Dataset B

In [8]:
# Dataset B: clinical features plus the panel-B biomarkers.
# The first two entries of biomarkers_b are additionally treated as
# categorical (presumably the 'БСЖК' columns — verify against biomarkers_b.xlsx).
# Both returned values are discarded; only the printed summary is kept.
_, _ = imputation.process(
    data=dict(train=train_b, test=test_b),
    noncat_imputer=noncat_imputer,
    cat_imputer=cat_imputer,
    target=('target', 'combined'),
    path=results_path,
    dataset_features=clinical_features + biomarkers_b,
    download=False,
    name='b',
    categorical_cols=categorical + biomarkers_b[:2],
)

Train shape:	 (67, 42)
Train target:
 0    51
1    16
Name: (target, combined), dtype: int64


Test shape:	 (23, 42)
Test target:
 0    17
1     6
Name: (target, combined), dtype: int64

#### Dataset C

In [9]:
# Dataset C: clinical features plus the panel-C biomarkers.
# Both returned values are discarded; only the printed summary is kept.
_, _ = imputation.process(
    data=dict(train=train_c, test=test_c),
    noncat_imputer=noncat_imputer,
    cat_imputer=cat_imputer,
    target=('target', 'combined'),
    path=results_path,
    dataset_features=clinical_features + biomarkers_c,
    download=False,
    name='c',
    categorical_cols=categorical,
)

Train shape:	 (96, 41)
Train target:
 0    83
1    13
Name: (target, combined), dtype: int64


Test shape:	 (32, 41)
Test target:
 0    27
1     5
Name: (target, combined), dtype: int64

#### Dataset D

In [10]:
# Dataset D: clinical features only (no biomarker panel is added).
# Both returned values are discarded; only the printed summary is kept.
_, _ = imputation.process(
    data=dict(train=train_d, test=test_d),
    noncat_imputer=noncat_imputer,
    cat_imputer=cat_imputer,
    target=('target', 'combined'),
    path=results_path,
    dataset_features=clinical_features,
    download=False,
    name='d',
    categorical_cols=categorical,
)

Train shape:	 (57, 36)
Train target:
 0    33
1    24
Name: (target, combined), dtype: int64


Test shape:	 (19, 36)
Test target:
 0    11
1     8
Name: (target, combined), dtype: int64

#### Combined Dataset ABCD

In [11]:
# Combined dataset ABCD: clinical features only (no biomarker panel is added).
# Both returned values are discarded; only the printed summary is kept.
_, _ = imputation.process(
    data=dict(train=train_abcd, test=test_abcd),
    noncat_imputer=noncat_imputer,
    cat_imputer=cat_imputer,
    target=('target', 'combined'),
    path=results_path,
    dataset_features=clinical_features,
    download=False,
    name='abcd',
    categorical_cols=categorical,
)

Train shape:	 (363, 36)
Train target:
 0    236
1    127
Name: (target, combined), dtype: int64


Test shape:	 (121, 36)
Test target:
 0    79
1    42
Name: (target, combined), dtype: int64

## References and info