In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

from sklearn.model_selection import train_test_split, cross_validate
from sklearn import metrics
from sklearn.model_selection import StratifiedShuffleSplit, KFold
from sklearn.model_selection import GridSearchCV
from scikitplot.metrics import plot_roc_curve

from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
import lightgbm
import catboost
import sklearn.svm

import warnings
warnings.filterwarnings(action="ignore")

In [2]:
data_all_old = pd.read_excel('Data/data_2_samples_NEW.xlsx', index_col=0)
# Drop sample 1
data_all_old = data_all_old[data_all_old['sample']==0]
# Reneme coluns the same as in new data
data_all_old.drop(['sample', 'AGT_AGTR2'], axis=1, inplace=True)
data_all_old.rename(columns={'stage_agr': 'stage'}, inplace=True)
# Change order of columns (other order causes error in xgboost)
data_all_old = data_all_old[['gender', 'age', 'bmi', 'ao','activity', 
                             'smoking', 'male_heredity', 'stage']]
data_all_old.head()

Unnamed: 0,gender,age,bmi,ao,activity,smoking,male_heredity,stage
0,0,0,0,0,0,0,1,1
1,1,1,0,0,1,1,0,1
2,1,1,0,1,1,1,0,1
3,1,0,1,1,1,0,0,1
4,0,1,1,1,0,0,0,1


In [3]:
data_old = pd.read_csv('Data\data_decode_old.csv', index_col=0).drop('alcohol', axis=1)
data_old.head()

Unnamed: 0_level_0,gender,age,bmi,ao,activity,smoking,male_heredity,stage
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,1,0,1,1,1,0,1
2,1,0,1,1,1,0,1,1
3,0,1,0,1,1,1,0,1
4,0,1,0,0,1,0,1,0
5,1,1,1,1,0,1,1,1


In [4]:
data_new = pd.read_csv('Data\data_decode_new.csv', index_col=0).drop('alcohol', axis=1)
data_new.head()

Unnamed: 0_level_0,gender,age,bmi,ao,activity,smoking,male_heredity,stage
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,1,0,0,0,1,0,1
2,1,0,0,1,1,0,1,1
3,0,1,1,1,0,0,0,1
4,0,1,0,1,0,0,1,0
5,1,1,0,1,0,1,1,1


## Train models
- Train - old data
- Validate - old data
- Test - new data

In [5]:
# (X_train, X_valid,
# y_train, y_valid) = train_test_split(data_old.drop(['stage'], axis=1),
#                                     data_old['stage'], test_size=0.3,
#                                     random_state=42, stratify=data_old['stage'])

In [6]:
(X_train, X_valid,
 y_train, y_valid) = train_test_split(data_all_old.drop(['stage'], axis=1),
                                     data_all_old['stage'], test_size=0.3,
                                     random_state=42, stratify=data_all_old['stage'])

In [30]:
def plot_for_treshhold(y, y_prob, plot=True):
    x1 = np.linspace(0.1, 0.9, 20)
    y1 = []
    y2 = []
    y3 = []
    for th in x1:
        y_pred = list(map(lambda x: 1 if x > th else 0, y_prob))
        y1.append(metrics.f1_score(y, y_pred))
        y2.append(metrics.recall_score(y, y_pred))
        y3.append(metrics.precision_score(y, y_pred))
    
    max_in = x1[np.argmax(y1)]
    #print(f'Treseshold: {max_in:.5}')
    if plot is False:
        return max_in
    
    plt.figure(figsize=(12, 10))
    plt.plot(x1, y1, label='f1')
    plt.plot(x1, y2, label='req')
    plt.plot(x1, y3, label='pres')
    plt.legend()
    plt.show()
    
    return max_in

In [31]:
def get_metrics(y_test, y_prob, y_pred):
    roc_test = metrics.roc_auc_score(y_test, y_prob)
    acc = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    pres = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    return roc_test,acc, f1, pres, rec

In [32]:
def use_classificator(cls, X_train, X_test, y_train, y_test,
                      threeshold=0.5, choose_th=False, 
                      print_info=True, plot_th=False):
    cls.fit(X_train, y_train)
    if choose_th or plot_th:
        y_prob_train = cls.predict_proba(X_train)[:, 1]
        tmp = plot_for_treshhold(y_train, y_prob_train, plot_th)
        if choose_th:
            threeshold = tmp
        
    y_prob = cls.predict_proba(X_test)[:, 1]        
    y_pred = list(map(lambda x: 1 if x > threeshold else 0, y_prob))
    
    roc_train = metrics.roc_auc_score(y_train, cls.predict_proba(X_train)[:, 1])
    roc_test, acc, f1, pres, rec = get_metrics(y_test, y_prob, y_pred)
    
    info = (f'ROC-AUC score TRAIN: {roc_train:.2}' + '\n' + 
            f'ROC-AUC score VALID: {roc_test:.2}' + '\n' + 
            f'Accuracy score: {acc:.2}' + '\n' + 
            f'F1 score: {f1:.2}' + '\n' + 
            f'Precision score: {pres:.2}' + '\n' + 
            f'Recall score: {rec:.2}')
    
    if print_info:
        print(info)
    
    return threeshold, cls, info, y_pred

In [33]:
def test_model(cls, X_test, y_test, threeshold):
    y_prob = cls.predict_proba(X_test)[:, 1]
    y_pred = list(map(lambda x: 1 if x > threeshold else 0, y_prob))
    roc_test, acc, f1, pres, rec = get_metrics(y_test, y_prob, y_pred)
    
    info = (f'ROC-AUC score TEST: {roc_test:.2}' + '\n' + 
            f'Accuracy score: {acc:.2}' + '\n' + 
            f'F1 score: {f1:.2}' + '\n' + 
            f'Precision score: {pres:.2}' + '\n' + 
            f'Recall score: {rec:.2}')
    return info, y_pred

# List of models that I will use:
- Logistic Regression
- Decision Tree
- Random Forest
- Support Vector Classification
- CatBoost
- XGBoost
- LightCBM

In [34]:
list_of_cls = [LogisticRegressionCV(class_weight={0: 1, 1: 3}, solver='liblinear', random_state=42),
               DecisionTreeClassifier(max_depth=5, class_weight={0:1, 1:3}, random_state=42),
               RandomForestClassifier(random_state=42, max_depth=2, class_weight={0:1, 1:3}),
               sklearn.svm.SVC(kernel='poly', degree=5, probability=True, random_state=42),
               xgboost.XGBClassifier(n_estimators=150, max_depth=2, random_state=42),
               lightgbm.LGBMClassifier(random_state=42, max_depth=5, learning_rate=0.03),
               catboost.CatBoostClassifier(silent=True, depth=2)    
]
names = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM', 'XGBoost', 'LightGBM', 'CatBoost']

In [35]:
table = []
for name, cls in zip(names, list_of_cls):
    th, cls_fit, res_old, valid_pred = use_classificator(cls, X, X_valid,
                              y, y_valid, choose_th=True, print_info=False)
    res_new, test_pred = test_model(cls_fit, data_new.drop('stage', axis=1), data_new['stage'], th)
    
    crosstab_valid = str(pd.crosstab(y_valid, pd.Series(valid_pred)))
    crosstab_test = str(pd.crosstab(data_new['stage'], pd.Series(test_pred)))
    
    table += [[name + f'\nThreeshold - {th:.4}', res_old, res_new, crosstab_valid, crosstab_test]]
print(tabulate(table, headers=['', 'Old Data', 'New Data', 'Crosstab Validate', 'Crosstab Test'], tablefmt='fancy_grid'))

╒═════════════════════╤═══════════════════════════╤══════════════════════════╤═════════════════════╤═════════════════╕
│                     │ Old Data                  │ New Data                 │ Crosstab Validate   │ Crosstab Test   │
╞═════════════════════╪═══════════════════════════╪══════════════════════════╪═════════════════════╪═════════════════╡
│ Logistic Regression │ ROC-AUC score TRAIN: 0.78 │ ROC-AUC score TEST: 0.76 │ col_0   0   1       │ col_0   0    1  │
│ Threeshold - 0.6895 │ ROC-AUC score VALID: 0.74 │ Accuracy score: 0.71     │ stage               │ stage           │
│                     │ Accuracy score: 0.74      │ F1 score: 0.81           │ 0       2   8       │ 0      16   79  │
│                     │ F1 score: 0.82            │ Precision score: 0.72    │ 1      10  25       │ 1      26  163  │
│                     │ Precision score: 0.75     │ Recall score: 0.92       │                     │                 │
│                     │ Recall score: 0.92      