In [196]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

from sklearn.model_selection import train_test_split, cross_validate
from sklearn import metrics
from sklearn.model_selection import StratifiedShuffleSplit, KFold
from sklearn.model_selection import GridSearchCV
from scikitplot.metrics import plot_roc_curve

from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
import lightgbm
import catboost
import sklearn.svm

import warnings
warnings.filterwarnings(action="ignore")

In [2]:
# Source spreadsheet; its first column is used as the DataFrame index.
DATA_FILE = 'data_2_samples_NEW.xlsx'
data = pd.read_excel(DATA_FILE, index_col=0)

In [3]:
# Preview the first rows to check that the columns loaded as expected.
data.head()

Unnamed: 0,gender,age,bmi,ao,activity,male_heredity,smoking,AGT_AGTR2,stage_agr,sample
0,0,0,0,0,0,1,0,0,1,0
1,1,1,0,0,1,0,1,0,1,0
2,1,1,0,1,1,0,1,0,1,0
3,1,0,1,1,1,0,0,1,1,0
4,0,1,1,1,0,0,0,0,1,0


In [202]:
# Summary statistics for every column.
# NOTE(review): the recorded output below shows `activity` reaching 2.0 while
# every other column has a maximum of 1.0 — confirm whether that is a valid
# third level or a data-entry error before modelling.
data.describe()

Unnamed: 0,gender,age,bmi,ao,activity,male_heredity,smoking,AGT_AGTR2,stage_agr,sample
count,788.0,788.0,788.0,788.0,788.0,788.0,788.0,788.0,788.0,788.0
mean,0.502538,0.637056,0.341371,0.479695,0.548223,0.126904,0.376904,0.227157,0.676396,0.252538
std,0.500311,0.481154,0.474471,0.499905,0.50053,0.333076,0.484918,0.419261,0.468148,0.434744
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Hold out 33% of the rows for testing. The split is stratified on the target
# (`stage_agr`) and seeded, so it is reproducible; `sample` is dropped from
# the features together with the target itself.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['stage_agr', 'sample']),
    data['stage_agr'],
    test_size=0.33,
    random_state=42,
    stratify=data['stage_agr'],
)

In [198]:
def plot_for_treshhold(y, y_prob, plot=True):
    """Scan decision thresholds and return the one with the best F1 score.

    F1, recall and precision are evaluated for 100 evenly spaced thresholds
    between 0.1 and 0.9. A sample is predicted positive (1) when its
    probability is strictly greater than the threshold, otherwise 0.

    Parameters
    ----------
    y : array-like
        True binary labels.
    y_prob : array-like
        Predicted scores/probabilities for the positive class, aligned with
        ``y``.
    plot : bool, default True
        When truthy, also draw the three metric curves against the threshold.

    Returns
    -------
    float
        The threshold with the highest F1 score. On ties the lowest such
        threshold is returned, because ``np.argmax`` picks the first maximum.
        The value is returned whether or not the curves are plotted.
    """
    thresholds = np.linspace(0.1, 0.9, 100)
    probs = np.asarray(y_prob)

    f1_scores = []
    recalls = []
    precisions = []
    for th in thresholds:
        # Vectorised equivalent of "1 if p > th else 0" for every sample.
        y_pred = (probs > th).astype(int)
        f1_scores.append(metrics.f1_score(y, y_pred))
        recalls.append(metrics.recall_score(y, y_pred))
        precisions.append(metrics.precision_score(y, y_pred))

    best_threshold = thresholds[np.argmax(f1_scores)]

    if plot:
        plt.figure(figsize=(12, 10))
        plt.plot(thresholds, f1_scores, label='f1')
        plt.plot(thresholds, recalls, label='recall')
        plt.plot(thresholds, precisions, label='precision')
        plt.xlabel('threshold')
        plt.ylabel('score')
        plt.legend()
        plt.show()

    return best_threshold

In [193]:
def use_classificator(cls, X_train, X_test, y_train, y_test,
                      threeshold=0.5, choose_th=False, print_info=True):
    """Fit a classifier and summarise its train/test performance.

    Parameters
    ----------
    cls : estimator
        Object exposing ``fit`` and ``predict_proba``. It is fitted in place
        on the training data.
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Binary targets for the training and test splits.
    threeshold : float, default 0.5
        Decision threshold applied to the test probabilities. Ignored when
        ``choose_th`` is true.
    choose_th : bool, default False
        When true, the threshold is replaced by the value that
        ``plot_for_treshhold`` selects from the *training* probabilities.
    print_info : bool, default True
        When true, print the metric summary.

    Returns
    -------
    tuple
        ``(cls, info)``: the fitted estimator and a multi-line string with
        train/test ROC-AUC plus test accuracy, F1, precision and recall, each
        formatted with two decimals.
    """
    cls.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the positive-class score.
    # Computed once: it feeds both threshold selection and the train ROC-AUC.
    y_prob_train = cls.predict_proba(X_train)[:, 1]
    if choose_th:
        threeshold = plot_for_treshhold(y_train, y_prob_train, False)

    y_prob = np.asarray(cls.predict_proba(X_test)[:, 1])
    # Strictly-greater comparison, matching the rule used to pick the threshold.
    y_pred = (y_prob > threeshold).astype(int)

    roc_train = metrics.roc_auc_score(y_train, y_prob_train)
    roc_test = metrics.roc_auc_score(y_test, y_prob)
    acc = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    pres = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)

    # ".2f" fixes the number of decimals; the bare ".2" used previously means
    # two significant digits and rendered 0.80 as "0.8".
    info = '\n'.join([
        f'ROC-AUC score TRAIN: {roc_train:.2f}',
        f'ROC-AUC score TEST: {roc_test:.2f}',
        f'Accuracy score: {acc:.2f}',
        f'F1 score: {f1:.2f}',
        f'Precision score: {pres:.2f}',
        f'Recall score: {rec:.2f}',
    ])

    if print_info:
        print(info)

    return cls, info

# List of models that I will use:
- Logistic Regression
- Decision Tree
- Random Forest
- Support Vector Classification
- CatBoost
- XGBoost
- LightGBM

In [180]:
# Each display name is paired with its estimator so the two lists derived
# below can never drift out of alignment.
_named_models = [
    ('Logistic Regression',
     LogisticRegressionCV(class_weight={0: 1, 1: 3}, solver='liblinear', random_state=42)),
    ('Decision Tree',
     DecisionTreeClassifier(max_depth=5, class_weight={0: 1, 1: 3}, random_state=42)),
    ('Random Forest',
     RandomForestClassifier(random_state=42, max_depth=2, class_weight={0: 1, 1: 3})),
    ('SVM',
     sklearn.svm.SVC(kernel='poly', degree=5, probability=True, random_state=42)),
    ('XGBoost',
     xgboost.XGBClassifier(n_estimators=150, max_depth=2, random_state=42)),
    ('LightGBM',
     lightgbm.LGBMClassifier(random_state=42, max_depth=5, learning_rate=0.03)),
    ('CatBoost',
     catboost.CatBoostClassifier(silent=True, depth=2)),
]
list_of_cls = [estimator for _label, estimator in _named_models]
names = [label for label, _estimator in _named_models]

In [194]:
# Single run of the SVM (index 3 in list_of_cls) on the features without the
# AGT_AGTR2 column, with the threshold chosen on the training split.
use_classificator(
    list_of_cls[3],
    X_train.drop(columns='AGT_AGTR2'),
    X_test.drop(columns='AGT_AGTR2'),
    y_train,
    y_test,
    choose_th=True,
)

Treseshold: 0.4798
ROC-AUC score TRAIN: 0.8
ROC-AUC score TEST: 0.8
Accuracy score: 0.76
F1 score: 0.84
Precision score: 0.76
Recall score: 0.95


(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=5, gamma='auto_deprecated',
     kernel='poly', max_iter=-1, probability=True, random_state=42,
     shrinking=True, tol=0.001, verbose=False),
 'ROC-AUC score TRAIN: 0.8\nROC-AUC score TEST: 0.8\nAccuracy score: 0.76\nF1 score: 0.84\nPrecision score: 0.76\nRecall score: 0.95')

In [200]:
# Feature matrices without the AGT_AGTR2 column, built once and reused for
# every model instead of being re-created on each iteration.
X_train_no_gen = X_train.drop(columns='AGT_AGTR2')
X_test_no_gen = X_test.drop(columns='AGT_AGTR2')

table = []
for name, cls in zip(names, list_of_cls):
    # Every call refits the estimator and picks its own threshold on the
    # training split; only the metric summary string is kept.
    _, res_no_gen = use_classificator(
        cls, X_train_no_gen, X_test_no_gen, y_train, y_test,
        choose_th=True, print_info=False)
    _, res_with_gen = use_classificator(
        cls, X_train, X_test, y_train, y_test,
        choose_th=True, print_info=False)
    table.append([name, res_no_gen, res_with_gen])

print(tabulate(table, headers=['', 'No Genetic', 'With Genetic'], tablefmt='fancy_grid'))

╒═════════════════════╤═══════════════════════════╤═══════════════════════════╕
│                     │ No Genetic                │ With Genetic              │
╞═════════════════════╪═══════════════════════════╪═══════════════════════════╡
│ Logistic Regression │ ROC-AUC score TRAIN: 0.79 │ ROC-AUC score TRAIN: 0.79 │
│                     │ ROC-AUC score TEST: 0.78  │ ROC-AUC score TEST: 0.78  │
│                     │ Accuracy score: 0.74      │ Accuracy score: 0.73      │
│                     │ F1 score: 0.84            │ F1 score: 0.83            │
│                     │ Precision score: 0.73     │ Precision score: 0.72     │
│                     │ Recall score: 0.98        │ Recall score: 0.98        │
├─────────────────────┼───────────────────────────┼───────────────────────────┤
│ Decision Tree       │ ROC-AUC score TRAIN: 0.79 │ ROC-AUC score TRAIN: 0.79 │
│                     │ ROC-AUC score TEST: 0.77  │ ROC-AUC score TEST: 0.76  │
│                     │ Accuracy score: 