In [33]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, ElasticNet, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import metrics
from sklearn.model_selection import StratifiedShuffleSplit, KFold
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings(action="ignore")

In [2]:
data = pd.read_excel('data_2_samples_NEW.xlsx', index_col=0)

In [3]:
data.head()

Unnamed: 0,gender,age,bmi,ao,activity,male_heredity,smoking,AGT_AGTR2,stage_agr,sample
0,0,0,0,0,0,1,0,0,1,0
1,1,1,0,0,1,0,1,0,1,0
2,1,1,0,1,1,0,1,0,1,0
3,1,0,1,1,1,0,0,1,1,0
4,0,1,1,1,0,0,0,0,1,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 788 entries, 0 to 787
Data columns (total 10 columns):
gender           788 non-null int64
age              788 non-null int64
bmi              788 non-null int64
ao               788 non-null int64
activity         788 non-null int64
male_heredity    788 non-null int64
smoking          788 non-null int64
AGT_AGTR2        788 non-null int64
stage_agr        788 non-null int64
sample           788 non-null int64
dtypes: int64(10)
memory usage: 67.7 KB


In [107]:
X = data.drop('stage_agr', axis=1)
y = data['stage_agr']

In [108]:
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
X_sample_0 = data.query('sample == 0').drop(['stage_agr', 'sample'], axis=1)
y_sample_0 = data.query('sample == 0')['stage_agr']

X_sample_1 = data.query('sample == 1').drop(['stage_agr', 'sample'], axis=1)
y_sample_1 = data.query('sample == 1')['stage_agr']

In [96]:
(X_sample_0_train, X_sample_0_test,
 y_sample_0_train, y_sample_0_test) = train_test_split(X_sample_0, y_sample_0, 
                                                       test_size=0.33, stratify=y_sample_0,
                                                       random_state=42)

In [97]:
def print_metrics(y_true, y_pred):
    print(f'F1: {metrics.f1_score(y_true, y_pred):.6}')
    print(f'Accuracy: {metrics.accuracy_score(y_true, y_pred):.6}')
    print(f'ROC_AUC: {metrics.roc_auc_score(y_true, y_pred):.6}')
    print(f'Precision: {metrics.precision_score(y_true, y_pred):.6}')
    print(f'Recall: {metrics.recall_score(y_true, y_pred):.6}')

In [98]:
def use_model(model, X_train, y_train, X_test, y_test, names):
    model.fit(X_train, y_train)
    predictions = []
    i = 0
    for i in range(len(y_test)):
        print(names[i], ':')
        pred_i = model.predict(X_test[i])
        predictions.append(pred_i)
        print_metrics(y_test[i], pred_i)
        print()
    return predictions

# Logistic regression

In [103]:
X_sample_0

Unnamed: 0,gender,age,bmi,ao,activity,male_heredity,smoking,AGT_AGTR2
0,0,0,0,0,0,1,0,0
1,1,1,0,0,1,0,1,0
2,1,1,0,1,1,0,1,0
3,1,0,1,1,1,0,0,1
4,0,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...
584,1,1,1,1,0,1,0,0
585,0,1,0,1,0,0,0,0
586,1,1,1,1,0,0,1,0
587,0,1,1,1,1,0,0,0


In [99]:
lg_reg = LogisticRegression()

In [100]:
lg_reg.fit(X_sample_0_train, y_sample_0_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [101]:
score = metrics.roc_auc_score(y_sample_0_train, lg_reg.predict_proba(X_sample_0_train)[:, 1])
print(f'ROC_train: {score:.2}')

ROC_train: 0.8


In [102]:
score = metrics.roc_auc_score(y_sample_0_test, lg_reg.predict_proba(X_sample_0_test)[:, 1])
print(f'ROC_train: {score:.2}')

ROC_train: 0.73


In [109]:
lg_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [114]:
score = metrics.roc_auc_score(y_test, lg_reg.predict_log_proba(X_test)[:, 1])
print(f'ROC_train: {score:.2}')

ROC_train: 0.73


# Logistic regression using split

In [192]:
(X_sample_0_train, X_sample_0_test,
 y_sample_0_train, y_sample_0_test) = train_test_split(X_sample_0, y_sample_0, test_size=0.3, random_state=42)

In [193]:
lg_reg_split = LogisticRegression(penalty='l1')

In [259]:
pred_lg_reg_split = use_model(lg_reg_split, X_sample_0_train, 
                              y_sample_0_train, [X_sample_0_test, X_sample_1], 
                              [y_sample_0_test, y_sample_1], ['Test', 'Sample 1'])

Test :
F1: 0.819923
Accuracy: 0.734463
ROC_AUC: 0.652663
Precision: 0.804511
Recall: 0.835938

Sample 1 :
F1: 0.832787
Accuracy: 0.743719
ROC_AUC: 0.631389
Precision: 0.77439
Recall: 0.900709



In [195]:
pd.crosstab(y_sample_1, pred_lg_reg_split[1])

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,21,37
1,14,127


# Logistic regression split by age

In [199]:
X_sample_0_bef_45 = X_sample_0.query('age == 0').drop('age', axis=1)
X_sample_0_aft_45 = X_sample_0.query('age == 1').drop('age', axis=1)
y_sample_0_bef_45 = y_sample_0[X_sample_0['age'] == 0]
y_sample_0_aft_45 = y_sample_0[X_sample_0['age'] == 1]

X_sample_1_bef_45 = X_sample_1.query('age == 0').drop('age', axis=1)
X_sample_1_aft_45 = X_sample_1.query('age == 1').drop('age', axis=1)
y_sample_1_bef_45 = y_sample_1[X_sample_1['age'] == 0]
y_sample_1_aft_45 = y_sample_1[X_sample_1['age'] == 1]

In [200]:
y_sample_0_aft_45.value_counts()

1    263
0     88
Name: stage_agr, dtype: int64

In [201]:
lg_reg_age_bef_45 = LogisticRegression(penalty='l1')
lg_reg_age_aft_45 = LogisticRegression(penalty='l1')

In [205]:
pred_lg_reg_bef_45 = use_model(lg_reg_age_bef_45, X_sample_0_bef_45, 
                               y_sample_0_bef_45, [X_sample_1_bef_45], 
                               [y_sample_1_bef_45], ['Sample 1 before 45'])
pred_lg_reg_aft_45 = use_model(lg_reg_age_aft_45, X_sample_0_aft_45, 
                               y_sample_0_aft_45, [X_sample_1_aft_45], 
                               [y_sample_1_aft_45], ['Sample 1 after 45'])

Sample 1 before 45 :
F1: 0.622222
Accuracy: 0.645833
ROC_AUC: 0.645105
Precision: 0.608696
Recall: 0.636364

Sample 1 after 45 :
F1: 0.859316
Accuracy: 0.754967
ROC_AUC: 0.490415
Precision: 0.784722
Recall: 0.94958



In [206]:
pd.crosstab(y_sample_1_bef_45, pred_lg_reg_bef_45[0])

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,17,9
1,8,14


In [207]:
pd.crosstab(y_sample_1_aft_45, pred_lg_reg_aft_45[0])

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,31
1,6,113


In [208]:
lg_reg_age_bef_45.coef_

array([[0.76631575, 0.84778304, 0.91811107, 0.26428646, 0.89791636,
        0.22931327, 0.01463963]])

In [209]:
lg_reg_age_aft_45.coef_

array([[7.61556515e-04, 1.11172547e+00, 7.71911722e-01, 8.62590637e-01,
        9.01308633e-01, 0.00000000e+00, 5.27604447e-01]])

In [210]:
X_sample_0_aft_45.shape[0], X_sample_0_bef_45.shape[0]

(351, 238)

In [211]:
pd.crosstab(y_sample_0_bef_45, X_sample_0_bef_45.gender)

gender,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,56,53
1,34,95


In [212]:
pd.crosstab(y_sample_0_aft_45, X_sample_0_aft_45.gender)

gender,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,46,42
1,148,115


# Logistic regression CV

In [213]:
def predict_lg_reg_cv(X_test, lg_regs, thd=0.5):
    prediction = np.zeros((1, X_test.shape[0]))
    for idx, lg_reg in enumerate(lg_regs):
        prediction += lg_reg.predict_proba(X_test)[:, 1]
    prediction = prediction[0] / len(lg_regs)
    return list(map(lambda x: 1 if x > thd else 0, prediction.T))

In [225]:
cv_sss = StratifiedShuffleSplit(n_splits=6)

In [226]:
lg_reg_cv = []
for train_indices, test_indices in cv_sss.split(X_sample_0, y_sample_0):
    current_lg_reg = LogisticRegression(penalty='l1')
    current_lg_reg.fit(X_sample_0.loc[train_indices], y_sample_0.loc[train_indices])
    # print_metrics(y_sample_0.loc[test_indices], current_lg_reg.predict(X_sample_0.loc[test_indices]))
    # print()
    lg_reg_cv.append(current_lg_reg)

In [227]:
res_sample_1_cv = np.array(predict_lg_reg_cv(X_sample_1, lg_reg_cv))

In [228]:
print_metrics(y_sample_0, np.array(predict_lg_reg_cv(X_sample_0, lg_reg_cv)))

F1: 0.811456
Accuracy: 0.731749
ROC_AUC: 0.664638
Precision: 0.762332
Recall: 0.867347


In [229]:
print_metrics(y_sample_1, res_sample_1_cv)

F1: 0.828479
Accuracy: 0.733668
ROC_AUC: 0.609073
Precision: 0.761905
Recall: 0.907801


In [230]:
pd.crosstab(y_sample_1, res_sample_1_cv)

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,18,40
1,13,128


In [231]:
cv_kf = KFold(n_splits=6)

In [232]:
lg_reg_cv = []
for train_indices, test_indices in cv_kf.split(X_sample_0, y_sample_0):
    current_lg_reg = LogisticRegression(penalty='l1')
    current_lg_reg.fit(X_sample_0.loc[train_indices], y_sample_0.loc[train_indices])
    # print_metrics(y_sample_0.loc[test_indices], current_lg_reg.predict(X_sample_0.loc[test_indices]))
    # print()
    lg_reg_cv.append(current_lg_reg)

In [233]:
res_sample_1_cv = np.array(predict_lg_reg_cv(X_sample_1, lg_reg_cv))

In [234]:
print_metrics(y_sample_1, res_sample_1_cv)

F1: 0.829582
Accuracy: 0.733668
ROC_AUC: 0.603999
Precision: 0.758824
Recall: 0.914894


In [235]:
pd.crosstab(y_sample_1, res_sample_1_cv)

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,17,41
1,12,129


# Kmeans 2 groups

In [244]:
kmeans_2 = KMeans(n_clusters=2)

In [245]:
kmeans_2.fit(X_sample_0_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [246]:
X_sample_0_clust_2_train = X_sample_0_train.copy()
X_sample_0_clust_2_train['clust'] = kmeans_2.predict(X_sample_0_train)

X_sample_0_clust_2_test = X_sample_0_test.copy()
X_sample_0_clust_2_test['clust'] = kmeans_2.predict(X_sample_0_test)

X_sample_1_clust_2 = X_sample_1.copy()
X_sample_1_clust_2['clust'] = kmeans_2.predict(X_sample_1)

In [264]:
lg_reg_clust_2 = LogisticRegression(penalty='l2')

In [265]:
pred_lg_reg_clust_2 = use_model(lg_reg_clust_2, X_sample_0_clust_2_train, 
                                y_sample_0_train, [X_sample_0_clust_2_test, X_sample_1_clust_2], 
                                [y_sample_0_test, y_sample_1], ['Test', 'Sample 1'])

Test :
F1: 0.816794
Accuracy: 0.728814
ROC_AUC: 0.642459
Precision: 0.798507
Recall: 0.835938

Sample 1 :
F1: 0.827362
Accuracy: 0.733668
ROC_AUC: 0.614148
Precision: 0.76506
Recall: 0.900709



In [266]:
pd.crosstab(y_sample_1, pred_lg_reg_clust_2[1])

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19,39
1,14,127


In [267]:
lg_reg_clust_2.coef_

array([[0.36859164, 0.83554241, 1.0026496 , 0.81551239, 0.55935675,
        1.18308007, 0.24788388, 0.35586855, 0.20377327]])

In [268]:
lg_reg_clust_2 = LogisticRegression(penalty='l1')

In [269]:
pred_lg_reg_clust_2 = use_model(lg_reg_clust_2, X_sample_0_clust_2_train, 
                                y_sample_0_train, [X_sample_0_clust_2_test, X_sample_1_clust_2], 
                                [y_sample_0_test, y_sample_1], ['Test', 'Sample 1'])

Test :
F1: 0.819923
Accuracy: 0.734463
ROC_AUC: 0.652663
Precision: 0.804511
Recall: 0.835938

Sample 1 :
F1: 0.832787
Accuracy: 0.743719
ROC_AUC: 0.631389
Precision: 0.77439
Recall: 0.900709



Simple log reg:

    Test :
    F1: 0.819923
    Accuracy: 0.734463
    ROC_AUC: 0.652663
    Precision: 0.804511
    Recall: 0.835938

    Sample 1 :
    F1: 0.832787
    Accuracy: 0.743719
    ROC_AUC: 0.631389
    Precision: 0.77439
    Recall: 0.900709

In [270]:
pd.crosstab(y_sample_1, pred_lg_reg_clust_2[1])

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,21,37
1,14,127


In [263]:
lg_reg_clust_2.coef_

array([[0.34217263, 0.83437891, 1.039574  , 0.97504309, 0.54123989,
        1.20922853, 0.21585332, 0.30612167, 0.        ]])

# Kmeans 3 groups

In [271]:
kmeans_3 = KMeans(n_clusters=3)

In [272]:
kmeans_3.fit(X_sample_0_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [276]:
X_sample_0_clust_3_train = X_sample_0_train.copy()
X_sample_0_clust_3_train['clust'] = kmeans_3.predict(X_sample_0_train)

X_sample_0_clust_3_test = X_sample_0_test.copy()
X_sample_0_clust_3_test['clust'] = kmeans_3.predict(X_sample_0_test)

X_sample_1_clust_3 = X_sample_1.copy()
X_sample_1_clust_3['clust'] = kmeans_3.predict(X_sample_1)

In [277]:
lg_reg_clust_3 = LogisticRegression(penalty='l1')

In [278]:
pred_lg_reg_clust_3 = use_model(lg_reg_clust_3, X_sample_0_clust_3_train, 
                                y_sample_0_train, [X_sample_0_clust_3_test, X_sample_1_clust_3], 
                                [y_sample_0_test, y_sample_1], ['Test', 'Sample 1'])

Test :
F1: 0.820312
Accuracy: 0.740113
ROC_AUC: 0.675462
Precision: 0.820312
Recall: 0.820312

Sample 1 :
F1: 0.835088
Accuracy: 0.763819
ROC_AUC: 0.706469
Precision: 0.826389
Recall: 0.843972



In [279]:
pd.crosstab(y_sample_1, pred_lg_reg_clust_3[1])

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,33,25
1,22,119


In [280]:
lg_reg_clust_3.coef_

array([[0.29248212, 0.66207062, 0.95822034, 0.31362171, 0.47005695,
        1.26668214, 0.00855735, 0.30210231, 0.58697628]])

# Kmeans 4 groups

In [281]:
kmeans_4 = KMeans(n_clusters=4)

In [282]:
kmeans_4.fit(X_sample_0_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [286]:
X_sample_0_clust_4_train = X_sample_0_train.copy()
X_sample_0_clust_4_train['clust'] = kmeans_4.predict(X_sample_0_train)

X_sample_0_clust_4_test = X_sample_0_test.copy()
X_sample_0_clust_4_test['clust'] = kmeans_4.predict(X_sample_0_test)

X_sample_1_clust_4 = X_sample_1.copy()
X_sample_1_clust_4['clust'] = kmeans_4.predict(X_sample_1)

In [287]:
lg_reg_clust_4 = LogisticRegression(penalty='l1')

In [288]:
pred_lg_reg_clust_4 = use_model(lg_reg_clust_4, X_sample_0_clust_4_train, 
                                y_sample_0_train, [X_sample_0_clust_4_test, X_sample_1_clust_4], 
                                [y_sample_0_test, y_sample_1], ['Test', 'Sample 1'])

Test :
F1: 0.816794
Accuracy: 0.728814
ROC_AUC: 0.642459
Precision: 0.798507
Recall: 0.835938

Sample 1 :
F1: 0.827362
Accuracy: 0.733668
ROC_AUC: 0.614148
Precision: 0.76506
Recall: 0.900709



In [289]:
pd.crosstab(y_sample_1, pred_lg_reg_clust_4[1])

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19,39
1,14,127


In [290]:
lg_reg_clust_4.coef_

array([[ 0.30960363,  1.26403439,  0.87126777,  0.82698077,  0.55119423,
         1.23692987,  0.13161055,  0.30607753, -0.2647843 ]])

# Decision tree

In [140]:
dt = DecisionTreeClassifier(max_depth=5)

In [291]:
pred_dt = use_model(dt, X_sample_0_train, y_sample_0_train,
                   [X_sample_0_test, X_sample_1], 
                   [y_sample_0_test, y_sample_1], ['Test', 'Sample 1'])

Test :
F1: 0.832685
Accuracy: 0.757062
ROC_AUC: 0.693479
Precision: 0.829457
Recall: 0.835938

Sample 1 :
F1: 0.83612
Accuracy: 0.753769
ROC_AUC: 0.65878
Precision: 0.791139
Recall: 0.886525



In [293]:
pd.crosstab(y_sample_1, pred_dt[1])

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25,33
1,16,125


# Random Forest

In [341]:
rf = RandomForestClassifier(n_estimators=4)

In [342]:
pred_rf = use_model(rf, X_sample_0_train, y_sample_0_train,
                   [X_sample_0_test, X_sample_1], 
                   [y_sample_0_test, y_sample_1], ['Test', 'Sample 1'])

Test :
F1: 0.759184
Accuracy: 0.666667
ROC_AUC: 0.618383
Precision: 0.794872
Recall: 0.726562

Sample 1 :
F1: 0.83274
Accuracy: 0.763819
ROC_AUC: 0.716618
Precision: 0.835714
Recall: 0.829787



In [343]:
pd.crosstab(y_sample_1, pred_rf[1])

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,35,23
1,24,117


# Extra Trees

In [350]:
et = ExtraTreesClassifier(n_estimators=7)

In [351]:
pred_et = use_model(et, X_sample_0_train, y_sample_0_train,
                   [X_sample_0_test, X_sample_1], 
                   [y_sample_0_test, y_sample_1], ['Test', 'Sample 1'])

Test :
F1: 0.734177
Accuracy: 0.644068
ROC_AUC: 0.615354
Precision: 0.798165
Recall: 0.679688

Sample 1 :
F1: 0.808824
Accuracy: 0.738693
ROC_AUC: 0.709036
Precision: 0.839695
Recall: 0.780142



In [352]:
pd.crosstab(y_sample_1, pred_et[1])

col_0,0,1
stage_agr,Unnamed: 1_level_1,Unnamed: 2_level_1
0,37,21
1,31,110
