In [1]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### 0〜5列目は
id feat_13 feat_14 feat_15 feat_16 feat_18 

In [2]:
# Read the whitespace-delimited numeric matrix from disk; `data` is reused by later cells.
data = np.loadtxt(fname='../emb/n2v_X_y_concentration.txt')
# Wrap the array in a DataFrame purely for a readable preview of the first rows.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,124,125,126,127,128,129,130,131,132,133
0,1.0,0.0,0.0,0.0,0.0,0.0,-0.287548,-0.001427,-0.12009,-0.377493,...,0.080572,-0.088503,0.218645,0.172041,-0.322487,-0.420138,0.139752,0.382551,0.28242,-0.358664
1,2.0,0.0,0.0,0.0,0.0,0.0,0.040874,0.08287,0.134158,-0.353407,...,-0.258742,-0.127682,0.067486,0.388841,0.011226,-1.004405,-0.082768,0.401341,0.262858,-0.345512
2,3.0,0.0,1.0,0.0,0.0,0.0,-0.093957,0.159373,-0.01174,-0.490959,...,0.175396,-0.007743,0.130623,0.168703,-0.111257,-0.672268,0.048327,0.349718,0.120555,-0.387086
3,4.0,0.0,0.0,0.0,0.0,0.0,-0.184001,0.415718,0.209469,-0.261934,...,-0.00377,-0.452354,0.290476,0.146022,-0.368995,-0.654642,-0.465005,0.406181,0.359105,-0.20104
4,5.0,0.0,0.0,0.0,0.0,0.0,-0.288642,0.055733,-0.247007,-0.464414,...,0.21568,-0.109553,0.078936,0.13644,-0.227735,-0.460104,-0.042816,0.286819,0.26658,-0.418679


In [3]:
# Everything from column 6 onward is used as the model input matrix.
X = data[:, 6:]
# Columns 1-5 are the five label columns; store each as a flat int64 vector.
y_lst = [data[:, label_col].astype(np.int64) for label_col in range(1, 6)]

In [4]:
feat_lst = ["feat_13", "feat_14", "feat_15", "feat_16", "feat_18"]
# Collected only to display the class breakdown of each label column.
y_disp_lst = []
for column, feat_name in enumerate(feat_lst, start=1):
    # Label columns sit at positions 1-5 of `data`, in the same order as feat_lst.
    labels = data[:, column].astype(np.int64)
    y_disp_lst.append(labels)
    print("{}".format(feat_name))
    print("label 0 {}".format(np.count_nonzero(labels == 0)))
    print("label 1 {}\n".format(np.count_nonzero(labels == 1)))

feat_13
label 0 1082
label 1 30

feat_14
label 0 983
label 1 129

feat_15
label 0 1102
label 1 10

feat_16
label 0 1094
label 1 18

feat_18
label 0 1107
label 1 5



### feat13

In [5]:
# Hold out 20% of the rows for the feat_13 label (y_lst[0]) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_lst[0], test_size=0.2, random_state=0)
ros = RandomOverSampler(random_state=0)
# fit_resample is the supported name; fit_sample was a deprecated alias (imbalanced-learn 0.4)
# and is no longer available in current releases.
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
# NOTE(review): the held-out split is oversampled as well, so the metrics printed later are
# computed on duplicated minority rows rather than the original class ratio — confirm intended.
X_test_ros, y_test_ros = ros.fit_resample(X_test, y_test)

In [6]:
# Hyper-parameter grid for the XGBoost search: 4 learning rates x 8 depths x 5 ensemble sizes.
param_xgb = [
    {
        "learning_rate": [0.05, 0.10, 0.15, 0.20],
        "max_depth": list(range(3, 11)),
        "n_estimators": [100, 250, 500, 750, 1000],
    }
]

In [7]:
# `random_state` is the supported seeding argument of the scikit-learn wrapper;
# `seed` is its deprecated alias.
xgb = XGBClassifier(random_state=0)

# Exhaustive search over param_xgb with 10-fold CV, using all available cores.
# NOTE(review): CV runs on data that was oversampled beforehand, so copies of the same
# minority row can appear in both the fitting and validation folds — CV scores may be optimistic.
clf_xgb_ros = GridSearchCV(xgb, param_xgb, cv=10, n_jobs=-1, verbose=10)
clf_xgb_ros.fit(X_train_ros, y_train_ros)
# predict() uses the best parameter combination refit on the full training data.
y_pred_ros_xgb = clf_xgb_ros.predict(X_test_ros)

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   43.8s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   55.1s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2

In [8]:
# Score the tuned XGBoost predictions against the oversampled test labels.
xgb_accuracy = accuracy_score(y_test_ros, y_pred_ros_xgb)
xgb_report = classification_report(y_test_ros, y_pred_ros_xgb)
xgb_confusion = confusion_matrix(y_test_ros, y_pred_ros_xgb)

print("accuracy xgb over sampled {:.4f}".format(xgb_accuracy))
print("{}".format(xgb_report))
print("confusion matrix xgb over sampled\n{}".format(xgb_confusion))

accuracy xgb over sampled 0.5000
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       217
           1       0.00      0.00      0.00       217

   micro avg       0.50      0.50      0.50       434
   macro avg       0.25      0.50      0.33       434
weighted avg       0.25      0.50      0.33       434

confusion matrix xgb over sampled
[[217   0]
 [217   0]]


  'precision', 'predicted', average, warn_for)


In [9]:
# The search grid for this estimator includes the "l1" penalty, which the lbfgs solver
# (the default from scikit-learn 0.22 onward) does not support. Pinning liblinear — the
# previous default — keeps both "l1" and "l2" candidates valid on old and new versions.
lr = LogisticRegression(random_state=0, solver="liblinear")

In [10]:
# Logistic-regression grid: both penalties crossed with C = 10^-3 ... 10^3.
param_lr = [
    {
        "penalty": ["l2", "l1"],
        "C": [10 ** exponent for exponent in range(-3, 4)],
    }
]

In [11]:
# Tune logistic regression: exhaustive search over param_lr with 10-fold CV on all cores.
clf_lr_ros = GridSearchCV(
    estimator=lr,
    param_grid=param_lr,
    cv=10,
    n_jobs=-1,
    verbose=10,
).fit(X_train_ros, y_train_ros)
# Predict on the oversampled held-out rows with the refit best model.
y_pred_ros_lr = clf_lr_ros.predict(X_test_ros)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1510s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1558s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 117 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 125 out of 140 | elapsed:  2.2min remaining:   16.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  3.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  3.3min finished


In [12]:
# Score the tuned logistic-regression predictions against the oversampled test labels.
lr_accuracy = accuracy_score(y_test_ros, y_pred_ros_lr)
lr_report = classification_report(y_test_ros, y_pred_ros_lr)
lr_confusion = confusion_matrix(y_test_ros, y_pred_ros_lr)

print("accuracy LR over sampled {:.4f}".format(lr_accuracy))
print("{}".format(lr_report))
print("confusion matrix LR over sampled \n{}".format(lr_confusion))

accuracy LR over sampled 0.5507
              precision    recall  f1-score   support

           0       0.53      0.96      0.68       217
           1       0.79      0.14      0.24       217

   micro avg       0.55      0.55      0.55       434
   macro avg       0.66      0.55      0.46       434
weighted avg       0.66      0.55      0.46       434

confusion matrix LR over sampled 
[[209   8]
 [187  30]]


### feat14

In [13]:
# Hold out 20% of the rows for the feat_14 label (y_lst[1]) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_lst[1], test_size=0.2, random_state=0)
ros = RandomOverSampler(random_state=0)
# fit_resample is the supported name; fit_sample was a deprecated alias (imbalanced-learn 0.4)
# and is no longer available in current releases.
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
# NOTE(review): the held-out split is oversampled as well, so the metrics printed later are
# computed on duplicated minority rows rather than the original class ratio — confirm intended.
X_test_ros, y_test_ros = ros.fit_resample(X_test, y_test)

In [14]:
# Tune XGBoost for this label: exhaustive search over param_xgb with 10-fold CV on all cores.
clf_xgb_ros = GridSearchCV(
    estimator=xgb,
    param_grid=param_xgb,
    cv=10,
    n_jobs=-1,
    verbose=10,
).fit(X_train_ros, y_train_ros)
# Predict on the oversampled held-out rows with the refit best model.
y_pred_ros_xgb = clf_xgb_ros.predict(X_test_ros)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   42.9s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:  3.5min
[Paralle

In [15]:
# Score the tuned XGBoost predictions against the oversampled test labels.
xgb_accuracy = accuracy_score(y_test_ros, y_pred_ros_xgb)
xgb_report = classification_report(y_test_ros, y_pred_ros_xgb)
xgb_confusion = confusion_matrix(y_test_ros, y_pred_ros_xgb)

print("accuracy xgb over sampled {:.4f}".format(xgb_accuracy))
print("{}".format(xgb_report))
print("confusion matrix xgb over sampled\n{}".format(xgb_confusion))

accuracy xgb over sampled 0.5871
              precision    recall  f1-score   support

           0       0.55      0.96      0.70       201
           1       0.83      0.22      0.35       201

   micro avg       0.59      0.59      0.59       402
   macro avg       0.69      0.59      0.52       402
weighted avg       0.69      0.59      0.52       402

confusion matrix xgb over sampled
[[192   9]
 [157  44]]


In [16]:
# Tune logistic regression: exhaustive search over param_lr with 10-fold CV on all cores.
clf_lr_ros = GridSearchCV(
    estimator=lr,
    param_grid=param_lr,
    cv=10,
    n_jobs=-1,
    verbose=10,
).fit(X_train_ros, y_train_ros)
# Predict on the oversampled held-out rows with the refit best model.
y_pred_ros_lr = clf_lr_ros.predict(X_test_ros)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0420s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  80 out of 140 | elapsed:    3.5s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  3.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  3.9min finished


In [17]:
# Score the tuned logistic-regression predictions against the oversampled test labels.
lr_accuracy = accuracy_score(y_test_ros, y_pred_ros_lr)
lr_report = classification_report(y_test_ros, y_pred_ros_lr)
lr_confusion = confusion_matrix(y_test_ros, y_pred_ros_lr)

print("accuracy LR over sampled {:.4f}".format(lr_accuracy))
print("{}".format(lr_report))
print("confusion matrix LR over sampled \n{}".format(lr_confusion))

accuracy LR over sampled 0.6418
              precision    recall  f1-score   support

           0       0.61      0.81      0.69       201
           1       0.71      0.48      0.57       201

   micro avg       0.64      0.64      0.64       402
   macro avg       0.66      0.64      0.63       402
weighted avg       0.66      0.64      0.63       402

confusion matrix LR over sampled 
[[162  39]
 [105  96]]


### feat15

In [18]:
# Hold out 20% of the rows for the feat_15 label (y_lst[2]) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_lst[2], test_size=0.2, random_state=0)
# Named `ros` like the sibling cells: this is a RandomOverSampler, not an under-sampler.
ros = RandomOverSampler(random_state=0)
# fit_resample is the supported name; fit_sample was a deprecated alias (imbalanced-learn 0.4)
# and is no longer available in current releases.
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
# NOTE(review): the held-out split is oversampled as well, so the metrics printed later are
# computed on duplicated minority rows rather than the original class ratio — confirm intended.
X_test_ros, y_test_ros = ros.fit_resample(X_test, y_test)

In [19]:
# Tune XGBoost for this label: exhaustive search over param_xgb with 10-fold CV on all cores.
clf_xgb_ros = GridSearchCV(
    estimator=xgb,
    param_grid=param_xgb,
    cv=10,
    n_jobs=-1,
    verbose=10,
).fit(X_train_ros, y_train_ros)
# Predict on the oversampled held-out rows with the refit best model.
y_pred_ros_xgb = clf_xgb_ros.predict(X_test_ros)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   59.5s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:  1.4min
[Paralle

In [20]:
# Score the tuned XGBoost predictions against the oversampled test labels.
xgb_accuracy = accuracy_score(y_test_ros, y_pred_ros_xgb)
xgb_report = classification_report(y_test_ros, y_pred_ros_xgb)
xgb_confusion = confusion_matrix(y_test_ros, y_pred_ros_xgb)

print("accuracy xgb over sampled {:.4f}".format(xgb_accuracy))
print("{}".format(xgb_report))
print("confusion matrix xgb over sampled\n{}".format(xgb_confusion))

accuracy xgb over sampled 0.5000
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       219
           1       0.00      0.00      0.00       219

   micro avg       0.50      0.50      0.50       438
   macro avg       0.25      0.50      0.33       438
weighted avg       0.25      0.50      0.33       438

confusion matrix xgb over sampled
[[219   0]
 [219   0]]


  'precision', 'predicted', average, warn_for)


In [21]:
# Tune logistic regression: exhaustive search over param_lr with 10-fold CV on all cores.
clf_lr_ros = GridSearchCV(
    estimator=lr,
    param_grid=param_lr,
    cv=10,
    n_jobs=-1,
    verbose=10,
).fit(X_train_ros, y_train_ros)
# Predict on the oversampled held-out rows with the refit best model.
y_pred_ros_lr = clf_lr_ros.predict(X_test_ros)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0518s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 110 out of 140 | elapsed:   12.4s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:   16.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:   16.8s finished


In [22]:
# Score the tuned logistic-regression predictions against the oversampled test labels.
lr_accuracy = accuracy_score(y_test_ros, y_pred_ros_lr)
lr_report = classification_report(y_test_ros, y_pred_ros_lr)
lr_confusion = confusion_matrix(y_test_ros, y_pred_ros_lr)

print("accuracy LR over sampled {:.4f}".format(lr_accuracy))
print("{}".format(lr_report))
print("confusion matrix LR over sampled \n{}".format(lr_confusion))

accuracy LR over sampled 0.4932
              precision    recall  f1-score   support

           0       0.50      0.99      0.66       219
           1       0.00      0.00      0.00       219

   micro avg       0.49      0.49      0.49       438
   macro avg       0.25      0.49      0.33       438
weighted avg       0.25      0.49      0.33       438

confusion matrix LR over sampled 
[[216   3]
 [219   0]]


### feat16

In [23]:
# Hold out 20% of the rows for the feat_16 label (y_lst[3]) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_lst[3], test_size=0.2, random_state=0)
ros = RandomOverSampler(random_state=0)
# fit_resample is the supported name; fit_sample was a deprecated alias (imbalanced-learn 0.4)
# and is no longer available in current releases.
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
# NOTE(review): the held-out split is oversampled as well, so the metrics printed later are
# computed on duplicated minority rows rather than the original class ratio — confirm intended.
X_test_ros, y_test_ros = ros.fit_resample(X_test, y_test)

In [24]:
# Tune XGBoost for this label: exhaustive search over param_xgb with 10-fold CV on all cores.
clf_xgb_ros = GridSearchCV(
    estimator=xgb,
    param_grid=param_xgb,
    cv=10,
    n_jobs=-1,
    verbose=10,
).fit(X_train_ros, y_train_ros)
# Predict on the oversampled held-out rows with the refit best model.
y_pred_ros_xgb = clf_xgb_ros.predict(X_test_ros)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   55.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:  1.9min
[Paralle

In [25]:
# Score the tuned XGBoost predictions against the oversampled test labels.
xgb_accuracy = accuracy_score(y_test_ros, y_pred_ros_xgb)
xgb_report = classification_report(y_test_ros, y_pred_ros_xgb)
xgb_confusion = confusion_matrix(y_test_ros, y_pred_ros_xgb)

print("accuracy xgb over sampled {:.4f}".format(xgb_accuracy))
print("{}".format(xgb_report))
print("confusion matrix xgb over sampled\n{}".format(xgb_confusion))

accuracy xgb over sampled 0.4908
              precision    recall  f1-score   support

           0       0.50      0.98      0.66       217
           1       0.00      0.00      0.00       217

   micro avg       0.49      0.49      0.49       434
   macro avg       0.25      0.49      0.33       434
weighted avg       0.25      0.49      0.33       434

confusion matrix xgb over sampled
[[213   4]
 [217   0]]


In [26]:
# Tune logistic regression: exhaustive search over param_lr with 10-fold CV on all cores.
clf_lr_ros = GridSearchCV(
    estimator=lr,
    param_grid=param_lr,
    cv=10,
    n_jobs=-1,
    verbose=10,
).fit(X_train_ros, y_train_ros)
# Predict on the oversampled held-out rows with the refit best model.
y_pred_ros_lr = clf_lr_ros.predict(X_test_ros)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0677s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  1.7min finished


In [27]:
# Score the tuned logistic-regression predictions against the oversampled test labels.
lr_accuracy = accuracy_score(y_test_ros, y_pred_ros_lr)
lr_report = classification_report(y_test_ros, y_pred_ros_lr)
lr_confusion = confusion_matrix(y_test_ros, y_pred_ros_lr)

print("accuracy LR over sampled {:.4f}".format(lr_accuracy))
print("{}".format(lr_report))
print("confusion matrix LR over sampled \n{}".format(lr_confusion))

accuracy LR over sampled 0.5253
              precision    recall  f1-score   support

           0       0.51      0.91      0.66       217
           1       0.61      0.14      0.23       217

   micro avg       0.53      0.53      0.53       434
   macro avg       0.56      0.53      0.44       434
weighted avg       0.56      0.53      0.44       434

confusion matrix LR over sampled 
[[198  19]
 [187  30]]


### feat18

In [28]:
# Hold out 20% of the rows for the feat_18 label (y_lst[4]) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_lst[4], test_size=0.2, random_state=0)
ros = RandomOverSampler(random_state=0)
# fit_resample is the supported name; fit_sample was a deprecated alias (imbalanced-learn 0.4)
# and is no longer available in current releases.
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
# NOTE(review): the held-out split is oversampled as well, so the metrics printed later are
# computed on duplicated minority rows rather than the original class ratio — confirm intended.
X_test_ros, y_test_ros = ros.fit_resample(X_test, y_test)

In [29]:
# Tune XGBoost for this label: exhaustive search over param_xgb with 10-fold CV on all cores.
clf_xgb_ros = GridSearchCV(
    estimator=xgb,
    param_grid=param_xgb,
    cv=10,
    n_jobs=-1,
    verbose=10,
).fit(X_train_ros, y_train_ros)
# Predict on the oversampled held-out rows with the refit best model.
y_pred_ros_xgb = clf_xgb_ros.predict(X_test_ros)

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:   52.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   

In [30]:
# Score the tuned XGBoost predictions against the oversampled test labels.
xgb_accuracy = accuracy_score(y_test_ros, y_pred_ros_xgb)
xgb_report = classification_report(y_test_ros, y_pred_ros_xgb)
xgb_confusion = confusion_matrix(y_test_ros, y_pred_ros_xgb)

print("accuracy xgb over sampled {:.4f}".format(xgb_accuracy))
print("{}".format(xgb_report))
print("confusion matrix xgb over sampled\n{}".format(xgb_confusion))

accuracy xgb over sampled 0.5000
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       221
           1       0.00      0.00      0.00       221

   micro avg       0.50      0.50      0.50       442
   macro avg       0.25      0.50      0.33       442
weighted avg       0.25      0.50      0.33       442

confusion matrix xgb over sampled
[[221   0]
 [221   0]]


  'precision', 'predicted', average, warn_for)


In [31]:
# Tune logistic regression: exhaustive search over param_lr with 10-fold CV on all cores.
clf_lr_ros = GridSearchCV(
    estimator=lr,
    param_grid=param_lr,
    cv=10,
    n_jobs=-1,
    verbose=10,
).fit(X_train_ros, y_train_ros)
# Predict on the oversampled held-out rows with the refit best model.
y_pred_ros_lr = clf_lr_ros.predict(X_test_ros)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0499s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  80 out of 140 | elapsed:    2.9s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    6.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    6.5s finished


In [32]:
# Score the tuned logistic-regression predictions against the oversampled test labels.
lr_accuracy = accuracy_score(y_test_ros, y_pred_ros_lr)
lr_report = classification_report(y_test_ros, y_pred_ros_lr)
lr_confusion = confusion_matrix(y_test_ros, y_pred_ros_lr)

print("accuracy LR over sampled {:.4f}".format(lr_accuracy))
print("{}".format(lr_report))
print("confusion matrix LR over sampled \n{}".format(lr_confusion))

accuracy LR over sampled 0.4955
              precision    recall  f1-score   support

           0       0.50      0.99      0.66       221
           1       0.00      0.00      0.00       221

   micro avg       0.50      0.50      0.50       442
   macro avg       0.25      0.50      0.33       442
weighted avg       0.25      0.50      0.33       442

confusion matrix LR over sampled 
[[219   2]
 [221   0]]
