In [1]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedBaggingClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
data = np.loadtxt('../emb/n2v_X_y_edutype.txt')
df = pd.DataFrame(data)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,122,123,124,125,126,127,128,129,130,131
0,0.0,0.0,0.0,0.0,-0.06335,0.105774,0.151211,-0.390686,0.174236,-0.112463,...,0.012502,-0.273374,0.163978,-0.011858,-0.101687,-0.614817,-0.374594,0.316872,0.056657,-0.212233
1,1.0,0.0,0.0,0.0,-0.287548,-0.001427,-0.12009,-0.377493,0.215586,-0.093043,...,0.080572,-0.088503,0.218645,0.172041,-0.322487,-0.420138,0.139752,0.382551,0.28242,-0.358664
2,2.0,1.0,0.0,1.0,0.040874,0.08287,0.134158,-0.353407,0.069297,-0.076478,...,-0.258742,-0.127682,0.067486,0.388841,0.011226,-1.004405,-0.082768,0.401341,0.262858,-0.345512
3,3.0,1.0,0.0,1.0,-0.093957,0.159373,-0.01174,-0.490959,0.321027,-0.240256,...,0.175396,-0.007743,0.130623,0.168703,-0.111257,-0.672268,0.048327,0.349718,0.120555,-0.387086
4,4.0,1.0,0.0,1.0,-0.184001,0.415718,0.209469,-0.261934,0.027717,-0.241358,...,-0.00377,-0.452354,0.290476,0.146022,-0.368995,-0.654642,-0.465005,0.406181,0.359105,-0.20104


In [3]:
X = data[:, 4:]
y_lst = [np.ravel(data[:, index:index+1].astype(np.int64)) for index in range(1, 4)]

In [4]:
feat_lst = ["formal education", "informal education", "Non-formal education"]
#データの内訳表示用
y_disp_lst = []
for index, element in enumerate(feat_lst):
    y_disp = np.ravel(data[:, index+1:index+2].astype(np.int64))
    y_disp_lst.append(y_disp)
    print("{}".format(element))
    print("label 0 {}".format(X[y_disp_lst[index] == 0].shape[0]))
    print("label 1 {}\n".format(X[y_disp_lst[index] == 1].shape[0]))

formal education
label 0 1236
label 1 2806

informal education
label 0 2831
label 1 1211

Non-formal education
label 0 1499
label 1 2543



## 1. Formal education

### 学習データ，テストデータともにアンダーサンプリング

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y_lst[0], test_size=0.2, random_state=0)
rus = RandomUnderSampler(random_state=0)
X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)
X_test_rus, y_test_rus = rus.fit_sample(X_test, y_test)

In [6]:
param_xgb = [
    {"learning_rate": [0.05, 0.10, 0.15, 0.20], "max_depth":[i for i in range(3, 11)],
    "n_estimators":[100, 250, 500, 750, 1000]}
]

In [7]:
xgb = XGBClassifier(seed=0)

clf_xgb_rus = GridSearchCV(xgb, param_xgb, cv=10, n_jobs=-1,verbose=10)
clf_xgb_rus.fit(X_train_rus, y_train_rus)
y_pred_rus_xgb = clf_xgb_rus.predict(X_test_r

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   40.6s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  6

In [8]:
print("accuracy xgb under sampled {:.4f}".format(accuracy_score(y_test_rus, y_pred_rus_xgb)))
print("{}".format(classification_report(y_test_rus, y_pred_rus_xgb)))
print("confusion matrix xgb under sampled\n{}".format(confusion_matrix(y_test_rus, y_pred_rus_xgb)))

accuracy xgb under sampled 0.6280
              precision    recall  f1-score   support

           0       0.63      0.62      0.63       250
           1       0.63      0.63      0.63       250

   micro avg       0.63      0.63      0.63       500
   macro avg       0.63      0.63      0.63       500
weighted avg       0.63      0.63      0.63       500

confusion matrix xgb under sampled
[[156  94]
 [ 92 158]]


In [9]:
param_lr = [
    {"penalty": ["l1","l2"], "C":[10**i for i in range(-3, 4)]}
]

In [10]:
lr = LogisticRegression(random_state=0)

clf_lr_rus = GridSearchCV(lr, param_lr, cv=10, n_jobs=-1, verbose=10)
clf_lr_rus.fit(X_train_rus, y_train_rus)
y_pred_rus_lr = clf_lr_rus.predict(X_test_rus)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0828s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0591s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  1.8min finished


In [11]:
print("accuracy LR under sampled {:.4f}".format(accuracy_score(y_test_rus, y_pred_rus_lr)))
print("{}".format(classification_report(y_test_rus, y_pred_rus_lr)))
print("confusion matrix LR under sampled \n{}".format(confusion_matrix(y_test_rus, y_pred_rus_lr)))

accuracy LR under sampled 0.6520
              precision    recall  f1-score   support

           0       0.66      0.63      0.64       250
           1       0.65      0.67      0.66       250

   micro avg       0.65      0.65      0.65       500
   macro avg       0.65      0.65      0.65       500
weighted avg       0.65      0.65      0.65       500

confusion matrix LR under sampled 
[[158  92]
 [ 82 168]]


### 学習データをバギング+アンダーサンプリング テストデータをアンダーサンプリング 

In [12]:
param_xgb_bbc = [
    {"base_estimator__learning_rate": [0.05, 0.10, 0.15, 0.20], "base_estimator__max_depth":[i for i in range(3, 11)],
    "base_estimator__n_estimators":[100, 250, 500, 750, 1000]}
]

In [13]:
bbc_xgb = BalancedBaggingClassifier(base_estimator=xgb,n_estimators=10,replacement=False,n_jobs=-1,random_state=0,verbose=10)
clf_xgb_bbc = GridSearchCV(bbc_xgb, param_xgb_bbc, cv=10, n_jobs=-1,verbose=10)
clf_xgb_bbc.fit(X_train, y_train)
y_pred_bbc_xgb = clf_xgb_bbc.predict(X_test_rus)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   50.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 25.0min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 33.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 42.2min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 45.9min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 54.7min
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed: 64.7min
[Paralle

In [14]:
print("accuracy xgb bagging {:.4f}".format(accuracy_score(y_test_rus, y_pred_bbc_xgb)))
print("{}".format(classification_report(y_test_rus, y_pred_bbc_xgb)))
print("confusion matrix xgb bagging \n{}".format(confusion_matrix(y_test_rus, y_pred_bbc_xgb)))

accuracy xgb bagging 0.6100
              precision    recall  f1-score   support

           0       0.63      0.52      0.57       250
           1       0.59      0.70      0.64       250

   micro avg       0.61      0.61      0.61       500
   macro avg       0.61      0.61      0.61       500
weighted avg       0.61      0.61      0.61       500

confusion matrix xgb bagging 
[[130 120]
 [ 75 175]]


In [15]:
param_lr_bbc = [
    {"base_estimator__penalty": ["l2","l1"], "base_estimator__C":[10**i for i in range(-3, 4)]}
]

In [16]:
bbc_lr = BalancedBaggingClassifier(base_estimator=lr,n_estimators=100,replacement=False,n_jobs=-1,random_state=0,verbose=100)
clf_lr_bbc = GridSearchCV(bbc_lr, param_lr_bbc, cv=10, n_jobs=-1,verbose=10)
clf_lr_bbc.fit(X_train, y_train)
y_pred_bbc_lr = clf_lr_bbc.predict(X_test_rus)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   35.6s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   57.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 20.8min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 94.0min
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed: 215.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed: 215.4min finished


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
Pickling array (shape=(2,), dtype=int64).
Pickling array (shape=(100,), dtype=int64).
Memmapping (shape=(3233, 128), dtype=float64) to new file /dev/shm/joblib_memmapping_folder_24429_8859235995/24429-140205575481272-5afac13c44ba49678189bc3fa30399cf.pkl
Pickling array (shape=(3233,), dtype=int64).
Pickling array (shape=(13,), dtype=int64).
Pickling array (shape=(2,), dtype=int64).
Pickling array (shape=(100,), dtype=int64).
Memmapping (shape=(3233, 128), dtype=float64) to old file /dev/shm/joblib_memmapping_folder_24429_8859235995/24429-140205575481272-5afac13c44ba49678189bc3fa30399cf.pkl
Pickling array (shape=(3233,), dtype=int64).
Pickling array (shape=(13,), dtype=int64).
Pickling array (shape=(2,), dtype=int64).
Pickling array (shape=(100,), dtype=int64).
Memmapping (shape=(3233, 128), dtype=float64) to old file /dev/shm/joblib_memmapping_folder_24429_8859235995/24429-140205575481272-5afac13c44ba49678189bc3f

In [17]:
print("accuracy LR bagging {:.4f}".format(accuracy_score(y_test_rus, y_pred_bbc_lr)))
print("{}".format(classification_report(y_test_rus, y_pred_bbc_lr)))
print("confusion matrix LR bagging\n{}".format(confusion_matrix(y_test_rus, y_pred_bbc_lr)))

accuracy LR bagging 0.6520
              precision    recall  f1-score   support

           0       0.66      0.64      0.65       250
           1       0.65      0.66      0.66       250

   micro avg       0.65      0.65      0.65       500
   macro avg       0.65      0.65      0.65       500
weighted avg       0.65      0.65      0.65       500

confusion matrix LR bagging
[[160  90]
 [ 84 166]]


## 2. informal education

### 学習データ，テストデータともにアンダーサンプリング

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y_lst[1], test_size=0.2, random_state=0)
rus = RandomUnderSampler(random_state=0)
X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)
X_test_rus, y_test_rus = rus.fit_sample(X_test, y_test)

In [19]:
clf_xgb_rus = GridSearchCV(xgb, param_xgb, cv=10, n_jobs=-1,verbose=10)
clf_xgb_rus.fit(X_train_rus, y_train_rus)
y_pred_rus_xgb = clf_xgb_rus.predict(X_test_rus)

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   38.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5

In [20]:
print("accuracy xgb under sampled {:.4f}".format(accuracy_score(y_test_rus, y_pred_rus_xgb)))
print("{}".format(classification_report(y_test_rus, y_pred_rus_xgb)))
print("confusion matrix xgb under sampled\n{}".format(confusion_matrix(y_test_rus, y_pred_rus_xgb)))

accuracy xgb under sampled 0.6961
              precision    recall  f1-score   support

           0       0.70      0.69      0.69       255
           1       0.69      0.71      0.70       255

   micro avg       0.70      0.70      0.70       510
   macro avg       0.70      0.70      0.70       510
weighted avg       0.70      0.70      0.70       510

confusion matrix xgb under sampled
[[175  80]
 [ 75 180]]


In [21]:
clf_lr_rus = GridSearchCV(lr, param_lr, cv=10, n_jobs=-1, verbose=10)
clf_lr_rus.fit(X_train_rus, y_train_rus)
y_pred_rus_lr = clf_lr_rus.predict(X_test_rus)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0699s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0274s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  1.7min finished


In [22]:
print("accuracy LR under sampled {:.4f}".format(accuracy_score(y_test_rus, y_pred_rus_lr)))
print("{}".format(classification_report(y_test_rus, y_pred_rus_lr)))
print("confusion matrix LR under sampled \n{}".format(confusion_matrix(y_test_rus, y_pred_rus_lr)))

accuracy LR under sampled 0.6706
              precision    recall  f1-score   support

           0       0.67      0.67      0.67       255
           1       0.67      0.67      0.67       255

   micro avg       0.67      0.67      0.67       510
   macro avg       0.67      0.67      0.67       510
weighted avg       0.67      0.67      0.67       510

confusion matrix LR under sampled 
[[170  85]
 [ 83 172]]


### 学習データをバギング+アンダーサンプリング テストデータをアンダーサンプリング 

In [23]:
bbc_xgb = BalancedBaggingClassifier(base_estimator=xgb,n_estimators=10,replacement=False,n_jobs=-1,random_state=0,verbose=10)
clf_xgb_bbc = GridSearchCV(bbc_xgb, param_xgb_bbc, cv=10, n_jobs=-1,verbose=10)
clf_xgb_bbc.fit(X_train, y_train)
y_pred_bbc_xgb = clf_xgb_bbc.predict(X_test_rus)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   48.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 24.0min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 26.2min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 32.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 40.3min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 44.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 52.3min
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed: 61.7min
[Paralle

In [24]:
print("accuracy xgb bagging {:.4f}".format(accuracy_score(y_test_rus, y_pred_bbc_xgb)))
print("{}".format(classification_report(y_test_rus, y_pred_bbc_xgb)))
print("confusion matrix xgb bagging \n{}".format(confusion_matrix(y_test_rus, y_pred_bbc_xgb)))

accuracy xgb bagging 0.7137
              precision    recall  f1-score   support

           0       0.69      0.77      0.73       255
           1       0.74      0.65      0.70       255

   micro avg       0.71      0.71      0.71       510
   macro avg       0.72      0.71      0.71       510
weighted avg       0.72      0.71      0.71       510

confusion matrix xgb bagging 
[[197  58]
 [ 88 167]]


In [25]:
bbc_lr = BalancedBaggingClassifier(base_estimator=lr,n_estimators=100,replacement=False,n_jobs=-1,random_state=0,verbose=10)
clf_lr_bbc = GridSearchCV(bbc_lr, param_lr_bbc, cv=10, n_jobs=-1,verbose=10)
clf_lr_bbc.fit(X_train, y_train)
y_pred_bbc_lr = clf_lr_bbc.predict(X_test_rus)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   55.0s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 20.9min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 86.1min
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed: 197.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed: 197.8min finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.6s remaining:    1.7s
[Parallel

In [26]:
print("accuracy LR bagging {:.4f}".format(accuracy_score(y_test_rus, y_pred_bbc_lr)))
print("{}".format(classification_report(y_test_rus, y_pred_bbc_lr)))
print("confusion matrix LR bagging\n{}".format(confusion_matrix(y_test_rus, y_pred_bbc_lr)))

accuracy LR bagging 0.5000
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       255
           1       0.00      0.00      0.00       255

   micro avg       0.50      0.50      0.50       510
   macro avg       0.25      0.50      0.33       510
weighted avg       0.25      0.50      0.33       510

confusion matrix LR bagging
[[255   0]
 [255   0]]


  'precision', 'predicted', average, warn_for)


## 3. Non-formal education

### 学習データ，テストデータともにアンダーサンプリング

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y_lst[2], test_size=0.2, random_state=0)
rus = RandomUnderSampler(random_state=0)
X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)
X_test_rus, y_test_rus = rus.fit_sample(X_test, y_test)

In [28]:
clf_xgb_rus = GridSearchCV(xgb, param_xgb, cv=10, n_jobs=-1,verbose=10)
clf_xgb_rus.fit(X_train_rus, y_train_rus)
y_pred_rus_xgb = clf_xgb_rus.predict(X_test_rus)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   47.6s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:  8.6min
[Paralle

In [29]:
print("accuracy xgb under sampled {:.4f}".format(accuracy_score(y_test_rus, y_pred_rus_xgb)))
print("{}".format(classification_report(y_test_rus, y_pred_rus_xgb)))
print("confusion matrix xgb under sampled\n{}".format(confusion_matrix(y_test_rus, y_pred_rus_xgb)))

accuracy xgb under sampled 0.6342
              precision    recall  f1-score   support

           0       0.61      0.72      0.66       298
           1       0.66      0.55      0.60       298

   micro avg       0.63      0.63      0.63       596
   macro avg       0.64      0.63      0.63       596
weighted avg       0.64      0.63      0.63       596

confusion matrix xgb under sampled
[[215  83]
 [135 163]]


In [30]:
clf_lr_rus = GridSearchCV(lr, param_lr, cv=10, n_jobs=-1, verbose=10)
clf_lr_rus.fit(X_train_rus, y_train_rus)
y_pred_rus_lr = clf_lr_rus.predict(X_test_rus)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0356s.) Setting batch_size=10.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  80 out of 140 | elapsed:   11.9s remaining:    8.9s
[Parallel(n_jobs=-1)]: Done 110 out of 140 | elapsed:  1.8min remaining:   29.1s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  2.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  2.4min finished


In [31]:
print("accuracy LR under sampled {:.4f}".format(accuracy_score(y_test_rus, y_pred_rus_lr)))
print("{}".format(classification_report(y_test_rus, y_pred_rus_lr)))
print("confusion matrix LR under sampled \n{}".format(confusion_matrix(y_test_rus, y_pred_rus_lr)))

accuracy LR under sampled 0.6242
              precision    recall  f1-score   support

           0       0.61      0.69      0.65       298
           1       0.64      0.56      0.60       298

   micro avg       0.62      0.62      0.62       596
   macro avg       0.63      0.62      0.62       596
weighted avg       0.63      0.62      0.62       596

confusion matrix LR under sampled 
[[205  93]
 [131 167]]


### 学習データをバギング+アンダーサンプリング テストデータをアンダーサンプリング 

In [32]:
bbc_xgb = BalancedBaggingClassifier(base_estimator=xgb,n_estimators=10,replacement=False,n_jobs=-1,random_state=0,verbose=10)
clf_xgb_bbc = GridSearchCV(bbc_xgb, param_xgb_bbc, cv=10, n_jobs=-1,verbose=10)
clf_xgb_bbc.fit(X_train, y_train)
y_pred_bbc_xgb = clf_xgb_bbc.predict(X_test_rus)

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 33.1min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 40.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 51.1min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 55.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 66

In [33]:
print("accuracy xgb bagging {:.4f}".format(accuracy_score(y_test_rus, y_pred_bbc_xgb)))
print("{}".format(classification_report(y_test_rus, y_pred_bbc_xgb)))
print("confusion matrix xgb bagging \n{}".format(confusion_matrix(y_test_rus, y_pred_bbc_xgb)))

accuracy xgb bagging 0.6242
              precision    recall  f1-score   support

           0       0.63      0.62      0.62       298
           1       0.62      0.63      0.63       298

   micro avg       0.62      0.62      0.62       596
   macro avg       0.62      0.62      0.62       596
weighted avg       0.62      0.62      0.62       596

confusion matrix xgb bagging 
[[184 114]
 [110 188]]


In [34]:
bbc_lr = BalancedBaggingClassifier(base_estimator=lr,n_estimators=100,replacement=False,n_jobs=-1,random_state=0,verbose=10)
clf_lr_bbc = GridSearchCV(bbc_lr, param_lr_bbc, cv=10, n_jobs=-1,verbose=10)
clf_lr_bbc.fit(X_train, y_train)
y_pred_bbc_lr = clf_lr_bbc.predict(X_test_rus)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 14 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   43.5s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 115.9min
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed: 259.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed: 259.5min finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    9.5s remaining:   28.6s
[Paralle

In [35]:
print("accuracy LR bagging {:.4f}".format(accuracy_score(y_test_rus, y_pred_bbc_lr)))
print("{}".format(classification_report(y_test_rus, y_pred_bbc_lr)))
print("confusion matrix LR bagging\n{}".format(confusion_matrix(y_test_rus, y_pred_bbc_lr)))

accuracy LR bagging 0.6460
              precision    recall  f1-score   support

           0       0.63      0.69      0.66       298
           1       0.66      0.60      0.63       298

   micro avg       0.65      0.65      0.65       596
   macro avg       0.65      0.65      0.65       596
weighted avg       0.65      0.65      0.65       596

confusion matrix LR bagging
[[205  93]
 [118 180]]
