In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, make_scorer
from sklearn.ensemble import BaggingClassifier
import re
import seaborn as sns
from IPython.display import Image
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_validate

In [2]:
# Read the three competition CSVs; in each file the first column becomes the row index.
train, test, submission = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './data/train.csv',
        './data/test_x.csv',
        './data/sample_submission.csv',
    )
)

### 설문조사 소요시간을 사람별로 정규화

In [3]:
def _survey_time_columns(df):
    """Return the column names of ``df`` that start with 'Q', a lowercase letter, then 'E'."""
    return [col for col in df.columns if re.match('Q[a-z]E', col)]


def _normalize_survey_time(df):
    """Scale every row's Q*E columns by that row's largest Q*E value, in place.

    After the call each selected row has a maximum of 1.0. A row whose maximum
    is 0 produces NaN/inf, exactly as plain division would.

    Returns:
        list[str]: the names of the columns that were rescaled.
    """
    survey_time = _survey_time_columns(df)
    # Vectorised row-wise division; replaces a per-row Python lambda via apply(axis=1).
    row_max = df[survey_time].max(axis=1)
    df[survey_time] = df[survey_time].div(row_max, axis=0)
    return survey_time


# The same transformation is applied to both frames; `survey_time` ends up holding
# the column list of the last frame processed (test).
survey_time = _normalize_survey_time(train)
survey_time = _normalize_survey_time(test)

In [4]:
# Preview the first five training rows to eyeball the rescaled Q*E columns.
train.head(5)

Unnamed: 0_level_0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,...,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,0.230184,4.0,0.868738,5.0,0.632213,1.0,0.649334,2.0,1.0,...,0,1,0,1,1,0,1,0,1,1
1,5.0,0.149457,5.0,0.303303,3.0,0.782398,5.0,0.68584,1.0,0.997921,...,1,1,0,1,1,0,1,0,1,1
2,4.0,0.481031,1.0,0.438648,1.0,0.302608,4.0,1.0,5.0,0.39508,...,1,1,0,1,1,1,1,0,1,1
3,3.0,0.021116,3.0,0.096824,4.0,0.041562,3.0,0.135956,1.0,0.014957,...,0,0,0,0,1,0,1,0,1,1
4,1.0,0.490736,1.0,0.374272,5.0,0.294336,2.0,0.562202,1.0,0.536792,...,1,1,1,1,1,0,1,1,1,1


In [5]:
# Preview the first five test rows to eyeball the rescaled Q*E columns.
test.head(5)

Unnamed: 0_level_0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,...,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,0.08278,2.0,0.330784,3.0,0.519739,1.0,0.546283,2.0,0.286807,...,0,1,0,0,1,0,1,0,1,1
1,3.0,0.113591,2.0,0.431381,3.0,0.342983,3.0,0.181436,4.0,0.254144,...,0,0,0,0,0,0,0,0,0,0
2,3.0,0.150739,2.0,0.755803,4.0,0.144709,2.0,0.185107,2.0,0.399759,...,0,1,0,1,1,0,1,0,1,1
3,1.0,0.231809,1.0,0.363825,5.0,0.497228,2.0,0.780319,5.0,0.877685,...,1,1,1,1,1,1,1,1,1,1
4,2.0,0.256555,1.0,0.639075,5.0,0.434447,2.0,0.856555,2.0,0.475578,...,1,1,0,1,1,0,1,1,1,1


### 마키아벨리즘

In [6]:
# Names of the 20 answer columns: QaA, QbA, ..., QtA.
Answers = [f'Q{letter}A' for letter in 'abcdefghijklmnopqrst']

In [7]:
# Items that are reverse-scored before averaging (x -> 6 - x).
# NOTE(review): assumes answers are on a 1-5 scale, as the previews suggest — confirm.
flipping_columns = ["QeA", "QfA", "QkA", "QqA", "QrA", "QaA", "QdA", "QgA", "QiA", "QnA"]

for frame in (train, test):
    # Idempotence guard: the flip mutates the frame in place, so running this cell
    # a second time would silently flip the items back. Mach_score is only present
    # once a frame has been processed.
    # NOTE(review): assumes the raw CSVs do not already contain a Mach_score column.
    if 'Mach_score' in frame.columns:
        continue
    frame[flipping_columns] = 6 - frame[flipping_columns]
    # Mean of all 20 answer columns (after reverse-scoring) per respondent.
    frame['Mach_score'] = frame[Answers].mean(axis=1)

In [8]:
# Preview train after reverse-scoring; the new Mach_score column is appended last.
train.head()

Unnamed: 0_level_0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,...,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13,Mach_score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,0.230184,4.0,0.868738,5.0,0.632213,5.0,0.649334,4.0,1.0,...,1,0,1,1,0,1,0,1,1,2.95
1,1.0,0.149457,5.0,0.303303,3.0,0.782398,1.0,0.68584,5.0,0.997921,...,1,0,1,1,0,1,0,1,1,2.6
2,2.0,0.481031,1.0,0.438648,1.0,0.302608,2.0,1.0,1.0,0.39508,...,1,0,1,1,1,1,0,1,1,1.9
3,3.0,0.021116,3.0,0.096824,4.0,0.041562,3.0,0.135956,5.0,0.014957,...,0,0,0,1,0,1,0,1,1,3.35
4,5.0,0.490736,1.0,0.374272,5.0,0.294336,4.0,0.562202,5.0,0.536792,...,1,1,1,1,0,1,1,1,1,3.0


In [9]:
# Preview test after reverse-scoring; the new Mach_score column is appended last.
test.head()

Unnamed: 0_level_0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,...,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13,Mach_score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,0.08278,2.0,0.330784,3.0,0.519739,5.0,0.546283,4.0,0.286807,...,1,0,0,1,0,1,0,1,1,2.45
1,3.0,0.113591,2.0,0.431381,3.0,0.342983,3.0,0.181436,2.0,0.254144,...,0,0,0,0,0,0,0,0,0,2.8
2,3.0,0.150739,2.0,0.755803,4.0,0.144709,4.0,0.185107,4.0,0.399759,...,1,0,1,1,0,1,0,1,1,3.55
3,5.0,0.231809,1.0,0.363825,5.0,0.497228,4.0,0.780319,1.0,0.877685,...,1,1,1,1,1,1,1,1,1,3.35
4,4.0,0.256555,1.0,0.639075,5.0,0.434447,4.0,0.856555,4.0,0.475578,...,1,0,1,1,0,1,1,1,1,2.9


In [10]:
# Separate the target column ('voted') from the remaining feature columns.
train_y = train['voted']
train_x = train.drop(columns=['voted'])

In [11]:
# Hold out 25% of the labelled rows for validation. The seed is fixed so the split
# (and therefore the validation AUC reported below) is reproducible across re-runs.
X_train, X_vali, Y_train, Y_vali = train_test_split(
    train_x, train_y, test_size=0.25, random_state=42
)

In [12]:
# One-hot encode the categorical columns. The full training frame defines the
# reference column set; every other frame is aligned to it so that a category
# that happens to be absent from one split/test set cannot produce a frame with
# missing or reordered columns (which would break or silently corrupt predict).
all_train_x_dummies = pd.get_dummies(train_x)
dummy_columns = all_train_x_dummies.columns

train_x_dummies = pd.get_dummies(X_train).reindex(columns=dummy_columns, fill_value=0)
vali_x_dummies = pd.get_dummies(X_vali).reindex(columns=dummy_columns, fill_value=0)
test_x_dummies = pd.get_dummies(test).reindex(columns=dummy_columns, fill_value=0)

In [13]:
# Sanity check: all four encoded frames should report the same number of columns.
train_x_dummies.shape, vali_x_dummies.shape, all_train_x_dummies.shape, test_x_dummies.shape

((34149, 101), (11383, 101), (45532, 101), (11383, 101))

- train_val: fit on the training split and score ROC AUC on the held-out validation split

In [14]:
# Baseline: an otherwise-default LightGBM classifier with 500 trees, fitted on the training split.
lgbm_model = lgbm.LGBMClassifier(n_estimators=500)
lgbm_model.fit(X=train_x_dummies, y=Y_train)

LGBMClassifier(n_estimators=500)

In [15]:
# Validation ROC AUC. Column 1 of predict_proba is the probability of the second
# class in lgbm_model.classes_; slicing the array directly avoids wrapping it in a
# DataFrame and extracting the value row by row with a Python lambda.
roc_auc_score(Y_vali, lgbm_model.predict_proba(vali_x_dummies)[:, 1])

0.7567656823614162

- train_test: refit on the full training set and predict the test set for submission

In [None]:
# Bag 30 LightGBM classifiers (500 trees each) trained on the full training set.
lgbm_model2 = lgbm.LGBMClassifier(n_estimators=500)
# The base model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the first
# positional slot is the base model in both old and new versions.
# random_state fixes the bootstrap draws so the submission is reproducible.
bagging_lgbm_model = BaggingClassifier(
    lgbm_model2, n_estimators=30, random_state=42
).fit(all_train_x_dummies, train_y)

In [None]:
# Store the predicted probability of the second class (column 1 of predict_proba).
# Direct array slicing replaces the DataFrame wrap + row-wise lambda, which produced
# the same values far more slowly.
submission['voted'] = bagging_lgbm_model.predict_proba(test_x_dummies)[:, 1]

In [None]:
# Persist the bagged-model predictions as a submission file.
submission.to_csv(path_or_buf='./data/bagging_lgbm.csv')

### lgbm 파라미터 튜닝

In [30]:
def lgbm_cv(learning_rate, num_leaves, max_depth, min_child_weight, colsample_bytree, feature_fraction, bagging_fraction, lambda_l1, lambda_l2):
    """Objective for the Bayesian search: mean 5-fold CV ROC AUC of a 300-tree LightGBM model.

    Every argument is a float proposed by the optimizer. ``num_leaves``,
    ``max_depth`` and ``min_child_weight`` are rounded to the nearest integer,
    ``feature_fraction`` and ``bagging_fraction`` are clamped to [0, 1], and the
    L1/L2 penalties are clamped to be non-negative before the model is built.

    The model is cross-validated on the notebook globals ``all_train_x_dummies``
    and ``train_y``.

    Returns:
        float: mean of the five per-fold ROC AUC scores.
    """
    # NOTE(review): LightGBM documents `colsample_bytree` as an alias of
    # `feature_fraction`; passing both probably leaves one of them ignored, making
    # it a dead search dimension — confirm against the LightGBM warnings.
    # NOTE(review): LightGBM documents that `bagging_fraction` only takes effect when
    # `bagging_freq` is non-zero, which is not set here — confirm.
    model = lgbm.LGBMClassifier(learning_rate=learning_rate,
                                n_estimators = 300,
                                num_leaves = int(round(num_leaves)),
                                max_depth = int(round(max_depth)),
                                min_child_weight = int(round(min_child_weight)),
                                colsample_bytree = colsample_bytree,
                                feature_fraction = max(min(feature_fraction, 1), 0),
                                bagging_fraction = max(min(bagging_fraction, 1), 0),
                                lambda_l1 = max(lambda_l1, 0),
                                lambda_l2 = max(lambda_l2, 0)
                               )
    # Use the built-in 'roc_auc' scorer instead of make_scorer(..., needs_proba=True):
    # the `needs_proba` keyword is deprecated/removed in newer scikit-learn, whereas
    # the string scorer is available in every version and, for a classifier that only
    # exposes predict_proba, scores the same positive-class probabilities.
    result = cross_validate(model, all_train_x_dummies, train_y, cv=5, scoring='roc_auc')
    # With a single string scorer the per-fold scores are stored under 'test_score'.
    auc_score = result["test_score"].mean()
    return auc_score

In [31]:
# Search space for the optimizer: one (lower, upper) range per lgbm_cv argument.
pbounds = dict(
    learning_rate=(0.0001, 0.05),
    num_leaves=(300, 600),
    max_depth=(2, 25),
    min_child_weight=(30, 100),
    colsample_bytree=(0, 0.99),
    feature_fraction=(0.0001, 0.99),
    bagging_fraction=(0.0001, 0.99),
    lambda_l1=(0, 0.99),
    lambda_l2=(0, 0.99),
)

In [32]:
# Optimizer that searches pbounds for the arguments maximizing lgbm_cv; seeded with 0.
lgbmBO = BayesianOptimization(
    f=lgbm_cv,
    pbounds=pbounds,
    random_state=0,
    verbose=2,
)

In [33]:
# Run the search: 5 initial points plus 20 further iterations (25 lgbm_cv evaluations in total).
# NOTE(review): newer bayes_opt releases appear to have moved `acq`/`xi` out of
# maximize() — confirm against the installed version before upgrading.
lgbmBO.maximize(
    init_points=5,
    n_iter=20,
    acq='ei',
    xi=0.01,
)

|   iter    |  target   | baggin... | colsam... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7677  [0m | [0m 0.5434  [0m | [0m 0.708   [0m | [0m 0.5968  [0m | [0m 0.5394  [0m | [0m 0.4194  [0m | [0m 0.03233 [0m | [0m 12.06   [0m | [0m 92.42   [0m | [0m 589.1   [0m |
| [0m 2       [0m | [0m 0.7618  [0m | [0m 0.3797  [0m | [0m 0.7838  [0m | [0m 0.5237  [0m | [0m 0.5624  [0m | [0m 0.9163  [0m | [0m 0.003645[0m | [0m 4.004   [0m | [0m 31.42   [0m | [0m 549.8   [0m |


| [95m 3       [0m | [95m 0.7683  [0m | [95m 0.7704  [0m | [95m 0.8613  [0m | [95m 0.9688  [0m | [95m 0.7912  [0m | [95m 0.4569  [0m | [95m 0.03905 [0m | [95m 4.72    [0m | [95m 74.79   [0m | [95m 343.0   [0m |
| [0m 4       [0m | [0m 0.7678  [0m | [0m 0.9352  [0m | [0m 0.5166  [0m | [0m 0.4106  [0m | [0m 0.2619  [0m | [0m 0.7665  [0m | [0m 0.02286 [0m | [0m 15.07   [0m | [0m 31.32   [0m | [0m 485.3   [0m |
| [0m 5       [0m | [0m 0.7664  [0m | [0m 0.606   [0m | [0m 0.6108  [0m | [0m 0.9343  [0m | [0m 0.675   [0m | [0m 0.3559  [0m | [0m 0.02191 [0m | [0m 18.05   [0m | [0m 34.22   [0m | [0m 500.0   [0m |


| [0m 6       [0m | [0m 0.7528  [0m | [0m 0.9621  [0m | [0m 0.5592  [0m | [0m 0.02621 [0m | [0m 0.1738  [0m | [0m 0.8315  [0m | [0m 0.03358 [0m | [0m 12.1    [0m | [0m 44.26   [0m | [0m 423.8   [0m |
| [0m 7       [0m | [0m 0.7669  [0m | [0m 0.3638  [0m | [0m 0.2178  [0m | [0m 0.2149  [0m | [0m 0.004065[0m | [0m 0.6756  [0m | [0m 0.02482 [0m | [0m 5.529   [0m | [0m 87.81   [0m | [0m 306.9   [0m |


| [95m 8       [0m | [95m 0.769   [0m | [95m 0.8959  [0m | [95m 0.1059  [0m | [95m 0.7973  [0m | [95m 0.5175  [0m | [95m 0.1686  [0m | [95m 0.02376 [0m | [95m 4.552   [0m | [95m 38.16   [0m | [95m 318.1   [0m |
| [0m 9       [0m | [0m 0.7687  [0m | [0m 0.1773  [0m | [0m 0.1546  [0m | [0m 0.897   [0m | [0m 0.1736  [0m | [0m 0.6533  [0m | [0m 0.01885 [0m | [0m 24.21   [0m | [0m 99.52   [0m | [0m 535.8   [0m |
| [0m 10      [0m | [0m 0.7598  [0m | [0m 0.02805 [0m | [0m 0.486   [0m | [0m 0.2886  [0m | [0m 0.3421  [0m | [0m 0.5953  [0m | [0m 0.008227[0m | [0m 2.824   [0m | [0m 99.71   [0m | [0m 492.2   [0m |


| [0m 11      [0m | [0m 0.7598  [0m | [0m 0.8058  [0m | [0m 0.2891  [0m | [0m 0.267   [0m | [0m 0.8589  [0m | [0m 0.5556  [0m | [0m 0.000992[0m | [0m 24.87   [0m | [0m 99.64   [0m | [0m 534.4   [0m |
| [0m 12      [0m | [0m 0.7668  [0m | [0m 0.8518  [0m | [0m 0.4633  [0m | [0m 0.3379  [0m | [0m 0.6867  [0m | [0m 0.06141 [0m | [0m 0.04023 [0m | [0m 12.76   [0m | [0m 91.87   [0m | [0m 545.5   [0m |
| [0m 13      [0m | [0m 0.7682  [0m | [0m 0.9603  [0m | [0m 0.003919[0m | [0m 0.4968  [0m | [0m 0.1905  [0m | [0m 0.3403  [0m | [0m 0.02221 [0m | [0m 14.29   [0m | [0m 31.24   [0m | [0m 485.8   [0m |


| [0m 14      [0m | [0m 0.7618  [0m | [0m 0.928   [0m | [0m 0.4399  [0m | [0m 0.726   [0m | [0m 0.0656  [0m | [0m 0.9838  [0m | [0m 0.0499  [0m | [0m 15.98   [0m | [0m 30.64   [0m | [0m 486.3   [0m |
| [0m 15      [0m | [0m 0.7683  [0m | [0m 0.187   [0m | [0m 0.9051  [0m | [0m 0.3966  [0m | [0m 0.8458  [0m | [0m 0.4487  [0m | [0m 0.02535 [0m | [0m 12.11   [0m | [0m 91.25   [0m | [0m 544.7   [0m |


| [0m 16      [0m | [0m 0.7688  [0m | [0m 0.7129  [0m | [0m 0.9263  [0m | [0m 0.8627  [0m | [0m 0.2539  [0m | [0m 0.7501  [0m | [0m 0.01292 [0m | [0m 13.36   [0m | [0m 91.28   [0m | [0m 544.6   [0m |
| [0m 17      [0m | [0m 0.7446  [0m | [0m 0.8979  [0m | [0m 0.5156  [0m | [0m 0.04269 [0m | [0m 0.9729  [0m | [0m 0.05342 [0m | [0m 0.005787[0m | [0m 14.5    [0m | [0m 30.95   [0m | [0m 482.9   [0m |
| [0m 18      [0m | [0m 0.7685  [0m | [0m 0.05782 [0m | [0m 0.2703  [0m | [0m 0.943   [0m | [0m 0.1505  [0m | [0m 0.7278  [0m | [0m 0.01376 [0m | [0m 12.08   [0m | [0m 89.59   [0m | [0m 544.1   [0m |


| [0m 19      [0m | [0m 0.7682  [0m | [0m 0.01393 [0m | [0m 0.8269  [0m | [0m 0.3245  [0m | [0m 0.7115  [0m | [0m 0.1635  [0m | [0m 0.03243 [0m | [0m 5.724   [0m | [0m 38.15   [0m | [0m 319.5   [0m |
| [0m 20      [0m | [0m 0.7673  [0m | [0m 0.09324 [0m | [0m 0.1693  [0m | [0m 0.9546  [0m | [0m 0.1888  [0m | [0m 0.371   [0m | [0m 0.0301  [0m | [0m 12.37   [0m | [0m 91.73   [0m | [0m 543.2   [0m |
| [0m 21      [0m | [0m 0.767   [0m | [0m 0.000516[0m | [0m 0.0714  [0m | [0m 0.7266  [0m | [0m 0.2891  [0m | [0m 0.34    [0m | [0m 0.04519 [0m | [0m 13.51   [0m | [0m 90.26   [0m | [0m 543.4   [0m |


| [0m 22      [0m | [0m 0.7681  [0m | [0m 0.2966  [0m | [0m 0.8123  [0m | [0m 0.6863  [0m | [0m 0.3138  [0m | [0m 0.1369  [0m | [0m 0.007872[0m | [0m 23.22   [0m | [0m 99.88   [0m | [0m 537.5   [0m |
| [0m 23      [0m | [0m 0.7672  [0m | [0m 0.6323  [0m | [0m 0.8489  [0m | [0m 0.8719  [0m | [0m 0.9698  [0m | [0m 0.5604  [0m | [0m 0.01066 [0m | [0m 4.694   [0m | [0m 36.91   [0m | [0m 320.3   [0m |


| [0m 24      [0m | [0m 0.7674  [0m | [0m 0.1586  [0m | [0m 0.04466 [0m | [0m 0.3328  [0m | [0m 0.545   [0m | [0m 0.392   [0m | [0m 0.0325  [0m | [0m 24.1    [0m | [0m 97.87   [0m | [0m 536.9   [0m |
| [0m 25      [0m | [0m 0.7677  [0m | [0m 0.5144  [0m | [0m 0.08468 [0m | [0m 0.2883  [0m | [0m 0.6727  [0m | [0m 0.4974  [0m | [0m 0.0404  [0m | [0m 4.769   [0m | [0m 36.42   [0m | [0m 317.2   [0m |


In [34]:
# Best target (mean CV ROC AUC) seen so far and the raw, un-rounded parameters that produced it.
lgbmBO.max

{'target': 0.7689795171044522,
 'params': {'bagging_fraction': 0.8958797357815458,
  'colsample_bytree': 0.10593113961701743,
  'feature_fraction': 0.7973403266487863,
  'lambda_l1': 0.5174759248593073,
  'lambda_l2': 0.16855495537647786,
  'learning_rate': 0.023759205615142862,
  'max_depth': 4.5520687487038725,
  'min_child_weight': 38.15818358905622,
  'num_leaves': 318.0864399129222}}

In [35]:
# Rebuild the best configuration found by the search, applying the same rounding
# and clamping that lgbm_cv applies to the raw optimizer values.
best_params = lgbmBO.max['params']
tuned_lgbm_model = lgbm.LGBMClassifier(
    learning_rate=best_params['learning_rate'],
    # lgbm_cv scored every candidate with 300 trees; without this the final model
    # would fall back to the library default tree count, which is not what the
    # (small) tuned learning rate was optimized for.
    n_estimators=300,
    num_leaves=int(round(best_params['num_leaves'])),
    max_depth=int(round(best_params['max_depth'])),
    min_child_weight=int(round(best_params['min_child_weight'])),
    colsample_bytree=best_params['colsample_bytree'],
    feature_fraction=max(min(best_params['feature_fraction'], 1), 0),
    bagging_fraction=max(min(best_params['bagging_fraction'], 1), 0),
    # Clamped like in lgbm_cv; a no-op while the search bounds start at 0.
    lambda_l1=max(best_params['lambda_l1'], 0),
    lambda_l2=max(best_params['lambda_l2'], 0),
)

In [None]:
# Bag 40 copies of the tuned LightGBM model, trained on the full training set.
# The base model is passed positionally because the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; the first
# positional slot is the base model in both. random_state makes the bootstrap
# draws, and therefore the submission, reproducible.
bagging_lgbm_model = BaggingClassifier(
    tuned_lgbm_model, n_estimators=40, random_state=42
).fit(all_train_x_dummies, train_y)



In [None]:
# Store the predicted probability of the second class (column 1 of predict_proba).
# Direct array slicing replaces the DataFrame wrap + row-wise lambda, which produced
# the same values far more slowly.
submission['voted'] = bagging_lgbm_model.predict_proba(test_x_dummies)[:, 1]

In [None]:
# Persist the tuned bagged-model predictions as a submission file.
submission.to_csv(path_or_buf='./data/bagging_tuned_lgbm.csv')