# 【第1回_Beginner限定コンペ】銀行の顧客ターゲティング

顧客の属性情報などから定期預金キャンペーンの反応率を予測しよう。

https://signate.jp/competitions/292

XGBoost編

In [158]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import optuna
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [159]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submit_df = pd.read_csv('submit_sample.csv', header=None)

In [160]:
df_list = [train_df, test_df]

for df in df_list:
    #df['job'] = job_encoder.transform(df['job'])
    df['job'] = df['job'].map({'unknown': 1, 'technician': 2, 'blue-collar': 3, 'services': 4, 'entrepreneur': 5, 'admin.': 6, 'management': 7, 'housemaid': 8, 'self-employed': 9, 'unemployed': 10, 'retired': 11, 'student': 12})

    df['marital'] = df['marital'].map({'married': 2, 'divorced':1, 'single': 0})
    #df.drop(['marital'], axis=1, inplace=True)

    df['education'] = df['education'].map({'tertiary': 3, 'secondary': 2, 'primary': 1, 'unknown': 0})
    #df.drop(['education'], axis=1, inplace=True)

    #df['default'] = df['default'].map({'yes': 1, 'no': 0})
    df.drop(['default'], axis=1, inplace=True)

    df['housing'] = df['housing'].map({'yes': 1, 'no': 0})
    #df.drop(['housing'], axis=1, inplace=True)
    
    df['loan'] = df['loan'].map({'yes': 1, 'no': 0})
    #df.drop(['loan'], axis=1, inplace=True)

    df['contact'] = df['contact'].map({'telephone': 2, 'cellular': 1, 'unknown': 0})
    #df.drop(['contact'], axis=1, inplace=True)

    df['poutcome'] = df['poutcome'].map({'success': 3, 'unknown': 2, 'failure': 1, 'other': 0})
    
    df['month'] = df['month'].map({'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12})
    #df.drop(['day', 'month'], axis=1, inplace=True)
    
    # Feb 30 とかあって、正確には変換できない
    df['days'] = pd.to_datetime('2016/' + df['month'].astype(str).str.pad(2,fillchar='0') + '/' + df['day'].astype(str).str.pad(2,fillchar='0'), format='%Y/%m/%d').dt.strftime('%j').astype('uint16')
    #df['days'] = df['month'] * 31 + df['day']
    
    df['bpp'] = np.log((df['balance'] - df['balance'].min()) / (df['pdays'] + 2) + 1)
    
    df.drop(['id'], axis=1, inplace=True)

In [161]:
y = train_df.pop('y')

In [162]:
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.2, random_state=44)

In [163]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 100),
    }
    model = xgb.XGBRegressor(**params).fit(X_train, y_train)
    return model.score(X_test, y_test)

In [164]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2020-08-22 09:11:58,666] Trial 0 finished with value: -0.23155550632942212 and parameters: {'n_estimators': 421, 'max_depth': 20, 'min_child_weight': 7, 'scale_pos_weight': 94}. Best is trial 0 with value: -0.23155550632942212.
[I 2020-08-22 09:12:01,804] Trial 1 finished with value: -0.9869926224664562 and parameters: {'n_estimators': 981, 'max_depth': 4, 'min_child_weight': 16, 'scale_pos_weight': 62}. Best is trial 0 with value: -0.23155550632942212.
[I 2020-08-22 09:12:06,841] Trial 2 finished with value: -0.40659833350620267 and parameters: {'n_estimators': 884, 'max_depth': 13, 'min_child_weight': 4, 'scale_pos_weight': 56}. Best is trial 0 with value: -0.23155550632942212.
[I 2020-08-22 09:12:10,564] Trial 3 finished with value: -0.0409296153878016 and parameters: {'n_estimators': 403, 'max_depth': 27, 'min_child_weight': 6, 'scale_pos_weight': 10}. Best is trial 3 with value: -0.0409296153878016.
[I 2020-08-22 09:12:21,205] Trial 4 finished with value: -0.14898580568564523 a

In [165]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 20
Best trial:
  Value: 0.23642243041761268
  Params: 
    n_estimators: 716
    max_depth: 2
    min_child_weight: 20
    scale_pos_weight: 1


In [166]:
optuna.importance.get_param_importances(study)

OrderedDict([('max_depth', 0.46722911979319887),
             ('min_child_weight', 0.31377830808533647),
             ('scale_pos_weight', 0.13669732853563774),
             ('n_estimators', 0.08229524358582702)])

In [167]:
best_params = {}
best_params.update(study.best_trial.params)

best_model = xgb.XGBRegressor(**best_params).fit(X_train, y_train)

In [168]:
preds = np.round(best_model.predict(X_test))
print('Accuracy score = \t {}'.format(accuracy_score(y_test, preds)))
print('Precision score = \t {}'.format(precision_score(y_test, preds)))
print('Recall score =   \t {}'.format(recall_score(y_test, preds)))
print('F1 score =      \t {}'.format(f1_score(y_test, preds)))

Accuracy score = 	 0.9324723247232473
Precision score = 	 0.8301886792452831
Recall score =   	 0.2018348623853211
F1 score =      	 0.32472324723247237


In [169]:
test_pred = best_model.predict(test_df)
submit_df[1] = test_pred
submit_df.to_csv('submit-xg.csv', header=False, index=False)