# 【第1回_Beginner限定コンペ】銀行の顧客ターゲティング

顧客の属性情報などから定期預金キャンペーンの反応率を予測しよう。

https://signate.jp/competitions/292

Random Forest編

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import optuna
from sklearn.ensemble import RandomForestRegressor as RandomForest

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submit_df = pd.read_csv('submit_sample.csv', header=None)

In [3]:
df_list = [train_df, test_df]

for df in df_list:
    #df['job'] = job_encoder.transform(df['job'])
    df['job'] = df['job'].map({'unknown': 1, 'technician': 2, 'blue-collar': 3, 'services': 4, 'entrepreneur': 5, 'admin.': 6, 'management': 7, 'housemaid': 8, 'self-employed': 9, 'unemployed': 10, 'retired': 11, 'student': 12})

    df['marital'] = df['marital'].map({'married': 2, 'divorced':1, 'single': 0})
    #df.drop(['marital'], axis=1, inplace=True)

    df['education'] = df['education'].map({'tertiary': 3, 'secondary': 2, 'primary': 1, 'unknown': 0})
    #df.drop(['education'], axis=1, inplace=True)

    #df['default'] = df['default'].map({'yes': 1, 'no': 0})
    df.drop(['default'], axis=1, inplace=True)

    df['housing'] = df['housing'].map({'yes': 1, 'no': 0})
    #df.drop(['housing'], axis=1, inplace=True)
    
    df['loan'] = df['loan'].map({'yes': 1, 'no': 0})
    #df.drop(['loan'], axis=1, inplace=True)

    df['contact'] = df['contact'].map({'telephone': 2, 'cellular': 1, 'unknown': 0})
    #df.drop(['contact'], axis=1, inplace=True)

    df['poutcome'] = df['poutcome'].map({'success': 3, 'unknown': 2, 'failure': 1, 'other': 0})
    
    df['month'] = df['month'].map({'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12})
    #df.drop(['day', 'month'], axis=1, inplace=True)
    
    # Feb 30 とかあって、正確には変換できない
    df['days'] = pd.to_datetime('2016/' + df['month'].astype(str).str.pad(2,fillchar='0') + '/' + df['day'].astype(str).str.pad(2,fillchar='0'), format='%Y/%m/%d').dt.strftime('%j').astype('uint16')
    #df['days'] = df['month'] * 31 + df['day']
    
    df['bpp'] = np.log((df['balance'] - df['balance'].min()) / (df['pdays'] + 2) + 1)
    
    df.drop(['id'], axis=1, inplace=True)

In [4]:
y = train_df.pop('y')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.2, random_state=44)

In [6]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        #'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        #'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 40),
    }
    model = RandomForest(**params).fit(X_train, y_train)
    return model.score(X_test, y_test)
    #return roc_auc_score(y_test, model.predict_proba(X_test)[:,1])

In [7]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[W 2020-08-25 08:43:48,354] Trial 0 failed because of the following error: AttributeError("'RandomForestRegressor' object has no attribute 'predict_proba'",)
Traceback (most recent call last):
  File "/home/ubuntu/work/tensorflow-gpu/.venv/lib/python3.6/site-packages/optuna/study.py", line 709, in _run_trial
    result = func(trial)
  File "<ipython-input-6-d179794a9064>", line 11, in objective
    return roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
AttributeError: 'RandomForestRegressor' object has no attribute 'predict_proba'


AttributeError: 'RandomForestRegressor' object has no attribute 'predict_proba'

In [None]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
optuna.importance.get_param_importances(study)

In [None]:
best_params = {}
best_params.update(study.best_trial.params)

best_model = RandomForest(**best_params).fit(X_train, y_train)

In [None]:
preds = np.round(best_model.predict(X_test))
print('Accuracy score = \t {}'.format(accuracy_score(y_test, preds)))
print('Precision score = \t {}'.format(precision_score(y_test, preds)))
print('Recall score =   \t {}'.format(recall_score(y_test, preds)))
print('F1 score =      \t {}'.format(f1_score(y_test, preds)))

In [None]:
test_pred = best_model.predict(test_df)
submit_df[1] = test_pred
submit_df.to_csv('submit-rf.csv', header=False, index=False)