# 【第1回_Beginner限定コンペ】銀行の顧客ターゲティング

顧客の属性情報などから定期預金キャンペーンの反応率を予測しよう。

https://signate.jp/competitions/292

Random Forest編

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import optuna
from sklearn.ensemble import RandomForestRegressor as RandomForest

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [50]:
# Load the labelled training rows and the unlabelled test rows.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# The sample submission is read with header=None, so its columns are
# labelled with the integers 0, 1, ... instead of names.
submit_df = pd.read_csv('submit_sample.csv', header=None)

In [51]:
# Integer code for every value of each categorical column. The same table
# is applied to both frames so train and test share one encoding.
CATEGORY_CODES = {
    'job': {
        'unknown': 1, 'technician': 2, 'blue-collar': 3, 'services': 4,
        'entrepreneur': 5, 'admin.': 6, 'management': 7, 'housemaid': 8,
        'self-employed': 9, 'unemployed': 10, 'retired': 11, 'student': 12,
    },
    'marital': {'married': 2, 'divorced': 1, 'single': 0},
    'education': {'tertiary': 3, 'secondary': 2, 'primary': 1, 'unknown': 0},
    'housing': {'yes': 1, 'no': 0},
    'loan': {'yes': 1, 'no': 0},
    'contact': {'telephone': 2, 'cellular': 1, 'unknown': 0},
    # 'failure' and 'other' share code 1.
    'poutcome': {'success': 3, 'unknown': 2, 'failure': 1, 'other': 1},
    'month': {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    },
}

df_list = [train_df, test_df]

for df in df_list:
    # Replace each categorical column, in place, by its integer codes.
    for column, codes in CATEGORY_CODES.items():
        df[column] = df[column].map(codes)

    # The data contains impossible dates such as Feb 30, so an exact
    # day-of-year conversion is not possible; use month * 31 + day as a
    # monotonic position within the year instead.
    df['days'] = df['month'] * 31 + df['day']

    # 'default' is dropped rather than encoded, and the row identifier
    # 'id' is removed as well.
    df.drop(['default', 'id'], axis=1, inplace=True)

In [52]:
# Split the label off the training frame: `pop` removes the column from
# `train_df` in place and returns it, leaving only the other columns behind.
target_column = 'y'
y = train_df.pop(target_column)

In [53]:
# Hold out 20% of the labelled rows; the fixed seed keeps the split the same
# on every run. X_test / y_test are this hold-out part of train.csv, not the
# rows loaded from test.csv.
split_parts = train_test_split(
    train_df,
    y,
    test_size=0.2,
    random_state=25,
)
X_train, X_test, y_train, y_test = split_parts

In [54]:
def objective(trial):
    """Fit a 1000-tree random forest for one Optuna trial and score it.

    Args:
        trial: Optuna trial used to sample ``max_depth`` (2 to 32) and
            ``min_samples_leaf`` (1 to 20).

    Returns:
        ``model.score(X_test, y_test)`` for the fitted forest on the
        hold-out split. ``RandomForest`` is imported as an alias of
        ``RandomForestRegressor``, whose ``score`` is the R^2 coefficient
        of determination, so larger is better.
    """
    params = {
        'n_estimators': 1000,
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        # Fixed seed (same value as the train/test split) so that score
        # differences between trials come from the sampled hyperparameters
        # rather than from the forest's own bootstrap randomness.
        'random_state': 25,
        # Build the trees on all available cores; this does not change the
        # fitted model, only the wall-clock time per trial.
        'n_jobs': -1,
    }
    model = RandomForest(**params).fit(X_train, y_train)
    return model.score(X_test, y_test)

In [55]:
# Search for the hyperparameters that maximize the value returned by
# `objective`, using a fixed budget of trials.
N_TRIALS = 10
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS)

[I 2020-08-08 09:18:04,233] Trial 0 finished with value: 0.20941712902593557 and parameters: {'max_depth': 9, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.20941712902593557.
[I 2020-08-08 09:19:05,187] Trial 1 finished with value: 0.18699093170422887 and parameters: {'max_depth': 29, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.20941712902593557.
[I 2020-08-08 09:19:49,858] Trial 2 finished with value: 0.2099819248733814 and parameters: {'max_depth': 12, 'min_samples_leaf': 17}. Best is trial 2 with value: 0.2099819248733814.
[I 2020-08-08 09:20:40,127] Trial 3 finished with value: 0.20952613761019212 and parameters: {'max_depth': 20, 'min_samples_leaf': 19}. Best is trial 2 with value: 0.2099819248733814.
[I 2020-08-08 09:21:27,658] Trial 4 finished with value: 0.2098406951308367 and parameters: {'max_depth': 14, 'min_samples_leaf': 18}. Best is trial 2 with value: 0.2099819248733814.
[I 2020-08-08 09:22:23,776] Trial 5 finished with value: 0.20283871798786035 and pa

In [56]:
# Summarize the study: number of trials, then the best trial's objective
# value and the hyperparameters that produced it.
print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

Number of finished trials: 10
Best trial:
  Value: 0.2099819248733814
  Params: 
    max_depth: 12
    min_samples_leaf: 17


In [57]:
# Estimate how much each searched hyperparameter contributed to the
# objective value; the trailing bare expression is what the notebook displays.
param_importances = optuna.importance.get_param_importances(study)
param_importances

OrderedDict([('min_samples_leaf', 0.7095990556229859),
             ('max_depth', 0.2904009443770142)])

In [58]:
# Refit the forest with the best hyperparameters found by the study.
best_params = {
    'n_estimators': 1000,
    # Fixed seed (same value as the train/test split) so the model used for
    # evaluation and for the submission is identical on every run.
    'random_state': 25,
    # Build the trees on all available cores; affects speed only.
    'n_jobs': -1,
}
# The tuned values (max_depth, min_samples_leaf) are layered on top of the
# fixed settings above.
best_params.update(study.best_trial.params)

best_model = RandomForest(**best_params).fit(X_train, y_train)

In [59]:
# The model outputs continuous values; round them to the nearest integer so
# they can be compared with the hold-out labels by the classification metrics.
preds = np.round(best_model.predict(X_test))

# (label, metric function) pairs, printed in this order.
holdout_metrics = (
    ('Accuracy score = \t', accuracy_score),
    ('Precision score = \t', precision_score),
    ('Recall score =   \t', recall_score),
    ('F1 score =      \t', f1_score),
)
for label, metric_fn in holdout_metrics:
    print(f'{label} {metric_fn(y_test, preds)}')

Accuracy score = 	 0.9269372693726937
Precision score = 	 0.7428571428571429
Recall score =   	 0.174496644295302
F1 score =      	 0.2826086956521739


In [60]:
# Predict on the encoded test frame and keep the raw (unrounded) model output.
test_pred = best_model.predict(test_df)

# Store the predictions under column label 1 of the sample submission
# (integer labels because it was read with header=None), then write the
# file without a header row or index column.
submit_df[1] = test_pred
submit_df.to_csv(
    'submit-rf.csv',
    header=False,
    index=False,
)