# 準備

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Gates the pickle dumps in the model-saving cell further down
FILE_SAVE = True
# Hex colour string; not referenced in the visible part of this notebook
DEFAULT_COLOR = '#1f77b4'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from multiprocessing import cpu_count
import math
import pickle

In [None]:
# Load the training table from CSV (the file name suggests it is already preprocessed)
train = pd.read_csv("./data/train_processed_02.csv")

In [None]:
# Pickle output paths: the first random-forest model and the GridSearchCV object
MODEL_NAME_RF    = "./data/rf_model_02.sav"
MODEL_NAME_RF_CV = "./data/rf_model_cv_02.sav"

## 実行条件

In [None]:
# Whether to save the visualization results
# NOTE(review): not referenced in the visible code; the model-saving cell checks FILE_SAVE instead
FILE_SAVE_FLAG = False

# Whether to show debug output
# NOTE(review): not referenced in the visible code
DEBUG = False

# データ確認

In [None]:
# First rows of the training data
train.head()

In [None]:
# Check the column names
train.columns

In [None]:
# Feature columns used as model inputs (selected from `train` in the split cell)
Feature_columns = ["cylinders", "displacement", "horsepower",
                   "weight", "model year"]

# 学習用前処理

In [None]:
# Drop the unwanted "Unnamed: 0" column.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
train = train.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Split into train/test sets: feature columns as inputs, mpg as the target.
# The seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train[Feature_columns],
    train["mpg"],
    random_state=0,
)

# 学習

In [None]:
# First model: random forest with 80 trees and a fixed seed, fitted on the training split
rf_model = RandomForestRegressor(random_state=0, n_estimators=80)
rf_model.fit(X_train, y_train)

## 重要度

In [None]:
# Table of feature importances (largest first) plus their cumulative share in percent
rf_reg_df = pd.DataFrame(
    data=rf_model.feature_importances_,
    index=Feature_columns,
    columns=["importance"],
).sort_values("importance", ascending=False)
rf_reg_df["imp_cumsum"] = rf_reg_df["importance"].cumsum() * 100
rf_reg_df

In [None]:
# Feature importance of the first model:
# bars on the left axis, cumulative share (%) as a red line on the right axis.
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax2 = ax1.twinx()

plt.title("feature importance")
ax1.set_ylim([0, 1])
ax1.bar(rf_reg_df.index, rf_reg_df.importance)

ax2.set_ylim([0, 100])
ax2.plot(rf_reg_df.index, rf_reg_df.imp_cumsum, c='r')

# Style the tick labels only after plotting. Calling set_xticklabels with
# Feature_columns up front pinned the labels in their original order, while the
# bars are drawn in importance-sorted order, so bars ended up mislabelled.
# Here the labels come from the plotted (sorted) index itself.
plt.setp(ax1.get_xticklabels(), rotation=45, ha="right")

# ハイパーパラメータ選定のための交差検証

In [None]:
# Hyperparameters to evaluate
K = 5                                       # number of splits / cross-validation rounds
grid_param = {
    'n_estimators': [20, 40, 80, 100, 120], # number of decision trees
    'max_depth': [4, 8, 16]                 # tree depth
    # NOTE(review): the two disabled entries below are not RandomForestRegressor
    # parameters; they look like leftovers from a boosting model -- confirm and remove.
    #    'num_leaves':[31,15,7,3],          #
    #    'learning_rate':[0.1,0.05,0.01]    #
}

In [None]:
# Build the grid search over `grid_param` for a random forest regressor.
model = RandomForestRegressor(random_state=1)
rf_grid = GridSearchCV(
    estimator=model,                    # base model
    param_grid=grid_param,              # parameter combinations to evaluate
    # Shuffled K-fold splitter. The seed is fixed so fold assignment, and
    # therefore the selected parameters and score, are reproducible across runs.
    cv=KFold(n_splits=K, shuffle=True, random_state=0),
    scoring='neg_mean_squared_error',   # scoring metric (negated MSE, higher is better)
    n_jobs=cpu_count()                  # one worker per CPU core
)

# モデル保管

In [None]:
# Persist the models when saving is enabled.
# Context managers ensure the file handles are flushed and closed.
if FILE_SAVE:
    with open(MODEL_NAME_RF, 'wb') as rf_file:
        pickle.dump(rf_model, rf_file)
    # NOTE(review): in top-to-bottom execution rf_grid.fit() runs in a later
    # cell, so this stores the grid-search object before it has been fitted.
    with open(MODEL_NAME_RF_CV, 'wb') as cv_file:
        pickle.dump(rf_grid, cv_file)
    # The best estimator (tree_grid_best) only exists after the grid search is
    # fitted and no output path is defined for it, so it is not saved here.

# (参考)ベストモデル

## 学習

In [None]:
# Run the grid search on the training split
rf_grid.fit(X_train, y_train)

# Estimator refitted with the best parameter combination
tree_grid_best = rf_grid.best_estimator_

# The model-saving cell runs before this fit, so the file it wrote holds an
# unfitted search object. Re-save the fitted one to the same path, under the
# same flag, so the persisted model is actually usable.
if FILE_SAVE:
    with open(MODEL_NAME_RF_CV, 'wb') as cv_file:
        pickle.dump(rf_grid, cv_file)

## 評価値

In [None]:
# Report the selected parameters and the best cross-validated score.
# The search is scored with 'neg_mean_squared_error', so sqrt(abs(score)) is the RMSE.
print("Best Model Parameter: ", rf_grid.best_params_)
print("Best Model Score    : ", math.sqrt(abs(rf_grid.best_score_)))

## 重要度

In [None]:
# Table of the best estimator's feature importances (largest first)
# plus their cumulative share in percent
rf_reg_cv_df = pd.DataFrame(
    data=tree_grid_best.feature_importances_,
    index=Feature_columns,
    columns=["importance"],
).sort_values("importance", ascending=False)
rf_reg_cv_df["imp_cumsum"] = rf_reg_cv_df["importance"].cumsum() * 100
rf_reg_cv_df

In [None]:
# Feature importance, side by side: best estimator from the grid search (left)
# and the first model (right).
def _plot_importance(fig, position, importance_df, title):
    """Draw one importance panel on `fig` and return its two axes.

    Bars show `importance_df.importance` on the left axis (0-1); the red line
    shows `importance_df.imp_cumsum` on the right axis (0-100).

    Parameters:
        fig: figure that receives the subplot.
        position: subplot position code passed to `fig.add_subplot` (e.g. 121).
        importance_df: frame indexed by feature name with `importance` and
            `imp_cumsum` columns, already sorted in display order.
        title: panel title.

    Returns:
        (bar_axis, cumulative_axis)
    """
    ax_bar = fig.add_subplot(position)
    ax_cum = ax_bar.twinx()
    ax_bar.set_title(title)
    ax_bar.set_ylim([0, 1])
    ax_bar.bar(importance_df.index, importance_df.importance)
    ax_cum.set_ylim([0, 100])
    ax_cum.plot(importance_df.index, importance_df.imp_cumsum, c='r')
    # Style the tick labels after plotting so they come from the plotted
    # (importance-sorted) index. Setting Feature_columns as labels beforehand
    # pinned them in unsorted order and mislabelled the bars.
    plt.setp(ax_bar.get_xticklabels(), rotation=45, ha="right")
    return ax_bar, ax_cum


fig = plt.figure(figsize=(15, 5))

ax1, ax2 = _plot_importance(
    fig, 121, rf_reg_cv_df, "[best estimator]feature importance")

# ax1/ax2 end up bound to the right-hand panel, as in the original cell
ax1, ax2 = _plot_importance(
    fig, 122, rf_reg_df, "[first estimator]feature importance")