In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.datasets import load_boston

from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error

import lightgbm as lgb
import xgboost as xgb
from catboost import Pool
from catboost import CatBoost

## データセット

In [2]:
boston = load_boston()

df = pd.concat([pd.DataFrame(boston.data, columns=boston.feature_names),
           pd.DataFrame(boston.target, columns=["target"])], 
          axis=1)

## XGBoost

In [3]:
def xgb_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv, loop_counts):
    # データを格納する
    # 学習用
    xgb_train = xgb.DMatrix(X_train_cv, label=y_train_cv)
    # 検証用
    xgb_eval = xgb.DMatrix(X_eval_cv, label=y_eval_cv)
    # テスト用
    #xgb_test = xgb.DMatrix(X_test, label=y_test)

    xgb_params = {
        'objective': 'reg:squarederror',  # 回帰
        'eval_metric': 'rmse'           # 学習用の指標 (RMSE)
    }

    # 学習
    evals = [(xgb_train, 'train'), (xgb_eval, 'eval')] # 学習に用いる検証用データ
    evaluation_results = {}                            # 学習の経過を保存する箱
    bst = xgb.train(xgb_params,                        # 上記で設定したパラメータ
                    xgb_train,                         # 使用するデータセット
                    num_boost_round=200,               # 学習の回数
                    early_stopping_rounds=10,          # アーリーストッピング
                    evals=evals,                       # 学習経過で表示する名称
                    evals_result=evaluation_results,   # 上記で設定した検証用データ
                    verbose_eval=0                     # 学習の経過の表示(非表示)
                    )
    
    # 検証用データで予測
    y_pred = bst.predict(xgb_eval, ntree_limit=bst.best_ntree_limit)

    print('Trial: ' + str(loop_counts))
    
    # RMSEの評価
    rmse = mean_squared_error(y_eval_cv, y_pred, squared=True)
    print('XGBoost Validation:', rmse)
    
    return(bst, rmse, y_pred)

## LightGBM

In [4]:
def lgbm_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv):
    # データを格納する
    # 学習用
    lgb_train = lgb.Dataset(X_train_cv, y_train_cv,
                            free_raw_data=False)
    # 検証用
    lgb_eval = lgb.Dataset(X_eval_cv, y_eval_cv, reference=lgb_train,
                           free_raw_data=False)
    
    # パラメータを設定
    params = {'task': 'train',                # レーニング ⇔　予測predict
              'boosting_type': 'gbdt',        # 勾配ブースティング
              'objective': 'regression',      # 目的関数：多値分類、マルチクラス分類
              'metric': 'rmse',      # 検証用データセットで、分類モデルの性能を測る指標
              'learning_rate': 0.1,           # 学習率（初期値0.1）
              'num_leaves': 23,               # 決定木の複雑度を調整（初期値31）
              'min_data_in_leaf': 1,          # データの最小数（初期値20）
             }

    # 学習
    evaluation_results = {}                                # 学習の経過を保存する箱
    model = lgb.train(params,                              # 上記で設定したパラメータ
                      lgb_train,                           # 使用するデータセット
                      num_boost_round=200,                 # 学習の回数
                      valid_names=['train', 'valid'],      # 学習経過で表示する名称
                      valid_sets=[lgb_train, lgb_eval],    # モデルの検証に使用するデータセット
                      evals_result=evaluation_results,     # 学習の経過を保存
                      early_stopping_rounds=10,            # アーリーストッピングの回数
                      verbose_eval=0                      # 学習の経過を表示する刻み（非表示）
    )

    # 検証用データで予測
    y_pred = model.predict(X_eval_cv, num_iteration=model.best_iteration)
    
    # RMSEの評価
    rmse = mean_squared_error(y_eval_cv, y_pred, squared=True)
    print('LightGBM Validation:', rmse)
    
    return(model, rmse, y_pred)

## CatBoost

In [5]:
def catboost_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv):
    # データを格納する
    # 学習用
    CatBoost_train = Pool(X_train_cv, label=y_train_cv)
    # 検証用
    CatBoost_eval = Pool(X_eval_cv, label=y_eval_cv)

    # パラメータを設定
    params = {        
        'loss_function': 'RMSE',          # 多値分類問題
        'num_boost_round': 1000,          # 学習の回数
        'early_stopping_rounds': 10       # アーリーストッピングの回数
    }

    # 学習
    catb = CatBoost(params)
    catb.fit(CatBoost_train, eval_set=[CatBoost_eval], verbose=False)

    # 検証用データで予測
    y_pred = catb.predict(X_eval_cv)
    
    # RMSEの評価
    rmse = mean_squared_error(y_eval_cv, y_pred, squared=True)
    print('CatBoost Validation:', rmse)
    
    return(catb, rmse, y_pred)

## 学習

In [6]:
# 各5つのモデルを保存するリストの初期化
xgb_models = []
lgbm_models = []
catb_models = []

# 各5つのモデルの正答率を保存するリストの初期化
xgb_validations = []
lgbm_validations = []
catb_validations = []

# 学習のカウンター
loop_counts = 1

# 各クラスの確率（3モデル*5seed*３クラス）
first_reg = pd.DataFrame(np.zeros((len(df), 3*5*3)))


for seed_no in range(5): 
        
    # 学習データの数だけの数列（0行から最終行まで連番）
    row_no_list = list(range(len(df)))

    # KFoldクラスをインスタンス化（これを使って5分割する）
    K_fold =  KFold(n_splits=5, shuffle=True, random_state=42)
    
    # KFoldクラスで分割した回数だけ実行（ここでは5回）
    for train_cv_no, eval_cv_no in K_fold.split(row_no_list, df["target"]):
        # ilocで取り出す行を指定
        X_train_cv = df.drop(["target"], axis=1).iloc[train_cv_no, :]
        y_train_cv = df["target"].iloc[train_cv_no]
        X_eval_cv  = df.drop(["target"], axis=1).iloc[eval_cv_no, :]
        y_eval_cv  = df["target"].iloc[eval_cv_no]
        
        # XGBoostの訓練を実行
        bst, bst_validation, xgb_reg = xgb_train_cv(X_train_cv, y_train_cv,
                                                   X_eval_cv, y_eval_cv, 
                                                   loop_counts)
        # LIghtGBMの訓練を実行
        model, model_validation, lgbm_reg = lgbm_train_cv(X_train_cv, y_train_cv, 
                                                         X_eval_cv, y_eval_cv)
        # CatBoostの訓練を実行
        catb, catb_validation, catb_reg = catboost_train_cv(X_train_cv, y_train_cv,
                                                           X_eval_cv, y_eval_cv)
        # 実行回数のカウント
        loop_counts += 1
        
        # 学習が終わったモデルをリストに入れておく
        xgb_models.append(bst) 
        lgbm_models.append(model) 
        catb_models.append(catb) 
        
        # 学習が終わったモデルの正答率をリストに入れておく
        xgb_validations.append(bst_validation) 
        lgbm_validations.append(model_validation) 
        catb_validations.append(catb_validation) 
        
        # 検証データの各クラスの回帰
        for i in range(3):
            first_reg.iloc[eval_cv_no, (seed_no * 3) + i] = xgb_reg[i]
            first_reg.iloc[eval_cv_no, (seed_no * 3) + 15 + i] = lgbm_reg[i]
            first_reg.iloc[eval_cv_no, (seed_no * 3) + 30 + i] = catb_reg[i]
            
first_reg.head()



Trial: 1
XGBoost Validation: 6.55995747666388
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1030
[LightGBM] [Info] Number of data points in the train set: 404, number of used features: 13
[LightGBM] [Info] Start training from score 22.796535
LightGBM Validation: 4.330466272095863
CatBoost Validation: 8.644868063307952
Trial: 2
XGBoost Validation: 10.083251246830114
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1042
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.712099




LightGBM Validation: 8.455796668796213
CatBoost Validation: 8.13000346864853
Trial: 3
XGBoost Validation: 12.81116460604389
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1032
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.322963
LightGBM Validation: 15.293116975972934




CatBoost Validation: 11.749212823553451
Trial: 4
XGBoost Validation: 8.723749894351535
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.327654




LightGBM Validation: 9.374767798015942
CatBoost Validation: 8.069802434283632
Trial: 5
XGBoost Validation: 6.08487379914099
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1041
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.505432




LightGBM Validation: 12.533191330170698
CatBoost Validation: 7.766294065731304
Trial: 6
XGBoost Validation: 6.55995747666388
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1030
[LightGBM] [Info] Number of data points in the train set: 404, number of used features: 13
[LightGBM] [Info] Start training from score 22.796535




LightGBM Validation: 4.330466272095863
CatBoost Validation: 8.644868063307952
Trial: 7
XGBoost Validation: 10.083251246830114
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1042
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.712099




LightGBM Validation: 8.455796668796213
CatBoost Validation: 8.13000346864853
Trial: 8
XGBoost Validation: 12.81116460604389
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1032
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.322963
LightGBM Validation: 15.293116975972934




CatBoost Validation: 11.749212823553451
Trial: 9
XGBoost Validation: 8.723749894351535
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.327654




LightGBM Validation: 9.374767798015942
CatBoost Validation: 8.069802434283632
Trial: 10
XGBoost Validation: 6.08487379914099
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1041
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.505432




LightGBM Validation: 12.533191330170698
CatBoost Validation: 7.766294065731304
Trial: 11
XGBoost Validation: 6.55995747666388
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1030
[LightGBM] [Info] Number of data points in the train set: 404, number of used features: 13
[LightGBM] [Info] Start training from score 22.796535




LightGBM Validation: 4.330466272095863
CatBoost Validation: 8.644868063307952
Trial: 12
XGBoost Validation: 10.083251246830114
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1042
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.712099
LightGBM Validation: 8.455796668796213




CatBoost Validation: 8.13000346864853
Trial: 13
XGBoost Validation: 12.81116460604389
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1032
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.322963
LightGBM Validation: 15.293116975972934




CatBoost Validation: 11.749212823553451
Trial: 14
XGBoost Validation: 8.723749894351535
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.327654




LightGBM Validation: 9.374767798015942
CatBoost Validation: 8.069802434283632
Trial: 15
XGBoost Validation: 6.08487379914099
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1041
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.505432




LightGBM Validation: 12.533191330170698
CatBoost Validation: 7.766294065731304
Trial: 16
XGBoost Validation: 6.55995747666388
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1030
[LightGBM] [Info] Number of data points in the train set: 404, number of used features: 13
[LightGBM] [Info] Start training from score 22.796535




LightGBM Validation: 4.330466272095863
CatBoost Validation: 8.644868063307952
Trial: 17
XGBoost Validation: 10.083251246830114
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1042
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.712099




LightGBM Validation: 8.455796668796213
CatBoost Validation: 8.13000346864853
Trial: 18
XGBoost Validation: 12.81116460604389
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1032
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.322963
LightGBM Validation: 15.293116975972934




CatBoost Validation: 11.749212823553451
Trial: 19
XGBoost Validation: 8.723749894351535
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.327654




LightGBM Validation: 9.374767798015942
CatBoost Validation: 8.069802434283632
Trial: 20
XGBoost Validation: 6.08487379914099
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1041
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.505432




LightGBM Validation: 12.533191330170698
CatBoost Validation: 7.766294065731304
Trial: 21
XGBoost Validation: 6.55995747666388
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1030
[LightGBM] [Info] Number of data points in the train set: 404, number of used features: 13
[LightGBM] [Info] Start training from score 22.796535
LightGBM Validation: 4.330466272095863




CatBoost Validation: 8.644868063307952
Trial: 22
XGBoost Validation: 10.083251246830114
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1042
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.712099
LightGBM Validation: 8.455796668796213




CatBoost Validation: 8.13000346864853
Trial: 23
XGBoost Validation: 12.81116460604389
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1032
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.322963
LightGBM Validation: 15.293116975972934




CatBoost Validation: 11.749212823553451
Trial: 24
XGBoost Validation: 8.723749894351535
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.327654




LightGBM Validation: 9.374767798015942
CatBoost Validation: 8.069802434283632
Trial: 25
XGBoost Validation: 6.08487379914099
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1041
[LightGBM] [Info] Number of data points in the train set: 405, number of used features: 13
[LightGBM] [Info] Start training from score 22.505432




LightGBM Validation: 12.533191330170698
CatBoost Validation: 7.766294065731304


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,27.145155,32.89201,19.417727,27.145155,32.89201,19.417727,27.145155,32.89201,19.417727,27.145155,...,18.767848,26.740495,35.423938,18.767848,26.740495,35.423938,18.767848,26.740495,35.423938,18.767848
1,22.010473,19.930988,14.301942,22.010473,19.930988,14.301942,22.010473,19.930988,14.301942,22.010473,...,13.497022,21.587954,20.17389,13.497022,21.587954,20.17389,13.497022,21.587954,20.17389,13.497022
2,27.145155,32.89201,19.417727,27.145155,32.89201,19.417727,27.145155,32.89201,19.417727,27.145155,...,18.767848,26.740495,35.423938,18.767848,26.740495,35.423938,18.767848,26.740495,35.423938,18.767848
3,34.725506,23.145926,15.534187,34.725506,23.145926,15.534187,34.725506,23.145926,15.534187,34.725506,...,17.575805,36.858545,27.144804,17.575805,36.858545,27.144804,17.575805,36.858545,27.144804,17.575805
4,33.646828,16.715572,19.566851,33.646828,16.715572,19.566851,33.646828,16.715572,19.566851,33.646828,...,20.861329,33.146283,17.354749,20.861329,33.146283,17.354749,20.861329,33.146283,17.354749,20.861329


## XGBoost で Stacking

In [7]:
loop_counts = 0

# 学習データとテストデータに分ける
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["target"], axis=1), 
    df["target"], 
    random_state=42,
    test_size=0.2
)

# 予測結果の格納用のnumpy行列を作成
test_preds = np.zeros((len(y_test), 5))

# 学習データの数だけの数列（0行から最終行まで連番）
row_no_list = list(range(len(y_train)))

# KFoldクラスをインスタンス化（これを使って5分割する）
K_fold = KFold(n_splits=5, shuffle=True,  random_state=42)

# KFoldクラスで分割した回数だけ実行（ここでは5回）
for train_cv_no, eval_cv_no in K_fold.split(row_no_list, y_train):
    # ilocで取り出す行を指定
    X_train_cv = X_train.iloc[train_cv_no, :]
    y_train_cv = pd.Series(y_train).iloc[train_cv_no]
    X_eval_cv = X_train.iloc[eval_cv_no, :]
    y_eval_cv = pd.Series(y_train).iloc[eval_cv_no]

    # データを格納する
    # 学習用
    xgb_train = xgb.DMatrix(X_train_cv, label=y_train_cv)
    # 検証用
    xgb_eval = xgb.DMatrix(X_eval_cv, label=y_eval_cv)
    # テスト用
    xgb_test = xgb.DMatrix(X_test, label=y_test)

    xgb_params = {
        'objective': 'reg:squarederror',  # 回帰
        'learning_rate': 0.1,           # 学習率
        'eval_metric': 'rmse'           # 学習用の指標 (RMSE)
    }
    
    # 学習
    evals = [(xgb_train, 'train'), (xgb_eval, 'eval')] # 学習に用いる検証用データ
    evaluation_results = {}                            # 学習の経過を保存する箱
    bst = xgb.train(xgb_params,                        # 上記で設定したパラメータ
                    xgb_train,                         # 使用するデータセット
                    num_boost_round=200,               # 学習の回数
                    early_stopping_rounds=10,          # アーリーストッピング
                    evals=evals,                       # 学習経過で表示する名称
                    evals_result=evaluation_results,   # 上記で設定した検証用データ
                    verbose_eval=0                     # 学習の経過の表示(非表示)
                    )

    y_pred = bst.predict(xgb_test, ntree_limit=bst.best_ntree_limit)
    
    # testの予測を保存
    test_preds[:, loop_counts] = y_pred
 
    print('Trial: ' + str(loop_counts))
    loop_counts += 1    
    rmse = mean_squared_error(y_test, y_pred, squared=True)
    
    print('RMSE:', rmse)

Trial: 0




RMSE: 10.12527605234543
Trial: 1
RMSE: 6.620539824211737




Trial: 2
RMSE: 10.36307283523485
Trial: 3
RMSE: 9.720079862500858
Trial: 4
RMSE: 5.15608715971082




## 予測

In [8]:
y_pred_mean = test_preds.mean(axis=1)

mean_squared_error(y_test, y_pred_mean, squared=True)

7.002718516727869

In [9]:
y_pred = test_preds[:, 0] * 0.1 + test_preds[:, 1] * 0.2 + test_preds[:, 2] * 0.1 + test_preds[:, 3] * 0.1 + test_preds[:, 4] * 0.6

mean_squared_error(y_test, y_pred, squared=True)

9.036437175317653