# Sprint アンサンブル学習 

## 【問題1】ブレンディングのスクラッチ実装

ブレンディング をスクラッチ実装し、単一モデルより精度があがる例を 最低3つ 示してください。精度があがるとは、検証用データに対する平均二乗誤差（MSE）が小さくなることを指します。

In [175]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings("ignore")

In [176]:
# Load the training CSV and keep only the two features and the target.
df_original = pd.read_csv("dataset/train.csv")
df = df_original[["GrLivArea", "YearBuilt", "SalePrice"]]

# Separate the explanatory variables (X) from the target (y).
X = df[["GrLivArea", "YearBuilt"]].values
y = df[["SalePrice"]].values

# Hold out 20% of the rows as test data (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features; the scaler statistics come from the training
# split only and are then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Log-transform the target and flatten it from (n, 1) to (n,).
y_train = np.log(y_train).reshape(-1)
y_test = np.log(y_test).reshape(-1)

In [6]:
# Build one instance of each baseline regressor.
lnr = LinearRegression()
svr = SVR(gamma='auto')
dtr = DecisionTreeRegressor(random_state=0)
rf = RandomForestRegressor(random_state=0)
lgb = LGBMRegressor(random_state=0)

# Report template paired with its estimator, in display order.
single_model_reports = [
    ('{:.5f} [LinearRegression]', lnr),
    ('{:.5f} [SVR]', svr),
    ('{:.5f} [DecisionTreeRegressor]', dtr),
    ('{:.5f} [RandomForestRegressor]', rf),
    ('{:.5f} [LGBMRegressor]', lgb),
]

# Fit every estimator on the training split.
for template, estimator in single_model_reports:
    estimator.fit(X_train, y_train)

# Print the test-split MSE of every estimator.
for template, estimator in single_model_reports:
    print(template.format(mean_squared_error(y_test, estimator.predict(X_test))))

0.05186 [LinearRegression]
0.04744 [SVR]
0.06857 [DecisionTreeRegressor]
0.05114 [RandomForestRegressor]
0.04718 [LGBMRegressor]


単一モデルでは SVR と LGBMRegressor の数値が比較的良好でした。

In [7]:
# Blend two models by averaging their predictions.
def blending(X_train, X_test, y_train, model1, model2):
    """Fit two models, average their test predictions and return the MSE.

    Both estimators are refit in place on ``(X_train, y_train)``; each one
    predicts ``X_test`` right after it is fitted, and the two prediction
    vectors are averaged with equal weight.

    Args:
        X_train: Training features.
        X_test: Features to predict.
        y_train: Training targets.
        model1: First estimator exposing ``fit`` and ``predict``.
        model2: Second estimator exposing ``fit`` and ``predict``.

    Returns:
        The mean squared error of the averaged prediction.

    Note:
        The score is computed against the module-level ``y_test``; it is
        not a parameter of this function.
    """
    predictions = []
    for estimator in (model1, model2):
        estimator.fit(X_train, y_train)
        predictions.append(estimator.predict(X_test))

    first_pred, second_pred = predictions
    averaged = (first_pred + second_pred) / 2
    return mean_squared_error(y_test, averaged)

In [8]:
# Every pairwise combination of the five baseline models, in display order.
blend_reports = [
    ('{:.5f} [lnr x svr]', lnr, svr),
    ('{:.5f} [lnr x dtr]', lnr, dtr),
    ('{:.5f} [lnr x rf]', lnr, rf),
    ('{:.5f} [lnr x lgb]', lnr, lgb),
    ('{:.5f} [svr x dtr]', svr, dtr),
    ('{:.5f} [svr x rf]', svr, rf),
    ('{:.5f} [svr x lgb]', svr, lgb),
    ('{:.5f} [dtr x rf]', dtr, rf),
    ('{:.5f} [dtr x lgb]', dtr, lgb),
    ('{:.5f} [rf x lgb]', rf, lgb),
]

# Print the blended test MSE for each pair.
for template, first_model, second_model in blend_reports:
    print(template.format(blending(X_train, X_test, y_train, first_model, second_model)))

0.04855 [lnr x svr]
0.04936 [lnr x dtr]
0.04641 [lnr x rf]
0.04656 [lnr x lgb]
0.04767 [svr x dtr]
0.04469 [svr x rf]
0.04513 [svr x lgb]
0.05363 [dtr x rf]
0.04931 [dtr x lgb]
0.04683 [rf x lgb]


2つのモデルをブレンドした結果、[SVR x RandomForest]と[SVR x LightGBM]の組み合わせが良好な結果となりました。
なお、最良の単一モデル（LGBMRegressor の 0.04718）よりも MSE が小さかった組み合わせは下記となります。
- [LinearRegression x RandomForestRegressor]
- [LinearRegression x LGBMRegressor]
- [SVR x RandomForestRegressor]
- [SVR x LGBMRegressor]
- [RandomForestRegressor x LGBMRegressor]

## 【問題2】バギングのスクラッチ実装

バギング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。

In [9]:
# Uninitialized (n_test, 1) scratch array; its contents are arbitrary.
y_pred = np.empty((len(X_test), 1))

In [10]:
# Zero-filled column vector with one row per test target; the bare `.shape`
# expression is what the notebook cell displays.
bagging_tree = np.zeros((len(y_test), 1))
bagging_tree.shape

(292, 1)

In [11]:
def Bagging(X_train, y_train, X_test, model, cnt=5, split=0.2, random_state=None):
    """Average the predictions of ``model`` refit on ``cnt`` random subsamples.

    For each round:
        1. shuffle the training data and split it with ``train_test_split``,
        2. refit ``model`` (in place) on the retained portion,
        3. predict ``X_test`` and keep the prediction.
    The result is the element-wise mean of the per-round predictions.

    The subsamples come from a shuffled hold-out split, so rows are drawn
    without replacement within a round.

    Args:
        X_train: Training features, one row per sample.
        y_train: Training targets aligned with ``X_train``.
        X_test: Features to predict; any number of feature columns.
        model: Estimator exposing ``fit`` and ``predict``; it is refit on
            every round, so it ends up fitted on the last subsample.
        cnt: Number of resampling rounds (at least 1).
        split: Passed to ``train_test_split`` as ``test_size``; the share of
            training rows left out on each round.
        random_state: ``None`` (default) keeps the original unseeded
            behaviour. An int seed or a ``numpy.random.RandomState`` makes
            the sequence of subsamples reproducible while still drawing a
            different subsample on each round.

    Returns:
        1-D array with one averaged prediction per row of ``X_test``.

    Raises:
        ValueError: If ``cnt`` is smaller than 1.
    """
    if cnt < 1:
        raise ValueError("cnt must be at least 1, got {}".format(cnt))

    # One shared generator for all rounds: reusing an int seed on every call
    # to train_test_split would produce the same subsample cnt times.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    round_predictions = []
    for _round in range(cnt):
        X_train_bg, _, y_train_bg, _ = train_test_split(
            X_train, y_train, test_size=split, shuffle=True, random_state=rng
        )
        model.fit(X_train_bg, y_train_bg)
        round_predictions.append(np.ravel(model.predict(X_test)))

    # Shape (n_test, cnt): one column per round, averaged across rounds.
    return np.mean(np.column_stack(round_predictions), axis=1)

In [12]:
# Test-split MSE of each baseline estimator after bagging.
bagging_reports = [
    ('{:.5f} [LinearRegression]', lnr),
    ('{:.5f} [SVR]', svr),
    ('{:.5f} [DecisionTreeRegressor]', dtr),
    ('{:.5f} [RandomForestRegressor]', rf),
    ('{:.5f} [LGBMRegressor]', lgb),
]

for template, estimator in bagging_reports:
    print(template.format(mean_squared_error(y_test, Bagging(X_train, y_train, X_test, estimator))))

0.05169 [LinearRegression]
0.04748 [SVR]
0.05410 [DecisionTreeRegressor]
0.04582 [RandomForestRegressor]
0.04785 [LGBMRegressor]


SVR と LGBMRegressor 以外のモデル（LinearRegression、DecisionTreeRegressor、RandomForestRegressor）に関しては、バギングを行うことで単一モデルよりも MSE が小さくなり、パフォーマンスの向上が見られました。

## 【問題3】スタッキングのスクラッチ実装

スタッキング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。

In [171]:
class Stacking():
    """Two-stage stacking regressor built on contiguous K-fold splits.

    Stage 0: each of the first ``model_n`` base models is refit ``split_n``
    times, each time holding out one contiguous slice of the training rows.
    The held-out predictions, concatenated in row order, form one
    meta-feature column for training; the model's predictions for the new
    data, averaged over the ``split_n`` refits, form the matching column for
    inference.

    Stage 1: ``models[0]`` is refit on the meta-feature columns and used as
    the meta-model, so the first base estimator also serves as the
    meta-learner.

    NOTE: ``fit`` only validates and stores its inputs. All model training
    happens inside ``predict`` and refits, in place, the estimators that were
    passed to ``fit``.
    """

    def __init__(self, split_n=3, model_n=2):
        """Store the configuration.

        Args:
            split_n: Number of contiguous folds the training rows are cut into.
            model_n: Number of base models (taken from the front of
                ``models``) to stack.
        """
        self.split_n = split_n
        self.model_n = model_n

    def fit(self, X_train, y_train, X_test, y_test, models):
        """Validate the inputs, compute the fold sizes and store everything.

        Args:
            X_train: 2-D training features.
            y_train: Training targets aligned with ``X_train``.
            X_test: Stored on the instance; ``predict`` does not read it.
            y_test: Stored on the instance; ``predict`` does not read it.
            models: Sequence of estimators exposing ``fit`` and ``predict``;
                must hold at least ``model_n`` entries.

        Raises:
            ValueError: If ``split_n`` is not between 2 and the number of
                training rows, if ``model_n`` is smaller than 1, or if
                ``models`` holds fewer than ``model_n`` estimators.
        """
        n_samples = len(X_train)
        if not 2 <= self.split_n <= n_samples:
            raise ValueError(
                "split_n must be between 2 and the number of training rows "
                "({}), got {}".format(n_samples, self.split_n)
            )
        if self.model_n < 1 or len(models) < self.model_n:
            raise ValueError(
                "model_n must be between 1 and len(models) ({}), got {}".format(
                    len(models), self.model_n
                )
            )

        # Fold sizes: as even as possible, with the larger folds first
        # (e.g. 10 rows in 3 folds -> [4, 3, 3]).
        fold_size, remainder = divmod(n_samples, self.split_n)
        divider = np.full(self.split_n, fold_size)
        divider[:remainder] += 1

        self.divider = divider.astype(int)
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.models = models

    def predict(self, X_test):
        """Train the stack on the stored training data and predict ``X_test``.

        Args:
            X_test: 2-D features to predict.

        Returns:
            The meta-model's prediction for ``X_test``.
        """
        # Use the data and estimators stored by fit(), never module globals.
        X_train = np.asarray(self.X_train)
        y_train = np.asarray(self.y_train)
        base_models = self.models[:self.model_n]

        # Half-open [start, stop) row range of every fold.
        stops = np.cumsum(self.divider)
        starts = stops - self.divider

        oof_columns = []   # one out-of-fold prediction column per base model
        test_columns = []  # one fold-averaged test prediction column per model
        for model in base_models:
            oof_parts = []
            test_parts = []
            for start, stop in zip(starts, stops):
                held_out = np.zeros(X_train.shape[0], dtype=bool)
                held_out[start:stop] = True

                model.fit(X_train[~held_out], y_train[~held_out])
                oof_parts.append(np.ravel(model.predict(X_train[held_out])))
                test_parts.append(np.ravel(model.predict(X_test)))

            # Folds are contiguous and visited in order, so concatenating the
            # held-out predictions keeps them aligned with y_train.
            oof_columns.append(np.concatenate(oof_parts))
            test_columns.append(np.mean(test_parts, axis=0))

        # Always 2-D, even when only one base model is stacked.
        meta_train = np.column_stack(oof_columns)
        meta_test = np.column_stack(test_columns)

        # The first base estimator is refit as the meta-model.
        meta_model = self.models[0]
        meta_model.fit(meta_train, y_train)
        return meta_model.predict(meta_test)

In [177]:
stacking = Stacking()

# Report template paired with a fresh pair of estimators, in display order.
stacking_experiments = [
    ("{:.5f} [lnr x svr]", [LinearRegression(), SVR(gamma='auto')]),
    ("{:.5f} [lnr x dtr]", [LinearRegression(), DecisionTreeRegressor(random_state=0)]),
    ("{:.5f} [lnr x rf]", [LinearRegression(), RandomForestRegressor(random_state=0)]),
    ("{:.5f} [lnr x lgb]", [LinearRegression(), LGBMRegressor(random_state=0)]),
    ("{:.5f} [svr x dtr]", [SVR(gamma='auto'), DecisionTreeRegressor(random_state=0)]),
    ("{:.5f} [svr x rf]", [SVR(gamma='auto'), RandomForestRegressor(random_state=0)]),
    ("{:.5f} [svr x lgb]", [SVR(gamma='auto'), LGBMRegressor(random_state=0)]),
    ("{:.5f} [dtr x rf]", [DecisionTreeRegressor(random_state=0), RandomForestRegressor(random_state=0)]),
    ("{:.5f} [dtr x lgb]", [DecisionTreeRegressor(random_state=0), LGBMRegressor(random_state=0)]),
    ("{:.5f} [rf x lgb]", [RandomForestRegressor(random_state=0), LGBMRegressor(random_state=0)]),
]

# The loop variable is deliberately named `models`: the original cell rebound
# that module-level name before every run, and this keeps doing so.
for template, models in stacking_experiments:
    stacking.fit(X_train, y_train, X_test, y_test, models)
    print(template.format(mean_squared_error(y_test, stacking.predict(X_test))))

0.04715 [lnr x svr]
0.04884 [lnr x dtr]
0.04530 [lnr x rf]
0.04692 [lnr x lgb]
0.04777 [svr x dtr]
0.04548 [svr x rf]
0.04667 [svr x lgb]
0.10924 [dtr x rf]
0.09536 [dtr x lgb]
0.04867 [rf x lgb]


2つのモデルをスタッキングした結果、下記の組み合わせが単一モデルの数値（ [LGBMRegressor]の0.04718）よりも精度が良好な結果となりました。

- [LinearRegression x SVR]
- [LinearRegression x RandomForestRegressor]
- [LinearRegression x LGBMRegressor]
- [SVR x RandomForestRegressor]
- [SVR x LGBMRegressor]
