<a href="https://colab.research.google.com/github/MajiroZ/for_git_study/blob/master/ensemble_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# train.csvの目的変数としてSalePrice, 説明変数としてGrLivAreaとYearBuiltを抽出

import pandas as pd
from sklearn.model_selection import train_test_split

# データの読み込み
df = pd.read_csv('/train.csv')

print(df)

# 目的変数と説明変数の選択
target_variable = 'SalePrice'
explanatory_variables = ['GrLivArea', 'YearBuilt']

# 欠損値を含む行を削除
df = df.dropna(subset=[target_variable] + explanatory_variables)

# データの分割
X = df[explanatory_variables]
y = df[target_variable]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# 分割後のデータの確認 (任意)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_valid shape:", X_valid.shape)
print("y_valid shape:", y_valid.shape)

        Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1          60       RL         65.0     8450   Pave   NaN      Reg   
1        2          20       RL         80.0     9600   Pave   NaN      Reg   
2        3          60       RL         68.0    11250   Pave   NaN      IR1   
3        4          70       RL         60.0     9550   Pave   NaN      IR1   
4        5          60       RL         84.0    14260   Pave   NaN      IR1   
...    ...         ...      ...          ...      ...    ...   ...      ...   
1455  1456          60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457          20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458          70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459          20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460          20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities  ... PoolArea PoolQC  Fe

##【問題1】ブレンディングのスクラッチ実装



In [12]:
# 線形回帰、SVM、決定木、ニューラルネットワークそれぞれ単一のモデルでのMSEを求める

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# データの読み込み
df = pd.read_csv('/train.csv')

# 目的変数と説明変数の選択
target_variable = 'SalePrice'
explanatory_variables = ['GrLivArea', 'YearBuilt']

# 欠損値を含む行を削除
df = df.dropna(subset=[target_variable] + explanatory_variables)

# データの分割
X = df[explanatory_variables]
y = df[target_variable]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)


# 線形回帰
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_valid)
mse_lr = mean_squared_error(y_valid, y_pred_lr)
print(f"Linear Regression MSE: {mse_lr}")

# SVM
svr = SVR()
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_valid)
mse_svr = mean_squared_error(y_valid, y_pred_svr)
print(f"SVM MSE: {mse_svr}")

# 決定木
dt = DecisionTreeRegressor(random_state=0)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_valid)
mse_dt = mean_squared_error(y_valid, y_pred_dt)
print(f"Decision Tree MSE: {mse_dt}")

# ニューラルネットワーク
mlp = MLPRegressor(random_state=0, max_iter=1000) # max_iterを増やして収束を確認
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_valid)
mse_mlp = mean_squared_error(y_valid, y_pred_mlp)
print(f"MLPRegressor MSE: {mse_mlp}")

Linear Regression MSE: 2942066921.6721087
SVM MSE: 7235023974.812659
Decision Tree MSE: 3009170128.186454
MLPRegressor MSE: 3731243284.413869


In [13]:
# ブレンディングをスクラッチ実装する

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# モデルのインスタンス化
model1 = LinearRegression()
model2 = SVR()
model3 = DecisionTreeRegressor()

# モデルの学習
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

# 予測値の取得
pred1 = model1.predict(X_valid)
pred2 = model2.predict(X_valid)
pred3 = model3.predict(X_valid)

# ブレンディング
# 予測値を重み付け平均で結合する
# 重みの合計が1になるように調整
w1 = 0.3  # LinearRegressionの重み
w2 = 0.3  # SVRの重み
w3 = 0.4  # DecisionTreeRegressorの重み

# 重みを調整し、合計が1になることを確認
# assert np.isclose(w1 + w2 + w3, 1.0), "Weights must sum to 1.0"  # 必要に応じてアサーションを追加

blended_pred = w1 * pred1 + w2 * pred2 + w3 * pred3


# 平均二乗誤差の算出
mse = mean_squared_error(y_valid, blended_pred)
print(f"Blended MSE: {mse}")

Blended MSE: 2736308546.3999968


In [14]:
# prompt: ブレンディングにあたり、ハイパーパラメータを変えたモデルの組み合わせを含める

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# データの読み込み
df = pd.read_csv('/train.csv')

# 目的変数と説明変数の選択
target_variable = 'SalePrice'
explanatory_variables = ['GrLivArea', 'YearBuilt']

# 欠損値を含む行を削除
df = df.dropna(subset=[target_variable] + explanatory_variables)

# データの分割
X = df[explanatory_variables]
y = df[target_variable]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# モデルのリストとハイパーパラメータ
models = [
    (LinearRegression(), {}),  # 線形回帰
    (SVR(), {'kernel': 'rbf', 'C': 100}), # SVM (ハイパーパラメータ変更)
    (DecisionTreeRegressor(), {'max_depth': 5, 'random_state':0}), # 決定木 (ハイパーパラメータ変更)
    (MLPRegressor(random_state=0, max_iter=1000), {}) # ニューラルネットワーク
]

predictions = []
for model, params in models:
    model.set_params(**params)  # ハイパーパラメータを設定
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    predictions.append(y_pred)
    mse = mean_squared_error(y_valid, y_pred)
    print(f"{model.__class__.__name__} with params {params} MSE: {mse}")

# ブレンディング (重み付き平均)
weights = [0.25, 0.25, 0.25, 0.25] # 各モデルの重み（合計は1になるように設定）
blended_pred = np.average(predictions, axis=0, weights=weights)

mse_blended = mean_squared_error(y_valid, blended_pred)
print(f"Blended MSE: {mse_blended}")

LinearRegression with params {} MSE: 2942066921.6721087
SVR with params {'kernel': 'rbf', 'C': 100} MSE: 6089799668.498366
DecisionTreeRegressor with params {'max_depth': 5, 'random_state': 0} MSE: 2169961248.6656322
MLPRegressor with params {} MSE: 3731243284.413869
Blended MSE: 2865055788.513104


In [15]:
# prompt: ブレンディングに際して、入力データの前処理をPCAにしたモデルを含める

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# データの読み込み
df = pd.read_csv('/train.csv')

# 目的変数と説明変数の選択
target_variable = 'SalePrice'
explanatory_variables = ['GrLivArea', 'YearBuilt']

# 欠損値を含む行を削除
df = df.dropna(subset=[target_variable] + explanatory_variables)

# データの分割
X = df[explanatory_variables]
y = df[target_variable]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# 標準化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# PCA
pca = PCA(n_components=1) # 主成分を1つに削減
X_train_pca = pca.fit_transform(X_train_scaled)
X_valid_pca = pca.transform(X_valid_scaled)

# モデルのリストとハイパーパラメータ
models = [
    (LinearRegression(), {}),
    (SVR(), {'kernel': 'rbf', 'C': 100}),
    (DecisionTreeRegressor(), {'max_depth': 5, 'random_state':0}),
    (MLPRegressor(random_state=0, max_iter=1000), {})
]

predictions = []
for model, params in models:
    model.set_params(**params)
    model.fit(X_train_pca, y_train) # PCA適用後のデータで学習
    y_pred = model.predict(X_valid_pca) # PCA適用後のデータで予測
    predictions.append(y_pred)
    mse = mean_squared_error(y_valid, y_pred)
    print(f"{model.__class__.__name__} with params {params} MSE: {mse}")

# ブレンディング (重み付き平均)
weights = [0.25, 0.25, 0.25, 0.25]
blended_pred = np.average(predictions, axis=0, weights=weights)

mse_blended = mean_squared_error(y_valid, blended_pred)
print(f"Blended MSE: {mse_blended}")

LinearRegression with params {} MSE: 2913453795.88941
SVR with params {'kernel': 'rbf', 'C': 100} MSE: 5662432108.443092
DecisionTreeRegressor with params {'max_depth': 5, 'random_state': 0} MSE: 1857425427.152014
MLPRegressor with params {} MSE: 37444521557.03058
Blended MSE: 5395012967.675472




どのブレンディングでも、単独で最もMSEが短い線形回帰よりも精度がよくなった。

ブレンディングするモデルの数によるものとは限らないが、標準化やPCAなどの前処理を組み込んだモデルが含まれるブレンディングは、特に精度が高いことが分かった。

##【問題2】バギングのスクラッチ実装

In [16]:
# バギングをスクラッチ実装し、精度を上げる

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# データの読み込みと前処理
df = pd.read_csv('/train.csv')
target_variable = 'SalePrice'
explanatory_variables = ['GrLivArea', 'YearBuilt']
df = df.dropna(subset=[target_variable] + explanatory_variables)
X = df[explanatory_variables]
y = df[target_variable]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# バギングの実装
n_estimators = 10  # 決定木の数
predictions = []

for i in range(n_estimators):
    # 「ブートストラップサンプリング」
    bootstrap_indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_bootstrap = X_train.iloc[bootstrap_indices]
    y_bootstrap = y_train.iloc[bootstrap_indices]

    # 決定木の学習
    model = DecisionTreeRegressor(random_state=i) # random_stateをループごとに変更
    model.fit(X_bootstrap, y_bootstrap)

    # 予測値の取得
    y_pred = model.predict(X_valid)
    predictions.append(y_pred)

# バギングによる予測値の平均化
bagging_pred = np.mean(predictions, axis=0)

# 平均二乗誤差の計算
mse_bagging = mean_squared_error(y_valid, bagging_pred)
print(f"Bagging MSE: {mse_bagging}")

Bagging MSE: 1910533246.6218002


##【問題3】スタッキングのスクラッチ実装

In [17]:
# スタッキングをスクラッチ実装する

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# データの読み込み
df = pd.read_csv('/train.csv')

# 目的変数と説明変数の選択
target_variable = 'SalePrice'
explanatory_variables = ['GrLivArea', 'YearBuilt']

# 欠損値を含む行を削除
df = df.dropna(subset=[target_variable] + explanatory_variables)

# データの分割
X = df[explanatory_variables]
y = df[target_variable]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

#レベル１モデル
# モデルのリストとハイパーパラメータ
models = [
    LinearRegression(),
    SVR(kernel='rbf', C=100),
    DecisionTreeRegressor(max_depth=5, random_state=0),
    MLPRegressor(random_state=0, max_iter=1000)
]

# 各モデルの予測値を格納するリスト
level1_predictions = []
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    level1_predictions.append(y_pred)


# レベル1の予測値を結合して新しい特徴量を作成
level1_predictions = np.array(level1_predictions).T
#レベル２モデル
# スタッキングモデルの学習
level2_model = LinearRegression()
level2_model.fit(level1_predictions, y_valid)

# レベル1モデルの検証データに対する予測値を取得
level1_valid_predictions = []
for model in models:
    level1_valid_pred = model.predict(X_valid)
    level1_valid_predictions.append(level1_valid_pred)
level1_valid_predictions = np.array(level1_valid_predictions).T

# スタッキングモデルによる予測
stacked_predictions = level2_model.predict(level1_valid_predictions)

# MSEの算出
mse_stacked = mean_squared_error(y_valid, stacked_predictions)
print(f"Stacked MSE: {mse_stacked}")

Stacked MSE: 2147383210.1924074
