# Ensemble learning
We will implement scratch implementation of three types of ensemble learning. Then check each effect on a smaller dataset.

* Blending
* Bagging
* Stacking

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets

In [2]:
data=pd.read_csv('train.csv').select_dtypes(include='number')
data.isnull().sum()
data = data.fillna(data.mean())
data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [3]:
X = data.drop(['SalePrice'],axis=1).values
y = data['SalePrice'].values
X = np.log1p(X)
y = np.log1p(y)
print('X-{}, y-{}'.format(X.shape,y.shape))

X-(1460, 37), y-(1460,)


# Problem 1] Blending scratch mounting
Show at least three​ ​examples of scratch implementation of blending that are more accurate than a single model. Higher accuracy means less mean squared error (MSE) on the validation data.

# What is blending?
Blending is a method of independently training N diverse models, weighting the estimation results, and then adding them together. The simplest is to take the average. Various models are created by changing the following conditions.

Techniques (eg linear regression, SVM, decision tree, neural network, etc.)
Hyperparameters (eg SVM kernel type, initial weights, etc.)
How to preprocess input data (eg standardization, logarithmic transformation, PCA, etc.)
The important thing is that each model is very different.

Blending in regression problems is so simple that it is not provided in scikit-learn.

《 Supplemental information
》

In the case of a classification problem, a majority vote will be taken. Because it is more complicated than regression problems, scikit-learn provides a Voting Classifier.

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)
print('X_train shape:{}, y_test shape:{}'.format(X_train.shape,y_test.shape))

X_train shape:(1168, 37), y_test shape:(292,)


In [5]:
models = [LinearRegression(),SVR(),DecisionTreeRegressor()]
predictions = list()
for model in models:
    model.fit(X_train,y_train)
    predictions.append(model.predict(X_test))
    
predictions_ndarray = np.array(predictions)
blend = np.mean(predictions_ndarray,axis=0)

print('MSE')
print(mean_squared_error(y_test,blend))

MSE
0.021989903025349554


In [6]:
svr_model1 = SVR(C=1)
svr_model2 = SVR(C=5)
svr_model3 = SVR(C=10)
svr_model1.fit(X_train,y_train)
svr_model2.fit(X_train,y_train)
svr_model3.fit(X_train,y_train)
svr_pred1 = svr_model1.predict(X_test)
svr_pred2 = svr_model2.predict(X_test)
svr_pred3 = svr_model2.predict(X_test)  
svr_blend = np.mean([svr_pred1,svr_pred2,svr_pred3],axis=0)
print('MSE')
print('Blend:{:.3f}'.format(mean_squared_error(y_test,svr_blend)))

MSE
Blend:0.023


In [7]:
std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train_trans = std_scaler.transform(X_train)
X_test_trans = std_scaler.transform(X_test)

models2 = [LinearRegression(),SVR(),DecisionTreeRegressor()]
predictions2 = list()
for model in models2:
    model.fit(X_train_trans,y_train)
    predictions2.append(model.predict(X_test_trans))
    
predictions_ndarray2 = np.array(predictions)
blend2 = np.mean(predictions_ndarray2,axis=0)
print('MSE')
print((mean_squared_error(y_test,blend2)))

MSE
0.021989903025349554


# 4.Bagging
# [Problem 2] Scratch mounting of bagging
Please show at least one​ ​example where you scratch-implement the bagging and it is more accurate than a single model.

# What is bagging?
Bagging is a way to diversify how to select input data. N types of subsets (bootstrap samples) are created by randomly extracting from the training data after allowing duplication. N models are trained by them and the estimation results are averaged. Unlike blending, each weight does not change.

sklearn.model_selection.train_test_split — scikit-learn 0.21.3 documentation

The part that averages the estimation results is implemented in the same way as blending.



In [8]:
X_train_bag, X_test_bag, y_train_bag, y_test_bag = train_test_split(X,y,test_size=0.2,shuffle=True)
print('X_train shape:{}, y_test shape:{}'.format(X_train_bag.shape,y_test_bag.shape))

X_train shape:(1168, 37), y_test shape:(292,)


In [9]:
models = [LinearRegression(),SVR(),DecisionTreeRegressor()]
class BaggingScratch():
    def __init__(self, models):
        self.models = models
        self.predictions = list()
        
    def fit(self,X,y):
        for model in models:
            model.fit(X,y)
    def predict(self,X):
        predictions = list()
        for model in self.models:
            prediction = model.predict(X)
            predictions.append(prediction)
        self.predictions = np.mean(np.array(predictions),axis=0)
        return self.predictions
    def mse(self, y):
        mse = (mean_squared_error(y,self.predictions))
        return mse
    
bag = BaggingScratch(models)
bag.fit(X_train,y_train)
print("average of bagging pred:{}".format(bag.predict(X_test)))
print("average of bagging mse:{:.3f}".format(bag.mse(y_test)))

average of bagging pred:[12.25781018 11.99545486 11.56490218 11.01704575 11.89735932 12.60472669
 12.59121682 11.86406303 12.27052756 12.36113397 12.0939682  11.08387363
 12.18649386 12.82128078 12.32959948 11.60539099 11.61961935 11.72522247
 12.32655348 11.74982214 11.65550567 11.71891534 12.44081091 12.65000953
 11.4749905  12.20523297 11.72834793 12.16725904 12.86494783 11.86085426
 11.73072584 11.69092576 11.65772745 11.53485066 11.90581371 12.73189308
 11.77319291 11.32333622 12.57546139 11.67098164 11.89106665 11.92868525
 11.55077765 11.74554201 12.13173827 12.09052153 11.75582306 12.00658681
 12.41677083 12.387204   11.61414461 12.74536037 11.50486925 12.3525508
 12.2601907  11.55525153 11.66513099 12.03161852 11.74564378 12.13721745
 12.05648355 12.53183387 11.48497146 11.57012425 12.04109569 11.78683765
 11.74946287 12.36580131 12.07654313 11.93577139 12.01896958 11.49884952
 12.65849332 11.94814788 12.01314801 12.26227902 12.07852157 11.86209502
 12.92208333 12.231429   12.

# Stacking
# [Problem 3] Stacking scratch mounting
Please show at least one​ ​example where stacking is scratch-implemented and more accurate than a single model.

What is stacking?
The stacking procedure is as follows. Stacking is possible if there is at least stage 0 and stage 1, so implement it. First of all, we will start to the extent ​K​ ​0​ ​=​ ​3​,​ ​M​ ​0​ ​=​ ​2K0=3,M0=2.

《When learning》

(stage 00 ）

Training data K​ ​0K0Divide into pieces.
With(K​ ​0​ ​−​ ​1)(K0−1)Training data collectively, the rest 11for estimation, we can make K​ ​0K0 estimation.
We haveK​ ​0K0Prepare individual pieces and learn using different training data.
For each trained model, the remaining 11 unused estimation data is input to obtain an estimate. (This is called blended data.)
We also prepare K​ ​0K0 instances of different models and do the same thing. If there are M​ ​0M0 models, M​ ​0M0 blended data will be obtained.
(stage nn ）

The blended data of stage n​ ​−​ ​1n−1 is considered as the training data with M​ ​n​ ​−​ ​1Mn−1Think of it as training data with dimensional features, K​ ​nKn pieces. The same applies below.
(stage NN) *Last stage

stage N​ ​−​ ​1N−1of M​ ​N​ ​−​ ​1MN−1Blend data M​ ​N​ ​−​ ​1MN−1One type of model is trained as an input of dimensional features. This is the model for the final estimation.
《Estimated time》

(stage 00 ）

Test data K​ ​0​ ​x​ ​M​ ​0K0×M0Fill in the trained models and K​ ​0​ ​x​ ​M​ ​0K0×M0Get an estimate. this K​ ​0K0Calculate the average value on the axis of M​ ​0M0Obtain data with dimensional features. (Called a blend test)
(stage nn ）

The blended data of stage n​ ​−​ ​1n−1The blend test obtained in K​ ​n​ ​×​ ​M​ ​nKn×MnFill in the trained models and K​ ​n​ ​×​ ​M​ ​nKn×MnGet an estimate. this K​ ​nKnCalculate the average value on the axis of M​ ​0M0Obtain data with dimensional features. (Called a blend test)
(stage NN) *Last stage

stage N​ ​−​ ​1N−1Input the blend test obtained in step 2 into the trained model to obtain an estimate.

In [13]:
def get_dataset():
    X, y = datasets.make_classification(n_samples=10000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
    return X, y

X, y = get_dataset()
X_train_full, X_test_1, y_train_full, y_test_1 = train_test_split(X,y,test_size=0.5,random_state=1)
X_train_1, X_val, y_train_1, y_val = train_test_split(X_train_full,y_train_full,test_size=0.2,random_state=1)


In [14]:

def get_models():
    models = list()
    models.append(('lr',LinearRegression()))
    models.append(('knn', KNeighborsClassifier()))
    models.append(('cart', DecisionTreeRegressor()))
    models.append(('bayes', GaussianNB()))
    return models

def fit_ensemble(models, X_train_1, X_val, y_train_1, y_val):
    meta_X = list()
    for name,model in models:
        model.fit(X_train_1, y_train_1)
        y_pred = model.predict(X_val)
        y_pred = y_pred.reshape(len(y_pred), 1)
        meta_X.append(y_pred)
        
    meta_X = np.hstack(meta_X)
    blender = LinearRegression()
    blender.fit(meta_X, y_val)
    return blender
def pred_ensemble(models, blender, X_test_1):
    meta_X = list()
    for name,model in models:
        y_pred = model.predict(X_test_1)
        y_pred = y_pred.reshape(len(y_pred), 1)
        meta_X.append(y_pred)
        
    meta_X = np.hstack(meta_X)
    return blender.predict(meta_X)

models = get_models()
blender = fit_ensemble(models, X_train_1, X_val, y_train_1, y_val)
y_pred = pred_ensemble(models, blender, X_test_1)

In [16]:

print("Accuracy score")
print("xxxxxxxxxxxxxxxxxxxx")
print("MSE",(mean_squared_error(y_test_1,y_pred)))

model = LinearRegression()
model.fit(X_train_1, y_train_1)
y_pred1= model.predict(X_test_1)
print("Logistic regression:",(mean_squared_error(y_test_1,y_pred1)))

Accuracy score
xxxxxxxxxxxxxxxxxxxx
MSE 0.023238055526779017
Logistic regression: 0.10995299517654025
