

---

- **Blending**
- **Bagging**
- **Stacking**

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets

In [2]:
data=pd.read_csv('train.csv').select_dtypes(include='number')

# handling of missing values
data.isnull().sum()
data = data.fillna(data.mean())
data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [3]:
# splitting the data
X = data.drop(['SalePrice'],axis=1).values
y = data['SalePrice'].values

X = np.log1p(X)
y = np.log1p(y)

print('X shape:{}, y shape:{}'.format(X.shape,y.shape))

X shape:(1460, 37), y shape:(1460,)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)
print('X_train shape:{}, y_test shape:{}'.format(X_train.shape,y_test.shape))

X_train shape:(1168, 37), y_test shape:(292,)


In [5]:
models = [LinearRegression(),SVR(),DecisionTreeRegressor()]
predictions = list()
for model in models:
    model.fit(X_train,y_train)
    predictions.append(model.predict(X_test))
    
predictions_ndarray = np.array(predictions)
blend = np.mean(predictions_ndarray,axis=0)

print('MSE')
print('-------')
print('Blend:{:.3f}'.format(mean_squared_error(y_test,blend)))

MSE
-------
Blend:0.022


In [6]:
svr_model1 = SVR(C=1)
svr_model2 = SVR(C=5)
svr_model3 = SVR(C=10)
svr_model1.fit(X_train,y_train)
svr_model2.fit(X_train,y_train)
svr_model3.fit(X_train,y_train)
svr_pred1 = svr_model1.predict(X_test)
svr_pred2 = svr_model2.predict(X_test)
svr_pred3 = svr_model2.predict(X_test)
    
svr_blend = np.mean([svr_pred1,svr_pred2,svr_pred3],axis=0)

print('MSE')
print('-------')
print('Blend:{:.3f}'.format(mean_squared_error(y_test,svr_blend)))

MSE
-------
Blend:0.023


In [7]:

std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train_trans = std_scaler.transform(X_train)
X_test_trans = std_scaler.transform(X_test)

models2 = [LinearRegression(),SVR(),DecisionTreeRegressor()]
predictions2 = list()
for model in models2:
    model.fit(X_train_trans,y_train)
    predictions2.append(model.predict(X_test_trans))
    
predictions_ndarray2 = np.array(predictions)
blend2 = np.mean(predictions_ndarray2,axis=0)

print('MSE')
print('-------')
print('Blend:{:.3f}'.format(mean_squared_error(y_test,blend2)))

MSE
-------
Blend:0.022


In [8]:
svr_model1 = SVR(C=1)
svr_model2 = SVR(C=5)
svr_model3 = SVR(C=10)
svr_model1.fit(X_train,y_train)
svr_model2.fit(X_train,y_train)
svr_model3.fit(X_train,y_train)
svr_pred1 = svr_model1.predict(X_test)
svr_pred2 = svr_model2.predict(X_test)
svr_pred3 = svr_model2.predict(X_test)
    
svr_blend = np.mean([svr_pred1,svr_pred2,svr_pred3],axis=0)

print('MSE')
print('-------')
print('Blend:{:.3f}'.format(mean_squared_error(y_test,svr_blend)))

MSE
-------
Blend:0.023


In [9]:
# Example 3
std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train_trans = std_scaler.transform(X_train)
X_test_trans = std_scaler.transform(X_test)

models2 = [LinearRegression(),SVR(),DecisionTreeRegressor()]
predictions2 = list()
for model in models2:
    model.fit(X_train_trans,y_train)
    predictions2.append(model.predict(X_test_trans))
    
predictions_ndarray2 = np.array(predictions)
blend2 = np.mean(predictions_ndarray2,axis=0)

print('MSE')
print('-------')
print('Blend:{:.3f}'.format(mean_squared_error(y_test,blend2)))

MSE
-------
Blend:0.022


In [10]:
X_train_bag, X_test_bag, y_train_bag, y_test_bag = train_test_split(X,y,test_size=0.2,shuffle=True)
print('X_train shape:{}, y_test shape:{}'.format(X_train_bag.shape,y_test_bag.shape))

X_train shape:(1168, 37), y_test shape:(292,)


In [11]:
models = [LinearRegression(),SVR(),DecisionTreeRegressor()]
class BaggingScratch():
    def __init__(self, models):
        self.models = models
        self.predictions = list()
        
    def fit(self,X,y):
        for model in models:
            model.fit(X,y)
    def predict(self,X):
        predictions = list()
        for model in self.models:
            prediction = model.predict(X)
            predictions.append(prediction)
        self.predictions = np.mean(np.array(predictions),axis=0)
        return self.predictions
    def mse(self, y):
        mse = (mean_squared_error(y,self.predictions))
        return mse
    

bag = BaggingScratch(models)
bag.fit(X_train,y_train)
print("average of bagging pred:{}".format(bag.predict(X_test)))
print("average of bagging mse:{:.3f}".format(bag.mse(y_test)))

average of bagging pred:[12.24834723 12.00773981 11.56490218 11.01704575 11.94186951 12.60472669
 12.59121682 11.86406303 12.27052756 12.36113397 12.0939682  11.08387363
 12.18649386 12.82128078 12.35823762 11.62444362 11.61961935 11.72522247
 12.32107768 11.77422308 11.66586087 11.80025371 12.45918849 12.65908722
 11.4749905  12.20612542 11.72834793 12.15390619 12.86494783 11.86085426
 11.73072584 11.70222618 11.65493805 11.53485066 11.88092672 12.73284026
 11.76620459 11.32333622 12.57546139 11.64456544 11.97189583 11.92435621
 11.55077765 11.74041374 12.13173827 12.08853148 11.73163762 12.03074557
 12.41677083 12.387204   11.51358932 12.64822509 11.51629882 12.34734819
 12.25089831 11.51290754 11.68953193 12.03161852 11.74564378 12.13535004
 12.05905897 12.53183387 11.48061821 11.57012425 12.03181347 11.79338659
 11.74946287 12.3373183  12.08495867 11.9408605  12.16256299 11.49884952
 12.65849332 11.91379867 12.00256517 12.2646974  12.04960477 11.86209502
 12.89247505 12.231429   12

In [15]:
def get_dataset():
    X, y = datasets.make_classification(n_samples=10000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
    return X, y

X, y = get_dataset()
# Splitting into train and tests(used for base models)
X_train_full, X_test_1, y_train_full, y_test_1 = train_test_split(X,y,test_size=0.5,random_state=1)

# Splitting into train and validations(used for ensemble model)
X_train_1, X_val, y_train_1, y_val = train_test_split(X_train_full,y_train_full,test_size=0.2,random_state=1)


In [16]:
  # a function to return the models in a form of a tuple
def get_models():
    models = list()
    models.append(('lr',LinearRegression()))
    models.append(('knn', KNeighborsClassifier()))
    models.append(('cart', DecisionTreeRegressor()))
    models.append(('bayes', GaussianNB()))
    return models

# a function to fit and blend all of our models
def fit_ensemble(models, X_train_1, X_val, y_train_1, y_val):
    # fit and predict using the validation data
    
    # a list to hold the predicted data from the base model for the blender model
    meta_X = list()
    
    # loop through our models
    for name,model in models:
        model.fit(X_train_1, y_train_1)
        y_pred = model.predict(X_val)
        
        # reshaping the predicted results into a matrix with one column
        y_pred = y_pred.reshape(len(y_pred), 1)
        meta_X.append(y_pred)
        
    meta_X = np.hstack(meta_X)
    
    # defining our blender
    blender = LinearRegression()
    
    # fitting our blender using our meta values and y validation set
    blender.fit(meta_X, y_val)
    return blender

# a function to make predictions with our ensemble
def pred_ensemble(models, blender, X_test_1):
    # a list to hold te predictions for the blender
    meta_X = list()
    
    # loop through our models
    for name,model in models:
        
        # predicting using our base models
        y_pred = model.predict(X_test_1)
        
        # reshaping the predicted results into a matrix with one column
        y_pred = y_pred.reshape(len(y_pred), 1)
        meta_X.append(y_pred)
        
    meta_X = np.hstack(meta_X)
    
    # predicting using our blender
    return blender.predict(meta_X)

models = get_models()
blender = fit_ensemble(models, X_train_1, X_val, y_train_1, y_val)
y_pred = pred_ensemble(models, blender, X_test_1)

In [17]:
# printing mse
print("Values used")
print("Train:{} Val:{} Test:{}".format(X_train_1.shape, X_val.shape, X_test_1.shape))
print("Accuracy score")
print("------------------")
print("Blended ensemble:{:.3f}".format(mean_squared_error(y_test_1,y_pred)))

# on individual model
model = LinearRegression()
model.fit(X_train_1, y_train_1)
y_pred1= model.predict(X_test_1)
print("Logistic regression:{:.3f}".format(mean_squared_error(y_test_1,y_pred1)))

Values used
Train:(4000, 20) Val:(1000, 20) Test:(5000, 20)
Accuracy score
------------------
Blended ensemble:0.023
Logistic regression:0.110
