# Ensembling of Various Models: Voting and Stacking

#### Import Libraries

In [10]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')

#options for display
%matplotlib inline
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 120)

Ensemble Packages

In [11]:
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

##### Train and test data :)

In [12]:
train_x = pd.read_csv('../Data/train_x2.csv')
train_y = pd.read_csv('../Data/train_y2.csv',header=None)
train_y = train_y.values.ravel()
test_x = pd.read_csv('../Data/test_x2.csv')

###### Scoring Metric

In [13]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train_x.values)
    rmse= np.sqrt(-cross_val_score(model, train_x.values, train_y, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

Libraries for specific Ensembled Models

In [14]:
#Linear Models
from sklearn.linear_model import Lasso, ElasticNet


#Kernel Ridge Regression
from sklearn.kernel_ridge import KernelRidge


#Gradient Boosting Machines
from sklearn.ensemble import GradientBoostingRegressor

#Random Forest
from sklearn.ensemble import RandomForestRegressor

#Support Vector Machines
from sklearn import svm

#Others


### Defining Models

Lasso (Elastic went to Lasso)

In [15]:
lasso = make_pipeline(Lasso(alpha =0.0003, random_state=1))
elastic = ElasticNet(alpha= 0.0049, fit_intercept = True, l1_ratio= 0.61)

In [16]:
rmsle_cv(lasso).mean()

0.12039872717670055

Kernel Ridge Regression

In [17]:
KRR_deg3 = KernelRidge(alpha=10, kernel='polynomial', degree=3, coef0=4)
KRR_deg2 =KernelRidge(alpha=1, kernel='polynomial', degree=2, coef0=2.5)

In [18]:
print("KRR deg 3: ", rmsle_cv(KRR_deg3).mean())
print("KRR deg 2: ", rmsle_cv(KRR_deg2).mean())

KRR deg 3:  0.116524207789
KRR deg 2:  0.116519269316


Gradient Boost

In [19]:
gbr = GradientBoostingRegressor(max_depth = 2, max_features = 12, min_samples_split = 10, subsample = 0.7,
     random_state=42, learning_rate = 0.01, n_estimators = 4000, verbose = 0)

In [20]:
print("GBR: ", rmsle_cv(gbr).mean())

GBR:  0.114690038171


Random Forest

In [21]:
randomforest1 = RandomForestRegressor(n_estimators=800, max_features=13, random_state=43, oob_score=True,max_depth=7)
randomforest2 = RandomForestRegressor(n_estimators=600, max_features=8, random_state=43, oob_score=True,max_depth=3)

In [22]:
print("RF deep: ", rmsle_cv(randomforest1).mean())
print("RF shallow: ", rmsle_cv(randomforest2).mean())

RF deep:  0.149590737861
RF shallow:  0.220055339749


SVM

In [22]:
svm1 = svm.SVR(C=9, epsilon=.009, degree = 1, kernel='poly')
svm2 = svm.SVR(C =11, epsilon = 0.03, gamma =0.0002, kernel = 'rbf')

In [36]:
print("SVM 1: ", rmsle_cv(svm1).mean())
print("SVM 2: ", rmsle_cv(svm2).mean())

SVM 1:  0.120619069159
SVM 2:  0.118384662385


##### Ok, let's average models then try a more complex ensemble

### Average Model

Building a class that takes models, fits them, then averages them

In [25]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)   

Try out a few linear models. General Linear Lasso, Kernel Ridge Regression, and Gradient Boost Regression

In [37]:
averaged_models = AveragingModels(models = (lasso, elastic, KRR_deg3, KRR_deg2, gbr, svm1, svm2))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

 Averaged base models score: 0.1126 (0.0038)



In [44]:
averaged_models.fit(train_x, train_y)

AveragingModels(models=(Pipeline(steps=[('lasso', Lasso(alpha=0.0003, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=1,
   selection='cyclic', tol=0.0001, warm_start=False))]), ElasticNet(alpha=0.0049, copy_X=True, fit_intercept=True, l1_ratio=0.61... epsilon=0.03, gamma=0.0002,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)))

In [45]:
predictions = averaged_models.predict(test_x)

In [46]:
prediction = pd.DataFrame({'Id' : (np.arange(len(test_x))+1461),
            'SalePrice': np.exp(predictions)})


In [47]:
prediction.to_csv(path_or_buf="../predictions_avg_data2.csv",index=None)

#### Model:  CV, Public LB Score, Data
average with all models: .126? (LB), new data
average without rf2: 0.114 (CV), 0.1234 (LB), new data
average without both rf's: 0.113(CV), 0.1245 (LB), new data


###### Oh it improves it!!!!!

### Stacking
Well let's move on to the fancy ass ensemble...
Get each model's sale price prediction (on the out of bag fold), and use that as input to another model.

Stacking Class:

In [33]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred.flatten()
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

#### Stack them up, and use a meta model to decide based on the new stack

Using Gboost as meta-model for now in case there are non-linearities

In [34]:
rfmeta = RandomForestRegressor(n_estimators=800)
gbrmeta = GradientBoostingRegressor()

In [35]:
# First, we ran meta = Random Forests:
# stacked_averaged_models = StackingAveragedModels(base_models = (lasso, elastic, KRR_deg3, KRR_deg2, gbr, randomforest1, randomforest2, svm1, svm2),
#                                                  meta_model = rfmeta)
# Second, we ran meta = lasso
stacked_averaged_models = StackingAveragedModels(base_models = (lasso, elastic, KRR_deg3, KRR_deg2, gbr, randomforest1, svm1, svm2),
                                                 meta_model = gbrmeta)

score_stacked_model = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score_stacked_model.mean(), score_stacked_model.std()))


Stacking Averaged models score: 0.1108 (0.0058)


In [38]:
# for meta_model = lasso we got Stacking Average models score: 0.1103 (0.0048)
# for rfmeta = lasso we got Stacking Average models score: 0.1115 (0.0053)

In [39]:
# stack_rf = stacked_averaged_models
stack_mod = stacked_averaged_models

In [40]:
# stack_rf.fit(train_x.values, train_y)
stack_mod.fit(train_x.values, train_y)

StackingAveragedModels(base_models=(Pipeline(steps=[('lasso', Lasso(alpha=0.0003, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=1,
   selection='cyclic', tol=0.0001, warm_start=False))]), ElasticNet(alpha=0.0049, copy_X=True, fit_intercept=True, l1_ratio... epsilon=0.03, gamma=0.0002,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)),
            meta_model=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
            n_folds=5)

In [41]:
# pred_stack_rf = stack_rf.predict(test_x.values)
pred_stack = stack_mod.predict(test_x.values)

In [42]:
# pred_stack_rf
pred_stack

array([ 11.62302318,  12.00825159,  12.13360146, ...,  12.00825159,
        11.65702765,  12.36062561])

In [49]:
# pred_stack_rf = pd.DataFrame({'Id' : (np.arange(len(test_x))+1461),
#             'SalePrice': np.exp(pred_stack_rf)})
pred_stack = pd.DataFrame({'Id' : (np.arange(len(test_x))+1461),
            'SalePrice': np.exp(pred_stack)})


In [50]:
# pred_stack_rf.to_csv(path_or_buf="../Data/predictions_stackrf_data2.csv",index=None)
pred_stack.to_csv(path_or_buf="../Data/predictions_stacklasso_data2.csv",index=None)

#### Scores
stacked without rf2: CV 0.1108 (0.006sd), LB 0.1355, new data 

In [None]:
print("Done")