# House Price Predictions - Modelling

In this notebook we will explore and make predictions on the housing dataset from Kaggle:\
https://www.kaggle.com/c/house-prices-advanced-regression-techniques/


* Setup
    * Import libraries
    * Read the data
* Linear Models
* Ensemble Methods

# Setup
#### Import libraries

In [35]:
# main
import pandas as pd
import numpy as np
import time

# warnings
import warnings
def ignore_warn(*args, **kwargs): pass
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.warn = ignore_warn

# config
pd.options.display.max_columns = None

# modelling
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso, Lars, LassoLars, OrthogonalMatchingPursuit, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

##### Read the Data

In [24]:
# data frames
df_train = pd.read_csv('files/train_processed.csv')
df_test = pd.read_csv('files/test_processed.csv')
df_all = pd.concat([df_train, df_test])

# keep track
ntrain = df_train.shape[0]
ntest = df_test.shape[0]

# print shape
print('rows/cols:', df_train.shape[0], df_train.shape[1], 'train')
print('rows/cols:', df_test.shape[0], df_test.shape[1], 'test')
print('rows/cols:', df_all.shape[0], df_all.shape[1], 'train+test')

rows/cols: 1458 221 train
rows/cols: 1459 220 test
rows/cols: 2917 221 train+test


In [25]:
# data frames
y_train = df_train['SalePrice'].copy()
x_train = df_train.drop(columns='SalePrice')
x_test = df_test

#### Validation Function

In [26]:
n_folds = 10

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(x_train.values)
    rmse= np.sqrt(-cross_val_score(model, x_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

# Base Models

* https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
* https://dataaspirant.com/ensemble-methods-bagging-vs-boosting-difference/
* https://www.math.unipd.it/~aiolli/corsi/1213/aa/user_guide-0.12-git.pdf

#### Define Pipelines

In [37]:
pipelines = {
    'OLS' : make_pipeline(RobustScaler(), LinearRegression()),
    'Lasso' : make_pipeline(RobustScaler(), Lasso(alpha =0.0004, random_state=1)),
    'Ridge' : make_pipeline(RobustScaler(), Ridge()),
    'ENet' : make_pipeline(RobustScaler(), ElasticNet(alpha=0.0004, l1_ratio=.9, random_state=3)),
    'OMP' : make_pipeline(RobustScaler(), OrthogonalMatchingPursuit()),
    'BRidge' : BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
                fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
                normalize=False, tol=0.001, verbose=False)
}

for p in pipelines:
    start_time = time.time()
    score = rmsle_cv(pipelines[p])
    print(p, "\t score: {:.4f} ({:.4f}), -- {:.2f} seconds".format(score.mean(), score.std(), (time.time() - start_time)))

OLS 	 score: 0.1181 (0.0163), -- 0.83 seconds
Lasso 	 score: 0.1087 (0.0145), -- 0.88 seconds
Ridge 	 score: 0.1130 (0.0156), -- 0.47 seconds
ENet 	 score: 0.1087 (0.0145), -- 0.99 seconds
OMP 	 score: 0.1209 (0.0149), -- 0.49 seconds
BRidge 	 score: 0.1103 (0.0140), -- 0.67 seconds


In [32]:
# SalePrice
df = np.expm1(y_train).to_frame()

# Prediction Models
for p in pipelines:
    mfit = pipelines[p].fit(x_train.values, y_train)
    pred = mfit.predict(x_train.values)
    df[p] = np.expm1(pred)
    
# view a few rows
df.head(10)

Unnamed: 0,SalePrice,OLS,Lasso,Ridge,ENet,OMP,BRidge
0,208500.0,207931.274627,207986.656335,207531.822215,207981.629749,206803.490856,207060.013826
1,181500.0,190253.898623,186242.355286,189615.37376,186110.228709,185178.19974,189273.07041
2,223500.0,216601.707826,218113.197724,216498.297819,217764.105511,218006.511226,217406.431589
3,140000.0,160230.581472,167299.306272,159441.346321,166817.350196,171675.372005,160372.947173
4,250000.0,295189.979321,296177.763687,294668.485665,296023.611429,283148.943478,292535.727878
5,143000.0,143989.742715,159230.198782,148069.649912,159213.384335,166648.841369,157013.273196
6,307000.0,274448.704261,275627.551304,275827.702055,275587.873291,256190.345223,283060.891416
7,200000.0,236704.595485,224369.604928,233243.950952,224363.996905,222382.653537,229647.24736
8,129900.0,123877.894089,126375.354986,123561.642019,125962.499243,140626.13374,125978.352191
9,118000.0,117346.409247,119249.840148,118264.783452,118972.862089,131509.008173,120922.939123


# Gradient Boosting Models

In [33]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1,
                             verbosity=0)

model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,
                             verbosity=-1)

In [38]:
pipelines = {
    'GBoost ' : make_pipeline(RobustScaler(), GBoost),
    'XGBoost' : make_pipeline(RobustScaler(), model_xgb),
    'LGBoost' : make_pipeline(RobustScaler(), model_lgb)
}

for p in pipelines:
    start_time = time.time()
    score = rmsle_cv(pipelines[p])
    print(p, "\t score: {:.4f} ({:.4f}), -- {:.2f} seconds".format(score.mean(), score.std(), (time.time() - start_time)))

GBoost  	 score: 0.1141 (0.0153), -- 85.49 seconds
XGBoost 	 score: 0.1151 (0.0156), -- 25.12 seconds
LGBoost 	 score: 0.1146 (0.0164), -- 3.34 seconds


In [39]:
# SalePrice
df = np.expm1(y_train).to_frame()

# Prediction Models
for p in pipelines:
    mfit = pipelines[p].fit(x_train.values, y_train)
    pred = mfit.predict(x_train.values)
    df[p] = np.expm1(pred)
    
# view a few rows
df.head(10)



Unnamed: 0,SalePrice,GBoost,XGBoost,LGBoost
0,208500.0,208496.370201,205389.03125,207674.487338
1,181500.0,178619.375724,180042.203125,172916.755218
2,223500.0,220643.333651,213329.40625,214596.790189
3,140000.0,142099.98294,159250.046875,153407.625013
4,250000.0,260561.882705,285867.9375,293313.801666
5,143000.0,142840.947172,155408.125,147327.313956
6,307000.0,301927.698682,287712.15625,278851.107577
7,200000.0,201170.379811,214505.71875,211790.041042
8,129900.0,129761.01412,130928.640625,133644.397789
9,118000.0,117523.83849,122115.453125,123959.274717


# Model Stacking

#### Stacking Models - Averaged

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)   

In [None]:
averaged_models = AveragingModels(models = (ENet, GBoost, KRR, lasso))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

#### Stacking Models - Meta model

In [None]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [None]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, KRR),
                                                 meta_model = lasso)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

#### Stacked Regressor

In [None]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [None]:
stacked_averaged_models.fit(x_train.values, y_train)
stacked_train_pred = stacked_averaged_models.predict(x_train.values)
#stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
print(rmsle(y_train, stacked_train_pred))

In [None]:
model_xgb.fit(x_train, y_train)
xgb_train_pred = model_xgb.predict(x_train)
#xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(y_train, xgb_train_pred))

In [None]:
model_lgb.fit(x_train, y_train)
lgb_train_pred = model_lgb.predict(x_train)
#lgb_pred = np.expm1(model_lgb.predict(test.values))
print(rmsle(y_train, lgb_train_pred))

In [None]:
'''RMSE on the entire Train data when averaging'''

print('RMSLE score on train data:')
print(rmsle(y_train,stacked_train_pred*0.70 +
               xgb_train_pred*0.15 + lgb_train_pred*0.15 ))

#### Ensemble Prediction

In [None]:
ensemble = stacked_pred*0.70 + xgb_pred*0.15 + lgb_pred*0.15