In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from math import sqrt
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV



In [2]:
unpickled_df = pd.read_pickle("./saleprice_1.pkl")
unpickled_df.head()

Unnamed: 0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,yrsold,salecondition,saleprice,years_before_sale,years_since_remodel,mszoning_FV,mszoning_RH,mszoning_RL,mszoning_RM,street_Pave
0,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,2008,Normal,208500,5,5,0,0,1,0,1
1,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,2007,Normal,181500,31,31,0,0,1,0,1
2,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,2008,Normal,223500,7,6,0,0,1,0,1
3,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,2006,Abnorml,140000,91,36,0,0,1,0,1
4,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,2008,Normal,250000,8,8,0,0,1,0,1


In [3]:
unpickled_df = pd.concat([unpickled_df,pd.get_dummies(unpickled_df.mszoning, prefix="mszoning", drop_first=True)], axis=1)
unpickled_df = pd.concat([unpickled_df,pd.get_dummies(unpickled_df.street, prefix="street", drop_first=True)], axis=1)
dummy_column_names = list(pd.get_dummies(unpickled_df.mszoning, prefix="mszoning", drop_first=True).columns)
dummy_column_names = dummy_column_names + list(pd.get_dummies(unpickled_df.street, prefix="street", drop_first=True).columns)


In [4]:
unpickled_df.head()

# Modelling

X = unpickled_df[['mssubclass', 'lotfrontage', 'lotarea', 'overallqual',
       'overallcond', 'masvnrarea', 'bsmtfinsf1', 'bsmtunfsf', 'totalbsmtsf', 'firstflrsf', 'secondflrsf', 'grlivarea', 'bsmtfullbath', 'fullbath',
       'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'totrmsabvgrd',
       'fireplaces', 'garageyrblt', 'garagecars', 'garagearea', 'wooddecksf',
       'openporchsf', 'enclosedporch', 'screenporch',
       'poolarea'] + dummy_column_names]
y = unpickled_df.saleprice
X_train, X_test, y_train, y_test = train_test_split(X, y)


In [5]:
## Hyperparameter optimization using RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


In [6]:
param_grid = {
    "loss": ["ls"],
    "learning_rate":[0.1, 0.5, 0.9],
    #"n_estimators":[100, 150, 200], get 88%
    #"n_estimators":[250, 300, 350], get 88.3%
    "n_estimators":[400, 500, 600],
    #"subsample":[0.1],
    "subsample":[0.5],
    "criterion":["friedman_mse"],
    "min_samples_split":[2, 4, 6],
    #"min_samples_leaf":[1, 2, 3],
    #"min_samples_leaf":[4, 5, 6], get 87.9%
    "min_samples_leaf":[8, 10, 12],
    "tol":[1e-4],
    "warm_start":[True]
    
}


In [7]:
regressor = GradientBoostingRegressor()

In [8]:
random_search=RandomizedSearchCV(regressor,param_grid,n_jobs=-1,cv=10,verbose=3)

In [9]:
random_search.fit(X, y)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   47.2s finished


RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=GradientBoostingRegressor(alpha=0.9,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='ls', max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                           

In [10]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(regressor,X,y,cv=10)

In [11]:
score

array([0.90258385, 0.90479587, 0.92382921, 0.79130876, 0.89556645,
       0.90353721, 0.86580432, 0.88302834, 0.89779468, 0.89208044])

In [12]:
score.mean()

0.8860329119263947