In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.core.common import random_state
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.exceptions import ConvergenceWarning



# Data Loading

In [39]:
data = pd.read_csv('../../Data/data_processed.csv', index_col = 'Id')

data.head()

Unnamed: 0_level_0,OverallQual,GrLivArea,GarageArea,YearBuilt,TotalBsmtSF,FullBath,YearRemodAdd,2ndFlrSF,Fireplaces,LotArea,...,Exterior1st_BrkFace,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7,1710,548,2003,856,2,2003,854,0,8450,...,0,0,0,0,0,0,1,0,0,208500
2,6,1262,460,1976,1262,2,1976,0,1,9600,...,0,0,0,1,0,0,0,0,0,181500
3,7,1786,608,2001,920,2,2002,866,1,11250,...,0,0,0,0,0,0,1,0,0,223500
4,7,1717,642,1915,756,1,1970,756,1,9550,...,0,0,0,0,0,0,0,1,0,140000
5,8,2198,836,2000,1145,2,2000,1053,1,14260,...,0,0,0,0,0,0,1,0,0,250000


In [40]:
X = data.copy()  

y = X['SalePrice']

X = X.drop(['SalePrice'], axis = 1)

In [41]:
# Dictionary for storing models results
models_results = {}

# List for storing all models
all_models = []

# Classical Linear Regression

In [42]:
linear_regression = LinearRegression()

linear_regression_scores = cross_val_score(linear_regression,
                         X,
                         y,
                         cv = 5,
                         scoring = 'neg_mean_absolute_error')

print(linear_regression_scores.mean())

-14231.924375192946


In [43]:
models_results['linear_regression'] = linear_regression_scores.mean()
all_models.append(linear_regression)

# Ridge Regression

In [44]:
ridge_sample = Ridge()
ridge_hyper_params = {'alpha': range(1, 100, 5), 'random_state': [0]}
ridge_regression = GridSearchCV(ridge_sample, ridge_hyper_params, scoring = 'neg_mean_absolute_error', cv = 5)
ridge_regression.fit(X, y)

print('Best value of λ: ', ridge_regression.best_params_)
print('Best score: ', ridge_regression.best_score_)

Best value of λ:  {'alpha': 6, 'random_state': 0}
Best score:  -14125.235509806993


Okay, we've roughly figured out in which range the best alpha value lies. Let's try to get a more accurate value.

In [45]:
ridge_hyper_params = {'alpha': np.linspace(1, 10, 40), 'random_state': [0]}
ridge_regression = GridSearchCV(ridge_sample, ridge_hyper_params, scoring = 'neg_mean_absolute_error', cv = 5)
ridge_regression.fit(X, y)

print('Best value of λ: ', ridge_regression.best_params_)
print('Best score: ', ridge_regression.best_score_)

Best value of λ:  {'alpha': 3.307692307692308, 'random_state': 0}
Best score:  -14111.128986548667


Okay, now we'll save the ridge regression model with best value of alpha.

In [46]:
models_results['ridge_regression'] = ridge_regression.best_score_

ridge_regression = Ridge(ridge_regression.best_params_)

In [47]:
all_models.append(ridge_regression)

# LASSO Regression

In [48]:
# I don't want to overload the output of the LASSO regression and Elastic Net
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [49]:
lasso_sample = Lasso()
lasso_hyper_params = {'alpha': range (1, 1000, 10), 'random_state': [0]}
lasso_regression = GridSearchCV(lasso_sample, lasso_hyper_params, scoring = 'neg_mean_absolute_error', cv = 5)
lasso_regression.fit(X, y)

print('best alpha: ', lasso_regression.best_params_)
print('score: ', lasso_regression.best_score_)

best alpha:  {'alpha': 51, 'random_state': 0}
score:  -14011.22968635087


Like before, let's try to get more accurate value of alpha.

In [50]:
lasso_hyper_params = {'alpha': range (45, 60), 'random_state': [0]}
lasso_regression = GridSearchCV(lasso_sample, lasso_hyper_params, scoring = 'neg_mean_absolute_error', cv = 5)
lasso_regression.fit(X, y)

print('best alpha: ', lasso_regression.best_params_)
print('score: ', lasso_regression.best_score_)

best alpha:  {'alpha': 53, 'random_state': 0}
score:  -14011.104506482832


Save the model with best parameters.

In [51]:
models_results['lasso_regression'] = lasso_regression.best_score_

lasso_regression = Lasso(lasso_regression.best_params_)

In [52]:
all_models.append(lasso_regression)

# Elastic Net

Do the same steps as with the Ridge and Lasso regression. 

In [53]:
elastic_net_sample = ElasticNet()
elnet_hyper_params = {'alpha': range(1, 100, 5), 'l1_ratio': np.linspace(0, 1, 20), 'random_state': [0]}
elastic_net = GridSearchCV(elastic_net_sample, elnet_hyper_params, scoring = 'neg_mean_absolute_error', cv = 5)
elastic_net.fit(X, y)

print('best alpha and l1_ratio: ', elastic_net.best_params_)
print('score: ', elastic_net.best_score_)

best alpha and l1_ratio:  {'alpha': 51, 'l1_ratio': 1.0, 'random_state': 0}
score:  -14011.22968635087


Lasso regression seems to be the best fit.   
Get more accurate parameters.

In [54]:
elnet_hyper_params = {'alpha': range(45, 60), 'l1_ratio': np.linspace(0.99, 1, 5), 'random_state': [0]}
elastic_net = GridSearchCV(elastic_net_sample, elnet_hyper_params, scoring = 'neg_mean_absolute_error', cv = 5)
elastic_net.fit(X, y)

print('best alpha and l1_ratio: ', elastic_net.best_params_)
print('score: ', elastic_net.best_score_)

best alpha and l1_ratio:  {'alpha': 53, 'l1_ratio': 1.0, 'random_state': 0}
score:  -14011.104506482832


In [19]:
models_results['elastic_net'] = elastic_net.best_score_

elastic_net = ElasticNet(elastic_net.best_params_)

In [20]:
all_models.append(elastic_net)

In [21]:
models_results

{'linear_regression': -14231.924375192946,
 'ridge_regression': -14111.128986548667,
 'lasso_regression': -14011.104506482832,
 'elastic_net': -14011.22968635087}

In [22]:
all_models

[LinearRegression(),
 Ridge(alpha={'alpha': 3.307692307692308, 'random_state': 0}),
 Lasso(alpha={'alpha': 53, 'random_state': 0}),
 ElasticNet(alpha={'alpha': 51, 'l1_ratio': 1.0, 'random_state': 0})]

# DecisionTree

In [26]:
decision_tree_sample = DecisionTreeRegressor()
decision_tree_hyper_params = {'max_depth': range(5, 10),
                              'min_samples_split': range(5, 16, 5),
                              'min_samples_leaf': range(5, 10),
                              'max_features': ['sqrt'],
                              'random_state': [0],
                              'min_impurity_decrease': np.linspace(0, 3, 5),
                              'ccp_alpha': np.linspace(0, 1, 5)
                             }
decision_tree_regressor = GridSearchCV(decision_tree_sample, decision_tree_hyper_params, 
                                       scoring = 'neg_mean_absolute_error', cv = 5)
decision_tree_regressor.fit(X, y)

print('Best DT params: ', decision_tree_regressor.best_params_)
print('Best score: ', decision_tree_regressor.best_score_)

Best DT params:  {'ccp_alpha': 0.0, 'max_depth': 8, 'max_features': 'sqrt', 'min_impurity_decrease': 0.0, 'min_samples_leaf': 5, 'min_samples_split': 5, 'random_state': 0}
Best score:  -19493.911244088253


The model does not seem to suffer from overfitting.   
Good, let's continue searching for the best parameters.

In [28]:
decision_tree_hyper_params = {'max_depth': [8],
                              'min_samples_split': range(3, 9),
                              'min_samples_leaf': range(3, 9),
                              'max_features': ['sqrt', 5, 10, 15, 20, 30, 50],
                              'random_state': [0]
                             }
decision_tree_regressor = GridSearchCV(decision_tree_sample, decision_tree_hyper_params, 
                                       scoring = 'neg_mean_absolute_error', cv = 5)
decision_tree_regressor.fit(X, y)

print('Best DT params: ', decision_tree_regressor.best_params_)
print('Best score: ', decision_tree_regressor.best_score_)

Best DT params:  {'max_depth': 8, 'max_features': 50, 'min_samples_leaf': 6, 'min_samples_split': 3, 'random_state': 0}
Best score:  -16462.29238259072


A better value of `max_features` gave a gain in accuracy. Let's continue experimenting.

In [30]:
decision_tree_hyper_params = {'max_depth': range(5, 15),
                              'min_samples_split': range(2, 5),
                              'min_samples_leaf': range(3, 9),
                              'max_features': range(40, 60, 2),
                              'random_state': [0]
                             }
decision_tree_regressor = GridSearchCV(decision_tree_sample, decision_tree_hyper_params, 
                                       scoring = 'neg_mean_absolute_error', cv = 5)
decision_tree_regressor.fit(X, y)

print('Best DT params: ', decision_tree_regressor.best_params_)
print('Best score: ', decision_tree_regressor.best_score_)

Best DT params:  {'max_depth': 9, 'max_features': 44, 'min_samples_leaf': 5, 'min_samples_split': 2, 'random_state': 0}
Best score:  -16050.372379923268


In [31]:
decision_tree_hyper_params = {'max_depth': range(8, 11),
                              'min_samples_split': range(2, 4),
                              'min_samples_leaf': range(4, 7),
                              'max_features': range(42, 46),
                              'random_state': [0],
                              'min_impurity_decrease': np.linspace(0, 3, 5),
                              'ccp_alpha': np.linspace(0, 1, 5)
                             }
decision_tree_regressor = GridSearchCV(decision_tree_sample, decision_tree_hyper_params, 
                                       scoring = 'neg_mean_absolute_error', cv = 5)
decision_tree_regressor.fit(X, y)

print('Best DT params: ', decision_tree_regressor.best_params_)
print('Best score: ', decision_tree_regressor.best_score_)

Best DT params:  {'ccp_alpha': 0.0, 'max_depth': 9, 'max_features': 44, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 5, 'min_samples_split': 2, 'random_state': 0}
Best score:  -16050.372379923268


We have found the optimal parameters. Now we can write the model.

In [37]:
best_params = decision_tree_regressor.best_params_

decision_tree_regressor = DecisionTreeRegressor(**best_params)