----------------------------------------------------------------------------------------------------------------------

# Building Models with Interactions - PolynomialFeatures
## (All Attributes, All Numerical Attributes, Random Forest Best 5 Attributes)

----------------------------------------------------------------------------------------------------------------------

In [59]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import make_scorer, r2_score
from sklearn.svm import LinearSVR, SVR
from xgboost import XGBRegressor
from sklearn import tree
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.pipeline import FeatureUnion

In [60]:
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [61]:
def rmse_custom(y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))
rmse_score = make_scorer(score_func=rmse_custom)

In [62]:
def rmsle_custom(y_true_log, y_pred_log):
    y_true = np.exp(y_true_log)
    y_pred = np.exp(y_pred_log)
    return np.sqrt(np.mean(np.power(np.log(y_true + 1) - np.log(y_pred + 1), 2)))
rmsle_score = make_scorer(score_func=rmsle_custom)

In [63]:
def score_to_stats(scores):
    
    return {score:round(values.mean(),4) for score, values in scores.items()}

----------------------------------------------------------------------------------------------------------------------
## Polynomial Features (All Attributes)
----------------------------------------------------------------------------------------------------------------------

### Data Importing

In [64]:
df_train = pd.read_csv('train_master.csv')
df_test = pd.read_csv('test_master.csv')

### Output variable logarithmic transformation

In [65]:
df_train['SalePrice_log'] = np.log1p(df_train['SalePrice'])

In [66]:
df_train = df_train.drop(['Id','SalePrice'], axis = 1)

In [67]:
df_test = df_test.drop('Id', axis = 1)

### Pre-processing and data cleaning

In [68]:
outlier1 = df_train[df_train['GrLivArea'] > 4500].index
outlier2 = df_train[df_train["1stFlrSF"] > 4000].index
outlier3 = df_train[df_train["TotalBsmtSF"] > 4000].index

In [69]:
# 523, 1298
df_train = df_train.drop(outlier1)

In [70]:
missing_values_attribute = ['PoolQC','MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
correlated_attributes = ['GarageArea', '1stFlrSF']

In [71]:
df_train = df_train.drop(missing_values_attribute, axis = 1)
df_test = df_test.drop(missing_values_attribute, axis = 1)

In [72]:
df_train = df_train.drop(correlated_attributes, axis = 1)
df_test = df_test.drop(correlated_attributes, axis = 1)

### Attributes mapping and dummy coding

In [73]:
df_train['train'] = 1
df_test['train'] = 0

In [74]:
df_combined = pd.concat([df_train, df_test])

In [75]:
df_combined = df_combined.reset_index(drop = True)

In [76]:
cat_mapping = {'NA':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
scale_attributes = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond']
for i in df_combined[scale_attributes]:
    df_combined[i] = df_combined[i].map(cat_mapping)

In [77]:
df_train = df_combined[df_combined['train'] == 1]
df_train = df_train.drop(['train'], axis = 1)

### Polynomial Features implementation

In [78]:
df_train.shape

(1458, 73)

In [79]:
df_train = pd.get_dummies(df_train)

In [80]:
imp = SimpleImputer()

In [81]:
df_train[df_train.columns] = imp.fit_transform(df_train)

In [82]:
df_train.shape

(1458, 240)

In [83]:
X = df_train.drop('SalePrice_log', axis = 1)
y = df_train['SalePrice_log']

In [84]:
poly = PolynomialFeatures(degree = 2, interaction_only= True)
X = poly.fit_transform(X)

In [85]:
X = pd.DataFrame(X)

In [86]:
X.shape

(1458, 28681)

In [87]:
scoring = {'rmsle': rmsle_score,
           'r2': 'r2'}

### Models

#### Linear Regression

In [88]:
lr = LinearRegression(normalize = True)
cv_scores = cross_validate(lr, X, y, scoring = scoring, cv = 10, return_train_score= True)
print(score_to_stats(cv_scores))

{'fit_time': 10.0596, 'score_time': 0.0408, 'test_rmsle': 0.3754, 'train_rmsle': 0.0, 'test_r2': -2.6039, 'train_r2': 1.0}


#### Ridge

In [89]:
rid = Ridge(alpha = 10, normalize = True)
cv_scores = cross_validate(rid, X, y, scoring = scoring, cv = 10, return_train_score= True)
print(score_to_stats(cv_scores))

{'fit_time': 2.3019, 'score_time': 0.0318, 'test_rmsle': 0.1177, 'train_rmsle': 0.0741, 'test_r2': 0.9122, 'train_r2': 0.9656}


#### Lasso

In [90]:
lasso = Lasso(alpha = 0.01)
cv_scores = cross_validate(lasso, X, y, scoring = scoring, cv = 10, return_train_score= True)
print(score_to_stats(cv_scores))

{'fit_time': 24.5526, 'score_time': 0.0328, 'test_rmsle': 0.2758, 'train_rmsle': 0.0324, 'test_r2': 0.4803, 'train_r2': 0.9934}


#### Elastic Net

In [91]:
enet = ElasticNet(alpha = 0.01, l1_ratio = 0, normalize = True)
cv_scores = cross_validate(rid, X, y, scoring = scoring, cv = 10, return_train_score= True)
print(score_to_stats(cv_scores))

{'fit_time': 2.3512, 'score_time': 0.0327, 'test_rmsle': 0.1177, 'train_rmsle': 0.0741, 'test_r2': 0.9122, 'train_r2': 0.9656}


#### Xgboost

In [92]:
xgboost = XGBRegressor(eta = 0.01, subsample = 0.5)
cv_scores = cross_validate(xgboost, X, y, scoring = scoring, cv = 10, return_train_score = True)
print(score_to_stats(cv_scores))

{'fit_time': 156.3264, 'score_time': 1.1721, 'test_rmsle': 0.1225, 'train_rmsle': 0.0771, 'test_r2': 0.9046, 'train_r2': 0.9628}


#### Random Forest

In [93]:
rf = RandomForestRegressor(max_depth = 20, min_samples_leaf = 1, n_estimators = 150)
cv_scores = cross_validate(rf, X, y, scoring = scoring, cv = 10, return_train_score = True)
print(score_to_stats(cv_scores))

{'fit_time': 427.3975, 'score_time': 0.0846, 'test_rmsle': 0.1257, 'train_rmsle': 0.0473, 'test_r2': 0.8998, 'train_r2': 0.986}


----------------------------------------------------------------------------------------------------------------------
## Polynomial Features (All Numerical Attributes)
----------------------------------------------------------------------------------------------------------------------

### Data Importing

In [94]:
df_train = pd.read_csv('train_master.csv')
df_test = pd.read_csv('test_master.csv')

### Output variable logarithmic transformation

In [95]:
df_train['SalePrice_log'] = np.log1p(df_train['SalePrice'])

In [96]:
df_train = df_train.drop(['Id','SalePrice'], axis = 1)

In [97]:
df_test = df_test.drop('Id', axis = 1)

### Pre-processing and data cleaning

In [98]:
outlier1 = df_train[df_train['GrLivArea'] > 4500].index
outlier2 = df_train[df_train["1stFlrSF"] > 4000].index
outlier3 = df_train[df_train["TotalBsmtSF"] > 4000].index

In [99]:
# 523, 1298
df_train = df_train.drop(outlier1)

In [100]:
missing_values_attribute = ['PoolQC','MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
correlated_attributes = ['GarageArea', '1stFlrSF']

In [101]:
df_train = df_train.drop(missing_values_attribute, axis = 1)
df_test = df_test.drop(missing_values_attribute, axis = 1)

In [102]:
df_train = df_train.drop(correlated_attributes, axis = 1)
df_test = df_test.drop(correlated_attributes, axis = 1)

### Attributes mapping (where possible)

In [103]:
df_train['train'] = 1
df_test['train'] = 0

In [104]:
df_combined = pd.concat([df_train, df_test])

In [105]:
df_combined = df_combined.reset_index(drop = True)

In [106]:
cat_mapping = {'NA':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
scale_attributes = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond']
for i in df_combined[scale_attributes]:
    df_combined[i] = df_combined[i].map(cat_mapping)

In [107]:
df_train = df_combined[df_combined['train'] == 1]
df_train = df_train.drop(['train'], axis = 1)

In [108]:
df_train.head()

Unnamed: 0,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,CentralAir,Condition1,Condition2,Electrical,EnclosedPorch,ExterCond,ExterQual,Exterior1st,Exterior2nd,Fireplaces,Foundation,FullBath,Functional,GarageCars,GarageCond,GarageFinish,GarageQual,GarageType,GarageYrBlt,GrLivArea,HalfBath,Heating,HeatingQC,HouseStyle,KitchenAbvGr,KitchenQual,LandContour,LandSlope,LotArea,LotConfig,LotFrontage,LotShape,LowQualFinSF,MSSubClass,MSZoning,MasVnrArea,MasVnrType,MiscVal,MoSold,Neighborhood,OpenPorchSF,OverallCond,OverallQual,PavedDrive,PoolArea,RoofMatl,RoofStyle,SaleCondition,SalePrice_log,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,854,0,3,1Fam,3.0,No,706.0,0.0,GLQ,Unf,1.0,0.0,4.0,150.0,Y,Norm,Norm,SBrkr,0,3,4,VinylSd,VinylSd,0,PConc,2,Typ,2.0,3.0,RFn,3.0,Attchd,2003.0,1710,1,GasA,5,2Story,1,4.0,Lvl,Gtl,8450,Inside,65.0,Reg,0,60,RL,196.0,BrkFace,0,2,CollgCr,61,5,7,Y,0,CompShg,Gable,Normal,12.247699,WD,0,Pave,8,856.0,AllPub,0,2003,2003,2008
1,0,0,3,1Fam,3.0,Gd,978.0,0.0,ALQ,Unf,0.0,1.0,4.0,284.0,Y,Feedr,Norm,SBrkr,0,3,3,MetalSd,MetalSd,1,CBlock,2,Typ,2.0,3.0,RFn,3.0,Attchd,1976.0,1262,0,GasA,5,1Story,1,3.0,Lvl,Gtl,9600,FR2,80.0,Reg,0,20,RL,0.0,,0,5,Veenker,0,8,6,Y,0,CompShg,Gable,Normal,12.109016,WD,0,Pave,6,1262.0,AllPub,298,1976,1976,2007
2,866,0,3,1Fam,3.0,Mn,486.0,0.0,GLQ,Unf,1.0,0.0,4.0,434.0,Y,Norm,Norm,SBrkr,0,3,4,VinylSd,VinylSd,1,PConc,2,Typ,2.0,3.0,RFn,3.0,Attchd,2001.0,1786,1,GasA,5,2Story,1,4.0,Lvl,Gtl,11250,Inside,68.0,IR1,0,60,RL,162.0,BrkFace,0,9,CollgCr,42,5,7,Y,0,CompShg,Gable,Normal,12.317171,WD,0,Pave,6,920.0,AllPub,0,2001,2002,2008
3,756,0,3,1Fam,4.0,No,216.0,0.0,ALQ,Unf,1.0,0.0,3.0,540.0,Y,Norm,Norm,SBrkr,272,3,3,Wd Sdng,Wd Shng,1,BrkTil,1,Typ,3.0,3.0,Unf,3.0,Detchd,1998.0,1717,0,GasA,4,2Story,1,4.0,Lvl,Gtl,9550,Corner,60.0,IR1,0,70,RL,0.0,,0,2,Crawfor,35,5,7,Y,0,CompShg,Gable,Abnorml,11.849405,WD,0,Pave,7,756.0,AllPub,0,1915,1970,2006
4,1053,0,4,1Fam,3.0,Av,655.0,0.0,GLQ,Unf,1.0,0.0,4.0,490.0,Y,Norm,Norm,SBrkr,0,3,4,VinylSd,VinylSd,1,PConc,2,Typ,3.0,3.0,RFn,3.0,Attchd,2000.0,2198,1,GasA,5,2Story,1,4.0,Lvl,Gtl,14260,FR2,84.0,IR1,0,60,RL,350.0,BrkFace,0,12,NoRidge,84,5,8,Y,0,CompShg,Gable,Normal,12.42922,WD,0,Pave,9,1145.0,AllPub,192,2000,2000,2008


In [109]:
df_train.shape

(1458, 73)

In [110]:
df_train['SalePrice_log'].head()

0    12.247699
1    12.109016
2    12.317171
3    11.849405
4    12.429220
Name: SalePrice_log, dtype: float64

### Output variable exclusion

In [111]:
df_SalePrice_log = df_train['SalePrice_log']

In [112]:
df_train = df_train.drop('SalePrice_log', axis = 1)

### Polynomial Features implementation

In [113]:
df_num = df_train.select_dtypes(exclude = "object")
df_cat = df_train.select_dtypes(include = "object")

In [114]:
df_num_columns = df_num.columns
df_cat_columns = pd.get_dummies(df_cat).columns

In [115]:
class ColumnExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_cols = X[self.columns]
        return X_cols

In [116]:
pipe_poly = Pipeline([
    ('features', FeatureUnion([('num', 
                                Pipeline([('extract', 
                                           ColumnExtractor(columns = df_num_columns)),
                                          ('imp',
                                           SimpleImputer(strategy = 'mean')),
                                          ('poly', 
                                           PolynomialFeatures(interaction_only = True, include_bias = False))  ])),
                               ('cat_var',
                                Pipeline([('extract2',
                                           ColumnExtractor(columns = df_cat_columns)),
                                           ('imp2',
                                           SimpleImputer(strategy = 'most_frequent'))]))]))])

In [117]:
pipe_poly.set_params(features__num__poly__degree=2)
df_train = pipe_poly.fit_transform(pd.get_dummies(df_train))

In [118]:
imena_num = pipe_poly.named_steps['features'].transformer_list[0][1].named_steps['poly'].get_feature_names(df_num_columns)

In [119]:
lista1 = list(imena_num)

In [120]:
lista2 = list(df_cat_columns)

In [121]:
nazivi_kolona = lista1 + lista2

In [122]:
df_train = pd.DataFrame(df_train, columns = nazivi_kolona)

In [123]:
df_train = pd.concat([df_train, df_SalePrice_log], 1)

In [124]:
df_train.shape

(1458, 1101)

In [125]:
X = df_train.drop('SalePrice_log', axis = 1)
y = df_train['SalePrice_log']

In [126]:
scoring = {'rmsle': rmsle_score,
           'r2': 'r2'}

### Models

#### Linear Regression

In [127]:
param_grid = {'f_regression__k':[50,100,200,500,700,800,900,1100]}

In [128]:
pipe_lr = Pipeline([('f_regression', SelectKBest(f_regression)), ('lr', LinearRegression(normalize = True))])
grid = GridSearchCV(pipe_lr, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'f_regression__k': 200}


##### Best params

In [129]:
pipe_lr = Pipeline([('f_regression', SelectKBest(k=200)), ('lr', LinearRegression(normalize = True))])
cv_scores = cross_validate(pipe_lr, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.2056, 'score_time': 0.0062, 'test_rmsle': 0.1658, 'test_r2': 0.8132}


#### Ridge

In [130]:
param_grid = {'alpha':[0.01,0.1,1,5,10,20,100]}

In [131]:
rid = Ridge(normalize = True)
grid = GridSearchCV(rid, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'alpha': 1}


##### Best params

In [132]:
rid = Ridge(alpha = 1, normalize = True)
cv_scores = cross_validate(rid, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.1203, 'score_time': 0.0045, 'test_rmsle': 0.1157, 'test_r2': 0.9153}


#### Lasso

In [133]:
param_grid = {'alpha':[0.01,0.1,1,5,10,20,100]}

In [134]:
lasso = Lasso()
grid = GridSearchCV(lasso, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'alpha': 1}


##### Best params

In [135]:
lasso = Lasso(alpha = 1)
cv_scores = cross_validate(lasso, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 1.1368, 'score_time': 0.0038, 'test_rmsle': 0.1409, 'test_r2': 0.8742}


#### Elastic Net

In [136]:
param_grid = {'alpha':[0.01,0.1,1,5,10,20,100],
              'l1_ratio':[0,0.01,0.1,0.5,0.8,1]}

In [137]:
enet = ElasticNet()
grid = GridSearchCV(enet, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'alpha': 20, 'l1_ratio': 0.1}


##### Best params

In [138]:
enet = ElasticNet(alpha = 20, l1_ratio = 0.1)
cv_scores = cross_validate(enet, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 1.1962, 'score_time': 0.0037, 'test_rmsle': 0.1376, 'test_r2': 0.8802}


#### XGBOOST

In [139]:
param_grid = {'eta':[0.01,0.05,0.1,0.2],
              'subsample':[0.5,0.75,1]}

In [140]:
xgboost = XGBRegressor()
grid = GridSearchCV(xgboost, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)





{'eta': 0.01, 'subsample': 1}


##### Best params

In [141]:
xgboost = XGBRegressor(eta = 0.01, subsample = 1)
cv_scores = cross_validate(xgboost, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 7.4506, 'score_time': 0.0458, 'test_rmsle': 0.1196, 'test_r2': 0.909}


#### Random Forest Regressor

In [142]:
param_grid = {'n_estimators':[50,100,150,250],
              'max_depth':[5,10,20,50],
              'min_samples_leaf':[1,3,5,7]}

In [143]:
rf = RandomForestRegressor()
grid = GridSearchCV(rf, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'max_depth': 50, 'min_samples_leaf': 1, 'n_estimators': 250}


##### Best Params

In [144]:
rf = RandomForestRegressor(max_depth=20, min_samples_leaf = 1, n_estimators = 250)
cv_scores = cross_validate(rf, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 94.5926, 'score_time': 0.0494, 'test_rmsle': 0.1272, 'test_r2': 0.8971}


----------------------------------------------------------------------------------------------------------------------
## Polynomial Features (Rendom Forest Best 5 Attributes)
----------------------------------------------------------------------------------------------------------------------

### Data Importing

In [147]:
df_train = pd.read_csv('train_master.csv')
df_test = pd.read_csv('test_master.csv')

### Output variable logarithmic transformation

In [148]:
df_train['SalePrice_log'] = np.log1p(df_train['SalePrice'])

In [149]:
df_train = df_train.drop(['Id','SalePrice'], axis = 1)

In [150]:
df_test = df_test.drop('Id', axis = 1)

### Pre-processing and data cleaning

In [151]:
outlier1 = df_train[df_train['GrLivArea'] > 4500].index
outlier2 = df_train[df_train["1stFlrSF"] > 4000].index
outlier3 = df_train[df_train["TotalBsmtSF"] > 4000].index

In [152]:
# 523, 1298
df_train = df_train.drop(outlier1)

In [153]:
missing_values_attribute = ['PoolQC','MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
correlated_attributes = ['GarageArea', '1stFlrSF']

In [154]:
df_train = df_train.drop(missing_values_attribute, axis = 1)
df_test = df_test.drop(missing_values_attribute, axis = 1)

In [155]:
df_train = df_train.drop(correlated_attributes, axis = 1)
df_test = df_test.drop(correlated_attributes, axis = 1)

### Attributes mapping (where possible)

In [156]:
df_train['train'] = 1
df_test['train'] = 0

In [157]:
df_combined = pd.concat([df_train, df_test])

In [158]:
df_combined = df_combined.reset_index(drop = True)

In [159]:
cat_mapping = {'NA':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
scale_attributes = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond']
for i in df_combined[scale_attributes]:
    df_combined[i] = df_combined[i].map(cat_mapping)

In [160]:
df_train = df_combined[df_combined['train'] == 1]
df_train = df_train.drop(['train'], axis = 1)

### Output variable exclusion

In [161]:
df_train = df_train.reset_index(drop = True)

In [162]:
df_SalePrice_log = df_train['SalePrice_log']

In [163]:
df_train = df_train.drop('SalePrice_log', axis = 1)

### Polynomial Features implementation

In [164]:
df_num = df_train.select_dtypes(exclude = "object")
df_cat = df_train.select_dtypes(include = "object")
df_num_important = df_num[['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']]
df_num_other = df_num.drop(df_num_important, axis = 1)

In [165]:
df_num_columns = df_num.columns
df_cat_columns = pd.get_dummies(df_cat).columns
df_num_important_columns = df_num_important.columns
df_num_other_columns = df_num_other.columns

In [166]:
class ColumnExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_cols = X[self.columns]
        return X_cols

In [167]:
pipe_poly = Pipeline([
    ('features', FeatureUnion([('num_important', 
                                Pipeline([('extract', 
                                           ColumnExtractor(columns = df_num_important_columns)),
                                          ('imp',
                                           SimpleImputer(strategy = 'mean')),
                                          ('poly', 
                                           PolynomialFeatures(interaction_only = True, include_bias = False))])),
                               ('num_other',
                                Pipeline([('extract1',
                                           ColumnExtractor(columns = df_num_other_columns)),
                                          ('imp',
                                           SimpleImputer(strategy = 'mean'))])),                  
                               ('cat_var',
                                Pipeline([('extract2',
                                           ColumnExtractor(columns = df_cat_columns)),
                                           ('imp2',
                                           SimpleImputer(strategy = 'most_frequent'))]))]))])

In [168]:
pipe_poly.set_params(features__num_important__poly__degree=2)
df_train = pipe_poly.fit_transform(pd.get_dummies(df_train))

In [169]:
names_num_important = pipe_poly.named_steps['features'].transformer_list[0][1].named_steps['poly'].get_feature_names(df_num_important_columns)

In [170]:
lista1 = list(names_num_important)

In [171]:
lista2 = list(df_num_other_columns)

In [172]:
lista3 = list(df_cat_columns)

In [173]:
all_columns = lista1 + lista2 + lista3

In [174]:
df_train = pd.DataFrame(df_train, columns = all_columns)

In [175]:
df_train = pd.concat([df_train, df_SalePrice_log], 1)

In [176]:
df_train.head()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,TotalBsmtSF,YearBuilt,OverallQual GrLivArea,OverallQual GarageCars,OverallQual TotalBsmtSF,OverallQual YearBuilt,GrLivArea GarageCars,GrLivArea TotalBsmtSF,GrLivArea YearBuilt,GarageCars TotalBsmtSF,GarageCars YearBuilt,TotalBsmtSF YearBuilt,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtCond,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,EnclosedPorch,ExterCond,ExterQual,Fireplaces,FullBath,GarageCond,GarageQual,GarageYrBlt,HalfBath,HeatingQC,KitchenAbvGr,KitchenQual,LotArea,LotFrontage,LowQualFinSF,MSSubClass,MasVnrArea,MiscVal,MoSold,OpenPorchSF,OverallCond,PoolArea,ScreenPorch,TotRmsAbvGrd,WoodDeckSF,YearRemodAdd,YrSold,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtExposure_Av,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtFinType1_ALQ,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_ALQ,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,CentralAir_N,CentralAir_Y,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Functional_Maj1,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageFinish_Fin,GarageFinish_RFn,GarageFinish_Unf,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,PavedDrive_N,PavedDrive_P,PavedDrive_Y,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave,Utilities_AllPub,Utilities_NoSeWa,SalePrice_log
0,7.0,1710.0,2.0,856.0,2003.0,11970.0,14.0,5992.0,14021.0,3420.0,1463760.0,3425130.0,1712.0,4006.0,1714568.0,854.0,0.0,3.0,3.0,706.0,0.0,1.0,0.0,4.0,150.0,0.0,3.0,4.0,0.0,2.0,3.0,3.0,2003.0,1.0,5.0,1.0,4.0,8450.0,65.0,0.0,60.0,196.0,0.0,2.0,61.0,5.0,0.0,0.0,8.0,0.0,2003.0,2008.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,12.247699
1,6.0,1262.0,2.0,1262.0,1976.0,7572.0,12.0,7572.0,11856.0,2524.0,1592644.0,2493712.0,2524.0,3952.0,2493712.0,0.0,0.0,3.0,3.0,978.0,0.0,0.0,1.0,4.0,284.0,0.0,3.0,3.0,1.0,2.0,3.0,3.0,1976.0,0.0,5.0,1.0,3.0,9600.0,80.0,0.0,20.0,0.0,0.0,5.0,0.0,8.0,0.0,0.0,6.0,298.0,1976.0,2007.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,12.109016
2,7.0,1786.0,2.0,920.0,2001.0,12502.0,14.0,6440.0,14007.0,3572.0,1643120.0,3573786.0,1840.0,4002.0,1840920.0,866.0,0.0,3.0,3.0,486.0,0.0,1.0,0.0,4.0,434.0,0.0,3.0,4.0,1.0,2.0,3.0,3.0,2001.0,1.0,5.0,1.0,4.0,11250.0,68.0,0.0,60.0,162.0,0.0,9.0,42.0,5.0,0.0,0.0,6.0,0.0,2002.0,2008.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,12.317171
3,7.0,1717.0,3.0,756.0,1915.0,12019.0,21.0,5292.0,13405.0,5151.0,1298052.0,3288055.0,2268.0,5745.0,1447740.0,756.0,0.0,3.0,4.0,216.0,0.0,1.0,0.0,3.0,540.0,272.0,3.0,3.0,1.0,1.0,3.0,3.0,1998.0,0.0,4.0,1.0,4.0,9550.0,60.0,0.0,70.0,0.0,0.0,2.0,35.0,5.0,0.0,0.0,7.0,0.0,1970.0,2006.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,11.849405
4,8.0,2198.0,3.0,1145.0,2000.0,17584.0,24.0,9160.0,16000.0,6594.0,2516710.0,4396000.0,3435.0,6000.0,2290000.0,1053.0,0.0,4.0,3.0,655.0,0.0,1.0,0.0,4.0,490.0,0.0,3.0,4.0,1.0,2.0,3.0,3.0,2000.0,1.0,5.0,1.0,4.0,14260.0,84.0,0.0,60.0,350.0,0.0,12.0,84.0,5.0,0.0,0.0,9.0,192.0,2000.0,2008.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,12.42922


In [177]:
X = df_train.drop('SalePrice_log', axis = 1)
y = df_train['SalePrice_log']

In [178]:
scoring = {'rmse': rmse_score,
           'r2': 'r2'}

### Models

#### Linear Regression

In [179]:
param_grid = {'f_regression__k':[20,50,80,100,120,150,200,239,249]}

In [180]:
pipe_lr = Pipeline([('f_regression', SelectKBest(f_regression)), ('lr', LinearRegression())])
grid = GridSearchCV(pipe_lr, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'f_regression__k': 200}


##### Best params

In [181]:
pipe_lr = Pipeline([('f_regression', SelectKBest(k=200)), ('lr', LinearRegression())])
cv_scores = cross_validate(pipe_lr, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.1287, 'score_time': 0.0045, 'test_rmse': 0.1164, 'test_r2': 0.9141}


#### Ridge

In [182]:
param_grid = {'alpha':[0.01,0,0.1,1,5,10,20,100]}

In [183]:
rid = Ridge()
grid = GridSearchCV(rid, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'alpha': 10}


##### Best params

In [184]:
rid = Ridge(alpha = 10)
cv_scores = cross_validate(rid, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.0113, 'score_time': 0.0025, 'test_rmse': 0.1116, 'test_r2': 0.921}


#### Lasso

In [185]:
param_grid = {'alpha':[0.01,0,0.1,1,5,10,20,100]}

In [186]:
lasso = Lasso()
grid = GridSearchCV(lasso, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'alpha': 0}


##### Best params

In [187]:
lasso = Lasso(alpha = 0)
cv_scores = cross_validate(lasso, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.3793, 'score_time': 0.0048, 'test_rmse': 0.119, 'test_r2': 0.9102}


#### Elastic Net

In [188]:
param_grid = {'alpha':[0.01,0.1,1,5,10,20,100],
              'l1_ratio':[0,0.01,0.1,0.5,0.8,1]}

In [189]:
enet = ElasticNet()
grid = GridSearchCV(enet, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'alpha': 0.01, 'l1_ratio': 0.01}


##### Best params

In [190]:
enet = ElasticNet(alpha = 0.01, l1_ratio = 0.01)
cv_scores = cross_validate(enet, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.5412, 'score_time': 0.0164, 'test_rmse': 0.1114, 'test_r2': 0.9211}


#### XGBOOST

In [191]:
param_grid = {'eta':[0.01,0.05,0.1,0.2],
              'subsample':[0.5,0.75,1]}

In [192]:
xgboost = XGBRegressor()
grid = GridSearchCV(xgboost, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)





{'eta': 0.01, 'subsample': 1}


##### Best params

In [193]:
xgboost = XGBRegressor(eta = 0.01, subsample = 1)
cv_scores = cross_validate(xgboost, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 1.8128, 'score_time': 0.0128, 'test_rmse': 0.1207, 'test_r2': 0.9074}


#### Random Forest Regressor

In [194]:
param_grid = {'n_estimators':[50,100,150,250],
              'max_depth':[5,10,20,50],
              'min_samples_leaf':[1,3,5,7]}

In [195]:
rf = RandomForestRegressor()
grid = GridSearchCV(rf, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'max_depth': 20, 'min_samples_leaf': 1, 'n_estimators': 100}


##### Best params

In [196]:
rf = RandomForestRegressor(max_depth=20, min_samples_leaf = 1, n_estimators = 150)
cv_scores = cross_validate(rf, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 6.0373, 'score_time': 0.033, 'test_rmse': 0.1319, 'test_r2': 0.8895}
