----------------------------------------------------------------------------------------------------------------------

# Building Basic Models, Parameters Adjustment and Results Evaluation

----------------------------------------------------------------------------------------------------------------------

In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import make_scorer, r2_score
from sklearn.svm import LinearSVR, SVR
from xgboost import XGBRegressor
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings("ignore")

In [5]:
def rmse_custom(y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))
rmse_score = make_scorer(score_func=rmse_custom)

In [6]:
def rmsle_custom(y_true_log, y_pred_log):
    y_true = np.exp(y_true_log)
    y_pred = np.exp(y_pred_log)
    return np.sqrt(np.mean(np.power(np.log(y_true + 1) - np.log(y_pred + 1), 2)))
rmsle_score = make_scorer(score_func=rmsle_custom)

In [7]:
def score_to_stats(scores):
    
    return {score:round(values.mean(),4) for score, values in scores.items()}

### Data Importing

In [8]:
df_train = pd.read_csv('train_master.csv')
df_test = pd.read_csv('test_master.csv')

### Output variable logarthmic transformation

In [9]:
df_train['SalePrice_log'] = np.log1p(df_train['SalePrice'])

In [10]:
df_train = df_train.drop(['Id','SalePrice'], axis = 1)

In [11]:
df_test = df_test.drop('Id', axis = 1)

### Pre-processing and data cleaning

In [12]:
outlier1 = df_train[df_train['GrLivArea'] > 4500].index
outlier2 = df_train[df_train["1stFlrSF"] > 4000].index
outlier3 = df_train[df_train["TotalBsmtSF"] > 4000].index

In [13]:
# 523, 1298
df_train = df_train.drop(outlier1)

In [14]:
missing_values_attribute = ['PoolQC','MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
correlated_attributes = ['GarageArea', '1stFlrSF']

In [15]:
df_train = df_train.drop(missing_values_attribute, axis = 1)
df_test = df_test.drop(missing_values_attribute, axis = 1)

In [16]:
df_train = df_train.drop(correlated_attributes, axis = 1)
df_test = df_test.drop(correlated_attributes, axis = 1)

### Attributes mapping and dummy coding

In [17]:
df_train['train'] = 1
df_test['train'] = 0

In [18]:
df_combined = pd.concat([df_train, df_test])

In [19]:
df_combined = df_combined.reset_index(drop = True)

In [20]:
cat_mapping = {'NA':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
scale_attributes = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond']
for i in df_combined[scale_attributes]:
    df_combined[i] = df_combined[i].map(cat_mapping)

In [21]:
df_cat = df_combined.select_dtypes(include = ['object'])

In [22]:
df_cat_columns = df_cat.columns

In [23]:
df_cat_dummies = pd.get_dummies(df_cat)

In [24]:
df_combined = df_combined.drop(df_cat_columns, axis = 1)

In [25]:
df_combined = pd.concat([df_combined, df_cat_dummies], axis = 1)

In [26]:
df_train = df_combined[df_combined['train'] == 1]
df_train = df_train.drop(['train'], axis = 1)

In [28]:
X = df_train.drop('SalePrice_log', axis = 1)
y = df_train['SalePrice_log']

In [29]:
scoring = {'rmsle': rmsle_score,
           'r2': 'r2'}

### Models

In [30]:
pipe_lr = Pipeline([('imp', SimpleImputer()), ('lr', LinearRegression())])
cv_scores = cross_validate(pipe_lr, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.0535, 'score_time': 0.0046, 'test_rmsle': 0.1189, 'train_rmsle': 0.0923, 'test_r2': 0.9103, 'train_r2': 0.9466}


In [31]:
pipe_ridge = Pipeline([('imp', SimpleImputer()), ('rid', Ridge())])
cv_scores = cross_validate(pipe_ridge, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.0335, 'score_time': 0.0041, 'test_rmsle': 0.1151, 'train_rmsle': 0.0933, 'test_r2': 0.916, 'train_r2': 0.9455}


In [32]:
pipe_lasso = Pipeline([('imp', SimpleImputer()), ('lasso', Lasso())])
cv_scores = cross_validate(pipe_lasso, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.0376, 'score_time': 0.0072, 'test_rmsle': 0.1762, 'train_rmsle': 0.1731, 'test_r2': 0.8026, 'train_r2': 0.8123}


In [33]:
pipe_enet = Pipeline([('imp', SimpleImputer()), ('enet', ElasticNet())])
cv_scores = cross_validate(pipe_enet, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.0484, 'score_time': 0.006, 'test_rmsle': 0.1677, 'train_rmsle': 0.1645, 'test_r2': 0.821, 'train_r2': 0.8305}


In [34]:
pipe_xgboost = Pipeline([('imp', SimpleImputer()), ('xgboost', XGBRegressor())])
cv_scores = cross_validate(pipe_xgboost, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 1.841, 'score_time': 0.008, 'test_rmsle': 0.1214, 'train_rmsle': 0.0855, 'test_r2': 0.9062, 'train_r2': 0.9541}


In [35]:
pipe_rf = Pipeline([('imp', SimpleImputer()), ('rf', RandomForestRegressor())])
cv_scores = cross_validate(pipe_rf, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.4154, 'score_time': 0.0066, 'test_rmsle': 0.1418, 'train_rmsle': 0.0624, 'test_r2': 0.8725, 'train_r2': 0.9756}


### Models with adjusted parameters

#### Pipe Linear Regression

In [36]:
param_grid = {'f_regression__k':[20,50,80,100,120,150,200,239]}

In [37]:
pipe_lr = Pipeline([('imp', SimpleImputer()), ('f_regression', SelectKBest(f_regression)), ('lr', LinearRegression())])
grid = GridSearchCV(pipe_lr, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'f_regression__k': 120}


###### Best params

In [171]:
pipe_lr = Pipeline([('imp', SimpleImputer()), ('f_regression', SelectKBest(k=120)), ('lr', LinearRegression())])
cv_scores = cross_validate(pipe_lr, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.1341, 'score_time': 0.0036, 'test_rmsle': 0.1169, 'train_rmsle': 0.1041, 'test_r2': 0.9134, 'train_r2': 0.9321}


#### Pipe Ridge

In [40]:
param_grid = {'f_regression__k':[20,50,80,100,120,150,200,239],
              'rid__alpha':[0.01,0.1,1,5,10,20,100]}

In [41]:
pipe_ridge = Pipeline([('imp', SimpleImputer()), ('f_regression', SelectKBest(f_regression)), ('rid', Ridge())])
grid = GridSearchCV(pipe_ridge, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'f_regression__k': 239, 'rid__alpha': 10}


###### Best params

In [172]:
pipe_ridge = Pipeline([('imp', SimpleImputer()), ('rid', Ridge(alpha = 10))])
cv_scores = cross_validate(pipe_ridge, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.0295, 'score_time': 0.0031, 'test_rmsle': 0.1119, 'train_rmsle': 0.0977, 'test_r2': 0.9205, 'train_r2': 0.9402}


#### Pipe Lasso

In [42]:
param_grid = {'f_regression__k':[20,50,80,100,120,150,200,239],
              'lasso__alpha':[0.01,0.1,1,5,10,20,100]}

In [43]:
pipe_lasso = Pipeline([('imp', SimpleImputer()), ('f_regression', SelectKBest(f_regression)), ('lasso', Lasso())])
grid = GridSearchCV(pipe_lasso, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'f_regression__k': 239, 'lasso__alpha': 0.01}


##### Best params

In [173]:
pipe_lasso = Pipeline([('imp', SimpleImputer()), ('lasso', Lasso(alpha = 0.01))])
cv_scores = cross_validate(pipe_lasso, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.2837, 'score_time': 0.005, 'test_rmsle': 0.1313, 'train_rmsle': 0.1278, 'test_r2': 0.8905, 'train_r2': 0.8977}


#### Pipe Elastic Net

In [44]:
param_grid = {'f_regression__k':[20,50,80,100,120,150,200,239],
              'enet__alpha':[0.01,0.1,1,5,10,20,100],
              'enet__l1_ratio':[0,0.01,0.1,0.5,0.8,1]}

In [45]:
pipe_enet = Pipeline([('imp', SimpleImputer()), ('f_regression', SelectKBest(f_regression)), ('enet', ElasticNet())])
grid = GridSearchCV(pipe_enet, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'enet__alpha': 0.01, 'enet__l1_ratio': 0.01, 'f_regression__k': 239}


##### Best params

In [46]:
pipe_enet = Pipeline([('imp', SimpleImputer()), ('enet', ElasticNet(alpha = 0.01, l1_ratio = 0.01))])
cv_scores = cross_validate(pipe_enet, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 0.4218, 'score_time': 0.0047, 'test_rmsle': 0.1117, 'train_rmsle': 0.0998, 'test_r2': 0.9208, 'train_r2': 0.9377}


#### Pipe Xgboost

In [48]:
param_grid = {'f_regression__k':[20,50,80,100,120,150,200,239],
              'xgboost__eta':[0.01,0.05,0.1,0.2],
              'xgboost__subsample':[0.5,0.75,1]}

In [49]:
pipe_xgboost = Pipeline([('imp', SimpleImputer()), ('f_regression', SelectKBest(f_regression)), ('xgboost', XGBRegressor())])
grid = GridSearchCV(pipe_xgboost, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'f_regression__k': 239, 'xgboost__eta': 0.01, 'xgboost__subsample': 0.5}


##### Best Params

In [175]:
pipe_xgboost = Pipeline([('imp', SimpleImputer()), ('xgboost', XGBRegressor(eta = 0.01, subsample = 0.5))])
cv_scores = cross_validate(pipe_xgboost, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 1.6475, 'score_time': 0.0107, 'test_rmsle': 0.1181, 'train_rmsle': 0.0885, 'test_r2': 0.9114, 'train_r2': 0.9509}


#### Pipe Random Forest

In [52]:
param_grid = {'f_regression__k':[20,50,80,100,120,150,200,239],
              'rf__n_estimators':[50,100,150,250],
              'rf__max_depth':[5,10,20,50],
              'rf__min_samples_leaf':[1,3,5,7]}

In [53]:
pipe_rf = Pipeline([('imp', SimpleImputer()), ('f_regression', SelectKBest(f_regression)), ('rf', RandomForestRegressor())])
grid = GridSearchCV(pipe_rf, param_grid = param_grid, cv = 10)
grid.fit(X,y)
print(grid.best_params_)

{'f_regression__k': 239, 'rf__max_depth': 20, 'rf__min_samples_leaf': 1, 'rf__n_estimators': 150}


##### Best Params

In [176]:
pipe_rf = Pipeline([('imp', SimpleImputer()), ('rf', RandomForestRegressor(max_depth = 20, min_samples_leaf = 1, n_estimators = 150))])
cv_scores = cross_validate(pipe_rf, X, y, scoring = scoring, cv = 10)
print(score_to_stats(cv_scores))

{'fit_time': 4.976, 'score_time': 0.033, 'test_rmsle': 0.1342, 'train_rmsle': 0.0507, 'test_r2': 0.8856, 'train_r2': 0.9839}
