In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm,lognorm
from sklearn.preprocessing import StandardScaler
from scipy import stats
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Read the raw training file and keep only the identifier, the candidate
# predictors and the target column.
df_train_original = pd.read_csv('train.csv')
potential_features = [
    'OverallQual',
    'GrLivArea',
    'GarageArea',
    'YearBuilt',
    'TotalBsmtSF',
    'Neighborhood',
]
df_train = df_train_original[['Id', *potential_features, 'SalePrice']]
# The separate test file is not used in this notebook:
# df_test_original = pd.read_csv('test.csv')
# df_test  = df_test_original[['Id'] + potential_features + ['SalePrice']]
df_train

Unnamed: 0,Id,OverallQual,GrLivArea,GarageArea,YearBuilt,TotalBsmtSF,Neighborhood,SalePrice
0,1,7,1710,548,2003,856,CollgCr,208500
1,2,6,1262,460,1976,1262,Veenker,181500
2,3,7,1786,608,2001,920,CollgCr,223500
3,4,7,1717,642,1915,756,Crawfor,140000
4,5,8,2198,836,2000,1145,NoRidge,250000
...,...,...,...,...,...,...,...,...
1455,1456,6,1647,460,1999,953,Gilbert,175000
1456,1457,6,2073,500,1978,1542,NWAmes,210000
1457,1458,7,2340,252,1941,1152,Crawfor,266500
1458,1459,5,1078,240,1950,1078,NAmes,142125


# Data Cleaning

## Outliers

In [3]:
# Remove the two rows identified by Id 1299 and Id 524 in a single drop call.
# NOTE(review): presumably these were picked out as outliers during EDA -
# the selection criterion is not shown in this notebook.
df_train = df_train.drop(index=df_train.index[df_train['Id'].isin([1299, 524])])

## Transformation

In [4]:
# Name of the categorical column that the encoder defined below is applied to.
col = 'Neighborhood'
def categorical_data_transform(df,col):
    """Replace a categorical column with an integer code column.

    The distinct values of ``col`` are taken in the order produced by
    ``groupby`` and numbered 0, 1, 2, ...; that number is stored in a new
    column named ``col + '_Order'`` and ``col`` itself is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is not modified.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame, produced by an inner merge of ``df`` with the lookup
        table, without ``col`` and with ``col + '_Order'`` appended.
    """
    order_col = col + '_Order'
    # Group keys of the column: one entry per distinct value.
    levels = df[[col]].groupby(col).size().index
    # Lookup table: position of each distinct value next to the value itself.
    lookup = pd.DataFrame({order_col: range(len(levels)), col: levels})
    encoded = df.merge(lookup, on=col, how='inner')
    return encoded.drop(columns=[col])
# Swap the neighbourhood label for its integer code.  The call removes the
# original column, so this cell cannot be re-run on its own output; re-run
# the cells above first.
df_train = categorical_data_transform(df=df_train, col=col)
# df_test = categorical_data_transform(df_test,col)
df_train

Unnamed: 0,Id,OverallQual,GrLivArea,GarageArea,YearBuilt,TotalBsmtSF,SalePrice,Neighborhood_Order
0,1,7,1710,548,2003,856,208500,5
1,3,7,1786,608,2001,920,223500,5
2,14,7,1494,840,2006,1494,279500,5
3,23,8,1795,534,2002,1777,230000,5
4,33,8,1234,484,2007,1234,179900,5
...,...,...,...,...,...,...,...,...
1453,1361,5,2601,621,1921,612,189000,18
1454,1377,6,790,160,1930,768,91000,18
1455,1400,6,1608,216,1925,976,137450,18
1456,600,6,1556,452,1980,716,151000,1


In [5]:
def features_handling(df):
    """Add log-transformed area features and presence flags to ``df``.

    New columns, added in this order:

    * ``GrLivArea_Log``   - natural log of ``GrLivArea``.
    * ``HasGarage``       - 1 where ``GarageArea`` > 0, else 0.
    * ``GarageArea_Log``  - log of ``GarageArea`` where it is > 0, else 0.
    * ``HasBsmt``         - 1 where ``TotalBsmtSF`` > 0, else 0.
    * ``TotalBsmtSF_Log`` - log of ``TotalBsmtSF`` where it is > 0, else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``GrLivArea``, ``GarageArea`` and ``TotalBsmtSF`` columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above added.
    """
    df['GrLivArea_Log'] = np.log(df['GrLivArea'])

    # Garage and basement get identical treatment: a 0/1 presence flag plus
    # a log area that is defined as 0 when the area is not positive.
    for area_col, flag_col in (('GarageArea', 'HasGarage'), ('TotalBsmtSF', 'HasBsmt')):
        present = df[area_col] > 0
        df[flag_col] = present.astype('int64')
        # Non-positive (or missing) areas are replaced by 1 before the log so
        # they map to log(1) == 0 without ever evaluating log(0).
        df[area_col + '_Log'] = np.log(df[area_col].where(present, 1))
    return df

In [6]:
# Add the log of the sale price (the modelling target) and then the
# engineered feature columns.
df_train = features_handling(
    df_train.assign(SalePrice_Log=np.log(df_train['SalePrice']))
)
# df_test['SalePrice_Log'] = np.log(df_test['SalePrice'])
# df_test = features_handling(df_test)
df_train

Unnamed: 0,Id,OverallQual,GrLivArea,GarageArea,YearBuilt,TotalBsmtSF,SalePrice,Neighborhood_Order,SalePrice_Log,GrLivArea_Log,HasGarage,GarageArea_Log,HasBsmt,TotalBsmtSF_Log
0,1,7,1710,548,2003,856,208500,5,12.247694,7.444249,1,6.306275,1,6.752270
1,3,7,1786,608,2001,920,223500,5,12.317167,7.487734,1,6.410175,1,6.824374
2,14,7,1494,840,2006,1494,279500,5,12.540758,7.309212,1,6.733402,1,7.309212
3,23,8,1795,534,2002,1777,230000,5,12.345835,7.492760,1,6.280396,1,7.482682
4,33,8,1234,484,2007,1234,179900,5,12.100156,7.118016,1,6.182085,1,7.118016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,1361,5,2601,621,1921,612,189000,18,12.149502,7.863651,1,6.431331,1,6.416732
1454,1377,6,790,160,1930,768,91000,18,11.418615,6.672033,1,5.075174,1,6.643790
1455,1400,6,1608,216,1925,976,137450,18,11.831015,7.382746,1,5.375278,1,6.883463
1456,600,6,1556,452,1980,716,151000,1,11.925035,7.349874,1,6.113682,1,6.573680


# Model Training

In [7]:
# Predictor columns (log-transformed areas plus the encoded neighbourhood)
# and the log-price target, copied out of df_train.
target = 'SalePrice_Log'
features = [
    'OverallQual',
    'GrLivArea_Log',
    'GarageArea_Log',
    'YearBuilt',
    'TotalBsmtSF_Log',
    'Neighborhood_Order',
]
X = df_train.loc[:, features].copy()
y = df_train.loc[:, target].copy()

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set; random_state=0 fixes the shuffle
# so the split is the same on every run.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

In [9]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def evaluate(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2_square)`` in that order.
    """
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    # Reuse the MSE computed above instead of evaluating it a second time.
    rmse = np.sqrt(mse)
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square

## Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the log-price target.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError.  For an unpenalised least-squares
# fit it does not change the predictions, so the default constructor is used.
linear_regressor = LinearRegression()
linear_regressor.fit(X_train,y_train)

train_pred = linear_regressor.predict(X_train)
test_pred = linear_regressor.predict(X_test)

# Metrics rows in the layout ['Model', MAE, MSE, RMSE, R2]; they are reused
# in the comparison tables further down.
result_lin_reg_train = ["Linear Regression Train", *evaluate(y_train, train_pred)]
result_lin_reg_test = ["Linear Regression Test", *evaluate(y_test, test_pred)]
results_df = pd.DataFrame(data=[result_lin_reg_train,result_lin_reg_test], columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'])
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square
0,Linear Regression Train,0.122996,0.028135,0.167735,0.822212
1,Linear Regression Test,0.129638,0.028841,0.169826,0.822871


In [11]:
# One fitted coefficient per predictor, labelled with the predictor name.
coeff_df = pd.DataFrame({'Coefficient': linear_regressor.coef_}, index=X.columns)
coeff_df

Unnamed: 0,Coefficient
OverallQual,0.105302
GrLivArea_Log,0.498527
GarageArea_Log,0.0267
YearBuilt,0.002865
TotalBsmtSF_Log,0.032498
Neighborhood_Order,0.001315


## Lasso Regression

In [12]:
from sklearn.linear_model import Lasso

# L1-penalised linear model with coefficients constrained to be non-negative.
lasso_regressor = Lasso(
    alpha=0.1,
    precompute=True,
    positive=True,
    selection='random',
    random_state=0,
)
lasso_regressor.fit(X_train, y_train)

# Score the fitted model on both partitions.
train_pred = lasso_regressor.predict(X_train)
test_pred = lasso_regressor.predict(X_test)
result_lasso_reg_train = ["Lasso Regression Train"] + list(evaluate(y_train, train_pred))
result_lasso_reg_test = ["Lasso Regression Test"] + list(evaluate(y_test, test_pred))

results_df = pd.DataFrame(
    data=[result_lasso_reg_train, result_lasso_reg_test],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'],
)
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square
0,Lasso Regression Train,0.172885,0.055715,0.236041,0.647933
1,Lasso Regression Test,0.1761,0.056548,0.237798,0.652709


## Decision Tree Regressor

In [13]:
from sklearn.tree import DecisionTreeRegressor
# Unconstrained regression tree.  random_state is fixed (0, matching the
# train/test split and the Lasso model) so that a Restart & Run All
# reproduces the same tree and the same metrics.
dt_Regressor = DecisionTreeRegressor(random_state=0)
dt_Regressor.fit(X_train, y_train)
train_pred = dt_Regressor.predict(X_train)
test_pred = dt_Regressor.predict(X_test)

# Metrics rows in the layout ['Model', MAE, MSE, RMSE, R2].
result_dt_train = ["Decision Tree Regression Train", *evaluate(y_train, train_pred)]
result_dt_test = ["Decision Tree Regression Test", *evaluate(y_test, test_pred)]
results_df = pd.DataFrame(data=[result_dt_train,result_dt_test], columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'])
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square
0,Decision Tree Regression Train,0.000775,3.5e-05,0.005908,0.999779
1,Decision Tree Regression Test,0.151927,0.044392,0.210693,0.727367


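Overfitting! The tree fits the training set almost perfectly (R2 ≈ 0.9998) but reaches only about 0.73 on the test set.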

## Random Forest Regressor

In [14]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyper-parameters.  random_state is fixed (0,
# matching the train/test split) so the bootstrap samples and feature
# draws - and therefore the reported metrics - are reproducible.
rf_Regressor = RandomForestRegressor(random_state=0)
rf_Regressor.fit(X_train, y_train)
train_pred = rf_Regressor.predict(X_train)
test_pred = rf_Regressor.predict(X_test)

# Metrics rows in the layout ['Model', MAE, MSE, RMSE, R2].
result_rf_train = ["Random Forest Regression Train", *evaluate(y_train, train_pred)]
result_rf_test = ["Random Forest Regression Test", *evaluate(y_test, test_pred)]
results_df = pd.DataFrame(data=[result_rf_train,result_rf_test], columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'])
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square
0,Random Forest Regression Train,0.04221,0.00373,0.061071,0.976432
1,Random Forest Regression Test,0.112652,0.024683,0.157109,0.848407


## KNN Regressor

In [15]:
from sklearn.neighbors import KNeighborsRegressor
# k-nearest-neighbours regressor with default settings, fitted on the
# features exactly as prepared above.
knn_regressor = KNeighborsRegressor()
knn_regressor.fit(X_train, y_train)

# Score the fitted model on both partitions.
train_pred = knn_regressor.predict(X_train)
test_pred = knn_regressor.predict(X_test)
result_knn_train = ["KNN Regression Train"] + list(evaluate(y_train, train_pred))
result_knn_test = ["KNN Regression Test"] + list(evaluate(y_test, test_pred))

results_df = pd.DataFrame(
    data=[result_knn_train, result_knn_test],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'],
)
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square
0,KNN Regression Train,0.129632,0.033511,0.183061,0.788241
1,KNN Regression Test,0.161574,0.052813,0.229811,0.675645


## XGBoost Regressor

In [16]:
from xgboost import XGBRegressor
# Gradient-boosted trees (XGBoost) with default hyper-parameters.
xgb_regressor = XGBRegressor()
xgb_regressor.fit(X_train, y_train)

# Score the fitted model on both partitions.
train_pred = xgb_regressor.predict(X_train)
test_pred = xgb_regressor.predict(X_test)
result_xgb_train = ["XGBoost Regression Train"] + list(evaluate(y_train, train_pred))
result_xgb_test = ["XGBoost Regression Test"] + list(evaluate(y_test, test_pred))

results_df = pd.DataFrame(
    data=[result_xgb_train, result_xgb_test],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'],
)
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square
0,XGBoost Regression Train,0.017632,0.00063,0.025101,0.996019
1,XGBoost Regression Test,0.112859,0.022999,0.151655,0.858749


## Light GBM Regressor

In [17]:
from lightgbm import LGBMRegressor
# Gradient-boosted trees (LightGBM) with the 'regression' objective.
lgbm_regressor = LGBMRegressor(objective='regression')
lgbm_regressor.fit(X_train, y_train)

# Score the fitted model on both partitions.
train_pred = lgbm_regressor.predict(X_train)
test_pred = lgbm_regressor.predict(X_test)
result_lgbm_train = ["Light GBM Regression Train"] + list(evaluate(y_train, train_pred))
result_lgbm_test = ["Light GBM Regression Test"] + list(evaluate(y_test, test_pred))

results_df = pd.DataFrame(
    data=[result_lgbm_train, result_lgbm_test],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'],
)
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square
0,Light GBM Regression Train,0.054341,0.006254,0.079084,0.960479
1,Light GBM Regression Test,0.116407,0.025188,0.158707,0.845307


## Stacked Regressor

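The "stacked" regressor used here is a simple averaging ensemble: no meta-learner is trained (as true stacking would require) and no bootstrap resampling is involved (as in bagging). I combine several well-performing models to generate a better result. As we can see above, linear regression, random forest, XGBoost and LightGBM achieve the best results on the testing set, although the tree-based models still show a noticeable gap between training and testing scores. The predicted result is the unweighted average of the predictions of these four already-trained models.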

In [18]:
# Ensemble of four already-fitted models: the prediction for each row is the
# unweighted mean of the individual models' predictions.
regressor_ls = [linear_regressor, rf_Regressor, xgb_regressor, lgbm_regressor]

# One row of predictions per model, averaged down the model axis.
train_pred = np.asarray([model.predict(X_train) for model in regressor_ls]).mean(axis=0)
test_pred = np.asarray([model.predict(X_test) for model in regressor_ls]).mean(axis=0)

result_stacked_train = ["Stacked Regression Train"] + list(evaluate(y_train, train_pred))
result_stacked_test = ["Stacked Regression Test"] + list(evaluate(y_test, test_pred))
results_df = pd.DataFrame(
    data=[result_stacked_train, result_stacked_test],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'],
)
results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square
0,Stacked Regression Train,0.053677,0.00559,0.074768,0.964675
1,Stacked Regression Test,0.108577,0.021811,0.147685,0.866048


# Metrics Comparison


In [19]:
# Training-set metrics of every model, one row per model.
results_train_df = pd.DataFrame(
    data=[
        result_lin_reg_train,
        result_lasso_reg_train,
        result_dt_train,
        result_rf_train,
        result_knn_train,
        result_xgb_train,
        result_lgbm_train,
        result_stacked_train,
    ],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'],
)
results_train_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square
0,Linear Regression Train,0.122996,0.028135,0.167735,0.822212
1,Lasso Regression Train,0.172885,0.055715,0.236041,0.647933
2,Decision Tree Regression Train,0.000775,3.5e-05,0.005908,0.999779
3,Random Forest Regression Train,0.04221,0.00373,0.061071,0.976432
4,KNN Regression Train,0.129632,0.033511,0.183061,0.788241
5,XGBoost Regression Train,0.017632,0.00063,0.025101,0.996019
6,Light GBM Regression Train,0.054341,0.006254,0.079084,0.960479
7,Stacked Regression Train,0.053677,0.00559,0.074768,0.964675


In [20]:
import plotly.express as px
# Training-set MAE per model; the title lets the figure stand on its own.
fig = px.bar(results_train_df, x='Model', y='MAE', text_auto=True, title='Training set: MAE by model')
fig.show()

In [21]:
import plotly.express as px
# Training-set MSE per model; the title lets the figure stand on its own.
fig = px.bar(results_train_df, x='Model', y='MSE', text_auto=True, title='Training set: MSE by model')
fig.show()

In [22]:
import plotly.express as px
# Training-set RMSE per model; the title lets the figure stand on its own.
fig = px.bar(results_train_df, x='Model', y='RMSE', text_auto=True, title='Training set: RMSE by model')
fig.show()

In [23]:
import plotly.express as px
# Training-set R2 per model; the title lets the figure stand on its own.
fig = px.bar(results_train_df, x='Model', y='R2 Square', text_auto=True, title='Training set: R2 by model')
fig.show()

In [24]:
# Test-set metrics of every model, one row per model.
results_test_df = pd.DataFrame(
    data=[
        result_lin_reg_test,
        result_lasso_reg_test,
        result_dt_test,
        result_rf_test,
        result_knn_test,
        result_xgb_test,
        result_lgbm_test,
        result_stacked_test,
    ],
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'],
)
results_test_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square
0,Linear Regression Test,0.129638,0.028841,0.169826,0.822871
1,Lasso Regression Test,0.1761,0.056548,0.237798,0.652709
2,Decision Tree Regression Test,0.151927,0.044392,0.210693,0.727367
3,Random Forest Regression Test,0.112652,0.024683,0.157109,0.848407
4,KNN Regression Test,0.161574,0.052813,0.229811,0.675645
5,XGBoost Regression Test,0.112859,0.022999,0.151655,0.858749
6,Light GBM Regression Test,0.116407,0.025188,0.158707,0.845307
7,Stacked Regression Test,0.108577,0.021811,0.147685,0.866048


On the testing set, MAE/MSE/RMSE are lowest for the stacked model, followed by XGBoost, random forest and Light GBM, with linear regression not far behind. So we may focus on these models in production.

In [25]:
import plotly.express as px
# Test-set MAE per model; the title lets the figure stand on its own.
fig = px.bar(results_test_df, x='Model', y='MAE', text_auto=True, title='Test set: MAE by model')
fig.show()

In [26]:
import plotly.express as px
# Test-set MSE per model; the title lets the figure stand on its own.
fig = px.bar(results_test_df, x='Model', y='MSE', text_auto=True, title='Test set: MSE by model')
fig.show()

In [27]:
import plotly.express as px
# Test-set RMSE per model; the title lets the figure stand on its own.
fig = px.bar(results_test_df, x='Model', y='RMSE', text_auto=True, title='Test set: RMSE by model')
fig.show()

In [28]:
import plotly.express as px
# Test-set R2 per model; the title lets the figure stand on its own.
fig = px.bar(results_test_df, x='Model', y='R2 Square', text_auto=True, title='Test set: R2 by model')
fig.show()

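The R2 of the stacked model is the highest on the testing set (0.866). This means that, among the models compared, it explains the largest share of the variance of the log sale price in the held-out test data; it also has the lowest test MAE/MSE/RMSE, i.e. the most accurate predictions.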