# <font color="orange"> House Prices - Advanced Regression Techniques </font>

#### Predict sales prices and practice feature engineering, RFs, and gradient boosting

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

In [3]:
DataFolder = "/Users/manideepbangaru/Documents/EDAnMLApply/Datasets/house-prices-advanced-regression-techniques/"
OutputFolder = "/Users/manideepbangaru/Documents/EDAnMLApply/Output"

In [4]:
hdf = pd.read_csv(DataFolder+"train.csv")
hdf.reset_index(drop=True, inplace=True)

In [5]:
hdf

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [6]:
hdf_y = hdf["SalePrice"]

In [7]:
hdf.drop(columns=["SalePrice","Id"],inplace=True)

In [8]:
hdf["MSSubClass"] = hdf["MSSubClass"].astype("object")
hdf.dtypes

MSSubClass        object
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 79, dtype: object

In [9]:
hdf[hdf.columns[hdf.isnull().any()]].isnull().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [10]:
hdf[hdf.columns[hdf.dtypes != "Object"]]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal


### Missing value treatment

In [11]:
# columns treatment
for col in hdf.columns:
    nullVals = round(hdf[col].isnull().sum()/len(hdf)*100,2)
    if nullVals > 50:
        hdf.drop(columns=col, axis=1, inplace=True)

In [12]:
# row treatment
for i in range(len(hdf)):
    nullVals = hdf.loc[i].isnull().sum()/len(hdf.columns)*100
    if nullVals > 50:
        hdf.drop(i, axis=0, inplace=True)

In [13]:
hdf.shape

(1460, 75)

### Separating Numerical and Categorical Data

In [14]:
hdf_num = hdf[hdf.columns[hdf.dtypes != "object"]]
hdf_cat = hdf[hdf.columns[hdf.dtypes == "object"]]

In [15]:
hdf_num[hdf_num.columns[hdf_num.isnull().any()]].isnull().sum()

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64

In [16]:
hdf_cat[hdf_cat.columns[hdf_cat.isnull().any()]].isnull().sum()

MasVnrType        8
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Electrical        1
FireplaceQu     690
GarageType       81
GarageFinish     81
GarageQual       81
GarageCond       81
dtype: int64

### Imputation

In [17]:
from sklearn.impute import KNNImputer

In [53]:
imputer = KNNImputer(n_neighbors=3)
imputer_train = imputer.fit(hdf_num)
hdf_num_imputed = imputer_train.transform(hdf_num)

In [54]:
hdf_num_imputed = pd.DataFrame(hdf_num_imputed,columns=hdf_num.columns)

In [55]:
from sklearn.impute import SimpleImputer

In [56]:
simpleImpute = SimpleImputer(strategy="most_frequent")

In [76]:
imputer_cat = simpleImpute.fit(hdf_cat)
hdf_cat_imputed = pd.DataFrame(simpleImpute.fit_transform(hdf_cat),columns=hdf_cat.columns)

In [58]:
print(hdf_num_imputed.isnull().sum().sum())
print(hdf_cat_imputed.isnull().sum().sum())

0
0


### Outlier Treatment

In [59]:
from scipy.stats import zscore

In [60]:
import numpy as np
zscores = zscore(hdf_num_imputed)

In [61]:
filtered_entries = (np.abs(zscores) < 3).all(axis = 1)

In [62]:
hdf_num_imputed_outlier = hdf_num_imputed[filtered_entries]
hdf_num_imputed_outlier.reset_index(drop=True,inplace=True)

In [63]:
hdf_cat_imputed_outlier = hdf_cat_imputed[filtered_entries]
hdf_cat_imputed_outlier.reset_index(drop=True,inplace=True)

In [64]:
hdf_y = hdf_y[filtered_entries]
hdf_y.reset_index(drop=True,inplace=True)

### Creating dummy variables

In [65]:
hdf_cat_imputed_outlier_dummy = pd.get_dummies(hdf_cat_imputed_outlier)

  uniques = Index(uniques)


### Combining dataset

In [66]:
complete_hdf = pd.concat([hdf_num_imputed_outlier,hdf_cat_imputed_outlier_dummy],axis=1)

### Feature selection

In [67]:
corr_scores = hdf_num_imputed_outlier.corr()

In [68]:
corr_scores_df = pd.DataFrame(corr_scores.unstack().sort_values(ascending=False))

In [69]:
corr_scores_df

Unnamed: 0,Unnamed: 1,0
LotFrontage,LotFrontage,1.0
FullBath,FullBath,1.0
OverallCond,OverallCond,1.0
YearBuilt,YearBuilt,1.0
YearRemodAdd,YearRemodAdd,1.0
...,...,...
MoSold,KitchenAbvGr,
MoSold,PoolArea,
YrSold,BsmtHalfBath,
YrSold,KitchenAbvGr,


### Train Test split

In [70]:
from sklearn.model_selection import train_test_split

In [71]:
X_train,X_test,y_train,y_test = train_test_split(complete_hdf.values,hdf_y,random_state=123)

ValueError: Found input variables with inconsistent numbers of samples: [1029, 721]

### Model Building

In [None]:
from statsmodels.api import OLS

In [None]:
linear_model = OLS(y_train,X_train).fit()

In [None]:
print_model = linear_model.summary()

In [None]:
print(print_model)

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.952
Model:                            OLS   Adj. R-squared:                  0.935
Method:                 Least Squares   F-statistic:                     55.57
Date:                Fri, 27 May 2022   Prob (F-statistic):          1.90e-282
Time:                        23:41:35   Log-Likelihood:                -8474.1
No. Observations:                 771   AIC:                         1.736e+04
Df Residuals:                     566   BIC:                         1.831e+04
Df Model:                         204                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1          -109.4960     64.577     -1.696      0.0

### generating predictions for kaggle test dataset

In [None]:
test_df = pd.read_csv("/Users/manideepbangaru/Documents/EDAnMLApply/Datasets/house-prices-advanced-regression-techniques/test.csv")

In [None]:
test_df.drop(columns="Id",inplace=True)

In [None]:
test_df["MSSubClass"] = test_df["MSSubClass"].astype("object")

In [75]:
test_df_num = test_df[hdf_num.columns]
print(test_df_num.isnull().sum().sum())
test_df_num_imputed = imputer_train.transform(test_df_num)
test_df_num_imputed = pd.DataFrame(test_df_num_imputed,columns = test_df_num.columns)
print(test_df_num_imputed.isnull().sum().sum())

330
0


In [77]:
test_df_cat = test_df[hdf_cat.columns]
test_df_cat_imputed = imputer_cat.transform(test_df_cat)
test_df_cat_imputed = pd.DataFrame(test_df_cat_imputed,columns=test_df_cat.columns)

In [79]:
test_df_cat_imputed_dummy = pd.get_dummies(test_df_cat_imputed)
test_df_cat_imputed_dummy.shape

  uniques = Index(uniques)


(1459, 239)

In [82]:
Complete_test = pd.concat([test_df_num_imputed,test_df_cat_imputed_dummy],axis=1)

In [94]:
add_columns = [ele for ele in Complete_test.columns if ele in complete_hdf.columns]

In [95]:
Complete_test = Complete_test[add_columns]

In [99]:
preds = linear_model.predict(Complete_test)

In [100]:
preds.to_csv("/Users/manideepbangaru/Documents/EDAnMLApply/Output/preds_v1.csv",index=False)