# <font color="orange"> House Prices - Advanced Regression Techniques </font>

#### Predict sales prices and practice feature engineering, RFs, and gradient boosting

In [3]:
import pandas as pd
import numpy as np
import plotly.express as px

In [4]:
DataFolder = "/Users/manideepbangaru/Documents/EDAnMLApply/Datasets/house-prices-advanced-regression-techniques/"
OutputFolder = "/Users/manideepbangaru/Documents/EDAnMLApply/Output"

In [5]:
hdf = pd.read_csv(DataFolder+"train.csv")
hdf.reset_index(drop=True, inplace=True)

In [6]:
hdf

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [7]:
hdf_y = hdf["SalePrice"]

In [8]:
hdf.drop(columns=["SalePrice","Id"],inplace=True)

In [9]:
hdf["MSSubClass"] = hdf["MSSubClass"].astype("object")
hdf.dtypes

MSSubClass        object
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 79, dtype: object

In [10]:
hdf[hdf.columns[hdf.isnull().any()]].isnull().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [11]:
hdf[hdf.columns[hdf.dtypes != "Object"]]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal


### Missing value treatment

In [12]:
# columns treatment
for col in hdf.columns:
    nullVals = round(hdf[col].isnull().sum()/len(hdf)*100,2)
    if nullVals > 50:
        hdf.drop(columns=col, axis=1, inplace=True)

In [13]:
# row treatment
for i in range(len(hdf)):
    nullVals = hdf.loc[i].isnull().sum()/len(hdf.columns)*100
    if nullVals > 50:
        hdf.drop(i, axis=0, inplace=True)

In [14]:
hdf.shape

(1460, 75)

### Separating Numerical and Categorical Data

In [15]:
hdf_num = hdf[hdf.columns[hdf.dtypes != "object"]]
hdf_cat = hdf[hdf.columns[hdf.dtypes == "object"]]

In [16]:
hdf_num[hdf_num.columns[hdf_num.isnull().any()]].isnull().sum()

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64

In [17]:
hdf_cat[hdf_cat.columns[hdf_cat.isnull().any()]].isnull().sum()

MasVnrType        8
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Electrical        1
FireplaceQu     690
GarageType       81
GarageFinish     81
GarageQual       81
GarageCond       81
dtype: int64

### Imputation

In [18]:
from sklearn.impute import KNNImputer

In [19]:
imputer = KNNImputer(n_neighbors=3)
hdf_num_imputed = imputer.fit_transform(hdf_num)

In [20]:
hdf_num_imputed = pd.DataFrame(hdf_num_imputed,columns=hdf_num.columns)

In [21]:
from sklearn.impute import SimpleImputer

In [22]:
simpleImpute = SimpleImputer(strategy="most_frequent")

In [23]:
hdf_cat_imputed = pd.DataFrame(simpleImpute.fit_transform(hdf_cat),columns=hdf_cat.columns)

In [24]:
print(hdf_num_imputed.isnull().sum().sum())
print(hdf_cat_imputed.isnull().sum().sum())

0
0


### Outlier Treatment

In [25]:
from scipy.stats import zscore

In [29]:
import numpy as np
zscores = zscore(hdf_num_imputed)

In [30]:
filtered_entries = (np.abs(zscores) < 3).all(axis = 1)

In [34]:
hdf_num_imputed_outlier = hdf_num_imputed[filtered_entries]
hdf_num_imputed_outlier.reset_index(drop=True,inplace=True)

In [35]:
hdf_cat_imputed_outlier = hdf_cat_imputed[filtered_entries]
hdf_cat_imputed_outlier.reset_index(drop=True,inplace=True)

In [36]:
hdf_y = hdf_y[filtered_entries]
hdf_y.reset_index(drop=True,inplace=True)

### Creating dummy variables

In [43]:
hdf_cat_imputed_outlier_dummy = pd.get_dummies(hdf_cat_imputed_outlier)

  uniques = Index(uniques)


### Combining dataset

In [40]:
complete_hdf = pd.concat([hdf_num_imputed_outlier,hdf_cat_imputed_outlier_dummy],axis=0)

### Feature selection

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
forest = RandomForestClassifier(n_jobs=1, class_weight="balanced", max_depth=5)
forest.fit(complete_hdf.values,hdf_y.values)

ValueError: could not convert string to float: 'RL'