# <font color="orange"> House Prices - Advanced Regression Techniques </font>

#### Predict sales prices and practice feature engineering, RFs, and gradient boosting

In [117]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly import graph_objects as go
from plotly.subplots import make_subplots

In [118]:
# Local dataset and output locations (absolute, machine-specific paths).
# DataFolder is joined to file names by plain string concatenation when the
# CSV is loaded, so it must keep its trailing slash.
DataFolder = "/Users/manideepbangaru/Documents/EDAnMLApply/Datasets/house-prices-advanced-regression-techniques/"
# NOTE(review): OutputFolder is not referenced in the cells visible here; confirm it is used later.
OutputFolder = "/Users/manideepbangaru/Documents/EDAnMLApply/Output"

In [121]:
# Load the training set. The first CSV column is read as the index and then
# discarded in favour of a fresh 0..n-1 row index.
hdf = pd.read_csv(DataFolder + "train.csv", index_col=0).reset_index(drop=True)
# Separate the target: pop() returns the SalePrice column and removes it from the features.
hdf_y = hdf.pop("SalePrice")

In [122]:
# Preview the first rows of the feature frame (the SalePrice target has already been removed).
hdf.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


### Dealing with Missing Values

In [123]:
# Percentage of missing values in every column, rounded to two decimals.
null_pct_by_col = (hdf.isnull().sum() / len(hdf) * 100).round(2)

# Remove each column in which at least half of the values are missing,
# reporting the column name and its missing-value percentage.
for col_name, pct in null_pct_by_col[null_pct_by_col >= 50].items():
    hdf.drop(col_name, axis=1, inplace=True)
    print("dropped column %s which has %s percentage of null values"%(col_name,pct))

dropped column Alley which has 93.77 percentage of null values
dropped column PoolQC which has 99.52 percentage of null values
dropped column Fence which has 80.75 percentage of null values
dropped column MiscFeature which has 96.3 percentage of null values


In [124]:
# Rebuild a clean 0..n-1 row index, then display the frame.
hdf = hdf.reset_index(drop=True)
hdf

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,40,0,0,0,0,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,60,0,0,0,0,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,112,0,0,0,0,4,2010,WD,Normal


In [125]:
# Percentage of missing values in every row, rounded to two decimals.
# Computed in one vectorised pass instead of a Python-level loop over rows.
row_null_pct = (hdf.isnull().sum(axis=1) / len(hdf.columns) * 100).round(2)

# Rows where more than half of the features are missing.
sparse_rows = row_null_pct.index[row_null_pct > 50]
for idx in sparse_rows:
    print("dropping row %s"%idx)

# Drop the same rows from the features AND the target: later cells mask
# hdf_y positionally with a boolean array built from the feature rows, so
# both must keep the same length and row order.
hdf = hdf.drop(index=sparse_rows)
hdf_y = hdf_y.drop(index=sparse_rows)

### Separating Numerical and Categorical data

In [126]:
# Inspect the column dtypes; they drive the numeric / categorical split further down.
hdf.dtypes

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 75, dtype: object

In [127]:
# Cast MSSubClass to object so the dtype-based split places it with the
# categorical columns rather than the numeric ones.
hdf = hdf.astype({"MSSubClass": "object"})
hdf.dtypes

MSSubClass        object
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 75, dtype: object

In [128]:
# Split the features by dtype: object columns are categorical, everything else is numeric.
is_categorical = hdf.dtypes == object
Num_Data = hdf.loc[:, ~is_categorical]
Cat_Data = hdf.loc[:, is_categorical]

In [129]:
# Sanity check: the numeric and categorical subsets together cover every column of hdf.
Num_Data.shape[1] + Cat_Data.shape[1] == hdf.shape[1]

True

### Imputation

In [130]:
from sklearn.impute import KNNImputer

In [131]:
# Percentage of missing values, shown only for numeric columns that contain at least one NaN.
cols_with_nulls = Num_Data.columns[Num_Data.isnull().any()]
Num_Data[cols_with_nulls].isnull().sum() / len(Num_Data) * 100

LotFrontage    17.739726
MasVnrArea      0.547945
GarageYrBlt     5.547945
dtype: float64

In [132]:
# KNN-based imputer for the numeric features, configured to use 3 neighbours.
imputer = KNNImputer(n_neighbors=3)

In [133]:
# Fit the imputer on the raw numeric matrix (NaNs included).
# NOTE(review): the features are not scaled at this point, so neighbour
# distances may be dominated by large-valued columns; confirm this is acceptable.
imputer.fit(Num_Data.values)

In [134]:
# Fill the missing numeric values with the fitted imputer; the result is
# kept as a bare array of values and is re-labelled in the next step.
Num_Data_Trans_values = imputer.transform(Num_Data.values)

In [135]:
# Wrap the imputed values back into a DataFrame with the original column names.
# No index is passed, so the rows get a fresh default index rather than Num_Data's index.
Num_Data_Trans = pd.DataFrame(Num_Data_Trans_values,columns = Num_Data.columns)

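Categorical variables are converted to dummy (one-hot) variables. Note that no separate imputation step is applied to them afterwards: with its default settings, `pd.get_dummies` does not create a column for missing values, so a missing category appears as all zeros across that feature's dummy columns.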

In [136]:
# One-hot encode every categorical column; Cat_Data is overwritten with the encoded frame.
# NOTE(review): with the default dummy_na=False, a missing category should
# encode as all-zero dummies for that feature; confirm this is the intended
# handling of missing categorical values.
Cat_Data = pd.get_dummies(Cat_Data)

  uniques = Index(uniques)


### Outlier treatment

In [137]:
from scipy.stats import zscore

In [138]:
# Z-scores of the imputed numeric features, used below to flag outlier rows.
# NOTE(review): relies on scipy's default axis (per-column standardisation); confirm.
zscores = zscore(Num_Data_Trans)

In [139]:
# Boolean row mask: True only for rows in which every numeric feature has |z| < 3.
abs_z_scores = np.abs(zscores)
filtered_entries = (abs_z_scores < 3).all(axis=1)

In [140]:
# Keep only the non-outlier rows of the IMPUTED numeric data.
# Filtering the raw Num_Data here would discard the KNN imputation and let
# the original NaNs flow into standardisation and the training set.
New_Num_Data = Num_Data_Trans[filtered_entries]

In [141]:
# Renumber the surviving rows 0..n-1 so they can be aligned positionally with the other filtered frames.
New_Num_Data = New_Num_Data.reset_index(drop=True)

### Standardization

In [142]:
from sklearn.preprocessing import StandardScaler

In [143]:
# Standardise the numeric features with a StandardScaler, then restore the column labels.
# NOTE(review): the scaler is fitted before the train/test split, so test
# rows influence the scaling statistics; confirm this is acceptable.
SS = StandardScaler()
scaled_values = SS.fit_transform(New_Num_Data)
Standardized_Num_Data = pd.DataFrame(scaled_values, columns=New_Num_Data.columns)

### Preparing the data

In [144]:
# Apply the outlier mask to the one-hot encoded categoricals and renumber the
# rows so they line up with Standardized_Num_Data.
# reset_index must NOT be called with inplace=True here: that returns None,
# and pd.concat silently skips None entries, which would leave df without
# any categorical columns.
Cat_Data_filtered = Cat_Data[filtered_entries].reset_index(drop=True)

# Final design matrix: standardised numeric features next to the categorical dummies.
df = pd.concat([Standardized_Num_Data, Cat_Data_filtered], axis=1)

# Guard against a silent misalignment or a dropped block of columns.
assert len(df) == len(Standardized_Num_Data)
assert df.shape[1] == Standardized_Num_Data.shape[1] + Cat_Data_filtered.shape[1]

In [157]:
# Apply the same outlier mask to the target so it matches the rows kept in the features.
y_filt = hdf_y[filtered_entries]

In [161]:
# Renumber the target 0..n-1 so its index matches the feature frame's index.
y_filt = y_filt.reset_index(drop=True)

### Train Test Split

In [163]:
from sklearn.model_selection import train_test_split

In [164]:
# Hold out 30% of the rows for evaluation.
# A fixed random_state makes the split (and everything trained on it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df, y_filt, test_size=0.3, random_state=42
)