# Housing Sale Prediction

In [88]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns

## 1. Data import and tidy up

In [131]:
train_df = pd.read_csv('../data/housing/train.csv', keep_default_na=False)
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [132]:
train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [133]:
train_df.dtypes.unique()

array([dtype('int64'), dtype('O')], dtype=object)

In [134]:
train_df.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64

In [135]:
test_df = pd.read_csv('../data/housing/test.csv', keep_default_na=False)
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [136]:
test_df.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 80, dtype: int64

## 1. Data Preprocessing and Analysis

In [137]:
train_df['Utilities'].unique()

array(['AllPub', 'NoSeWa'], dtype=object)

In [163]:
numerical_cols = ['MasVnrArea', 'LotFrontage', 'GarageYrBlt']
train_df[numerical_cols].head()

Unnamed: 0,MasVnrArea,LotFrontage,GarageYrBlt
0,196,65,2003
1,0,80,1976
2,162,68,2001
3,0,60,1998
4,350,84,2000


In [164]:
train_df[numerical_cols].dtypes

MasVnrArea     object
LotFrontage    object
GarageYrBlt    object
dtype: object

In [172]:
mask = (train_df[numerical_cols] == "NA")
train_df[mask]

Unnamed: 0,MasVnrArea,LotFrontage,GarageYrBlt
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
1455,False,False,False
1456,False,False,False
1457,False,False,False
1458,False,False,False


These need to be converted to numerical values, as they are currently shown as objects, and will incorrectly be on hot encoded.

In [167]:
train_df[numerical_cols] = train_df[numerical_cols].astype(int)
train_df[numerical_cols].dtypes

ValueError: invalid literal for int() with base 10: 'NA'

## 2. Data Preprocessing

### 2.1 Ordinal Encoding

Identified the below cols as ordinal values.

In [138]:
ordinal_cols = [
    'LandSlope',
    'OverallQual',
    'OverallCond',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence'
]

In [139]:
train_df['LandSlope'].unique()

array(['Gtl', 'Mod', 'Sev'], dtype=object)

In [140]:
full_categories = [ np.unique(np.append(train_df[col], test_df[col])) for col in ordinal_cols ]
full_categories

[array(['Gtl', 'Mod', 'Sev'], dtype=object),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array(['Ex', 'Fa', 'Gd', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'TA'], dtype=object),
 array(['Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Av', 'Gd', 'Mn', 'NA', 'No'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'NA', 'Rec', 'Unf'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'NA', 'Rec', 'Unf'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Fin', 'NA', 'RFn', 'Unf'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA'], dtype=object),
 array(['GdPrv', 'GdWo', 'MnPrv', 'MnWw', 'NA'], dtype=object)]

In [141]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(categories=full_categories)

In [142]:
train_df[ordinal_cols] = ordinal_encoder.fit_transform(train_df[ordinal_cols])

In [143]:
test_df[ordinal_cols] = ordinal_encoder.transform(test_df[ordinal_cols])

### 2.2 Binary Encoding

Only need to binary encode central air.

In [144]:
train_df['CentralAir'] = train_df['CentralAir'].replace({'Yes': 1, "No": 0})
test_df['CentralAir'] = test_df['CentralAir'].replace({'Yes': 1, "No": 0})

### 2.2 One Hot Encoder Encoding

In [145]:
columns_to_ignore = np.append(ordinal_cols, 'CentralAir')

In [146]:
categorical_mask = (train_df.dtypes == object)
categorical_cols = train_df.columns[categorical_mask]
categorical_cols = [col for col in categorical_cols if col not in columns_to_ignore]
print(f"num categorical cols {len(categorical_cols)}")

num categorical cols 29


In [147]:
actual_numh_oh_cols = num_onh_cols.loc[num_onh_cols > 1]
actual_numh_oh_cols

MasVnrArea       328
LotFrontage      111
GarageYrBlt       98
Neighborhood      25
Exterior2nd       16
Exterior1st       15
SaleType           9
Condition1         9
Condition2         8
HouseStyle         8
RoofMatl           8
GarageType         7
Functional         7
Electrical         6
Heating            6
Foundation         6
SaleCondition      6
RoofStyle          6
MasVnrType         5
BldgType           5
LotConfig          5
MiscFeature        5
MSZoning           5
LandContour        4
LotShape           4
Alley              3
PavedDrive         3
CentralAir         2
Utilities          2
Street             2
dtype: int64

In [148]:
actual_numh_oh_cols -= 1

In [153]:
train_df.drop('Id', axis=1, inplace=True)
train_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,3.0,4.0,,0,2,2008,WD,Normal,208500
1,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,3.0,4.0,,0,5,2007,WD,Normal,181500
2,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,3.0,4.0,,0,9,2008,WD,Normal,223500
3,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,3.0,4.0,,0,2,2006,WD,Abnorml,140000
4,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,3.0,4.0,,0,12,2008,WD,Normal,250000


In [157]:
one_hot_training_values = (train_df[categorical_cols]
                           .apply(lambda x : x.unique())
                           .apply(list))
one_hot_training_values

MSZoning                                 [RL, RM, C (all), FV, RH]
LotFrontage      [65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 9...
Street                                                [Pave, Grvl]
Alley                                             [NA, Grvl, Pave]
LotShape                                      [Reg, IR1, IR2, IR3]
LandContour                                   [Lvl, Bnk, Low, HLS]
Utilities                                         [AllPub, NoSeWa]
LotConfig                      [Inside, FR2, Corner, CulDSac, FR3]
Neighborhood     [CollgCr, Veenker, Crawfor, NoRidge, Mitchel, ...
Condition1       [Norm, Feedr, PosN, Artery, RRAe, RRNn, RRAn, ...
Condition2       [Norm, Artery, RRNn, Feedr, PosN, PosA, RRAn, ...
BldgType                     [1Fam, 2fmCon, Duplex, TwnhsE, Twnhs]
HouseStyle       [2Story, 1Story, 1.5Fin, 1.5Unf, SFoyer, SLvl,...
RoofStyle               [Gable, Hip, Gambrel, Mansard, Flat, Shed]
RoofMatl         [CompShg, WdShngl, Metal, WdShake, Membran, T

Review of the above indicates that not all of these need to be one_hot encoded. Some contain largely numerical values.