# Housing Sale Prediction

In [88]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns

## Data import and tidy up

In [89]:
train_df = pd.read_csv('../data/housing/train.csv', keep_default_na=False)
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [90]:
train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [91]:
train_df.dtypes.unique()

array([dtype('int64'), dtype('O')], dtype=object)

In [92]:
train_df.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64

In [93]:
test_df = pd.read_csv('../data/housing/test.csv', keep_default_na=False)
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [94]:
test_df.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 80, dtype: int64

## Ordinal Encoding

Identified the below cols as ordinal values.

In [95]:
ordinal_cols = [
    'LandSlope',
    'OverallQual',
    'OverallCond',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence'
]

In [96]:
train_df['LandSlope'].unique()

array(['Gtl', 'Mod', 'Sev'], dtype=object)

In [98]:
full_categories = [ np.unique(np.append(train_df[col], test_df[col])) for col in ordinal_cols ]
full_categories

[array(['Gtl', 'Mod', 'Sev'], dtype=object),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array(['Ex', 'Fa', 'Gd', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'TA'], dtype=object),
 array(['Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Av', 'Gd', 'Mn', 'NA', 'No'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'NA', 'Rec', 'Unf'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'NA', 'Rec', 'Unf'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Fin', 'NA', 'RFn', 'Unf'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA'], dtype=object),
 array(['GdPrv', 'GdWo', 'MnPrv', 'MnWw', 'NA'], dtype=object)]

In [99]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(categories=full_categories)

In [100]:
train_df[ordinal_cols] = ordinal_encoder.fit_transform(train_df[ordinal_cols])

In [101]:
test_df[ordinal_cols] = ordinal_encoder.transform(test_df[ordinal_cols])

## Data Preprocessing

In [34]:
categorical_mask = (train_df.dtypes == object)
categorical_cols = train_df.columns[categorical_mask]
print(f"num categorical cols {len(categorical_cols)}")

num categorical cols 46


In [36]:
num_onh_cols = (train_df[categorical_cols]
                .apply(lambda x : x.nunique())
                .sort_values(ascending=False))
num_onh_cols

MasVnrArea       328
LotFrontage      111
GarageYrBlt       98
Neighborhood      25
Exterior2nd       16
Exterior1st       15
SaleType           9
Condition1         9
HouseStyle         8
Condition2         8
RoofMatl           8
BsmtFinType1       7
Functional         7
BsmtFinType2       7
GarageType         7
RoofStyle          6
FireplaceQu        6
GarageQual         6
GarageCond         6
Electrical         6
Foundation         6
Heating            6
SaleCondition      6
Fence              5
HeatingQC          5
MiscFeature        5
MSZoning           5
BsmtExposure       5
MasVnrType         5
BsmtCond           5
BldgType           5
LotConfig          5
ExterCond          5
BsmtQual           5
KitchenQual        4
ExterQual          4
GarageFinish       4
LandContour        4
PoolQC             4
LotShape           4
LandSlope          3
PavedDrive         3
Alley              3
CentralAir         2
Utilities          2
Street             2
dtype: int64

In [38]:
actual_numh_oh_cols = num_onh_cols.loc[num_onh_cols > 1]
actual_numh_oh_cols

MasVnrArea       328
LotFrontage      111
GarageYrBlt       98
Neighborhood      25
Exterior2nd       16
Exterior1st       15
SaleType           9
Condition1         9
HouseStyle         8
Condition2         8
RoofMatl           8
BsmtFinType1       7
Functional         7
BsmtFinType2       7
GarageType         7
RoofStyle          6
FireplaceQu        6
GarageQual         6
GarageCond         6
Electrical         6
Foundation         6
Heating            6
SaleCondition      6
Fence              5
HeatingQC          5
MiscFeature        5
MSZoning           5
BsmtExposure       5
MasVnrType         5
BsmtCond           5
BldgType           5
LotConfig          5
ExterCond          5
BsmtQual           5
KitchenQual        4
ExterQual          4
GarageFinish       4
LandContour        4
PoolQC             4
LotShape           4
LandSlope          3
PavedDrive         3
Alley              3
CentralAir         2
Utilities          2
Street             2
dtype: int64

In [39]:
actual_numh_oh_cols -= 1

In [40]:
actual_numh_oh_cols.sum()

759