# Housing Sale Prediction

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
2024-03-17 17:24:18.427667: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-17 17:24:20.871848: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-17 17:24:20.872268: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-17 17:2

## 1. Data import and tidy up

In [3]:
train_df = pd.read_csv('../data/housing/train.csv', keep_default_na=False)
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
train_df.dtypes.unique()

array([dtype('int64'), dtype('O')], dtype=object)

In [6]:
train_df.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64

In [7]:
test_df = pd.read_csv('../data/housing/test.csv', keep_default_na=False)
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [8]:
test_df.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 80, dtype: int64

## 1. Data Preprocessing and Analysis

In [9]:
train_df['Utilities'].unique()

array(['AllPub', 'NoSeWa'], dtype=object)

In [10]:
numerical_cols = ['MasVnrArea', 'LotFrontage', 'GarageYrBlt']
train_df[numerical_cols].head()

Unnamed: 0,MasVnrArea,LotFrontage,GarageYrBlt
0,196,65,2003
1,0,80,1976
2,162,68,2001
3,0,60,1998
4,350,84,2000


In [11]:
train_df[numerical_cols].dtypes

MasVnrArea     object
LotFrontage    object
GarageYrBlt    object
dtype: object

In [12]:
train_df[numerical_cols[1]].unique()

array(['65', '80', '68', '60', '84', '85', '75', 'NA', '51', '50', '70',
       '91', '72', '66', '101', '57', '44', '110', '98', '47', '108',
       '112', '74', '115', '61', '48', '33', '52', '100', '24', '89',
       '63', '76', '81', '95', '69', '21', '32', '78', '121', '122', '40',
       '105', '73', '77', '64', '94', '34', '90', '55', '88', '82', '71',
       '120', '107', '92', '134', '62', '86', '141', '97', '54', '41',
       '79', '174', '99', '67', '83', '43', '103', '93', '30', '129',
       '140', '35', '37', '118', '87', '116', '150', '111', '49', '96',
       '59', '36', '56', '102', '58', '38', '109', '130', '53', '137',
       '45', '106', '104', '42', '39', '144', '114', '128', '149', '313',
       '168', '182', '138', '160', '152', '124', '153', '46'],
      dtype=object)

In [19]:
train_df[numerical_cols] = train_df[numerical_cols].replace("NA", '0')

These need to be converted to numerical values, as they are currently shown as objects, and will incorrectly be on hot encoded.

In [20]:
train_df[numerical_cols] = train_df[numerical_cols].astype(int)
train_df[numerical_cols].dtypes

MasVnrArea     int64
LotFrontage    int64
GarageYrBlt    int64
dtype: object

## 2. Data Preprocessing

### 2.1 Ordinal Encoding

Identified the below cols as ordinal values.

In [21]:
ordinal_cols = [
    'LandSlope',
    'OverallQual',
    'OverallCond',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence'
]

In [22]:
train_df['LandSlope'].unique()

array(['Gtl', 'Mod', 'Sev'], dtype=object)

In [23]:
full_categories = [ np.unique(np.append(train_df[col], test_df[col])) for col in ordinal_cols ]
full_categories

[array(['Gtl', 'Mod', 'Sev'], dtype=object),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array(['Ex', 'Fa', 'Gd', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'TA'], dtype=object),
 array(['Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Av', 'Gd', 'Mn', 'NA', 'No'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'NA', 'Rec', 'Unf'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'NA', 'Rec', 'Unf'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Fin', 'NA', 'RFn', 'Unf'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'NA'], dtype=object),
 array(['GdPrv', 'GdWo', 'MnPrv', 'MnWw', 'NA'], dtype=object)]

In [24]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(categories=full_categories)

In [25]:
train_df[ordinal_cols] = ordinal_encoder.fit_transform(train_df[ordinal_cols])

In [26]:
test_df[ordinal_cols] = ordinal_encoder.transform(test_df[ordinal_cols])

### 2.2 Binary Encoding

Only need to binary encode central air.

In [27]:
train_df['CentralAir'] = train_df['CentralAir'].replace({'Yes': 1, "No": 0})
test_df['CentralAir'] = test_df['CentralAir'].replace({'Yes': 1, "No": 0})

### 2.2 One Hot Encoder Encoding

In [28]:
columns_to_ignore = np.append(ordinal_cols, 'CentralAir')

In [29]:
categorical_mask = (train_df.dtypes == object)
categorical_cols = train_df.columns[categorical_mask]
categorical_cols = [col for col in categorical_cols if col not in columns_to_ignore]
print(f"num categorical cols {len(categorical_cols)}")

num categorical cols 26


In [31]:
train_df.drop('Id', axis=1, inplace=True)
train_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,3.0,4.0,,0,2,2008,WD,Normal,208500
1,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,3.0,4.0,,0,5,2007,WD,Normal,181500
2,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,3.0,4.0,,0,9,2008,WD,Normal,223500
3,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,3.0,4.0,,0,2,2006,WD,Abnorml,140000
4,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,3.0,4.0,,0,12,2008,WD,Normal,250000


In [32]:
one_hot_training_values = (train_df[categorical_cols]
                           .apply(lambda x : x.unique())
                           .apply(list))
one_hot_training_values

MSZoning                                 [RL, RM, C (all), FV, RH]
Street                                                [Pave, Grvl]
Alley                                             [NA, Grvl, Pave]
LotShape                                      [Reg, IR1, IR2, IR3]
LandContour                                   [Lvl, Bnk, Low, HLS]
Utilities                                         [AllPub, NoSeWa]
LotConfig                      [Inside, FR2, Corner, CulDSac, FR3]
Neighborhood     [CollgCr, Veenker, Crawfor, NoRidge, Mitchel, ...
Condition1       [Norm, Feedr, PosN, Artery, RRAe, RRNn, RRAn, ...
Condition2       [Norm, Artery, RRNn, Feedr, PosN, PosA, RRAn, ...
BldgType                     [1Fam, 2fmCon, Duplex, TwnhsE, Twnhs]
HouseStyle       [2Story, 1Story, 1.5Fin, 1.5Unf, SFoyer, SLvl,...
RoofStyle               [Gable, Hip, Gambrel, Mansard, Flat, Shed]
RoofMatl         [CompShg, WdShngl, Metal, WdShake, Membran, Ta...
Exterior1st      [VinylSd, MetalSd, Wd Sdng, HdBoard, BrkFace,

Review of the above indicates that not all of these need to be one_hot encoded. Some contain largely numerical values.