## Data compilation 
### (after merging with time-series data)

In [77]:
import pandas as pd
import numpy as np

# Load the compiled dataset (house data already merged with the time-series columns).
dataset_path = './data/dataset.csv'
dataset = pd.read_csv(dataset_path)

## Imputation of missing values

In [79]:
# Summarise missing values per column: absolute count and share of all rows.
null_mask = dataset.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# Display the first five columns that have at least one missing value.
missing_data.loc[missing_data['Total'] > 0].head()

Unnamed: 0,Total,Percent
PoolQC,2909,0.996574
MiscFeature,2814,0.964029
Alley,2721,0.932169
Fence,2348,0.804385
FireplaceQu,1420,0.486468


In [9]:
# Largest per-column count of missing values; anything above 0 means data is missing.
# (Re-enabled so that re-running the cell reproduces the output shown below it.)
dataset.isnull().sum().max()

2909

In [10]:
# Preview the first five rows of the current dataset.
dataset.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Month,DJREI,Fannie,LaborForce,UnemplRate
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,6,2010,WD,Normal,99,201006,183.0,4.561,51950.0,4.7
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,6,2010,WD,Normal,99,201006,183.0,4.561,51950.0,4.7
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,3,2010,WD,Normal,99,201003,193.37,4.762,53639.0,4.6
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,6,2010,WD,Normal,99,201006,183.0,4.561,51950.0,4.7
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,1,2010,WD,Normal,99,201001,168.83,4.993,52631.0,4.8


In [71]:
# Keep only the rows that contain at least one missing value, then preview them.
has_missing = dataset.isnull().any(axis=1)
null_data = dataset.loc[has_missing]
null_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Month,DJREI,Fannie,LaborForce,UnemplRate
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,6,2010,WD,Normal,99,201006,183.0,4.561,51950.0,4.7
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,6,2010,WD,Normal,99,201006,183.0,4.561,51950.0,4.7
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,3,2010,WD,Normal,99,201003,193.37,4.762,53639.0,4.6
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,6,2010,WD,Normal,99,201006,183.0,4.561,51950.0,4.7
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,1,2010,WD,Normal,99,201001,168.83,4.993,52631.0,4.8


### Handling Missing Data

In [81]:
# Handle missing values for features where median/mean or most common value
# doesn't make sense. Each entry maps a column name to the constant that replaces
# its missing values; the comment above each group records why that constant was chosen.
NA_FILL_VALUES = {
    # Alley: data description says NA means "no alley access"
    "Alley": "None",
    # BedroomAbvGr: NA most likely means 0
    "BedroomAbvGr": 0,
    # BsmtQual etc.: data description says NA for basement features is "no basement"
    "BsmtQual": "No",
    "BsmtCond": "No",
    "BsmtExposure": "No",
    "BsmtFinType1": "No",
    "BsmtFinType2": "No",
    "BsmtFullBath": 0,
    "BsmtHalfBath": 0,
    "BsmtUnfSF": 0,
    # CentralAir: NA most likely means No
    "CentralAir": "N",
    # Condition: NA most likely means Normal
    "Condition1": "Norm",
    "Condition2": "Norm",
    # EnclosedPorch: NA most likely means no enclosed porch
    "EnclosedPorch": 0,
    # Exterior condition/quality: NA most likely means average
    "ExterCond": "TA",
    "ExterQual": "TA",
    # Fence: data description says NA means "no fence"
    "Fence": "No",
    # FireplaceQu: data description says NA means "no fireplace"
    "FireplaceQu": "No",
    "Fireplaces": 0,
    # Functional: data description says NA means typical
    "Functional": "Typ",
    # GarageType etc.: data description says NA for garage features is "no garage"
    "GarageType": "No",
    "GarageFinish": "No",
    "GarageQual": "No",
    "GarageCond": "No",
    "GarageArea": 0,
    "GarageCars": 0,
    "GarageYrBlt": 0,
    # HalfBath: NA most likely means no half baths above grade
    "HalfBath": 0,
    # HeatingQC: NA most likely means typical
    "HeatingQC": "TA",
    # KitchenAbvGr: NA most likely means 0
    "KitchenAbvGr": 0,
    # KitchenQual: NA most likely means typical
    "KitchenQual": "TA",
    # LotFrontage: NA most likely means no lot frontage
    "LotFrontage": 0,
    # LotShape: NA most likely means regular
    "LotShape": "Reg",
    # MasVnrType: NA most likely means no veneer
    "MasVnrType": "None",
    "MasVnrArea": 0,
    # MiscFeature: data description says NA means "no misc feature"
    "MiscFeature": "No",
    "MiscVal": 0,
    # OpenPorchSF: NA most likely means no open porch
    "OpenPorchSF": 0,
    # PavedDrive: NA most likely means not paved
    "PavedDrive": "N",
    # PoolQC: data description says NA means "no pool"
    "PoolQC": "No",
    "PoolArea": 0,
    # SaleCondition: NA most likely means normal sale
    "SaleCondition": "Normal",
    # ScreenPorch: NA most likely means no screen porch
    "ScreenPorch": 0,
    # TotRmsAbvGrd: NA most likely means 0
    "TotRmsAbvGrd": 0,
    # Utilities: NA most likely means all public utilities
    "Utilities": "AllPub",
    # WoodDeckSF: NA most likely means no wood deck
    "WoodDeckSF": 0,
}

# Apply the fills one column at a time, in the order listed above. Reading the
# column through .loc keeps the KeyError for any column absent from the dataset.
for column, fill_value in NA_FILL_VALUES.items():
    dataset.loc[:, column] = dataset.loc[:, column].fillna(fill_value)

### A second look

In [82]:
# Recompute the per-column missing-value summary for the current state of the dataset.
still_null = dataset.isnull()
null_totals = still_null.sum()
total = null_totals.sort_values(ascending=False)
percent = (null_totals / still_null.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# Show every column that still has at least one missing value.
missing_data[missing_data['Total'].gt(0)]

# Working notes on the columns that still have gaps (candidate values where noted):
#MSZoning = Residential_Low_Density
#Electrical = SBrkr
#Exterior1st = BrkFace, VinylSd
#TotalBsmtSF
#Exterior2nd = Plywood, BrkFace
#BsmtFinSF2
#SaleType
#BsmtFinSF1

Unnamed: 0,Total,Percent
MSZoning,4,0.00137
Electrical,1,0.000343
Exterior1st,1,0.000343
TotalBsmtSF,1,0.000343
Exterior2nd,1,0.000343
BsmtFinSF2,1,0.000343
SaleType,1,0.000343
BsmtFinSF1,1,0.000343


In [84]:
# List every row that still has at least one missing value.
rows_with_gaps = dataset.isna().any(axis="columns")
null_data = dataset.loc[rows_with_gaps]
null_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Month,DJREI,Fannie,LaborForce,UnemplRate
455,1916,30,,109.0,21780,Grvl,,Reg,Lvl,AllPub,...,3,2009,ConLD,Normal,99,200903,99.77,4.918,49552.0,4.8
660,2121,20,RM,99.0,5940,Pave,,IR1,Lvl,AllPub,...,4,2008,ConLD,Abnorml,99,200804,268.49,5.663,49506.0,2.8
691,2152,30,RL,85.0,19550,Pave,,Reg,Lvl,AllPub,...,1,2008,WD,Normal,99,200801,255.58,5.932,47667.0,3.1
756,2217,20,,80.0,14584,Pave,,Reg,Low,AllPub,...,2,2008,WD,Abnorml,99,200802,244.57,5.534,49426.0,3.0
790,2251,70,,0.0,56600,Pave,,IR1,Low,AllPub,...,1,2008,WD,Normal,99,200801,255.58,5.932,47667.0,3.1
1029,2490,20,RL,85.0,13770,Pave,,Reg,Lvl,AllPub,...,10,2007,,Normal,99,200710,300.46,6.325,49242.0,2.5
1444,2905,20,,125.0,31250,Pave,,Reg,Lvl,AllPub,...,5,2006,WD,Normal,99,200605,270.3,6.446,47642.0,2.5
2838,1380,80,RL,73.0,9735,Pave,,Reg,Lvl,AllPub,...,5,2008,WD,Normal,167500,200805,269.75,5.775,48846.0,3.0


In [None]:
# TODO: impute the remaining MSZoning gaps. The draft below is disabled; note that
# fillna(None) raises ValueError in pandas (a fill value or method must be given),
# so a concrete zoning category has to be chosen before enabling it.
#dataset.loc[:, "MSZoning"] = dataset.loc[:, "MSZoning"].fillna(None)