## Data compilation

In [1]:
import pandas as pd
import numpy as np

## Combining dataset

We combine together training and test dataset, so that we can use the same variables and imputation methods across both dataset. We assign home price (SalePrice) values for test dataset at $99. This way we can separate datasets on next steps.

In [2]:
train = pd.read_csv("./data/train.csv")
train.shape

(1460, 81)

In [3]:
test = pd.read_csv("./data/test.csv")
test.shape

(1459, 80)

In [4]:
# Add SalePrice = 99 to test set
test['SalePrice'] = 99
test.shape

(1459, 81)

In [5]:
# Get a combined dataset
dataset = test.append(train)
dataset.shape

(2919, 81)

We also create a variable Month that is equal to year*100+month. It will help us to join time series later

In [6]:
dataset['Month'] = dataset['YrSold'] * 100 + dataset['MoSold']
dataset.Month.head()

0    201006
1    201006
2    201003
3    201006
4    201001
Name: Month, dtype: int64

In [15]:
# Save the dataset
dataset.to_csv("./data/dataset.csv", index=False)

## Merging with time-series datasets

We are adding the following variables that may explain dynamics of housing prices:

* Dow Jones Real Estate index (lagged by one month)

* Fannie Mae mortgage rates (lagged)

* Labor force and Unemployment rate in Ames (lagged)

In [7]:
dataset = pd.read_csv('./data/dataset.csv')

In [10]:
# A helper function that creates Month variable
def createMonth(date):
    '''
    A helper function that takes a date and returns an integer YYYYMM
    '''
    year = int(date.strftime('%Y'))
    month = int(date.strftime('%m'))
    
    return (year*100 +month)

# Dow Jones Real estate index
djre = pd.read_csv("./data/djre.csv", parse_dates=True)
djre.head()

Unnamed: 0,Date,DJREI
0,12/30/1999,124.96
1,12/31/1999,125.77
2,1/3/2000,124.53
3,1/4/2000,122.94
4,1/5/2000,123.77


In [11]:
djre['Date'] = pd.to_datetime(djre['Date']) #Convert to datetime
djre['Month'] = djre['Date'].apply(lambda x: createMonth(x))
djre = djre.groupby('Month').last().reset_index() # Keep end of month observation
djre.set_index('Date', inplace=True) # Set date index to use shift method later
djre.head()

Unnamed: 0_level_0,Month,DJREI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1999-12-31,199912,125.77
2000-01-31,200001,125.92
2000-02-29,200002,123.54
2000-03-31,200003,128.75
2000-04-28,200004,133.23


In [12]:
djre = djre.shift(1) #Create a lagged variable
djre.head()

Unnamed: 0_level_0,Month,DJREI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1999-12-31,,
2000-01-31,199912.0,125.77
2000-02-29,200001.0,125.92
2000-03-31,200002.0,123.54
2000-04-28,200003.0,128.75


In [13]:
# fannie mae mortgage rates 
fannie = pd.read_csv('./data/FannieMaeRates.csv', parse_dates=True)
fannie.head()

Unnamed: 0,Date,Net Yield
0,12/1/2017,3.511
1,11/1/2017,3.516
2,10/2/2017,3.484
3,9/1/2017,3.36
4,8/1/2017,3.539


In [14]:
fannie['Date'] = pd.to_datetime(fannie['Date']) #Convert to datetime
fannie['Month'] = fannie['Date'].apply(lambda x: createMonth(x))
fannie = fannie.groupby('Month').last().reset_index() # Keep end of month observation
fannie.set_index('Date', inplace=True) # Set date index to use shift method later
fannie.head()

Unnamed: 0_level_0,Month,Net Yield
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1985-01-02,198501,12.75
1985-02-01,198502,12.55
1985-03-04,198503,13.15
1985-04-01,198504,12.85
1985-05-01,198505,12.55


In [15]:
fannie = fannie.shift(1) #Create a lagged variable
fannie.columns = ['Month', 'Fannie']
fannie.head()

Unnamed: 0_level_0,Month,Fannie
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1985-01-02,,
1985-02-01,198501.0,12.75
1985-03-04,198502.0,12.55
1985-04-01,198503.0,13.15
1985-05-01,198504.0,12.85


In [16]:
unempl = pd.read_csv("./data/unempl.csv", parse_dates=True)

unempl.head()

Unnamed: 0,Year,Month,labor force,employment,unemployment,unemployment rate
0,2005,1,46453,44859,1594,3.4
1,2005,2,47256,45729,1527,3.2
2,2005,3,46954,45448,1506,3.2
3,2005,4,47269,45921,1348,2.9
4,2005,5,46591,45173,1418,3.0


In [17]:
unempl['Month'] = unempl['Year']*100 + unempl['Month']
unempl['Date'] = pd.to_datetime(unempl['Month'], format='%Y%m') # Still need date to create lagged
unempl.drop(['Year', 'employment', 'unemployment'], axis=1, inplace=True)
unempl.set_index('Date', inplace=True) # Set date index to use shift method later
unempl.head()

Unnamed: 0_level_0,Month,labor force,unemployment rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-01-01,200501,46453,3.4
2005-02-01,200502,47256,3.2
2005-03-01,200503,46954,3.2
2005-04-01,200504,47269,2.9
2005-05-01,200505,46591,3.0


In [18]:
unempl = unempl.shift(1) #Create a lagged variable
unempl.columns = ['Month', 'LaborForce', 'UnemplRate']
unempl.head()

Unnamed: 0_level_0,Month,LaborForce,UnemplRate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-01-01,,,
2005-02-01,200501.0,46453.0,3.4
2005-03-01,200502.0,47256.0,3.2
2005-04-01,200503.0,46954.0,3.2
2005-05-01,200504.0,47269.0,2.9


In [19]:
# A helper function that creates Month variable
def createMonth(date):
    '''
    A helper function that takes a date and returns an integer YYYYMM
    '''
    year = int(date.strftime('%Y'))
    month = int(date.strftime('%m'))
    
    return (year*100 +month)

# Dow Jones Real estate index
djre = pd.read_csv("./data/djre.csv", parse_dates=True)

djre['Date'] = pd.to_datetime(djre['Date']) #Convert to datetime

djre['Month'] = djre['Date'].apply(lambda x: createMonth(x))
djre = djre.groupby('Month').last().reset_index() # Keep end of month observation
djre.set_index('Date', inplace=True) # Set date index to use shift method later

djre = djre.shift(1) #Create a lagged variable

# fannie mae mortgage rates 
fannie = pd.read_csv('./data/FannieMaeRates.csv', parse_dates=True)

fannie['Date'] = pd.to_datetime(fannie['Date']) #Convert to datetime

fannie['Month'] = fannie['Date'].apply(lambda x: createMonth(x))
fannie = fannie.groupby('Month').last().reset_index() # Keep end of month observation
fannie.set_index('Date', inplace=True) # Set date index to use shift method later

fannie = fannie.shift(1) #Create a lagged variable
fannie.columns = ['Month', 'Fannie']


# Ames unemployment rate
unempl = pd.read_csv("./data/unempl.csv", parse_dates=True)

unempl['Month'] = unempl['Year']*100 + unempl['Month']
unempl['Date'] = pd.to_datetime(unempl['Month'], format='%Y%m') # Still need date to create lagged
unempl.drop(['Year', 'employment', 'unemployment'], axis=1, inplace=True)

unempl.set_index('Date', inplace=True) # Set date index to use shift method later

unempl = unempl.shift(1) #Create a lagged variable
unempl.columns = ['Month', 'LaborForce', 'UnemplRate']


# Combine with the dataset
dataset = pd.merge(dataset, djre, on='Month', how='left')
dataset = pd.merge(dataset, fannie, on='Month', how='left')
dataset = pd.merge(dataset, unempl, on='Month', how='left')

In [20]:
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Month,DJREI,Fannie,LaborForce,UnemplRate
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,6,2010,WD,Normal,99,201006,183.0,4.561,51950.0,4.7
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,6,2010,WD,Normal,99,201006,183.0,4.561,51950.0,4.7
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,3,2010,WD,Normal,99,201003,193.37,4.762,53639.0,4.6
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,6,2010,WD,Normal,99,201006,183.0,4.561,51950.0,4.7
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,1,2010,WD,Normal,99,201001,168.83,4.993,52631.0,4.8


In [21]:
dataset.to_csv("./data/dataset.csv", index=False)

## Imputation of missingness

In [56]:
missing_data = dataset.isna().sum(axis=0)
missing_data[missing_data >0 ]

MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
dtype: int64

In [57]:
missing_data/dataset.shape[0]  # As percentage of data

Id               0.000000
MSSubClass       0.000000
MSZoning         0.001370
LotFrontage      0.166495
LotArea          0.000000
Street           0.000000
Alley            0.932169
LotShape         0.000000
LandContour      0.000000
Utilities        0.000685
LotConfig        0.000000
LandSlope        0.000000
Neighborhood     0.000000
Condition1       0.000000
Condition2       0.000000
BldgType         0.000000
HouseStyle       0.000000
OverallQual      0.000000
OverallCond      0.000000
YearBuilt        0.000000
YearRemodAdd     0.000000
RoofStyle        0.000000
RoofMatl         0.000000
Exterior1st      0.000343
Exterior2nd      0.000343
MasVnrType       0.008222
MasVnrArea       0.007879
ExterQual        0.000000
ExterCond        0.000000
Foundation       0.000000
                   ...   
Fireplaces       0.000000
FireplaceQu      0.486468
GarageType       0.053786
GarageYrBlt      0.054471
GarageFinish     0.054471
GarageCars       0.000343
GarageArea       0.000343
GarageQual  