# Kaggle House Prices - Fit Models

In [9]:
# Import libraries
import pandas as pd
import numpy as np
import statistics
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Global seaborn theme: "ticks" axes style with fonts scaled to 1.3x.
sns.set(font_scale=1.3, style="ticks")
# (width, height) tuple, wider than the default; presumably passed as figsize= by later plotting cells - TODO confirm
fs = (14, 9) # make the figure wider than the default

## Load and Groom the Data
* Load the data
* Replace spurious `NA`s with explicit values
* Correct data types

In [10]:
# Locations of the cleaned Kaggle CSVs, relative to the notebook's working directory
train_csv = "data/cleaned_data/cleaned_train.csv"
test_csv = "data/cleaned_data/cleaned_test.csv"

# Read the training and test sets into DataFrames
houses = pd.read_csv(train_csv)
houses_test = pd.read_csv(test_csv)

# Display (rows, columns) of the training set
houses.shape

(1460, 82)

In [12]:
# Override spurious NAs
# Columns whose missing entries are replaced by the literal label "NA".
# Presumably a blank in these columns means the feature is absent (no pool,
# no garage, no basement, ...) rather than unknown - confirm against the
# data dictionary.
na_label_cols = [
    "PoolQC",
    "MiscFeature",
    "Alley",
    "Fence",
    "FireplaceQu",
    # Garages
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    # Basements
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "BsmtQual",
    "BsmtCond",
    # Masonry
    "MasVnrType",
]

# Per-column fill values: "NA" for the label columns, 0 for the masonry area.
fill_values = {col: "NA" for col in na_label_cols}
fill_values["MasVnrArea"] = 0

# Fill everything in one call and rebind the result. The previous
# `houses.<col>.fillna(..., inplace=True)` form is chained assignment: it
# emits a FutureWarning on pandas 2.2 and, with Copy-on-Write enabled, only
# modifies a temporary Series and leaves `houses` unchanged.
houses = houses.fillna(value=fill_values)

# What % data (if any) is missing for each column?
null_counts = houses.isnull().sum()
# Build the summary from the filtered Series so `nulls` is a fresh frame
# (assigning a column to a filtered slice can raise SettingWithCopyWarning).
nulls = null_counts[null_counts > 0].to_frame(name="cnt_missing")
nulls["pct_missing"] = (nulls.cnt_missing / houses.shape[0] * 100).round(2)
# Displayed sorted by share missing; `nulls` itself stays in column order.
nulls.sort_values(by="pct_missing", ascending=False)

Unnamed: 0,cnt_missing,pct_missing
GarageYrBlt,81,5.55
Electrical,1,0.07


In [13]:
# Look at data types
# Displays the dtype pandas assigned to each column of the training frame.
houses.dtypes

Unnamed: 0         int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
                  ...   
KitchenAbvGr       int64
KitchenQual       object
TotRmsAbvGrd       int64
Functional        object
Fireplaces         int64
FireplaceQu       object
GarageType        object
GarageYrBlt      float64
GarageFinish      object


In [14]:
# List columns with type object
# `objs` is a plain list of column names. The names are taken directly from
# the column index; no regex clean-up is applied, because stripping the text
# "object" from a name would corrupt any column whose name contains it.
objs = houses.select_dtypes(include=['object']).columns.tolist()

In [15]:
# Show the object-dtype column names collected in the previous cell.
objs

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']