In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Read the raw housing CSV into the working DataFrame `data`.
raw_csv_path = 'AmesHousing.csv'
data = pd.read_csv(raw_csv_path)


### Missing Values

There are a few features that have missing values. For most of them, a missing value means that the house does not have that feature, such as a pool or a garage.

Because of this I will fill these missing values with 'None'.

The first set of features to fill are categorical features. 

In [3]:
# Categorical columns in which a missing entry is treated as
# "the house does not have this feature".
catMissing = [
    'Pool QC',
    'Misc Feature',
    'Alley',
    'Fence',
    'Fireplace Qu',
    'Garage Type',
    'Garage Finish',
    'Garage Qual',
    'Garage Cond',
    'Bsmt Qual',
    'Bsmt Cond',
    'Bsmt Exposure',
    'BsmtFin Type 1',
    'BsmtFin Type 2',
    'Mas Vnr Type',
]

# Replace every NaN in those columns with the literal string 'None'.
data[catMissing] = data.loc[:, catMissing].fillna(value='None')

The next set of features with missing values are numeric features where a missing value again means that the house does not have that feature. This time, instead of 'None', the missing values will be filled with 0.

In [4]:
# Numeric columns in which a missing entry is treated as "feature absent",
# so the gap is filled with 0.
# NOTE(review): a missing 'Lot Frontage' may mean "not recorded" rather than
# zero frontage -- confirm that 0 is the intended fill for that column.
numMissing = [
    'Garage Area',
    'Garage Cars',
    'BsmtFin SF 1',
    'BsmtFin SF 2',
    'Bsmt Unf SF',
    'Total Bsmt SF',
    'Bsmt Full Bath',
    'Bsmt Half Bath',
    'Mas Vnr Area',
    'Lot Frontage',
]

# Replace every NaN in those columns with 0.
data[numMissing] = data.loc[:, numMissing].fillna(value=0)

Lastly, we have a few features with missing values that cannot be filled with 'None' or 0. I will use imputation techniques to fill these missing values.

In [5]:
# Columns whose missing entries are filled with the column's most frequent value.
imputeNa = [
    'Functional',
    'MS Zoning',
    'Electrical',
    'Kitchen Qual',
    'Exterior 1st',
    'Exterior 2nd',
    'Sale Type',
    'Utilities',
]

imputer = SimpleImputer(strategy='most_frequent')
imputed_values = imputer.fit_transform(data[imputeNa])

# Re-wrap the imputer's output with data's index so the rows line up when the
# columns are written back.
data[imputeNa] = pd.DataFrame(imputed_values, index=data.index)

# A missing garage build year is replaced by the year the house was built.
house_year_built = data['Year Built']
data['Garage Yr Blt'] = data['Garage Yr Blt'].fillna(house_year_built)


Next, I need to convert the categorical values from strings to numbers so that the XGBoost model can properly train on the data set.

In [6]:
# Ordinal-encode every object-dtype column so that the frame holds only
# numeric values.
#
# All such columns are encoded by ONE fitted encoder instead of refitting the
# same encoder column by column. `encoder.categories_` therefore keeps the
# category -> code mapping of every encoded column (in `categorical_cols`
# order), so the codes can be decoded later or re-applied to new data.
categorical_cols = data.select_dtypes(include='object').columns
encoder = OrdinalEncoder()

# Guard: with no object columns there is nothing to encode, and fitting an
# encoder on a frame with zero columns would raise.
if len(categorical_cols) > 0:
    encoded_values = encoder.fit_transform(data[categorical_cols])
    # Wrap the result in a DataFrame aligned to `data` so each original
    # column is replaced by a new float column of category codes.
    data[categorical_cols] = pd.DataFrame(
        encoded_values, index=data.index, columns=categorical_cols
    )

In [None]:
# The data is now prepared for training, so write it to a CSV file
# (the row index is not written).
clean_csv_path = 'AmesHousing_clean.csv'
data.to_csv(clean_csv_path, index=False)