In [42]:
# set up
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

data = pd.read_csv('data\melb_data.csv')
missing_cols = [col for col in data.columns if data[col].isnull().any()]
X = data.drop(['Price'], axis=1)
X = X.drop(missing_cols, axis=1)
y = data.Price
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.8,test_size=0.2)

def score_dataset(train_X, test_X, train_y, test_y):
    model = RandomForestRegressor(random_state=0)
    model.fit(train_X,train_y)
    p_val = model.predict(test_X)
    mae = mean_absolute_error(test_y,p_val)
    return mae

### Background
A categorical variable takes only a limited number of values. Think about enum type in c/Java/python.  
You will get an error if you try to plug these variables into most machine learning models in Python without preprocessing them first.

In [43]:
# find the non-numerical columns

index_series = (X.dtypes == 'object')
object_cols = list(index_series[index_series].index)
print(object_cols)
print(X[object_cols])

['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'Regionname']
              Suburb           Address Type Method   SellerG        Date  \
0         Abbotsford      85 Turner St    h      S    Biggin   3/12/2016   
1         Abbotsford   25 Bloomburg St    h      S    Biggin   4/02/2016   
2         Abbotsford      5 Charles St    h     SP    Biggin   4/03/2017   
3         Abbotsford  40 Federation La    h     PI    Biggin   4/03/2017   
4         Abbotsford       55a Park St    h     VB    Nelson   4/06/2016   
...              ...               ...  ...    ...       ...         ...   
13575  Wheelers Hill      12 Strada Cr    h      S     Barry  26/08/2017   
13576   Williamstown     77 Merrett Dr    h     SP  Williams  26/08/2017   
13577   Williamstown       83 Power St    h      S     Raine  26/08/2017   
13578   Williamstown      96 Verdon St    h     PI   Sweeney  26/08/2017   
13579     Yarraville        6 Agnes St    h     SP   Village  26/08/2017   

              

### Approach 1: Label encoding
As implmentation in c, we can label these categorical variables by numerical type like integer.

<img src="static/label_img.png" style="zoom:40%;"/>

In [44]:
from sklearn.preprocessing import LabelEncoder

label_train_X = train_X.copy()
label_test_X = test_X.copy()
label_encoder = LabelEncoder()
# to prevent data in one column be treated as various type, use "astype"
for col in object_cols:
    label_train_X[col] = label_encoder.fit_transform(train_X[col].astype(str))
    label_test_X[col] = label_encoder.fit_transform(test_X[col].astype(str))
score_dataset(label_train_X, label_test_X, train_y, test_y)

176957.01884388807

### Approach 2: One-Hot encoding
One-hot encoding creates new columns indicating the presence (or absence) of each possible value in the original data.

<img src="static/one_hot_img.png" style="zoom:40%;"/>

In [45]:
from sklearn.preprocessing import OneHotEncoder

'''
avoid errors when the validation data contains classes that aren't represented in the training data and ensures that the encoded columns are returned as a numpy array (instead of a sparse matrix
'''
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_X[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.fit_transform(test_X[object_cols]))
OH_cols_train.index = train_X[object_cols].index
OH_cols_test.index = test_X[object_cols].index

# Remove categorical columns (will replace with one-hot encoding)
num_train_X = train_X.drop(object_cols, axis=1)
num_test_X = test_X.drop(object_cols, axis=1)

OH_train_X = pd.concat([num_train_X, OH_cols_train], axis=1)
OH_test_X = pd.concat([num_test_X, OH_cols_test], axis=1)

score_dataset(OH_train_X,OH_test_X,train_y,test_y)

ValueError: Number of features of the model must match the input. Model n_features is 11370 and input n_features is 3225 

### What's the difference?
One big and important difference between label encoding and one-hot encoding is when you use label encoding, you assume the variable order -- in some cases that may lead to error; in the contrast, one-hot encoding does not assume an ordering of the categories. But you may not use one-hot encoding if the categorical variable takes on a large number of values.