In [74]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pandas as pd

In [75]:
# Read the melb_data.csv dataset into a DataFrame.
melb_csv_path = '~/JProjects/kaggle cources/data/melb_data.csv'
data = pd.read_csv(melb_csv_path)

In [76]:
# Target is the Price column; every remaining column is a candidate feature.
y = data['Price']
X = data.drop(columns=['Price'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train_full, X_val_full, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [77]:
# Names of the columns that have at least one missing value in the training split.
# Only the training split is inspected; the same columns are then removed from
# both splits so they keep identical feature sets.
col_w_missing = X_train_full.columns[X_train_full.isnull().any()].tolist()

# Rebind to new frames instead of dropping with inplace=True: the frames come
# from train_test_split, and mutating them in place can trigger pandas'
# SettingWithCopyWarning while giving no benefit over plain reassignment.
X_train_full = X_train_full.drop(columns=col_w_missing)
X_val_full = X_val_full.drop(columns=col_w_missing)

In [78]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [
    col_name
    for col_name in X_train_full.columns
    # object dtype marks the text columns; fewer than 10 distinct values counts as "low"
    if X_train_full[col_name].dtype == 'object' and X_train_full[col_name].nunique() < 10
]

In [79]:
# Numeric features are the columns stored as 64-bit integers or floats.
numerical_cols = [
    col_name
    for col_name, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

In [80]:
# Restrict both splits to the chosen categorical + numerical columns.
# Copies are taken so later edits do not touch the *_full frames.
cols = low_cardinality_cols + numerical_cols
X_train, X_val = (frame[cols].copy() for frame in (X_train_full, X_val_full))

In [81]:
# Preview the first five training rows to confirm which columns were kept.
X_train.head(n=5)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [82]:
# Boolean mask over the columns: True where the column has object dtype,
# i.e. is one of the categorical variables.
s = X_train.dtypes == 'object'
# Keep only the column labels whose mask entry is True.
object_cols = s.index[s.to_numpy()]

In [83]:
# Show the categorical column labels under a short heading.
print('Cat vars:', object_cols, sep='\n')

Cat vars:
Index(['Type', 'Method', 'Regionname'], dtype='object')


In [84]:
def score_dataset(X_train, X_val, y_train, y_val, n_estimators=100, random_state=0):
    """Fit a random forest on the training data and return its validation MAE.

    Parameters
    ----------
    X_train, y_train : training features and target passed to ``model.fit``.
    X_val, y_val : validation features and target used for scoring.
    n_estimators : int, default 100
        Number of trees in the forest (previously hard-coded).
    random_state : int, default 0
        Seed passed to the forest so repeated calls are comparable
        (previously hard-coded).

    Returns
    -------
    The mean absolute error between ``y_val`` and the model's predictions
    on ``X_val``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return mean_absolute_error(y_val, preds)

Three approaches for handling categorical variables:

## Drop Categorical Variables
The easiest approach to dealing with categorical variables is to simply remove them from the dataset. This approach will only work well if the columns do not contain useful information.

In [85]:
# Approach 1: discard every object-dtype (categorical) column from both splits.
droped_X_train, droped_X_val = (
    frame.select_dtypes(exclude=['object']) for frame in (X_train, X_val)
)

print("MAE from Approach 1 (Drop categorical variables):")
mae_drop = score_dataset(droped_X_train, droped_X_val, y_train, y_val)
print(mae_drop)

MAE from Approach 1 (Drop categorical variables):
175703.48185157913


## Ordinal Encoding
Ordinal encoding assigns each unique value to a different integer.


In [86]:
%%time

label_X_train = X_train.copy()
label_X_val = X_val.copy()

oe = OrdinalEncoder()

label_X_train[object_cols] = oe.fit_transform(label_X_train[object_cols])
label_X_val[object_cols] = oe.transform(label_X_val[object_cols])

print("MAE from Approach 2 (Ordinal Encoding):") 
print(score_dataset(label_X_train, label_X_val, y_train, y_val))

MAE from Approach 2 (Ordinal Encoding):
165936.40548390493
CPU times: total: 38.3 s
Wall time: 46.2 s


## One-Hot Encoding
One-hot encoding creates new columns indicating the presence (or absence) of each possible value in the original data. To understand this, we'll work through an example.

One-hot encoding generally does not perform well if the categorical variable takes on a large number of values (i.e., you generally won't use it for variables taking more than ***15*** different values).

We use the OneHotEncoder class from scikit-learn to get one-hot encodings. There are a number of parameters that can be used to customize its behavior.
- We set `handle_unknown='ignore'` to avoid errors when the validation data contains classes that aren't represented in the training data,
- and setting `sparse_output=False` ensures that the encoded columns are returned as a numpy array (instead of a sparse matrix). (This parameter was named `sparse` before scikit-learn 1.2.)

In [87]:
%%time

OHE = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_X_train = pd.DataFrame(OHE.fit_transform(X_train[object_cols]))
OH_cols_X_val = pd.DataFrame(OHE.transform(X_val[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_X_train.index = X_train.index
OH_cols_X_val.index = X_val.index

num_X_train = X_train.drop(object_cols, axis=1)
num_X_val = X_val.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([OH_cols_X_train, num_X_train], axis=1)
OH_X_val = pd.concat([OH_cols_X_val, num_X_val], axis=1)

# Ensure all columns have string type
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_val.columns = OH_X_val.columns.astype(str)


print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_val, y_train, y_val))

MAE from Approach 3 (One-Hot Encoding):
165699.58889227855
CPU times: total: 39.4 s
Wall time: 39.8 s


In contrast to ordinal encoding, one-hot encoding does not assume an ordering of the categories.

Not all categorical variables have a clear ordering in the values, but we refer to those that do as ***ordinal variables***.

We refer to categorical variables without an intrinsic ranking as ***nominal variables***.