In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv('.\\data\\melbourne-housing-snapshot\\melb_data.csv')
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
y = data.Price
X = data.drop(['Price'], axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [6]:
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
cols_with_missing

['Car', 'BuildingArea', 'YearBuilt', 'CouncilArea']

In [7]:
X_train.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

In [10]:
low_cardinality_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 10 and X_train[cname].dtype == "object"]
low_cardinality_cols

['Type', 'Method', 'Regionname']

In [12]:
numerical_columns = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]
numerical_columns

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [13]:
my_cols = low_cardinality_cols + numerical_columns
my_cols

['Type',
 'Method',
 'Regionname',
 'Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [14]:
X_train_selected = X_train[my_cols].copy()
X_test_selected = X_test[my_cols].copy() 

In [30]:
# Categorical variables
s = (X_train_selected.dtypes == 'object')
s

Type              True
Method            True
Regionname        True
Rooms            False
Distance         False
Postcode         False
Bedroom2         False
Bathroom         False
Landsize         False
Lattitude        False
Longtitude       False
Propertycount    False
dtype: bool

In [31]:
s = s[s]
s

Type          True
Method        True
Regionname    True
dtype: bool

In [32]:
object_cols = list(s[s].index)
object_cols

['Type', 'Method', 'Regionname']

### Define Function to measure quality of each approach

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_model(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(random_state=1)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

Approach 1: Drop categorical variables

In [35]:
drop_X_train = X_train_selected.select_dtypes(exclude=['object'])
drop_X_valid = X_test_selected.select_dtypes(exclude=['object'])

print("MAE (Drop object):", score_model(drop_X_train, drop_X_valid, y_train, y_test))

MAE (Drop object): 174632.25689207987


Approach 2: Ordinal Encoding

In [37]:
from sklearn.preprocessing import OrdinalEncoder

label_X_train = X_train_selected.copy()
label_X_test = X_test_selected.copy()

ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(label_X_train[object_cols])
label_X_test[object_cols] = ordinal_encoder.transform(label_X_test[object_cols])

label_X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,2.0,1.0,5.0,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,0.0,2.0,6.0,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,0.0,1.0,6.0,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,2.0,3.0,2.0,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,0.0,1.0,6.0,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [38]:
print("MAE (Ordinal encode object):", score_model(label_X_train, label_X_test, y_train, y_test))

MAE (Ordinal encode object): 165256.28786135072


Approach 3: One-Hot Encoding

In [40]:
from sklearn.preprocessing import OneHotEncoder

OH_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
OH_label_X_train = pd.DataFrame(OH_encoder.fit_transform(X_train_selected[object_cols]))
OH_label_X_test = pd.DataFrame(OH_encoder.transform(X_test_selected[object_cols]))

OH_label_X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [41]:
# One hot encoding is a technique that converts categorical variables into binary variables and removes the original categorical variables.
# This process removed index.

OH_label_X_train.index = X_train_selected.index
OH_label_X_test.index = X_test_selected.index

In [42]:
# Remove categorical columns (will replace with one hot encoding)
num_X_train = X_train_selected.drop(object_cols, axis=1)
num_X_test = X_test_selected.drop(object_cols, axis=1)

In [43]:
OH_X_train = pd.concat([num_X_train, OH_label_X_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_label_X_test], axis=1)

OH_X_train.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,0,...,6,7,8,9,10,11,12,13,14,15
12167,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6524,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8413,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2919,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6043,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [44]:
print("MAE (One Hot Encoder):", score_model(OH_X_train, OH_X_test, y_train, y_test))



MAE (One Hot Encoder): 166500.44717161093


