In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('./input/melbourne-housing-snapshot/melb_data.csv')

y = data['Price']
X = data.drop(['Price'], axis=1)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Drops columns with missing values
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]

X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns
                        if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == 'object']

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols

X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [54]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [55]:
# !!!
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print('Categorical variables:')
print(object_cols)

Categorical variables:
['Type', 'Method', 'Regionname']


In [56]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=1)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    
    return mean_absolute_error(y_valid, preds) 

#### Score Approach 1 (Drop Categorical Variables)

In [57]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print('MAE from Approach 1 (Drop Categorical Variables):')
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop Categorical Variables):
174632.25689207987


#### Score Approach 2 (Ordinal Encoding)

In [58]:
from sklearn.preprocessing import OrdinalEncoder

# Make copy to avoid changing original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print('MAE from Approach 2 (Ordinal Encoding):')
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Ordinal Encoding):
165256.28786135072


#### Score Approach 3 (One-Hot Encoding)

In [59]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index


# Remove categorical columns (will replace with one-hot encoding)
print(num_X_train)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
print(num_X_train)
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

print('MAE from approach 3 (One-Hot Encoding):')
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

       Rooms  Distance  Postcode  Bedroom2  Bathroom  Landsize  Lattitude  \
12167      1       5.0    3182.0       1.0       1.0       0.0  -37.85984   
6524       2       8.0    3016.0       2.0       2.0     193.0  -37.85800   
8413       3      12.6    3020.0       3.0       1.0     555.0  -37.79880   
2919       3      13.0    3046.0       3.0       1.0     265.0  -37.70830   
6043       3      13.3    3020.0       3.0       1.0     673.0  -37.76230   
...      ...       ...       ...       ...       ...       ...        ...   
13123      3       5.2    3056.0       3.0       1.0     212.0  -37.77695   
3264       3      10.5    3081.0       3.0       1.0     748.0  -37.74160   
9845       4       6.7    3058.0       4.0       2.0     441.0  -37.73572   
10799      3      12.0    3073.0       3.0       1.0     606.0  -37.72057   
2732       4       6.4    3011.0       4.0       2.0     319.0  -37.79430   

       Longtitude  Propertycount  
12167   144.98670        13240.0  
6524 



166500.44717161093


