A**s usual importing all the important libraries**

In [32]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [33]:
#Reading data
data = pd.read_csv('/content/melb_data.csv')

In [34]:
#Defining feature vs label data
X = data.drop(['Price'], axis =1)
y = data.Price

In [35]:
#Divide data into training and testing subsets
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, test_size=0.8, train_size = 0.2, random_state = 1)

In [36]:
#Drop the columns with missing value (Simple Approach)
cols_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full.drop(cols_missing, axis = 1, inplace = True)
X_test_full.drop(cols_missing, axis = 1, inplace = True)

In [37]:
#Cardinality means the number of unique values in a column
#Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype == 'object']
categorical_cols

['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'Regionname']

In [38]:
type(categorical_cols)

list

In [39]:
for c in categorical_cols:
  t = X_train_full[c].value_counts()
  print(c)
  print(t)

Suburb
Reservoir          70
Richmond           62
Brunswick          47
Bentleigh East     47
Hawthorn           46
                   ..
Blackburn North     1
Templestowe         1
Ardeer              1
Chelsea Heights     1
Carrum              1
Name: Suburb, Length: 262, dtype: int64
Address
13 Robinson St          2
1/12 Hatfield Ct        2
19 Charles St           2
16 Coronation St        2
118 Westgarth St        2
                       ..
14 Columbia St          1
1 Monteith St           1
104/13 Wellington St    1
24 Trevannion St        1
11 Condor St            1
Name: Address, Length: 2708, dtype: int64
Type
h    1887
u     607
t     222
Name: Type, dtype: int64
Method
S     1829
PI     327
SP     311
VB     227
SA      22
Name: Method, dtype: int64
SellerG
Nelson           315
Jellis           258
hockingstuart    238
Barry            212
Ray              139
                ... 
Obrien             1
Dixon              1
buyMyplace         1
Ham                1
Del     

In [40]:
low_card_cat_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype == 'object' and
                     X_train_full[cname].nunique()<10]

low_card_cat_cols

['Type', 'Method', 'Regionname']

In [42]:
#Select numerical columns
num_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
num_cols

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [43]:
# Keep selected columns
final_cols = num_cols + low_card_cat_cols
final_cols

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount',
 'Type',
 'Method',
 'Regionname']

In [44]:
X_train = X_train_full[final_cols].copy()
X_test  = X_test_full[final_cols].copy()

In [45]:
#Get a list of all categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)
print(type(object_cols))

Categorical variables:
['Type', 'Method', 'Regionname']
<class 'list'>


In [46]:
#Define funciton to measure quality of each approach/ similar to what we did in the intermediate ML topic
def get_mae(X_train, X_test, y_train, y_test):
  model = RandomForestRegressor(n_estimators = 100, random_state = 0)
  model.fit(X_train, y_train)
  pred = model.predict(X_test)
  mae  = mean_absolute_error(y_test, pred)
  return mae

**Approach 1: Dropping categorical values**

In [47]:
drop_X_train = X_train.select_dtypes(exclude = ['object'])
drop_X_test  = X_test.select_dtypes(exclude = ['object'])

print('mean absolute error for dropping categorical values approach is: %f' %get_mae(
    drop_X_train, drop_X_test, y_train, y_test
))

mean absolute error for dropping categorical values approach is: 198216.692813


**Approach 2: Ordinal Encoding**

In [48]:
label_X_train = X_train.copy()
label_X_test  = X_test.copy()

#Apply original encoder to each column with categorical values
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_test[object_cols]  = ordinal_encoder.transform(X_test[object_cols])

#get the error score
print('mean absolute error for Ordinal encoding approach is: %f' %get_mae(
    label_X_train, label_X_test, y_train, y_test
))

mean absolute error for Ordinal encoding approach is: 184204.489430


In [49]:
label_X_train[object_cols]

Unnamed: 0,Type,Method,Regionname
3056,0.0,3.0,5.0
6041,0.0,0.0,6.0
5077,0.0,1.0,2.0
4366,0.0,1.0,2.0
1640,2.0,1.0,5.0
...,...,...,...
905,0.0,1.0,5.0
5192,0.0,1.0,2.0
12172,0.0,1.0,6.0
235,0.0,0.0,5.0


**Approach 3: The best one 'Ordinal Hot Encoding'**

In [50]:
#Setting up handle_unknown = 'ignore' to avoid errors when the test data contatins classes that are not in the training data
#setting sparse = False ensure that the encoded columns are returned as a numpy array (instead of sparse matrix)

OHEncoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False)
OH_cols_train = pd.DataFrame(OHEncoder.fit_transform(X_train))
OH_cols_test  = pd.DataFrame(OHEncoder.transform(X_test))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_test.index  = X_test.index

num_X_train = X_train.drop(object_cols, axis=1)
num_X_test  = X_test.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test  = pd.concat([num_X_test, OH_cols_test], axis=1)

print("MAE from Approach 3 (One-Hot Encoding):") 
print(get_mae(OH_X_train, OH_X_test, y_train, y_test))

MAE from Approach 3 (One-Hot Encoding):




182586.72852448456
