In [112]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sigfig import round
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
pd.set_option('display.max_colwidth', None)
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [113]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    return mean_absolute_error(y_valid, predictions)

def currency(val):
    return f"$ {round(str(val), decimals=2, spacing=3, spacer=',')}"

In [114]:
data = pd.read_csv("./melb_data.csv")

In [115]:
# strip rows with anything missing
dropped_rows = pd.DataFrame(row for _, row in data.iterrows() if not row.isnull().any())
row_data_count = len(dropped_rows.columns) * len(dropped_rows)
row_data_count

130116

In [116]:
dropped_cols = data.drop([col for col in data.columns if data[col].isnull().any()], axis=1)
col_data_count = len(dropped_cols.columns) * len(dropped_cols)
col_data_count

230860

In [117]:
#decide which to go with
if row_data_count > col_data_count:
    X = dropped_rows
else:
    X = dropped_cols
y = X.Price
X = X.drop(['Price'], axis=1)

In [118]:
X_training, X_validation, y_training, y_validation = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [119]:
# drop categorical columns
Xt_dropped = X_training.copy()
Xv_dropped = X_validation.copy()
cols_without_numbers = [col for col in X.columns if X[col].dtype == 'object']
print(cols_without_numbers)
for x in [Xt_dropped, Xv_dropped]:
    x.drop(cols_without_numbers, axis=1, inplace=True)

print("MAE from Approach 1 (Drop categorical variables):")
currency(score_dataset(Xt_dropped, Xv_dropped, y_training, y_validation))

['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'Regionname']
MAE from Approach 1 (Drop categorical variables):


'$ 175,703.48'

In [120]:
# Encode selected categories, drop the rest
Xt_encoded = X_training.copy()
Xv_encoded = X_validation.copy()

# print([(col, len(set(X_training[col]))) for col in X.columns if X[col].dtype == 'object'])
for x in [Xt_encoded, Xv_encoded]:
    x.drop([col for col in x.columns if X[col].dtype == 'object' and len(set(X[col])) > 10], axis=1, inplace=True)

cols_to_be_encoded = [col for col in X.columns if X[col].dtype == 'object' and len(set(X[col])) <= 10]
print(cols_to_be_encoded)

ordinal_encoder = OrdinalEncoder()
Xt_encoded[cols_to_be_encoded] = ordinal_encoder.fit_transform(X_training[cols_to_be_encoded])
Xv_encoded[cols_to_be_encoded] = ordinal_encoder.transform(X_validation[cols_to_be_encoded])

print(Xt_encoded, "\n")
print("MAE from Approach 1 (Drop categorical variables):")
print(currency(score_dataset(Xt_encoded, Xv_encoded, y_training, y_validation)))

['Type', 'Method', 'Regionname']
       Rooms  Type  Method  Distance  Postcode  Bedroom2  Bathroom  Landsize  \
12167      1   2.0     1.0       5.0    3182.0       1.0       1.0       0.0   
6524       2   0.0     2.0       8.0    3016.0       2.0       2.0     193.0   
8413       3   0.0     1.0      12.6    3020.0       3.0       1.0     555.0   
2919       3   2.0     3.0      13.0    3046.0       3.0       1.0     265.0   
6043       3   0.0     1.0      13.3    3020.0       3.0       1.0     673.0   
...      ...   ...     ...       ...       ...       ...       ...       ...   
13123      3   0.0     3.0       5.2    3056.0       3.0       1.0     212.0   
3264       3   0.0     1.0      10.5    3081.0       3.0       1.0     748.0   
9845       4   0.0     0.0       6.7    3058.0       4.0       2.0     441.0   
10799      3   0.0     1.0      12.0    3073.0       3.0       1.0     606.0   
2732       4   0.0     3.0       6.4    3011.0       4.0       2.0     319.0   

      

In [127]:
# generate the new columns and values
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
One_hotted_only_training = pd.DataFrame(OH_encoder.fit_transform(X_training[cols_to_be_encoded]))
One_hotted_only_validation = pd.DataFrame(OH_encoder.transform(X_validation[cols_to_be_encoded]))
# attach the correct index to each row
One_hotted_only_training.index = X_training.index
One_hotted_only_validation.index = X_validation.index
# print([col for col in One_hotted_only_training.columns])
One_hotted_only_training.columns = [str(col) for col in One_hotted_only_training.columns]
One_hotted_only_validation.columns = [str(col) for col in One_hotted_only_training.columns]

Xt_1hotted = pd.concat([X_training.drop(cols_without_numbers, axis=1), One_hotted_only_training], axis=1)
Xv_1hotted = pd.concat([X_validation.drop(cols_without_numbers, axis=1), One_hotted_only_validation], axis=1)

print(Xt_1hotted, "\n")
print("MAE from Approach 3 (One-Hot Encoding):") 
print(currency(score_dataset(Xt_1hotted, Xv_1hotted, y_training, y_validation)))

       Rooms  Distance  Postcode  Bedroom2  Bathroom  Landsize  Lattitude  \
12167      1       5.0    3182.0       1.0       1.0       0.0  -37.85984   
6524       2       8.0    3016.0       2.0       2.0     193.0  -37.85800   
8413       3      12.6    3020.0       3.0       1.0     555.0  -37.79880   
2919       3      13.0    3046.0       3.0       1.0     265.0  -37.70830   
6043       3      13.3    3020.0       3.0       1.0     673.0  -37.76230   
...      ...       ...       ...       ...       ...       ...        ...   
13123      3       5.2    3056.0       3.0       1.0     212.0  -37.77695   
3264       3      10.5    3081.0       3.0       1.0     748.0  -37.74160   
9845       4       6.7    3058.0       4.0       2.0     441.0  -37.73572   
10799      3      12.0    3073.0       3.0       1.0     606.0  -37.72057   
2732       4       6.4    3011.0       4.0       2.0     319.0  -37.79430   

       Longtitude  Propertycount    0  ...    6    7    8    9   10   11   