# Melbourne Housing Market
Machine Learning Practice
Algorithms used:
- Gradient Boosting Regressor (scikit-learn `GradientBoostingRegressor`)

## Import Dependencies

In [1]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
# from sklearn.externals import joblib
import joblib

import datetime
import time

## Read in data from csv
Original data source (Kaggle): https://www.kaggle.com/anthonypino/melbourne-housing-market

In [2]:
# Raw CSV of the full Melbourne housing dataset, served from GitHub
url = 'https://raw.githubusercontent.com/JacobMannix/melbourne-housing/main/data/Melbourne_housing_FULL.csv'
# pandas downloads the file over HTTP, so this cell needs network access
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Show the first five rows to sanity-check the loaded columns and values
df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


## Clean Data

### Delete unneeded columns

In [4]:
# Columns excluded from the feature set. The spellings 'Lattitude' and
# 'Longtitude' match the dataset's own column names and must stay as-is.
# Named `columns_to_drop` rather than `list` so the built-in `list` type is
# not shadowed for the rest of the notebook.
columns_to_drop = [
    'Address', 'Method', 'SellerG', 'Date', 'Postcode',
    'Lattitude', 'Longtitude', 'Regionname', 'Propertycount',
]

# Remove all of them in a single call; raises KeyError if any name is missing,
# just as deleting the columns one by one would.
df = df.drop(columns=columns_to_drop)

### Remove rows with missing values

In [5]:
# Keep only rows in which every remaining column is populated.
# `how` and `thresh` are mutually exclusive: current pandas raises TypeError
# when both are passed (even as thresh=None), so only `how` is given here.
df = df.dropna(axis=0, how='any')

### Convert non-numerical data using one-hot encoding

In [6]:
# One-hot encode the categorical columns; every other column passes through unchanged
categorical_columns = ['Suburb', 'CouncilArea', 'Type']
features_df = pd.get_dummies(data=df, columns=categorical_columns)

In [7]:
# Price is the value being predicted, so it must not remain among the input features
features_df.drop(columns=['Price'], inplace=True)

## Prepare data for model

### Create X and y arrays from the dataset

In [8]:
# Feature matrix as an explicit float64 array. Newer pandas versions emit
# boolean dummy columns from get_dummies; mixed with the numeric columns,
# `.values` would then produce an object-dtype array. Requesting float64
# gives the same numeric matrix regardless of pandas version.
X = features_df.to_numpy(dtype='float64')

# Target vector: the sale price, taken from the same rows that produced X
y = df['Price'].to_numpy()

### Split data into train/test sets (70% train / 30% test) and shuffle

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the shuffle repeatable
split = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
X_train, X_test, y_train, y_test = split

## Regular Model

In [10]:
# Setting up model
# model = ensemble.GradientBoostingRegressor(
#     n_estimators=250,
#     learning_rate=0.1,
#     max_depth=5,
#     min_samples_split=4,
#     min_samples_leaf=6,
#     max_features=0.6,
#     loss='huber'
# )

In [11]:
# Run model on training data
# model.fit(X_train, y_train)

In [12]:
# Save Model to file
# joblib.dump(model, 'models/trained_model.pkl')

## Grid Search

### Define model

### Setup grid search

In [13]:
# Base estimator for the grid search. The seed is fixed because max_features
# below 1.0 makes each split consider a random subset of features; without a
# seed, repeated runs would give different models and error figures.
model = ensemble.GradientBoostingRegressor(random_state=0)

# One list of candidate values per hyperparameter; GridSearchCV evaluates
# every combination. Each list currently holds a single value, and the
# trailing comments list alternative values for widening the search.
param_grid = {
    'learning_rate': [0.1],      # alternatives: 0.01, 0.02, 0.6, 0.7
    # Loss names: 'squared_error' and 'absolute_error' were called 'ls' and
    # 'lad' in older scikit-learn releases.
    'loss': ['huber'],           # alternatives: 'squared_error', 'absolute_error'
    'max_depth': [6],            # alternatives: 5, 7, 9, 11
    'max_features': [0.3],       # alternatives: 0.1 - 0.9 in steps of 0.1
    'min_samples_leaf': [6],     # alternatives: 5, 7
    'min_samples_split': [4],    # alternatives: 3, 5
    'n_estimators': [300]        # alternatives: 100, 200, 600, 1000
}

### Define Grid Search
Run with four CPUs in parallel if applicable.

In [14]:
# Show how many CPUs this machine reports, as a sanity check for the
# n_jobs value passed to the grid search.
import multiprocessing
multiprocessing.cpu_count()

4

In [15]:
# Exhaustive search over param_grid with the default cross-validation
# settings, evaluating candidates on 4 parallel workers
gs_cv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=4,
)

### Run grid search on training data
+ keep track of time

In [16]:
# Record the wall-clock time just before fitting
start_time = datetime.datetime.now()

# Run the search on the training split
gs_cv.fit(X_train, y_train)

# Report how long the search took
elapsed = datetime.datetime.now() - start_time
print(elapsed)

0:00:53.542258


### Print optimal hyperparameters

In [17]:
# best_params_ is the hyperparameter combination that scored best in the search
print(gs_cv.best_params_)

{'learning_rate': 0.1, 'loss': 'huber', 'max_depth': 6, 'max_features': 0.3, 'min_samples_leaf': 6, 'min_samples_split': 4, 'n_estimators': 300}


### Check model accuracy
(mean absolute error in dollars, reported to 2 decimal places; lower is better)

In [18]:
# Evaluate the tuned model with mean absolute error (MAE): the average
# absolute difference between predicted and actual prices.
# gs_cv.predict uses the best estimator found by the search.
mae_train = mean_absolute_error(y_train, gs_cv.predict(X_train))
mae_test = mean_absolute_error(y_test, gs_cv.predict(X_test))

# A test error well above the training error suggests overfitting
print(f'Training Set Mean Absolute Error: ${mae_train:,.2f}')
print(f'Test Set Mean Absolute Error: ${mae_test:,.2f}')

# These values were previously stored under the misleading names mse_train /
# mse_test (they are MAE, not MSE). The old names are kept as aliases so any
# code that still refers to them keeps working.
mse_train, mse_test = mae_train, mae_test

Training Set Mean Absolute Error: $112,816.69
Test Set Mean Absolute Error: $155,261.10
