# Melbourne Housing Market
Machine Learning Practice
Algorithms used:
- Gradient Boosting Regressor

## Import Dependencies

In [3]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
# from sklearn.externals import joblib
import joblib

import datetime
import time

## Read in data from csv
Dataset source: https://www.kaggle.com/anthonypino/melbourne-housing-market

In [4]:
# Location of the CSV that the next cell passes to pd.read_csv.
# NOTE(review): this looks like a pre-signed S3 link (X-Amz-Date=20201017T150523Z,
# X-Amz-Expires=86400), so it has presumably expired and also embeds an access-key
# id in the query string. TODO: point this at a stable copy of the Kaggle dataset.
url = 'https://s3.us-west-2.amazonaws.com/secure.notion-static.com/7d63666c-548b-4d0e-a443-e72ccb674aff/Melbourne_housing_FULL.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAT73L2G45O3KS52Y5%2F20201017%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20201017T150523Z&X-Amz-Expires=86400&X-Amz-Signature=0e872ff79ada6bff43104ef5f8158331b38d7fb8d70e9499514676b407d6f1bb&X-Amz-SignedHeaders=host&response-content-disposition=filename%20%3D%22Melbourne_housing_FULL.csv%22'

In [5]:
# Read the CSV at `url` into a DataFrame; all later cleaning cells operate on `df`.
df = pd.read_csv(url)

In [6]:
# Preview data
# df.head()

## Clean Data

### Delete unneeded columns

In [7]:
# Columns removed before modelling. Names are kept exactly as in the original
# cell (including the 'Lattitude' / 'Longtitude' spellings).
# The collection is deliberately NOT called `list`: rebinding that name would
# shadow the builtin for every later cell in the kernel session.
columns_to_drop = [
    'Address', 'Method', 'SellerG', 'Date', 'Postcode',
    'Lattitude', 'Longtitude', 'Regionname', 'Propertycount',
]

# Drop them in one call; like the original `del` loop, this raises KeyError
# if any listed column is missing, so a misspelled name is not silently ignored.
df = df.drop(columns=columns_to_drop)

### Remove rows with missing values

In [8]:
# Keep only rows with no missing value in any column.
# `how` and `thresh` are mutually exclusive in current pandas (passing both,
# even thresh=None, raises TypeError), so only `how='any'` is given.
df = df.dropna(axis=0, how='any')

### Convert non-numerical data using one-hot encoding

In [9]:
# One-hot encode the listed columns; columns not listed are carried over as-is.
categorical_columns = ['Suburb', 'CouncilArea', 'Type']
features_df = pd.get_dummies(data=df, columns=categorical_columns)

In [10]:
# 'Price' is the prediction target, so it must not remain among the features.
features_df = features_df.drop(columns=['Price'])

## Prepare data for model

### Create X and y arrays from the dataset

In [11]:
# Feature matrix: every column of the encoded frame, as a NumPy array.
X = features_df.to_numpy()
# Target vector: the 'Price' column of the cleaned (pre-encoding) frame.
y = df['Price'].to_numpy()

### Split data into train/test sets (70/30 split) with shuffling

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,   # hold out 30% of the rows for testing
    random_state=0,  # fixed seed so the shuffled split is repeatable
)

## Regular Model

In [None]:
# Setting up model
# model = ensemble.GradientBoostingRegressor(
#     n_estimators=250,
#     learning_rate=0.1,
#     max_depth=5,
#     min_samples_split=4,
#     min_samples_leaf=6,
#     max_features=0.6,
#     loss='huber'
# )

In [None]:
# Run model on training data
# model.fit(X_train, y_train)

In [None]:
# Save Model to file
# joblib.dump(model, 'models/trained_model.pkl')

## Grid Search

### Define model

### Set up grid search

In [None]:
# Setting up Grid Search
# Base estimator whose hyperparameters the grid search will vary.
# random_state is fixed because max_features < 1.0 makes feature selection at
# each split random; without a seed the scores differ between runs.
model = ensemble.GradientBoostingRegressor(random_state=0)

# Each key is a GradientBoostingRegressor parameter and each value is the list
# of candidates to try. The trailing comments hold further candidates that are
# currently disabled to keep the search fast.
param_grid = {
    'n_estimators': [300],       # ,600,1000
    'learning_rate': [0.1],      # 0.01,0.02,0.6,0.7
    'max_depth': [7],            # ,9,11
    'min_samples_split': [3],    # ,4,5
    'min_samples_leaf': [5],     # ,6,7
    'max_features': [0.8],       # , 0.9
    # 'ls' / 'lad' were renamed to 'squared_error' / 'absolute_error' in
    # scikit-learn 1.0 and the old names were removed in 1.2.
    'loss': ['squared_error'],   # ,'absolute_error','huber'
}

### Define Grid Search
Run with four CPUs in parallel if applicable.

In [1]:
# Show how many CPUs this machine reports, as a sanity check before picking
# the number of parallel jobs for the grid search.
import multiprocessing
multiprocessing.cpu_count()

4

In [None]:
# Exhaustive search over `param_grid`, run as four parallel jobs.
gs_cv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=4,
)

### Run grid search on training data
+ keep track of time

In [4]:
# Fit every parameter combination on the training data and report how long it took.
start_time = datetime.datetime.now()
gs_cv.fit(X_train, y_train)
elapsed = datetime.datetime.now() - start_time

print(elapsed)

0:00:10.003249


### Print optimal hyperparameters

In [None]:
print(gs_cv.best_params_)

### Check model error
(mean absolute error, 2 decimal places)

In [None]:
# Check model error: mean absolute error in dollars (2 decimal places).
# The values are named `mae_*` because the metric is mean ABSOLUTE error, not
# mean squared error. A large gap between the two numbers suggests overfitting.
mae_train = mean_absolute_error(y_train, gs_cv.predict(X_train))
print('Training Set Mean Absolute Error: ${:,.2f}'.format(mae_train))

mae_test = mean_absolute_error(y_test, gs_cv.predict(X_test))
print('Test Set Mean Absolute Error: ${:,.2f}'.format(mae_test))