The Melbourne_housing_FULL dataset: https://www.kaggle.com/anthonypino/melbourne-housing-market/

# Import libraries and dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
df = pd.read_csv('data/Melbourne_housing_FULL.csv')
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


# Data Scrubing

In [3]:
cols = ['Address', 'Method', 'SellerG', 'Date', 'Postcode', 'Longtitude', 'Lattitude', 'Regionname', 'Propertycount']
df = df.drop(columns=cols)
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Abbotsford,2,h,,2.5,2.0,1.0,1.0,126.0,,,Yarra City Council
1,Abbotsford,2,h,1480000.0,2.5,2.0,1.0,1.0,202.0,,,Yarra City Council
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
3,Abbotsford,3,u,,2.5,3.0,2.0,1.0,0.0,,,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council


In [4]:
df.dropna(inplace=True)
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council
6,Abbotsford,4,h,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra City Council
11,Abbotsford,3,h,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,Yarra City Council
14,Abbotsford,2,h,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,Yarra City Council


In [5]:
df = pd.get_dummies(df, columns=['Suburb', 'Type', 'CouncilArea'])
df.head()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Suburb_Abbotsford,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,1,...,0,0,0,0,0,0,0,0,1,0
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,1,...,0,0,0,0,0,0,0,0,1,0
6,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,1,...,0,0,0,0,0,0,0,0,1,0
11,3,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,1,...,0,0,0,0,0,0,0,0,1,0
14,2,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,1,...,0,0,0,0,0,0,0,0,1,0


In [6]:
X = df.drop(columns='Price')
y = df['Price']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

# Build Model and Evaluate

In [8]:
model = GradientBoostingRegressor(
    n_estimators=250,
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=4,
    min_samples_leaf=6,
    max_features=0.6,
    loss='huber'
)

In [9]:
model.fit(X_train, y_train)

In [10]:
mae_train = mean_absolute_error(y_train, model.predict(X_train))
print(f'Mean absolute error train: {mae_train}')
mae_test = mean_absolute_error(y_test, model.predict(X_test))
print(f'Mean absolute error test: {mae_test}')

Mean absolute error train: 123991.67252408435
Mean absolute error test: 157873.6670865112


# GridSearch

In [14]:
from sklearn.model_selection import GridSearchCV
# Input algorithm
model = GradientBoostingRegressor()

# Set the configurations that you wish to test. To minimize processing time, limit num. of variables or experiment on each hyperparameter separately.
hyperparameters = {
    'n_estimators': [200, 300],
    'max_depth': [4, 6],
    'min_samples_split': [3, 4],
    'min_samples_leaf': [5, 6],
    'learning_rate': [0.01, 0.02],
    'max_features': [0.8, 0.9],
}

# Define grid search.
grid = GridSearchCV(model, hyperparameters, n_jobs=4)

# Run grid search on training data
grid.fit(X_train, y_train)

# Return optimal hyperparameters
grid.best_params_

# Check model accuracy using optimal hyperparameters
mae_train = mean_absolute_error(y_train, grid.predict(X_train))
print ("Training Set Mean Absolute Error: %.2f" % mae_train)
mae_test = mean_absolute_error(y_test, grid.predict(X_test))
print ("Test Set Mean Absolute Error: %.2f" % mae_test)


Training Set Mean Absolute Error: 139858.11
Test Set Mean Absolute Error: 172804.57


In [15]:
grid.best_params_

{'learning_rate': 0.02,
 'max_depth': 6,
 'max_features': 0.8,
 'min_samples_leaf': 5,
 'min_samples_split': 4,
 'n_estimators': 300}