In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('./data/Melbourne_housing_FULL.csv')

In [3]:
df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [4]:
# The misspelings of "longitute" and "latitude" are used, as the two misspellings were not corrected in the source file.

del df['Address']
del df['Method']
del df['SellerG']
del df['Date']
del df['Postcode']
del df['Lattitude']
del df['Longtitude']
del df['Regionname']
del df['Propertycount']

In [5]:
df.head(3)

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Abbotsford,2,h,,2.5,2.0,1.0,1.0,126.0,,,Yarra City Council
1,Abbotsford,2,h,1480000.0,2.5,2.0,1.0,1.0,202.0,,,Yarra City Council
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council


In [6]:
df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

In [7]:
features_df = pd.get_dummies(df, columns=['Suburb', 'CouncilArea', 'Type'])

In [8]:
features_df.head(n=5)

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Suburb_Abbotsford,...,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Type_h,Type_t,Type_u
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,1,...,0,0,0,0,0,1,0,1,0,0
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,1,...,0,0,0,0,0,1,0,1,0,0
6,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,1,...,0,0,0,0,0,1,0,1,0,0
11,3,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,1,...,0,0,0,0,0,1,0,1,0,0
14,2,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,1,...,0,0,0,0,0,1,0,1,0,0


In [9]:
del features_df['Price']

In [10]:
features_df.head(n=5)

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Suburb_Abbotsford,Suburb_Aberfeldie,...,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Type_h,Type_t,Type_u
2,2,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,1,0,...,0,0,0,0,0,1,0,1,0,0
4,3,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,1,0,...,0,0,0,0,0,1,0,1,0,0
6,4,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,1,0,...,0,0,0,0,0,1,0,1,0,0
11,3,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,1,0,...,0,0,0,0,0,1,0,1,0,0
14,2,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,1,0,...,0,0,0,0,0,1,0,1,0,0


In [11]:
X = features_df.as_matrix()
y = df['Price'].as_matrix()

In [12]:
len(y)

8895

In [13]:
len(X)

8895

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [15]:
len(X_train)

6226

In [16]:
len(X_test)

2669

In [17]:
len(y_train)

6226

In [18]:
len(y_test)

2669

In [19]:
model = ensemble.GradientBoostingRegressor()

In [20]:
param_grid = {
    'n_estimators': [300, 600, 1000],
    'max_depth': [7, 9],
    'min_samples_split': [3],
    'min_samples_leaf': [5],
    'learning_rate': [0.01],
    'max_features': [0.8],
    'loss': ['huber']
}

In [21]:
gs_cv = GridSearchCV(model, param_grid, n_jobs=4)

In [22]:
gs_cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [300, 600, 1000], 'max_depth': [7, 9], 'min_samples_split': [3], 'min_samples_leaf': [5], 'learning_rate': [0.01], 'max_features': [0.8], 'loss': ['huber']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
joblib.dump(gs_cv, './models/gradient_boosting/house_trained_model.pkl')

['./models/gradient_boosting/house_trained_model.pkl']

In [24]:
print(gs_cv.best_params_)

{'learning_rate': 0.01, 'loss': 'huber', 'max_depth': 9, 'max_features': 0.8, 'min_samples_leaf': 5, 'min_samples_split': 3, 'n_estimators': 1000}


In [25]:
mse = mean_absolute_error(y_train, gs_cv.predict(X_train))
print("Training Set Mean Absolute Error: %.2f" % mse)

mse = mean_absolute_error(y_test, gs_cv.predict(X_test))
print("Test Set Mean Absolute Error: %.2f" % mse)

Training Set Mean Absolute Error: 90779.66
Test Set Mean Absolute Error: 158749.31
