# Building a Model in Python

---

## Import Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

## Import Dataset

In [2]:
df = pd.read_csv("data/Melbourne_housing_FULL.csv")
df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.80140,144.99580,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,1480000.0,PI,Jas,24/02/2018,6.3,3013.0,...,1.0,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
34853,Yarraville,29A Murray St,2,h,888000.0,SP,Sweeney,24/02/2018,6.3,3013.0,...,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
34854,Yarraville,147A Severn St,2,t,705000.0,S,Jas,24/02/2018,6.3,3013.0,...,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0
34855,Yarraville,12/37 Stephen St,3,h,1140000.0,SP,hockingstuart,24/02/2018,6.3,3013.0,...,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Address        34857 non-null  object 
 2   Rooms          34857 non-null  int64  
 3   Type           34857 non-null  object 
 4   Price          27247 non-null  float64
 5   Method         34857 non-null  object 
 6   SellerG        34857 non-null  object 
 7   Date           34857 non-null  object 
 8   Distance       34856 non-null  float64
 9   Postcode       34856 non-null  float64
 10  Bedroom2       26640 non-null  float64
 11  Bathroom       26631 non-null  float64
 12  Car            26129 non-null  float64
 13  Landsize       23047 non-null  float64
 14  BuildingArea   13742 non-null  float64
 15  YearBuilt      15551 non-null  float64
 16  CouncilArea    34854 non-null  object 
 17  Lattitude      26881 non-null  float64
 18  Longti

> **Note: Price values in this dataset are expressed in Australian Dollars - \\$1 AUD is approximately \\$0.77 USD (2017).**

## Data Cleaning

In [4]:
# Fix some of the column names
df.columns = ['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Latitude',
       'Longitude', 'Regionname', 'Propertycount']

In [5]:
df.sample(5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,Regionname,Propertycount
20127,Mount Waverley,18 Jeffrey St,3,h,,SN,Biggin,3/09/2017,14.2,3149.0,...,1.0,2.0,790.0,157.0,1970.0,Monash City Council,-37.88883,145.13684,Eastern Metropolitan,13366.0
15531,Caulfield South,2/203 North Rd,3,t,1237000.0,S,Gary,17/06/2017,8.7,3162.0,...,2.0,2.0,0.0,135.0,1990.0,Glen Eira City Council,-37.90034,145.01283,Southern Metropolitan,5051.0
29081,Southbank,1603/8 Dorcas St,2,u,585000.0,S,RT,25/11/2017,0.7,3006.0,...,,,,,,Melbourne City Council,,,Southern Metropolitan,8400.0
17216,Pascoe Vale,3/15 Callander Rd,2,t,,SN,Eview,27/05/2017,8.5,3044.0,...,,,,,,Moreland City Council,,,Northern Metropolitan,7485.0
6767,Middle Park,8/49 Patterson St,2,u,702000.0,SP,Marshall,18/06/2016,4.1,3206.0,...,1.0,1.0,0.0,,1970.0,Port Phillip City Council,-37.8549,144.9662,Southern Metropolitan,2019.0


In [6]:
# Drop unnecessary columns for building the model
df.drop(columns=['Address', 'Method', 'SellerG', 'Date', 'Postcode', 'Latitude',
                 'Longitude', 'Regionname', 'Propertycount'], inplace=True)
df

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Abbotsford,2,h,,2.5,2.0,1.0,1.0,126.0,,,Yarra City Council
1,Abbotsford,2,h,1480000.0,2.5,2.0,1.0,1.0,202.0,,,Yarra City Council
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
3,Abbotsford,3,u,,2.5,3.0,2.0,1.0,0.0,,,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council
...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,4,h,1480000.0,6.3,4.0,1.0,3.0,593.0,,,Maribyrnong City Council
34853,Yarraville,2,h,888000.0,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council
34854,Yarraville,2,t,705000.0,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council
34855,Yarraville,3,h,1140000.0,6.3,,,,,,,Maribyrnong City Council


### Missing Values

In [7]:
df.isnull().sum()

Suburb              0
Rooms               0
Type                0
Price            7610
Distance            1
Bedroom2         8217
Bathroom         8226
Car              8728
Landsize        11810
BuildingArea    21115
YearBuilt       19306
CouncilArea         3
dtype: int64

In [8]:
df.dropna(axis=0, inplace=True)
df

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council
6,Abbotsford,4,h,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra City Council
11,Abbotsford,3,h,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,Yarra City Council
14,Abbotsford,2,h,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,Yarra City Council
...,...,...,...,...,...,...,...,...,...,...,...,...
34847,Wollert,3,h,500000.0,25.5,3.0,2.0,2.0,383.0,118.0,2016.0,Whittlesea City Council
34849,Wollert,3,h,570000.0,25.5,3.0,2.0,2.0,404.0,158.0,2012.0,Whittlesea City Council
34853,Yarraville,2,h,888000.0,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council
34854,Yarraville,2,t,705000.0,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council


### Categorical features

In [9]:
df = pd.get_dummies(df, columns=['Suburb', 'Type', 'CouncilArea'], dtype=int)
df

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Suburb_Abbotsford,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,1,...,0,0,0,0,0,0,0,0,1,0
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,1,...,0,0,0,0,0,0,0,0,1,0
6,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,1,...,0,0,0,0,0,0,0,0,1,0
11,3,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,1,...,0,0,0,0,0,0,0,0,1,0
14,2,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34847,3,500000.0,25.5,3.0,2.0,2.0,383.0,118.0,2016.0,0,...,0,0,0,0,0,0,1,0,0,0
34849,3,570000.0,25.5,3.0,2.0,2.0,404.0,158.0,2012.0,0,...,0,0,0,0,0,0,1,0,0,0
34853,2,888000.0,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,0,...,0,0,0,0,0,0,0,0,0,0
34854,2,705000.0,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,0,...,0,0,0,0,0,0,0,0,0,0


## Split the Dataset

In [10]:
X = df.drop('Price', axis=1)
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [11]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6226, 359), (2669, 359), (6226,), (2669,))

## Select Algorithm and Configure Hyperparameters

In [12]:
model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=4,
    min_samples_leaf=5,
    max_features=0.6,
    loss='huber'
)

model.fit(X_train, y_train)

## Evaluate the Results

In [13]:
y_pred_train = model.predict(X_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
print(f"Train Set MAE: {mae_train:.2f}")

y_pred_test = model.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
print(f"Test Set MAE: {mae_test:.2f}")

Train Set MAE: 126689.09
Test Set MAE: 160807.30


# Model Optimization

In [15]:
# Input algorithm
model = GradientBoostingRegressor()

# Set the configurations that you wish to test.
hyperparameters = {
    'n_estimators':[200, 300],
    'max_depth':[4, 6],
    'min_samples_split':[3, 4],
    'min_samples_leaf':[5, 6],
    'learning_rate':[0.01, 0.02],
    'max_features':[0.8, 0.9],
    'loss':['ls', 'lad', 'huber']
}

# Define grid search.
grid = GridSearchCV(model, hyperparameters, n_jobs=4)

# Run grid search on training data
grid.fit(X_train, y_train)

# Return optimal hyperparameters
grid.best_params_

640 fits failed out of a total of 960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
250 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\JAVOHIR\miniconda3\envs\mohirdsai\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\JAVOHIR\miniconda3\envs\mohirdsai\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\JAVOHIR\miniconda3\envs\mohirdsai\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\JAVOHIR\miniconda3\envs\mohirdsai\lib\site-packages\sklearn\utils\_param_validation

{'learning_rate': 0.02,
 'loss': 'huber',
 'max_depth': 6,
 'max_features': 0.8,
 'min_samples_leaf': 5,
 'min_samples_split': 3,
 'n_estimators': 300}

In [17]:
# Check model accuracy using optimal hyperparameters
mae_train = mean_absolute_error(y_train, grid.predict(X_train))
print(f"Train MAE: {mae_train:.2f}")

mae_test = mean_absolute_error(y_test, grid.predict(X_test))
print(f"Test MAE: {mae_test:.2f}")

Train MAE: 138226.82
Test MAE: 168133.38
