In [89]:
import os

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [62]:
# Let wide frames render up to 500 columns instead of being truncated.
pd.set_option("display.max_columns", 500)

# First Machine Learning Model - Housing data

In [63]:
# Location of the Melbourne housing CSV.
# Set the MELB_DATA_PATH environment variable to point at melb_data.csv when
# running on another machine; without it the original local path is used.
melbourne_file_path = os.environ.get(
    "MELB_DATA_PATH",
    r"C:\Users\Benny Boi\Repositories\Medoci\data\melb_data.csv",
)
df = pd.read_csv(melbourne_file_path)

In [64]:
# Peek at the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [65]:
# List the available column names (note the dataset's own spellings,
# e.g. 'Lattitude' / 'Longtitude', which must be used verbatim below).
df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

## Cleaning data

In [66]:
# Number of missing values in each column.
df.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [67]:
# Show one example row for each of the columns with many missing values.
# The previous version ended each line with a comma, which made every line a
# separate throw-away 1-tuple statement, so only the last frame was rendered.
# display() renders all three. ('Car' was commented out before and is still skipped.)
display(
    df.loc[df['BuildingArea'].isnull()].head(1),
    df.loc[df['YearBuilt'].isnull()].head(1),
    df.loc[df['CouncilArea'].isnull()].head(1),
)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
7584,Brighton East,7 Roberts Ct,3,h,1270000.0,VB,Ray,20/05/2017,10.7,3187.0,3.0,1.0,3.0,724.0,,,,-37.9291,145.0297,Southern Metropolitan,6938.0


In [68]:
# Keep only fully populated rows: a row is dropped if any of its columns is missing.
df = df.dropna(axis="index", how="any")

## Feature Selection

In [69]:
# Prediction target: the sale price column, selected with bracket notation
y = df['Price']

In [70]:
# Columns used as model inputs (spelled exactly as they appear in the dataset).
FEATURES = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

In [71]:
# Feature matrix: every remaining row, restricted to the FEATURES columns
X = df.loc[:, FEATURES]

In [72]:
# Summary statistics of the selected features (count, mean, spread, quartiles).
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [73]:
# First rows of the feature matrix; the index has gaps because rows were dropped earlier.
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


## Modeling

In [74]:
# Unfitted decision-tree regressor; random_state is fixed so refits are repeatable.
model = DecisionTreeRegressor(random_state=42)

In [75]:
# Fit on the complete dataset (no hold-out yet).
model.fit(X,y)

Predicting 5 houses

In [76]:
# Predict prices for the first five rows of the feature matrix.
predicted = model.predict(X.iloc[:5])

In [77]:
# Actual sale prices of the same five rows, for comparison with `predicted`.
y.head()

1    1035000.0
2    1465000.0
4    1600000.0
6    1876000.0
7    1636000.0
Name: Price, dtype: float64

## Model Evaluation

Bad: the model is scored on the same rows it was trained on, so the error looks far smaller than it would on unseen data and any overfitting goes unnoticed.

In [78]:
# In-sample predictions: X is exactly the data the model was fitted on.
predicted = model.predict(X)

In [79]:
# Mean absolute error measured on the training rows themselves.
mean_absolute_error(y_true=y, y_pred=predicted)

1115.7467183128902

Better: split the data beforehand and hold part of it out, so the model is validated on rows it never saw during training.

In [81]:
# Split features and target into a training part and a held-out part;
# the fixed random_state makes the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=42)

In [82]:
# Fresh, unfitted tree: rebinding `model` replaces the one trained on the full dataset.
model = DecisionTreeRegressor(random_state=42)

In [83]:
# Fit on the training part only; the held-out rows stay unseen.
model.fit(train_X, train_y)

In [84]:
# Predict prices for the held-out rows.
test_predictions = model.predict(test_X)

In [85]:
# Mean absolute error on rows the model never saw during fitting.
mean_absolute_error(y_true=test_y, y_pred=test_predictions)

256197.80116204004

About \$1,116 on the training data vs about \$256,000 on the held-out data: a big difference.

## Controlling the tree size to help balance the model and avoid overfitting and underfitting

**Overfitting:** capturing spurious patterns that won't recur in the future, leading to less accurate predictions.<br>
**Underfitting:** failing to capture relevant patterns, again leading to less accurate predictions.

In [86]:
def get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y):
    """Return the hold-out MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A new ``DecisionTreeRegressor`` (``random_state=42``) is fitted on
    ``train_X``/``train_y``; its predictions for ``test_X`` are then scored
    against ``test_y`` with ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(
        max_leaf_nodes=max_leaf_nodes,
        random_state=42,
    ).fit(train_X, train_y)
    return mean_absolute_error(test_y, tree.predict(test_X))

In [88]:
# Sweep tree sizes from very small to very large and report the hold-out MAE of each.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes=max_leaf_nodes,
        train_X=train_X,
        test_X=test_X,
        train_y=train_y,
        test_y=test_y,
    )
    # %d truncates the float MAE to a whole number for display.
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  369120
Max leaf nodes: 50  		 Mean Absolute Error:  270711
Max leaf nodes: 500  		 Mean Absolute Error:  243840
Max leaf nodes: 5000  		 Mean Absolute Error:  255896


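Of the values tried, 500 leaf nodes gives the lowest validation MAE. (Because this value was picked using the same held-out data, its MAE is a slightly optimistic estimate.)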

## Using Random Forest

In [90]:
# Random forest with library-default settings; random_state fixed for repeatable results.
forest_model = RandomForestRegressor(random_state=42)

In [91]:
# Fit on the same training split used for the decision trees, so the scores are comparable.
forest_model.fit(train_X, train_y)

In [92]:
# Predict prices for the held-out rows with the forest.
forest_preds = forest_model.predict(test_X)

In [93]:
# Hold-out mean absolute error of the random forest.
mean_absolute_error(y_true=test_y, y_pred=forest_preds)

188379.909836761