In [1]:
from flask import Flask, jsonify, render_template
import numpy as np
import datetime as dt
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from config import password
import pandas as pd

In [2]:
connection_string = f"postgres:{password}@perth-property-market.cptzycsh4y3w.ap-southeast-2.rds.amazonaws.com:5432/perth-property-market"

# Create the database engine (to the PostgreSQL database)
engine = create_engine(f'postgresql://{connection_string}')
conn = engine.connect()
session = Session(bind=engine)

In [3]:
Base = automap_base()

Base.prepare(engine, reflect=True)

market_data = Base.classes.perth_market

In [4]:
properties = session.query(
    market_data.price, market_data.bedrooms, market_data.bathrooms, market_data.land_area,
    market_data.floor_area, market_data.build_year, market_data.cbd_dist, market_data.year_sold).all()

In [5]:
session.close()


In [6]:
property_list = []
for row in properties:
    property = list(np.ravel(row))
    property_list.append(property)
    

df = pd.DataFrame(property_list, columns=["Price", "Bedrooms", "Bathrooms", "Land_Area", 
"Floor_Area", "Build_Year", "CBD_Dist", "Year_Sold"])

df

Unnamed: 0,Price,Bedrooms,Bathrooms,Land_Area,Floor_Area,Build_Year,CBD_Dist,Year_Sold
0,565000,4,2,600,160,2003,18300,2018
1,365000,3,2,351,139,2013,26900,2019
2,287000,3,1,719,86,1979,22600,2015
3,255000,2,1,651,59,1953,17900,2018
4,325000,4,1,466,131,1998,11200,2016
...,...,...,...,...,...,...,...,...
25678,423000,3,2,248,108,2011,24900,2016
25679,467000,4,2,400,132,2014,17100,2015
25680,955000,3,2,200,127,1997,16000,2019
25681,1040000,4,3,292,245,2013,16100,2016


In [18]:
df["Price"] = (round(df["Price"]/10000)*10000).astype("int")

In [19]:
df["Price"].value_counts()

480000     778
420000     755
440000     706
400000     705
520000     696
          ... 
1990000      1
2330000      1
2070000      1
2430000      1
2210000      1
Name: Price, Length: 230, dtype: int64

# Gradient Boosting Regression

In [20]:
# from sklearn.inspection import permutation_importance
# from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

In [21]:
X = df.drop(["Price"], axis=1)
y = df["Price"].values.reshape(-1, 1)
print(X.shape, y.shape)

(25683, 7) (25683, 1)


In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [23]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)


In [24]:
# Transform the training and testing data using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)


In [25]:
reg = GradientBoostingRegressor()
reg.fit(X_train_scaled, y_train_scaled)



  return f(*args, **kwargs)


GradientBoostingRegressor()

In [26]:
print(f"Training Data Score: {reg.score(X_train_scaled, y_train_scaled)}")
print(f"Testing Data Score: {reg.score(X_test_scaled, y_test_scaled)}")

Training Data Score: 0.6863557785576773
Testing Data Score: 0.6846623874392768


## Important Features

In [14]:
feature_names = X.columns
feature_names

Index(['Bedrooms', 'Bathrooms', 'Land_Area', 'Floor_Area', 'Build_Year',
       'CBD_Dist', 'Year_Sold'],
      dtype='object')

In [15]:
sorted(zip(reg.feature_importances_, feature_names), reverse=True)

[(0.5022042099990027, 'Floor_Area'),
 (0.3123867860252648, 'CBD_Dist'),
 (0.07759921163830995, 'Build_Year'),
 (0.0667742449247468, 'Bathrooms'),
 (0.028111445303772364, 'Land_Area'),
 (0.008913153631342987, 'Year_Sold'),
 (0.004010948477560423, 'Bedrooms')]

# Save Model

In [16]:
# save the model to disk
import joblib
filename = '../models/price_model_gradientBoostReg.sav'
joblib.dump(reg, filename)


['../models/price_model_gradientBoostReg.sav']

In [17]:
# load the model from disk
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test_scaled, y_test_scaled)
print(result)

0.6822683224534751
