In [1]:
# from flask import Flask, jsonify, render_template
import numpy as np
import datetime as dt
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from config import password
import pandas as pd
import joblib


In [24]:
connection_string = f"postgres:{password}@perth-property-market.cptzycsh4y3w.ap-southeast-2.rds.amazonaws.com:5432/perth-property-market"

# Create the database engine (to the PostgreSQL database)
engine = create_engine(f'postgresql://{connection_string}')
conn = engine.connect()
session = Session(bind=engine)

In [25]:
Base = automap_base()

Base.prepare(engine, reflect=True)

market_data = Base.classes.perth_market

In [26]:
properties = session.query(
    market_data.price, market_data.bedrooms, market_data.bathrooms, market_data.land_area,
    market_data.floor_area, market_data.build_year, market_data.cbd_dist, market_data.year_sold).all()

In [27]:
session.close()


In [28]:
property_list = []
for row in properties:
    property = list(np.ravel(row))
    property_list.append(property)
    

df = pd.DataFrame(property_list, columns=["Price", "Bedrooms", "Bathrooms", "Land_Area", 
"Floor_Area", "Build_Year", "CBD_Dist", "Year_Sold"])

df

Unnamed: 0,Price,Bedrooms,Bathrooms,Land_Area,Floor_Area,Build_Year,CBD_Dist,Year_Sold
0,565000,4,2,600,160,2003,18300,2018
1,365000,3,2,351,139,2013,26900,2019
2,287000,3,1,719,86,1979,22600,2015
3,255000,2,1,651,59,1953,17900,2018
4,325000,4,1,466,131,1998,11200,2016
...,...,...,...,...,...,...,...,...
25678,423000,3,2,248,108,2011,24900,2016
25679,467000,4,2,400,132,2014,17100,2015
25680,955000,3,2,200,127,1997,16000,2019
25681,1040000,4,3,292,245,2013,16100,2016


In [36]:
df["Price"] = (round(df["Price"]/10000)*10000).astype("int")

In [37]:
df["Price"].value_counts()

480000     778
420000     755
440000     706
400000     705
520000     696
          ... 
1990000      1
2330000      1
2070000      1
2430000      1
2210000      1
Name: Price, Length: 230, dtype: int64

# Random Forest Regression

In [38]:
from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor()
regr

RandomForestRegressor()

In [39]:
X = df.drop(["Price"], axis=1)
y = df["Price"].values.reshape(-1, 1)

print(X.shape, y.shape)

(25683, 7) (25683, 1)


In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [41]:
regr.fit(X_train, y_train)


  """Entry point for launching an IPython kernel.


RandomForestRegressor()

In [42]:
print(f"Training Data Score: {regr.score(X_train, y_train)}")
print(f"Testing Data Score: {regr.score(X_test, y_test)}")

Training Data Score: 0.9557026304425713
Testing Data Score: 0.6950187538796209


In [43]:
feature_names = X.columns
feature_names

Index(['Bedrooms', 'Bathrooms', 'Land_Area', 'Floor_Area', 'Build_Year',
       'CBD_Dist', 'Year_Sold'],
      dtype='object')

In [44]:
sorted(zip(regr.feature_importances_, feature_names), reverse=True)

[(0.43129984763168483, 'Floor_Area'),
 (0.28614365438427986, 'CBD_Dist'),
 (0.10753322248993848, 'Build_Year'),
 (0.0950519833663523, 'Land_Area'),
 (0.03743404996532629, 'Year_Sold'),
 (0.02406746132906541, 'Bathrooms'),
 (0.01846978083335289, 'Bedrooms')]

## Saving Model

In [45]:
# save the model to disk
filename = '../models/price_model_randomForest_roundedprice.sav'
joblib.dump(regr, filename)


['../models/price_model_randomForest_roundedprice.sav']

In [46]:
# load the model from disk
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, y_test)
print(result)

0.6950187538796209
