In [1]:
from flask import Flask, jsonify, render_template
import numpy as np
import datetime as dt
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from config import password
import pandas as pd

In [2]:
connection_string = f"postgres:{password}@perth-property-market.cptzycsh4y3w.ap-southeast-2.rds.amazonaws.com:5432/perth-property-market"

# Create the database engine (to the PostgreSQL database)
engine = create_engine(f'postgresql://{connection_string}')
conn = engine.connect()
session = Session(bind=engine)

In [3]:
Base = automap_base()

Base.prepare(engine, reflect=True)

market_data = Base.classes.perth_market

In [4]:
properties = session.query(
    market_data.price, market_data.bedrooms, market_data.bathrooms, market_data.land_area,
    market_data.floor_area, market_data.build_year, market_data.cbd_dist, market_data.year_sold).all()

In [5]:
session.close()


In [6]:
property_list = []
for row in properties:
    property = list(np.ravel(row))
    property_list.append(property)
    

df = pd.DataFrame(property_list, columns=["Price", "Bedrooms", "Bathrooms", "Land_Area", 
"Floor_Area", "Build_Year", "CBD_Dist", "Year_Sold"])

df.head(2)

Unnamed: 0,Price,Bedrooms,Bathrooms,Land_Area,Floor_Area,Build_Year,CBD_Dist,Year_Sold
0,565000,4,2,600,160,2003,18300,2018
1,365000,3,2,351,139,2013,26900,2019


In [20]:
df["Price"] = (round(df["Price"]/10000)*10000).astype("int")

In [21]:
df["Price"].value_counts()

480000     778
420000     755
440000     706
400000     705
520000     696
          ... 
1990000      1
2330000      1
2070000      1
2430000      1
2210000      1
Name: Price, Length: 230, dtype: int64

# Linear Regression

In [22]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model

LinearRegression()

In [23]:
X = df.drop(["Price"], axis=1)
y = df["Price"].values.reshape(-1, 1)
print(X.shape, y.shape)

(25683, 7) (25683, 1)


In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [25]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)


In [26]:
# Transform the training and testing data using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)


In [27]:
model.fit(X_train_scaled, y_train_scaled)

LinearRegression()

In [28]:
prediction = model.predict(X_test_scaled)
prediction

array([[-0.25324337],
       [ 0.89312537],
       [ 0.79065645],
       ...,
       [ 0.61728203],
       [-0.45948126],
       [-0.3777352 ]])

### Inversing the transformation

In [29]:
y_scaler.inverse_transform(prediction)

array([[547682.93379678],
       [953264.4782978 ],
       [917011.30669132],
       ...,
       [855671.99725694],
       [474716.63704416],
       [503638.12868125]])

In [30]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train_scaled)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test_scaled)}")

Training Data Score: 0.5656473896393996
Testing Data Score: 0.5615083740428672


# Lasso

In [16]:
# LASSO model
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.01).fit(X_train_scaled, y_train_scaled)

print(f"Training Data Score: {lasso.score(X_train_scaled, y_train_scaled)}")
print(f"Testing Data Score: {lasso.score(X_test_scaled, y_test_scaled)}")

Training Data Score: 0.5638931434944011
Testing Data Score: 0.5632725046504143


# Ridge

In [34]:
# Ridge model
from sklearn.linear_model import Ridge

### BEGIN SOLUTION
ridge = Ridge(alpha=.01).fit(X_train_scaled, y_train_scaled)

print(f"Training Data Score: {ridge.score(X_train_scaled, y_train_scaled)}")
print(f"Testing Data Score: {ridge.score(X_test_scaled, y_test_scaled)}")

Training Data Score: 0.5650046098492716
Testing Data Score: 0.5638247390924325


## Grid Search

In [35]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'alpha': [0.01, 0.1, 1]}
grid = GridSearchCV(ridge, param_grid, verbose=3)

In [36]:
# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train_scaled, y_train_scaled)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ........................alpha=0.01;, score=0.563 total time=   0.0s
[CV 2/5] END ........................alpha=0.01;, score=0.562 total time=   0.0s
[CV 3/5] END ........................alpha=0.01;, score=0.579 total time=   0.0s
[CV 4/5] END ........................alpha=0.01;, score=0.553 total time=   0.0s
[CV 5/5] END ........................alpha=0.01;, score=0.563 total time=   0.0s
[CV 1/5] END .........................alpha=0.1;, score=0.563 total time=   0.0s
[CV 2/5] END .........................alpha=0.1;, score=0.562 total time=   0.0s
[CV 3/5] END .........................alpha=0.1;, score=0.579 total time=   0.0s
[CV 4/5] END .........................alpha=0.1;, score=0.553 total time=   0.0s
[CV 5/5] END .........................alpha=0.1;, score=0.563 total time=   0.0s
[CV 1/5] END ...........................alpha=1;, score=0.563 total time=   0.0s
[CV 2/5] END ...........................alpha=1;,

GridSearchCV(estimator=Ridge(alpha=0.01), param_grid={'alpha': [0.01, 0.1, 1]},
             verbose=3)

In [37]:
# List the best parameters for this dataset
print(grid.best_params_)

{'alpha': 1}


In [17]:
print('Train Acc: %.3f' % grid.score(X_train_scaled, y_train_scaled))
print('Test Acc: %.3f' % grid.score(X_test_scaled, y_test_scaled))

NameError: name 'grid' is not defined

# Save Model

In [18]:
# save the model to disk

# Linear
# *****************
import joblib
filename_linear = '../models/price_model_linear.sav'
joblib.dump(model, filename_linear)

# # Lasso
# # *****************
# filename_lasso = '../models/price_model_lasso.sav'
# joblib.dump(lasso, filename_lasso)


# # Ridge
# # *****************
# filename = '../models/price_model_ridge.sav'
# joblib.dump(grid, filename)

['../models/price_model_linear.sav']

In [19]:
# Save x & y Scalars
xscaler_path = '../models/price_model_lasso_xscaler.sav'
yscaler_path = '../models/price_model_lasso_yscaler.sav'

joblib.dump(X_scaler, xscaler_path)
joblib.dump(y_scaler, yscaler_path) 

['../models/price_model_lasso_yscaler.sav']

# StatsModel

In [41]:
import statsmodels.api as sm


In [42]:
model_arima = sm.OLS(y_train_scaled, X_train_scaled)
results = model_arima.fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.565
Model:                            OLS   Adj. R-squared (uncentered):              0.565
Method:                 Least Squares   F-statistic:                              3573.
Date:                Sun, 21 Nov 2021   Prob (F-statistic):                        0.00
Time:                        00:26:59   Log-Likelihood:                         -19315.
No. Observations:               19262   AIC:                                  3.864e+04
Df Residuals:                   19255   BIC:                                  3.870e+04
Df Model:                           7                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------