In [1]:
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from config import password

In [18]:
import scipy.stats as stats

# Import Data

In [3]:
# Load the raw Perth sales listing and preview its first two rows
csv_path = "csv/perth_market.csv"
perth_df = pd.read_csv(filepath_or_buffer=csv_path)
perth_df.head(n=2)

Unnamed: 0,ADDRESS,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK
0,1 Acorn Place,South Lake,565000,4,2,2.0,600,160,2003.0,18300,Cockburn Central Station,1800,09-2018\r,6164,-32.1159,115.84245,LAKELAND SENIOR HIGH SCHOOL,0.828339,
1,1 Addis Way,Wandi,365000,3,2,2.0,351,139,2013.0,26900,Kwinana Station,4900,02-2019\r,6167,-32.19347,115.859553,ATWELL COLLEGE,5.524324,129.0


In [4]:
# Discard the columns that the analysis does not use
cleaned_perth_df = perth_df.drop(
    labels=["BATHROOMS", "GARAGE", "FLOOR_AREA", "POSTCODE", "NEAREST_SCH_RANK"],
    axis="columns",
)

cleaned_perth_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33656 entries, 0 to 33655
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ADDRESS           33656 non-null  object 
 1   SUBURB            33656 non-null  object 
 2   PRICE             33656 non-null  int64  
 3   BEDROOMS          33656 non-null  int64  
 4   LAND_AREA         33656 non-null  int64  
 5   BUILD_YEAR        30501 non-null  float64
 6   CBD_DIST          33656 non-null  int64  
 7   NEAREST_STN       33656 non-null  object 
 8   NEAREST_STN_DIST  33656 non-null  int64  
 9   DATE_SOLD         33656 non-null  object 
 10  LATITUDE          33656 non-null  float64
 11  LONGITUDE         33656 non-null  float64
 12  NEAREST_SCH       33656 non-null  object 
 13  NEAREST_SCH_DIST  33656 non-null  float64
dtypes: float64(4), int64(5), object(5)
memory usage: 3.6+ MB


In [5]:
# Convert ADDRESS to a FULL ADDRESS ("<address>, <suburb>") so that every sold
# property gets a unique address.
# The value is rebuilt from the raw `perth_df` columns rather than from the column
# being overwritten, so re-running this cell does not append the suburb a second time.
# `str.cat` is vectorised; a row missing either part would become NaN, but both
# columns are fully populated in this dataset (see the .info() output above).
cleaned_perth_df["ADDRESS"] = perth_df["ADDRESS"].str.cat(perth_df["SUBURB"], sep=", ")

cleaned_perth_df["ADDRESS"]

0              1 Acorn Place, South Lake
1                     1 Addis Way, Wandi
2               1 Ainsley Court, Camillo
3              1 Albert Street, Bellevue
4                1 Aman Place, Lockridge
                      ...               
33651    9C Gold Street, South Fremantle
33652        9C Pycombe Way, Westminster
33653        9D Pycombe Way, Westminster
33654        9D Shalford Way, Girrawheen
33655        9E Margaret Street, Midland
Name: ADDRESS, Length: 33656, dtype: object

In [6]:
# Show every row whose full address occurs more than once
full_address = cleaned_perth_df["ADDRESS"]
cleaned_perth_df[full_address.duplicated(keep=False)]

Unnamed: 0,ADDRESS,SUBURB,PRICE,BEDROOMS,LAND_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST
4149,"123 Fairway, Crawley",Crawley,818000,4,160,,5500,Daglish Station,3700,02-2018\r,-31.985151,115.815463,SHENTON COLLEGE,3.065177
4150,"123 Fairway, Crawley",Crawley,818000,4,160,,5500,Daglish Station,3700,02-2018\r,-31.985152,115.815314,SHENTON COLLEGE,3.059929
4151,"123 Fairway, Crawley",Crawley,818000,4,160,,5500,Daglish Station,3700,02-2018\r,-31.985133,115.815176,SHENTON COLLEGE,3.053099
20655,"4 Carmel Road, Carmel",Carmel,955000,4,15761,2009.0,21700,Madding,9300,09-2013\r,-32.019395,116.074759,CARMEL ADVENTIST COLLEGE,1.890651
20656,"4 Carmel Road, Carmel",Carmel,955000,4,15761,2009.0,21700,Madding,9300,09-2013\r,-32.019657,116.096652,CARMEL ADVENTIST COLLEGE,0.600853


In [7]:
# For each repeated address keep only its last record (modifies the frame in place)
cleaned_perth_df.drop_duplicates(subset="ADDRESS", keep="last", inplace=True)
cleaned_perth_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33653 entries, 0 to 33655
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ADDRESS           33653 non-null  object 
 1   SUBURB            33653 non-null  object 
 2   PRICE             33653 non-null  int64  
 3   BEDROOMS          33653 non-null  int64  
 4   LAND_AREA         33653 non-null  int64  
 5   BUILD_YEAR        30500 non-null  float64
 6   CBD_DIST          33653 non-null  int64  
 7   NEAREST_STN       33653 non-null  object 
 8   NEAREST_STN_DIST  33653 non-null  int64  
 9   DATE_SOLD         33653 non-null  object 
 10  LATITUDE          33653 non-null  float64
 11  LONGITUDE         33653 non-null  float64
 12  NEAREST_SCH       33653 non-null  object 
 13  NEAREST_SCH_DIST  33653 non-null  float64
dtypes: float64(4), int64(5), object(5)
memory usage: 3.9+ MB


In [8]:
# Drop every row that has at least one missing value (modifies the frame in place)
cleaned_perth_df.dropna(axis="index", how="any", inplace=True)
cleaned_perth_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30500 entries, 0 to 33654
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ADDRESS           30500 non-null  object 
 1   SUBURB            30500 non-null  object 
 2   PRICE             30500 non-null  int64  
 3   BEDROOMS          30500 non-null  int64  
 4   LAND_AREA         30500 non-null  int64  
 5   BUILD_YEAR        30500 non-null  float64
 6   CBD_DIST          30500 non-null  int64  
 7   NEAREST_STN       30500 non-null  object 
 8   NEAREST_STN_DIST  30500 non-null  int64  
 9   DATE_SOLD         30500 non-null  object 
 10  LATITUDE          30500 non-null  float64
 11  LONGITUDE         30500 non-null  float64
 12  NEAREST_SCH       30500 non-null  object 
 13  NEAREST_SCH_DIST  30500 non-null  float64
dtypes: float64(4), int64(5), object(5)
memory usage: 3.5+ MB


In [9]:
# Removing properties with land areas over 2500 sqm.
# The explicit .copy() makes the result an independent frame: later cells assign
# new columns to it (YEAR_SOLD, BUILD_YEAR), and assigning into a bare .loc slice
# triggers pandas' SettingWithCopyWarning.
cleaned_perth_df = cleaned_perth_df.loc[cleaned_perth_df["LAND_AREA"] <= 2500].copy()
cleaned_perth_df.head(2)

Unnamed: 0,ADDRESS,SUBURB,PRICE,BEDROOMS,LAND_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST
0,"1 Acorn Place, South Lake",South Lake,565000,4,600,2003.0,18300,Cockburn Central Station,1800,09-2018\r,-32.1159,115.84245,LAKELAND SENIOR HIGH SCHOOL,0.828339
1,"1 Addis Way, Wandi",Wandi,365000,3,351,2013.0,26900,Kwinana Station,4900,02-2019\r,-32.19347,115.859553,ATWELL COLLEGE,5.524324


In [10]:
# Convert DATE_SOLD from MM-YYYY to YYYY to keep only the year the property was sold.
# The raw values carry a trailing carriage return (e.g. "09-2018\r" in the preview
# above), so whitespace is stripped explicitly before the text after the last "-"
# is taken and converted to an integer. Everything is done with vectorised
# string accessors instead of a Python-level loop.
cleaned_perth_df["YEAR_SOLD"] = (
    cleaned_perth_df["DATE_SOLD"]
    .str.strip()
    .str.rsplit("-", n=1)
    .str[-1]
    .astype(int)
)
cleaned_perth_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27724 entries, 0 to 33654
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ADDRESS           27724 non-null  object 
 1   SUBURB            27724 non-null  object 
 2   PRICE             27724 non-null  int64  
 3   BEDROOMS          27724 non-null  int64  
 4   LAND_AREA         27724 non-null  int64  
 5   BUILD_YEAR        27724 non-null  float64
 6   CBD_DIST          27724 non-null  int64  
 7   NEAREST_STN       27724 non-null  object 
 8   NEAREST_STN_DIST  27724 non-null  int64  
 9   DATE_SOLD         27724 non-null  object 
 10  LATITUDE          27724 non-null  float64
 11  LONGITUDE         27724 non-null  float64
 12  NEAREST_SCH       27724 non-null  object 
 13  NEAREST_SCH_DIST  27724 non-null  float64
 14  YEAR_SOLD         27724 non-null  int32  
dtypes: float64(4), int32(1), int64(5), object(5)
memory usage: 3.3+ MB


In [11]:
# Cast BUILD_YEAR from float to int; every other column keeps its dtype
cleaned_perth_df = cleaned_perth_df.astype({"BUILD_YEAR": int})
cleaned_perth_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27724 entries, 0 to 33654
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ADDRESS           27724 non-null  object 
 1   SUBURB            27724 non-null  object 
 2   PRICE             27724 non-null  int64  
 3   BEDROOMS          27724 non-null  int64  
 4   LAND_AREA         27724 non-null  int64  
 5   BUILD_YEAR        27724 non-null  int32  
 6   CBD_DIST          27724 non-null  int64  
 7   NEAREST_STN       27724 non-null  object 
 8   NEAREST_STN_DIST  27724 non-null  int64  
 9   DATE_SOLD         27724 non-null  object 
 10  LATITUDE          27724 non-null  float64
 11  LONGITUDE         27724 non-null  float64
 12  NEAREST_SCH       27724 non-null  object 
 13  NEAREST_SCH_DIST  27724 non-null  float64
 14  YEAR_SOLD         27724 non-null  int32  
dtypes: float64(3), int32(2), int64(5), object(5)
memory usage: 3.2+ MB


In [12]:
# Keep only the properties sold in 2005 or later
cleaned_perth_df = cleaned_perth_df[cleaned_perth_df["YEAR_SOLD"].ge(2005)]
cleaned_perth_df.head(n=2)

Unnamed: 0,ADDRESS,SUBURB,PRICE,BEDROOMS,LAND_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,YEAR_SOLD
0,"1 Acorn Place, South Lake",South Lake,565000,4,600,2003,18300,Cockburn Central Station,1800,09-2018\r,-32.1159,115.84245,LAKELAND SENIOR HIGH SCHOOL,0.828339,2018
1,"1 Addis Way, Wandi",Wandi,365000,3,351,2013,26900,Kwinana Station,4900,02-2019\r,-32.19347,115.859553,ATWELL COLLEGE,5.524324,2019


In [13]:
# DATE_SOLD has been replaced by YEAR_SOLD, so the original column is dropped
cleaned_perth_df = cleaned_perth_df.drop("DATE_SOLD", axis=1)
cleaned_perth_df.head(n=2)

Unnamed: 0,ADDRESS,SUBURB,PRICE,BEDROOMS,LAND_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,YEAR_SOLD
0,"1 Acorn Place, South Lake",South Lake,565000,4,600,2003,18300,Cockburn Central Station,1800,-32.1159,115.84245,LAKELAND SENIOR HIGH SCHOOL,0.828339,2018
1,"1 Addis Way, Wandi",Wandi,365000,3,351,2013,26900,Kwinana Station,4900,-32.19347,115.859553,ATWELL COLLEGE,5.524324,2019


In [15]:
# Write the cleaned frame to disk without the index column
cleaned_perth_df.to_csv(path_or_buf="csv/cleaned_perth_market.csv", index=False)

# Correlation Price vs. Distance to School and Station

In [16]:
# Properties whose nearest station is Midland Station, and their price per unit of land area
midland = cleaned_perth_df[cleaned_perth_df["NEAREST_STN"].eq("Midland Station")]
midland_price = midland["PRICE"].div(midland["LAND_AREA"])
midland_price

3         391.705069
16        527.903469
17       1003.521127
18        423.435419
34        295.857988
            ...     
33572     567.375887
33582     244.000000
33584     568.011958
33635     325.765054
33649    1167.500000
Length: 2564, dtype: float64

In [19]:
# Midland only: price per unit of land area vs. distance to the nearest station
stats.pearsonr(x=midland_price, y=midland["NEAREST_STN_DIST"])

(-0.3081125357719129, 1.6409989455636632e-57)

In [20]:
# All properties: sale price vs. distance to the nearest station
stats.pearsonr(
    x=cleaned_perth_df["PRICE"],
    y=cleaned_perth_df["NEAREST_STN_DIST"],
)

(-0.1537907755976679, 1.2079031231386797e-145)

In [21]:
# All properties: sale price vs. distance to the nearest school
stats.pearsonr(
    x=cleaned_perth_df["PRICE"],
    y=cleaned_perth_df["NEAREST_SCH_DIST"],
)

(-0.08047208213414893, 7.041722378539672e-41)

In [22]:
# Price vs. CBD distance (In general)
stats.pearsonr(cleaned_perth_df["PRICE"],cleaned_perth_df["CBD_DIST"])

(-0.4182190881155497, 0.0)

# Connect to SQL

In [23]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

In [24]:
from config import password

In [25]:
# Create the Database Engine - local server, the connection string will be as follows.
# The password is percent-encoded so that reserved URL characters in it
# (e.g. "@", "/", ":") cannot corrupt the connection URL; a password without such
# characters is passed through unchanged.
connection_string = f"postgres:{quote_plus(password)}@127.0.0.1:5432/ProjectTwo"

# Create the database engine (to the PostgreSQL database)
engine = create_engine(f'postgresql://{connection_string}')
conn = engine.connect()

In [26]:
# Query all records of the `distance_analysis` table and print them one per line
data = engine.execute("SELECT * FROM distance_analysis")

for record in data:
    print(record)

('1 Acorn Place, South Lake', 18300, 'Cockburn Central Station', 1800, 'LAKELAND SENIOR HIGH SCHOOL', 0.8283385518162787, 600, 565000)
('1 Addis Way, Wandi', 26900, 'Kwinana Station', 4900, 'ATWELL COLLEGE', 5.524324354371822, 351, 365000)
('1 Ainsley Court, Camillo', 22600, 'Challis Station', 1900, 'KELMSCOTT SENIOR HIGH SCHOOL', 1.6491781833669859, 719, 287000)
('1 Albert Street, Bellevue', 17900, 'Midland Station', 3600, 'SWAN VIEW SENIOR HIGH SCHOOL', 1.5714009366124688, 651, 255000)
('1 Aman Place, Lockridge', 11200, 'Bassendean Station', 2000, 'KIARA COLLEGE', 1.5149216294528034, 466, 325000)
('1 Amethyst Crescent, Mount Richon', 27300, 'Armadale Station', 1000, 'ARMADALE SENIOR HIGH SCHOOL', 1.2272191943992041, 759, 409000)
('1 Ardara Lane, Hilbert', 28200, 'Armadale Station', 3700, 'DALE CHRISTIAN SCHOOL', 2.485730857063224, 386, 400000)
('1 Arnside Bend, Waikiki', 41700, 'Warnbro Station', 1100, 'SOUTH COAST BAPTIST COLLEGE', 0.4915731383363828, 468, 370000)
('1 Arrochar Court

In [27]:
# Reflect an existing database into a new automap model
Base = automap_base()

# Reflect the tables reachable through `engine` into mapped classes
# NOTE(review): the `reflect=True` form is deprecated from SQLAlchemy 1.4 in favour of
# `Base.prepare(autoload_with=engine)` — confirm the installed version before switching
Base.prepare(engine,reflect=True)

In [28]:
# List the names of all classes that automap generated
list(Base.classes.keys())

['perth_market',
 'house_age_analysis',
 'distance_analysis',
 'map_analysis',
 'number_of_bedrooms_analysis',
 'suburb_analysis']

In [35]:
# Save a reference to the mapped class of the `suburb_analysis` table
suburb = Base.classes.suburb_analysis


In [36]:
# Open an ORM session bound to the database engine
session = Session(engine)

In [37]:
# Inspect the attributes of the first mapped row of the suburb table
vars(session.query(suburb).first())

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x2689e5e3550>,
 'address': '1 Acorn Place, South Lake',
 'land_area': 600,
 'suburb': 'South Lake',
 'price': 565000}

In [38]:
# Fetch the (address, suburb) pair of every row in the suburb table.
# Despite its name, `suburb_first_row` holds all rows returned by .all(), not just the first.
suburb_first_row = session.query(suburb.address, suburb.suburb).all()
suburb_first_row

[('1 Acorn Place, South Lake', 'South Lake'),
 ('1 Addis Way, Wandi', 'Wandi'),
 ('1 Ainsley Court, Camillo', 'Camillo'),
 ('1 Albert Street, Bellevue', 'Bellevue'),
 ('1 Aman Place, Lockridge', 'Lockridge'),
 ('1 Amethyst Crescent, Mount Richon', 'Mount Richon'),
 ('1 Ardara Lane, Hilbert', 'Hilbert'),
 ('1 Arnside Bend, Waikiki', 'Waikiki'),
 ('1 Arrochar Court, Hamersley', 'Hamersley'),
 ('1 Arundel Street, Bayswater', 'Bayswater'),
 ('1 Ashcott Gate, Butler', 'Butler'),
 ('1 Ashcroft Way, Balga', 'Balga'),
 ('1 Ashendon Boulevard, Hammond Park', 'Hammond Park'),
 ('1 Ashtree Boulevard, Wattle Grove', 'Wattle Grove'),
 ('1 August Court, Bull Creek', 'Bull Creek'),
 ('1 Avonlea Place, Bullsbrook', 'Bullsbrook'),
 ('1 Babbler Court, Maida Vale', 'Maida Vale'),
 ('1 Balga Place, Koongamia', 'Koongamia'),
 ('1 Bamlett Street, Mount Nasura', 'Mount Nasura'),
 ('1 Banken Court, Forrestdale', 'Forrestdale'),
 ('1 Barcroft Court, Atwell', 'Atwell'),
 ('1 Bates Loop, Lockridge', 'Lockridge')

In [40]:
# Turn each (address, suburb) row into a dictionary.
# Rows are unpacked directly instead of being flattened through np.ravel, so the
# dictionary values stay plain Python strings rather than NumPy string scalars.
suburb_list = [
    {"Address": address, "Suburb": suburb_name}
    for address, suburb_name in suburb_first_row
]
suburb_list

[{'Address': '1 Acorn Place, South Lake', 'Suburb': 'South Lake'},
 {'Address': '1 Addis Way, Wandi', 'Suburb': 'Wandi'},
 {'Address': '1 Ainsley Court, Camillo', 'Suburb': 'Camillo'},
 {'Address': '1 Albert Street, Bellevue', 'Suburb': 'Bellevue'},
 {'Address': '1 Aman Place, Lockridge', 'Suburb': 'Lockridge'},
 {'Address': '1 Amethyst Crescent, Mount Richon', 'Suburb': 'Mount Richon'},
 {'Address': '1 Ardara Lane, Hilbert', 'Suburb': 'Hilbert'},
 {'Address': '1 Arnside Bend, Waikiki', 'Suburb': 'Waikiki'},
 {'Address': '1 Arrochar Court, Hamersley', 'Suburb': 'Hamersley'},
 {'Address': '1 Arundel Street, Bayswater', 'Suburb': 'Bayswater'},
 {'Address': '1 Ashcott Gate, Butler', 'Suburb': 'Butler'},
 {'Address': '1 Ashcroft Way, Balga', 'Suburb': 'Balga'},
 {'Address': '1 Ashendon Boulevard, Hammond Park', 'Suburb': 'Hammond Park'},
 {'Address': '1 Ashtree Boulevard, Wattle Grove', 'Suburb': 'Wattle Grove'},
 {'Address': '1 August Court, Bull Creek', 'Suburb': 'Bull Creek'},
 {'Addres

# Web Scraping

In [48]:
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager

In [49]:
# Set up splinter: ChromeDriverManager().install() provides the chromedriver path
# (the log below shows it being downloaded and cached), and the Chrome browser
# is started with a visible window (headless=False)
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Get LATEST driver version for 94.0.4606
Trying to download new driver from https://chromedriver.storage.googleapis.com/94.0.4606.61/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\Nassim\.wdm\drivers\chromedriver\win32\94.0.4606.61]


In [50]:
# Open page 2 of the sold listings for Perth WA 6000, filtered by the query
# parameters price=1900000-any and excludepricewithheld=1
url = 'https://www.domain.com.au/sold-listings/perth-wa-6000/?price=1900000-any&excludepricewithheld=1&page=2'
browser.visit(url)

In [51]:
# Scrape the sold prices from the page currently shown in the browser, then
# move on to the next results page.
for x in range(1, 2):

    # Parse the HTML that the browser is currently displaying
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Every price is rendered in a <span class="property-price"> element
    quotes = soup.find_all('span', class_='property-price')

    for quote in quotes:
        print('page:', x, '-------------')
        print(quote.text)

    # Follow the link to the next results page. Previously the link was looked up
    # but never clicked, so the browser stayed on the same page. If the link is
    # absent there is no further page to visit, so stop.
    next_page_links = browser.links.find_by_href("/sold-listings/perth-wa-6000/?price=1900000-any&excludepricewithheld=1&page=3")
    if next_page_links.is_empty():
        break
    next_page_links.first.click()