In [164]:
import requests
import pandas as pd
from tqdm.auto import tqdm
import re
import pickle
from bs4 import BeautifulSoup

In [4]:
df = pd.read_excel("data/SA2_2021_AUST.xlsx")
df.head()

Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,CHANGE_FLAG_2021,CHANGE_LABEL_2021,SA3_CODE_2021,SA3_NAME_2021,SA4_CODE_2021,SA4_NAME_2021,GCCSA_CODE_2021,GCCSA_NAME_2021,STATE_CODE_2021,STATE_NAME_2021,AUS_CODE_2021,AUS_NAME_2021,AREA_ALBERS_SQKM,ASGS_LOCI_URI_2021
0,101021007,Braidwood,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,3418.3525,http://linked.data.gov.au/dataset/asgsed3/SA2/...
1,101021008,Karabar,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,6.9825,http://linked.data.gov.au/dataset/asgsed3/SA2/...
2,101021009,Queanbeyan,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,4.762,http://linked.data.gov.au/dataset/asgsed3/SA2/...
3,101021010,Queanbeyan - East,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.0032,http://linked.data.gov.au/dataset/asgsed3/SA2/...
4,101021012,Queanbeyan West - Jerrabomberra,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.6748,http://linked.data.gov.au/dataset/asgsed3/SA2/...


- SA2_NAME_2021: suburb name
- SA3_NAME_2021: council name
- SA4_NAME_2021: city name
- GCCSA_NAME_2021: region name
- STATE_NAME_2021: state name

# 1. Clean list of suburbs

Things to consider:

| Consideration                              | Raw example           | Clean example                 |
|--------------------------------------------|-----------------------|-------------------------------|
| Split names with '-' into separate suburbs | Emerald - Cockatoo    | [..., Emerald, Cockatoo, ...] |
| Remove any brackets                        | Blacktown (East)      | [..., Blacktown, ...]         |
| Remove cardinal directions                 | Pakenham - North East | [..., Pakenham, ...]          |
| Apply lowercase for website request        | Oakleigh              | [..., oakleigh, ...]          |
| Replace spaces in names with '+'           | Koo Wee Rup           | [..., koo+wee+rup, ...]       |

In [3]:
df[["SA2_NAME_2021", "SA3_NAME_2021", "SA4_NAME_2021", "GCCSA_NAME_2021", "STATE_NAME_2021"]]

Unnamed: 0,SA2_NAME_2021,SA3_NAME_2021,SA4_NAME_2021,GCCSA_NAME_2021,STATE_NAME_2021
0,Braidwood,Queanbeyan,Capital Region,Rest of NSW,New South Wales
1,Karabar,Queanbeyan,Capital Region,Rest of NSW,New South Wales
2,Queanbeyan,Queanbeyan,Capital Region,Rest of NSW,New South Wales
3,Queanbeyan - East,Queanbeyan,Capital Region,Rest of NSW,New South Wales
4,Queanbeyan West - Jerrabomberra,Queanbeyan,Capital Region,Rest of NSW,New South Wales
...,...,...,...,...,...
2468,Jervis Bay,Jervis Bay,Other Territories,Other Territories,Other Territories
2469,Norfolk Island,Norfolk Island,Other Territories,Other Territories,Other Territories
2470,Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Other Territories
2471,No usual address (OT),No usual address (OT),No usual address (OT),No usual address (OT),Other Territories


In [4]:
cardinal_directions = [
    "Central",
    "North",
    "North East",
    "East",
    "South East",
    "South",
    "South West",
    "West",
    "North West"
]

suburbs = []

# filter suburbs
for x in df["SA2_NAME_2021"]:
    # if parentheses exist - delete parantheses and whatever is in them 
    x_ = re.sub(r"\([^()]*\)", "", x)
    
    # then split by "-"
    subs = x_.split(" - ")
    
    for sub in subs:
        suburbs.append(sub)

print(len(suburbs))
# remove cardinal names from list of suburbs
suburbs = [x for x in suburbs if x not in cardinal_directions]
print(len(suburbs))

3643
3332


In [5]:
# list of suburbs and possible cardinal combinations
suburbs_dir = suburbs + [f"{direction} {sub}" for sub in tqdm(suburbs) for direction in cardinal_directions]

# convert all to lower case and replace spaces with "+"
suburbs_dir = [x.lower().replace(" ", "+") for x in suburbs_dir]

  0%|          | 0/3332 [00:00<?, ?it/s]

# 2. Scrape data

## 2.1 Scrape LLM friendly data

In [7]:
def scrape_jina_ai(suburb):
    response = requests.get(
        "https://r.jina.ai/" + f"https://www.realestateinvestar.com.au/property/{suburb}",
        headers=headers
    )
    return response.text

In [8]:
suburb_scrape = {}

for suburb in tqdm(suburbs_dir):
    suburb_scrape[suburb] = scrape_jina_ai(suburb)        

  0%|          | 0/33320 [00:00<?, ?it/s]

ConnectionError: ('Connection aborted.', ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None))

## 2.2 Scrape html data

In [7]:
def scrape_tables(soup):
    # get all tables from soup
    tables = soup.find_all('table')
    
    # set table names
    table_names = ["median", "rental", "sales"]
    
    # instantiate empty dict
    dfs = {}
    
    # iterate over the three tables and populate
    for i, table in enumerate(tables):
        # Extract headers (th elements)
        headers = [header.get_text(strip=True) for header in table.find_all('td', class_='datatitle')]

        # Extract rows
        rows = []
        for row in table.find_all('tr')[1:]:  # Skip the header row
            cells = row.find_all('td')
            row_data = [cell.get_text(strip=True) for cell in cells]
            rows.append(row_data)

        # Create a DataFrame
        df = pd.DataFrame(rows, columns=['Metric'] + headers)
        dfs[table_names[i]] = df
    return dfs

def scrape_other_data(soup):
    other_data = {}

    paras = soup.find_all('p')

    vac_rates = []
    rental_stocks = []
    pops = []
    rental_pops = []

    for i, p in enumerate(paras):
        # get region
        if "  Suburb" in p.get_text():
            suburb_name = p.get_text().split("\n")[2]
            other_data["suburb"] = {"name": suburb_name}

        # get region
        if "  Region" in p.get_text():
            region_name = p.get_text().split("\n")[2]
            other_data["region"] = {"name": region_name}

        # get state
        if "  State" in p.get_text():
            #print(p.get_text())
            state_name = p.get_text().split("\n")[2]
            other_data["state"] = {"name": state_name}

        # get vac rate
        if "Current vacancy rate" in p.get_text():
            vac_rates.append(paras[i-1].get_text())

        # get rental stock
        if "Rental stock available" in p.get_text():
            rental_stocks.append(paras[i-1].get_text())

        # get pop
        if "Population" in p.get_text():
            pops.append(paras[i-1].get_text())

        # get rental pop
        if "Rental population" in p.get_text():
            rental_pops.append(paras[i-1].get_text())

    if suburb == "act":
        sas = ["state"]
    else:
        sas = ["suburb", "region", "state"]
    for (sa, vr, rs, p, rp) in zip(sas, vac_rates, rental_stocks, pops, rental_pops):
        other_data[sa]["vacancy_rate"] = vr
        other_data[sa]["rental_stock"] = rs
        other_data[sa]["population"] = p
        other_data[sa]["rental_pop"] = rp
        
    return other_data

In [8]:
# scrape other data
suburb_tables = {}
suburb_other_data = {}

for suburb in tqdm(valid_suburbs):
    # get html
    page = requests.get(f'https://www.realestateinvestar.com.au/property/{suburb}')
    
    # save soup
    soup = BeautifulSoup(page.text, 'html.parser')
    
    suburb_tables[suburb] = scrape_tables(soup)
    suburb_other_data[suburb] = pd.DataFrame(scrape_other_data(soup))

  0%|          | 0/2774 [00:00<?, ?it/s]

KeyboardInterrupt: 

# 3. Clean scraped data

In [121]:
from functools import reduce
import numpy as np
from datetime import datetime
import os

In [128]:
# create dir path
now = datetime.today()

current_year = str(now.year)
current_month = str(now.month)

if len(current_month) < 2:
    current_month = "0"+current_month

data_dir = "data_inv"
date_dir = current_year+current_month

data_date_dir = os.path.join(data_dir, date_dir)
if not os.path.exists(data_date_dir):
    os.makedirs(data_date_dir)

In [174]:
with open("test_tables.pickle", "rb") as f:
    tables = pickle.load(f)
    
with open("test_data.pickle", "rb") as f:
    data = pickle.load(f)
    
data_ = []
for x in data.values():
    if len(x.columns) == 3:
        x.loc["state"] = [x.loc["name", "state"]]*3
        x.loc["region"] = [x.loc["name", "region"]]*3
        data_.append(x)

In [180]:
data_suburbs = [x.iloc[:, 0] for x in data_]
data_regions = [x.iloc[:, 1] for x in data_]
data_states = [x.iloc[:, 2] for x in data_]

In [181]:
def combine_clean_data(data):
    df_data = pd.concat(data, axis=1)
    df_data.columns = df_data.iloc[0]
    df_data = df_data.iloc[1:,:]
    df_data = df_data.T
    
    if data[0].name == "region":
        df_data.drop(columns=["region"], inplace=True)
    elif data[0].name == "state":
        df_data.drop(columns=["region", "state"], inplace=True)

    df_data["vacancy_rate"] = df_data["vacancy_rate"].apply(lambda x: x.replace("%", ""))
    df_data["rental_stock"] = df_data["rental_stock"].apply(lambda x: x.replace(",", ""))
    df_data["population"] = df_data["population"].apply(lambda x: x.replace(",", ""))
    df_data["rental_pop"] = df_data["rental_pop"].apply(lambda x: x.replace("%", ""))
    df_data.iloc[:, :4] = df_data.iloc[:, :4].replace("NA", np.nan).apply(pd.to_numeric)
    
    return df_data.drop_duplicates()

In [183]:
df_suburbs = combine_clean_data(data_suburbs)
df_regions = combine_clean_data(data_regions)
df_states = combine_clean_data(data_states)

In [184]:
df_suburbs.to_csv(os.path.join(data_date_dir, "df_data_suburbs.csv"))
df_regions.to_csv(os.path.join(data_date_dir, "df_data_regions.csv"))
df_states.to_csv(os.path.join(data_date_dir, "df_data_states.csv"))

# Create house, townhouse, units df

In [150]:
# get list of suburbs - consistent with our suburbs df
suburbs = [suburb.title().replace("+", " ") for suburb in valid_suburbs]# list(df_suburbs.index)

In [153]:
# get data columns we want to record for each suburb
cols = list(tables["braidwood"]["median"]["Metric"].values) \
+ list(tables["braidwood"]["rental"]["Metric"].values) \
+ list(tables["braidwood"]["sales"]["Metric"].values)

In [154]:
pd.concat([
    tables["braidwood"]["median"], 
    tables["braidwood"]["rental"], 
    tables["braidwood"]["sales"]
])

Unnamed: 0,Metric,House,Townhouses,Units
0,Median listing price,"$789,000",,
1,Median price change - last quarter,-1.26%,,
2,Median price change - 1 year,-2.84%,,
3,Median price change - 2 years,13.19%,,
0,Median weekly rent,$520,,
1,Median yield %,3.42%,,
2,Median rent change - 1 year,6.12%,,
0,Previous month sales,5,,0
1,Stock on market previous month,37,,2
2,Stock variance vs. last year,37.04%,,-33.33%


In [193]:
len(df_suburbs.index)

2761

In [196]:
[x.lower().replace(" ", "+") for x in suburbs]

['braidwood',
 'karabar',
 'queanbeyan',
 'queanbeyan+west',
 'jerrabomberra',
 'googong',
 'bombala',
 'cooma',
 'jindabyne',
 'berridale',
 'batemans+bay',
 'bega',
 'tathra',
 'broulee',
 'tomakin',
 'deua',
 'wadbilliga',
 'eden',
 'merimbula',
 'tura+beach',
 'moruya',
 'tuross+head',
 'narooma',
 'bermagui',
 'goulburn',
 'yass',
 'young',
 'avoca+beach',
 'copacabana',
 'box+head',
 'macmasters+beach',
 'calga',
 'kulnura',
 'erina',
 'gosford',
 'springfield',
 'kariong',
 'kincumber',
 'picketts+valley',
 'narara',
 'niagara+park',
 'lisarow',
 'point+clare',
 'koolewong',
 'saratoga',
 'davistown',
 'terrigal',
 'north+avoca',
 'booker+bay',
 'patonga',
 'wamberal',
 'forresters+beach',
 'woy+woy',
 'blackwall',
 'wyoming',
 'bateau+bay',
 'killarney+vale',
 'blue+haven',
 'san+remo',
 'budgewoi',
 'buff+point',
 'halekulani',
 'chittaway+bay',
 'tumbi+umbi',
 'gorokan',
 'kanwal',
 'charmhaven',
 'jilliby',
 'yarramalong',
 'lake+munmorah',
 'mannering+park',
 'ourimbah',
 '

In [155]:
cols[1] = "Median price change - last quarter (%)"
cols[2] = "Median price change - 1 year (%)"
cols[3] = "Median price change - 2 years (%)"
cols[6] = "Median rent change - 1 year (%)"
cols[-2] = "Stock variance vs. last year (%)"

In [156]:
# init empty df
df_houses = pd.DataFrame(index=suburbs, columns=cols)
df_town_houses = pd.DataFrame(index=suburbs, columns=cols)
df_units = pd.DataFrame(index=suburbs, columns=cols)

for suburb in tqdm(suburbs):
    suburb_ = suburb.lower().replace(" ", "+")
    df_sub_table = pd.concat([
        tables[suburb_]["median"], 
        tables[suburb_]["rental"], 
        tables[suburb_]["sales"]
    ])
    
    df_houses.loc[suburb, :] = df_sub_table["House"].values
    df_town_houses.loc[suburb, :] = df_sub_table["Townhouses"].values
    df_units.loc[suburb, :] = df_sub_table["Units"].values

  0%|          | 0/2774 [00:00<?, ?it/s]

In [182]:
data_date_dir

'data_inv\\202504'

In [157]:
def df_to_numeric(df):
    df = df.dropna(axis=1)
    df = df.map(lambda x: x.replace("$", "").replace(",", "").replace("%", ""))
    return df.map(pd.to_numeric, errors="coerce")

In [158]:
df_houses = df_to_numeric(df_houses)
df_town_houses = df_to_numeric(df_town_houses)
df_units = df_to_numeric(df_units)

In [159]:
df_houses.index.name = "Suburb"
df_town_houses.index.name = "Suburb"
df_units.index.name = "Suburb"

In [160]:
df_houses.to_csv(os.path.join(data_date_dir, "df_tables_houses.csv"))
df_town_houses.to_csv(os.path.join(data_date_dir, "df_tables_town_houses.csv"))
df_units.to_csv(os.path.join(data_date_dir, "df_tables_units.csv"))