In [1]:
import requests
import pandas as pd
from tqdm.auto import tqdm
import re
import pickle
from bs4 import BeautifulSoup

In [2]:
df = pd.read_excel("data/SA2_2021_AUST.xlsx")
df.head()

Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,CHANGE_FLAG_2021,CHANGE_LABEL_2021,SA3_CODE_2021,SA3_NAME_2021,SA4_CODE_2021,SA4_NAME_2021,GCCSA_CODE_2021,GCCSA_NAME_2021,STATE_CODE_2021,STATE_NAME_2021,AUS_CODE_2021,AUS_NAME_2021,AREA_ALBERS_SQKM,ASGS_LOCI_URI_2021
0,101021007,Braidwood,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,3418.3525,http://linked.data.gov.au/dataset/asgsed3/SA2/...
1,101021008,Karabar,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,6.9825,http://linked.data.gov.au/dataset/asgsed3/SA2/...
2,101021009,Queanbeyan,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,4.762,http://linked.data.gov.au/dataset/asgsed3/SA2/...
3,101021010,Queanbeyan - East,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.0032,http://linked.data.gov.au/dataset/asgsed3/SA2/...
4,101021012,Queanbeyan West - Jerrabomberra,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.6748,http://linked.data.gov.au/dataset/asgsed3/SA2/...


- SA2_NAME_2021: suburb name
- SA3_NAME_2021: council name
- SA4_NAME_2021: city name
- GCCSA_NAME_2021: region name
- STATE_NAME_2021: state name

# 1. Clean list of suburbs

Things to consider:

| Consideration                              | Raw example           | Clean example                 |
|--------------------------------------------|-----------------------|-------------------------------|
| Split names with '-' into separate suburbs | Emerald - Cockatoo    | [..., Emerald, Cockatoo, ...] |
| Remove any brackets                        | Blacktown (East)      | [..., Blacktown, ...]         |
| Remove cardinal directions                 | Pakenham - North East | [..., Pakenham, ...]          |
| Apply lowercase for website request        | Oakleigh              | [..., oakleigh, ...]          |
| Replace spaces in names with '+'           | Koo Wee Rup           | [..., koo+wee+rup, ...]       |

In [3]:
df[["SA2_NAME_2021", "SA3_NAME_2021", "SA4_NAME_2021", "GCCSA_NAME_2021", "STATE_NAME_2021"]]

Unnamed: 0,SA2_NAME_2021,SA3_NAME_2021,SA4_NAME_2021,GCCSA_NAME_2021,STATE_NAME_2021
0,Braidwood,Queanbeyan,Capital Region,Rest of NSW,New South Wales
1,Karabar,Queanbeyan,Capital Region,Rest of NSW,New South Wales
2,Queanbeyan,Queanbeyan,Capital Region,Rest of NSW,New South Wales
3,Queanbeyan - East,Queanbeyan,Capital Region,Rest of NSW,New South Wales
4,Queanbeyan West - Jerrabomberra,Queanbeyan,Capital Region,Rest of NSW,New South Wales
...,...,...,...,...,...
2468,Jervis Bay,Jervis Bay,Other Territories,Other Territories,Other Territories
2469,Norfolk Island,Norfolk Island,Other Territories,Other Territories,Other Territories
2470,Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Other Territories
2471,No usual address (OT),No usual address (OT),No usual address (OT),No usual address (OT),Other Territories


In [4]:
cardinal_directions = [
    "Central",
    "North",
    "North East",
    "East",
    "South East",
    "South",
    "South West",
    "West",
    "North West"
]

suburbs = []

# filter suburbs
for x in df["SA2_NAME_2021"]:
    # if parentheses exist - delete parantheses and whatever is in them 
    x_ = re.sub(r"\([^()]*\)", "", x)
    
    # then split by "-"
    subs = x_.split(" - ")
    
    for sub in subs:
        suburbs.append(sub)

print(len(suburbs))
# remove cardinal names from list of suburbs
suburbs = [x for x in suburbs if x not in cardinal_directions]
print(len(suburbs))

3643
3332


In [5]:
# list of suburbs and possible cardinal combinations
suburbs_dir = suburbs + [f"{direction} {sub}" for sub in tqdm(suburbs) for direction in cardinal_directions]

# convert all to lower case and replace spaces with "+"
suburbs_dir = [x.lower().replace(" ", "+") for x in suburbs_dir]

  0%|          | 0/3332 [00:00<?, ?it/s]

# 2. Scrape data

## 2.1 Scrape LLM friendly data

In [7]:
def scrape_jina_ai(suburb):
    response = requests.get(
        "https://r.jina.ai/" + f"https://www.realestateinvestar.com.au/property/{suburb}",
        headers=headers
    )
    return response.text

In [8]:
suburb_scrape = {}

for suburb in tqdm(suburbs_dir):
    suburb_scrape[suburb] = scrape_jina_ai(suburb)        

  0%|          | 0/33320 [00:00<?, ?it/s]

ConnectionError: ('Connection aborted.', ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None))

In [9]:
# with open("investar_suburbs1.pickle", "wb") as f:
#     pickle.dump(suburb_scrape, f)

## 2.2 Scrape html data

In [135]:
def scrape_tables(soup):
    # get all tables from soup
    tables = soup.find_all('table')
    
    # set table names
    table_names = ["median", "rental", "sales"]
    
    # instantiate empty dict
    dfs = {}
    
    # iterate over the three tables and populate
    for i, table in enumerate(tables):
        # Extract headers (th elements)
        headers = [header.get_text(strip=True) for header in table.find_all('td', class_='datatitle')]

        # Extract rows
        rows = []
        for row in table.find_all('tr')[1:]:  # Skip the header row
            cells = row.find_all('td')
            row_data = [cell.get_text(strip=True) for cell in cells]
            rows.append(row_data)

        # Create a DataFrame
        df = pd.DataFrame(rows, columns=['Metric'] + headers)
        dfs[table_names[i]] = df
    return dfs

def scrape_other_data(soup):
    other_data = {}

    paras = soup.find_all('p')

    vac_rates = []
    rental_stocks = []
    pops = []
    rental_pops = []

    for i, p in enumerate(paras):
        # get region
        if "  Suburb" in p.get_text():
            suburb_name = p.get_text().split("\n")[2]
            other_data["suburb"] = {"name": suburb_name}

        # get region
        if "  Region" in p.get_text():
            region_name = p.get_text().split("\n")[2]
            other_data["region"] = {"name": region_name}

        # get state
        if "  State" in p.get_text():
            #print(p.get_text())
            state_name = p.get_text().split("\n")[2]
            other_data["state"] = {"name": state_name}

        # get vac rate
        if "Current vacancy rate" in p.get_text():
            vac_rates.append(paras[i-1].get_text())

        # get rental stock
        if "Rental stock available" in p.get_text():
            rental_stocks.append(paras[i-1].get_text())

        # get pop
        if "Population" in p.get_text():
            pops.append(paras[i-1].get_text())

        # get rental pop
        if "Rental population" in p.get_text():
            rental_pops.append(paras[i-1].get_text())

    if suburb == "act":
        sas = ["state"]
    else:
        sas = ["suburb", "region", "state"]
    for (sa, vr, rs, p, rp) in zip(sas, vac_rates, rental_stocks, pops, rental_pops):
        other_data[sa]["vacancy_rate"] = vr
        other_data[sa]["rental_stock"] = rs
        other_data[sa]["population"] = p
        other_data[sa]["rental_pop"] = rp
        
    return other_data

In [18]:
suburb_scrape = {}

# scrape tabular data
for suburb in tqdm(suburbs_dir):
    # get html
    page = requests.get(f'https://www.realestateinvestar.com.au/property/{suburb}')
    
    if page.status_code == 200:
        # convert to soup
        soup = BeautifulSoup(page.text, 'html.parser')
        
        suburb_scrape[suburb] = scrape_tables(soup)
        
    else:
        pass
    
with open("investar_tables.pickle", "wb") as f:
    pickle.dump(suburb_scrape, f)

  0%|          | 0/33320 [00:00<?, ?it/s]

In [32]:
valid_suburbs = list(suburb_scrape.keys())

In [78]:
# scrape other data
suburb_soups = {}

for suburb in tqdm(valid_suburbs):
    # get html
    page = requests.get(f'https://www.realestateinvestar.com.au/property/{suburb}')
    
    # save soup
    soup = BeautifulSoup(page.text, 'html.parser')
    suburb_soups[suburb] = soup
    
# with open("investar_soups.pickle", "wb") as f:
#     pickle.dump(suburb_soups, f)

  0%|          | 0/2774 [00:00<?, ?it/s]

RecursionError: maximum recursion depth exceeded while getting the str of an object

In [138]:
suburb_other_data = {}
for suburb, soup in tqdm(suburb_soups.items()):
    suburb_other_data[suburb] = pd.DataFrame(scrape_other_data(soup))

  0%|          | 0/2774 [00:00<?, ?it/s]

In [139]:
with open("other_data.pickle", "wb") as f:
    pickle.dump(suburb_other_data, f)

In [31]:
# # save list of valid suburbs
# with open("data/suburbs.pickle", "wb") as f:
#     pickle.dump(list(suburb_scrape.keys()), f)

# 3. Clean scraped data

In [2]:
from functools import reduce
import numpy as np

In [3]:
with open("tables.pickle", "rb") as f:
    tables = pickle.load(f)
    
with open("other_data.pickle", "rb") as f:
    data = pickle.load(f)

In [6]:
data_suburbs = [value["suburb"] for value in tqdm(list(data.values())) if len(value.columns) == 3]
data_regions = [value["region"] for value in tqdm(list(data.values())) if len(value.columns) == 3]
data_states = [value["state"] for value in tqdm(list(data.values())) if len(value.columns) == 3]

  0%|          | 0/2774 [00:00<?, ?it/s]

  0%|          | 0/2774 [00:00<?, ?it/s]

  0%|          | 0/2774 [00:00<?, ?it/s]

In [8]:
data_ = []
for x in data.values():
    if len(x.columns) == 3:
        x.loc["state"] = [x.loc["name", "state"]]*3
        x.loc["region"] = [x.loc["name", "region"]]*3
    data_.append(x)

In [9]:
data_[-1]

Unnamed: 0,suburb,region,state
name,East Chapman,Greater Geraldton,Western Australia
vacancy_rate,,0.47%,0.59%
rental_stock,0,88,7161
population,41,38633,2474414
rental_pop,100.00%,25.55%,23.98%
state,Western Australia,Western Australia,Western Australia
region,Greater Geraldton,Greater Geraldton,Greater Geraldton


In [10]:
def combine_clean_data(data):
    df_data = pd.concat(data, axis=1)
    df_data.columns = df_data.iloc[0]
    df_data = df_data.iloc[1:,:]
    df_data = df_data.T
    
    if data[0].name == "region":
        df_data.drop(columns=["region"], inplace=True)
    elif data[0].name == "state":
        df_data.drop(columns=["region", "state"], inplace=True)

    df_data["vacancy_rate"] = df_data["vacancy_rate"].apply(lambda x: x.replace("%", ""))
    df_data["rental_stock"] = df_data["rental_stock"].apply(lambda x: x.replace(",", ""))
    df_data["population"] = df_data["population"].apply(lambda x: x.replace(",", ""))
    df_data["rental_pop"] = df_data["rental_pop"].apply(lambda x: x.replace("%", ""))
    df_data.iloc[:, :4] = df_data.iloc[:, :4].replace("NA", np.nan).apply(pd.to_numeric)
    
    return df_data.drop_duplicates()

In [11]:
df_suburbs = combine_clean_data(data_suburbs)
df_regions = combine_clean_data(data_regions)
df_states = combine_clean_data(data_states)

KeyError: "['region'] not found in axis"

In [115]:
# df_suburbs.to_csv("df_data_suburbs.csv")
# df_regions.to_csv("df_data_regions.csv")
# df_states.to_csv("df_data_states.csv")

# Create house, townhouse, units df

In [20]:
tables["braidwood"]

{'median':                                Metric     House Townhouses Units
 0                Median listing price  $789,000         NA    NA
 1  Median price change - last quarter    -1.26%         NA    NA
 2        Median price change - 1 year    -1.26%         NA    NA
 3       Median price change - 2 years    13.52%         NA    NA,
 'rental':                         Metric  House Townhouses Units
 0           Median weekly rent   $520         NA    NA
 1               Median yield %  3.42%         NA    NA
 2  Median rent change - 1 year  4.00%         NA    NA,
 'sales':                            Metric   House    Units
 0            Previous month sales       1        0
 1  Stock on market previous month      34        1
 2    Stock variance vs. last year  30.77%  -50.00%
 3          Average days on market     169       NA}

In [40]:
tables["braidwood"]["median"]

Unnamed: 0,Metric,House,Townhouses,Units
0,Median listing price,"$789,000",,
1,Median price change - last quarter,-1.26%,,
2,Median price change - 1 year,-1.26%,,
3,Median price change - 2 years,13.52%,,


In [22]:
# get list of suburbs - consistent with our suburbs df
suburbs = list(df_suburbs.index)

In [37]:
# get data columns we want to record for each suburb
cols = list(tables["braidwood"]["median"]["Metric"].values) \
+ list(tables["braidwood"]["rental"]["Metric"].values) \
+ list(tables["braidwood"]["sales"]["Metric"].values)

In [45]:
pd.concat([
    tables["braidwood"]["median"], 
    tables["braidwood"]["rental"], 
    tables["braidwood"]["sales"]
])

Unnamed: 0,Metric,House,Townhouses,Units
0,Median listing price,"$789,000",,
1,Median price change - last quarter,-1.26%,,
2,Median price change - 1 year,-1.26%,,
3,Median price change - 2 years,13.52%,,
0,Median weekly rent,$520,,
1,Median yield %,3.42%,,
2,Median rent change - 1 year,4.00%,,
0,Previous month sales,1,,0
1,Stock on market previous month,34,,1
2,Stock variance vs. last year,30.77%,,-50.00%


In [94]:
cols[1] = "Median price change - last quarter (%)"
cols[2] = "Median price change - 1 year (%)"
cols[3] = "Median price change - 2 years (%)"
cols[6] = "Median rent change - 1 year (%)"
cols[-2] = "Stock variance vs. last year (%)"

In [117]:
# init empty df
df_houses = pd.DataFrame(index=suburbs, columns=cols)
df_town_houses = pd.DataFrame(index=suburbs, columns=cols)
df_units = pd.DataFrame(index=suburbs, columns=cols)

for suburb in tqdm(suburbs):
    suburb_ = suburb.lower().replace(" ", "+")
    df_sub_table = pd.concat([
        tables[suburb_]["median"], 
        tables[suburb_]["rental"], 
        tables[suburb_]["sales"]
    ])
    
    df_houses.loc[suburb, :] = df_sub_table["House"].values
    df_town_houses.loc[suburb, :] = df_sub_table["Townhouses"].values
    df_units.loc[suburb, :] = df_sub_table["Units"].values

  0%|          | 0/2721 [00:00<?, ?it/s]

In [118]:
def df_to_numeric(df):
    df = df.dropna(axis=1)
    df = df.map(lambda x: x.replace("$", "").replace(",", "").replace("%", ""))
    return df.map(pd.to_numeric, errors="coerce")

In [119]:
df_houses = df_to_numeric(df_houses)
df_town_houses = df_to_numeric(df_town_houses)
df_units = df_to_numeric(df_units)

In [128]:
df_houses.index.name = "Suburb"
df_town_houses.index.name = "Suburb"
df_units.index.name = "Suburb"

In [129]:
df_houses.to_csv("df_tables_houses.csv")
df_town_houses.to_csv("df_tables_town_houses.csv")
df_units.to_csv("df_tables_units.csv")