In [15]:
import requests
import pandas as pd
from tqdm.auto import tqdm
import re

In [2]:
# Read the SA2 2021 workbook; the path is relative to the notebook's working directory.
df = pd.read_excel(io="data/SA2_2021_AUST.xlsx")

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,CHANGE_FLAG_2021,CHANGE_LABEL_2021,SA3_CODE_2021,SA3_NAME_2021,SA4_CODE_2021,SA4_NAME_2021,GCCSA_CODE_2021,GCCSA_NAME_2021,STATE_CODE_2021,STATE_NAME_2021,AUS_CODE_2021,AUS_NAME_2021,AREA_ALBERS_SQKM,ASGS_LOCI_URI_2021
0,101021007,Braidwood,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,3418.3525,http://linked.data.gov.au/dataset/asgsed3/SA2/...
1,101021008,Karabar,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,6.9825,http://linked.data.gov.au/dataset/asgsed3/SA2/...
2,101021009,Queanbeyan,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,4.762,http://linked.data.gov.au/dataset/asgsed3/SA2/...
3,101021010,Queanbeyan - East,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.0032,http://linked.data.gov.au/dataset/asgsed3/SA2/...
4,101021012,Queanbeyan West - Jerrabomberra,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.6748,http://linked.data.gov.au/dataset/asgsed3/SA2/...


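- SA2_NAME_2021: Statistical Area Level 2 name (used here as the suburb name)
- SA3_NAME_2021: Statistical Area Level 3 name (used here as an approximate council name)
- SA4_NAME_2021: Statistical Area Level 4 name (used here as an approximate city name)
- GCCSA_NAME_2021: Greater Capital City Statistical Area name (used here as the region name)
- STATE_NAME_2021: state or territory name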

# 1. Clean list of suburbs

Things to consider:

| Consideration                              | Raw example           | Clean example                 |
|--------------------------------------------|-----------------------|-------------------------------|
| Split names with '-' into separate suburbs | Emerald - Cockatoo    | [..., Emerald, Cockatoo, ...] |
| Remove any brackets                        | Blacktown (East)      | [..., Blacktown, ...]         |
| Remove cardinal directions                 | Pakenham - North East | [..., Pakenham, ...]          |
| Apply lowercase for website request        | Oakleigh              | [..., oakleigh, ...]          |
| Replace spaces in names with '+'           | Koo Wee Rup           | [..., koo+wee+rup, ...]       |

In [4]:
# Display only the five name columns of the table.
df.loc[:, ["SA2_NAME_2021", "SA3_NAME_2021", "SA4_NAME_2021", "GCCSA_NAME_2021", "STATE_NAME_2021"]]

Unnamed: 0,SA2_NAME_2021,SA3_NAME_2021,SA4_NAME_2021,GCCSA_NAME_2021,STATE_NAME_2021
0,Braidwood,Queanbeyan,Capital Region,Rest of NSW,New South Wales
1,Karabar,Queanbeyan,Capital Region,Rest of NSW,New South Wales
2,Queanbeyan,Queanbeyan,Capital Region,Rest of NSW,New South Wales
3,Queanbeyan - East,Queanbeyan,Capital Region,Rest of NSW,New South Wales
4,Queanbeyan West - Jerrabomberra,Queanbeyan,Capital Region,Rest of NSW,New South Wales
...,...,...,...,...,...
2468,Jervis Bay,Jervis Bay,Other Territories,Other Territories,Other Territories
2469,Norfolk Island,Norfolk Island,Other Territories,Other Territories,Other Territories
2470,Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Other Territories
2471,No usual address (OT),No usual address (OT),No usual address (OT),No usual address (OT),Other Territories


In [19]:
# Direction/position qualifiers that appear as stand-alone fragments of an SA2
# name (e.g. "Pakenham - North East") and are not suburbs in their own right.
cardinal_directions = [
    "Central",
    "North",
    "North East",
    "East",
    "South East",
    "South",
    "South West",
    "West",
    "North West"
]


def _split_sa2_name(sa2_name):
    """Break one SA2 name into its individual suburb-name fragments.

    Parenthesised qualifiers such as "(East)" or "(OT)" are removed, the name
    is split on " - ", and every fragment is stripped of surrounding
    whitespace. Stripping matters because deleting "(...)" leaves a stray
    space behind ("Blacktown (East)" -> "Blacktown "), which would otherwise
    survive into the request URL and also defeat the exact-match direction
    filter. Fragments that end up empty are discarded.

    Args:
        sa2_name: A single value from the SA2_NAME_2021 column.

    Returns:
        A list of cleaned name fragments, in their original order.
    """
    without_brackets = re.sub(r"\([^()]*\)", "", sa2_name)
    fragments = (part.strip() for part in without_brackets.split(" - "))
    return [part for part in fragments if part]


suburbs = []

# Collect every fragment of every SA2 name; missing names are skipped so that
# re.sub never receives a non-string value.
for sa2_name in df["SA2_NAME_2021"].dropna():
    suburbs.extend(_split_sa2_name(sa2_name))

print(len(suburbs))
# Drop fragments that are only a direction qualifier, not a suburb.
suburbs = [name for name in suburbs if name not in cardinal_directions]
print(len(suburbs))

3643
3332


In [46]:
# Candidate search terms: every suburb on its own plus every
# "<direction> <suburb>" combination. This is pure string formatting, so no
# progress bar is needed here.
direction_variants = [
    f"{direction} {sub}" for sub in suburbs for direction in cardinal_directions
]

# Lower-case each term and replace spaces with "+", the form used in the
# request URL. dict.fromkeys drops repeats while keeping first-seen order, so a
# name that occurs in several SA2 rows (e.g. "Queanbeyan") is requested once.
suburbs_dir = list(
    dict.fromkeys(
        term.lower().replace(" ", "+") for term in suburbs + direction_variants
    )
)

  0%|          | 0/3332 [00:00<?, ?it/s]

In [36]:
def scrape_jina_ai(suburb, timeout=30):
    """Fetch a suburb's realestateinvestar.com.au property page through r.jina.ai.

    Args:
        suburb: Suburb slug appended verbatim to the property URL,
            e.g. "koo+wee+rup". No URL escaping is applied.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely and a single stalled
            connection would block the whole scrape.

    Returns:
        The response body as text. The HTTP status code is not checked, so the
        body of an error response is returned as text as well.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    target_url = f"https://www.realestateinvestar.com.au/property/{suburb}"
    response = requests.get("https://r.jina.ai/" + target_url, timeout=timeout)
    return response.text

In [None]:
# Scrape every candidate term. A plain loop replaces the single dict
# comprehension so that one failed request no longer discards the pages that
# were already downloaded, and tqdm shows progress through the long run.
suburb_scrape = {}
scrape_errors = {}  # term -> exception, for inspection / retry afterwards

for suburb in tqdm(suburbs_dir, desc="Scraping suburbs"):
    if suburb in suburb_scrape:
        # Repeated term: the page is already stored, skip the extra request.
        continue
    try:
        suburb_scrape[suburb] = scrape_jina_ai(suburb)
    except requests.RequestException as error:
        # Record the failure and carry on with the remaining terms.
        scrape_errors[suburb] = error