In [34]:
import requests
import pandas as pd
import tqdm
import re

In [2]:
df = pd.read_excel("data/SA2_2021_AUST.xlsx")

In [13]:
df.head()

Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,CHANGE_FLAG_2021,CHANGE_LABEL_2021,SA3_CODE_2021,SA3_NAME_2021,SA4_CODE_2021,SA4_NAME_2021,GCCSA_CODE_2021,GCCSA_NAME_2021,STATE_CODE_2021,STATE_NAME_2021,AUS_CODE_2021,AUS_NAME_2021,AREA_ALBERS_SQKM,ASGS_LOCI_URI_2021
0,101021007,Braidwood,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,3418.3525,http://linked.data.gov.au/dataset/asgsed3/SA2/...
1,101021008,Karabar,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,6.9825,http://linked.data.gov.au/dataset/asgsed3/SA2/...
2,101021009,Queanbeyan,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,4.762,http://linked.data.gov.au/dataset/asgsed3/SA2/...
3,101021010,Queanbeyan - East,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.0032,http://linked.data.gov.au/dataset/asgsed3/SA2/...
4,101021012,Queanbeyan West - Jerrabomberra,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.6748,http://linked.data.gov.au/dataset/asgsed3/SA2/...


- SA2_NAME_2021: suburb name
- SA3_NAME_2021: council name
- SA4_NAME_2021: city name
- GCCSA_NAME_2021: region name
- STATE_NAME_2021: state name

# 1. Clean list of suburbs

Things to consider:

| Consideration                              | Raw example           | Clean example                 |
|--------------------------------------------|-----------------------|-------------------------------|
| Split names with '-' into separate suburbs | Emerald - Cockatoo    | [..., Emerald, Cockatoo, ...] |
| Remove any brackets                        | Blacktown (East)      | [..., Blacktown, ...]         |
| Remove cardinal directions                 | Pakenham - North East | [..., Pakenham, ...]          |
| Apply lowercase for website request        | Oakleigh              | [..., oakleigh, ...]          |
| Replace spaces in names with '+'           | Koo Wee Rup           | [..., koo+wee+rup, ...]       |

In [14]:
df[["SA2_NAME_2021", "SA3_NAME_2021", "SA4_NAME_2021", "GCCSA_NAME_2021", "STATE_NAME_2021"]]

Unnamed: 0,SA2_NAME_2021,SA3_NAME_2021,SA4_NAME_2021,GCCSA_NAME_2021,STATE_NAME_2021
0,Braidwood,Queanbeyan,Capital Region,Rest of NSW,New South Wales
1,Karabar,Queanbeyan,Capital Region,Rest of NSW,New South Wales
2,Queanbeyan,Queanbeyan,Capital Region,Rest of NSW,New South Wales
3,Queanbeyan - East,Queanbeyan,Capital Region,Rest of NSW,New South Wales
4,Queanbeyan West - Jerrabomberra,Queanbeyan,Capital Region,Rest of NSW,New South Wales
...,...,...,...,...,...
2468,Jervis Bay,Jervis Bay,Other Territories,Other Territories,Other Territories
2469,Norfolk Island,Norfolk Island,Other Territories,Other Territories,Other Territories
2470,Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Migratory - Offshore - Shipping (OT),Other Territories
2471,No usual address (OT),No usual address (OT),No usual address (OT),No usual address (OT),Other Territories


In [63]:
cardinal_directions = [
    "Central",
    "North",
    "North East",
    "East",
    "South East",
    "South",
    "South West",
    "West",
    "North West"
]

suburbs = []

for x in df["SA2_NAME_2021"]:
    # if parentheses exist - delete parantheses and whatever is in them 
    x_ = re.sub(r"\([^()]*\)", "", x)
    
    # then split by "-"
    subs = x_.split(" - ")
    
    for sub in subs:
        suburbs.append(sub)

In [65]:
[x for x in suburbs if "Eden" in x]

['Bega-Eden Hinterland', 'Eden', 'Edensor Park', 'Edens Landing', 'Eden Hill']

In [56]:
list(df["SA2_NAME_2021"].values)

['Braidwood',
 'Karabar',
 'Queanbeyan',
 'Queanbeyan - East',
 'Queanbeyan West - Jerrabomberra',
 'Googong',
 'Queanbeyan Surrounds',
 'Bombala',
 'Cooma',
 'Cooma Surrounds',
 'Jindabyne - Berridale',
 'Batemans Bay',
 'Batemans Bay - South',
 'Bega - Tathra',
 'Bega-Eden Hinterland',
 'Broulee - Tomakin',
 'Deua - Wadbilliga',
 'Eden',
 'Eurobodalla Hinterland',
 'Merimbula - Tura Beach',
 'Moruya - Tuross Head',
 'Narooma - Bermagui',
 'Goulburn',
 'Goulburn Surrounds',
 'Yass',
 'Yass Surrounds',
 'Young',
 'Young Surrounds',
 'Avoca Beach - Copacabana',
 'Box Head - MacMasters Beach',
 'Calga - Kulnura',
 'Erina - Green Point',
 'Gosford - Springfield',
 'Kariong',
 'Kincumber - Picketts Valley',
 'Narara',
 'Niagara Park - Lisarow',
 'Point Clare - Koolewong',
 'Saratoga - Davistown',
 'Terrigal - North Avoca',
 'Umina - Booker Bay - Patonga',
 'Wamberal - Forresters Beach',
 'Woy Woy - Blackwall',
 'Wyoming',
 'Bateau Bay - Killarney Vale',
 'Blue Haven - San Remo',
 'Budgewoi

In [52]:
suburbs

['Braidwood',
 'Karabar',
 'Queanbeyan',
 'Queanbeyan ',
 ' East',
 'Queanbeyan West ',
 ' Jerrabomberra',
 'Googong',
 'Queanbeyan Surrounds',
 'Bombala',
 'Cooma',
 'Cooma Surrounds',
 'Jindabyne ',
 ' Berridale',
 'Batemans Bay',
 'Batemans Bay ',
 ' South',
 'Bega ',
 ' Tathra',
 'Bega',
 'Eden Hinterland',
 'Broulee ',
 ' Tomakin',
 'Deua ',
 ' Wadbilliga',
 'Eden',
 'Eurobodalla Hinterland',
 'Merimbula ',
 ' Tura Beach',
 'Moruya ',
 ' Tuross Head',
 'Narooma ',
 ' Bermagui',
 'Goulburn',
 'Goulburn Surrounds',
 'Yass',
 'Yass Surrounds',
 'Young',
 'Young Surrounds',
 'Avoca Beach ',
 ' Copacabana',
 'Box Head ',
 ' MacMasters Beach',
 'Calga ',
 ' Kulnura',
 'Erina ',
 ' Green Point',
 'Gosford ',
 ' Springfield',
 'Kariong',
 'Kincumber ',
 ' Picketts Valley',
 'Narara',
 'Niagara Park ',
 ' Lisarow',
 'Point Clare ',
 ' Koolewong',
 'Saratoga ',
 ' Davistown',
 'Terrigal ',
 ' North Avoca',
 'Umina ',
 ' Booker Bay ',
 ' Patonga',
 'Wamberal ',
 ' Forresters Beach',
 'Woy Wo

In [9]:
df

Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,CHANGE_FLAG_2021,CHANGE_LABEL_2021,SA3_CODE_2021,SA3_NAME_2021,SA4_CODE_2021,SA4_NAME_2021,GCCSA_CODE_2021,GCCSA_NAME_2021,STATE_CODE_2021,STATE_NAME_2021,AUS_CODE_2021,AUS_NAME_2021,AREA_ALBERS_SQKM,ASGS_LOCI_URI_2021
0,101021007,Braidwood,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,3418.3525,http://linked.data.gov.au/dataset/asgsed3/SA2/...
1,101021008,Karabar,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,6.9825,http://linked.data.gov.au/dataset/asgsed3/SA2/...
2,101021009,Queanbeyan,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,4.7620,http://linked.data.gov.au/dataset/asgsed3/SA2/...
3,101021010,Queanbeyan - East,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.0032,http://linked.data.gov.au/dataset/asgsed3/SA2/...
4,101021012,Queanbeyan West - Jerrabomberra,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.6748,http://linked.data.gov.au/dataset/asgsed3/SA2/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2468,901031003,Jervis Bay,0,No change,90103,Jervis Bay,901,Other Territories,9OTER,Other Territories,9,Other Territories,AUS,Australia,67.2296,http://linked.data.gov.au/dataset/asgsed3/SA2/...
2469,901041004,Norfolk Island,0,No change,90104,Norfolk Island,901,Other Territories,9OTER,Other Territories,9,Other Territories,AUS,Australia,38.6510,http://linked.data.gov.au/dataset/asgsed3/SA2/...
2470,997979799,Migratory - Offshore - Shipping (OT),0,No change,99797,Migratory - Offshore - Shipping (OT),997,Migratory - Offshore - Shipping (OT),99799,Migratory - Offshore - Shipping (OT),9,Other Territories,AUS,Australia,,http://linked.data.gov.au/dataset/asgsed3/SA2/...
2471,999999499,No usual address (OT),0,No change,99999,No usual address (OT),999,No usual address (OT),99499,No usual address (OT),9,Other Territories,AUS,Australia,,http://linked.data.gov.au/dataset/asgsed3/SA2/...


In [None]:
def scrape_jina_ai(url):
    response = requests