# Data By Region_Crawling

Import libraries

In [1]:
import csv
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service


In [2]:
# Start the Chrome session used below to read the region index page; the
# chromedriver path is whatever ChromeDriverManager().install() returns.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

Read the URL of each Region 

In [3]:
# Open the ABS "Data by Region" page and collect the link of every Victorian
# SA2 region found in its navigation tree.

# URL prefix used to recognise the Victorian region pages
base_url = "https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=2"

# Dictionary mapping region name -> region page URL
sa2_links_with_names = {}

try:
    driver.get("https://dbr.abs.gov.au/")
    elements = driver.find_elements(By.XPATH, "//li[contains(@class, 'navTreeBranchContent')]//span[@aria-label]")

    # Get the region name and URL of every navigation entry
    for element in elements:
        region_name = element.get_attribute("aria-label")
        try:
            sa2_link = element.find_element(By.XPATH, ".//a").get_attribute('href')
        except Exception as e:
            print(f"Can't Find the URL:{region_name}, error: {e}")
            continue
        # get_attribute returns None when the anchor carries no href; report
        # it explicitly instead of relying on an AttributeError further down.
        if not sa2_link:
            print(f"Can't Find the URL:{region_name}, error: anchor has no href")
            continue
        if sa2_link.startswith(base_url):
            sa2_links_with_names[region_name] = sa2_link
finally:
    # Close the browser even when loading the page or the lookup fails
    driver.quit()

In [4]:
# Show the collected {region name: URL} mapping as a quick sanity check
print(sa2_links_with_names)

{'Alfredton': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201011001', 'Ballarat': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201011002', 'Buninyong': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201011005', 'Delacombe': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201011006', 'Smythes Creek': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201011007', 'Wendouree - Miners Rest': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201011008', 'Ballarat East - Warrenheip': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201011481', 'Ballarat North - Invermay': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201011482', 'Canadian - Mount Clear': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201011483', 'Sebastopol - Redan': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201011484', 'Bacchus Marsh Surrounds': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201021009', 'Creswick - Clunes': 'https://dbr.abs.gov.au/region.html?lyr=sa2&rgn=201021010', 'Daylesford': 'https://dbr.ab

Read the data of each region

In [5]:
# Helper that pulls the requested rows out of a parsed HTML table
def extract_table_data(table, required_descriptions, data):
    """Collect the cell values of selected rows of a parsed HTML table.

    Args:
        table: Parsed table element (a BeautifulSoup tag, or any object
            offering compatible ``find`` / ``find_all`` methods).
        required_descriptions: Row descriptions to keep. Each is compared
            with the stripped text of a row's second ``<td>``.
        data: Dictionary updated in place with
            ``{description: {cell id: stripped cell text}}``.

    Returns:
        The same ``data`` dictionary that was passed in.
    """
    # Fall back to the table itself when there is no <tbody>, instead of
    # failing with an AttributeError on None.
    body = table.find('tbody')
    if body is None:
        body = table

    for row in body.find_all('tr'):
        columns = row.find_all('td')
        # Rows with fewer than two cells have no description cell
        if len(columns) < 2:
            continue
        description = columns[1].text.strip()
        if description not in required_descriptions:
            continue
        values = data.setdefault(description, {})
        for col in columns[2:]:
            # The cell's id attribute is the key (presumably the year)
            values[col.get('id')] = col.text.strip()
    return data

In [6]:
# Look up one table by id and merge its selected rows into data_range
def handle_table(soup, table_id, required_descriptions, data_range):
    """Extract the requested rows of the table with the given id, if present.

    Args:
        soup: Parsed page offering ``find('table', id=...)``.
        table_id: Value of the ``id`` attribute of the table to look for.
        required_descriptions: Row descriptions passed on to
            ``extract_table_data``.
        data_range: Dictionary that accumulates the extracted values.

    Returns:
        ``data_range`` unchanged when no such table is found, otherwise the
        result of ``extract_table_data`` for that table.
    """
    matched_table = soup.find('table', id=table_id)
    if not matched_table:
        return data_range
    return extract_table_data(matched_table, required_descriptions, data_range)

In [7]:
def scroll_and_click(driver, button):
    """Move the pointer onto ``button`` and then click it.

    Args:
        driver: WebDriver session the action chain is bound to.
        button: Element to move to and click.
    """
    ActionChains(driver).move_to_element(button).perform()
    button.click()

In [8]:
# Scrape the key statistics and the long-run data of one region page
def read_combined_data_by_region(city_name, city_url):
    """Open a region page in Chrome and scrape its statistics.

    Args:
        city_name: Region name; used as the single key of the returned dict.
        city_url: URL of the region page to load.

    Returns:
        ``{city_name: {'key_data': {...}, 'data_range': {...}}}``.
        ``key_data`` maps the text of the second cell of each key-statistics
        row to ``"<third cell>/<fourth cell>"``. ``data_range`` is the
        mapping accumulated by ``handle_table`` over the long-run tables.

    Raises:
        Any exception raised while loading the page or waiting for the
        section buttons (each wait is limited to 30 seconds) propagates to
        the caller after the browser has been closed.
    """
    # A fresh browser is started for every region and always closed in `finally`
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    try:
        # Loading happens inside the try so a failed page load cannot leak the browser
        driver.get(city_url)

        # Click every section button whose content is needed
        button_ids = ['popcollapse', 'econcollapse', 'inccollapse', 'educollapse', 'famcollapse']
        wait = WebDriverWait(driver, 30)
        for button_id in button_ids:
            button = wait.until(EC.element_to_be_clickable((By.ID, button_id)))
            scroll_and_click(driver, button)

        # Parse the page as rendered after the clicks
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # The dictionary of long-run data
        data_range = {}
        # The dictionary of key statistics
        key_data = {}

        # Get the key statistics
        table = soup.find('table', class_='biscuitTable')
        if table:
            for row in table.find_all('tr'):
                columns = row.find_all('td')
                # Rows with fewer than two cells have no label cell; skipping
                # them avoids the IndexError the previous guards allowed.
                if len(columns) < 2:
                    continue
                label = columns[1].text.strip()
                first_value = columns[2].text.strip() if len(columns) > 2 else ''
                second_value = columns[3].text.strip() if len(columns) > 3 else ''
                key_data[label] = f"{first_value}/{second_value}"

        # Table id -> row descriptions to keep from that long-run table
        tables_to_extract = {
            'ERP': ['Estimated resident population (no.)', 'Population density (persons/km2)', 'Working age population (aged 15-64 years) (no.)'],
            'MIGRATION_1': ['Internal arrivals (no.)'],
            'CABEE_1': ['Total number of businesses'],
            'DWELLSTOCK_1': ['Houses - total (no.)', 'Townhouses - total (no.)', 'Apartments - total (no.)', 'Total dwellings (no.)'],
            'INCOME_1': ['Median total income (excl. Government pensions and allowances) ($)'],
            'LEED_1': ['Number of jobs'],
            'HIGH_1': ['Completed year 12 or equivalent (%)'],
            'LF_1': ['Employed (no.)', 'Unemployed (no.)'],
            'TENURE_1': ['Rented (no.)'],
            'HOMELESS_1': ['Count of homeless persons (no.)']
        }

        for table_id, required_descriptions in tables_to_extract.items():
            data_range = handle_table(soup, table_id, required_descriptions, data_range)
        return {city_name: {'key_data': key_data, 'data_range': data_range}}

    finally:
        driver.quit()


In [9]:
def extract_all_region_data(sa2_links_with_names):
    """Scrape every region in turn and merge the results into one dictionary.

    Args:
        sa2_links_with_names: Mapping of region name -> region page URL.

    Returns:
        Dictionary keyed by region name, holding whatever
        ``read_combined_data_by_region`` returned for each region that was
        processed without an exception. Failed regions are reported on
        stdout and left out.
    """
    collected = {}
    for name, link in sa2_links_with_names.items():
        print(f"Processing data for region: {name}")
        try:
            collected.update(read_combined_data_by_region(name, link))
        except Exception as err:
            # Best effort: one failing region must not stop the whole crawl
            print(f"Failed to process region {name}: {err}")
    return collected


In [10]:
# Scrape every collected region; each region is loaded in its own browser
# session by read_combined_data_by_region.
all_data = extract_all_region_data(sa2_links_with_names)

Processing data for region: Alfredton


Processing data for region: Ballarat
Processing data for region: Buninyong
Processing data for region: Delacombe
Processing data for region: Smythes Creek
Processing data for region: Wendouree - Miners Rest
Processing data for region: Ballarat East - Warrenheip
Processing data for region: Ballarat North - Invermay
Processing data for region: Canadian - Mount Clear
Processing data for region: Sebastopol - Redan
Processing data for region: Bacchus Marsh Surrounds
Processing data for region: Creswick - Clunes
Processing data for region: Daylesford
Processing data for region: Gordon (Vic.)
Processing data for region: Avoca
Processing data for region: Beaufort
Processing data for region: Golden Plains - North
Processing data for region: Maryborough (Vic.)
Processing data for region: Maryborough Surrounds
Processing data for region: Bendigo
Processing data for region: California Gully - Eaglehawk
Processing data for region: East Bendigo - Kennington
Processing data for region: Flora Hill - S

The regions "Alps - East" and "Lake King" were not read successfully. On inspection, these regions have essentially no residents, so there is no need to load them again.

Store the key statistics

In [11]:
# Build the key-statistics table: one row per region, one column per statistic
key_data_columns = [
    pd.Series(region_data.get('key_data', {}), name=region_name)
    for region_name, region_data in all_data.items()
]
# Concatenate once rather than growing the frame inside a loop, which copies
# the whole frame on every iteration. pd.concat rejects an empty list, so an
# empty frame is used when no region was scraped.
if key_data_columns:
    key_data_df = pd.concat(key_data_columns, axis=1).T
else:
    key_data_df = pd.DataFrame()

csv_file_path = "../../data/landing/region_data/key_statistics/all_region_key_data_new.csv"
# Ensure the directory exists before saving the file
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

key_data_df.to_csv(csv_file_path, index_label="Region")

Store the long-run (range) data of each region

In [12]:
output_dir = "../../data/landing/region_data/long_run"
os.makedirs(output_dir, exist_ok=True)

# Only regions that report 'Estimated resident population (no.)' are exported
master_region_key = 'Estimated resident population (no.)'
master_region_list = [
    region_name
    for region_name, region_data in all_data.items()
    if master_region_key in region_data.get('data_range', {})
]

# One dataframe per statistic: a row per master region, a column per year
key_dataframes = {}
for region_name, region_data in all_data.items():
    data_range = region_data.get('data_range', {})

    for key, year_data in data_range.items():
        if key not in key_dataframes:
            # Pre-fill every master region with None in a single construction,
            # so regions without this statistic keep empty cells. This avoids
            # appending the placeholder rows one pd.concat at a time.
            placeholder = {'Region': list(master_region_list)}
            for year in year_data:
                placeholder[year] = [None] * len(master_region_list)
            key_dataframes[key] = pd.DataFrame(placeholder)

        frame = key_dataframes[key]
        # The row mask depends only on the region, so compute it once per
        # (region, statistic) instead of once per year.
        region_rows = frame['Region'] == region_name
        for year, value in year_data.items():
            frame.loc[region_rows, year] = value

# Save each statistic to its own CSV file
for key, df in key_dataframes.items():
    df = df.set_index('Region').reindex(master_region_list).reset_index()
    file_name = f"{key.replace(' (no.)','').replace('-','').replace(' ', '_').replace('(', '').replace(')', '').replace('/', '_')}.csv"
    file_path = os.path.join(output_dir, file_name)
    df.to_csv(file_path, index=False)
    print(f"Saved data for '{key}' to '{file_path}'")


Saved data for 'Estimated resident population (no.)' to '../../data/landing/region_data/long_run/Estimated_resident_population.csv'
Saved data for 'Population density (persons/km2)' to '../../data/landing/region_data/long_run/Population_density_persons_km2.csv'
Saved data for 'Working age population (aged 15-64 years) (no.)' to '../../data/landing/region_data/long_run/Working_age_population_aged_1564_years.csv'
Saved data for 'Internal arrivals (no.)' to '../../data/landing/region_data/long_run/Internal_arrivals.csv'
Saved data for 'Total number of businesses' to '../../data/landing/region_data/long_run/Total_number_of_businesses.csv'
Saved data for 'Houses - total (no.)' to '../../data/landing/region_data/long_run/Houses__total.csv'
Saved data for 'Townhouses - total (no.)' to '../../data/landing/region_data/long_run/Townhouses__total.csv'
Saved data for 'Apartments - total (no.)' to '../../data/landing/region_data/long_run/Apartments__total.csv'
Saved data for 'Total dwellings (no.)'