## Create File System

In [9]:
import os

# Root of the landing zone, relative to the notebook's working directory.
RELATIVE_PATH = "../../data/landing/"

# One sub-directory per data category downloaded further down.
directory_names = ["housing", "population", "economic", "economic_by_region", "school"]

# Create each category directory.
# os.makedirs also creates missing parents (os.mkdir raises FileNotFoundError
# when "../../data/landing" itself does not exist yet), and exist_ok=True makes
# re-running this cell a no-op without a racy exists-then-create check.
for directory_name in directory_names:
    new_path = RELATIVE_PATH + directory_name
    os.makedirs(new_path, exist_ok=True)

## Functions

In [1]:
# Seconds passed to time.sleep() by the download helpers defined below.
SLEEP_TIME = 10.2

### Excel

In [2]:
import io
import time
import requests
import pandas as pd

In [3]:
def get_new_columns(old_columns, missing_columns):
    """Return the column labels with "Unnamed" placeholders replaced.

    Every label in ``old_columns`` that contains the text "Unnamed" is
    replaced by the next unused entry of ``missing_columns`` (in order);
    all other labels are kept as they are.

    Args:
        old_columns: iterable of existing column labels.
        missing_columns: replacement names, in the order the placeholder
            columns appear. This sequence is NOT modified, so the same list
            can safely be reused for several sheets.

    Returns:
        list: the new column labels, same length and order as ``old_columns``.

    Raises:
        IndexError: if there are more placeholder columns than replacements.
    """
    # Walk the replacements with an iterator instead of pop(0): popping
    # emptied the caller's list, so a second call with the same list failed.
    replacements = iter(missing_columns)

    new_columns = []
    for column in old_columns:
        if "Unnamed" in column:
            try:
                new_columns.append(next(replacements))
            except StopIteration:
                raise IndexError(
                    "more unnamed columns than replacement names supplied"
                ) from None
        else:
            new_columns.append(column)

    return new_columns

In [4]:
def write_csv(data, sheet_dict, read_excel_dict):
    """Parse one downloaded table and save it as a CSV file.

    Args:
        data: raw content of the downloaded file (bytes).
        sheet_dict: per-table settings. Keys read here:
            "csv" (truthy when ``data`` is a CSV rather than an Excel file),
            "sheet" (Excel sheet name, Excel only), "name" (output file stem)
            and, optionally, "missing header" (names for "Unnamed" columns).
        read_excel_dict: shared settings. Keys read here:
            "skiprows", "index col", "header rows" and optional "nrows"
            (Excel only), optional "skip tail" (number of trailing rows to
            drop) and "relative path" (output directory, with trailing slash).

    The result is written to ``<relative path><name>.csv``.
    """
    if (not sheet_dict.get("csv")):
        # pandas deprecates passing raw bytes to read_excel; wrap them in a
        # file-like object. Anything that is not bytes is forwarded untouched.
        excel_source = io.BytesIO(data) if isinstance(data, (bytes, bytearray)) else data

        # read the excel data
        df = pd.read_excel(excel_source,
            sheet_name=sheet_dict["sheet"],
            skiprows=read_excel_dict["skiprows"],
            index_col=read_excel_dict["index col"],
            header=read_excel_dict["header rows"],
            nrows=read_excel_dict.get("nrows")
        )
    else:
        # read csv data
        # Decode with cp1252 (the encoding the original read_csv call asked
        # for). Decoding as ASCII replaced every non-ASCII character with
        # U+FFFD, and read_csv's `encoding` is ignored for an in-memory str.
        csv_data = io.StringIO(data.decode('cp1252', errors="replace"))
        df = pd.read_csv(csv_data, index_col=0, header=0)

    # skip the final records if necessary
    if (read_excel_dict.get("skip tail")):
        df = df.iloc[:-read_excel_dict["skip tail"]]

    # if there are missing columns
    missing_header = sheet_dict.get("missing header")
    if (missing_header):
        # Pass a copy so the configured list is never altered between sheets.
        df.columns = get_new_columns(df.columns, list(missing_header))

    # get the full path
    full_path = read_excel_dict["relative path"] + sheet_dict["name"] + ".csv"

    # write to csv
    df.to_csv(full_path)

In [5]:
def write_all_csvs(url_to_sheets, read_excel_dict, timeout=60):
    """Download every URL once and write one CSV per configured table.

    Args:
        url_to_sheets: dict mapping a download URL to a list of sheet dicts;
            each sheet dict is forwarded to ``write_csv`` and needs a "name".
        read_excel_dict: read/write settings forwarded unchanged to
            ``write_csv`` for every table.
        timeout: seconds to wait for the server before aborting a request
            (optional; without it a stalled connection blocks forever).

    Raises:
        requests.HTTPError: if a download returns a 4xx/5xx status.
    """
    for url, sheet_dict_list in url_to_sheets.items():
        # get the data
        response = requests.get(url, timeout=timeout)
        # Stop on HTTP errors: otherwise the body of an error page would be
        # handed to the parsers and could even be saved as if it were data.
        response.raise_for_status()
        data = response.content

        # for each sheet, write a csv
        for sheet_dict in sheet_dict_list:
            print(f"writing \"{sheet_dict['name']}\"")

            write_csv(data, sheet_dict, read_excel_dict)

            print("success, waiting sever reset")
            time.sleep(SLEEP_TIME)

### ABS

In [6]:
from bs4 import BeautifulSoup
import csv

In [7]:
### DOWNLOADING PAGES FROM HTML

def tag_to_csv(table_tag, dst_path):
    """Dump the rows of an HTML table into a CSV file.

    Assumes a bs4 object has been passed in as ``table_tag``. Each ``<tr>``
    found under it becomes one CSV row whose fields are the whitespace-stripped
    texts of that row's ``<td>`` / ``<th>`` cells. The file at ``dst_path`` is
    (over)written as UTF-8.
    """
    def _cell_texts(row_tag):
        # text of every data/header cell of one row, in document order
        return [cell.text.strip() for cell in row_tag.find_all(['td', 'th'])]

    with open(dst_path, mode='w', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerows(
            _cell_texts(row_tag) for row_tag in table_tag.find_all('tr')
        )

In [8]:
def write_table(string, url, dst_path, timeout=60):
    """Download a web page and save one of its captioned tables as CSV.

    The table is located through a ``<caption>`` element whose text equals
    ``string``; the caption's parent element is then written with
    ``tag_to_csv``. Afterwards the function sleeps for ``SLEEP_TIME`` seconds.

    Args:
        string: caption text identifying the table.
        url: address of the page to download.
        dst_path: path of the CSV file to write.
        timeout: seconds to wait for the server before aborting the request
            (optional; without it a stalled connection blocks forever).

    Raises:
        requests.HTTPError: if the download returns a 4xx/5xx status.
        ValueError: if the page has no caption matching ``string``.
    """
    response = requests.get(url, timeout=timeout)
    # Do not try to parse the body of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html')

    caption_tag = soup.find('caption', string=string)
    if caption_tag is None:
        # Previously surfaced as an opaque AttributeError on None.
        raise ValueError(f"no <caption> with text {string!r} found at {url}")

    table_tag = caption_tag.find_parent()

    tag_to_csv(table_tag, dst_path)

    time.sleep(SLEEP_TIME)

## Housing

In [23]:
# Read options for the rental workbook: rows 0 and 2 of each sheet are skipped,
# the first remaining row is the header and column 0 is the index.
read_excel_dict_houses = dict([
    ("skiprows", [0, 2]),
    ("index col", 0),
    ("header rows", 0),
    ("relative path", RELATIVE_PATH + "housing/"),
])

# Workbook URL -> sheets to export ("name" becomes the output CSV file stem).
URL_TO_SHEETS_HOUSES = {
    "https://www.dffh.vic.gov.au/moving-annual-rent-suburb-june-quarter-2024": [
        {"sheet": f"{bedrooms} bedroom {dwelling}", "name": f"{dwelling}_{bedrooms}_bed"}
        for dwelling, bedrooms in [
            ("flat", 1), ("flat", 2), ("flat", 3),
            ("house", 2), ("house", 3), ("house", 4),
        ]
    ]
}

In [24]:
# Download the rental workbook and write one CSV per configured sheet.
write_all_csvs(URL_TO_SHEETS_HOUSES, read_excel_dict_houses)

writing "flat_1_bed"
success, waiting sever reset
writing "flat_2_bed"
success, waiting sever reset
writing "flat_3_bed"
success, waiting sever reset
writing "house_2_bed"
success, waiting sever reset
writing "house_3_bed"
success, waiting sever reset
writing "house_4_bed"
success, waiting sever reset


## Economic

### Interest Rates

In [21]:
# Read options for the lending-rates workbook: of the first eleven rows only
# row 1 is kept, and it serves as the header; column 0 is the index.
read_excel_dict_interest = {
    "skiprows": [0, *range(2, 11)],
    "index col": 0,
    "header rows": 0,
    "relative path": RELATIVE_PATH + "economic/",
}

# Workbook URL -> sheets to export ("name" becomes the output CSV file stem).
URL_TO_SHEETS_INTEREST = {
    "https://www.rba.gov.au/statistics/tables/xls/f05hist.xlsx?v=2024-09-28-19-56-34": [
        {"sheet": "Data", "name": "lending"},
    ],
}

In [14]:
# Download the lending-rates workbook and write its "Data" sheet as a CSV.
write_all_csvs(URL_TO_SHEETS_INTEREST, read_excel_dict_interest)

writing "lending"
success, waiting sever reset


### GDP

In [15]:
# Read options for the key-aggregates workbook: the first nine rows are
# skipped, the next row is the header and column 0 is the index.
read_excel_dict_gdp = {
    "skiprows": [*range(9)],
    "index col": 0,
    "header rows": 0,
    "relative path": RELATIVE_PATH + "economic/",
}

# Workbook URL -> sheets to export ("name" becomes the output CSV file stem).
URL_TO_SHEETS_GDP = {
    "https://www.abs.gov.au/statistics/economy/national-accounts/australian-national-accounts-national-income-expenditure-and-product/jun-2024/5206001_Key_Aggregates.xlsx": [
        {"sheet": "Data1", "name": "gdp"},
    ],
}

In [16]:
# Download the key-aggregates workbook and write its "Data1" sheet as a CSV.
write_all_csvs(URL_TO_SHEETS_GDP, read_excel_dict_gdp)

writing "gdp"
success, waiting sever reset


### Inflation

In [17]:
# Read options for the CPI workbook: row 0 is the header, rows 1-9 are
# skipped and column 0 is the index.
read_excel_dict_inflation = {
    "skiprows": [*range(1, 10)],
    "index col": 0,
    "header rows": 0,
    "relative path": RELATIVE_PATH + "economic/",
}

# Workbook URL -> sheets to export ("name" becomes the output CSV file stem).
URL_TO_SHEETS_INFLATION = {
    "https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/consumer-price-index-australia/jun-quarter-2024/640106.xlsx": [
        {"sheet": "Data1", "name": "inflation"},
    ],
}

In [18]:
# Download the CPI workbook and write its "Data1" sheet as a CSV.
write_all_csvs(URL_TO_SHEETS_INFLATION, read_excel_dict_inflation)

writing "inflation"
success, waiting sever reset


## Population

### Immigration

In [109]:
# Read options for the overseas-migration workbook: the first twelve rows are
# skipped, the next row is the header, column 0 is the index and at most 250
# data rows are read.
read_excel_dict_immigration = {
    "skiprows": [*range(12)],
    "index col": 0,
    "header rows": 0,
    "relative path": RELATIVE_PATH + "population/",
    "nrows": 250,
}

# Workbook URL -> sheets to export ("name" becomes the output CSV file stem).
URL_TO_SHEETS_IMMIGRATION = {
    "https://www.abs.gov.au/statistics/people/population/overseas-migration/2022-23-financial-year/34070DO001_202223.xlsx": [
        {"sheet": "Table 1.3", "name": "immigration_2022"},
    ],
}

In [110]:
# Download the overseas-migration workbook and write "Table 1.3" as a CSV.
write_all_csvs(URL_TO_SHEETS_IMMIGRATION, read_excel_dict_immigration)

### Age Demos

In [12]:
# Read options for the age-and-sex workbook: the first five rows are skipped,
# the next two rows form a two-level header, column 0 is the index and the
# last row of the table is dropped.
read_excel_dict_age = {
    "skiprows": [*range(5)],
    "index col": 0,
    "header rows": [0, 1],
    "relative path": RELATIVE_PATH + "population/",
    "skip tail": 1,
}

# Workbook URL -> sheets to export ("name" becomes the output CSV file stem).
URL_TO_SHEETS_AGE = {
    "https://www.abs.gov.au/statistics/people/population/regional-population-age-and-sex/2023/32350DS0005_2001-23.xlsx": [
        {"sheet": "Table 3", "name": "age_demos"},
    ],
}

In [13]:
# Download the age-and-sex workbook and write "Table 3" as a CSV.
write_all_csvs(URL_TO_SHEETS_AGE, read_excel_dict_age)

### Population Projections

In [113]:
# Page that holds the projection tables.
URL_POPULATION = (
    "https://www.abs.gov.au/statistics/people/population/"
    "population-projections-australia/latest-release"
)

# Tables to extract: "string" is the table caption to look for,
# "name" the output CSV file stem.
REQUESTS_POPULATION = [
    {"string": caption, "name": file_stem}
    for caption, file_stem in (
        ("Projected population, Victoria", "population_proj"),
        ("Projected natural increase, Victoria", "population_natural_proj"),
        ("Population proportion(a), Victoria", "population_proportions"),
    )
]

In [114]:
# Save every requested projection table as <landing>/population/<name>.csv.
for population_dict in REQUESTS_POPULATION:
    path = f'{RELATIVE_PATH}population/{population_dict["name"]}.csv'
    write_table(population_dict["string"], URL_POPULATION, path)

## Economic by Region

In [18]:
# Read options for the personal-income workbooks: the first five rows are
# skipped, the next two rows form a two-level header, columns 0 and 1 form the
# index and the last five rows of each table are dropped.
read_excel_dict_socioeconomic = {
    "skiprows": [*range(5)],
    "index col": [0, 1],
    "header rows": [0, 1],
    "relative path": RELATIVE_PATH + "economic_by_region/",
    "skip tail": 5,
}

# Workbook URL -> sheets to export ("name" becomes the output CSV file stem).
# NOTE(review): the second name says 2016_2023 while its URL covers 2016-17 to
# 2020-21 -- confirm before renaming, other code may read the file by name.
URL_TO_SHEETS_SOCIOECONOMIC = {
    "https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions/personal-income-australia/2011-12-2016-17/6124055002ds0001_2019.xls": [
        {"sheet": "Table 1.4", "name": "income_by_geography_2011_2017"},
    ],
    "https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions/personal-income-australia/2020-21-financial-year/Table%201%20-%20Total%20income%2C%20earners%20and%20summary%20statistics%20by%20geography%2C%202016-17%20to%202020-21.xlsx": [
        {"sheet": "Table 1.4", "name": "income_by_geography_2016_2023"},
    ],
    "https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions/personal-income-australia/2020-21-financial-year/Table%202%20-%20Total%20income%20distribution%20by%20geography%2C%202020-21.xlsx": [
        {"sheet": "Table 2.4", "name": "income_distribution_by_geography_2021"},
    ],
}

In [19]:
# Download the three personal-income workbooks and write one CSV per sheet.
write_all_csvs(URL_TO_SHEETS_SOCIOECONOMIC, read_excel_dict_socioeconomic)

writing "income_by_geography_2011_2017"
success, waiting sever reset
writing "income_by_geography_2016_2023"
success, waiting sever reset
writing "income_distribution_by_geography_2021"
success, waiting sever reset


## Schools

In [117]:
# Read options for the school workbooks: of the first seven rows only row 5 is
# kept, and it serves as the header; column 0 is the index.
# (The CSV entry below does not use the Excel-specific options.)
read_excel_dict_school = {
    "skiprows": [x for x in range(7) if x != 5],
    "index col": 0,
    "header rows": 0,
    "relative path": RELATIVE_PATH + "school/"
}

# Names given, in order, to the columns that come back as "Unnamed".
MISSING_HEADERS_SCHOOL = ["VCAA code", "School name", "Sector", "Locality",
                          "Total Completed Year 12", "Survey Participants"]

# File URL -> tables to export ("name" becomes the output CSV file stem).
# Each Excel sheet gets its OWN copy of the header list, so that anything that
# consumes or edits one sheet's list cannot affect the other sheet.
URL_TO_SHEETS_SCHOOL = {
    "https://www.education.vic.gov.au/Documents/about/research/OnTrack2019/DestinationData2019.xlsx": [{
        "sheet": "SCHOOL PUBLICATION TABLE 2019",
        "name": "after_school_2019",
        "missing header": list(MISSING_HEADERS_SCHOOL)
    }], "https://www.education.vic.gov.au/Documents/about/research/OnTrack2021/DestinationData2021_new.XLSX": [{
        "sheet": "SCHOOL PUBLICATION TABLE 2021",
        "name": "after_school_2021",
        "missing header": list(MISSING_HEADERS_SCHOOL)
    }], "https://www.education.vic.gov.au/Documents/about/research/datavic/dv331_schoollocations2022.csv": [{
        "csv": True,
        "name": "locations_2022"
    }]
}

In [118]:
# Download the two destination workbooks and the school-locations CSV, and
# write one CSV per configured table.
write_all_csvs(URL_TO_SHEETS_SCHOOL, read_excel_dict_school)