In [10]:
import pandas as pd
import requests

## Create File System

In [3]:
import os

# change directories to default one
# os.chdir("/home/malachy/ADS (2024)/project-2-group-real-estate-industry-project-11/scripts")

# get the relative output path
RELATIVE_PATH = "../data/landing/"

# get the directory names
directory_names = ["housing", "population", "economic", "economic_by_region", "school"]

# create each new directory
for directory_name in directory_names:
    new_path = RELATIVE_PATH + directory_name

    # if path doesn't exist, create
    if (not os.path.exists(new_path)):
        os.mkdir(new_path)

## Housing

In [4]:
SLEEP_TIME = 10.2

In [5]:
def get_new_columns(old_columns, missing_columns):
    new_columns = []
    for column in old_columns:
        if "Unnamed" in column:
            new_columns.append(missing_columns.pop(0))
        else:
            new_columns.append(column)

    return new_columns

In [6]:
import io

def write_csv(data, sheet_dict, read_excel_dict):
    if (not sheet_dict.get("csv")):
        # read the excel data
        df = pd.read_excel(data,
            sheet_name=sheet_dict["sheet"],
            skiprows=read_excel_dict["skiprows"],
            index_col=read_excel_dict["index col"], 
            header=read_excel_dict["header rows"],
            nrows=read_excel_dict.get("nrows")
        )
    else:
        # read csv data
        csv_data = io.StringIO(data.decode('ascii', errors="replace"))
        df = pd.read_csv(csv_data, index_col=0, header=0, encoding='cp1252')

    # skip the final records if necessary
    if (read_excel_dict.get("tail skip")):
        df = df.iloc[:-read_excel_dict["tail skip"]]

    # if there are missing columns
    if (sheet_dict.get("missing header")):
        df.columns = get_new_columns(df.columns, sheet_dict.get("missing header"))

    # get the full path
    full_path = read_excel_dict["relative path"] + sheet_dict["name"] + ".csv"

    # write to csv
    df.to_csv(full_path)

In [7]:
import time

def write_all_csvs(url_to_sheets, read_excel_dict):
    for url, sheet_dict_list in url_to_sheets.items():
        # get the data
        data = requests.get(url).content

        # for each sheet, write a csv
        for sheet_dict in sheet_dict_list:
            write_csv(data, sheet_dict, read_excel_dict)
        
        time.sleep(SLEEP_TIME)

        

In [8]:
# setting up the parameters for the reading data
read_excel_dict_houses = {
    "skiprows": [x for x in range(11) if x != 1],
    "index col": 0,
    "header rows": 0,
    "relative path": "../data/landing/housing/"
}

# get all the URLS and sheets to get
URL_TO_SHEETS_HOUSES = {
    "https://www.dffh.vic.gov.au/moving-annual-rents-suburb-march-quarter-2023-excel": [
        {"sheet": "1 bedroom flat", "name": "flat_1_bed_2023"},
        {"sheet": "2 bedroom flat", "name": "flat_2_bed_2023"},
        {"sheet": "3 bedroom flat", "name": "flat_3_bed_2023"},
        {"sheet": "2 bedroom house", "name": "house_2_bed_2023"},
        {"sheet": "3 bedroom house", "name": "house_3_bed_2023"},
        {"sheet": "4 bedroom house", "name": "house_4_bed_2023"}
    ]
}


In [12]:
write_all_csvs(URL_TO_SHEETS_HOUSES, read_excel_dict_houses)

  df = pd.read_excel(data,
  df = pd.read_excel(data,
  df = pd.read_excel(data,
  df = pd.read_excel(data,
  df = pd.read_excel(data,
  df = pd.read_excel(data,


## Economic

### Interest Rates

In [13]:
# setting up the parameters for the reading data
read_excel_dict_interest = {
    "skiprows": [x for x in range(11) if x != 1],
    "index col": 0,
    "header rows": 0,
    "relative path": "../data/landing/economic/"
}

# get all the URLS and sheets to get
URL_TO_SHEETS_INTEREST = {
    "https://www.rba.gov.au/statistics/tables/xls/f06hist.xlsx": [{
        "sheet": "Data",
        "name": "lending"
    }],
    "https://www.rba.gov.au/statistics/tables/xls/f04hist.xlsx": [{
        "sheet": "Data",
        "name": "deposit"
    }],
    "https://www.rba.gov.au/statistics/tables/xls/f04-1-hist.xlsx": [{
        "sheet": "Data",
        "name": "paid_deposit"
    }]
}

In [14]:
write_all_csvs(URL_TO_SHEETS_INTEREST, read_excel_dict_interest)

  df = pd.read_excel(data,
  df = pd.read_excel(data,
  df = pd.read_excel(data,


### GDP

In [15]:
import csv

"""Assumes a bs4 object has been passed in"""
def tag_to_csv(table_tag, dst_path):
    with open(dst_path, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)

        # Iterate through table rows <tr>
        for row in table_tag.find_all('tr'):
            # Get all <td> or <th> in the row
            columns = row.find_all(['td', 'th'])

            # Extract text from each column and write the row to the CSV file
            writer.writerow([col.text.strip() for col in columns])

In [16]:
from bs4 import BeautifulSoup
import time

def write_table(string, url, dst_path):
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')
    
    caption_tag = soup.find('caption', string=string)

    table_tag = caption_tag.find_parent()

    tag_to_csv(table_tag, dst_path)

    time.sleep(SLEEP_TIME)

In [17]:
# no #1: Gross domestic product, chain volume measures, seasonally adjusted
GDP_URL = "https://www.abs.gov.au/statistics/economy/national-accounts/australian-national-accounts-national-income-expenditure-and-product/latest-release"
GDP_STRING = "Gross domestic product, chain volume measures, seasonally adjusted"
DST_PATH = "../data/landing/economic/gdp.csv"

In [18]:
write_table(GDP_STRING, GDP_URL, DST_PATH)

### Inflation

In [19]:
INFLATION_URL = "https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/consumer-price-index-australia/latest-release"
INFLATION_STRING = "All groups CPI and Trimmed mean, Australia, annual movement (%)"
DST_PATH = "../data/landing/economic/inflation.csv"

In [20]:
write_table(INFLATION_STRING, INFLATION_URL, DST_PATH)

## Population

### Immigration

In [21]:
# setting up the parameters for the reading data
read_excel_dict_immigration = {
    "skiprows": list(range(12)),
    "index col": 0,
    "header rows": 0,
    "relative path": "../data/landing/population/",
    "nrows": 250
}

# get all the URLS and sheets to get
URL_TO_SHEETS_IMMIGRATION = {
    "https://www.abs.gov.au/statistics/people/population/overseas-migration/2022-23-financial-year/34070DO001_202223.xlsx": [{
        "sheet": "Table 1.3",
        "name": "immigration_2022"
    }]
}

In [22]:
write_all_csvs(URL_TO_SHEETS_IMMIGRATION, read_excel_dict_immigration)

  df = pd.read_excel(data,


### Age Demos

In [23]:
# setting up the parameters for the reading data
read_excel_dict_age = {
    "skiprows": list(range(5)),
    "index col": 0,
    "header rows": [0, 1],
    "relative path": "../data/landing/population/",
    "skip tail": 1
}

# get all the URLS and sheets to get
URL_TO_SHEETS_AGE = {
    "https://www.abs.gov.au/statistics/people/population/regional-population-age-and-sex/2023/32350DS0005_2001-23.xlsx": [{
        "sheet": "Table 3",
        "name": "age_demos"
    }]
}

In [24]:
write_all_csvs(URL_TO_SHEETS_AGE, read_excel_dict_age)

  df = pd.read_excel(data,


### Population Projections

In [25]:
URL_POPULATION = "https://www.abs.gov.au/statistics/people/population/population-projections-australia/latest-release"

REQUESTS_POPULATION = [
    {"string": "Projected population, Victoria", "name": "population_proj"},
    {"string": "Projected natural increase, Victoria", "name": "population_natural_proj"},
    {"string": "Population proportion(a), Victoria", "name": "population_proportions"}
]

In [26]:
for population_dict in REQUESTS_POPULATION:
    # get the output path
    path = "../data/landing/population/" + population_dict["name"] + ".csv"

    # write the table
    write_table(population_dict["string"], URL_POPULATION, path)

## Economic by Region

In [27]:
# setting up the parameters for the reading data
read_excel_dict_socioeconomic = {
    "skiprows": list(range(5)),
    "index col": [0, 1],
    "header rows": [0, 1],
    "relative path": "../data/landing/economic_by_region/",
    "skip tail": 5
}

# get all the URLS and sheets to get
URL_TO_SHEETS_SOCIOECONOMIC = {
    "https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions/personal-income-australia/2020-21-financial-year/Table%201%20-%20Total%20income%2C%20earners%20and%20summary%20statistics%20by%20geography%2C%202016-17%20to%202020-21.xlsx": [{
        "sheet": "Table 1.4",
        "name": "income_by_geography_b2022"
    }], "https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions/personal-income-australia/2020-21-financial-year/Table%202%20-%20Total%20income%20distribution%20by%20geography%2C%202020-21.xlsx": [{
        "sheet": "Table 2.4",
        "name": "income_distribution_by_geography_2021"
    }], 
}

In [28]:
write_all_csvs(URL_TO_SHEETS_SOCIOECONOMIC, read_excel_dict_socioeconomic)

  df = pd.read_excel(data,
  df = pd.read_excel(data,


## Schools

In [29]:
# setting up the parameters for the reading data
read_excel_dict_school = {
    "skiprows": [x for x in range(7) if x != 5],
    "index col": 0,
    "header rows": 0,
    "relative path": "../data/landing/school/"
}

MISSING_HEADERS_SCHOOL = ["VCAA code", "School name", "Sector", "Locality", 
                          "Total Completed Year 12", "Survey Participants"]

URL_TO_SHEETS_SCHOOL = {
    "https://www.education.vic.gov.au/Documents/about/research/OnTrack2019/DestinationData2019.xlsx": [{
        "sheet": "SCHOOL PUBLICATION TABLE 2019",
        "name": "after_school_2019",
        "missing header": MISSING_HEADERS_SCHOOL
    }], "https://www.education.vic.gov.au/Documents/about/research/OnTrack2021/DestinationData2021_new.XLSX": [{
        "sheet": "SCHOOL PUBLICATION TABLE 2021",
        "name": "after_school_2021",
        "missing header": MISSING_HEADERS_SCHOOL
    }], "https://www.education.vic.gov.au/Documents/about/research/datavic/dv331_schoollocations2022.csv": [{
        "csv": True,
        "name": "locations_2022"
    }]
}

In [30]:
write_all_csvs(URL_TO_SHEETS_SCHOOL, read_excel_dict_school)

  df = pd.read_excel(data,
  df = pd.read_excel(data,
