Created by Ran Zhang, Aug 30

In [1]:
import os
import requests
import zipfile
import pandas as pd

In [2]:
# Define directories
output_relative_dir = '../../data/landing/ABS_SA2/'
output_absolute_dir = '../../data/raw/ABS_SA2/'

# Create directories if they don't exist
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)
    print(f"Directory {output_relative_dir} created.")
else:
    print(f"Directory {output_relative_dir} already exists, skipping creation.")

if not os.path.exists(output_absolute_dir):
    os.makedirs(output_absolute_dir)
    print(f"Directory {output_absolute_dir} created.")
else:
    print(f"Directory {output_absolute_dir} already exists, skipping creation.")


Directory ../../data/landing/ABS_SA2/ already exists, skipping creation.
Directory ../../data/raw/ABS_SA2/ already exists, skipping creation.


In [3]:
# Define URL and file paths
url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip"
download_path = os.path.join(output_relative_dir, "SA2_2021_AUST_SHP_GDA2020.zip")
extract_to_path = output_absolute_dir

# Download and extract the file
if not os.path.exists(download_path):
    print("Downloading SA2 District Boundaries shapefile...")
    response = requests.get(url)
    with open(download_path, 'wb') as file:
        file.write(response.content)

    print("Extracting shapefile...")
    with zipfile.ZipFile(download_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to_path)

    print("SA2 District Boundaries shapefile downloaded and extracted successfully.")
else:
    print("Zip file already exists, skipping download and extraction.")

Zip file already exists, skipping download and extraction.


In [4]:
# Define directories
output_relative_dir = '../../data/landing/ABS_population/'
output_absolute_dir = '../../data/raw/ABS_population/'

# Create directories if they don't exist
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)
    print(f"Directory {output_relative_dir} created.")
else:
    print(f"Directory {output_relative_dir} already exists, skipping creation.")

if not os.path.exists(output_absolute_dir):
    os.makedirs(output_absolute_dir)
    print(f"Directory {output_absolute_dir} created.")
else:
    print(f"Directory {output_absolute_dir} already exists, skipping creation.")

Directory ../../data/landing/ABS_population/ already exists, skipping creation.
Directory ../../data/raw/ABS_population/ already exists, skipping creation.


In [5]:
# Define URL and download paths
url = "https://www.abs.gov.au/statistics/people/population/regional-population/2022-23/32180DS0003_2001-23.xlsx"
download_path = os.path.join(output_relative_dir, "32180DS0003_2001-23.xlsx")

# Download the Excel file
if not os.path.exists(download_path):
    print("Downloading ABS population dataset...")
    response = requests.get(url)
    with open(download_path, 'wb') as file:
        file.write(response.content)
    print("Download completed.")
else:
    print("File already exists, skipping download.")

File already exists, skipping download.


In [6]:
# Load the Excel file and print sheet names
xlsx = pd.ExcelFile(download_path)
print("Available sheet names:", xlsx.sheet_names)

Available sheet names: ['Contents', 'Table 1', 'Table 2', 'Table 3', 'Table 4', 'Table 5']


In [7]:
# Load the Excel file, starting from row 7 
sheet_name = xlsx.sheet_names[1]  # Adjust the index based on the sheet containing the relevant table
df = pd.read_excel(download_path, sheet_name=sheet_name, skiprows=6)

# Select row 7 (which is now index 0) and rows 650-1171 (which are now indices 642 to 1163)
df_filtered = pd.concat([df.iloc[[0]], df.iloc[642:1164]])

df_filtered = df_filtered.reset_index(drop=True)
df_filtered = df_filtered.drop(index=0)

# Check if the data was filtered correctly
print(df_filtered.head())

# Save the filtered DataFrame to a CSV file
csv_output_path = os.path.join(output_absolute_dir, "population_data_table_2.csv")
df_filtered.to_csv(csv_output_path, index=False)
print(f"Data saved to {csv_output_path}")

  GCCSA code    GCCSA name  SA4 code  SA4 name  SA3 code  SA3 name  \
1      2RVIC  Rest of Vic.     201.0  Ballarat   20101.0  Ballarat   
2      2RVIC  Rest of Vic.     201.0  Ballarat   20101.0  Ballarat   
3      2RVIC  Rest of Vic.     201.0  Ballarat   20101.0  Ballarat   
4      2RVIC  Rest of Vic.     201.0  Ballarat   20101.0  Ballarat   
5      2RVIC  Rest of Vic.     201.0  Ballarat   20101.0  Ballarat   

      SA2 code       SA2 name      no.    no..1  ...   no..13   no..14  \
1  201011001.0      Alfredton   5756.0   6092.0  ...  10338.0  11039.0   
2  201011002.0       Ballarat  11497.0  11708.0  ...  12327.0  12300.0   
3  201011005.0      Buninyong   5320.0   5399.0  ...   7082.0   7191.0   
4  201011006.0      Delacombe   4154.0   4225.0  ...   6583.0   6846.0   
5  201011007.0  Smythes Creek   3317.0   3378.0  ...   3945.0   3966.0   

    no..15   no..16   no..17   no..18   no..19   no..20   no..21   no..22  
1  11852.0  12649.0  13537.0  14434.0  15507.0  16841.0  1

In [8]:
# Define directories
output_relative_dir = '../../data/landing/Population_Forecast/'
output_absolute_dir = '../../data/raw/ABS_population/'

# Create directories if they don't exist
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)
    print(f"Directory {output_relative_dir} created.")
else:
    print(f"Directory {output_relative_dir} already exists, skipping creation.")

if not os.path.exists(output_absolute_dir):
    os.makedirs(output_absolute_dir)
    print(f"Directory {output_absolute_dir} created.")
else:
    print(f"Directory {output_absolute_dir} already exists, skipping creation.")

Directory ../../data/landing/Population_Forecast/ already exists, skipping creation.
Directory ../../data/raw/ABS_population/ already exists, skipping creation.


In [9]:
# Define URL and download paths
url = "https://www.planning.vic.gov.au/__data/assets/excel_doc/0028/691660/VIF2023_SA2_Pop_Hhold_Dwelling_Projections_to_2036_Release_2.xlsx"
download_path = os.path.join(output_relative_dir, "VIF2023_Population_Forecast.xlsx")

# Download the Excel file
if not os.path.exists(download_path):
    print("Downloading population forecast dataset...")
    response = requests.get(url)
    with open(download_path, 'wb') as file:
        file.write(response.content)
    print("Download completed.")
else:
    print("File already exists, skipping download.")

File already exists, skipping download.


In [10]:
# Load the Excel file and print sheet names to find the right sheet
xlsx = pd.ExcelFile(download_path)
print("Available sheet names:", xlsx.sheet_names)

Available sheet names: ['Contents', 'Explanatory Notes', 'Total_Population', 'Total_Dwellings', 'Total_Households', 'Dwellings_and_Households', 'Households_by_Type']


In [11]:
# Load the specific sheet
sheet_name = xlsx.sheet_names[2]
skiprows = 9  # Skip the first 9 rows to get to the table

# Read the Excel file, keeping rows only up to row 533
df = pd.read_excel(download_path, sheet_name=sheet_name, skiprows=skiprows)

df = df.iloc[:524, :].drop(index=0)
# df = df.iloc[:524, :]

# Display the first few rows to verify the data
print(df.head())

# Save the DataFrame to CSV (optional step to save it for later use)
csv_output_path = os.path.join(output_absolute_dir, "ABS_population_data.csv")
df.to_csv(csv_output_path, index=False)
print(f"Data saved to {csv_output_path}")

   GCCSA  SA4 Code  SA3 Code    SA2  code Region Type         Region     2021  \
1  2RVIC     201.0   20101.0  201011001.0         SA2      Alfredton  16841.0   
2  2RVIC     201.0   20101.0  201011002.0         SA2       Ballarat  12071.0   
3  2RVIC     201.0   20101.0  201011005.0         SA2      Buninyong   7229.0   
4  2RVIC     201.0   20101.0  201011006.0         SA2      Delacombe  10648.0   
5  2RVIC     201.0   20101.0  201011007.0         SA2  Smythes Creek   4211.0   

           2026          2031          2036  
1  20756.256163  23604.443836  26060.320807  
2  11698.293593  11803.430603  11985.992387  
3   7372.079773   7685.113372   8028.887243  
4  15915.186041  20475.587469  24965.202439  
5   4312.098530   4457.413406   4725.467837  
Data saved to ../../data/raw/ABS_population/ABS_population_data.csv


In [12]:
# Define directories
output_relative_dir = '../../data/landing/Income_Statistics/'
output_absolute_dir = '../../data/raw/Income_Statistics/'

# Create directories if they don't exist
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)
    print(f"Directory {output_relative_dir} created.")
else:
    print(f"Directory {output_relative_dir} already exists, skipping creation.")

if not os.path.exists(output_absolute_dir):
    os.makedirs(output_absolute_dir)
    print(f"Directory {output_absolute_dir} created.")
else:
    print(f"Directory {output_absolute_dir} already exists, skipping creation.")


Directory ../../data/landing/Income_Statistics/ already exists, skipping creation.
Directory ../../data/raw/Income_Statistics/ already exists, skipping creation.


In [13]:
# Define the URL and download path
url = "https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions/personal-income-australia/2020-21-financial-year/Table%201%20-%20Total%20income%2C%20earners%20and%20summary%20statistics%20by%20geography%2C%202016-17%20to%202020-21.xlsx"
download_path = os.path.join(output_relative_dir, "Income_Statistics_2016_2020.xlsx")

# Download the Excel file
if not os.path.exists(download_path):
    print("Downloading income statistics Excel file...")
    response = requests.get(url)
    with open(download_path, 'wb') as file:
        file.write(response.content)
    print("Download completed.")
else:
    print("File already exists, skipping download.")

File already exists, skipping download.


In [14]:
# Load the Excel file and list sheet names
xlsx = pd.ExcelFile(download_path)
print("Available sheet names:", xlsx.sheet_names)

Available sheet names: ['Contents', 'Table 1.1', 'Table 1.2', 'Table 1.3', 'Table 1.4', 'Table 1.5']


In [15]:
# Read the specific sheet and the relevant subtable
sheet_name = xlsx.sheet_names[4]  # Assuming the 4th subtable is in the 5th sheet
df = pd.read_excel(download_path, sheet_name=sheet_name, skiprows=6)  # Skip first 6 rows

# Extract row 7 and rows 653–1174
df_subtable = pd.concat([df.iloc[[0]], df.iloc[645:1167, :]])

# Remove the row where 'SA2' column is 'Australia'
df_subtable = df_subtable[df_subtable['SA2'] != 'Australia']

# Display the extracted subtable
print(df.head())

# Save the extracted subtable
output_file_path = os.path.join(output_absolute_dir, "Income_Statistics_2020.csv")
df_subtable.to_csv(output_file_path, index=False)

print(f"Subtable saved to {output_file_path}")

               SA2    SA2 NAME     2016-17     2017-18     2018-19  \
0        Australia         NaN  13,675,002  14,069,078  14,425,034   
1  New South Wales         NaN   4,344,142   4,466,939   4,569,649   
2        101021007   Braidwood       2,261       2,311       2,362   
3        101021008     Karabar       4,989       5,057       5,099   
4        101021009  Queanbeyan       6,482       6,594       6,699   

      2019-20     2020-21 2016-17.1 2017-18.1 2018-19.1  ... 2016-17.3  \
0  14,619,600  14,760,008        42        42        42  ...    48,083   
1   4,614,939   4,603,736        42        42        42  ...    48,394   
2       2,427       2,467        50        51        51  ...    40,790   
3       5,131       5,103        42        42        42  ...    57,460   
4       6,773       7,028        39        39        39  ...    55,033   

  2017-18.3 2018-19.3 2019-20.3 2020-21.3 2016-17.4 2017-18.4 2018-19.4  \
0    49,805    51,389    52,338    54,890    63,508    64,2