In [1]:
import os
import requests
import zipfile
import pandas as pd

In [2]:
# Folders for the ABS SA2 boundary data.
# NOTE: despite the variable names, both values are relative paths.
# Later cells download into `output_relative_dir` and extract into `output_absolute_dir`.
output_relative_dir = '../../data/landing/ABS_SA2/'
output_absolute_dir = '../../data/raw/ABS_SA2/'

# Make sure each folder exists, reporting per folder whether it had to be created
for target_dir in (output_relative_dir, output_absolute_dir):
    if os.path.exists(target_dir):
        print(f"Directory {target_dir} already exists, skipping creation.")
    else:
        os.makedirs(target_dir)
        print(f"Directory {target_dir} created.")


Directory ../../data/landing/ABS_SA2/ created.
Directory ../../data/raw/ABS_SA2/ created.


In [3]:
# Define URL and file paths
url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip"
download_path = os.path.join(output_relative_dir, "SA2_2021_AUST_SHP_GDA2020.zip")
extract_to_path = output_absolute_dir

# Download the archive only when no copy is present in the landing folder
zip_already_present = os.path.exists(download_path)
if not zip_already_present:
    print("Downloading SA2 District Boundaries shapefile...")
    response = requests.get(url, timeout=120)
    # Fail on 4xx/5xx instead of saving an error page under the .zip name
    response.raise_for_status()

    # Write to a temporary name first so an interrupted or invalid download
    # never leaves a file that the existence check above would later trust
    partial_path = download_path + ".part"
    with open(partial_path, 'wb') as file:
        file.write(response.content)
    if not zipfile.is_zipfile(partial_path):
        os.remove(partial_path)
        raise zipfile.BadZipFile(f"Downloaded content from {url} is not a valid zip archive.")
    os.replace(partial_path, download_path)

# Extraction is decided independently of the download: any archive member that
# is missing from the target folder is (re-)extracted, so a run that was
# interrupted after the download can recover on the next execution
with zipfile.ZipFile(download_path, 'r') as zip_ref:
    missing_members = [
        member for member in zip_ref.namelist()
        if not os.path.exists(os.path.join(extract_to_path, member))
    ]
    if missing_members:
        print("Extracting shapefile...")
        zip_ref.extractall(extract_to_path, members=missing_members)

if not zip_already_present:
    print("SA2 District Boundaries shapefile downloaded and extracted successfully.")
elif missing_members:
    print("Zip file already exists, skipping download; missing files extracted.")
else:
    print("Zip file already exists, skipping download and extraction.")

Downloading SA2 District Boundaries shapefile...
Extracting shapefile...
SA2 District Boundaries shapefile downloaded and extracted successfully.


In [4]:
# Folders for the ABS regional population data.
# NOTE: despite the variable names, both values are relative paths.
output_relative_dir = '../../data/landing/ABS_population/'
output_absolute_dir = '../../data/raw/ABS_population/'

# Make sure each folder exists, reporting per folder whether it had to be created
for target_dir in (output_relative_dir, output_absolute_dir):
    if os.path.exists(target_dir):
        print(f"Directory {target_dir} already exists, skipping creation.")
    else:
        os.makedirs(target_dir)
        print(f"Directory {target_dir} created.")

Directory ../../data/landing/ABS_population/ already exists, skipping creation.
Directory ../../data/raw/ABS_population/ already exists, skipping creation.


In [5]:
# Define URL and download paths
url = "https://www.abs.gov.au/statistics/people/population/regional-population/2022-23/32180DS0001_2022-23.xlsx"
download_path = os.path.join(output_relative_dir, "32180DS0001_2022-23.xlsx")

# Download the Excel file only when no copy is present in the landing folder
if not os.path.exists(download_path):
    print("Downloading ABS population dataset...")
    response = requests.get(url, timeout=120)
    # Fail on 4xx/5xx instead of saving an error page under the .xlsx name
    response.raise_for_status()

    # Write to a temporary name and rename afterwards, so an interrupted
    # download cannot leave a truncated file that later runs would skip over
    partial_path = download_path + ".part"
    with open(partial_path, 'wb') as file:
        file.write(response.content)
    os.replace(partial_path, download_path)
    print("Download completed.")
else:
    print("File already exists, skipping download.")

File already exists, skipping download.


In [7]:
# Open the downloaded workbook and list its worksheets so the right table can be chosen.
# `xlsx` is kept as a notebook-level name because the following cell reads it.
xlsx = pd.ExcelFile(download_path)
available_sheets = xlsx.sheet_names
print("Available sheet names:", available_sheets)

Available sheet names: ['Contents', 'Table 1', 'Table 2', 'Table 3', 'Table 4', 'Table 5', 'Table 6', 'Table 7', 'Table 8', 'Table 9', 'Table 10']


In [8]:
# Select the worksheet by name rather than by position, so a workbook whose
# sheets are reordered fails loudly instead of silently loading another table
sheet_name = "Table 2"
if sheet_name not in xlsx.sheet_names:
    raise ValueError(
        f"Sheet {sheet_name!r} not found; available sheets: {xlsx.sheet_names}"
    )

# skiprows=6 skips the six rows above the row used as the column header.
# Adjust it if the layout of the sheet changes.
df = pd.read_excel(download_path, sheet_name=sheet_name, skiprows=6)

# Check if the data was loaded correctly
print(df.head())

# Save the DataFrame to CSV (optional step to save it for later use)
csv_output_path = os.path.join(output_absolute_dir, "population_data_table_2.csv")
df.to_csv(csv_output_path, index=False)
print(f"Data saved to {csv_output_path}")

  GCCSA code    GCCSA name  SA4 code  SA4 name  SA3 code  SA3 name  \
0      2RVIC  Rest of Vic.     201.0  Ballarat   20101.0  Ballarat   
1      2RVIC  Rest of Vic.     201.0  Ballarat   20101.0  Ballarat   
2      2RVIC  Rest of Vic.     201.0  Ballarat   20101.0  Ballarat   
3      2RVIC  Rest of Vic.     201.0  Ballarat   20101.0  Ballarat   
4      2RVIC  Rest of Vic.     201.0  Ballarat   20101.0  Ballarat   

      SA2 code       SA2 name      no.    no..1   no..2    %  no..3  no..4  \
0  201011001.0      Alfredton  18002.0  18997.0   995.0  5.5  140.0  695.0   
1  201011002.0       Ballarat  11938.0  11809.0  -129.0 -1.1  -57.0 -213.0   
2  201011005.0      Buninyong   7247.0   7323.0    76.0  1.0   15.0  -19.0   
3  201011006.0      Delacombe  11798.0  12869.0  1071.0  9.1  133.0  898.0   
4  201011007.0  Smythes Creek   4223.0   4268.0    45.0  1.1   11.0   31.0   

   no..5    km2  persons/km2  
0  160.0   52.7        360.4  
1  141.0   12.4        954.0  
2   80.0   51.6  

In [2]:
# Folders for the population forecast data.
# NOTE: despite the variable names, both values are relative paths.
output_relative_dir = '../../data/landing/Population_Forecast/'
output_absolute_dir = '../../data/raw/Population_Forecast/'

# Make sure each folder exists, reporting per folder whether it had to be created
for target_dir in (output_relative_dir, output_absolute_dir):
    if os.path.exists(target_dir):
        print(f"Directory {target_dir} already exists, skipping creation.")
    else:
        os.makedirs(target_dir)
        print(f"Directory {target_dir} created.")

Directory ../../data/landing/Population_Forecast/ already exists, skipping creation.
Directory ../../data/raw/Population_Forecast/ already exists, skipping creation.


In [3]:
# Define URL and download paths
url = "https://www.planning.vic.gov.au/__data/assets/excel_doc/0028/691660/VIF2023_SA2_Pop_Hhold_Dwelling_Projections_to_2036_Release_2.xlsx"
download_path = os.path.join(output_relative_dir, "VIF2023_Population_Forecast.xlsx")

# Download the Excel file only when no copy is present in the landing folder
if not os.path.exists(download_path):
    print("Downloading population forecast dataset...")
    response = requests.get(url, timeout=120)
    # Fail on 4xx/5xx instead of saving an error page under the .xlsx name
    response.raise_for_status()

    # Write to a temporary name and rename afterwards, so an interrupted
    # download cannot leave a truncated file that later runs would skip over
    partial_path = download_path + ".part"
    with open(partial_path, 'wb') as file:
        file.write(response.content)
    os.replace(partial_path, download_path)
    print("Download completed.")
else:
    print("File already exists, skipping download.")

File already exists, skipping download.


In [4]:
# Open the downloaded workbook and list its worksheets to find the right sheet.
# `xlsx` is kept as a notebook-level name because the following cell reads it.
xlsx = pd.ExcelFile(download_path)
available_sheets = xlsx.sheet_names
print("Available sheet names:", available_sheets)

Available sheet names: ['Contents', 'Explanatory Notes', 'Total_Population', 'Total_Dwellings', 'Total_Households', 'Dwellings_and_Households', 'Households_by_Type']


In [6]:
# Select the worksheet by name rather than by position, so a workbook whose
# sheets are reordered fails loudly instead of silently loading another table
sheet_name = "Total_Population"
if sheet_name not in xlsx.sheet_names:
    raise ValueError(
        f"Sheet {sheet_name!r} not found; available sheets: {xlsx.sheet_names}"
    )
skiprows = 9  # Skip the first 9 rows to get to the table

# Read the sheet; the first row after the skipped ones becomes the header
df = pd.read_excel(download_path, sheet_name=sheet_name, skiprows=skiprows)

# Keep positional rows 1..523 of the parsed table: the first parsed row is
# discarded and everything from position 524 onwards is cut off.
# NOTE(review): presumably row 0 and the tail are non-SA2 rows (blank/total/
# footnote lines) — confirm against the workbook if its layout changes.
df = df.iloc[1:524, :]

# Display the first few rows to verify the data
print(df.head())

# Save the DataFrame to CSV (optional step to save it for later use)
csv_output_path = os.path.join(output_absolute_dir, "population_forecast_data.csv")
df.to_csv(csv_output_path, index=False)
print(f"Data saved to {csv_output_path}")

   GCCSA  SA4 Code  SA3 Code    SA2  code Region Type         Region     2021  \
1  2RVIC     201.0   20101.0  201011001.0         SA2      Alfredton  16841.0   
2  2RVIC     201.0   20101.0  201011002.0         SA2       Ballarat  12071.0   
3  2RVIC     201.0   20101.0  201011005.0         SA2      Buninyong   7229.0   
4  2RVIC     201.0   20101.0  201011006.0         SA2      Delacombe  10648.0   
5  2RVIC     201.0   20101.0  201011007.0         SA2  Smythes Creek   4211.0   

           2026          2031          2036  
1  20756.256163  23604.443836  26060.320807  
2  11698.293593  11803.430603  11985.992387  
3   7372.079773   7685.113372   8028.887243  
4  15915.186041  20475.587469  24965.202439  
5   4312.098530   4457.413406   4725.467837  
Data saved to ../../data/raw/Population_Forecast/population_forecast_data.csv
