Downloading the SA2 shapefile and other external datasets (helper functions)

In [2]:
import requests
import zipfile
import io
import os
import pandas as pd

DOWNLOAD_PATH = '../../data/1. landing'
SHAPEFILE_PATH = DOWNLOAD_PATH + '/shapefile'


def download_shapefile(url, file_name, subfolder=None):
    """
    Downloads a zip archive and unpacks it into SHAPEFILE_PATH/<file_name>.

    Args:
    - url (str): Address of the zip archive to download.
    - file_name (str): Name of the folder inside SHAPEFILE_PATH that receives the files.
    - subfolder (str, optional): Folder inside the archive to extract. Only files located
                                 below this folder are written, with the subfolder prefix
                                 stripped so they land directly in the destination folder.
                                 If not provided, the whole archive is extracted.

    Returns nothing; the outcome is reported with print(). When the response status
    is not 200 a failure message is printed and nothing is written.
    """
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to download the file. Status code: {response.status_code}")
        return

    # Ensure the SHAPEFILE_PATH directory exists
    os.makedirs(SHAPEFILE_PATH, exist_ok=True)
    target_dir = os.path.join(SHAPEFILE_PATH, file_name)

    # Zip member names always use '/' as separator. Terminating the prefix with '/'
    # makes sure a sibling such as 'a/PTV_other/x' is not mistaken for 'a/PTV/x'.
    prefix = subfolder.replace('\\', '/').strip('/') if subfolder else ''

    # Unzip the downloaded file directly from memory
    with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
        if prefix:
            prefix += '/'
            for member in zip_ref.infolist():
                name = member.filename

                # Directory entries carry no data, and opening them as files would fail
                if member.is_dir() or not name.startswith(prefix):
                    continue

                # Path of the member relative to the requested subfolder
                parts = [p for p in name[len(prefix):].split('/') if p not in ('', '.')]

                # Never follow '..' components out of the destination folder
                if not parts or '..' in parts:
                    print(f"Skipped unsafe archive member {name}")
                    continue

                extract_path = os.path.join(target_dir, *parts)

                # Ensure the destination directory exists
                os.makedirs(os.path.dirname(extract_path), exist_ok=True)

                # Extract the file
                with zip_ref.open(member) as source, open(extract_path, 'wb') as target:
                    target.write(source.read())
        else:
            # Extract all files if no subfolder is specified
            zip_ref.extractall(target_dir)

    print(f"Downloaded and unzipped {file_name}")


def download(url, save_path):
    """
    Fetches a single file over HTTP and stores it below DOWNLOAD_PATH.

    Args:
    - url (str): Address of the file to download.
    - save_path (str): Destination path, relative to DOWNLOAD_PATH.

    Returns nothing; success or failure is reported with print(). The file is only
    written when the response status code is 200.
    """
    response = requests.get(url)

    # Bail out early on anything other than a successful response
    if response.status_code != 200:
        print(f"Failed to download the file. Status code: {response.status_code}")
        return

    # Resolve the destination and make sure its parent folder is present
    destination = os.path.join(DOWNLOAD_PATH, save_path)
    parent_dir = os.path.dirname(destination)
    os.makedirs(parent_dir, exist_ok=True)

    # Persist the raw response body
    with open(destination, 'wb') as out_file:
        out_file.write(response.content)

    print(f"File downloaded and saved to {save_path}")


def excel_to_csv(excel_path, sheet_name, csv_save_path):
    """
    Converts a specific sheet from an Excel file to a CSV file.

    Both paths are resolved relative to DOWNLOAD_PATH. Problems (missing workbook,
    missing sheet, read/write errors) are reported with print() rather than raised.

    Args:
    - excel_path (str): Path to the Excel file, relative to DOWNLOAD_PATH.
    - sheet_name (str): Name of the sheet to extract.
    - csv_save_path (str): Path to save the CSV file, relative to DOWNLOAD_PATH.
    """

    excel_path = os.path.join(DOWNLOAD_PATH, excel_path)
    csv_save_path = os.path.join(DOWNLOAD_PATH, csv_save_path)

    # Check if the Excel file exists
    if not os.path.exists(excel_path):
        print(f"Excel file not found at {excel_path}")
        return

    try:
        # The context manager closes the workbook handle as soon as the sheet has
        # been read, so the caller can delete the Excel file right afterwards
        # (an open handle blocks deletion on Windows).
        with pd.ExcelFile(excel_path) as excel_file:
            # Check if the specified sheet exists
            if sheet_name not in excel_file.sheet_names:
                print(f"Sheet '{sheet_name}' not found in the Excel file.")
                return

            # Parse the specified sheet
            df = pd.read_excel(excel_file, sheet_name=sheet_name)

        # Make sure the output folder exists before writing
        output_dir = os.path.dirname(csv_save_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Save the DataFrame to a CSV file
        df.to_csv(csv_save_path, index=False)
        print(f"Extracted '{sheet_name}' and saved as CSV to {csv_save_path}")
    except Exception as e:
        # Best effort by design: report the problem and keep the notebook running
        print(f"An error occurred: {e}")


def remove_first_n_lines(file_path, n, output_file_path=None, encoding='utf-8'):
    """
    Removes the first n lines from a text file.

    Args:
    - file_path (str): Path to the input file, relative to DOWNLOAD_PATH.
    - n (int): Number of leading lines to remove. Must not be negative.
    - output_file_path (str, optional): Path to save the modified file. It is used
                                        exactly as given (it is NOT joined with
                                        DOWNLOAD_PATH). If not provided, it overwrites
                                        the original file.
    - encoding (str, optional): Text encoding used for reading and writing. Defaults to
                                UTF-8 (the pandas `to_csv` default) so the result does
                                not depend on the platform's locale. Pass None to use
                                the platform default encoding.

    Raises:
    - ValueError: If n is negative (a negative slice would keep only the last
                  lines instead of removing leading ones).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    file_path = os.path.join(DOWNLOAD_PATH, file_path)

    # Read everything first: the output may be the same file as the input
    with open(file_path, 'r', encoding=encoding) as file:
        lines = file.readlines()[n:]

    # Determine output path (overwrite or new file)
    if output_file_path is None:
        output_file_path = file_path

    # Write the remaining lines back to the file
    with open(output_file_path, 'w', encoding=encoding) as file:
        file.writelines(lines)


Download shapefiles

In [4]:
# Source archives, one per shapefile layer
sa2_url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip"
postal_url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/POA_2021_AUST_GDA2020_SHP.zip"
school_zone_url = "https://www.education.vic.gov.au/Documents/about/research/datavic/dv311-schoolzones2022.zip"
ptv_regional_stations = "https://s3.ap-southeast-2.amazonaws.com/cl-isd-prd-datashare-s3-delivery/Order_23MSYP.zip?orderid=T81MBG"
ptv_metro_stations = "https://s3.ap-southeast-2.amazonaws.com/cl-isd-prd-datashare-s3-delivery/Order_2AICD9.zip?orderid=1MTM4X"

# Both PTV downloads extract the same folder from inside their archive
_ptv_subfolder = 'll_gda2020/esrishape/whole_of_dataset/victoria/PTV'

# (url, destination folder, folder inside the archive) in download order:
# SA2 regions, postal regions, school zones, PTV regional stations, PTV metro stations
_shapefile_jobs = [
    (sa2_url, 'sa2', None),
    (postal_url, 'postal', None),
    (school_zone_url, 'school_zones', None),
    (ptv_regional_stations, 'regional_station', _ptv_subfolder),
    (ptv_metro_stations, 'metro_station', _ptv_subfolder),
]

for _job_url, _job_folder, _job_subfolder in _shapefile_jobs:
    download_shapefile(_job_url, _job_folder, subfolder=_job_subfolder)

Downloaded and unzipped sa2
Downloaded and unzipped postal
Downloaded and unzipped school_zones
Downloaded and unzipped regional_station
Downloaded and unzipped metro_station


Download CSVs

In [10]:
# School locations: already a CSV, stored as downloaded
school_locations = 'https://www.education.vic.gov.au/Documents/about/research/datavic/dv331_schoollocations2022.csv'
download(school_locations, 'school/locations.csv')

# School achievement: download the workbook, export one sheet to CSV,
# discard the workbook, then drop the first 10 lines of the exported CSV
# (presumably preamble rows above the real table -- confirm against the sheet)
school_achievment_url = 'https://www.vcaa.vic.edu.au/Documents/statistics/2023/2023SeniorSecondaryCompletionAndAchievementInformation.xlsx'
_achievement_xlsx = 'school/achievement.xlsx'
_achievement_csv = 'school/achievement.csv'
download(school_achievment_url, _achievement_xlsx)
excel_to_csv(_achievement_xlsx, 'postcomp_for_publication', _achievement_csv)
os.remove(os.path.join(DOWNLOAD_PATH, _achievement_xlsx))
remove_first_n_lines(_achievement_csv, 10)

# Crime data: only fetched when crime.csv is not already in the landing folder
crime_excel_file = 'crime.xlsx'

if os.path.exists(os.path.join(DOWNLOAD_PATH, 'crime.csv')):
    print("Skipped downloading crime.csv as exists")
else:
    crime_url = 'https://files.crimestatistics.vic.gov.au/2024-09/Data_Tables_LGA_Recorded_Offences_Year_Ending_June_2024.xlsx'
    download(crime_url, crime_excel_file)
    excel_to_csv(crime_excel_file, 'Table 03', 'crime.csv')
    os.remove(os.path.join(DOWNLOAD_PATH, crime_excel_file))

# Land cover: a zip archive, unpacked next to the other shapefiles
land_cover_url = 'https://cl-isd-prd-datashare-s3-delivery.s3.amazonaws.com/PrePackages/VIC_LANDCOVER_TS.zip'
download_shapefile(land_cover_url, 'land_cover')

File downloaded and saved to school/locations.csv
File downloaded and saved to school/achievement.xlsx
Extracted 'postcomp_for_publication' and saved as CSV to ../../data/1. landing/school/achievement.csv
Skipped downloading crime.csv as exists
Downloaded and unzipped land_cover
