In [1]:
import io
import os
import zipfile

import pandas as pd
import requests

In [2]:
# Root of the project's data directory, relative to the notebook's working directory
output_relative_dir = '../data/'

# Pipeline stages (top-level folders) mapped to the dataset subfolders each one holds
folder_structure = {
    'landing': ['criminal_incidents', 'property_data'],
    'raw': ['criminal_incidents', 'property_data'],
    'processed': ['criminal_incidents', 'property_data']
}

# exist_ok=True makes every call idempotent and avoids the check-then-create race
# of a separate os.path.exists() test, so the cell can be re-run safely.
os.makedirs(output_relative_dir, exist_ok=True)

for folder_type, subfolders in folder_structure.items():
    # Create the stage folder itself so it exists even when it has no subfolders
    base_path = os.path.join(output_relative_dir, folder_type)
    os.makedirs(base_path, exist_ok=True)

    for subfolder in subfolders:
        subfolder_path = os.path.join(base_path, subfolder)
        os.makedirs(subfolder_path, exist_ok=True)

In [3]:
# Download the LGA criminal-incidents workbook and store every "Table" sheet as
# its own Parquet file in the landing directory.

# Landing directory for the Parquet files (created if missing; safe to re-run)
base_dir = '../data/landing/criminal_incidents/'
os.makedirs(base_dir, exist_ok=True)

# File URL for downloading the Excel file
url = 'https://files.crimestatistics.vic.gov.au/2024-06/Data_Tables_LGA_Criminal_Incidents_Year_Ending_March_2024.xlsx'

# Download the workbook into memory. The timeout (seconds) stops a stalled
# connection from hanging the kernel indefinitely.
response = requests.get(url, timeout=60)
if response.status_code == 200:
    # Wrap the payload in BytesIO: pandas expects a path or file-like object, and
    # passing raw bytes is deprecated in recent pandas releases. The context
    # manager closes the workbook once all sheets have been read.
    with pd.ExcelFile(io.BytesIO(response.content)) as xls:
        # Get all sheet names (table names)
        table_names = xls.sheet_names
        print("Sheet names (tables and other data):", table_names)

        # Keep only the data sheets; other sheets are not saved
        table_sheets = [
            sheet for sheet in table_names
            if 'Table' in sheet or sheet.lower().startswith('table')
        ]

        # Number the output files 01, 02, ... in workbook order
        for table_counter, sheet in enumerate(table_sheets, start=1):
            df = pd.read_excel(xls, sheet_name=sheet)

            # Naming convention: 'criminal_table_XX.parquet'
            parquet_file_name = f'criminal_table_{table_counter:02}.parquet'
            parquet_output_path = os.path.join(base_dir, parquet_file_name)
            df.to_parquet(parquet_output_path, index=False)
            print(f"Table '{sheet}' saved successfully as {parquet_output_path}")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")


Sheet names (tables and other data): ['Contents', 'Footnotes', 'Table 01', 'Table 02', 'Table 03', 'Table 04', 'Table 05']
Table 'Table 01' saved successfully as ../data/landing/criminal_incidents/criminal_table_01.parquet
Table 'Table 02' saved successfully as ../data/landing/criminal_incidents/criminal_table_02.parquet
Table 'Table 03' saved successfully as ../data/landing/criminal_incidents/criminal_table_03.parquet
Table 'Table 04' saved successfully as ../data/landing/criminal_incidents/criminal_table_04.parquet
Table 'Table 05' saved successfully as ../data/landing/criminal_incidents/criminal_table_05.parquet


In [4]:
# Download the property-items workbook and store every "Table" sheet as its own
# Parquet file in the landing directory.

# Landing directory for the property tables, relative to the notebook's working directory.
# NOTE(review): this rebinds `output_relative_dir`, which an earlier cell uses for
# the data root ('../data/') — confirm no later cell relies on the old value.
output_relative_dir = '../data/landing/property_data/'
os.makedirs(output_relative_dir, exist_ok=True)

# File URL for downloading the Excel file
url = 'https://files.crimestatistics.vic.gov.au/2024-06/Data_Tables_Property_Items_Visualisation_Year_Ending_March_2024.xlsx'

# Download the workbook into memory. The timeout (seconds) stops a stalled
# connection from hanging the kernel indefinitely.
response = requests.get(url, timeout=60)
if response.status_code == 200:
    # Wrap the payload in BytesIO: pandas expects a path or file-like object, and
    # passing raw bytes is deprecated in recent pandas releases. The context
    # manager closes the workbook once all sheets have been read.
    with pd.ExcelFile(io.BytesIO(response.content)) as xls:
        # Get all sheet names (table names)
        sheet_names = xls.sheet_names
        print("Sheet names (tables and other data):", sheet_names)

        # Filter BEFORE reading: sheets that are not tables are never saved, so
        # parsing them would only waste time.
        table_sheets = [
            sheet for sheet in sheet_names
            if 'Table' in sheet or sheet.lower().startswith('table')
        ]

        # Number the output files 01, 02, ... in workbook order
        for table_counter, sheet in enumerate(table_sheets, start=1):
            df = pd.read_excel(xls, sheet_name=sheet)

            # Handle inconsistent data types by converting all columns to strings.
            # NOTE(review): astype(str) also turns missing values into the literal
            # string 'nan' — downstream cleaning must account for that.
            df = df.astype(str)

            # Naming convention: 'property_table_XX.parquet'
            parquet_file_name = f'property_table_{table_counter:02}.parquet'
            parquet_output_path = os.path.join(output_relative_dir, parquet_file_name)
            df.to_parquet(parquet_output_path, index=False)
            print(f"Table '{sheet}' saved successfully as {parquet_output_path}")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")

Sheet names (tables and other data): ['Contents', 'Footnotes', 'Table 01', 'Table 02', 'Table 03', 'Table 04', 'Table 05', 'Table 06']


KeyboardInterrupt: 

In [3]:
# Download the SA2 boundary shapefile archive, extract it, and rename every
# extracted file to 'SA2_district_boundaries.<ext>'.

# Define the URL for the ZIP file
url = 'https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA94.zip'

# Define the output directory and file name
# NOTE(review): the URL fetches the GDA94 archive but the local copy is named
# '...GDA2020.zip' — confirm which datum is intended and align the two.
output_dir = '../data/landing/boundaries/'
zip_file_path = os.path.join(output_dir, 'SA2_2021_AUST_SHP_GDA2020.zip')

# Ensure the output directory exists (idempotent, safe to re-run)
os.makedirs(output_dir, exist_ok=True)

# Download the ZIP file. The timeout (seconds) stops a stalled connection from
# hanging the kernel indefinitely.
response = requests.get(url, timeout=60)
if response.status_code == 200:
    with open(zip_file_path, 'wb') as file:
        file.write(response.content)

    # Extract everything, remembering the member names for the rename pass
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(output_dir)
        file_list = zip_ref.namelist()

    # Rename each extracted file to 'SA2_district_boundaries<ext>'
    seen_targets = set()
    for original_file in file_list:
        original_path = os.path.join(output_dir, original_file)
        # Skip directory entries and members that are no longer at their
        # extracted location; only regular files are renamed.
        if not os.path.isfile(original_path):
            continue

        file_extension = os.path.splitext(original_file)[1]
        new_file_name = f"SA2_district_boundaries{file_extension}"
        new_file_path = os.path.join(output_dir, new_file_name)

        # Two members with the same extension map to the same target name;
        # surface that instead of silently losing the earlier file.
        if new_file_name in seen_targets:
            print(f"Warning: '{original_file}' overwrites an earlier file renamed to {new_file_name}")
        seen_targets.add(new_file_name)

        # os.replace overwrites an existing target on every platform, so the
        # cell can be re-run (os.rename fails on Windows if the target exists).
        os.replace(original_path, new_file_path)
else:
    # Do not write an error page to disk as if it were the archive
    print(f"Failed to download the file. Status code: {response.status_code}")

# Show the resulting contents of the output directory
renamed_files = os.listdir(output_dir)
renamed_files


['SA2_district_boundaries.dbf',
 'SA2_district_boundaries.shx',
 'SA2_2021_AUST_SHP_GDA2020.zip',
 'SA2_district_boundaries.shp',
 'SA2_district_boundaries.prj',
 'SA2_district_boundaries.xml']

In [2]:
# Download the Australian postcode/suburb lookup CSV, keep the raw CSV, and
# store a Parquet copy next to it.

# Define the URL for the CSV file
url = 'https://www.matthewproctor.com/Content/postcodes/australian_postcodes.csv'

# Output directory (relative to the notebook's working directory) and file names
output_dir = '../data/landing/suburb_match/'
csv_file_path = os.path.join(output_dir, 'suburb_match.csv')
parquet_file_path = os.path.join(output_dir, 'suburb_match.parquet')

# Ensure the output directory exists (idempotent, safe to re-run)
os.makedirs(output_dir, exist_ok=True)

# Download the CSV file. The timeout (seconds) stops a stalled connection from
# hanging the kernel indefinitely.
response = requests.get(url, timeout=60)
if response.status_code == 200:
    # Keep the raw download on disk alongside the Parquet copy
    with open(csv_file_path, 'wb') as file:
        file.write(response.content)

    # Load the CSV file into a pandas DataFrame
    # NOTE(review): default type inference may parse postcode-like columns as
    # integers and drop leading zeros — confirm whether dtype=str is needed.
    df = pd.read_csv(csv_file_path)

    # Save the DataFrame as a Parquet file
    df.to_parquet(parquet_file_path, index=False)
    print(f"File saved as Parquet at: {parquet_file_path}")
else:
    print("Failed to download the file. Status code:", response.status_code)

# List the files now present in the output directory
downloaded_files = os.listdir(output_dir)
print(downloaded_files)

File saved as Parquet at: ../data/landing/suburb_match/suburb_match.parquet
['suburb_match.csv', 'suburb_match.parquet']


In [4]:
# Download the parkres shapefile archive, extract it, and rename every
# extracted file to 'parkres.<ext>'.

# Define the URL for the ZIP file
url = 'https://s3.ap-southeast-2.amazonaws.com/cl-isd-prd-datashare-s3-delivery/Order_Y9LSRC.zip'

# Define the output directory and file name
output_dir = '../data/landing/parkres/'
zip_file_path = os.path.join(output_dir, 'parkres.zip')

# Ensure the output directory exists (idempotent, safe to re-run)
os.makedirs(output_dir, exist_ok=True)

# Download the ZIP file. The timeout (seconds) stops a stalled connection from
# hanging the kernel indefinitely.
response = requests.get(url, timeout=60)
if response.status_code == 200:
    with open(zip_file_path, 'wb') as file:
        file.write(response.content)

    # Extract everything, remembering the member names for the rename pass
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(output_dir)
        file_list = zip_ref.namelist()

    # Rename each extracted file to 'parkres<ext>'; files inside extracted
    # subfolders are moved up into output_dir by the rename.
    seen_targets = set()
    for original_file in file_list:
        original_path = os.path.join(output_dir, original_file)
        # Skip directory entries and members that are no longer at their
        # extracted location; only regular files are renamed.
        if not os.path.isfile(original_path):
            continue

        file_extension = os.path.splitext(original_file)[1]
        new_file_name = f"parkres{file_extension}"
        new_file_path = os.path.join(output_dir, new_file_name)

        # Two members with the same extension map to the same target name;
        # surface that instead of silently losing the earlier file.
        if new_file_name in seen_targets:
            print(f"Warning: '{original_file}' overwrites an earlier file renamed to {new_file_name}")
        seen_targets.add(new_file_name)

        # os.replace overwrites an existing target on every platform, so the
        # cell can be re-run (os.rename fails on Windows if the target exists).
        os.replace(original_path, new_file_path)
else:
    # Do not write an error page to disk as if it were the archive
    print(f"Failed to download the file. Status code: {response.status_code}")

# Show the resulting contents of the output directory
renamed_files = os.listdir(output_dir)
renamed_files

['parkres.zip',
 'parkres.prj',
 'parkres.cpg',
 'parkres.dbf',
 'parkres.txt',
 'll_gda94',
 'parkres.shx',
 'parkres.shp',
 'parkres.html']

In [9]:
# Download the postcode/locality reference workbook, keep the original .xls,
# and write a CSV copy next to it.

# Define the URL of the file to download
url = "https://www.health.vic.gov.au/sites/default/files/2024-07/postcode-locality-reference.xls"

# Output directory (relative to the notebook's working directory) and file names
output_dir = '../data/landing/postcode/'
xls_file_name = 'postcode_ref.xls'
csv_file_name = 'postcode_ref.csv'

# Ensure the output directory exists (idempotent, safe to re-run)
os.makedirs(output_dir, exist_ok=True)

# Full path to save the Excel and CSV files
xls_file_path = os.path.join(output_dir, xls_file_name)
csv_file_path = os.path.join(output_dir, csv_file_name)

# Download the Excel file. The timeout (seconds) stops a stalled connection
# from hanging the kernel indefinitely.
response = requests.get(url, timeout=60)

# Check if the download was successful (HTTP 200 OK)
if response.status_code == 200:
    # Write the Excel file content to the local system
    with open(xls_file_path, 'wb') as f:
        f.write(response.content)
    print(f"Excel file downloaded and saved at: {xls_file_path}")

    # Load the Excel file into a pandas DataFrame (first sheet only, by default)
    # NOTE(review): default type inference may parse postcode-like columns as
    # integers and drop leading zeros — confirm whether dtype=str is needed.
    df = pd.read_excel(xls_file_path)

    # Save the DataFrame as a CSV file
    df.to_csv(csv_file_path, index=False)
    print(f"File converted to CSV and saved at: {csv_file_path}")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")

Excel file downloaded and saved at: ../data/landing/postcode/postcode_ref.xls
File converted to CSV and saved at: ../data/landing/postcode/postcode_ref.csv
