In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import os
import requests
import zipfile

In [1]:
# Scrape the 2023 VCE school ranking table and store its first 50 rows as a
# CSV in the landing directory.

# Landing directory for the scraped CSV
output_relative_dir = '../../data/landing/ABS_top50_school/'

# Create the directory if it doesn't exist
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)
    print(f"Directory {output_relative_dir} created.")
else:
    print(f"Directory {output_relative_dir} already exists, skipping creation.")

# URL of the target webpage
url = "https://www.topscores.co/Vic/vce-school-rank-median-vce/2023/"

# Fetch the page. The timeout stops a stalled connection from hanging the
# notebook, and raise_for_status() prevents an HTTP error page (4xx/5xx) from
# being parsed as if it were the ranking page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find the first table on the page; fail with a clear message if there is none
# instead of an AttributeError on None further down.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {url}; the page layout may have changed.")

# Extract the column headers from every <th> cell in the table
headers = [th.text.strip() for th in table.find_all('th')]

# Extract table rows, keeping only those whose <td> count matches the header
# count (this skips the header row itself and any malformed/spacer rows)
rows = []
for tr in table.find_all('tr'):
    cells = [td.text.strip() for td in tr.find_all('td')]
    if len(cells) == len(headers):
        rows.append(cells)

# An empty result means the scrape did not work; do not write a header-only CSV
if not rows:
    raise ValueError(f"No data rows matching the {len(headers)} table headers were found at {url}.")

# Create a DataFrame
df = pd.DataFrame(rows, columns=headers)

# Keep only the first 50 rows
# NOTE(review): assumes the page lists schools in rank order — confirm
df_top_50 = df.head(50)

# Define the path for saving the CSV
relative_path = os.path.join(output_relative_dir, 'vce_top_50_schools.csv')

# Save the CSV to the landing directory
df_top_50.to_csv(relative_path, index=False)

print(f"CSV file saved to: {relative_path}")

# Show the result
print(df_top_50)


Directory ../../data/landing/ABS_top50_school/ created.
CSV file saved to: ../../data/landing/ABS_top50_school/vce_top_50_schools.csv
     ?                                    School        Location  \
0    1                Ballarat Clarendon College        Ballarat   
1    2                            Bialik College        Hawthorn   
2    3                       Huntingtower School  Mount Waverley   
3    4             Mount Scopus Memorial College         Burwood   
4    5                       Ruyton Girls School             Kew   
5    6  Yesodei Hatorah College Secondary Campus          Ormond   
6    7                    Loreto Mandeville Hall          Toorak   
7    8              Mentone Girls Grammar School         Mentone   
8    9            Macrobertson Girls High School       Melbourne   
9   10              Korowa Anglican Girls School       Glen Iris   
10  11                      St Catherines School          Toorak   
11  12                  Haileybury Girls College  

In [2]:
# Output locations for the ABS LGA boundary data.
# Note: despite its name, output_absolute_dir also holds a relative path.
output_relative_dir = '../../data/landing/ABS_LGA/'
output_absolute_dir = '../../data/raw/ABS_LGA/'

# Make sure both directories exist, reporting what happened for each one.
for target_dir in (output_relative_dir, output_absolute_dir):
    if os.path.exists(target_dir):
        print(f"Directory {target_dir} already exists, skipping creation.")
        continue
    os.makedirs(target_dir)
    print(f"Directory {target_dir} created.")


Directory ../../data/landing/ABS_LGA/ created.
Directory ../../data/raw/ABS_LGA/ created.


In [3]:
# Download the ABS 2024 LGA digital boundary archive into the landing
# directory and extract it into the raw directory. Both steps are skipped when
# the archive is already present.
url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/LGA_2024_AUST_GDA94.zip"
download_path = os.path.join(output_relative_dir, "LGA_2024_AUST_GDA94.zip")
extract_to_path = output_absolute_dir

if not os.path.exists(download_path):
    print("Downloading LGA shapefile...")
    # Stream into a temporary ".part" file and only move it to download_path
    # once the transfer has completed. The presence of download_path is what
    # makes later runs skip this branch, so it must never point at a failed or
    # half-written download.
    partial_path = download_path + ".part"
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail here on 4xx/5xx rather than saving an error page as a ".zip"
        response.raise_for_status()
        with open(partial_path, 'wb') as file:
            # Write in chunks so the whole archive is never held in memory
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
    os.replace(partial_path, download_path)

    print("Extracting shapefile...")
    try:
        with zipfile.ZipFile(download_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to_path)
    except zipfile.BadZipFile:
        # Remove the unusable archive so the next run downloads it again
        # instead of reporting "already exists" forever.
        os.remove(download_path)
        raise

    print("LGA shapefile downloaded and extracted successfully.")
else:
    print("Zip file already exists, skipping download and extraction.")

Downloading LGA shapefile...
Extracting shapefile...
LGA shapefile downloaded and extracted successfully.
