#### !!  Important  !!
Before running this code make sure all of the education related datasets are properly downloaded / scraped and saved into the landing folder
- for datasets that are scraped check that they are in the landing folder 
- for datasets that need to be manually downloaded BEFORE running this notebook:

      - please move the csv file (university_locations.csv) to the data/landing folder
      - please move the government school zones shapefile folder to the data/landing folder

In [125]:
import pandas as pd
import geopandas as gpd
import numpy as np
from fuzzywuzzy import process
import os


Preprocessing for TAFE locations

In [59]:
# reading TAFE dataset
tafe_df = pd.read_csv('../data/landing/TAFE_locations.csv')

In [60]:
# extract suburbs where TAFE campuses are located
def extract_suburbs(location):
    suburbs = []
    # split by semicolumn and space to get each part of the multiple campus locations
    for part in location.split('; '):
        # for each part get everything before the first colon (the suburb)
        suburb = part.split(':')[0]
        suburbs.append(suburb)
    return suburbs

# apply function to TAFE df
tafe_df['suburbs'] = tafe_df['Locations'].apply(extract_suburbs)
# each suburb gets an individual row 
tafe_df = tafe_df.explode('suburbs')


In [61]:
# drop locations column - dont need anymore
tafe_df = tafe_df.drop(columns='Locations')
# change column name to align format
tafe_df = tafe_df.rename(columns={'Institute Name': 'institute_name'})

In [22]:
# save as csv to data/raw folder 
tafe_df.to_csv('../data/raw/tafe_suburbs.csv', index=False)

Preprocessing for University locations

In [84]:
# reading university dataset
university_df = pd.read_csv('../data/landing/university_locations.csv')

In [85]:
# drop columns that we dont need 
university_df = university_df.drop(columns=['Campus', 'City or Town', 'Post Code', 'Name'])
# rename columns 
university_df = university_df.rename(columns={'University': 'institute_name'})

In [86]:
# extract the suburb information from address column
pattern = r'(?:,\s*(.*?)\s*V)|(?:\s+([^\s]+)\s*V)' # i want the text between , and VIC, if theres no , then between a space and VIC
university_df['suburbs'] = university_df['Address'].str.extract(pattern).bfill(axis=1).iloc[:, 0]

# drop columnds that we dont need
university_df = university_df.drop(columns=['Address'])

# remove any punctuation
university_df['suburbs'] = university_df['suburbs'].str.replace('[,"\']', '', regex=True)

In [87]:
# save as csv to data/raw folder 
university_df.to_csv('../data/raw/university_suburbs.csv', index=False)

Combine into Tertiary Insitution location csv

In [88]:
tertiary_df = pd.concat([tafe_df, university_df], ignore_index=True)
# save as csv to data/raw folder 
tertiary_df.to_csv('../data/raw/tertiary_insitutions_suburbs.csv', index=False)

Preprocessing for Primary Secondary location

In [109]:
# reading dataset
# specify encoding to handle non-UTF-8 characters
prim_sec_df = pd.read_csv('../data/landing/2023_primary_secondary_locations.csv', encoding='ISO-8859-1')

In [110]:
# list of columns to keep
columns_to_keep = ['School_Name', 'School_Type', 'Address_Town', 'Address_Postcode', 'X', 'Y']
# keep selected columns
prim_sec_df = prim_sec_df[columns_to_keep]
# change column names
prim_sec_df = prim_sec_df.rename(columns={
    'School_Name': 'school_name',
    'School_Type': 'school_type',
    'Address_Town': 'suburb',
    'Address_Postcode': 'postcode',
    'X': 'longitude',
    'Y': 'latitude'
})

In [112]:
# split df into primary schools and secondary schools
primary_schools_df = prim_sec_df[prim_sec_df['school_type'].isin(['Primary', 'Pri/Sec'])]
secondary_schools_df = prim_sec_df[prim_sec_df['school_type'].isin(['Secondary', 'Pri/Sec'])]

In [151]:
# save as csv to data/raw folder 
primary_schools_df.to_csv('../data/raw/primary_school_locations.csv', index=False)
secondary_schools_df.to_csv('../data/raw/highschool_locations.csv', index=False)

In [146]:
primary_schools_df.head()

Unnamed: 0,school_name,school_type,suburb,postcode,longitude,latitude
0,Alberton Primary School,Primary,Alberton,3971,146.6666,-38.61771
1,Allansford and District Primary School,Primary,Allansford,3277,142.59039,-38.38628
2,Avoca Primary School,Primary,Avoca,3467,143.47565,-37.0845
3,Avenel Primary School,Primary,Avenel,3664,145.23472,-36.90137
4,Warrandyte Primary School,Primary,Warrandyte,3113,145.21398,-37.74268


Preprocessing for Highschool rankings

In [129]:
# read df
highschool_rankings = pd.read_csv('../data/landing/VIC_high_school_rankings.csv')
# drop median vce column
highschool_rankings = highschool_rankings.drop(columns=['Median VCE Score'])
# rename columns
highschool_rankings = highschool_rankings.rename(columns={
    'Rank': 'rank',
    'School': 'school_name',
    'Percentage of study scores of 40 or above': 'over_forty_score_percentage'
})

In [130]:
highschool_rankings.head()

Unnamed: 0,rank,school_name,over_forty_score_percentage
0,1,Ballarat Clarendon College,45.8
1,2,Bialik College,34.1
2,3,Huntingtower School,33.8
3,4,Mount Scopus Memorial College,31.5
4,5,Ruyton Girls’ School,31.4


In [132]:
# use fuzzy matching on school_names to get longitude and latitude coordinates from secondary schools df 

# find the best match for a school name in secondary_schools_df
def fuzzy_match_school(school_name, possible_school_names, scorer=process.extractOne):
    match = scorer(school_name, possible_school_names)
    if match:  # if match found, return best match and score
        return match[0], match[1] 
    return None, 0  # if not found return none and 0 

# apply fuzzy matching to merge the longitude and latitude by comparing school name to list of school names in secondary schools df
highschool_rankings['best_match'], highschool_rankings['match_score'] = zip(
    *highschool_rankings['school_name'].apply(fuzzy_match_school, possible_school_names=secondary_schools_df['school_name'].tolist())
)

In [133]:
# merge dataframes based on best matched school names
highschool_rankings = pd.merge(
    highschool_rankings,
    secondary_schools_df[['school_name', 'longitude', 'latitude']],
    left_on='best_match',
    right_on='school_name',
    how='left'
)

# drop the extra school_name column from the secondary_schools_df
highschool_rankings = highschool_rankings.drop(columns=['school_name_y', 'best_match', 'match_score']).rename(columns={'school_name_x': 'school_name'})

In [135]:
# save as csv to data/raw folder 
highschool_rankings.to_csv('../data/raw/highschool_rankings.csv', index=False)

Preprocessing for Primary School rankings

In [140]:
# read df
primary_school_rankings = pd.read_csv('../data/landing/VIC_primary_school_rankings.csv')
# drop first row with NaN rank
primary_school_rankings = primary_school_rankings.drop(index=0)
# keep columns that we need
primary_school_rankings = primary_school_rankings[['Order', 'School', 'State Overall Score']]
# rename columns
primary_school_rankings = primary_school_rankings.rename(columns={
    'Order': 'rank',
    'School': 'school_name',
    'State Overall Score': 'state_overall_score'
})

In [143]:
# since school_name column includes suburb, state, and postcode information that we dont need,
# remove the additional information after the first comma in the school_name column
primary_school_rankings['school_name'] = primary_school_rankings['school_name'].str.split(',', n=1).str[0]

In [147]:
# do fuzzy matching on primary_school_rankings and primary_schools_df to get longitude and latitude columns
primary_school_rankings['best_match'], primary_school_rankings['match_score'] = zip(
    *primary_school_rankings['school_name'].apply(fuzzy_match_school, possible_school_names=primary_schools_df['school_name'].tolist())
)
# merge longitude and latitude from primary_schools_df into primary_school_rankings based on the best match
primary_school_rankings = pd.merge(
    primary_school_rankings, 
    primary_schools_df[['school_name', 'longitude', 'latitude']], 
    left_on='best_match', 
    right_on='school_name', 
    how='left'
)
# drop the extra 'school_name_y' column and rename 'school_name_x' to 'school_name'
primary_school_rankings = primary_school_rankings.drop(columns=['school_name_y', 'best_match', 'match_score']).rename(columns={'school_name_x': 'school_name'})

In [148]:
# convert rank column to integers
primary_school_rankings['rank'] = primary_school_rankings['rank'].astype(int)

In [150]:
# save as csv to data/raw folder 
primary_school_rankings.to_csv('../data/raw/primary_school_rankings.csv', index=False)

In [149]:
primary_school_rankings

Unnamed: 0,rank,school_name,state_overall_score,longitude,latitude
0,1,Presbyterian Ladies' College,100,145.10700,-37.84856
1,2,Camberwell Grammar School,100,145.06641,-37.81609
2,3,St Andrews Christian College,100,145.24464,-37.87387
3,4,Huntingtower School,100,145.13629,-37.87674
4,5,Ballarat Clarendon College,100,143.83365,-37.55965
...,...,...,...,...,...
924,449,St Michael's School,90,147.17342,-36.21623
925,449,St Michael's School,90,145.35357,-38.03382
926,450,Thomas Mitchell Primary School,90,145.27391,-37.98056
927,451,St Luke's Primary School,90,145.41203,-36.32432
