## SChool Preprocessing

This notebook takes in three datasets:
1. A list of all schools in Victoria, with their coordinate position and name
2. A list of most schools in Victoria with their 'performance'
3. A shapefile for all school zones in Victoria

This notebook preprocesses the datasets, along with joining the first two datasets.

Imports

In [11]:
import sys
import os
import pandas as pd
import geopandas as gpd

sys.path.append(os.path.abspath('../../scripts/2. modules'))
import processing

# Input paths
SCHOOL_LOCATIONS_PATH = '../../data/1. landing/school/locations.csv'
SCHOOL_QUALITY_PATH = '../../data/1. landing/school/achievement.csv'
SCHOOL_ZONES_PATH = '../../data/1. landing/shapefile/school_zones/Secondary_Integrated_Year7_2022.geojson'

# Output paths
SCHOOL_OUTPUT_PATH = "../../data/2. raw/school"
SCHOOL_OUTPUT_NAME = "schools.csv"
SCHOOL_ZONES_OUTPUT_NAME = "zones.geojson"

Load in the required values

In [5]:
schools_general_df = pd.read_csv(SCHOOL_LOCATIONS_PATH, encoding='ISO-8859-1')
schools_quality_df = pd.read_csv(SCHOOL_QUALITY_PATH, encoding='ISO-8859-1')
school_zones_df = gpd.read_file(SCHOOL_ZONES_PATH)

schools_quality_new_df_renamed = schools_quality_df.rename\
    (columns={'School': 'school_name', 
            'Number of VCE and VCE Vocational Major (VM) studies at Units 3 and 4 level with enrolments ': 'number_vce_subjects',
            'Percentage of satisfactory VCE completions': 'satisfactory_complete_vce_percent',
            'Median VCE study score': 'median_study_score',
            'Percentage of study scores of 40 and over': 'study_score_over_40_percent',
            'Percentage of VCE students applying for tertiary places through the Victorian Tertiary Admissions Centre (VTAC)': 'percentage_applying_to_victorian_uni'
    })[['school_name', 'number_vce_subjects', 'satisfactory_complete_vce_percent', 'median_study_score', 'study_score_over_40_percent', 'percentage_applying_to_victorian_uni']]

schools_quality_new_df_renamed.sort_values('median_study_score', ascending=False)

school_names = schools_quality_new_df_renamed['school_name']
numeric_df = schools_quality_new_df_renamed.drop(columns=['school_name']).replace('< 4', 3).apply(pd.to_numeric, errors='coerce').apply(lambda col: col.fillna(col.mean()), axis=0)

schools_quality_new_df_renamed = pd.concat([school_names, numeric_df], axis=1)

COLS_OF_INTEREST = [col for col in schools_quality_new_df_renamed if col != 'school_name']
schools_quality_df_renamed = schools_quality_new_df_renamed

schools_quality_df_renamed['school_name'] = schools_quality_df_renamed['school_name'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)
schools_quality_df_renamed = schools_quality_df_renamed.dropna(subset=['school_name'])

schools_quality_df_renamed

Unnamed: 0,school_name,number_vce_subjects,satisfactory_complete_vce_percent,median_study_score,study_score_over_40_percent,percentage_applying_to_victorian_uni
0,academy of mary immaculate,33.0,99.00000,31.000000,10.300000,93.800000
1,adass israel school,9.0,100.00000,28.756957,7.186515,67.019565
2,adass israel school,2.0,95.62892,28.756957,7.186515,67.019565
3,advance college of education,13.0,75.00000,28.756957,7.186515,67.019565
4,aitken college,41.0,99.00000,28.000000,5.600000,89.100000
...,...,...,...,...,...,...
593,yarrawonga college p,30.0,98.00000,26.000000,3.700000,43.200000
594,yea high school,33.0,85.00000,25.000000,7.186515,32.400000
595,yeshivah college,14.0,100.00000,34.000000,17.900000,76.900000
596,yesodei hatorah college,11.0,100.00000,37.000000,20.000000,100.000000


First, we clean up the general school listings

In [12]:
processing.fix_col_names(schools_general_df)

schools_general_df['school_name'] = schools_general_df['school_name'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)

schools_filtered = schools_general_df.copy()[schools_general_df['school_type'].isin(['Pri/Sec', 'Secondary'])].dropna(subset=['school_name'])

schools_joined = pd.merge(schools_filtered, schools_quality_df_renamed, on='school_name')

out_school_df = schools_joined[['x', 'y', 'school_name'] + COLS_OF_INTEREST]

out_school_df

Unnamed: 0,x,y,school_name,number_vce_subjects,satisfactory_complete_vce_percent,median_study_score,study_score_over_40_percent,percentage_applying_to_victorian_uni
0,146.96093,-36.73310,bright p college,30.000000,95.00000,29.000000,4.800000,20.000000
1,144.16067,-38.04200,bannockburn p college,33.000000,92.00000,24.000000,0.900000,42.300000
2,144.11368,-36.04736,pyramid hill college,3.000000,95.62892,28.756957,7.186515,67.019565
3,149.75430,-37.55943,mallacoota p college,7.000000,100.00000,34.000000,16.700000,50.000000
4,149.15302,-37.56970,cann river p college,8.000000,100.00000,24.000000,7.186515,66.700000
...,...,...,...,...,...,...,...,...
431,145.40034,-36.43612,st annes college,9.000000,95.62892,26.000000,7.186515,67.019565
432,145.55883,-38.52856,village high school,37.308081,95.62892,28.756957,7.186515,67.019565
433,144.35671,-38.21912,iona college geelong,3.000000,95.62892,28.756957,7.186515,67.019565
434,144.96665,37.83731,youthindustry college,17.000000,59.00000,28.756957,7.186515,11.800000


Preprocess the school zones

In [10]:
processing.fix_col_names(school_zones_df)

In [14]:
# save the new datasets to the raw layer
processing.to_csv(out_school_df, SCHOOL_OUTPUT_PATH, SCHOOL_OUTPUT_NAME)
processing.to_shapefile(school_zones_df, SCHOOL_OUTPUT_PATH, SCHOOL_ZONES_OUTPUT_NAME)