In [9]:
import sys
import os
import pandas as pd
import geopandas as gpd

sys.path.append(os.path.abspath('../../scripts/2. modules'))
import processing

# Input paths
SCHOOL_PATH = '../../data/2. raw/school/schools.csv'
SCHOOL_ZONES_PATH = '../../data/2. raw/school/zones.geojson'

# Output paths
SCHOOL_OUTPUT_PATH = "../../data/3. raw/school"
SCHOOL_OUTPUT_NAME = "school_by_region.csv"
ZONE_OUTPUT_NAME = "zone_by_quality.csv"

In [2]:
regions_df = processing.get_regions_df(2)
schools_df = pd.read_csv(SCHOOL_PATH, encoding='ISO-8859-1')
school_zones_df = gpd.read_file(SCHOOL_ZONES_PATH)

In [5]:
# peform a spatial join
school_point_gdf = gpd.GeoDataFrame(
    schools_df, 
    geometry=gpd.points_from_xy(schools_df['x'], schools_df['y']),
)

COLS_OF_INTEREST = ['number_vce_subjects',
                    'satisfactory_complete_vce_percent',
                    'median_study_score',
                    'study_score_over_40_percent',
                    'percentage_applying_to_victorian_uni']

joined_schools_to_regions_gdf = gpd.sjoin(school_point_gdf, regions_df, how='right', predicate='within')
joined_schools_to_regions_gdf['study_score_over_40_percent'] = joined_schools_to_regions_gdf['study_score_over_40_percent'].fillna(0)
joined_schools_to_regions_gdf = joined_schools_to_regions_gdf.reset_index(drop=True)

idx = joined_schools_to_regions_gdf.groupby('suburbs')['study_score_over_40_percent'].idxmax()
best_schools = joined_schools_to_regions_gdf.loc[idx]

best_schools = best_schools[COLS_OF_INTEREST + ['suburbs', 'school_name']]

best_schools.columns = [col if col in ['suburbs'] else 'best_school_' + col for col in best_schools.columns]

avg_schools = joined_schools_to_regions_gdf.groupby('suburbs')[COLS_OF_INTEREST].mean()

avg_schools.columns = ['avg_school_' + col for col in avg_schools.columns]

avg_schools = avg_schools.reset_index()

schools_out = pd.merge(best_schools, avg_schools, on='suburbs')
schools_out

Unnamed: 0,best_school_number_vce_subjects,best_school_satisfactory_complete_vce_percent,best_school_median_study_score,best_school_study_score_over_40_percent,best_school_percentage_applying_to_victorian_uni,suburbs,best_school_school_name,avg_school_number_vce_subjects,avg_school_satisfactory_complete_vce_percent,avg_school_median_study_score,avg_school_study_score_over_40_percent,avg_school_percentage_applying_to_victorian_uni
0,57.0,99.0,31.0,6.3,83.3,Albert Park-Middle Park-West St Kilda,albert park college,57.0,99.0,31.00,6.300,83.30
1,50.0,99.0,30.0,7.8,84.3,Altona,mount st joseph girls college,46.0,99.5,28.75,4.675,69.50
2,30.0,100.0,34.0,19.8,96.7,Armadale,lauriston girls school,30.0,100.0,34.00,19.800,96.70
3,55.0,96.0,30.0,6.4,65.4,Aspendale-Chelsea-Carrum,mordialloc college,55.0,96.0,30.00,6.400,65.40
4,45.0,95.0,28.0,3.0,38.0,Bairnsdale,nagle college,46.5,97.0,27.00,1.900,38.05
...,...,...,...,...,...,...,...,...,...,...,...,...
139,,,,0.0,,West Footscray,,,,,0.000,
140,36.0,96.0,22.0,1.4,78.2,Whittlesea,whittlesea secondary college,36.0,96.0,22.00,1.400,78.20
141,74.0,97.0,32.0,12.0,72.2,Williamstown,williamstown high school,61.5,96.5,30.00,7.000,63.25
142,36.0,100.0,28.0,4.0,59.1,Wodonga,victory lutheran college,40.5,99.5,28.00,3.700,60.00


In [7]:
# load geojson
processing.fix_col_names(school_zones_df)

school_zones_df['school_name'] = school_zones_df['school_name'].str.lower()

# schools_joined_with_zones = pd.merge(school_zones_df, schools_joined, on='school_name')

# schools_joined_with_zones

school_zones_df['geometry_right'] = school_zones_df['geometry']

regions_df_w_crs = regions_df.set_crs('EPSG:4326')

# join based on the largest intersecting area
intersecting_zones_df = gpd.sjoin(regions_df_w_crs, school_zones_df, how='left', predicate='intersects')

intersecting_zones_df

intersecting_zones_df['overlap_area'] = intersecting_zones_df.apply(
    lambda row: row.geometry.intersection(row['geometry_right']).area 
    if not pd.isna(row['geometry_right']) else 0, axis=1
)
intersecting_zones_df['overlap_area'] = intersecting_zones_df['overlap_area'].fillna(0)

schools_zones_joined = pd.merge(intersecting_zones_df, schools_df, on='school_name', how='left')
schools_zones_joined

grouped = schools_zones_joined.groupby(['suburbs'])

# Define a function to calculate the weighted average for the selected columns
def weighted_avg(df, weight_col, cols):
    # Multiply each column by the weight (overlap_area), sum and divide by the total weight
    return (df[cols].multiply(df[weight_col], axis=0).sum()) / df[weight_col].sum()

weighted_averages = grouped.apply(weighted_avg, weight_col='overlap_area', cols=COLS_OF_INTEREST)
weighted_averages = weighted_averages.reset_index()

weighted_averages.columns = ['suburbs' if col == 'suburbs' else 'zoned_school_' + col for col in weighted_averages.columns]
weighted_averages.sort_values('zoned_school_study_score_over_40_percent', ascending=False)

  weighted_averages = grouped.apply(weighted_avg, weight_col='overlap_area', cols=COLS_OF_INTEREST)


Unnamed: 0,suburbs,zoned_school_number_vce_subjects,zoned_school_satisfactory_complete_vce_percent,zoned_school_median_study_score,zoned_school_study_score_over_40_percent,zoned_school_percentage_applying_to_victorian_uni
6,Balwyn,54.456612,99.088112,31.544725,12.941778,92.082578
106,Preston,26.617552,95.593574,30.948697,11.879529,69.920083
42,Docklands,53.115429,84.462240,26.993087,11.842128,79.237771
23,CBD-St Kilda Rd,58.909005,94.839003,29.617576,11.796245,85.274377
17,Brighton East,51.000043,97.000006,31.000004,11.200014,90.799995
...,...,...,...,...,...,...
63,Gladstone Park-Tullamarine,0.000000,0.000000,0.000000,0.000000,0.000000
65,Golden Square-Kangaroo Flat,0.000000,0.000000,0.000000,0.000000,0.000000
95,North Geelong,0.000000,0.000000,0.000000,0.000000,0.000000
115,Shepparton,0.000000,0.000000,0.000000,0.000000,0.000000


Now save to a dataframe

In [14]:
final_df = pd.merge(schools_out, weighted_averages, on='suburbs')
school_zones_with_school_quality = pd.merge(school_zones_df, schools_df, on='school_name', how='left')[['geometry', 'school_name'] + COLS_OF_INTEREST]

processing.to_csv(final_df, SCHOOL_OUTPUT_PATH, SCHOOL_OUTPUT_NAME)
processing.to_csv(school_zones_with_school_quality, SCHOOL_OUTPUT_PATH, ZONE_OUTPUT_NAME)

Unnamed: 0,geometry,school_name,number_vce_subjects,satisfactory_complete_vce_percent,median_study_score,study_score_over_40_percent,percentage_applying_to_victorian_uni
0,"POLYGON ((143.85476 -37.56182, 143.85635 -37.5...",ballarat high school,56.0,95.0,29.0,5.3,53.4
1,"POLYGON ((143.53235 -37.65871, 143.49539 -37.7...",phoenix p-12 community college,,,,,
2,"POLYGON ((143.98962 -37.42295, 143.85442 -37.2...",mount rowan secondary college,26.0,96.0,27.0,1.4,60.9
3,"POLYGON ((143.85635 -37.54561, 143.85476 -37.5...",woodmans hill secondary college,35.0,90.0,29.0,9.3,38.1
4,"POLYGON ((144.6893 -36.75936, 144.65998 -36.67...",weeroona college bendigo,,,,,
...,...,...,...,...,...,...,...
310,"POLYGON ((144.80071 -37.62355, 144.7958 -37.63...",greenvale secondary college,,,,,
311,"POLYGON ((145.59828 -38.40931, 145.59965 -38.4...",bass coast college,56.0,98.0,28.0,3.0,57.2
312,"POLYGON ((144.95806 -37.85308, 144.95061 -37.8...",albert park college,57.0,99.0,31.0,6.3,83.3
313,"POLYGON ((144.75293 -37.90368, 144.74664 -37.9...",saltwater p-9 college,,,,,
