In [1]:
import sys
import os
import pandas as pd
import geopandas as gpd

sys.path.append(os.path.abspath('../../scripts/2. modules'))
import processing

# Input paths: raw school table (CSV) and school-zone polygons (GeoJSON),
# given relative to the notebook's working directory.
SCHOOL_PATH = '../../data/2. raw/school/schools.csv'
SCHOOL_ZONES_PATH = '../../data/2. raw/school/zones.geojson'

# Output paths: destination directory plus one file name per dataset.
# NOTE(review): this directory and these file names are identical to the input
# locations above, so writing with them would replace the raw inputs — confirm
# that is intended (no write call is visible in these cells).
SCHOOL_OUTPUT_PATH = "../../data/2. raw/school"
SCHOOL_OUTPUT_NAME = "schools.csv"
SCHOOL_ZONES_OUTPUT_NAME = "zones.geojson"

In [2]:
# Load the inputs used by the cells below.
# Regions come from the project's `processing` module.
# NOTE(review): the meaning of the argument 2 is not visible here — confirm
# against the definition of get_regions_df.
regions_df = processing.get_regions_df(2)
# The schools CSV is decoded as ISO-8859-1 instead of pandas' default UTF-8.
# NOTE(review): `schools_df` is not referenced again in the visible cells, which
# use `out_school_df` instead — confirm which frame is intended.
schools_df = pd.read_csv(SCHOOL_PATH, encoding='ISO-8859-1')
# School-zone geometries read from the GeoJSON file.
school_zones_df = gpd.read_file(SCHOOL_ZONES_PATH)

In [None]:
# Perform the spatial join between school locations and regions, then summarise
# the schools of each suburb (best school and per-suburb averages).
# NOTE(review): `out_school_df` and `COLS_OF_INTEREST` are not defined in the
# cells visible above — confirm they are in scope on a fresh kernel.
score_col = 'study_score_over_40_percent'

# Turn the x/y columns of the school table into point geometries.
school_point_gdf = gpd.GeoDataFrame(
    out_school_df,
    geometry=gpd.points_from_xy(out_school_df['x'], out_school_df['y']),
)

# Right join on the regions, matching schools that lie within a region.
joined_schools_to_regions_gdf = gpd.sjoin(
    school_point_gdf,
    regions_df,
    how='right',
    predicate='within',
)
# Missing scores are replaced with 0 before ranking and averaging.
# NOTE(review): these zeros also enter the per-suburb mean below if the score
# column is part of COLS_OF_INTEREST — confirm that is intended.
joined_schools_to_regions_gdf[score_col] = joined_schools_to_regions_gdf[score_col].fillna(0)
# A fresh 0..n-1 index so the labels returned by idxmax identify single rows.
joined_schools_to_regions_gdf = joined_schools_to_regions_gdf.reset_index(drop=True)

# Row label of the highest-scoring school in each suburb.
idx = joined_schools_to_regions_gdf.groupby('suburbs')[score_col].idxmax()

# Keep the columns of interest for those rows; prefix everything except the key.
best_schools = (
    joined_schools_to_regions_gdf
    .loc[idx, COLS_OF_INTEREST + ['suburbs', 'school_name']]
    .rename(columns=lambda col: col if col == 'suburbs' else 'best_school_' + col)
)

# Per-suburb mean of the columns of interest, prefixed and with the key as a column.
avg_schools = (
    joined_schools_to_regions_gdf
    .groupby('suburbs')[COLS_OF_INTEREST]
    .mean()
    .add_prefix('avg_school_')
    .reset_index()
)

# One row per suburb: best-school columns alongside the averages.
schools_out = pd.merge(best_schools, avg_schools, on='suburbs')
schools_out

In [None]:
# Prepare the school-zone polygons held in `school_zones_df` and relate them to
# the regions by overlapping area.
# NOTE(review): `fix_col_names` and `schools_joined` are not defined in the cells
# visible above — confirm they are in scope on a fresh kernel.
fix_col_names(school_zones_df)
school_zones_df['school_name'] = school_zones_df['school_name'].str.lower()

# Copy the zone polygon into a second column so it is still available, as
# `geometry_right`, on the joined rows used for the overlap computation below.
school_zones_df['geometry_right'] = school_zones_df['geometry']

regions_df_w_crs = regions_df.set_crs('EPSG:4326')

# Pair each region with every school zone it intersects (left join on regions).
intersecting_zones_df = gpd.sjoin(
    regions_df_w_crs,
    school_zones_df,
    how='left',
    predicate='intersects',
)


def _overlap_area(row):
    """Return the area shared by the row's region and its zone, or 0 without a zone."""
    if pd.isna(row['geometry_right']):
        return 0
    return row.geometry.intersection(row['geometry_right']).area


# Overlap area per (region, zone) pair; any missing result is treated as 0.
intersecting_zones_df['overlap_area'] = (
    intersecting_zones_df.apply(_overlap_area, axis=1).fillna(0)
)

# Attach the school attributes to each (region, zone) pair by school name.
schools_zones_joined = pd.merge(
    intersecting_zones_df,
    schools_joined,
    on='school_name',
    how='left',
)

grouped = schools_zones_joined.groupby(['suburbs'])

# Weighted average of the selected columns, used per suburb with overlap area as weight.
def weighted_avg(df, weight_col, cols):
    """Return the weighted mean of each column in `cols`, weighted by `weight_col`.

    A missing value contributes neither to the weighted sum nor to the total
    weight of its column. (Dividing by the weight of *all* rows, as before,
    let rows without data drag the mean towards zero.) A column whose usable
    weight totals 0 yields NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to average (for example one group of a groupby).
    weight_col : str
        Name of the column holding the weights.
    cols : list of str
        Names of the columns to average.

    Returns
    -------
    pandas.Series
        Weighted mean per column, indexed by `cols`.
    """
    weights = df[weight_col]
    values = df[cols]
    # Numerator: sum of value * weight; pandas skips NaN products here.
    weighted_sum = values.multiply(weights, axis=0).sum()
    # Denominator: per column, only the weights of rows that actually have a value.
    usable_weight = values.notna().multiply(weights, axis=0).sum()
    return weighted_sum / usable_weight

# Area-weighted mean of the columns of interest for each suburb, with the suburb
# key restored as a column and every other column prefixed with `zoned_school_`.
weighted_averages = (
    grouped
    .apply(weighted_avg, weight_col='overlap_area', cols=COLS_OF_INTEREST)
    .reset_index()
    .rename(columns=lambda col: col if col == 'suburbs' else 'zoned_school_' + col)
)

# Display suburbs ordered from highest to lowest zoned study-score share.
weighted_averages.sort_values(by='zoned_school_study_score_over_40_percent', ascending=False)