# School preprocessing

This notebook aggregates the School data such that the relevant fields are extracted and the data is neatly formatted into a csv wherein the rows are associated with an SA2

In [5]:
import pandas as pd
import geopandas as gpd
from shapely import Point

In [6]:
df = pd.read_csv("../../data/landing/school_data.csv")

def convert_price(price):
    if pd.isna(price):
        return 0  # Replace NaN with 0
    if '-' in price:
        # If the price is a range, split it and take the average
        price_range = price.replace('$', '').split(' - ')
        return sum([float(p) for p in price_range]) / len(price_range)
    else:
        # If it's a single price, just remove the $ and convert to float
        return float(price.replace('$', ''))

# Apply the price conversion function directly on the original DataFrame 'df' before converting it to a GeoDataFrame
df['average_price'] = df['price_local'].apply(convert_price)
df = df[df['average_price'] > 0]

filtered_df = df[df['median_score'].notna()]
filtered_df.head()

Unnamed: 0,school_name,school_type,median_score,price_local,ELC,Kinder,Primary,Secondary,href,latitude,longitude,average_price
1696,Ozford College,independent,21.18,$20000,False,True,True,True,https://www.aroundschools.com.au/schools/detai...,-37.810607,144.968516,20000.0
1698,Hallam Secondary College,government,22.55,$12210 - $13640,False,True,True,True,https://www.aroundschools.com.au/schools/detai...,-38.00047,145.264404,12925.0
1700,Bundoora Secondary College,government,23.27,$10990 - $12270,False,True,True,True,https://www.aroundschools.com.au/schools/detai...,-37.707084,145.06398,11630.0
1705,East Preston Islamic College,government,23.64,$1000,False,True,True,True,https://www.aroundschools.com.au/schools/detai...,-37.733847,145.035968,1000.0
1725,Ararat College,catholic,24.7,$3130 - $3715,False,True,True,True,https://www.aroundschools.com.au/schools/detai...,-37.286014,142.922124,3422.5


In [7]:
# Create geometry from the latitude and longitude
df['geometry'] = filtered_df.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)

# Convert to a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='geometry')

In [8]:
SA2 = gpd.read_file("../../data/landing/SAL_data/")

In [9]:
schools_in_sa2 = gpd.sjoin(SA2, gdf , how="inner", predicate="intersects")

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:7844
Right CRS: None

  schools_in_sa2 = gpd.sjoin(SA2, gdf , how="inner", predicate="intersects")


In [10]:
# Group by SA2_CODE21 and calculate the mean score and mean price
grouped_df = schools_in_sa2.groupby('SA2_CODE21').agg({
    'median_score': 'mean',
    'average_price': 'mean'
}).reset_index()

# Display the grouped data
grouped_df.head()

Unnamed: 0,SA2_CODE21,median_score,average_price
0,201011002,32.423333,10107.5
1,201011008,32.55,15520.0
2,201011483,27.64,6250.0
3,201011484,28.27,4365.0
4,201031016,28.09,4250.0


In [11]:
grouped_df.count()

SA2_CODE21       189
median_score     189
average_price    189
dtype: int64

In [12]:
grouped_df.to_csv("../../data/curated/schools.csv", index=False)