In [1]:
import geopandas as gpd
import pathlib
import sqlite3
import pandas as pd

data_dir = pathlib.Path("..").resolve() / "data"
db_con = sqlite3.connect(str(data_dir / "ppm.sqlite"))
schools = gpd.read_postgis("SELECT * FROM schools;", db_con, geom_col="GEOMETRY")
education = pd.read_sql("SELECT * FROM education;", db_con)

In [3]:
education_locations = pd.merge(schools, education, left_on=["schoollabel"], right_on=["School Name"], how="outer")

# To fix
- Merge schoollabel and school name

In [25]:
missing_school_names = education_locations["School Name"].isnull()
education_locations.loc[missing_school_names,"School Name"] = education_locations.loc[missing_school_names,"schoollabel"]
education_locations.loc[education_locations["schoollabel"].isnull(),"schoollabel"] = education_locations.loc[education_locations["schoollabel"].isnull(),"School Name"]
# #
education_locations.loc[education_locations["schoolLink"].isnull(),"schoolLink"] = education_locations.loc[education_locations["schoolLink"].isnull(),"school"]
education_locations.loc[education_locations["school"].isnull(),"school"] = education_locations.loc[education_locations["school"].isnull(),"schoolLink"]

In [26]:
education_locations["is_university"] = education_locations["School Name"].str.contains("University")
high_school_college =pattern = r"(High School|College|Grammar School)"
education_locations["is_high_school"] = education_locations["School Name"].str.contains(high_school_college)
education_locations["is_alt_edu"] = (~education_locations["is_university"])& (~education_locations["is_high_school"])

  education_locations["is_high_school"] = education_locations["School Name"].str.contains(high_school_college)


# Attempt to get missing locations
Property:625

In [28]:
sparql_entities = education_locations[education_locations["location"].isna()].schoolLink.str.extract(r"(Q\d+)")
sparql_entities.columns = ["entity"]
sparql_entities.dropna(inplace=True)
cleaned_ents = ' '.join(['wd:'+ e for e in sparql_entities["entity"].values])
cleaned_ents

'wd:Q781371 wd:Q371370 wd:Q4824219 wd:Q127990 wd:Q892188 wd:Q1053985 wd:Q1066188 wd:Q5102091 wd:Q907481 wd:Q48815827 wd:Q1180978 wd:Q1284960 wd:Q5353931 wd:Q15575 wd:Q49122 wd:Q6124478 wd:Q6152232 wd:Q49127 wd:Q1478723 wd:Q174570 wd:Q289348 wd:Q741082 wd:Q6811735 wd:Q6811823 wd:Q598841 wd:Q6898254 wd:Q1375146 wd:Q706712 wd:Q62516299 wd:Q1144750 wd:Q1057890 wd:Q1278780 wd:Q7593823 wd:Q7595174 wd:Q7627278 wd:Q787234 wd:Q7660015 wd:Q7660036 wd:Q469476 wd:Q7741168 wd:Q7865388 wd:Q15574 wd:Q1640648 wd:Q160302 wd:Q1961570 wd:Q1516684 wd:Q170027 wd:Q319078 wd:Q1814435 wd:Q734764 wd:Q18545951 wd:Q531285 wd:Q1887921 wd:Q866012 wd:Q15576 wd:Q4614 wd:Q978124 wd:Q487556 wd:Q7896374 wd:Q962011 wd:Q1145731 wd:Q1517021 wd:Q1350021 wd:Q185246 wd:Q180514 wd:Q83303158 wd:Q1141452 wd:Q8001157 wd:Q49112'

In [29]:
from pollypedagogy.utils import  get_results, clean_results
endpoint_url="https://query.wikidata.org/sparql"

school_location_query = f"""SELECT ?school ?schoolLabel ?location ?locationLabel
WHERE
{{
  VALUES ?school {{ {cleaned_ents} }}
  ?school wdt:P625 ?location.
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
}}"""

school_locs = get_results(endpoint_url, school_location_query)

schools_df = clean_results(school_locs['results']['bindings'])

In [33]:
merge_results = pd.merge(education_locations, schools_df, left_on="schoollabel", right_on="schoolLabel", how="left")

In [34]:
no_locs = education_locations["location"].isnull()
education_locations.loc[no_locs,"location"] = merge_results.location_y

In [38]:
# Lookup some manual locations
charles_sturt_university_hdq = "Point(151.074444  -33.846667)"
deakin_university_hdq = "Point(144.297 -38.1979)"
la_trobe = "Point(145.047909 -37.72179)"
mac_university = "Point(151.112915 -33.775259)"
sturtholme = "Point(152.974 -27.4692)"
edinburgh_uni = "Point(-3.187194 55.947389)"
unsw_law = "Point(151.2279886 -33.9171807)"
sydney_uni = "Point(151.18722 -33.88778)"
uwa = "Point(115.818611 -31.980278)"
westend_ss = "Point(153.0081 -27.4796)"

education_locations.loc[ education_locations["schoollabel"] == "Charles Sturt University", "location"] = charles_sturt_university_hdq
education_locations.loc[ education_locations["schoollabel"] == "Deakin University", "location"] = deakin_university_hdq
education_locations.loc[ education_locations["schoollabel"] == "La Trobe University", "location"]= la_trobe
education_locations.loc[ education_locations["schoollabel"] == "Macquarie University", "location"] = mac_university
education_locations.loc[ education_locations["schoollabel"] == "Stuartholme School", "location"] = sturtholme
education_locations.loc[ education_locations["schoollabel"] == "University of Edinburgh", "location"] = edinburgh_uni
education_locations.loc[ education_locations["schoollabel"] == "University of New South Wales Law School", "location"] = unsw_law
education_locations.loc[ education_locations["schoollabel"] == "University of Sydney", "location"] = sydney_uni
education_locations.loc[ education_locations["schoollabel"] == "University of Western Australia", "location"] = uwa
education_locations.loc[ education_locations["schoollabel"] == "West End State School", "location"] = westend_ss

In [39]:
education_locations["GEOMETRY"] = gpd.GeoSeries.from_wkt(education_locations.location)
education_locations = education_locations.set_crs(epsg=4326)


In [56]:
education = gpd.GeoDataFrame(education_locations, geometry="GEOMETRY")
education = education.drop(columns=["ogc_fid", "index", "school", "schoollabel"]).rename(columns={"School Name":"school_name", "location": "wkt", "status":"operational_status", "schoolLink":"school_link"})
education = education[["school_name", "school_link", "is_university", "is_high_school", "is_alt_edu", "operational_status", "wkt", "GEOMETRY"]]
education.rename_geometry("geom", inplace=True)

In [57]:
education.to_file(str(data_dir / "ppm.sqlite"), layer='education', driver="SQLite", if_exists="replace")
education.to_file(str(data_dir / "ppm.gpkg"), layer='education', driver="GPKG")

# TODO Outstanding
We need a good way to check if is_public -> I manually added this post cleaning.
I think this can mostly be found on wikipedia.