Workflow for going from APH handbook to Education Data

In [95]:
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

from apemap.utils import get_wikipedia_entity_id

load_dotenv("../../.env")

engine = create_engine(
    f"postgresql+psycopg://{os.environ.get('DATABASE_USERNAME')}:{os.environ.get('DATABASE_PASSWORD')}@localhost:5432/{os.environ.get('DATABASE_NAME')}")

# get members from 46th parliament because we already have 47 done
# replace "PreferredName" with "GivenName" when no preferred name and convert preferred name like (John) to John

with engine.begin() as connection:
    df = pd.read_sql(
        'SELECT  coalesce(substring(("PreferredName") from \'\((.*?)\)\') , "GivenName") as firstname, "FamilyName", "SecondarySchool", "PHID", "Party", "RepresentedElectorates", "RepresentedParliaments", "MPorSenator", "ServiceHistory_Start","DateOfBirth"  FROM aph_parliamentarians WHERE 46 = ANY("RepresentedParliaments") AND  NOT (47 = ANY("RepresentedParliaments") ) ;',
        connection)
df

Unnamed: 0,firstname,FamilyName,SecondarySchool,PHID,Party,RepresentedElectorates,RepresentedParliaments,MPorSenator,ServiceHistory_Start,DateOfBirth
0,Eric,ABETZ,,N26,Liberal Party of Australia,[],"[37, 38, 39, 40, 41, 42, 43, 44, 45, 46]",[Senator],1994-02-22,1958-01-25
1,John,ALEXANDER,Narrabeen Boys High School,M3M,Liberal Party of Australia,[Bennelong],"[43, 44, 45, 46]",[Member],2010-08-21,1951-07-04
2,Katie,ALLEN,"Melbourne Girls Grammar School, Albury High Sc...",282986,Liberal Party of Australia,[Higgins],[46],[Member],2019-05-18,1966-02-24
3,Kevin,ANDREWS,,HK5,Liberal Party of Australia,[Menzies],"[36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]",[Member],1991-05-11,1955-11-09
4,Cory,BERNARDI,Prince Alfred College,G0D,Independent,[],"[41, 42, 43, 44, 45, 46]",[Senator],2006-05-04,1969-11-06
5,Sharon,BIRD,Warilla High School/Airds High School,DZP,Australian Labor Party,[Cunningham],"[41, 42, 43, 44, 45, 46]",[Member],2004-10-09,1962-11-15
6,Terri,BUTLER,Cairns State High School,248006,Australian Labor Party,[Griffith],"[44, 45, 46]",[Member],2014-02-08,1977-11-28
7,Anthony,BYRNE,,008K0,Australian Labor Party,[Holt],"[39, 40, 41, 42, 43, 44, 45, 46]",[Member],1999-11-06,1962-12-01
8,Kim,CARR,,AW5,Australian Labor Party,[],"[37, 38, 39, 40, 41, 42, 43, 44, 45, 46]",[Senator],1993-04-28,1955-07-02
9,Nick,CHAMPION,Kapunda High School,HW9,Australian Labor Party,"[Spence, Wakefield]","[42, 43, 44, 45, 46]",[Member],2007-11-24,1972-02-27


In [15]:
group_lookup = pd.read_sql('SELECT distinct party, "group" from minister', engine)
group_map = group_lookup.set_index('party')['group'].to_dict()

district_lookup = pd.read_sql('SELECT distinct district, "district_link" from minister', engine)
# district_lookup = { d['district']:d['district_link'] for d in  district_lookup.to_dict('records') }
district_map = district_lookup.set_index('district')['district_link'].to_dict()

In [16]:
# clean aph_parliamentarians
df["member"] = df["firstname"].str.rstrip() + " " + df["FamilyName"].str.rstrip().str.capitalize()
df["district"] = df["RepresentedElectorates"].str[0]
df["start"] = df["ServiceHistory_Start"]
df["dob"] = df["DateOfBirth"]
df["party"] = df["Party"].replace({"Liberal Party of Australia": "Coalition", "The Nationals": "Coalition", })
df["district_link"] = df["district"].map(district_map)
df["group"] = df["party"].map(group_map)
df["is_senator"] = df["MPorSenator"] == "Senator"
df["is_representative"] = df["MPorSenator"] == "Member"
df["graduated"] = 1
df["wiki_link"] = df["member"].apply(get_wikipedia_entity_id)
df.rename(columns={"PHID": "mp_id"}, inplace=True)
df

Unnamed: 0,firstname,FamilyName,SecondarySchool,mp_id,Party,RepresentedElectorates,RepresentedParliaments,MPorSenator,ServiceHistory_Start,DateOfBirth,...,district,start,dob,party,district_link,group,is_senator,is_representative,graduated,wiki_link
0,Eric,ABETZ,,N26,Liberal Party of Australia,[],"[37, 38, 39, 40, 41, 42, 43, 44, 45, 46]",[Senator],1994-02-22,1958-01-25,...,,1994-02-22,1958-01-25,Coalition,,http://www.wikidata.org/entity/Q1065320,False,False,1,http://www.wikidata.org/entity/Q964840
1,John,ALEXANDER,Narrabeen Boys High School,M3M,Liberal Party of Australia,[Bennelong],"[43, 44, 45, 46]",[Member],2010-08-21,1951-07-04,...,Bennelong,2010-08-21,1951-07-04,Coalition,http://www.wikidata.org/entity/Q817830,http://www.wikidata.org/entity/Q1065320,False,False,1,http://www.wikidata.org/entity/Q364407
2,Katie,ALLEN,"Melbourne Girls Grammar School, Albury High Sc...",282986,Liberal Party of Australia,[Higgins],[46],[Member],2019-05-18,1966-02-24,...,Higgins,2019-05-18,1966-02-24,Coalition,http://www.wikidata.org/entity/Q2973648,http://www.wikidata.org/entity/Q1065320,False,False,1,http://www.wikidata.org/entity/Q6375331
3,Kevin,ANDREWS,,HK5,Liberal Party of Australia,[Menzies],"[36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]",[Member],1991-05-11,1955-11-09,...,Menzies,1991-05-11,1955-11-09,Coalition,http://www.wikidata.org/entity/Q178752,http://www.wikidata.org/entity/Q1065320,False,False,1,http://www.wikidata.org/entity/Q6395732
4,Cory,BERNARDI,Prince Alfred College,G0D,Independent,[],"[41, 42, 43, 44, 45, 46]",[Senator],2006-05-04,1969-11-06,...,,2006-05-04,1969-11-06,Independent,,,False,False,1,http://www.wikidata.org/entity/Q4354299
5,Sharon,BIRD,Warilla High School/Airds High School,DZP,Australian Labor Party,[Cunningham],"[41, 42, 43, 44, 45, 46]",[Member],2004-10-09,1962-11-15,...,Cunningham,2004-10-09,1962-11-15,Australian Labor Party,http://www.wikidata.org/entity/Q182615,http://www.wikidata.org/entity/Q216082,False,False,1,http://www.wikidata.org/entity/Q7489994
6,Terri,BUTLER,Cairns State High School,248006,Australian Labor Party,[Griffith],"[44, 45, 46]",[Member],2014-02-08,1977-11-28,...,Griffith,2014-02-08,1977-11-28,Australian Labor Party,http://www.wikidata.org/entity/Q1074129,http://www.wikidata.org/entity/Q216082,False,False,1,http://www.wikidata.org/entity/Q16216498
7,Anthony,BYRNE,,008K0,Australian Labor Party,[Holt],"[39, 40, 41, 42, 43, 44, 45, 46]",[Member],1999-11-06,1962-12-01,...,Holt,1999-11-06,1962-12-01,Australian Labor Party,http://www.wikidata.org/entity/Q2973650,http://www.wikidata.org/entity/Q216082,False,False,1,http://www.wikidata.org/entity/Q4772191
8,Kim,CARR,,AW5,Australian Labor Party,[],"[37, 38, 39, 40, 41, 42, 43, 44, 45, 46]",[Senator],1993-04-28,1955-07-02,...,,1993-04-28,1955-07-02,Australian Labor Party,,http://www.wikidata.org/entity/Q216082,False,False,1,http://www.wikidata.org/entity/Q1639334
9,Nick,CHAMPION,Kapunda High School,HW9,Australian Labor Party,"[Spence, Wakefield]","[42, 43, 44, 45, 46]",[Member],2007-11-24,1972-02-27,...,Spence,2007-11-24,1972-02-27,Australian Labor Party,http://www.wikidata.org/entity/Q55887412,http://www.wikidata.org/entity/Q216082,False,False,1,http://www.wikidata.org/entity/Q7026894


In [34]:
df

Unnamed: 0,GivenName,FamilyName,SecondarySchool,mp_id,Party,RepresentedElectorates,RepresentedParliaments,MPorSenator,ServiceHistory_Start,DateOfBirth,...,district,start,dob,party,group,is_senator,is_representative,graduated,wiki_link,district_link
0,Eric,ABETZ,,N26,Liberal Party of Australia,[],"[37, 38, 39, 40, 41, 42, 43, 44, 45, 46]",[Senator],1994-02-22,1958-01-25,...,,1994-02-22,1958-01-25,Coalition,http://www.wikidata.org/entity/Q1065320,False,False,1,http://www.wikidata.org/entity/Q964840,
1,John,ALEXANDER,Narrabeen Boys High School,M3M,Liberal Party of Australia,[Bennelong],"[43, 44, 45, 46]",[Member],2010-08-21,1951-07-04,...,Bennelong,2010-08-21,1951-07-04,Coalition,http://www.wikidata.org/entity/Q1065320,False,False,1,http://www.wikidata.org/entity/Q364407,http://www.wikidata.org/entity/Q817830
2,Katrina,ALLEN,"Melbourne Girls Grammar School, Albury High Sc...",282986,Liberal Party of Australia,[Higgins],[46],[Member],2019-05-18,1966-02-24,...,Higgins,2019-05-18,1966-02-24,Coalition,http://www.wikidata.org/entity/Q1065320,False,False,1,http://www.wikidata.org/entity/Q38589865,http://www.wikidata.org/entity/Q2973648
3,Kevin,ANDREWS,,HK5,Liberal Party of Australia,[Menzies],"[36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]",[Member],1991-05-11,1955-11-09,...,Menzies,1991-05-11,1955-11-09,Coalition,http://www.wikidata.org/entity/Q1065320,False,False,1,http://www.wikidata.org/entity/Q6395732,http://www.wikidata.org/entity/Q178752
4,Cory,BERNARDI,Prince Alfred College,G0D,Independent,[],"[41, 42, 43, 44, 45, 46]",[Senator],2006-05-04,1969-11-06,...,,2006-05-04,1969-11-06,Independent,,False,False,1,http://www.wikidata.org/entity/Q4354299,
5,Sharon,BIRD,Warilla High School/Airds High School,DZP,Australian Labor Party,[Cunningham],"[41, 42, 43, 44, 45, 46]",[Member],2004-10-09,1962-11-15,...,Cunningham,2004-10-09,1962-11-15,Australian Labor Party,http://www.wikidata.org/entity/Q216082,False,False,1,http://www.wikidata.org/entity/Q7489994,http://www.wikidata.org/entity/Q182615
6,Terri,BUTLER,Cairns State High School,248006,Australian Labor Party,[Griffith],"[44, 45, 46]",[Member],2014-02-08,1977-11-28,...,Griffith,2014-02-08,1977-11-28,Australian Labor Party,http://www.wikidata.org/entity/Q216082,False,False,1,http://www.wikidata.org/entity/Q16216498,http://www.wikidata.org/entity/Q1074129
7,Anthony,BYRNE,,008K0,Australian Labor Party,[Holt],"[39, 40, 41, 42, 43, 44, 45, 46]",[Member],1999-11-06,1962-12-01,...,Holt,1999-11-06,1962-12-01,Australian Labor Party,http://www.wikidata.org/entity/Q216082,False,False,1,http://www.wikidata.org/entity/Q4772191,http://www.wikidata.org/entity/Q2973650
8,Kim,CARR,,AW5,Australian Labor Party,[],"[37, 38, 39, 40, 41, 42, 43, 44, 45, 46]",[Senator],1993-04-28,1955-07-02,...,,1993-04-28,1955-07-02,Australian Labor Party,http://www.wikidata.org/entity/Q216082,False,False,1,http://www.wikidata.org/entity/Q1639334,
9,Nicholas,CHAMPION,Kapunda High School,HW9,Australian Labor Party,"[Spence, Wakefield]","[42, 43, 44, 45, 46]",[Member],2007-11-24,1972-02-27,...,Spence,2007-11-24,1972-02-27,Australian Labor Party,http://www.wikidata.org/entity/Q216082,False,False,1,http://www.wikidata.org/entity/Q7026894,http://www.wikidata.org/entity/Q55887412


In [17]:
# Drop all nulls and save them in a table to look up later
df_nulls = df[df["SecondarySchool"].isnull()]
education_new = df[["member", "SecondarySchool"]]
education_new["SecondarySchool"] = education_new["SecondarySchool"].str.split(r"(/| and | & |,)")
education_new = education_new.explode("SecondarySchool")
education_new["SecondarySchool"] = education_new["SecondarySchool"].str.strip()
education_new["SecondarySchool"] = education_new["SecondarySchool"].str.replace("Mt", "Mount")
education_new = education_new[
    education_new["SecondarySchool"].notnull() & (education_new["SecondarySchool"].str.strip().str.len() > 10)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  education_new["SecondarySchool"] = education_new["SecondarySchool"].str.split(r"(/| and | & |,)")


In [4]:
# Manual cleaning for 46th to tidy and improve lookup rate
# Remove Strathfield and South Australia
education_new = education_new[~education_new["SecondarySchool"].str.contains("Strathfield")]
education_new = education_new[~education_new["SecondarySchool"].str.contains("South Australia")]

education_new.loc[
    education_new["SecondarySchool"].str.contains("Narrabeen"), "SecondarySchool"] = "Narrabeen Sports High School"
education_new.loc[education_new["SecondarySchool"].str.contains(
    "Melbourne Girls Grammar"), "SecondarySchool"] = "Melbourne Girls Grammar"
education_new.loc[
    education_new["SecondarySchool"].str.contains("St Ignatius College"), "SecondarySchool"] = "Saint Ignatius' College"
# education_new.loc[education_new["SecondarySchool"].str.contains("Mt Scopus Memorial College"), "SecondarySchool"] = "Mount Scopus Memorial College"
# actually amalgamated into a different school
education_new.loc[education_new["SecondarySchool"].str.contains(
    "Box Hill Technical College"), "SecondarySchool"] = "Box Hill Senior Secondary College"
education_new.loc[education_new["SecondarySchool"].str.contains(
    "Peakhurst High School"), "SecondarySchool"] = "Georges River College Peakhurst Campus"
education_new.loc[education_new["SecondarySchool"].str.contains(
    "Notre Dame Academy"), "SecondarySchool"] = "International - Notre Dame Academy (Toledo)"
education_new.loc[education_new["SecondarySchool"].str.contains(
    "Church of England"), "SecondarySchool"] = "Anglican Church Grammar School"
education_new.loc[education_new["SecondarySchool"].str.contains(
    "Belilios"), "SecondarySchool"] = "International - Belilios Public School"
education_new.loc[
    education_new["SecondarySchool"].str.contains("Nanango"), "SecondarySchool"] = "Nanango State High School"
education_new.loc[education_new["SecondarySchool"].str.contains(
    "Rockhampton Grammar School"), "SecondarySchool"] = "The Rockhampton Grammar School"
# need manual checking add suburb??? NSW/QLD ?? lookup zdenko though
education_new.loc[
    education_new["SecondarySchool"].str.contains("Padua Catholic College"), "SecondarySchool"] = "Padua College"
education_new.loc[education_new["SecondarySchool"].str.contains(
    "St Peters Catholic College"), "SecondarySchool"] = "St Peter's Catholic College"
# needs commentary on almagamation and change to independent
education_new.loc[education_new["SecondarySchool"].str.contains(
    "Duval High School"), "SecondarySchool"] = "Armidale Secondary College"
education_new.loc[education_new["SecondarySchool"].str.contains(
    "Gosford Christian School"), "SecondarySchool"] = "St Philip's Christian College - Gosford"
education_new.loc[education_new["SecondarySchool"].str.contains(
    "Carey Grammar School"), "SecondarySchool"] = "Carey Baptist Grammar School"

In [5]:
with engine.begin() as connection:
    rows = connection.execute(text('SELECT * FROM education')).fetchall()
    education = pd.DataFrame(rows)

with engine.begin() as connection:
    rows = connection.execute(text('SELECT * FROM acara_school_locations_2022')).fetchall()
    acara = pd.DataFrame(rows)

In [6]:
# This gets 46 schools pretty good
education_new3 = education_new.merge(acara, how="left", left_on="SecondarySchool", right_on="school name")
# education_new = education_new[education_new["school name"].isnull()]

education_new3["fid"].notna().sum()

63

In [None]:
# convert education_new to a format matching existing education table
import geopandas as gpd

education_new3["member"] = education_new3["member"].str.strip()
education_new3["school_name"] = education_new3["school name"].str.strip()
education_new3["operational_status"] = pd.NA
education_new3["is_university"] = 0
education_new3["is_high_school"] = 1
education_new3["is_alt_edu"] = 0
education_new3["school_link"] = education_new3["school name"].apply(get_wikipedia_entity_id)
education_new3["wkt"] = education_new3.apply(lambda x: f"Point({x['longitude']} {x['latitude']})", axis=1)
education_to_add = gpd.GeoDataFrame(education_new3,
                                    geometry=gpd.points_from_xy(education_new3.longitude, education_new3.latitude))
education_to_add.set_crs("EPSG:4326", inplace=True)

In [10]:
education_to_add.to_postgis("education_46", engine, if_exists="replace", index=False)

In [18]:
minister_46 = df[
    ["member", "party", "group", "district", "is_senator", "is_representative", "graduated", "mp_id", "start",
     "wiki_link", "district_link", "dob"]]
minister_46

Unnamed: 0,member,party,group,district,is_senator,is_representative,graduated,mp_id,start,wiki_link,district_link,dob
0,Eric Abetz,Coalition,http://www.wikidata.org/entity/Q1065320,,False,False,1,N26,1994-02-22,http://www.wikidata.org/entity/Q964840,,1958-01-25
1,John Alexander,Coalition,http://www.wikidata.org/entity/Q1065320,Bennelong,False,False,1,M3M,2010-08-21,http://www.wikidata.org/entity/Q364407,http://www.wikidata.org/entity/Q817830,1951-07-04
2,Katie Allen,Coalition,http://www.wikidata.org/entity/Q1065320,Higgins,False,False,1,282986,2019-05-18,http://www.wikidata.org/entity/Q6375331,http://www.wikidata.org/entity/Q2973648,1966-02-24
3,Kevin Andrews,Coalition,http://www.wikidata.org/entity/Q1065320,Menzies,False,False,1,HK5,1991-05-11,http://www.wikidata.org/entity/Q6395732,http://www.wikidata.org/entity/Q178752,1955-11-09
4,Cory Bernardi,Independent,,,False,False,1,G0D,2006-05-04,http://www.wikidata.org/entity/Q4354299,,1969-11-06
5,Sharon Bird,Australian Labor Party,http://www.wikidata.org/entity/Q216082,Cunningham,False,False,1,DZP,2004-10-09,http://www.wikidata.org/entity/Q7489994,http://www.wikidata.org/entity/Q182615,1962-11-15
6,Terri Butler,Australian Labor Party,http://www.wikidata.org/entity/Q216082,Griffith,False,False,1,248006,2014-02-08,http://www.wikidata.org/entity/Q16216498,http://www.wikidata.org/entity/Q1074129,1977-11-28
7,Anthony Byrne,Australian Labor Party,http://www.wikidata.org/entity/Q216082,Holt,False,False,1,008K0,1999-11-06,http://www.wikidata.org/entity/Q4772191,http://www.wikidata.org/entity/Q2973650,1962-12-01
8,Kim Carr,Australian Labor Party,http://www.wikidata.org/entity/Q216082,,False,False,1,AW5,1993-04-28,http://www.wikidata.org/entity/Q1639334,,1955-07-02
9,Nick Champion,Australian Labor Party,http://www.wikidata.org/entity/Q216082,Spence,False,False,1,HW9,2007-11-24,http://www.wikidata.org/entity/Q7026894,http://www.wikidata.org/entity/Q55887412,1972-02-27


In [36]:
minister_46.to_sql("minister_46", engine, if_exists="replace", index=False)

-1

In [None]:
from apemap.utils import query_reps, query_senate, clean_results, get_results, endpoint_url

reps_results = get_results(endpoint_url, query_reps)

In [4]:
reps_df = clean_results(reps_results)
reps_df["is_senator"] = False
reps_df["is_representative"] = True
reps_df.to_sql("ministers_wiki", engine, if_exists="replace", index=False)

-1

In [5]:
senate_results = get_results(endpoint_url, query_senate)
senate_df = clean_results(senate_results)
senate_df["is_senator"] = True
senate_df["is_representative"] = False
senate_df.to_sql("ministers_wiki", engine, if_exists="append", index=False)

-1

In [41]:
from utils import get_google_geocode

# Get the 46th parliament
# education_46 = pd.read_sql('SELECT * FROM education_46', engine)
# minister_46 = pd.read_sql('SELECT * FROM minister_46', engine)
ministers_wiki = pd.read_sql('SELECT * FROM ministers_wiki', engine)
ministers_wiki.rename(columns={"itemLabel": "member", "groupLabel": "party", "districtLabel": "district",
                               "item": "wiki_link", "edu": "school_link", "eduLabel": "school_name",
                               "district": "district_link"}, inplace=True)
education_46_wiki = minister_46.merge(ministers_wiki, on="member", how="left", suffixes=("_46", "_wiki"))
education_46_wiki.loc[:, "is_university"] = education_46_wiki["school_name"].str.contains("University")
high_school_college = r"(High School|College|Grammar School|State High School|Grammar School|Grammar|High|Secondary School)"
education_46_wiki.loc[:, "is_high_school"] = education_46_wiki["school_name"].str.contains(high_school_college)
# # set "is_alt_edu" to not is_high_school and not is_university
education_46_wiki.loc[:, "is_alt_edu"] = ~(education_46_wiki["is_high_school"] | education_46_wiki["is_university"])
education_46_wiki["operational_status"] = pd.NA
# drop na
education_46_wiki = education_46_wiki[education_46_wiki["school_name"].notna()]
education_46_wiki.loc[:, "wkt"] = education_46_wiki.school_name.apply(get_google_geocode)
education_46_wiki[
    ["member", "school_name", "school_link", "is_university", "is_high_school", "is_alt_edu", "operational_status",
     "wkt"]]

  education_46_wiki.loc[:,"is_high_school"] = education_46_wiki["school_name"].str.contains(high_school_college)


Unnamed: 0,member,school_name,school_link,is_university,is_high_school,is_alt_edu,operational_status,wkt
0,Eric Abetz,University of Tasmania,http://www.wikidata.org/entity/Q962011,True,False,False,,Point(147.3247503 -42.9041118)
1,Eric Abetz,Hobart College,http://www.wikidata.org/entity/Q5874683,False,True,False,,Point(147.315905 -42.9192938)
2,Eric Abetz,Taroona High School,http://www.wikidata.org/entity/Q7686555,False,True,False,,Point(147.3558807 -42.94317239999999)
5,Katie Allen,University of Melbourne,http://www.wikidata.org/entity/Q319078,True,False,False,,Point(144.960974 -37.7983459)
6,Katie Allen,Monash University,http://www.wikidata.org/entity/Q598841,True,False,False,,Point(145.1346592 -37.9142416)
...,...,...,...,...,...,...,...,...
121,Lucy Wicks,University of Sydney,http://www.wikidata.org/entity/Q487556,True,False,False,,Point(151.1873494 -33.8885748)
122,Tim Wilson,Monash University,http://www.wikidata.org/entity/Q598841,True,False,False,,Point(145.1346592 -37.9142416)
123,Tim Wilson,Murdoch University,http://www.wikidata.org/entity/Q1375146,True,False,False,,Point(115.839624 -32.0663179)
124,Tim Wilson,Peninsula Grammar,http://www.wikidata.org/entity/Q7756550,False,True,False,,Point(145.0923335 -38.197817)


In [43]:
# Data from APH joined to ACARA data
education_all_cols = pd.read_sql('SELECT * FROM education_46_all_cols', engine)
education_46 = education_all_cols[
    ["member", "school_name", "school_link", "is_university", "is_high_school", "is_alt_edu", "operational_status",
     "fid", "wkt"]]
education_46.loc[:, "is_university"] = education_46["school_name"].str.contains(
    r"(University|Law School|Oxford|Cambridge|John F. Kennedy School of Government|Universiteit)")
high_school_college = r"(High School|College|Grammar School|State High School|Grammar School|Grammar|High|Secondary School)"
education_46.loc[:, "is_high_school"] = education_46["school_name"].str.contains(high_school_college)
# # set "is_alt_edu" to not is_high_school and not is_university
education_46.loc[:, "is_alt_edu"] = ~(education_46["is_high_school"] | education_46["is_university"])
education_46[~education_46["school_name"].isna()]

  education_46.loc[:,"is_high_school"] = education_46["school_name"].str.contains(high_school_college)


Unnamed: 0,member,school_name,school_link,is_university,is_high_school,is_alt_edu,operational_status,fid,wkt
0,John Alexander,Narrabeen Sports High School,http://www.wikidata.org/entity/None,False,True,False,,1941.0,Point(151.298401 -33.700941)
1,Katrina Allen,Melbourne Girls Grammar,http://www.wikidata.org/entity/None,False,True,False,,5260.0,Point(144.988851 -37.835293)
2,Katrina Allen,Albury High School,http://www.wikidata.org/entity/None,False,True,False,,1.0,Point(146.915442 -36.072846)
3,Cory Bernardi,Prince Alfred College,http://www.wikidata.org/entity/None,False,True,False,,8288.0,Point(138.617615 -34.923142)
4,Sharon Bird,Warilla High School,http://www.wikidata.org/entity/None,False,True,False,,2090.0,Point(150.858391 -34.565199)
...,...,...,...,...,...,...,...,...,...
59,Lucy Wicks,Armidale Secondary College,http://www.wikidata.org/entity/None,False,True,False,,33.0,Point(151.6532283 -30.51925421)
60,Lucy Wicks,St Philip's Christian College - Gosford,http://www.wikidata.org/entity/None,False,True,False,,938.0,Point(151.333151 -33.408273)
61,Kenneth Wyatt,Corrigin District High School,http://www.wikidata.org/entity/None,False,True,False,,9057.0,Point(117.8768 -32.32684)
62,Kenneth Wyatt,Hampton Senior High School,http://www.wikidata.org/entity/None,False,True,False,,8886.0,Point(115.9259254 -31.8901676)


In [69]:
combined_all_edu = pd.concat([education_46_wiki[
                                  ["member", "school_name", "school_link", "is_university", "is_high_school",
                                   "is_alt_edu", "operational_status", "wkt"]], education_46[
                                  ["member", "school_name", "school_link", "is_university", "is_high_school",
                                   "is_alt_edu", "operational_status", "wkt"]]])
education = pd.read_sql('SELECT * FROM education', engine)
existing_schools = combined_all_edu[combined_all_edu["wkt"].isna()].merge(education, on="school_name", how="left",
                                                                          suffixes=(None, "_existing"))
existing_schools.loc[
    existing_schools["school_name"] == "St Mary MacKillop College", "wkt_existing"] = "Point(149.092455 -35.424483)"
# update combined_all_edu wkt to existing_schools wkt_existing based on member and school_name


In [None]:
# We need to remove duplicated schools
pd.read_sql(
    'SELECT * from education_46 JOIN acara_school_locations_2022 asl ON education_46.school_name = asl."school name";',
    engine)


In [86]:
import geopandas as gpd

existing_schools["wkt"] = existing_schools["wkt_existing"]
combined_all_edu = pd.concat([combined_all_edu[combined_all_edu["wkt"].notna()], existing_schools[
    ["member", "school_name", "school_link", "is_university", "is_high_school", "is_alt_edu", "operational_status",
     "wkt"]]])
education_46 = gpd.GeoDataFrame(combined_all_edu, geometry=gpd.GeoSeries.from_wkt(combined_all_edu["wkt"]),
                                crs="EPSG:4326")
education_46.to_postgis("education_46", engine, if_exists="replace", index=False)

In [74]:
combined_all_edu[combined_all_edu["wkt"].isna()]

Unnamed: 0,member,school_name,school_link,is_university,is_high_school,is_alt_edu,operational_status,wkt
56,Mike Kelly,Macquarie University,http://www.wikidata.org/entity/Q741082,True,False,False,,
60,Mike Kelly,Macquarie University,http://www.wikidata.org/entity/Q741082,True,False,False,,
69,Gladys Liu,La Trobe University,http://www.wikidata.org/entity/Q1478723,True,False,False,,
92,Zed Seselja,St Mary MacKillop College,http://www.wikidata.org/entity/Q7594565,False,True,False,,
95,Dave Sharma,Deakin University,http://www.wikidata.org/entity/Q1180978,True,False,False,,
120,Amanda Stoker,Sydney Law School,http://www.wikidata.org/entity/Q7660015,False,False,True,,


In [None]:
# Remove duplicate member school_name from table education_46 in SQL
engine.connect().execute(
    "DELETE FROM education_46 WHERE fid NOT IN (SELECT MIN(fid) FROM education_46 GROUP BY member, school_name);")

In [3]:
# find as many acara ids and lookup there finances to create a financial table

In [109]:
schools_to_fetch = pd.read_sql("SELECT acara_id from education_acara", engine)
schools_to_fetch = schools_to_fetch[~schools_to_fetch.acara_id.isin(finances.index)]
schools_to_fetch

Unnamed: 0,acara_id
3,40803
4,41849
5,43836
6,43836
7,41849
...,...
489,47494
490,48038
491,42094
492,49463


In [114]:
finances2 = pd.DataFrame()
for row in schools_to_fetch.itertuples():
    df = get_finances(row.acara_id)
    if finances2.empty:
        finances2 = df
    else:
        # append df to finances
        finances2 = pd.concat([finances2, df])


AttributeError: 'NoneType' object has no attribute 'find_all'

In [115]:
# convert o to a dataframe
finances2

Category,australian_government_recurrent_funding_total,state__territory_government_recurring_funding_total,fees_charges_and_parent_contributions_total,other_private_sources_total,total_gross_income_total,total_net_recurrent_income_total,australian_government_recurrent_funding_per_student,state__territory_government_recurring_funding_per_student,fees_charges_and_parent_contributions_per_student,other_private_sources_per_student,total_gross_income_per_student,total_net_recurrent_income_per_student,year
school_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
40803,198285,591719,0,19,790023,790023,0,0,0,0,0,0,2021
41849,2491972,9495705,289580,111429,12388686,12388686,3112,11858,362,139,15470,15470,2021
43836,5732363,2009402,20742932,1018721,29503418,29390359,6123,2146,22157,1088,31514,31393,2021
43836,5732363,2009402,20742932,1018721,29503418,29390359,6123,2146,22157,1088,31514,31393,2021
41849,2491972,9495705,289580,111429,12388686,12388686,3112,11858,362,139,15470,15470,2021
50244,2317389,11040817,59493,84223,13501922,13501922,3576,17038,92,130,20836,20836,2021


In [100]:
get_finances(40803)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [12]:
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

load_dotenv("../../.env")

engine = create_engine(
    f"postgresql+psycopg://{os.environ.get('DATABASE_USERNAME')}:{os.environ.get('DATABASE_PASSWORD')}@localhost:5432/{os.environ.get('DATABASE_NAME')}")
aph_df = pd.read_sql('SELECT * FROM aph_parliamentarians WHERE "PHID" not in (SELECT PHID FROM ministers_aph) ', engine)
aph_df.columns = aph_df.columns.str.lower()
max_fid = pd.read_sql('SELECT MAX(fid) FROM ministers_aph', engine).iloc[0, 0]
aph_df.index = aph_df.index + max_fid + 1
aph_df.set_index("index", inplace=True)
ministers_table = aph_df[
    ["phid", "givenname", "middlenames", "familyname", "preferredname", "displayname", "dateofbirth", "placeofbirth",
     "image", "gender", "maritalstatus", "countryofbirth", "stateofbirth", "party", "partyabbrev", "senatestate",
     "state", "stateabbrev", "incurrentparliament", "servicehistory_start", "servicehistory_end", "servicehistory_days",
     "electedmemberno", "electedsenatorno", "firstnations"]]
ministers_occupations = aph_df[["phid", "occupations"]].explode("occupations").dropna()
ministers_secondary_occupations = aph_df[["phid", "secondaryoccupations"]].explode("secondaryoccupations").dropna()
ministers_qualifications = aph_df[["phid", "qualifications"]].explode("qualifications").dropna()
ministers_secondary_schools = aph_df[["phid", "secondaryschool"]].explode("secondaryschool").dropna()
mss = ministers_secondary_schools.set_index("phid")
mss = mss.secondaryschool.str.split("/").explode()
mss = mss.reset_index()

# mss.dropna().to_sql("ministers_secondary_school", engine, if_exists="append",
#                     index=False)
ministers_table.convert_dtypes().to_sql("ministers_aph", engine, if_exists="append", index=False)
ministers_occupations.convert_dtypes().dropna().to_sql("ministers_occupations", engine, if_exists="append", index=False)
ministers_secondary_occupations.convert_dtypes().dropna().to_sql("ministers_secondary_occupations", engine, if_exists="append",
                                                index=False)
ministers_qualifications.convert_dtypes().dropna().to_sql("ministers_secondary_qualifications", engine, if_exists="append",
                                         index=False)

-1