# Poltician's education pathways

This notebook is for doing some basic data cleanup and downloading.
Only current-ish data https://www.smh.com.au/interactive/2021/careers-before-politics/


In [4]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

endpoint_url = "https://query.wikidata.org/sparql"

query_reps = """SELECT ?item ?itemLabel ?group ?groupLabel ?district ?districtLabel ?term ?termLabel ?edu ?eduLabel ?start ?end
WHERE
{
  ?item p:P39 ?statement .
  ?statement ps:P39/wdt:P279* wd:Q18912794 ; pq:P580 ?start .
  OPTIONAL { ?statement pq:P2937 ?term }
  OPTIONAL { ?statement pq:P582  ?end }
  OPTIONAL { ?statement pq:P768  ?district }
  OPTIONAL { ?statement pq:P4100 ?group }
  OPTIONAL { ?item wdt:P69 ?edu .
             ?edu rdfs:label ?eduLabel FILTER (lang(?eduLabel) = "en")}
  FILTER(!BOUND(?end) || ?end > NOW())
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
ORDER BY ?start ?end"""

query_senate = """
SELECT ?item ?itemLabel ?group ?groupLabel ?district ?districtLabel ?term ?termLabel ?edu ?eduLabel ?start ?end
WHERE
{
  ?item p:P39 ?statement .
  ?statement ps:P39/wdt:P279* wd:Q6814428 ; pq:P580 ?start .
  OPTIONAL { ?statement pq:P2937 ?term }
  OPTIONAL { ?statement pq:P582  ?end }
  OPTIONAL { ?statement pq:P768  ?district }
  OPTIONAL { ?statement pq:P4100 ?group }
  OPTIONAL { ?item wdt:P69 ?edu .
             ?edu rdfs:label ?eduLabel FILTER (lang(?eduLabel) = "en")}
  FILTER(!BOUND(?end) || ?end > NOW())
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
ORDER BY ?start ?end
"""


def get_results(endpoint_url, query):
    user_agent = "WDQS Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


In [5]:
def clean_results(results:dict) -> pd.DataFrame:
    output = pd.json_normalize(results)
    col_vals = [ c for c in output.columns if c.endswith(".value")]
    output_cleaned = output[col_vals]

    output_cleaned.columns = [ c.replace(".value", "") for c in col_vals ]
    return output_cleaned

In [6]:
senate_results = get_results(endpoint_url, query_senate)
senate_results

{'head': {'vars': ['item',
   'itemLabel',
   'group',
   'groupLabel',
   'district',
   'districtLabel',
   'term',
   'termLabel',
   'edu',
   'eduLabel',
   'start',
   'end']},
 'results': {'bindings': [{'item': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q16191011'},
    'edu': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q7865388'},
    'eduLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'UNSW Faculty of Law'},
    'start': {'datatype': 'http://www.w3.org/2001/XMLSchema#dateTime',
     'type': 'literal',
     'value': '1997-04-09T00:00:00Z'},
    'district': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q56649105'},
    'group': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q1065320'},
    'itemLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Marise Payne'},
    'groupLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Coalition'},
    'districtLabel': {'xml:lang': 'en',
     'ty

In [7]:
senate_df = clean_results(senate_results['results']['bindings'])
senate_df["is_senator"] = True
senate_df["is_representative"] = False

In [8]:
reps_results  = get_results(endpoint_url, query_reps)
reps_df = clean_results(reps_results['results']['bindings'])
reps_df["is_senator"] = False
reps_df["is_representative"] = True
reps_df

Unnamed: 0,item,edu,eduLabel,start,district,group,itemLabel,groupLabel,districtLabel,is_senator,is_representative
0,http://www.wikidata.org/entity/Q4932983,http://www.wikidata.org/entity/Q866012,University of Queensland,1993-03-13T00:00:00Z,http://www.wikidata.org/entity/Q2973687,http://www.wikidata.org/entity/Q6378340,Bob Katter,Katter's Australian Party,Kennedy,False,True
1,http://www.wikidata.org/entity/Q335697,http://www.wikidata.org/entity/Q487556,University of Sydney,1996-03-02T00:00:00Z,http://www.wikidata.org/entity/Q1079489,http://www.wikidata.org/entity/Q216082,Anthony Albanese,Australian Labor Party,Grayndler,False,True
2,http://www.wikidata.org/entity/Q335697,http://www.wikidata.org/entity/Q7594250,St Mary's Cathedral College,1996-03-02T00:00:00Z,http://www.wikidata.org/entity/Q1079489,http://www.wikidata.org/entity/Q216082,Anthony Albanese,Australian Labor Party,Grayndler,False,True
3,http://www.wikidata.org/entity/Q335697,http://www.wikidata.org/entity/Q7896374,University of Sydney Business School,1996-03-02T00:00:00Z,http://www.wikidata.org/entity/Q1079489,http://www.wikidata.org/entity/Q216082,Anthony Albanese,Australian Labor Party,Grayndler,False,True
4,http://www.wikidata.org/entity/Q7684036,http://www.wikidata.org/entity/Q741082,Macquarie University,1998-10-03T00:00:00Z,http://www.wikidata.org/entity/Q1075639,http://www.wikidata.org/entity/Q216082,Tanya Plibersek,Australian Labor Party,Sydney,False,True
...,...,...,...,...,...,...,...,...,...,...,...
256,http://www.wikidata.org/entity/Q112127622,,,2022-05-21T00:00:00Z,http://www.wikidata.org/entity/Q2973656,http://www.wikidata.org/entity/Q1065320,Jenny Ware,Coalition,Hughes,False,True
257,http://www.wikidata.org/entity/Q112131017,,,2022-05-21T00:00:00Z,http://www.wikidata.org/entity/Q107393285,http://www.wikidata.org/entity/Q216082,Sam Rae,Australian Labor Party,Hawke,False,True
258,http://www.wikidata.org/entity/Q112134180,,,2022-05-21T00:00:00Z,http://www.wikidata.org/entity/Q2973472,http://www.wikidata.org/entity/Q1065320,Henry Pike,Coalition,Bowman,False,True
259,http://www.wikidata.org/entity/Q112567447,,,2022-05-21T00:00:00Z,http://www.wikidata.org/entity/Q2973800,http://www.wikidata.org/entity/Q216082,Andrew Charlton,Australian Labor Party,Parramatta,False,True


In [9]:
ministers_df = pd.concat([reps_df, senate_df])
ministers_df.rename(columns={"itemLabel": "Member", "groupLabel": "Party", "districtLabel":"District", "item": "Wiki Link", "edu": "schoolLink", "eduLabel": "School Name", "district": "district_link"}, inplace=True)

In [10]:
education_df : pd.DataFrame = ministers_df[["schoolLink","School Name"]].copy()
education_df.dropna(inplace=True)
education_df.drop_duplicates(["schoolLink"], inplace=True)
education_df.sort_values(["School Name"], inplace=True)

In [11]:
education_df["is_university"] = education_df["School Name"].str.contains("University")
high_school_college =pattern = r"(High School|College|Grammar School)"
education_df["is_high_school"] = education_df["School Name"].str.contains(high_school_college)
education_df["is_alt_edu"] = (~education_df["is_university"])& (~education_df["is_high_school"])

  education_df["is_high_school"] = education_df["School Name"].str.contains(high_school_college)


In [12]:
education_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 139 entries, 104 to 153
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   schoolLink      139 non-null    object
 1   School Name     139 non-null    object
 2   is_university   139 non-null    bool  
 3   is_high_school  139 non-null    bool  
 4   is_alt_edu      139 non-null    bool  
dtypes: bool(3), object(2)
memory usage: 3.7+ KB


In [13]:
schools_sparql_entities = education_df[education_df["is_high_school"]].schoolLink.str.extract(r"(Q\d+)")
schools_sparql_entities.columns = ["entity"]
cleaned_ents = ' '.join(['wd:'+ e for e in schools_sparql_entities["entity"].values])
cleaned_ents

'wd:Q4763439 wd:Q4782971 wd:Q4792525 wd:Q4968616 wd:Q4997498 wd:Q17514127 wd:Q5031258 wd:Q5032517 wd:Q5059079 wd:Q5060585 wd:Q6765675 wd:Q5130089 wd:Q5193900 wd:Q5244511 wd:Q5244515 wd:Q5267826 wd:Q6874499 wd:Q5273786 wd:Q5303222 wd:Q5373399 wd:Q5436281 wd:Q5528459 wd:Q5587183 wd:Q5874683 wd:Q6065569 wd:Q6265061 wd:Q6413539 wd:Q6481599 wd:Q6516506 wd:Q6721977 wd:Q6765669 wd:Q6772346 wd:Q1914698 wd:Q6801996 wd:Q6811796 wd:Q6813902 wd:Q6900526 wd:Q6911446 wd:Q14935237 wd:Q6961407 wd:Q1376987 wd:Q62516299 wd:Q7066890 wd:Q7110892 wd:Q7164606 wd:Q7244458 wd:Q7335025 wd:Q7355277 wd:Q2171074 wd:Q7435507 wd:Q14934996 wd:Q7569827 wd:Q7593812 wd:Q7594250 wd:Q7595311 wd:Q7659840 wd:Q7659951 wd:Q48769587 wd:Q7739701 wd:Q7762626 wd:Q7796467 wd:Q7830279 wd:Q7894929 wd:Q7897033 wd:Q7930877 wd:Q7975377 wd:Q8043217 wd:Q8060904'

In [14]:

school_location_query = f"""SELECT ?school ?schoolLabel ?location ?locationLabel
WHERE
{{
  VALUES ?school {{ {cleaned_ents} }}
  ?school wdt:P625 ?location.
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
}}"""

school_locs = get_results(endpoint_url, school_location_query)


In [15]:
schools_df = clean_results(school_locs['results']['bindings'])
schools_df

Unnamed: 0,school,location,schoolLabel,locationLabel
0,http://www.wikidata.org/entity/Q1376987,Point(-1.25166667 51.75416667),New College,Point(-1.25166667 51.75416667)
1,http://www.wikidata.org/entity/Q1914698,Point(116.039 -32.0067),"Mazenod College, Perth",Point(116.039 -32.0067)
2,http://www.wikidata.org/entity/Q2171074,Point(149.165 -35.299722222),Royal Military College,Point(149.165 -35.299722222)
3,http://www.wikidata.org/entity/Q4763439,Point(153.054 -27.4819),Anglican Church Grammar School,Point(153.054 -27.4819)
4,http://www.wikidata.org/entity/Q4782971,Point(145.235555555 -37.823055555),Aquinas College,Point(145.235555555 -37.823055555)
...,...,...,...,...
64,http://www.wikidata.org/entity/Q8060904,Point(116.00848498 -32.048444839),Yule Brook College,Point(116.00848498 -32.048444839)
65,http://www.wikidata.org/entity/Q14934996,Point(151.01666667 -33.88333333),Sefton High School,Point(151.01666667 -33.88333333)
66,http://www.wikidata.org/entity/Q14935237,Point(145.11722222 -37.84805556),Mount Scopus Memorial College,Point(145.11722222 -37.84805556)
67,http://www.wikidata.org/entity/Q17514127,Point(149.089 -35.3398),Canberra College,Point(149.089 -35.3398)


In [16]:
import geopandas as gpd
schools_gdf = gpd.GeoDataFrame(schools_df, geometry=gpd.GeoSeries.from_wkt(schools_df["location"]))
# schools_gdf.explore(
#      tooltip="schoolLabel", # show "BoroName" value in tooltip (on hover)
#      popup=True, # show all values in popup (on click)
#      tiles="CartoDB positron", # use "CartoDB positron" tiles
#      style_kwds=dict(color="black") # use black outline)
# )

# SMH
The below data comes from the sydney morning herald.

In [17]:

careers_csv = pd.read_csv("https://www.smh.com.au/interactive/2021/careers-before-politics/careers.csv")
inc_mat_csv = pd.read_csv("https://www.smh.com.au/interactive/2021/careers-before-politics/incMat.csv")
ministry_csv = pd.read_csv("https://www.smh.com.au/interactive/2021/careers-before-politics/Ministry.csv")

In [18]:
inc_mat_csv

Unnamed: 0,Member,Party,Sex,Start,Public,Home-schooled,Did not graduate,Non-government,International,Undergraduate,...,Lobbying or Activism,Farming,Academia,Judicial,Nonprofits,Arts,Religion,Local government,State or Territory government,Parliament
0,Anthony Albanese,Labor,M,True,False,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,True
1,John Alexander,Liberal,M,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
2,Katie Allen,Liberal,F,True,False,False,False,True,False,True,...,False,False,True,False,True,False,False,False,False,True
3,Anne Aly,Labor,F,True,False,False,False,True,False,True,...,False,False,True,False,False,False,False,False,False,True
4,Karen Andrews,Liberal,F,True,True,False,False,True,False,True,...,True,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,Jess Walsh,Labor,F,True,False,False,False,True,False,True,...,False,False,True,False,False,False,False,False,False,True
223,Larissa Waters,Greens,F,True,True,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,True
224,Murray Watt,Labor,M,True,True,False,False,False,False,True,...,False,False,False,True,False,False,False,False,True,True
225,Peter Whish-Wilson,Greens,M,True,True,False,False,True,False,True,...,False,False,True,False,False,False,False,False,False,True


In [19]:
careers_csv

Unnamed: 0,Member,Party,Chamber,Electorate,State,High School,Undergraduate,Postgraduate,Career,Government,Replied?,Sex,Birth Year
0,Anthony Albanese,Labor,House,Grayndler,NSW,Non-government,BEc (University of Sydney),,Business or Management,,,M,1963
1,John Alexander,Liberal,House,Bennelong,NSW,Non-government,,,"Sports, Business or Management, Media",,True,M,1951
2,Katie Allen,Liberal,House,Higgins,Vic,Non-government,"MBBS (Monash University), BMedSc (Monash Unive...","PhD (University of Melbourne), FRACP (Royal Au...","Science or Engineering, Medicine and Health, N...",,True,F,1966
3,Anne Aly,Labor,House,Cowan,WA,Non-government,BA (American University of Cairo),"MEd (Edith Cowan University), PhD (Edith Cowan...","Public Service, Academia",,True,F,1967
4,Karen Andrews,Liberal,House,McPherson,Qld,"Non-government, Public",BEng (Queensland University of Technology),,"Science or Engineering, Lobbying or Activism, ...",,,F,1960
...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,Jess Walsh,Labor,Senate,Vic,Vic,Non-government,BA (University of Melbourne),"MA (University of Southern California), PhD (U...","Academia, Unions",,,F,1971
223,Larissa Waters,Greens,Senate,Qld,Qld,Public,"BSc (Griffith University), LLB (Griffith Unive...",,"Public Service, Private Law, Nonprofits",,True,F,1977
224,Murray Watt,Labor,Senate,Qld,Qld,Public,"BCom (University of Queensland), LLB (Universi...",,"Private Law, Judicial, Public Service, Blue-co...",State or Territory government,True,M,1973
225,Peter Whish-Wilson,Greens,Senate,Tas,Tas,"Non-government, Public",BA (University of New South Wales),MEc (University of Western Australia),"Business or Management, Academia, Blue-collar ...",,True,M,1968


In [20]:
ministry_csv

Unnamed: 0,Member,Type,Government
0,Scott Morrison,Cabinet,True
1,Barnaby Joyce,Cabinet,True
2,Josh Frydenberg,Cabinet,True
3,David Littleproud,Cabinet,True
4,Simon Birmingham,Cabinet,True
...,...,...,...
81,Ged Kearney,Parliamentary Secretaries,False
82,Josh Wilson,Parliamentary Secretaries,False
83,Kimberley Kitching,Parliamentary Secretaries,False
84,Louise Pratt,Parliamentary Secretaries,False


In [21]:
ministers_df

Unnamed: 0,Wiki Link,schoolLink,School Name,start,district_link,group,Member,Party,District,is_senator,is_representative
0,http://www.wikidata.org/entity/Q4932983,http://www.wikidata.org/entity/Q866012,University of Queensland,1993-03-13T00:00:00Z,http://www.wikidata.org/entity/Q2973687,http://www.wikidata.org/entity/Q6378340,Bob Katter,Katter's Australian Party,Kennedy,False,True
1,http://www.wikidata.org/entity/Q335697,http://www.wikidata.org/entity/Q487556,University of Sydney,1996-03-02T00:00:00Z,http://www.wikidata.org/entity/Q1079489,http://www.wikidata.org/entity/Q216082,Anthony Albanese,Australian Labor Party,Grayndler,False,True
2,http://www.wikidata.org/entity/Q335697,http://www.wikidata.org/entity/Q7594250,St Mary's Cathedral College,1996-03-02T00:00:00Z,http://www.wikidata.org/entity/Q1079489,http://www.wikidata.org/entity/Q216082,Anthony Albanese,Australian Labor Party,Grayndler,False,True
3,http://www.wikidata.org/entity/Q335697,http://www.wikidata.org/entity/Q7896374,University of Sydney Business School,1996-03-02T00:00:00Z,http://www.wikidata.org/entity/Q1079489,http://www.wikidata.org/entity/Q216082,Anthony Albanese,Australian Labor Party,Grayndler,False,True
4,http://www.wikidata.org/entity/Q7684036,http://www.wikidata.org/entity/Q741082,Macquarie University,1998-10-03T00:00:00Z,http://www.wikidata.org/entity/Q1075639,http://www.wikidata.org/entity/Q216082,Tanya Plibersek,Australian Labor Party,Sydney,False,True
...,...,...,...,...,...,...,...,...,...,...,...
114,http://www.wikidata.org/entity/Q112152550,,,2022-07-01T00:00:00Z,http://www.wikidata.org/entity/Q56649104,http://www.wikidata.org/entity/Q216082,Fatima Payman,Australian Labor Party,Western Australia,True,False
115,http://www.wikidata.org/entity/Q112581963,,,2022-07-01T00:00:00Z,http://www.wikidata.org/entity/Q56649110,http://www.wikidata.org/entity/Q1065320,Kerrynne Liddle,Coalition,South Australia,True,False
116,http://www.wikidata.org/entity/Q112642694,,,2022-07-01T00:00:00Z,http://www.wikidata.org/entity/Q56649112,http://www.wikidata.org/entity/Q15130081,Ralph Babet,United Australia Party,Victoria,True,False
117,http://www.wikidata.org/entity/Q16185446,http://www.wikidata.org/entity/Q1375146,Murdoch University,2022-07-26T00:00:00Z,,,Sue Lines,,,True,False


In [22]:
merged_data_inc_mat = ministers_df.merge(inc_mat_csv, on=["Member", "Member"], how="left")

In [23]:
# # Lets save out our datasets
import pathlib
import sqlite3
data_dir = pathlib.Path("..").resolve() / "data"


try:
    db_con = sqlite3.connect(str(data_dir / "ppm.sqlite"))
    # Export to sqllite
    ministers_df.to_sql("ministers", db_con, if_exists="replace")
    ministry_csv.to_sql("ministy", db_con, if_exists="replace")
    careers_csv.to_sql("careers", db_con, if_exists="replace")
    education_df.to_sql("education", db_con, if_exists="replace")
except Exception as exc:
    db_con.rollback()



# Manual Datafixing and cleang
We have some data, with a fair amount of missing gaps.
I am going to fill in the easiest missing gaps and tidy it up into more coherent datasets in [the cleanup notebook](cleanup.ipynb)

In [24]:
# Geopackage + sqlite + flatgeobuffs
schools_gdf.to_file(str(data_dir / "ppm.gpkg"), layer='schools', driver="GPKG")
schools_gdf.to_file(str(data_dir / "ppm.sqlite"), layer='schools', driver="SQLite")

In [25]:
import duckdb

# export to diuck db
# ddb_con = duckdb.connect(str(data_dir/ "ppm.db"))

# export to parquet

# export to arrow