This notebook validates our data: it checks that the data is clean and that every field is in the correct format.

In [1]:
# Compare parliament 46 to smh careers
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Read the database settings from the .env file two directories above this notebook.
load_dotenv("../../.env")

# Build the connection URL from its components rather than an f-string so that
# special characters in the password (such as "@", ":" or "/") are escaped.
# os.environ[...] raises KeyError when a variable is missing, instead of
# silently placing the literal text "None" into the URL.
engine = create_engine(
    URL.create(
        drivername="postgresql+psycopg",
        username=os.environ["DATABASE_USERNAME"],
        password=os.environ["DATABASE_PASSWORD"],
        host="localhost",
        port=5432,
        database=os.environ["DATABASE_NAME"],
    )
)

In [126]:
# Load both sides of the comparison: high_school in members vs "high school" in smh_careers.
members, smh_members = (
    pd.read_sql(f"SELECT * FROM {table_name}", engine)
    for table_name in ("members", "smh_careers")
)
members.head()

Unnamed: 0,id,orig_id,orig_table,member,district,is_senator,is_representative,graduated,start,wiki_link,dob,mp_id,party_id,house,high_school,preferred_name,aph_name
0,49,24,47,Anne Stanley,Werriwa,False,True,True,2016-07-02,http://www.wikidata.org/entity/Q25768264,1961-10-06,265990,11,house,Public,,Anne Stanley
1,174,178,47,Rebekha Sharkie,Mayo,False,True,True,2018-07-28,http://www.wikidata.org/entity/Q25324881,1972-08-24,265980,17,house,,,Rebekha Sharkie
2,62,156,47,Monique Ryan,Kooyong,False,True,True,2022-05-21,http://www.wikidata.org/entity/Q42696287,1967-01-20,297660,16,house,Non-government,,Monique Ryan
3,65,112,47,Kerrynne Liddle,South Australia,True,False,True,2022-07-01,http://www.wikidata.org/entity/Q112581963,1967-10-07,300644,42,senate,Public,,Kerrynne Liddle
4,185,91,47,Jenny McAllister,New South Wales,True,False,True,2015-05-06,http://www.wikidata.org/entity/Q16732210,1973-07-04,121628,11,senate,Public,Jenny Mcallister,Jennifer Mcallister


In [23]:
# Preview the first SMH careers rows; "high school" is the column compared against members.high_school.
smh_members.head()

Unnamed: 0,fid,rowid,index,member,party,chamber,electorate,state,high school,undergraduate,postgraduate,career,government,replied?,sex,birth year
0,29,29,28,Julie Collins,Labor,House,Franklin,Tas,,,,Public Service,,,F,1971
1,48,48,47,Steve Georganas,Labor,House,Adelaide,SA,Public,,,"Blue-collar or service, Business or Management",,,M,1959
2,82,82,81,Nola Marino,Liberal,House,Forrest,WA,Public,,,"Blue-collar or service, Business or Management",,,F,1967
3,136,136,135,Bert van Manen,Liberal,House,Forde,Qld,Public,,,Business or Management,,,M,1954
4,162,162,161,Kim Carr,Labor,Senate,Vic,Vic,Public,BA (University of Melbourne),MA (University of Melbourne),Education,,,M,1955


In [40]:
# Non-null count per column, showing how complete each SMH field is.
smh_members.count()

fid              227
rowid            227
index            227
member           227
party            227
chamber          227
electorate       227
state            227
high school      223
undergraduate    173
postgraduate      67
career           220
government        51
replied?         125
sex              227
birth year       227
dtype: int64

In [128]:
# Match SMH careers rows to members under each name a member may be listed by,
# then show the members whose high-school value differs between the two sources.
smh_members["member"] = smh_members["member"].str.strip()

# One inner join per candidate name column in members. Joining on "member" keeps a
# single "member" column; joining on preferred_name / aph_name yields
# "member_members" and "member_smh" instead (both frames have a "member" column).
combined_init, combined_alias, combined_aph = (
    pd.merge(
        members,
        smh_members,
        how="inner",
        left_on=name_column,
        right_on="member",
        suffixes=("_members", "_smh"),
    )
    for name_column in ("member", "preferred_name", "aph_name")
)
combined = pd.concat([combined_init, combined_alias, combined_aph])

# Rows from the alias/aph joins have no "member" value after the concat, so fill it
# from the SMH name. Assign the result back: a chained
# `combined.member.fillna(..., inplace=True)` acts on a temporary column object and
# does not update `combined` under pandas Copy-on-Write.
combined["member"] = combined["member"].fillna(combined["member_smh"])
combined = combined.drop_duplicates(["member"])

# `!=` treats NaN as different from NaN, so exclude rows where BOTH sources are
# missing a value; those are gaps, not disagreements.
both_missing = combined["high_school"].isna() & combined["high school"].isna()
combined[(combined["high_school"] != combined["high school"]) & ~both_missing]


Unnamed: 0,id,orig_id,orig_table,member,district,is_senator,is_representative,graduated,start,wiki_link,...,high school,undergraduate,postgraduate,career,government,replied?,sex,birth year,member_members,member_smh
1,174,178,47,Rebekha Sharkie,Mayo,False,True,True,2018-07-28,http://www.wikidata.org/entity/Q25324881,...,Non-government,BA (Flinders University),,"Private Law, Nonprofits, Public Service",,1.0,F,1975,,
2,185,91,47,Jenny McAllister,New South Wales,True,False,True,2015-05-06,http://www.wikidata.org/entity/Q16732210,...,"Non-government, Public",BA (University of Sydney),,"Public Service, Business or Management",,,F,1982,,
10,86,3,47,Alex Antic,South Australia,True,False,True,2019-07-01,http://www.wikidata.org/entity/Q63520981,...,Non-government,"BA (University of Adelaide), LLB (University o...",,Private Law,Local government,1.0,M,1974,,
12,71,82,47,Jacqui Lambie,Tasmania,True,False,False,2019-07-01,http://www.wikidata.org/entity/Q16731201,...,,,,Military,,1.0,F,1971,,
27,14,33,47,Bob Katter,Kennedy,False,True,True,1993-03-13,http://www.wikidata.org/entity/Q4932983,...,Non-government,,,"Blue-collar or service, Business or Management...",State or Territory government,,M,1945,,
32,133,170,47,Perin Davey,New South Wales,True,False,True,2019-07-01,http://www.wikidata.org/entity/Q64685099,...,"Non-government, Public",,,"Media, Business or Management, Military",,1.0,F,1972,,
33,123,105,47,Karen Andrews,McPherson,False,True,True,2010-08-21,http://www.wikidata.org/entity/Q6369482,...,"Non-government, Public",BEng (Queensland University of Technology),,"Science or Engineering, Lobbying or Activism, ...",,,F,1960,,
35,64,9,46,Kim Carr,Victoria,True,False,True,1993-04-28,http://www.wikidata.org/entity/Q1639334,...,Public,BA (University of Melbourne),MA (University of Melbourne),Education,,,M,1955,,
36,278,42,47,Catryna Bilyk,Tasmania,True,False,True,2008-07-01,http://www.wikidata.org/entity/Q5053981,...,Public,,,"Blue-collar or service, Business or Management...",,1.0,F,1959,,
38,34,137,47,Mark Dreyfus,Isaacs,False,True,True,2007-11-24,http://www.wikidata.org/entity/Q6767411,...,Non-government,"BA (University of Melbourne), LLB (University ...",,"Nonprofits, Private Law",,1.0,M,1956,,


In [26]:
# Display the de-duplicated members / SMH careers matches built in the merge cell.
combined

Unnamed: 0,id,orig_id,orig_table,member,district,is_senator,is_representative,graduated,start,wiki_link,...,high school,undergraduate,postgraduate,career,government,replied?,sex,birth year,member_members,member_smh
0,54,27,47,Anthony Albanese,Grayndler,False,True,True,1996-03-02,http://www.wikidata.org/entity/Q335697,...,Non-government,BEc (University of Sydney),,Business or Management,,,M,1963,,
1,187,2,46,John Alexander,Bennelong,False,True,True,2010-08-21,http://www.wikidata.org/entity/Q364407,...,Non-government,,,"Sports, Business or Management, Media",,1.0,M,1951,,
2,52,3,46,Katie Allen,Higgins,False,True,True,2019-05-18,http://www.wikidata.org/entity/Q38589865,...,Non-government,"MBBS (Monash University), BMedSc (Monash Unive...","PhD (University of Melbourne), FRACP (Royal Au...","Science or Engineering, Medicine and Health, N...",,1.0,F,1966,,
3,110,43,47,Chris Bowen,McMahon,False,True,True,2010-08-21,http://www.wikidata.org/entity/Q1077015,...,Public,BEc (University of Sydney),MIR (Griffith University),Unions,Local government,1.0,M,1973,,
4,210,9,47,Andrew Bragg,New South Wales,True,False,True,2019-07-01,http://www.wikidata.org/entity/Q64152252,...,Non-government,BA (Australian National University),"MFinReg (Macquarie University), MBus (Australi...","Business or Management, Lobbying or Activism",,,M,1984,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,265,56,46,Ken Wyatt,Hasluck,False,True,True,2010-08-21,http://www.wikidata.org/entity/None,...,"Non-government, Public",BEd (Churchlands College of Advanced Education),,"Education, Public Service, Business or Management",,,M,1952,,
208,242,212,47,Terry Young,Longman,False,True,False,2019-05-18,http://www.wikidata.org/entity/Q64216512,...,,,,"Blue-collar or service, Business or Management",,,M,1968,,
209,230,218,47,Tony Zappia,Makin,False,True,True,2007-11-24,http://www.wikidata.org/entity/Q7823721,...,Public,,,Business or Management,Local government,,M,1952,,
210,63,57,46,Trent Zimmerman,North Sydney,False,True,True,2015-12-05,http://www.wikidata.org/entity/Q21680762,...,Non-government,,,Lobbying or Activism,Local government,1.0,M,1968,,


In [26]:
# Next step: add the member id to members_wiki, then use the name and wiki link to match each member to their schools.
# The matched pairs are what we write out as the new members_education table.



In [85]:
# First, tidy the APH "SecondarySchool" free-text field into one row per (PHID, school)
# and keep only entries that contain more than one word.
secondary_school = pd.read_sql('''SELECT "PHID", "SecondarySchool" FROM aph_parliamentarians''', engine)
secondary_school["SecondarySchool"] = secondary_school["SecondarySchool"].str.strip()

# .str.len() is NaN for nulls and NaN > 0 is False, so this one mask drops both null
# and empty values. (Calling len() through apply raises TypeError on a null.)
has_school = secondary_school["SecondarySchool"].str.len() > 0
secondary_school = secondary_school.loc[has_school].set_index("PHID")

# A single field may list several schools separated by "/" or ",".
secondary_school["SecondarySchool"] = secondary_school["SecondarySchool"].str.split("/|,", regex=True)

schools = secondary_school.explode("SecondarySchool").reset_index()
schools["SecondarySchool"] = schools["SecondarySchool"].str.strip()

# Remove entries that are only one word: after stripping, a value has more than one
# space-separated word exactly when it contains a space.
is_multi_word = schools["SecondarySchool"].str.contains(" ", regex=False, na=False)
schools = schools.loc[is_multi_word].rename(columns=str.lower)

# Replaces the table on every run; the DataFrame index is written as the "id" column.
schools.to_sql("members_secondary_school", engine, if_exists="replace", index=True, index_label="id")

-1

In [140]:
# now lets add the member id to members wiki
# Link value that ends in "None" instead of a Wikidata id; treated as a missing link.
null_link = "http://www.wikidata.org/entity/None"

members = pd.read_sql('''SELECT "id", "member", "wiki_link" FROM members''', engine)
members = members[members["wiki_link"] != null_link]
wiki_members = pd.read_sql('''SELECT id, "member", "wikidata_entity", school_name, school_link from  members_wiki''', engine)
education_links = pd.read_sql('''SELECT "id" as education_id, "school_name", "school_link" FROM education''', engine)

# Keep only education rows with a usable link. .str.len() is NaN for nulls and
# NaN > 0 is False, so this single mask rejects null, empty and placeholder links
# without and-ing masks that have different indexes.
education_links = education_links[
    education_links["school_link"].str.len().gt(0)
    & education_links["school_link"].ne(null_link)
]

# members -> members_wiki on the Wikidata entity URL, then -> education on the school URL.
# Overlapping columns are suffixed, e.g. "id_members" / "id_wiki".
wiki_merge = pd.merge(members, wiki_members, how="inner", left_on="wiki_link", right_on="wikidata_entity", suffixes=("_members", "_wiki"))
# NOTE(review): saved output elsewhere in this notebook shows an education.school_link in the
# "https://www.wikidata.org/wiki/Q..." form; such rows can only match here if members_wiki
# stores the identical string. Confirm whether the links should be normalised first.
education_merge = pd.merge(wiki_merge, education_links, how="inner", left_on="school_link", right_on="school_link", suffixes=("_wiki", "_education"))
# Todo fix tim wilson now we have fixed his wiki link


Unnamed: 0,id,member,wikidata_entity,school_name,school_link
526,527,Frank Timson,http://www.wikidata.org/entity/Q15972597,Caulfield Grammar School,http://www.wikidata.org/entity/Q5054516
527,528,Frank Timson,http://www.wikidata.org/entity/Q15972597,Wesley College,http://www.wikidata.org/entity/Q7983897
1132,1133,Tim Fischer,http://www.wikidata.org/entity/Q7803494,Xavier College,http://www.wikidata.org/entity/Q8043217
1716,1717,Tim Watts,http://www.wikidata.org/entity/Q16222127,London School of Economics and Political Science,http://www.wikidata.org/entity/Q174570
1720,1721,Tim Watts,http://www.wikidata.org/entity/Q16222127,Monash University,http://www.wikidata.org/entity/Q598841
1726,1727,Tim Watts,http://www.wikidata.org/entity/Q16222127,Bond University,http://www.wikidata.org/entity/Q892188
1734,1735,Tim Watts,http://www.wikidata.org/entity/Q16222127,Centenary Heights State High School,http://www.wikidata.org/entity/Q5059079
1834,1835,Tim Hammond,http://www.wikidata.org/entity/Q25856253,Murdoch University,http://www.wikidata.org/entity/Q1375146
1835,1836,Tim Hammond,http://www.wikidata.org/entity/Q25856253,University of Western Australia,http://www.wikidata.org/entity/Q1517021
1860,1861,Tim Wilson,http://www.wikidata.org/entity/Q16205919,Monash University,http://www.wikidata.org/entity/Q598841


In [142]:
# Check which wiki_link the members table holds for Tim Wilson (see the TODO about his wiki link).
members[members.member.str.contains("Tim Wilson")]

Unnamed: 0,id,member,wiki_link
148,159,Tim Wilson,http://www.wikidata.org/entity/Q7807613


In [141]:
# Same lookup in members_wiki, to compare its wikidata_entity with the wiki_link stored in members.
wiki_members[wiki_members.member.str.contains("Tim Wilson")]

Unnamed: 0,id,member,wikidata_entity,school_name,school_link
1860,1861,Tim Wilson,http://www.wikidata.org/entity/Q16205919,Monash University,http://www.wikidata.org/entity/Q598841
1862,1863,Tim Wilson,http://www.wikidata.org/entity/Q16205919,Murdoch University,http://www.wikidata.org/entity/Q1375146
1864,1865,Tim Wilson,http://www.wikidata.org/entity/Q16205919,Peninsula Grammar,http://www.wikidata.org/entity/Q7756550


In [139]:
# List education rows whose school name contains "Pen"
# (presumably to confirm Peninsula Grammar has an education_id - verify).
education_links[education_links["school_name"].str.contains("Pen")]

Unnamed: 0,education_id,school_name,school_link
125,66,Penrith High School,http://www.wikidata.org/entity/Q7164606
201,160,Peninsula Grammar,http://www.wikidata.org/entity/Q7756550


In [104]:
# Write the (member_id, education_id) pairs to the members_education table, replacing
# any existing table; the DataFrame index is stored as the "id" column.
# NOTE(review): later cells read `member_education` / `minister_id`, not
# `members_education` / `member_id` - confirm which table is the canonical one.
(
    education_merge
    .loc[:, ["id_members", "education_id"]]
    .rename(columns={"id_members": "member_id"})
    .to_sql(
        name="members_education",
        con=engine,
        if_exists="replace",
        index=True,
        index_label="id",
    )
)

Unnamed: 0,phid,secondaryschool
0,M3M,Narrabeen Boys High School
1,282986,Melbourne Girls Grammar School
2,282986,Albury High School
3,13050,Moorebank High School
4,13050,Meriden Girls School
...,...,...
340,E0F,Ferntree Gully Technical School
341,M3A,Corrigin District High School and Hampton Seni...
342,201906,Dakabin State High School
343,HWB,Enfield High School


In [108]:
# Every education record linked to a member, together with that member's mp_id ...
education = pd.read_sql(
    sql='''SELECT education_id, minister_id, school_name, school_link, member, mp_id  FROM education JOIN member_education me on education.id = me.education_id JOIN members m on me.minister_id = m.id''',
    con=engine,
)
# ... paired with each APH secondary-school entry for the same person (mp_id == phid).
all_merged = education.merge(
    schools,
    how="inner",
    left_on="mp_id",
    right_on="phid",
    suffixes=("_education", "_secondary_school"),
)
all_merged


Unnamed: 0,education_id,minister_id,school_name,school_link,member,mp_id,phid,secondaryschool
0,338,237,Monash University Faculty of Law,http://www.wikidata.org/entity/Q6898254,Adam Bandt,M3C,M3C,Hollywood High School
1,335,237,Murdoch University,http://www.wikidata.org/entity/Q1375146,Adam Bandt,M3C,M3C,Hollywood High School
2,249,237,Monash University,http://www.wikidata.org/entity/Q598841,Adam Bandt,M3C,M3C,Hollywood High School
3,129,237,Hollywood Senior High School,https://www.wikidata.org/wiki/Q16930409,Adam Bandt,M3C,M3C,Hollywood High School
4,134,280,University of Southern Queensland,http://www.wikidata.org/entity/Q978124,Alexander Gallacher,204953,204953,Darwin High School
...,...,...,...,...,...,...,...,...
894,248,228,University of South Australia,http://www.wikidata.org/entity/Q15576,Zoe Daniel,008CH,008CH,Rosny College (Hobart)
895,24,228,Queechy High School (Launceston),,Zoe Daniel,008CH,008CH,Queechy High School (Launceston)
896,24,228,Queechy High School (Launceston),,Zoe Daniel,008CH,008CH,Rosny College (Hobart)
897,301,70,University of Melbourne,http://www.wikidata.org/entity/Q319078,Zoe McKenzie,124514,124514,Lauriston Girls School


In [122]:
from thefuzz import fuzz

def calc_dist(x):
    """Score how similar a row's two school names are.

    Args:
        x: A mapping-like row with "secondaryschool" and "school_name" entries.

    Returns:
        The result of ``fuzz.ratio(school_name, secondaryschool)`` when both
        values are strings, otherwise 0.
    """
    # Guard clause: anything that is not a string (e.g. None / NaN) scores 0.
    if not isinstance(x["secondaryschool"], str) or not isinstance(x["school_name"], str):
        return 0
    return fuzz.ratio(x["school_name"], x["secondaryschool"])


# Score every (education school, APH secondary school) pairing row by row.
all_merged["school_name_str_match"] = all_merged.apply(calc_dist, axis=1)
# Keep pairings scoring above 50, order best-first, and retain only the top-scoring
# education record for each (mp_id, secondaryschool).
(
    all_merged
    .loc[all_merged["school_name_str_match"].gt(50)]
    .sort_values(by=["school_name_str_match", "secondaryschool"], ascending=False)
    .drop_duplicates(subset=["mp_id", "secondaryschool"], keep="first")
)

Unnamed: 0,education_id,minister_id,school_name,school_link,member,mp_id,phid,secondaryschool,school_name_str_match
834,27,31,York District High School,,Tania Lawrence,299150,299150,York District High School,100
127,179,195,Xavier College,http://www.wikidata.org/entity/Q8043217,Bill Shorten,00ATG,00ATG,Xavier College,100
214,179,5,Xavier College,http://www.wikidata.org/entity/Q8043217,Dan Tehan,210911,210911,Xavier College,100
123,330,106,Wyong High School,,Ben Morton,265931,265931,Wyong High School,100
280,22,37,Wynyard High School,http://www.wikidata.org/entity/Q916063,Gavin Pearce,282306,282306,Wynyard High School,100
...,...,...,...,...,...,...,...,...,...
462,290,92,Canberra College,http://www.wikidata.org/entity/Q17514127,Katy Gallagher,ING,ING,Stirling College,56
637,133,62,Loreto Mandeville Hall,http://www.wikidata.org/entity/Q6680935,Monique Ryan,297660,297660,Loreto Convent,56
668,102,180,"St Joseph's College, Hunters Hill",http://www.wikidata.org/entity/Q3463907,Pat Conaghan,279991,279991,Hunters Hill,53
67,119,217,Royal Military College,http://www.wikidata.org/entity/Q2171074,Andrew Wilkie,C2T,C2T,McCarthy Catholic College,51


In [2]:
# check members not in member_education table
# NOT EXISTS is used instead of NOT IN: if the subquery of a NOT IN ever yields a
# NULL minister_id, the predicate is never true and the query returns no rows at all.
members = pd.read_sql(
    '''
    SELECT m.*
    FROM members AS m
    WHERE NOT EXISTS (
        SELECT 1 FROM member_education AS me WHERE me.minister_id = m.id
    )
    ''',
    engine,
)
members

Unnamed: 0,id,orig_id,orig_table,member,district,is_senator,is_representative,graduated,start,wiki_link,dob,mp_id,party_id,house,high_school,preferred_name,aph_name
0,90,106,47,Karen Grogan,South Australia,True,False,True,2021-09-21,http://www.wikidata.org/entity/Q108617920,1960-01-01,296331,11,senate,,,Karen Grogan
1,152,123,47,Llew O'Brien,Wide Bay,False,True,False,2016-07-02,http://www.wikidata.org/entity/Q25756234,1972-06-26,265991,42,house,,Llew O'Brien,Llewellyn O'Brien
2,175,219,47,Tracey Roberts,Pearce,False,True,True,2022-05-21,http://www.wikidata.org/entity/Q109850050,1960-01-01,157125,11,house,,,Tracey Roberts


In [None]:
# TODO: look up the correct ids for these members in the archive and fix them