run the other notebooks. should give two csv files. this one will combine them and clean up the data too.

In [1]:
from bs4 import BeautifulSoup as bs
import ast
import os
import os.path
from datetime import datetime
from enum import Enum
import re
import pandas as pd
import numpy as np


**for merging with mps.csv**

In [2]:
mp_df = pd.read_csv('mps.csv')
mp_df.Party = mp_df.Party.apply(ast.literal_eval)
mp_df.Parliaments = mp_df.Parliaments.apply(ast.literal_eval)
mps = dict(
    zip(mp_df.Name.apply(lambda x: x.replace('.', '').replace(',', '').lower()), # keys
    zip(mp_df.Name, mp_df.Party, mp_df.Parliaments))) # values
mp_names = list(mps.keys())

In [3]:
alr_matched = set() # honorific+names that have alr been matched to names so we don't spam the print
ministers_found = set() # minister titles that have alr been found (to be used for future searches in case of typos)

In [4]:
# for matching honorific+name in report to actual mp data.
# cannot simply remove honorific as the programmer doesn't have an exhaustive list
# of honorifics, and some are quite rare in everyday use (e.g. Inche Rahamat Bin Kenap).
def honorific_name_to_mp_data(honorific_name):
    honorific_name = honorific_name.replace('.','').replace(',','').lower().strip()
    
    # try the easy way first (find and remove honorific)
    honorific_match = re.match(r'(mr|mrs|ms|miss|mdm|dr|er dr|prof|assoc prof|er|asst prof|assoc prof dr|inche|encik)', honorific_name)
    if honorific_match:
        name = honorific_name[honorific_match.span()[1]+1:]
        if name in mps.keys():
            return mps[name]
        
        # seems quite common for them to write "asked" twice in the hansard proceedings
        last_asked = name.rfind(' asked')
        if last_asked and name[:last_asked] in mps.keys():
            return mps[name[:last_asked]]

        # slightly harder way (rearranging words)
        for mp_name in mp_names:
            mp_name_words = set(mp_name.split(' '))
            name_words = set(name.split(' '))
            if mp_name_words == name_words:
                if (honorific_name, mp_name) not in alr_matched:
                    print(f'rearranging matched {honorific_name} to {mp_name}')
                    alr_matched.add((honorific_name, mp_name))
                return mps[mp_name]
            
            # for omission of chinese name
            if len(mp_name_words) - len(name_words) <= 2 and len(name_words) >= 2 and name_words.issubset(mp_name_words):
                if (honorific_name, mp_name) not in alr_matched:
                    print(f'allowing omitted words in name matched {honorific_name} to {mp_name}')
                    alr_matched.add((honorific_name, mp_name))
                return mps[mp_name]
                
    digit_match = re.search('\d+', honorific_name)
    if digit_match:
        # names shldn't have digits
        honorific_name = honorific_name[digit_match.span()[1]:]
        return honorific_name_to_mp_data(honorific_name)
        
    # the hard way (levenshtein)
    min_levenshtein = 99999
    min_ind = -1
    for i in range(len(mp_names)):
        l_dist = levenshtein(mp_names[i], honorific_name)
        if l_dist < min_levenshtein:
            min_levenshtein = l_dist
            min_ind = i
            
    if (honorific_name, mp_names[min_ind]) not in alr_matched:
        print(f'levenshtein matched {honorific_name} to {mp_names[min_ind]}')
        alr_matched.add((honorific_name, mp_names[min_ind]))
    return mps[mp_names[min_ind]]


# borrowed from: https://blog.paperspace.com/implementing-levenshtein-distance-word-autocomplete-autocorrect/
# we use levenshtein as it helps to protect against typos too, like the "asked asked" in:
# https://sprs.parl.gov.sg/search/sprs3topic?reportid=oral-answer-2822
def levenshtein(token1, token2):
    distances = np.zeros((len(token1) + 1, len(token2) + 1))
    for t1 in range(len(token1) + 1):
        distances[t1][0] = t1
    for t2 in range(len(token2) + 1):
        distances[0][t2] = t2
        
    a = 0
    b = 0
    c = 0
    
    for t1 in range(1, len(token1) + 1):
        for t2 in range(1, len(token2) + 1):
            if (token1[t1-1] == token2[t2-1]):
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                a = distances[t1][t2 - 1]
                b = distances[t1 - 1][t2]
                c = distances[t1 - 1][t2 - 1]
                
                if (a <= b and a <= c):
                    distances[t1][t2] = a + 1
                elif (b <= a and b <= c):
                    distances[t1][t2] = b + 1
                else:
                    distances[t1][t2] = c + 1

    return distances[len(token1)][len(token2)]

we assume that all pqs are prefaced with #. (non sprs) or # (sprs). ignore follow up qns since we are only interested in mapping mps to topics, and the follow up qns will always be from the same mp and on the same topic.

notes regarding minister titles:
* Minister for Culture, Community and Youth is the only minister title with a comma
* no questions were ever directed to minister mentor

In [5]:
cap_word = r'[A-Z][a-z]+'
cap_words = f'({cap_word})( {cap_word})*'
mccy = 'Minister for Culture, Community and Youth'
minister_for_something = f'({cap_words} )?Minister (for|of) (the )?{cap_words}( and (the )?{cap_words})?'
something_minister = f'{cap_words} Minister'
minister_regex = re.compile(f'(({mccy})|({minister_for_something})|({something_minister}))')

def first_two_capitalized(words):
    return words[0][0].isupper() and words[1][0].isupper() and words[0][1] and words[0][1].islower()

def trim_off_non_pq_content_at_start(para):
    para_words = para.split(' ')
    if first_two_capitalized(para_words):
        return para
    
    # i assume honorific+name has at least two words capitalized and non-numbers
    while not first_two_capitalized(para_words):
        para_words = para_words[1:]
    
    return ' '.join(para_words)

def pq_para_to_pq(pq_para):
    para = pq_para[0]
    para = re.sub(r'Page:\s+\d+', '', para)
    para = re.sub(r'\s+', ' ', para)
    para = trim_off_non_pq_content_at_start(para) # sometimes we end up mistaking other numbers in the text as being the pq numbers. so we deal w that here.
    asker_honorific_name, para = para.split(' asked the ', 1)
    minister_match = re.search(minister_regex, para)
    if not minister_match:
        # report might've been in the wrong case; try to match to existing ministers
        minister_match = re.search(
            '(' + '|'.join(list(map(lambda s: s.lower().replace('for', '.{1,5}'), ministers_found))) + ')',
            para.lower()
        )
        min_levenshtein = 99999
        min_minister = None
        for existing_minister in ministers_found:
            dist = levenshtein(existing_minister, minister_match.group())
            if dist < min_levenshtein:
                min_levenshtein = dist
                min_minister = existing_minister
        
        print(f'found minister: {str(minister_match.group())}; matched to {min_minister}')
        askee = min_minister
    else:
        askee = para[:minister_match.span()[1]].replace(' of ', ' for ')
        ministers_found.add(askee)
    question = para[minister_match.span()[1]:].strip()
    if question[0] == ',':
        question = question[1:].strip()
    asker, asker_party, asker_parls = honorific_name_to_mp_data(asker_honorific_name.strip())
    return [asker, asker_party, asker_parls, askee, question, *pq_para[1:]]


In [6]:
pq_paras = pd.read_csv('pq_paras.csv').values.tolist()
pqs = []

In [7]:
try:
    for pq_para in pq_paras:
        pqs.append(pq_para_to_pq(pq_para))
except Exception as e:
    import pdb
    pdb.set_trace()

allowing omitted words in name matched mr murali pillai to murali pillai sc
allowing omitted words in name matched mr alex yam to alex yam ziming
allowing omitted words in name matched mr mohd fahmi aliman to mohd fahmi bin aliman
found minister: minister for transport; matched to Minister for Transport
levenshtein matched mr yee chia hsing ? to yee chia hsing
allowing omitted words in name matched mr david ong to david ong kim huat
levenshtein matched mrs lina chiam to lina chiam
levenshtein matched assoc prof dr muhammad faishal ibrahim to muhammad faishal ibrahim
found minister: minister or national development; matched to Minister for National Development
levenshtein matched mrs mildred tan to mildred tan
levenshtein matched mrs josephine teo to josephine teo
levenshtein matched mrs mildred tan: to mildred tan
allowing omitted words in name matched dr loo choon yong to loo choon yong dr


In [8]:
pq_df = pd.DataFrame(pqs, columns=['asker_name', 'asker_party', 'asker_parliaments', 'askee', 'question', 'sitting_date', 'parliament_no', 'report_section'])
pq_df

Unnamed: 0,asker_name,asker_party,asker_parliaments,askee,question,sitting_date,parliament_no,report_section
0,Pritam Singh,[Workers' Party],"(12, 13, 14)",Prime Minister,(a) what is the format of the Government's aft...,2022-05-09,14,Oral Answers to Questions
1,Seah Kian Peng,[People's Action Party],"(11, 12, 13, 14)",Minister for Law,(a) whether there were similar instances of ch...,2022-05-09,14,Oral Answers to Questions
2,He Ting Ru,[Workers' Party],"(14,)",Minister for Law,(a) whether any steps need to be taken to addr...,2022-05-09,14,Oral Answers to Questions
3,Hany Soh,[People's Action Party],"(14,)",Minister for Law,(a) whether the Singapore Institute of Legal E...,2022-05-09,14,Oral Answers to Questions
4,Dennis Tan Lip Fong,[Workers' Party],"(13, 14)",Minister for Transport,whether the fall in the volume of Singapore's ...,2022-05-09,14,Written Answers to Questions for Oral Answer N...
...,...,...,...,...,...,...,...,...
5294,Halimah Yacob,[People's Action Party],"(10, 11, 12, 13)",Minister for Trade and Industry,(a) how many households are currently in arrea...,2009-01-22 00:00:00,11,Oral Answers to Questions
5295,Ho Geok Choo,[People's Action Party],"(10, 11)",Minister for Transport,whether he will clarify on the exceeded budget...,2009-01-22 00:00:00,11,Oral Answers to Questions
5296,Penny Low,[People's Action Party],"(10, 11, 12)",Minister for Finance,what measures and tax incentives can be introd...,2009-01-22 00:00:00,11,Oral Answers to Questions
5297,Ong Ah Heng,[People's Action Party],"(9, 10, 11)",Minister for Education,in view of the single intake in January 2009 f...,2009-01-22 00:00:00,11,Oral Answers to Questions


In [9]:
pq_df.to_csv('pqs.csv', index=False)

In [10]:
assert all(pq_df.parliament_no < 15)
print(set(map(tuple, pq_df.asker_party)))
print()
print(set(pq_df.askee))

{("Singapore People's Party",), ("Workers' Party",), ('Nominated Member of Parliament',), ("People's Action Party",), ('Progress Singapore Party',), ('',), ('Singapore Democratic Alliance',)}

{'Deputy Prime Minister', 'Minister for Foreign Affairs', 'Minister for Culture, Community and Youth', 'Acting Minister for Social and Family Development', 'Minister for Information', 'Minister for Sustainability and the Environment', 'Minister for the Environment and Water Resources', 'Acting Minister for Culture', 'Minister for Law', 'Minister for Community Development', 'Minister for Transport', 'Minister for Defence', 'Minister for Trade and Industry', 'Acting Minister for Education', 'Acting Minister for Information', 'Minister for Finance', 'Minister for Health', 'Minister for Defence and Leader', 'Minister for Manpower', 'Minister for Social and Family Development', 'Senior Minister', 'Minister for Communications and Information', 'Prime Minister', 'Acting Minister for Community Developmen