In [1]:
from bs4 import BeautifulSoup as bs
import ast
import os
import os.path
from datetime import datetime
from enum import Enum
import re
import pandas as pd
import numpy as np

# Directory containing the scraped Hansard HTML pages; every regular file in it is parsed below.
RAW_HANSARD_DIR = 'scraped_content'

In [2]:
class ReportSection(Enum):
    """Section of the parliamentary report that a question was taken from.

    Each member's value is the full human-readable title of the section.
    """

    # questions that received a written answer
    WRITTEN = 'Written Answers to Questions'

    # questions filed for oral answer that were not reached during question time
    WRITTEN_NA = 'Written Answers to Questions for Oral Answer Not Answered by End of Question Time'

    # questions that were answered orally
    ORAL = 'Oral Answers to Questions'

**from html files, get question paragraphs and report metadata**

We assume that every PQ paragraph begins with its question number: `#.` in non-SPRS files and `#` in SPRS files. Follow-up questions are ignored: we are only interested in mapping MPs to topics, and a follow-up question always comes from the same MP and is on the same topic.

In [3]:
# Every parliamentary question (pq) paragraph found, stored as a tuple of
# (paragraph text without its leading number, sitting date, parliament number, report section).
pq_paras = []

for file in os.listdir(RAW_HANSARD_DIR):
    filepath = os.path.join(RAW_HANSARD_DIR, file)
    if not os.path.isfile(filepath):
        continue

    # the files don't have consistent html formatting, but they fall into two major groups,
    # told apart by whether the filename starts with 'sprs'.
    is_sprs = file.startswith('sprs')

    with open(filepath, 'r') as f:
        soup = bs(f, 'html.parser')

    if is_sprs:
        # extract metadata from the table at the top of the page
        topic_table = soup.find('table', {'class': 'topic'})
        topic_text = topic_table.get_text()

        # consume the text label by label: first the parliament number...
        parl_no_label_span = re.match(r'Parliament No:', topic_text).span()
        topic_text = topic_text[parl_no_label_span[1]:]
        parl_no = int(re.match(r'\d+', topic_text).group())

        # ...then the sitting date (dd-mm-yyyy)
        sit_date_label_span = re.match(r'.*Sitting Date:', topic_text).span()
        topic_text = topic_text[sit_date_label_span[1]:]
        sit_date = datetime.strptime(re.match(r'\d+-\d+-\d+', topic_text).group(), '%d-%m-%Y')

        # extract report section via filename.
        # 'written-answer-na' has to be tested before 'written-answer', which is a prefix of it.
        if 'oral-answer' in file:
            section = ReportSection.ORAL
        elif 'written-answer-na' in file:
            section = ReportSection.WRITTEN_NA
        elif 'written-answer' in file:
            section = ReportSection.WRITTEN
        else:
            # raising a plain string is itself a TypeError in Python 3, so raise a real exception
            raise ValueError('unidentified section (sprs)')

        # extract paragraphs from report in the form of rows of table
        report_table = soup.find('div', {'class': 'reportTable'})
        report_table_rows = report_table.findChildren()

        # find out which rows correspond to pqs and which rows don't.
        for row in report_table_rows:
            paragraph = row.get_text()
            paragraph = paragraph.replace('\xa0', ' ')
            if not re.match(r'\d+.{8,75} asked the .+', paragraph):
                continue
            # drop the leading question number
            paragraph_no_num = paragraph[
                re.search(r'\d+', paragraph).span()[1]:]
            pq_paras.append((paragraph_no_num.strip(), sit_date, parl_no, section))
    else:
        # extract metadata from meta elements directly
        sit_date_str = soup.find('meta', {'name': 'Sit_Date'})['content']
        parl_no_str = soup.find('meta', {'name': 'Parl_No'})['content']
        section_str = soup.find('meta', {'name': 'Sect_Name'})['content']

        sit_date = datetime.strptime(sit_date_str, '%Y-%m-%d')
        parl_no = int(parl_no_str)
        if 'NOT REACHED' in section_str:
            section = ReportSection.WRITTEN_NA
        elif 'ORAL' in section_str:
            section = ReportSection.ORAL
        elif 'WRITTEN' in section_str:
            section = ReportSection.WRITTEN
        else:
            # raising a plain string is itself a TypeError in Python 3, so raise a real exception
            raise ValueError('unidentified section (non sprs)')

        # extract paragraphs from table in the form of raw text (since each paragraph doesn't have its own element)
        report_table = soup.find('div', {'class': 'reportTable'})
        raw_text = report_table.get_text()

        # look for things like 1. 2. etc
        first_number_dot_occurrence = re.search(r'\d+\. .{8,75} asked the ', raw_text)

        if not first_number_dot_occurrence:
            continue

        # cut off everything before the actual body of the text
        raw_text = raw_text[first_number_dot_occurrence.span()[0]:]

        # formatting stuff: paragraphs are separated by runs of four non-breaking spaces
        raw_text = raw_text.replace('\n\n', ' ')
        raw_paras = raw_text.split('\xa0\xa0\xa0\xa0')

        new_pq_paras = []
        for raw_para in raw_paras:
            # keep only paragraphs that are pqs.
            if not re.match(r'\d+\. .{8,75} asked the ', raw_para):
                continue
            # remove column indicators and extra spaces
            cleaned_para = re.sub(r'Column: \d+', '', raw_para.replace('  ', ' ').strip())
            # remove index, add sitting date, parl no, section
            para_no_num = cleaned_para[re.search(r'\d+\.', cleaned_para).span()[1]:].strip()
            new_pq_paras.append((para_no_num, sit_date, parl_no, section))

        pq_paras += new_pq_paras


**for merging with mps.csv**

In [4]:
# Party and Parliaments are stored in the csv as Python literals written out as text,
# so parse them back into real Python objects right after loading.
mp_df = pd.read_csv('mps.csv').assign(
    Party=lambda frame: frame.Party.apply(ast.literal_eval),
    Parliaments=lambda frame: frame.Parliaments.apply(ast.literal_eval),
)

In [5]:
# Preview the MP table (a notebook cell displays the value of its last expression).
mp_df

Unnamed: 0,Name,Party,Parliaments
0,A Nithiah Nandan,[Nominated Member of Parliament],"(10,)"
1,A. Rahim Ishak,[People's Action Party],"(1, 2, 3, 4, 5)"
2,A.P. Rajah,[Independent Singapore Party Alliance],"(0, 1)"
3,Abbas Abu Amin,[People's Action Party],"(5, 6, 7)"
4,Abdul Aziz Karim,[People's Action Party],"(2, 3)"
...,...,...,...
447,Zainul Abidin Rasheed,[People's Action Party],"(9, 10, 11)"
448,Zaqy Mohamad,[People's Action Party],"(11, 12, 13, 14)"
449,Zhulkarnain Abdul Rahim,[People's Action Party],"(14,)"
450,Zulkifli Bin Baharudin,[Nominated Member of Parliament],"(9,)"


In [6]:
# Lookup table: lower-cased MP name -> (name as written in the csv, party, parliaments).
mps = {
    mp_name.lower(): (mp_name, mp_party, mp_parliaments)
    for mp_name, mp_party, mp_parliaments in zip(mp_df.Name, mp_df.Party, mp_df.Parliaments)
}

In [7]:
# for matching honorific+name in report to actual mp data.
# cannot simply remove honorific as the programmer doesn't have an exhaustive list
# of honorifics, and some are quite rare in everyday use (e.g. Inche Rahamat Bin Kenap).
def honorific_name_to_mp_data(honorific_name):
    """Return the ``mps`` entry for a name written with an honorific in front.

    The name is lower-cased, a known honorific is stripped from the front and the
    remainder is looked up in ``mps``. If that fails (unknown honorific, typo,
    extra words, ...) the key of ``mps`` with the smallest Levenshtein distance to
    the whole lower-cased input is used instead, and the match is printed so it
    can be checked by eye.

    Args:
        honorific_name: name as it appears in the report, e.g. ``'Mr Murali Pillai'``.

    Returns:
        The value stored in ``mps`` for the matched MP.

    Raises:
        ValueError: if the fallback is needed and ``mps`` is empty.
    """
    honorific_name = honorific_name.lower()

    # try the easy way first (find and remove honorific).
    # Alternatives are tried left to right, so longer honorifics must come before
    # their own prefixes ('mrs' before 'mr', 'assoc prof dr' before 'assoc prof');
    # otherwise the shorter one wins and part of the honorific is left in the name.
    # The honorific must be followed by a dot and/or whitespace so that it is not
    # cut out of the start of an actual name.
    honorific_match = re.match(
        r'(?:assoc prof dr|assoc prof|prof|mrs|mr|ms|mdm|dr|inche)(?:\.\s*|\s+)',
        honorific_name)
    if honorific_match:
        name = honorific_name[honorific_match.end():].strip()
        if name in mps:
            return mps[name]

    # the hard way (levenshtein): take the closest known name; on ties min() keeps
    # the first one encountered.
    closest_name = min(mps, key=lambda mp_name: levenshtein(mp_name, honorific_name))
    print(f'levenshtein matched {honorific_name} to {closest_name}')
    return mps[closest_name]


# borrowed from: https://blog.paperspace.com/implementing-levenshtein-distance-word-autocomplete-autocorrect/
# we use levenshtein as it helps to protect against typos too, like the "asked asked" in:
# https://sprs.parl.gov.sg/search/sprs3topic?reportid=oral-answer-2822
def levenshtein(token1, token2):
    """Return the Levenshtein (edit) distance between ``token1`` and ``token2``.

    A table with one row per prefix of ``token1`` and one column per prefix of
    ``token2`` is filled in; cell ``[i][j]`` holds the distance between the
    first ``i`` characters of ``token1`` and the first ``j`` of ``token2``.
    The result is read from the numpy table, so it is a numpy float.
    """
    n_rows = len(token1) + 1
    n_cols = len(token2) + 1

    distances = np.zeros((n_rows, n_cols))
    # turning a prefix into the empty string costs one edit per character
    distances[:, 0] = np.arange(n_rows)
    distances[0, :] = np.arange(n_cols)

    for i, char1 in enumerate(token1, start=1):
        for j, char2 in enumerate(token2, start=1):
            if char1 == char2:
                # same character: nothing to pay on top of the shorter prefixes
                distances[i][j] = distances[i - 1][j - 1]
            else:
                # one edit on top of the cheapest neighbouring cell
                distances[i][j] = 1 + min(
                    distances[i][j - 1],
                    distances[i - 1][j],
                    distances[i - 1][j - 1])

    return distances[n_rows - 1][n_cols - 1]

In [8]:
# Sanity check: a misspelled name with an unexpected honorific should still resolve to the right MP.
honorific_name_to_mp_data('Prof Lee Shienloong')

levenshtein matched prof lee shienloong to lee hsien loong


('Lee Hsien Loong',
 ["People's Action Party"],
 (6, 7, 8, 9, 10, 11, 12, 13, 14))

**from question paragraphs, get askers, askees, questions**

notes regarding minister titles:
* Minister for Culture, Community and Youth is the only minister title with a comma
* no questions were ever directed to the Minister Mentor

In [9]:
# Building blocks of the regex that recognises the minister a question is addressed to.
cap_word = r'[A-Z][a-z]+'                      # one capitalised word
cap_words = f'({cap_word})( {cap_word})*'      # one or more capitalised words separated by spaces
mccy = 'Minister for Culture, Community and Youth'  # special-cased because of the comma
# "[Xxx ]Minister for [the ]Xxx[ and [the ]Xxx]"; the optional "the" right after "for"
# additionally accepts titles of the form "Minister for the Xxx".
minister_for_something = f'({cap_words} )?Minister for (the )?{cap_words}( and (the )?{cap_words})?'
# "Xxx Minister"
something_minister = f'{cap_words} Minister'
minister_regex = re.compile(f'(({mccy})|({minister_for_something})|({something_minister}))')

# One row per question:
# [asker, asker_party, asker_parls, askee, question, sitting date, parliament number, section]
pqs = []

for pq_para in pq_paras:
    para = pq_para[0]
    # "<honorific + name> asked the <minister title> <question>"
    asker_honorific_name, para = para.split(' asked the ', 1)
    minister_match = minister_regex.search(para)
    if minister_match is None:
        # fail with the offending text rather than an AttributeError on None
        raise ValueError(f'could not find a minister title in: {para[:100]!r}')
    # everything up to the end of the minister title is the askee, the rest is the question
    askee_end = minister_match.end()
    askee = para[:askee_end]
    question = para[askee_end:].strip()
    asker, asker_party, asker_parls = honorific_name_to_mp_data(asker_honorific_name)
    pqs.append([asker, asker_party, asker_parls, askee, question, *pq_para[1:]])


levenshtein matched mr zhulkarnain abdul rahim asked to zhulkarnain abdul rahim
levenshtein matched mr murali pillai to murali pillai sc
