# Overview

This notebook reads the raw text data from Hansard scrapings and combines it with MP data stored in CSV format, to produce a CSV file of the various parliamentary questions contained within the scrapings, as well as their askers and other relevant info. 

Run the two scraping notebooks first. The Hansard scraper gives you a directory of HTML files while the MP scraper gives you a CSV file. Then run this notebook to get another CSV file, `pqs.csv`. 

The Hansard data is a terrible mess. This code has to deal with the weird edge cases in the Hansard dataset so it is also a terrible mess. I left some comments describing some of these edge cases; you can ctrl+F for "edge case" to find them. The relevant file names of the HTML files which contain the edge cases are provided too. If you want to view the original pages in all their glory, you can perform the following steps: 
1. Take the file name and replace the first `_` with `?`
2. Remove the `.html` suffix
3. Prepend `https://sprs.parl.gov.sg/search/`
4. Then head to the URL that you get.

For example, `sprs3topic_reportid=budget-1904.html` becomes [https://sprs.parl.gov.sg/search/sprs3topic?reportid=budget-1904](https://sprs.parl.gov.sg/search/sprs3topic?reportid=budget-1904).

**Note: the CSV file you get from this is delimited by `|`, not by commas. If reading the CSV file with pandas, pass `sep='|'` (for example `pd.read_csv('pqs.csv', sep='|')`). If opening in Excel, follow the steps [here](https://support.affinity.co/hc/en-us/articles/360044453711-How-to-open-CSV-files-with-the-correct-delimiter-separator).**

# Setup and stuff

In [1]:
import ast
from datetime import datetime
from enum import Enum
import os
import os.path
from pdb import set_trace as st
import pickle
import re

from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd

In [2]:
# Load the MP roster produced by the MP scraper.
# `mps` maps a normalized name (lower-case, with '.' and ',' removed) to the
# tuple (name, party, parliaments) taken from the same CSV row.
mp_df = pd.read_csv('mps.csv')
# the Parliaments column is stored as the text of a Python literal; parse it back
mp_df.Parliaments = mp_df.Parliaments.apply(ast.literal_eval)
mps = {
    name.replace('.', '').replace(',', '').lower(): (name, party, parliaments)
    for name, party, parliaments in zip(mp_df.Name, mp_df.Party, mp_df.Parliaments)
}
# normalized names, in the same order as the keys of `mps`
mp_names = list(mps)

In [3]:
# (honorific_name, matched_mp_name) pairs that honorific_name_to_mp_data has already reported,
# kept so that the same fuzzy match is not printed over and over.
alr_matched = set()
# minister titles recognised so far by extract_first_ministers; used as the candidate list
# for its fallback searches when a later title is mis-cased or misspelt.
ministers_found = set()

In [4]:
# Lower-case honorifics that can precede a name. Longer alternatives are listed before the
# shorter ones they start with (e.g. "mrs" before "mr", "bg \[ns\]" before "bg"), because
# regex alternation takes the first alternative that matches.
honorific_regex = r'(mrs|mr|ms|miss|mdm|er dr|er|assoc prof dr|assoc prof|asst prof|prof|dr|inche|encik|bg \[ns\]|mg \[ns\]|lg \[ns\]|gen \[ns\]|col \[ns\]|ltc \[ns\]|bg|mg|lg|gen|col|ltc)'
# An honorific followed by a name, wrapped in round brackets, e.g. "(mr some name)".
# Raw f-string: the pattern is unchanged, but "\(" is no longer an invalid string escape.
honorific_bracket_regex = rf'\({honorific_regex} .+\)'
    
# for matching honorific+name in reports to the normalized mp names.
# the "quiet" variable just determines if we print output and throw errors. normally you'd want it to be False.
def honorific_name_to_mp_data(honorific_name, quiet=False):
    """Resolve an "honorific + name" string from a report to an entry of `mps`.

    Matching strategies are tried in this order:
      1. strip the honorific and look the name up directly;
      2. drop a trailing " asked..." and look the name up again;
      3. same set of words as an MP name, in any order;
      4. at least two words, all contained in an MP name that has at most two extra words;
      5. if the string contains digits, discard everything up to the end of the first
         run of digits and start over;
      6. closest MP name by Levenshtein distance.

    Args:
        honorific_name: text such as "Mr Some Name"; '.', ',', ':' and bracketed text are ignored.
        quiet: when True, nothing is printed and the "no letters" sanity assertion is skipped.

    Returns:
        The `mps` value for the matched MP, i.e. (name, party, parliaments).

    Raises:
        AssertionError: when not quiet and the Levenshtein fallback is reached with a
            string that contains no letters at all.
    """
    honorific_name = honorific_name.replace('.', '').replace(',', '').replace(':', '').lower().strip()
    # remove bracketed text, then strip again so that a name followed by a bracket
    # does not keep a trailing space that would defeat the exact lookup below
    honorific_name = re.sub(r'\(.+\)', '', honorific_name).strip()

    # try the easy way first (find and remove honorific)
    honorific_match = re.match(honorific_regex, honorific_name)
    if honorific_match:
        # +1 skips the space between honorific and name
        name = honorific_name[honorific_match.end() + 1:]
        if name in mps:
            return mps[name]

        # edge case: (sprs3topic_reportid=oral-answer-2822). "asked asked".
        # so this block attempts to remove any "asked" in the name.
        # str.rfind returns -1 when there is no match, so compare explicitly; a truthiness
        # test would treat -1 as "found" and look up the name minus its last character.
        last_asked = name.rfind(' asked')
        if last_asked != -1 and name[:last_asked] in mps:
            return mps[name[:last_asked]]

        # try rearranging the words in the name
        name_words = set(name.split(' '))
        for mp_name in mp_names:
            mp_name_words = set(mp_name.split(' '))

            # exact same set of words, just different order.
            # quite common for names of chinese ppl who also have english names; the english name can be either at the front or back
            if mp_name_words == name_words:
                if (honorific_name, mp_name) not in alr_matched:
                    if not quiet:
                        print(f'rearranging matched {honorific_name} to {mp_name}')
                    alr_matched.add((honorific_name, mp_name))
                return mps[mp_name]

            # one or two words omitted, but other words are the same. relevant for chinese names and "bin" in malay names.
            if len(mp_name_words) - len(name_words) <= 2 and len(name_words) >= 2 and name_words.issubset(mp_name_words):
                if (honorific_name, mp_name) not in alr_matched:
                    if not quiet:
                        print(f'allowing omitted words in name matched {honorific_name} to {mp_name}')
                    alr_matched.add((honorific_name, mp_name))
                return mps[mp_name]

    # names shouldn't have digits. check if there are digits.
    digit_match = re.search(r'\d+', honorific_name)
    if digit_match:
        # retry with the text after the digits, keeping the caller's `quiet` setting
        return honorific_name_to_mp_data(honorific_name[digit_match.end():], quiet=quiet)

    # last resort: levenshtein distance
    closest_name = levenshtein_best_match(honorific_name, mp_names)

    if (honorific_name, closest_name) not in alr_matched:
        if not quiet:
            print(f'levenshtein matched {honorific_name} to {closest_name}')
            if re.sub('[A-Za-z]+', '', honorific_name) == honorific_name:
                assert False, f'this honorific_name was found: {honorific_name}. seems quite sus because it has no letters; pls investigate'
        alr_matched.add((honorific_name, closest_name))
    return mps[closest_name]

def levenshtein_best_match(value, options):
    """Return the option with the smallest Levenshtein distance to `value`.

    The first option wins on ties. None is returned when `options` is empty or when
    no option is closer than the 99999 cut-off.
    """
    candidates = list(options)
    if not candidates:
        return None
    scores = [levenshtein(candidate, value) for candidate in candidates]
    lowest = min(scores)
    if not lowest < 99999:
        return None
    # list.index gives the first occurrence, i.e. the earliest candidate with that score
    return candidates[scores.index(lowest)]
            

# borrowed from: https://blog.paperspace.com/implementing-levenshtein-distance-word-autocomplete-autocorrect/
def levenshtein(token1, token2):
    """Edit distance between two sequences, computed with a full distance table.

    Cell [i, j] of the table holds the distance between the first i items of `token1`
    and the first j items of `token2`. The result is read from the bottom-right cell,
    so it is a numpy float.
    """
    rows, cols = len(token1) + 1, len(token2) + 1
    distances = np.zeros((rows, cols))
    # turning a prefix into the empty sequence costs one edit per item
    distances[:, 0] = np.arange(rows)
    distances[0, :] = np.arange(cols)

    for i, item1 in enumerate(token1, start=1):
        for j, item2 in enumerate(token2, start=1):
            if item1 == item2:
                # matching items cost nothing extra
                distances[i, j] = distances[i - 1, j - 1]
            else:
                # cheapest of the three neighbouring cells, plus one edit
                distances[i, j] = min(
                    distances[i, j - 1],
                    distances[i - 1, j],
                    distances[i - 1, j - 1],
                ) + 1

    return distances[rows - 1, cols - 1]

In [5]:
# output accumulator: soup_to_pqs appends one list of fields here for every parliamentary
# question and every follow-up question it extracts.
pqs = []

In [6]:
# different contexts in which parliamentary questions can be raised. part of metadata.
class ReportSection(Enum):
    """Kind of report section a parliamentary question was found in.

    get_section_name chooses a member from keywords in the raw section name.
    """
    WRITTEN = 'Written Answers to Questions'
    WRITTEN_NA = 'Written Answers to Questions for Oral Answer Not Answered by End of Question Time'
    ORAL = 'Oral Answers to Questions'
    BUDGET = 'Budget'

In [7]:
# Building blocks for recognising minister titles in question text.
# All f-strings below are raw so that "\(" / "\)" reach the regex engine without relying
# on Python tolerating invalid string escapes; the resulting patterns are unchanged.
cap_word = r'[A-Z][a-z]+'  # one capitalised word
cap_words = rf'({cap_word})( {cap_word})*'  # one or more capitalised words separated by single spaces
# titles containing commas or hyphens cannot be expressed with cap_words, so they are spelt out
mccy = '(Acting )?Minister for Culture, Community and Youth'
mica = '(Acting )?Minister for Information, Communications and the Arts'
mcdys = '(Acting )?Minister for Community Development, Youth( and|,) Sports'
micma = 'Minister-in-charge of Muslim Affairs'
# "[Prefix ]Minister[ of State] for|of [the ]X[ and [the ]Y][ (Z[ and W])]"
minister_for_something = rf'({cap_words} )?Minister( of State)? (for|of) (the )?{cap_words}( and (the )?{cap_words})?( \({cap_words}( and {cap_words})?\))?'
# "<Capitalised Words> Minister"
something_minister = rf'{cap_words} Minister'
# the spelt-out titles are tried before the generic forms
one_minister_regex = rf'(({mccy})|({mica})|({mcdys})|({micma})|({minister_for_something})|({something_minister}))'
minister_regex = re.compile(rf'{one_minister_regex}( and (the )?{one_minister_regex})?') # can have multiple targets

def contains_none_of(string, disallowed_strings):
    """Return True when no item of `disallowed_strings` occurs in `string`."""
    return not any(candidate in string for candidate in disallowed_strings)

def contains_one_of(string, required_strings):
    """Return True when at least one item of `required_strings` occurs in `string`."""
    return any(candidate in string for candidate in required_strings)

# extracts the first substring which is a substring of ministers
def extract_first_ministers(para):
    """Split the addressed minister title(s) off a question paragraph.

    `para` is first searched with `minister_regex`. On success the title is everything
    from the start of `para` to the end of the match, with " of " normalised to " for "
    (but "of State" kept), and it is recorded in `ministers_found`.

    When nothing matches, three progressively looser searches over the lower-cased
    paragraph are tried, using the titles already in `ministers_found`; whatever is
    found is mapped back to the closest known title by Levenshtein distance.

    Args:
        para: question text that is expected to begin with the minister title(s).

    Returns:
        (minister, para): the cleaned-up title text and the paragraph with the first
        occurrence of the matched text removed and surrounding whitespace stripped.

    NOTE(review): if none of the fallbacks match, `minister_match` is still None and the
    `.group()` call raises AttributeError; with an empty `ministers_found` the first
    fallback matches the empty string and the function fails further down instead.
    NOTE(review): the fallback matches are made against the lower-cased paragraph, so the
    matched text is only removed from `para` where `para` itself is lower-case there.
    """
    minister_match = re.search(minister_regex, para)
    if not minister_match:
        lowered_para = para.lower()
        # report might've been in the wrong case; try to match to existing ministers
        minister_match = re.search(
            '(' + '|'.join(known.lower().replace(' for ', '.{1,5}') for known in ministers_found) + ')',
            lowered_para
        )
        if not minister_match:
            # allow "for"/"of" with any (or no) whitespace around it
            minister_match = re.search(
                '(' + '|'.join(known.lower().replace(' for ', r'(\s)*(for|of)(\s)*') for known in ministers_found) + ')',
                lowered_para
            )
        if not minister_match:
            # spaces missing or misplaced: compare with all spaces removed, then locate the
            # title in the text while allowing an optional whitespace char between letters
            for existing_minister in ministers_found:
                if existing_minister.replace(' ', '') in para.replace(' ', ''):
                    minister_match = re.search(
                        r'(\s)?'.join(existing_minister.replace(' ', '').lower()),
                        lowered_para)
                    break
        minister = levenshtein_best_match(minister_match.group(), ministers_found)
        print(f'found minister: {str(minister_match.group())}; matched to {minister}')
    else:
        minister = para[:minister_match.span()[1]].replace(' of ', ' for ').replace('for State', 'of State')
        ministers_found.add(minister)

    # remove only the first occurrence: the same title may legitimately appear again
    # inside the question itself and must stay there
    para = para.replace(minister_match.group(), '', 1).strip()

    if ' and Leader' in minister:
        minister = minister[:-11]  # 11 == len(' and Leader')

    if minister[:4] == 'The ':
        minister = minister[4:]

    # edge cases: (sprs3topic_reportid=written-answer-na-7122)
    minister = minister.replace('Trade Industry', 'Trade and Industry')

    # trim trailing punctuation/whitespace, but keep a closing bracket
    while not minister[-1].isalpha() and minister[-1] != ')':
        minister = minister[:-1]

    return minister, para

def parse_speaker_title_honorific_name(speaker):
    """Split a speaker label into the bracketed honorific+name and the title around it.

    Returns (honorific_name, responder_title). The title has " of " normalised to
    " for " (with "of State" kept) and a leading "The " removed.
    """
    bracket = re.search(honorific_bracket_regex, speaker.lower())
    start, end = bracket.span()
    # the span was found on the lower-cased text; slice the original to keep its casing,
    # dropping the enclosing brackets
    honorific_name = speaker[start + 1:end - 1].strip()

    title = re.sub(honorific_bracket_regex, '', speaker, flags=re.IGNORECASE)
    title = title.replace(' of ', ' for ').replace('for State', 'of State').strip()
    if title.startswith('The '):
        title = title[4:].strip()
    return honorific_name, title

def get_ministers_and_question(para):
    """Return (askees, question) for a question paragraph.

    `askees` is a 1-tuple, or a 2-tuple when the extracted title text names a second
    minister joined by "and [the] Minister".
    """
    askee, question = extract_first_ministers(para)

    if re.search('and (the )?Minister', askee):
        # normalise the joiner, then cut at it; the cut swallows the word "Minister"
        # of the second title, so put it back
        pieces = askee.replace('and the Minister', 'and Minister').split(' and Minister')
        return (pieces[0], 'Minister' + pieces[1]), question

    return (askee,), question

def get_section_name(section_name_raw):
    """Map a raw section name to a ReportSection member.

    Keywords are checked in a fixed order; 'answered' comes before 'written' so that the
    "not answered by end of question time" section is not mistaken for plain written answers.
    The keywords are lower-case, so the caller is expected to pass lower-cased text.

    Raises:
        ValueError: if none of the keywords occurs in `section_name_raw`.
    """
    if 'answered' in section_name_raw:
        return ReportSection.WRITTEN_NA
    elif 'written' in section_name_raw:
        return ReportSection.WRITTEN
    elif 'oral' in section_name_raw:
        return ReportSection.ORAL
    elif 'budget' in section_name_raw:
        return ReportSection.BUDGET
    else:
        # a bare string cannot be raised in Python 3; it would surface as an unrelated TypeError
        raise ValueError(f'no section name??? {section_name_raw}')

        
# Ministry code -> keywords that identify the ministry inside a minister's title.
# identify_portfolio lower-cases the keywords, joins each list into one regex alternation
# and returns the code of the FIRST entry that matches, so the order of entries matters:
# e.g. a title containing both "Senior Minister" and "Health" resolves to MOH because
# PMO is listed last.
ministry_keywords_dict = {
    'MCCY': ['Muslim Affairs', 'Culture', 'Youth', 'Sports','Community'], 
    'MOT':['Transport'], 
    'MINDEF':['Defence'],
    'MinLaw':['Law'], 
    'MTI':['Trade and Industry'], 
    'MOM':['Manpower'], 
    'MND':['National Development'], 
    'MHA':['Home Affairs'],
    'MOH':['Health'], 
    'MFA':['Foreign Affairs'], 
    'MSF':['Social and Family Development','Social Services Integration'], 
    'MOF':['Finance'],  
    'MOE':['Education'], 
    'MSE':['Environment and Water Resources','Sustainability and the Environment'],
    'MCI':['Information','Communications and Information','Information, Communications and the Arts'],
    'PMO':['Coordinating Minister for National Security', 'Prime Minister', 'Senior Minister'] 
}

def identify_portfolios(titles):
    """Return the distinct portfolios of `titles` as a tuple, in first-seen order.

    Each title is passed to identify_portfolio; duplicates are dropped. The order is
    deterministic (unlike iterating over a set of strings, whose order can change from
    one interpreter run to the next), so repeated runs produce identical output.
    The tuple may contain None for titles with no recognised portfolio.
    """
    # dict keys are unique and keep insertion order
    return tuple(dict.fromkeys(identify_portfolio(title) for title in titles))

def identify_portfolio(title):
    """Return the ministry code whose keywords match `title`, or None.

    Entries of ministry_keywords_dict are tried in order and the first match wins.
    Matching is case-insensitive. A non-empty title without a match is reported on
    stdout.
    """
    lowered = title.lower()
    for ministry, keywords in ministry_keywords_dict.items():
        alternation = '|'.join(keyword.lower() for keyword in keywords)
        if re.search(alternation, lowered):
            return ministry
    if lowered != '':
        print(f'no portfolio? {lowered}')
    return None

def is_admin_guy(title, known_admin_guys):
    """Tell whether a speaker label belongs to a presiding/administrative role.

    A label counts when its lower-cased form is already in `known_admin_guys`, or when
    it mentions speaker/chairman/leader and does not mention minister. In the latter
    case a bracketed "honorific name", if present, is added to `known_admin_guys`
    (mutated in place) so that later labels for the same person are recognised too.
    """
    lowered = title.lower()
    if lowered in known_admin_guys:
        return True

    mentions_admin_role = contains_one_of(lowered, ['speaker', 'chairman', 'leader'])
    if not mentions_admin_role or 'minister' in lowered:
        return False

    bracketed = re.search(honorific_bracket_regex, lowered)
    if bracketed:
        person = bracketed.group()[1:-1]  # drop the enclosing brackets
        known_admin_guys.add(person)
        print(f'added new admin guy for current file: {person} (originally {lowered})')
    return True

def is_pq_asker(guy, known_pq_askers):
    """Tell whether speaker label `guy` is one of the MPs who asked a question.

    Labels that mention an office (Minister, Speaker, ...) or a language marker are
    rejected outright. Otherwise bracketed text is removed, the rest is resolved to an
    MP via honorific_name_to_mp_data (quietly), and the lower-cased MP name is looked
    up in `known_pq_askers`.
    """
    not_an_asker_markers = [
        'Minister', 'Speaker', 'Leader', 'Chairman',
        'ernacular', 'Mandarin', 'Chinese', 'Hokkien', 'Cantonese', 'Teochew',
        'Malay', 'Bahasa', 'Tamil', 'Hindi', 'Indian',
    ]
    if contains_one_of(guy, not_an_asker_markers):
        return False

    without_brackets = re.sub(r'\(.*\)', '', guy).strip()
    if not without_brackets:
        return False

    mp_name = honorific_name_to_mp_data(without_brackets, quiet=True)[0]
    return mp_name.lower() in known_pq_askers

def get_title_from_name_and_mp_list(name, mps_speaking):
    """Look up a speaker's title in the "MPs speaking" text.

    `mps_speaking` is cut into entries at every '),'. The first entry containing `name`
    is parsed with parse_speaker_title_honorific_name and its title part is returned;
    '' is returned when no entry contains `name`.
    """
    # splitting at '),' eats the closing bracket of each entry, so add it back...
    entries = [chunk + ')' for chunk in mps_speaking.split('),')]
    # ...except for the last chunk, which did not lose one
    entries[-1] = entries[-1][:-1]

    first_hit = next((entry for entry in entries if name in entry), None)
    if first_hit is None:
        return ''
    return parse_speaker_title_honorific_name(first_hit)[1]
        
def soup_to_pqs(soup, file):
    # seems to happen quite often sadly
    if soup.get_text() == '':
        print(f'empty text {file}')
        return
    
    stripped_strings = list(map(
        lambda text: re.sub(r'\s+', ' ', re.sub(r'(Page|Column):\s+\d+', '', text)),
        filter(
            lambda text: not re.match(r'Page:\s+\d+', text) and not re.match(r'Column:\s+\d+', text),
            [text for text in soup.stripped_strings])))
    
    if len(stripped_strings) < 20: # the table at the top of the page alr accounts for most of this.
        return
    
    # get metadata
    parl_no = int(stripped_strings[3])
    sess_no = int(stripped_strings[5])
    vol_no = int(stripped_strings[7])
    sitting_no = int(stripped_strings[9])
    sitting_date = datetime.strptime(stripped_strings[11], '%d-%m-%Y')
    section_name = get_section_name(stripped_strings[13].lower())
    title = stripped_strings[15]
    mps_speaking = stripped_strings[17]
    the_rest = stripped_strings[19:]

    if section_name != ReportSection.BUDGET:
        # trim off useless preamble stuff. so everything before the first pq is asked. in non-budget proceedings, pqs are denoted with numbers.
        while len(the_rest) > 0 and not re.match(r'\d\d?', the_rest[0]):
            the_rest = the_rest[1:]

        if len(the_rest) == 0:
            return
        
        # edge case: (sprs3topic_reportid=oral-answer-859.html). the "for the Minister for the Environment and Water Resources" should be in brackets
        for i in range(len(the_rest)):
            if re.match(f'for the ({one_minister_regex})', the_rest[i]) and i > 0 and the_rest[i-1][-1] != '(': 
                the_rest[i] = f'({the_rest[i]})'

        # find out which indices of the_rest correspond to pqs and which correspond to responses/sqs (speakers)
        indices_corresponding_to_pqs = []
        indices_corresponding_to_speakers = []
        maybe_more_pqs = True # after the first response, no more pqs are asked
        for i in range(len(the_rest)):
            if the_rest[i][0] == ':' or (
                i-1 >= 0 and the_rest[i-1][-1] == ':' and the_rest[i-1] in list(map(lambda s: re.sub('\s+', ' ', s.get_text().strip()), soup.select('strong')))): # edge case: (sprs3topic_reportid=oral-answer-2239.html), Ong Ye Kung's first response has the colon bolded, whereas it's normally not bolded. this throws us off. extra check in the condition is to resolve this.
                actual_index_to_append = i-1
                # edge case: (sprs3topic_reportid=oral-answer-1632.html), "The Senior Minister of State for Home Affairs (Mr Desmond Lee) (for the Minister for Home Affairs)" is broken up into multiple entries for some reason. this loop is to ensure the full name and title gets saved. notice the colon is split from the rest.
                # edge case: (topic_reportid=008_20120710_S0007_T0006). similar story. but now the colon is with the name, even tho it's still broken up
                # edge case: (sprs3topic_reportid=oral-answer-1159). faishal ibrahim's first response is another one. colon placement is terribad. this is why i need the re.sub. 
                while (the_rest[actual_index_to_append][0] == '(' and (
                    the_rest[actual_index_to_append][-1] == ')' or re.sub('\)\s+:', '):', the_rest[actual_index_to_append])[-2:] == '):'
                )) or ' ' not in the_rest[actual_index_to_append]: # edge case: (sprs3topic?reportid=written-answer-53). "teo" is separate from "josephine" in the html it seems
                    actual_index_to_append -= 1
                # edge case: (sprs3topic_reportid=oral-answer-2760.html), "The Minister of State for Home Affairs (Mr Desmond Tan) (for the  Minister for Home Affairs)" is also cut in the middle for some reason zzz
                # edge case: (sprs3topic_reportid=oral-answer-362.html) need a loop otherwise we won't get iswaran's title properly (both brackets broken off in the html)
                bracket_count = 0
                for j in range(actual_index_to_append, i):
                    bracket_count += the_rest[j].count('(') - the_rest[j].count(')')
                while bracket_count != 0:
                    actual_index_to_append -= 1
                    if actual_index_to_append < 0:
                        print(f'returning cuz of bad brackets in {file}')
                        return
                    bracket_count += the_rest[actual_index_to_append].count('(') - the_rest[actual_index_to_append].count(')')
                if contains_one_of(the_rest[actual_index_to_append][:20], ['ernacular', 'Mandarin', 'Chinese', 'Hokkien', 'Cantonese', 'Teochew', 'Malay', 'Bahasa', 'Tamil', 'Hindi', 'Indian']):
                    continue # to avoid picking up stuff like (In Mandarin) etc
                indices_corresponding_to_speakers.append(actual_index_to_append)
                maybe_more_pqs = False
            elif re.match(r'\d\d?', the_rest[i]) and maybe_more_pqs:
                indices_corresponding_to_pqs.append(i)

        if len(indices_corresponding_to_pqs) == 0:
            print(f'no pqs? {file}')
            return
        if len(indices_corresponding_to_speakers) == 0:
            print(f'no speakers? {file}') # edge case: (sprs3topic_reportid=oral-answer-1729.html)
            return

        pq_sublists = []
        pq_qn_indices = []
        while len(indices_corresponding_to_pqs) > 1:
            pq_qn_indices.append(the_rest[indices_corresponding_to_pqs[0]])
            pq_sublists.append(the_rest[indices_corresponding_to_pqs[0]+1:indices_corresponding_to_pqs[1]])
            indices_corresponding_to_pqs = indices_corresponding_to_pqs[1:]

        pq_qn_indices.append(the_rest[indices_corresponding_to_pqs[0]])
        pq_sublists.append(the_rest[indices_corresponding_to_pqs[0]+1:indices_corresponding_to_speakers[0]])

        speaking_sublists = []

        while len(indices_corresponding_to_speakers) > 1:
            speaking_sublists.append(the_rest[indices_corresponding_to_speakers[0]:indices_corresponding_to_speakers[1]])
            indices_corresponding_to_speakers = indices_corresponding_to_speakers[1:]

        speaking_sublists.append(the_rest[indices_corresponding_to_speakers[0]:])

        new_pqs = []
        new_pq_indices = []
        known_pq_askers = set() # edge case (sprs3topic?reportid=oral-answer-2133). deal w cases where the first guy to speak is actly a pq asker himself. they nvr say anyth useful if they speak first

        for pq_i, sl in zip(pq_qn_indices, pq_sublists):
            pq_para = ' '.join(sl)

            if ' asked the ' not in pq_para:
                continue
            asker_honorific_name, pq_para = pq_para.split(' asked the ', 1)    
            ministers, question = get_ministers_and_question(pq_para)

            if question[0] == ',':
                question = question[1:].strip()
            if len(asker_honorific_name.strip()) == 0:
                return
            asker, asker_party, asker_parls = honorific_name_to_mp_data(asker_honorific_name.strip())
            new_pq_indices.append(int(pq_i))
            new_pqs.append([asker, asker_party, asker_parls, ministers, question, parl_no, sess_no, vol_no, sitting_no, sitting_date, section_name, title])
            known_pq_askers.add(asker.lower())

        # find out what's said after the pqs have been asked, and who says it
        speakers_and_spokens = []
        for sl in speaking_sublists:
            text = ' '.join(sl)
            text = re.sub('\(\s+', '(', text) # edge case: (sprs3topic_reportid=oral-answer-910.html). seems to be some invisible char between an open bracket and "for the Minister".
            split_result = text.split(':', 1)
            if len(split_result) < 2:
                return
            speaker, spoken = split_result
            speaker = speaker.strip()
            spoken = spoken.strip()
            while len(spoken) > 0 and not spoken[0].isalpha():
                spoken = spoken[1:].strip()
            if len(spoken) == 0: # edge case (sprs3topic_reportid=written-answer-4142.html). sometimes people are just lost for words i guess.
                return
            if spoken[:11].lower() == 'question no' or (len(spoken) < 20 and spoken[:8].lower() == 'question'):
                continue
            
            speaker = re.sub(f'\(for .*\)', '', speaker)
            speaker = re.sub(f'for the ({one_minister_regex})', '', speaker)
            speaker = re.sub(f'\(on behalf of .*\)', '', speaker)
            spoken = re.sub('\[(.|\s){1,60}\]', '', spoken) # remove things like [Please refer to yadda yadda]
            speakers_and_spokens.append([speaker, spoken])

        known_admin_guys = set() # edge case: (sprs3topic_reportid=oral-answer-1325). have to weed out all instances of Desmond Lee from just this file cuz he's an admin guy here. sadly he's not referred to as "The Deputy Leader" but as "Desmond Lee"
        #st()
        
        # speaker never says anyth useful
        while len(speakers_and_spokens) > 0 and (
            is_admin_guy(speakers_and_spokens[0][0], known_admin_guys) or is_pq_asker(speakers_and_spokens[0][0], known_pq_askers) or (
                section_name == ReportSection.ORAL and 'minister' not in speakers_and_spokens[0][0].lower() # edge case: (sprs3topic?reportid=oral-answer-446). pritam singh decided to say stuff when it wasn't his turn.
            )
        ):
            speakers_and_spokens = speakers_and_spokens[1:]
        while len(speakers_and_spokens) > 0 and (is_admin_guy(speakers_and_spokens[-1][0], known_admin_guys) or is_pq_asker(speakers_and_spokens[-1][0], known_pq_askers)):
            speakers_and_spokens = speakers_and_spokens[:-1]

        # edge case: (sprs3topic_reportid=oral-answer-1325) and many others. sometimes there are really no responses.
        if len(speakers_and_spokens) == 0:
            return

        # edge case: (sprs3topic_reportid=oral-answer-1356.html). there's too many colons at the start of paragraphs which don't actly correspond to people saying new things. we resolve that here.
        indices_to_merge = []
        for i in range(1, len(speakers_and_spokens)):
            speaker = speakers_and_spokens[i][0]
            if re.sub('\W+', '', re.sub('\(.+\)', '', speaker)).strip() == '':
                indices_to_merge.append(i)
        
        for i in reversed(indices_to_merge):
            speakers_and_spokens[i-1][1] += ' ' + speakers_and_spokens[i][1]
            speakers_and_spokens.pop(i)

        # if minister title is provided then we take. else, just take the name.
        first_responder, first_response = speakers_and_spokens[0]

        if 'Minister' in first_responder:
            if 'Minister' in re.sub('\(.+\)', '', first_responder): # what usually happens
                first_responder_honorific_name, first_responder_title = parse_speaker_title_honorific_name(first_responder)
            else: # edge case: (sprs3topic_reportid=oral-answer-362.html). iswaran speaks on a minister's behalf, but it doesn't say what minister he is
                first_responder_honorific_name = re.sub('\(.+\)', '', first_responder).strip()
                first_responder_title = ''
            first_responder_name = honorific_name_to_mp_data(first_responder_honorific_name)[0]
        else:
            first_responder_title = ''
            if re.match(honorific_regex, first_responder.lower()):
                first_responder_name = honorific_name_to_mp_data(first_responder)[0]
            else:
                first_responder_name = honorific_name_to_mp_data('Mr ' + first_responder)[0]
                
        # edge case: (sprs3topic_reportid=oral-answer-2824.html): alvin tan's title is just "minister of state", which isn't v helpful cuz we needa ministry.
        # so we get it from the box at the top.
        if first_responder_title[-17:] == 'Minister of State':
            first_responder_title = get_title_from_name_and_mp_list(first_responder_name, mps_speaking)
        
        # edge case: (sprs3topic_reportid=oral-answer-27)
        first_responder_title = first_responder_title.replace('Affiars', 'Affairs').replace('Parlamentary', 'Parliamentary')
        speakers_and_spokens = speakers_and_spokens[1:]

        # if there's more than 1 pq, the responder will ask the speaker for permission to hit all the qns at once,
        # and the speaker will grant permission. and then there may be a bit more admin back and forth.
        # we wanna remove that.
        if len(new_pqs) > 1:
            while len(speakers_and_spokens) > 0 and 'speaker' in speakers_and_spokens[0][0].lower():
                first_response = speakers_and_spokens[1][1]
                speakers_and_spokens = speakers_and_spokens[2:]

        # the new pqs now have their responses ready, we can save them.
        for new_pq in new_pqs:
            pqs.append(new_pq + [[[first_responder_name, first_responder_title, first_response]], True])

        # pqs settled. now move on to followup (sqs)

        if section_name in (ReportSection.WRITTEN, ReportSection.WRITTEN_NA):
            return # there are no sqs in written responses

        new_sqs = []
        responder_names_to_titles = dict()
        responder_names_to_titles[first_responder_name] = first_responder_title

        for speaker, spoken in speakers_and_spokens:
            if is_admin_guy(speaker, known_admin_guys):
                continue # speaker says nothing useful

            if '(' in speaker: # if there's a brack8 then we've never seen this person speak before
                honorific_bracket_search = re.search(honorific_bracket_regex, speaker.lower())
                if honorific_bracket_search: # honorific and name occur inside brackets for responders, outside for askers
                    is_response = True
                    honorific_name, responder_title = parse_speaker_title_honorific_name(speaker)
                    speaker_data = honorific_name_to_mp_data(honorific_name)
                    responder_name = speaker_data[0]
                    if responder_title[-17:] == 'Minister of State':
                        responder_title = get_title_from_name_and_mp_list(responder_name, mps_speaking)
                    responder_names_to_titles[responder_name] = responder_title
                else: # if there's no honorific inside brack8, then honorific must be outside brack8. this only happens for asker.
                    is_response = False
                    honorific_name = re.sub('\(.+\)', '', speaker)
                    speaker_data = honorific_name_to_mp_data(honorific_name)
            else: # if there's no brack8 then we've seen the person speak before
                speaker_data = honorific_name_to_mp_data(speaker)
                if speaker_data[0] not in responder_names_to_titles.keys(): # check whether the person is a known responder
                    is_response = False
                    asker_name = speaker_data[0]
                else:
                    is_response = True
                    responder_name = speaker_data[0]
                    responder_title = responder_names_to_titles[responder_name]

            if is_response:
                for new_sq in new_sqs:
                    new_sq = new_sq + [[[responder_name, responder_title, spoken]], False]
                    new_sq[3] = (responder_title,) # backfill the missing askee title
                    pqs.append(new_sq)
                new_sqs = []
            else:
                new_sqs.append([
                    speaker_data[0],
                    speaker_data[1],
                    speaker_data[2],
                    None, # instead of ner to find out who the target of the qn is, we just backfill it l8r when we get the response
                    spoken,
                    parl_no,
                    sess_no,
                    vol_no,
                    sitting_no,
                    sitting_date,
                    section_name,
                    title
                ])
        return
    else: # budget cuts
        
        if 'debate' in title.lower() and 'budget' in title.lower():
            # they're never useful. e.g. sprs3topic_reportid=budget-117.html
            # note: budget debates aren't the same as budget cuts
            print(f'useless budget debate {file}')
            return
        
        # edge case: (sprs3topic?reportid=budget-1039). the first titled person to talk, faishal ibrahim, isn't actly the responder. so we double check the person's title against the article title.
        title_bracket_search = re.search('\(.+\)', title)
        if title_bracket_search:
            title_implied_ministry = identify_portfolio(title_bracket_search.group()[1:-1])
        else:
            title_implied_ministry = None
        
        # don't want things like (In Malay): [refer to vernacular speech]
        paras = list(map(lambda x: re.sub('\[(.|\s){1,60}\]', '', re.sub('\s+', ' ', x.get_text().replace('\xa0', ' ').replace('\ufeff', '').strip())), soup.select('p')))
        speakers_and_spokens = []
        next_speaker = None
        next_spoken = None

        for para in paras:
            if not next_speaker:
                if ':' not in para:
                    continue
                next_speaker, next_spoken = para.split(':', maxsplit=1)
                next_speaker = next_speaker.strip()
                next_spoken = next_spoken.strip()
            else:
                if ':' in para and (
                    para.find(':') < 50 or (
                        para.find(':') < 150 and para[para.find(':')-1] == ')'
                    )
                ) and (
                    re.match(honorific_regex, para.lower()) or
                    re.search(f'\(In {cap_word}\):', para) or (
                        para[:4] == 'The ' and contains_one_of(para[:para.find(':')], ['Minister', 'Speaker', 'Chairman']))):
                    speakers_and_spokens.append([next_speaker.strip(), next_spoken.strip()])
                    next_speaker, next_spoken = para.split(':', maxsplit=1)
                    next_speaker = next_speaker.strip()
                    next_spoken = next_spoken.strip()
                else:
                    next_spoken += f' {para.strip()}'
                    
        if next_speaker and next_spoken:
            speakers_and_spokens.append([next_speaker.strip(), next_spoken.strip()])
        
        speakers_and_spokens = list(filter(lambda x: x[0][:4] != 'Page' and x[0][:6] != 'Column', speakers_and_spokens))

        # edge case: (sprs3topic_reportid=budget-700.html). there's a tendency to wrongly flag "Mr Heng Chee How is right about our hard truths:" as a speaker.
        # we resolve this here.
        incorrect_speaker_indices = []
        for i in range(1, len(speakers_and_spokens)):
            if len(list(filter(lambda x: len(x) > 0 and x[0].islower() and x not in ['de', 'bin', 'binte', 'so', 'do', 's/o', 'd/o', 's.o', 'd.o', 's.o.', 'd.o.', 'the', 'of', 'for', 'and', 'to', 'by', 'in', 'at', 'on'], re.sub('(^([A-Z][a-z]\s-\.,)+|\(.+\))', '', speakers_and_spokens[i][0]).split(' ')))) > 0:
                incorrect_speaker_indices.append(i)
                #print(f'dont think this is a real speaker {speakers_and_spokens[i][0]} in {file}')
                
        for i in reversed(incorrect_speaker_indices):
            speakers_and_spokens[i-1][1] += ' ' + speakers_and_spokens[i][1]
            speakers_and_spokens.pop(i)
        
        # get rid of closing remarks by admin ppl
        while len(speakers_and_spokens) > 0 and contains_one_of(speakers_and_spokens[-1][0], ['Speaker', 'Chairman']):
            speakers_and_spokens = speakers_and_spokens[:-1]
            
        for i in range(len(speakers_and_spokens)):
            ss = speakers_and_spokens[i]
            # get rid of things like (In Malay): or in any other lang rly
            ss[1] = re.sub(f'\(In {cap_word}\):', '', ss[1]).strip()
            if re.match(f'\(In {cap_word}\)', ss[0]):
                ss[0] = speakers_and_spokens[i-1][0]
                
        responder_names_to_titles = dict()
        askers = set()
        
        # normalize the speaker names to not have title and honorifics, and also determine which ones are askers and which are responders
        for ss in speakers_and_spokens:
            if 'Chairman' in ss[0] or 'Speaker' in ss[0]:
                continue
            
            ss[0] = re.sub('\(In .+\)', '', ss[0]) # remove those (In Malay) things too

            if '(' not in ss[0]: # bracket not in name - implies we've seen this person before
                guessed_speaker_name = ss[0].split(' ', maxsplit=1)[1]
                if guessed_speaker_name in responder_names_to_titles.keys() or guessed_speaker_name in askers:
                    ss[0] = guessed_speaker_name
                    continue
                else:
                    ss[0] = honorific_name_to_mp_data(ss[0])[0]
                    if ss[0] not in responder_names_to_titles.keys():
                        askers.add(ss[0])
                    continue
                
            if re.search(honorific_bracket_regex, ss[0].lower()): # brackets w honorific in name - implies it's a responder
                honorific_name, responder_title = parse_speaker_title_honorific_name(ss[0])
                responder_name = honorific_name_to_mp_data(honorific_name)[0]

                # doublecheck that the responder's title corresponds to the ministry in the doc title. else he's probably an asker.
                if (not title_implied_ministry) or (identify_portfolio(responder_title) == title_implied_ministry):
                    if responder_title[:4] == 'The ':
                        responder_title = responder_title[4:]
                    responder_names_to_titles[responder_name] = responder_title
                    ss[0] = responder_name
                    continue
                else:
                    asker_name = responder_name
                    askers.add(asker_name)
                    ss[0] = asker_name
                    continue
                
            # it's an asker
            honorific_name = re.sub('\(.+\)', '', ss[0])
            asker_name = honorific_name_to_mp_data(honorific_name)[0]
            askers.add(asker_name)
            ss[0] = asker_name
            continue
            
        speakers_and_spokens = list(filter(
            lambda ss: contains_none_of(ss[0], ['Chairman', 'Speaker']) and len(ss[1]) >= 75 and (
                len(ss[1]) > 300 or contains_none_of(ss[1], ['withdraw', 'beg leave', 'please', 'permission', 'cuts'])
            ),
            speakers_and_spokens
        ))
        
        if len(speakers_and_spokens) == 0:
            return
        
        # there's nvr anyth good after the last responder speaks
        while len(speakers_and_spokens) > 0 and speakers_and_spokens[-1][0] not in responder_names_to_titles.keys():
            speakers_and_spokens = speakers_and_spokens[:-1]
            
        # remove stuff said b4 any qns are asked, cuz obv not relevant to any qns if said so early
        while len(speakers_and_spokens) > 0 and speakers_and_spokens[0][0] not in askers:
            speakers_and_spokens = speakers_and_spokens[1:]
            
        if len(speakers_and_spokens) == 0:
            return
        
        # when same person speaks twice in a row, merge. unless it's asking qns (so we get 2 separate pqs)
        repeated_speaker_indices = []
        for i in range(1, len(speakers_and_spokens)):
            if speakers_and_spokens[i][0] == speakers_and_spokens[i-1][0] and speakers_and_spokens[i][0] in responder_names_to_titles.keys():
                repeated_speaker_indices.append(i)
                
        for i in reversed(repeated_speaker_indices):
            speakers_and_spokens[i-1][1] += ' ' + speakers_and_spokens[i][1]
            speakers_and_spokens.pop(i)
        
        new_pqs = []
        halfway_through_response = False
        first_wave = True

        for ss in speakers_and_spokens:
            speaker_data = honorific_name_to_mp_data(f'Mr {ss[0]}') # not elegant, shld fix l8r
            if ss[0] in askers:
                if halfway_through_response:
                    halfway_through_response = False
                    for new_pq in new_pqs:
                        new_pq[3] = tuple(new_pq[3])
                    pqs.extend(new_pqs)
                    new_pqs = []
                    first_wave = False
                new_pqs.append([
                    speaker_data[0], speaker_data[1], speaker_data[2], None, ss[1], parl_no, sess_no, vol_no, sitting_no, sitting_date, section_name, title, None, None])
            elif ss[0] in responder_names_to_titles.keys():
                halfway_through_response = True
                for new_pq in new_pqs:
                    responder_title = responder_names_to_titles[ss[0]] 
                    if not new_pq[3]:
                        new_pq[3] = [responder_title]
                    elif responder_title not in new_pq[3]: # if alr in, no need to append
                        new_pq[3].append(responder_title)
                    new_pq[-1] = first_wave
                    if not new_pq[-2]:
                        new_pq[-2] = [[ss[0], responder_title, ss[1]]]
                    else:
                        new_pq[-2].append([ss[0], responder_title, ss[1]])
            else:
                print(ss)
                print(askers)
                print(responder_names_to_titles)
                assert False
        
        for new_pq in new_pqs:
            new_pq[3] = tuple(new_pq[3])
        pqs.extend(new_pqs)
        new_pqs = []
        return
    

In [8]:
# Parse every scraped Hansard page into rows of `pqs` (soup_to_pqs appends to it).
pqs = []
files_and_exceptions = []  # [file name, exception] for every page that failed to parse

# Per the original note, the page boilerplate alone exceeds ~300kb, so anything
# under this size is treated as a bad scrape and skipped.
MIN_VALID_FILE_BYTES = 200000

# To debug specific pages, replace the listing below with an explicit list, e.g.
# ['sprs3topic?reportid=budget-1039']; '?' and a missing '.html' are normalised in the loop.
files_to_run_through = os.listdir('scraped_content')

for i, file in enumerate(files_to_run_through):
    file = file.replace('?', '_')
    if '.html' not in file:
        file = file + '.html'
    filepath = os.path.join('scraped_content', file)
    if os.stat(filepath).st_size < MIN_VALID_FILE_BYTES:
        continue
    try:
        with open(filepath, 'r', encoding='utf-8-sig', errors='ignore') as f:
            soup = bs(f, 'html.parser')
        soup_to_pqs(soup, file)
    except Exception as e:
        # Fail fast on the first 50 files so systematic bugs surface immediately;
        # after that, record the failure and keep going.
        if i < 50:
            raise
        print(f'exception: {str(e)} - {file}')
        files_and_exceptions.append([file, e])
        continue

    if i % 50 == 0:
        print(f'{i}/{len(files_to_run_through)}')

# Guard the average so an empty directory listing reports 0 instead of raising ZeroDivisionError.
avg_pqs_per_file = len(pqs) / len(files_to_run_through) if files_to_run_through else 0.0

print('=====DONE==================================================')
print(f'total pqs: {len(pqs)}')
print(f'total files: {len(files_to_run_through)}')
print(f'avg pqs per file: {avg_pqs_per_file}')
print(f'files with exceptions: {files_and_exceptions}')

0/10339
levenshtein matched mr masagos zulkifli b m m to masagos zulkifli bin masagos mohamad
allowing omitted words in name matched mr murali pillai to murali pillai sc
useless budget debate sprs3topic_reportid=budget-1099.html
useless budget debate sprs3topic_reportid=budget-1101.html
useless budget debate sprs3topic_reportid=budget-1103.html
useless budget debate sprs3topic_reportid=budget-1106.html
useless budget debate sprs3topic_reportid=budget-1109.html
allowing omitted words in name matched mr alex yam to alex yam ziming
levenshtein matched mr liang eng hwa (holland-bukit timah to liang eng hwa
useless budget debate sprs3topic_reportid=budget-117.html
useless budget debate sprs3topic_reportid=budget-120.html
useless budget debate sprs3topic_reportid=budget-1279.html
50/10339
useless budget debate sprs3topic_reportid=budget-1281.html
useless budget debate sprs3topic_reportid=budget-1291.html
useless budget debate sprs3topic_reportid=budget-1294.html
no portfolio? attorney-genera

In [9]:
# Save the raw parsed rows to disk (presumably a checkpoint so the slow parsing
# cell need not be re-run every time — TODO confirm).
with open('pqs_pickle_raw', 'wb') as f:
    pickle.dump(pqs, f)

In [10]:
# Reload the rows from the 'pqs_pickle_raw' file written by the dump cell.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a pickle that this notebook produced itself.
with open('pqs_pickle_raw', 'rb') as f:
    pqs = pickle.load(f)

In [11]:
# Column order follows the positional layout of each 14-element row in `pqs`
# (index 3 = askees, second-to-last = responses, last = is_pq).
pq_df = pd.DataFrame(
    pqs,
    columns=[
        'asker_name',
        'asker_party',
        'asker_parliaments',
        'askees',
        'question',
        'parliament_no',
        'session_no',
        'volume_no',
        'sitting_no',
        'sitting_date',
        'report_section',
        'title',
        'responses',
        'is_pq',
    ],
)
def tidy_up_title(title):
    """Normalise a ministerial title string.

    Strips trailing characters until the title ends in a letter or a closing
    bracket, then rewrites 'Youth, Sports' as 'Youth and Sports'.

    Args:
        title: the raw title string; '' (or None) means "no title".

    Returns:
        The cleaned title, or '' when the input is empty or contains nothing
        but strippable characters.
    """
    if not title:
        return ''
    # Walk back from the end to the last letter / ')'. The `end > 0` guard matters:
    # a title such as '...' has no such character and used to raise IndexError.
    end = len(title)
    while end > 0 and not title[end - 1].isalpha() and title[end - 1] != ')':
        end -= 1
    title = title[:end]
    title = title.replace('Youth, Sports', 'Youth and Sports')
    return title

# Clean every askee title and the title slot (index 1) of every response.
pq_df.askees = pq_df.askees.apply(lambda x: tuple(map(tidy_up_title, x)))
pq_df.responses = pq_df.responses.apply(lambda x: list(map(lambda y: [y[0], tidy_up_title(y[1]), y[2]], x)))

# parliament no -> {name -> title}. implicit assumption is that cabinet only reshuffles
# after every parliament (i.e. no reshuffling in between)
known_titles = {
    12: dict(),
    13: {'Lee Hsien Loong': 'Prime Minister'},
    14: {'Tharman Shanmugaratnam': 'Senior Minister'},
}

# Learn titles from the data: the first non-empty title seen for a responder in a
# parliament wins. setdefault creates the per-parliament dict on demand, so a
# parliament number other than 12/13/14 no longer raises KeyError.
for parl_no, responses in zip(pq_df.parliament_no, pq_df.responses):
    titles_for_parliament = known_titles.setdefault(parl_no, dict())
    for response in responses:
        if response[0] not in titles_for_parliament and response[1] != '':
            titles_for_parliament[response[0]] = response[1]

# Counters and leftovers that fill_in_missing_titles updates in place.
stats = {'filled_title_count': 0, 'missing_title_count': 0}
still_none = set()
            
def fill_in_missing_titles(row, stats, still_none):
    """Fill blank responder titles in one row and, where applicable, infer its askees.

    Each response is a mutable [name, title, text] list. A blank title ('') is
    replaced in place with the title recorded for that name in
    `known_titles[row.parliament_no]`, when one exists.

    Args:
        row: a row of the questions table; reads `responses`, `parliament_no`,
            `report_section` and `is_pq`, and may overwrite `askees`.
        stats: dict whose 'missing_title_count' and 'filled_title_count'
            counters are incremented in place.
        still_none: set that collects (name, parliament_no) for responders whose
            title could not be filled.

    Returns:
        The same `row` object, modified in place.
    """
    titles_found = set()
    # .get instead of indexing: a parliament with no recorded titles just yields
    # no guesses rather than raising KeyError.
    titles_for_parliament = known_titles.get(row.parliament_no, {})
    for response in row.responses:
        if response[1] != '':
            titles_found.add(response[1])
            continue
        stats['missing_title_count'] += 1
        if response[0] in titles_for_parliament:
            guessed_title = titles_for_parliament[response[0]]
            response[1] = guessed_title
            titles_found.add(guessed_title)
            stats['filled_title_count'] += 1
        else:
            still_none.add((response[0], row.parliament_no))
    # For budget rows and rows not flagged is_pq (the original note calls these "sqs"),
    # the askees are inferred from whoever responded. Other rows keep their askees,
    # which the original note says are given explicitly.
    if row.report_section == ReportSection.BUDGET or not row.is_pq:
        # sorted: set iteration order varies with the hash seed, so fix the order here.
        row.askees = tuple(sorted(titles_found))
    return row
            
# Fill blank responder titles row by row; `stats` and `still_none` are updated as a side effect.
pq_df = pq_df.apply(fill_in_missing_titles, axis=1, args=(stats, still_none))
# The askees' portfolios go in at position 4, i.e. right after the `askees` column.
pq_df.insert(4, 'askees_portfolios', pq_df.askees.apply(identify_portfolios))
# Widen each response from (name, title, text) to (name, title, portfolio, text).
pq_df.responses = pq_df.responses.apply(
    lambda responses_in_row: [
        (entry[0], entry[1], identify_portfolio(entry[1]), entry[2])
        for entry in responses_in_row
    ]
)
print(stats)
print('ppl who still have no titles:')
print(still_none)

{'filled_title_count': 7982, 'missing_title_count': 7982}
ppl who still have no titles:
set()


In [26]:
# Put each row's askees and askee portfolios into sorted tuple order.
for sorted_col in ('askees', 'askees_portfolios'):
    pq_df[sorted_col] = pq_df[sorted_col].apply(lambda values: tuple(sorted(values)))

In [25]:
# Preview the assembled table (pandas elides the middle rows of a long frame when displaying it).
pq_df

Unnamed: 0,asker_name,asker_party,asker_parliaments,askees,askees_portfolios,question,parliament_no,session_no,volume_no,sitting_no,sitting_date,report_section,title,responses,is_pq
0,Lee Bee Wah,People's Action Party,"(11, 12, 13)",(Minister for the Environment and Water Resour...,"(MSE,)","Mdm Chair, I beg to move, ""That the total sum ...",13,1,94,41,2017-03-08,ReportSection.BUDGET,Committee of Supply − Head L (Ministry of the ...,"[(Masagos Zulkifli Bin Masagos Mohamad, Minist...",True
1,K Thanaletchimi,Nominated Member of Parliament,"(13,)",(Minister for the Environment and Water Resour...,"(MSE,)","Madam, regarding the recent announcement on ca...",13,1,94,41,2017-03-08,ReportSection.BUDGET,Committee of Supply − Head L (Ministry of the ...,"[(Masagos Zulkifli Bin Masagos Mohamad, Minist...",True
2,K Thanaletchimi,Nominated Member of Parliament,"(13,)",(Minister for the Environment and Water Resour...,"(MSE,)",— of what we need to do and that protecting th...,13,1,94,41,2017-03-08,ReportSection.BUDGET,Committee of Supply − Head L (Ministry of the ...,"[(Masagos Zulkifli Bin Masagos Mohamad, Minist...",True
3,Gan Thiam Poh,People's Action Party,"(12, 13, 14)",(Minister for the Environment and Water Resour...,"(MSE,)","At the beginning of this year, the National En...",13,1,94,41,2017-03-08,ReportSection.BUDGET,Committee of Supply − Head L (Ministry of the ...,"[(Masagos Zulkifli Bin Masagos Mohamad, Minist...",True
4,Chia Shi-Lu,People's Action Party,"(12, 13)",(Minister for the Environment and Water Resour...,"(MSE,)","Madam, the Ministry announced air quality targ...",13,1,94,41,2017-03-08,ReportSection.BUDGET,Committee of Supply − Head L (Ministry of the ...,"[(Masagos Zulkifli Bin Masagos Mohamad, Minist...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19128,Zainal Sapari,People's Action Party,"(12, 13)","(Acting Minister for Manpower,)","(MOM,)",(a) what is the current number of Singaporeans...,12,1,89,5,2012-08-13,ReportSection.WRITTEN,CPF Dependants' Protection Scheme,"[(Tan Chuan-Jin, Acting Minister for Manpower,...",True
19129,Patrick Tay Teck Guan,People's Action Party,"(12, 13, 14)","(Acting Minister for Manpower,)","(MOM,)",(a) what is the current number of CPF account ...,12,1,89,5,2012-08-13,ReportSection.WRITTEN,CPF Dependants' Protection Scheme,"[(Tan Chuan-Jin, Acting Minister for Manpower,...",True
19130,Seng Han Thong,People's Action Party,"(9, 10, 11, 12)","(Acting Minister for Manpower,)","(MOM,)",(a) how many CPF members have opted out of the...,12,1,89,5,2012-08-13,ReportSection.WRITTEN,CPF Dependants' Protection Scheme,"[(Tan Chuan-Jin, Acting Minister for Manpower,...",True
19131,Mary Liew,Nominated Member of Parliament,"(12,)","(Acting Minister for Manpower,)","(MOM,)",if he will give an update on the Special Emplo...,12,1,89,5,2012-08-13,ReportSection.WRITTEN,Update on Special Employment Credit Scheme,"[(Tan Chuan-Jin, Acting Minister for Manpower,...",True


In [13]:
# Write the final table as a '|'-delimited file; anything reading it back must use the
# same separator (e.g. pd.read_csv('pqs.csv', sep='|')).
pq_df.to_csv('pqs.csv', index=False, sep='|') # sep=',' gives formatting issues

In [14]:
# Sanity check: list the distinct values of the main categorical fields.
parties_set = set(pq_df.asker_party.values)
print(f'parties: {parties_set} (len: {len(parties_set)})\n')

# askees and askees_portfolios hold one tuple per row, so flatten before deduplicating
askee_set = {askee for row_askees in pq_df.askees for askee in row_askees}
print(f'askees: {askee_set} (len: {len(askee_set)})\n')

askees_portfolios_set = {
    portfolio
    for row_portfolios in pq_df.askees_portfolios
    for portfolio in row_portfolios
}
print(f'askees portfolios: {askees_portfolios_set} (len: {len(askees_portfolios_set)})\n')

# index 1 of every response entry is the responder's title
responder_title_set = {
    entry[1]
    for row_responses in pq_df.responses
    for entry in row_responses
}
print(f'responder titles: {responder_title_set} (len: {len(responder_title_set)})')

parties: {"Workers' Party", "Singapore People's Party", "People's Action Party", 'Nominated Member of Parliament', 'Progress Singapore Party'} (len: 5)

askees: {'Minister of State for National Development', 'Senior Minister of State for Transport', "Senior Minister of State, Prime Minister's Office", 'Senior Minister for Foreign Affairs', 'Acting Minister for Culture, Community and Youth', 'Parliamentary Secretary to the Minister for Transport', 'Minister of State for Finance', 'Senior Minister of State for National Development and Trade and Industry', 'Minister for Culture, Community and Youth', 'Senior Minister of State for Information, Communications and the Arts', 'Deputy Prime Minister and Coordinating Minister for National Security', 'Second Minister for Foreign Affairs', 'Minister for Communications and Information and Minister-in-charge for Muslim Affairs', 'Senior Minister of State for Health', 'Minister for Home Affairs and Minister for Law', 'Minister for Communications and

In [15]:
# Number of distinct asker names in the table.
len(set(pq_df.asker_name.values))

166

In [16]:
# Rows per asker, computed in one pass instead of re-filtering the frame for every MP.
# sort_index makes the choice among tied MPs deterministic (alphabetically first)
# rather than dependent on set iteration order.
questions_per_mp = pq_df.asker_name.value_counts().sort_index()

if questions_per_mp.empty:
    # nothing to summarise; avoids idxmin/idxmax raising on an empty series
    min_count, min_mp, max_count, max_mp = 0, None, 0, None
else:
    # No 999 sentinel: the minimum is taken from the data itself.
    min_mp = questions_per_mp.idxmin()
    max_mp = questions_per_mp.idxmax()
    min_count = int(questions_per_mp.min())
    max_count = int(questions_per_mp.max())

less_than_ten = int((questions_per_mp < 10).sum())  # MPs with fewer than 10 rows
just_one = int((questions_per_mp == 1).sum())       # MPs with exactly one row

min_count, min_mp, max_count, max_mp, less_than_ten, just_one

(1, 'Koh Poh Koon', 773, 'Louis Ng Kok Kwang', 23, 2)

In [17]:
# Per-year count of rows that have 'MCCY' among the askees' portfolios.
is_mccy_question = pq_df['askees_portfolios'].apply(lambda portfolios: 'MCCY' in portfolios)
(
    pq_df[is_mccy_question]
    .sitting_date
    .apply(lambda sitting_date: str(sitting_date)[:4])  # first four characters = the year
    .value_counts()
    .sort_index()
)

2011     19
2012    159
2013    115
2014     76
2015     85
2016     57
2017     80
2018     81
2019     77
2020     76
2021    139
2022     88
Name: sitting_date, dtype: int64

In [18]:
# Number of rows for each report_section value.
pq_df['report_section'].value_counts()

ReportSection.ORAL          6523
ReportSection.BUDGET        4628
ReportSection.WRITTEN       3993
ReportSection.WRITTEN_NA    3989
Name: report_section, dtype: int64

In [19]:
# Number of rows for each value of the is_pq flag.
pq_df['is_pq'].value_counts()

True     13717
False     5416
Name: is_pq, dtype: int64