run the other notebooks. should give two csv files. this one will combine them and clean up the data too.

In [1]:
from bs4 import BeautifulSoup as bs
import ast
import os
import os.path
from datetime import datetime
from enum import Enum
import re
import pandas as pd
import numpy as np


**for merging with mps.csv**

In [2]:
mp_df = pd.read_csv('mps.csv')
mp_df.Party = mp_df.Party.apply(ast.literal_eval)
mp_df.Parliaments = mp_df.Parliaments.apply(ast.literal_eval)
mps = dict(
    zip(mp_df.Name.apply(lambda x: x.replace('.', '').replace(',', '').lower()), # keys
    zip(mp_df.Name, mp_df.Party, mp_df.Parliaments))) # values
mp_names = list(mps.keys())

In [3]:
alr_matched = set() # honorific+names that have alr been matched to names so we don't spam the print
ministers_found = set() # minister titles that have alr been found (to be used for future searches in case of typos)

In [4]:
# for matching honorific+name in report to actual mp data.
# cannot simply remove honorific as the programmer doesn't have an exhaustive list
# of honorifics, and some are quite rare in everyday use (e.g. Inche Rahamat Bin Kenap).
def honorific_name_to_mp_data(honorific_name):
    honorific_name = honorific_name.replace('.','').replace(',','').lower().strip()
    
    # try the easy way first (find and remove honorific)
    honorific_match = re.match(r'(mr|mrs|ms|miss|mdm|dr|er dr|prof|assoc prof|er|asst prof|assoc prof dr|inche|encik)', honorific_name)
    if honorific_match:
        name = honorific_name[honorific_match.span()[1]+1:]
        if name in mps.keys():
            return mps[name]
        
        # seems quite common for them to write "asked" twice in the hansard proceedings
        last_asked = name.rfind(' asked')
        if last_asked and name[:last_asked] in mps.keys():
            return mps[name[:last_asked]]

        # slightly harder way (rearranging words)
        for mp_name in mp_names:
            mp_name_words = set(mp_name.split(' '))
            name_words = set(name.split(' '))
            if mp_name_words == name_words:
                if (honorific_name, mp_name) not in alr_matched:
                    #print(f'rearranging matched {honorific_name} to {mp_name}')
                    alr_matched.add((honorific_name, mp_name))
                return mps[mp_name]
            
            # for omission of chinese name
            if len(mp_name_words) - len(name_words) <= 2 and len(name_words) >= 2 and name_words.issubset(mp_name_words):
                if (honorific_name, mp_name) not in alr_matched:
                    print(f'allowing omitted words in name matched {honorific_name} to {mp_name}')
                    alr_matched.add((honorific_name, mp_name))
                return mps[mp_name]
                
    digit_match = re.search('\d+', honorific_name)
    if digit_match:
        # names shldn't have digits
        honorific_name = honorific_name[digit_match.span()[1]:]
        return honorific_name_to_mp_data(honorific_name)
        
    # the hard way (levenshtein)
    min_levenshtein = 99999
    min_ind = -1
    for i in range(len(mp_names)):
        l_dist = levenshtein(mp_names[i], honorific_name)
        if l_dist < min_levenshtein:
            min_levenshtein = l_dist
            min_ind = i
            
    if (honorific_name, mp_names[min_ind]) not in alr_matched:
        print(f'levenshtein matched {honorific_name} to {mp_names[min_ind]}')
        alr_matched.add((honorific_name, mp_names[min_ind]))
    return mps[mp_names[min_ind]]

# borrowed from: https://blog.paperspace.com/implementing-levenshtein-distance-word-autocomplete-autocorrect/
# we use levenshtein as it helps to protect against typos too, like the "asked asked" in:
# https://sprs.parl.gov.sg/search/sprs3topic?reportid=oral-answer-2822
def levenshtein(token1, token2):
    distances = np.zeros((len(token1) + 1, len(token2) + 1))
    for t1 in range(len(token1) + 1):
        distances[t1][0] = t1
    for t2 in range(len(token2) + 1):
        distances[0][t2] = t2
        
    a = 0
    b = 0
    c = 0
    
    for t1 in range(1, len(token1) + 1):
        for t2 in range(1, len(token2) + 1):
            if (token1[t1-1] == token2[t2-1]):
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                a = distances[t1][t2 - 1]
                b = distances[t1 - 1][t2]
                c = distances[t1 - 1][t2 - 1]
                
                if (a <= b and a <= c):
                    distances[t1][t2] = a + 1
                elif (b <= a and b <= c):
                    distances[t1][t2] = b + 1
                else:
                    distances[t1][t2] = c + 1

    return distances[len(token1)][len(token2)]

In [5]:
pqs = []

In [6]:
class ReportSection(Enum):
    WRITTEN = 'Written Answers to Questions'
    WRITTEN_NA = 'Written Answers to Questions for Oral Answer Not Answered by End of Question Time'
    ORAL = 'Oral Answers to Questions'

we assume that all pqs are prefaced with #. (non sprs) or # (sprs). ignore follow up qns since we are only interested in mapping mps to topics, and the follow up qns will always be from the same mp and on the same topic.

notes regarding minister titles:
* Minister for Culture, Community and Youth is the only minister title with a comma
* but there used to be Minister for Information, Communication and the Arts and Minister for Community Development, Youth and Sports
* no questions were ever directed to minister mentor

In [7]:
cap_word = r'[A-Z][a-z]+'
cap_words = f'({cap_word})( {cap_word})*'
mccy = '(Acting )?Minister for Culture, Community and Youth'
mica = '(Acting )?Minister for Information, Communications and the Arts'
mcdys = '(Acting )?Minister for Community Development, Youth( and|,) Sports'
micma = 'Minister-in-charge of Muslim Affairs'
minister_for_something = f'({cap_words} )?Minister (for|of) (the )?{cap_words}( and (the )?{cap_words})?'
something_minister = f'{cap_words} Minister'
one_minister_regex = f'(({mccy})|({mica})|({mcdys})|({micma})|({minister_for_something})|({something_minister}))'
minister_regex = re.compile(f'{one_minister_regex}( and (the )?{one_minister_regex})?') # can have multiple targets

def first_two_capitalized(words):
    return words[0][0].isupper() and words[1][0].isupper() and words[0][1] and words[0][1].islower()

def trim_off_non_pq_content_at_start(para):
    para_words = para.split(' ')
    if first_two_capitalized(para_words):
        return para
    
    # i assume honorific+name has at least two words capitalized and non-numbers
    while not first_two_capitalized(para_words):
        para_words = para_words[1:]
    
    return ' '.join(para_words)

def get_ministers_and_question(para):
    minister_match = re.search(minister_regex, para)
    if not minister_match:
        # report might've been in the wrong case; try to match to existing ministers
        minister_match = re.search(
            '(' + '|'.join(list(map(lambda s: s.lower().replace(' for ', '.{1,5}'), ministers_found))) + ')',
            para.lower()
        )
        if not minister_match:
            minister_match = re.search(
            '(' + '|'.join(list(map(lambda s: s.lower().replace(' for ', '(\s)*(for|of)(\s)*'), ministers_found))) + ')',
            para.lower()
        )
        if not minister_match:
            for existing_minister in ministers_found:
                if existing_minister.replace(' ', '') in para.replace(' ', ''):
                    minister_match = re.search('(\s)?'.join(c for c in existing_minister.replace(' ', '').lower()), para.lower())
                    break   
        min_levenshtein = 99999
        min_minister = None
        for existing_minister in ministers_found:
            dist = levenshtein(existing_minister, minister_match.group())
            if dist < min_levenshtein:
                min_levenshtein = dist
                min_minister = existing_minister
        
        print(f'found minister: {str(minister_match.group())}; matched to {min_minister}')
        askee = min_minister
    else:
        askee = para[:minister_match.span()[1]].replace(' of ', ' for ')
        ministers_found.add(askee)
    question = para[minister_match.span()[1]:].strip()
    if ' and Leader' in askee:
        askee = askee[:-11]
        print(f'removed "and leader" from {askee}')
    
    if not re.search('and (the )?Minister', askee):
        return (askee,), question   
    else:
        askee = askee.replace('and the Minister', 'and Minister')
        askees = askee.split(' and Minister')
        return (askees[0], 'Minister' + askees[1]), question

def soup_to_pqs(soup):
    stripped_strings = list(map(
        lambda text: re.sub(r'\s+', ' ', text),
        filter(
            lambda text: not re.match(r'Page:\s+\d+', text) and not re.match(r'Column:\s+\d+', text),
            [text for text in soup.stripped_strings])))
    parl_no = int(stripped_strings[3])
    sess_no = int(stripped_strings[5])
    vol_no = int(stripped_strings[7])
    sitting_no = int(stripped_strings[9])
    sitting_date = datetime.strptime(stripped_strings[11], '%d-%m-%Y')
    section_name_raw = stripped_strings[13].lower()
    
    if 'answered' in section_name_raw:
        section_name = ReportSection.WRITTEN_NA
    elif 'written' in section_name_raw:
        section_name = ReportSection.WRITTEN
    elif 'oral' in section_name_raw:
        section_name = ReportSection.ORAL
    else:
        raise f'no section name??? {section_name_raw}'
    
    title = stripped_strings[15]
    the_rest = stripped_strings[19:]
    while not re.match(r'\d\d?', the_rest[0]):
        the_rest = the_rest[1:]
        
    indices_corresponding_to_pqs = []
    indices_corresponding_to_speakers = []
    for i in range(len(the_rest)):
        if the_rest[i][0] == ':':
            indices_corresponding_to_speakers.append(i-1)
        elif re.match(r'\d\d?', the_rest[i]):
            indices_corresponding_to_pqs.append(i)
        
    pq_sublists = []
    while len(indices_corresponding_to_pqs) > 1:
        pq_sublists.append(the_rest[indices_corresponding_to_pqs[0]+1:indices_corresponding_to_pqs[1]])
        indices_corresponding_to_pqs = indices_corresponding_to_pqs[1:]
        
    pq_sublists.append(the_rest[indices_corresponding_to_pqs[0]+1:indices_corresponding_to_speakers[0]])
    
    speaking_sublists = []
    
    while len(indices_corresponding_to_speakers) > 1:
        speaking_sublists.append(the_rest[indices_corresponding_to_speakers[0]:indices_corresponding_to_speakers[1]])
        indices_corresponding_to_speakers = indices_corresponding_to_speakers[1:]
    
    speaking_sublists.append(the_rest[indices_corresponding_to_speakers[0]:])
    
    pqs = list(map(lambda sl: ' '.join(sl)
    
    
    import pdb
    pdb.set_trace()
    
    para = pq_para[0]
    para = trim_off_non_pq_content_at_start(para) # sometimes we end up mistaking other numbers in the text as being the pq numbers. so we deal w that here.
    asker_honorific_name, para = para.split(' asked the ', 1)
    
    ministers, question = get_ministers_and_question(para)
    if question[0] == ',':
        question = question[1:].strip()
    asker, asker_party, asker_parls = honorific_name_to_mp_data(asker_honorific_name.strip())
    pqs.append([asker, asker_party, asker_parls, ministers, question, *pq_para[1:]])


In [8]:
for file in [os.listdir('scraped_content')[0], os.listdir('scraped_content')[-1]]:
    filepath = os.path.join('scraped_content', file)
    with open(filepath, 'r') as f:
        soup = bs(f, 'html.parser')
    try:
        soup_to_pqs(soup)
    except Exception as e:
        print(str(e))
        continue

> [1;32mc:\users\thisi\appdata\local\temp\ipykernel_6524\607813641.py[0m(122)[0;36msoup_to_pqs[1;34m()[0m



ipdb>  pq_sublists


[['Mr Louis Ng Kok Kwang', 'asked the', 'Minister for Transport (a) what are the main factors taken into consideration when deciding on the possible underground alignments in the vicinity of the Central Catchment Nature Reserve (CCNR) for the Cross Island Line; (b) in view of the moderate environmental impact on the nature reserve for the alignment option that cuts beneath the CCNR, whether the Ministry will consider the alternative alignment along Lornie Road which will allow the MRT line to serve more residents and commuters in that vicinity and also result in the protection of our nature reserve and primary forest; and (c) whether the Environmental Impact Assessment report that was recently published can be made available for viewing online.']]


ipdb>  speaking_sublists


[['The Minister for Transport (Mr Khaw Boon Wan)', ': Mdm Speaker, the Cross Island Line (CRL) will be an important part of our future MRT network. It will link east to west, from Changi to Jurong, covering more than 50 km with about 30 stations. The exact alignment is being studied. Our preliminary estimate is that commuters from residential areas like Loyang, Pasir Ris, Hougang, Ang Mo Kio, Sin Ming, Bukit Timah, Clementi and West Coast will make at least 600,000 trips on the CRL every day. This will place the CRL higher, in terms of capacity and usage, compared, for example, to the North East Line (NEL). The CRL will also significantly enhance our network resilience as commuters will have many more routing options with the CRL connecting to other lines.', 'What this means is that – nearly half of the 30-plus new stations will be interchange stations and that means that every other station will be an exchange station where you can switch to another line. This will significantly enhan

ipdb>  continue


name 'pq_para' is not defined
> [1;32mc:\users\thisi\appdata\local\temp\ipykernel_6524\607813641.py[0m(122)[0;36msoup_to_pqs[1;34m()[0m



ipdb>  speaker_sublists


*** NameError: name 'speaker_sublists' is not defined


ipdb>  pq_sublsits


*** NameError: name 'pq_sublsits' is not defined


ipdb>  pq_sublists


[['Mr Zainal Sapari', "asked the Acting Minister for Manpower (a) what is the current number of Singaporeans earning below $1,700 per month who opted out of the CPF Dependants' Protection Scheme; (b) what are the reasons for their opting out of the scheme; and (c) whether the Government can consider a scheme to waive or reduce the premiums for workers earning below $1,700 per month."], ['Mr Patrick Tay Teck Guan', "asked the Acting Minister for Manpower (a) what is the current number of CPF account holders, with a breakdown in terms of age, gender and salary range, who are (i) on the Dependants' Protection Scheme (DPS); and (ii) not on DPS because of their opt-out; and (b) what has been the claims experience in the last five years."], ['Mr Seng Han Thong', "asked the Acting Minister for Manpower (a) how many CPF members have opted out of the Dependants' Protection Scheme (DPS) since its inception and what is the income profile of those who opted out; (b) whether there are plans to revi

ipdb>  exit()





In [9]:
pq_df = pd.DataFrame(pqs, columns=['asker_name', 'asker_party', 'asker_parliaments', 'askee', 'question', 'sitting_date', 'parliament_no', 'report_section'])
pq_df.sitting_date = pq_df.sitting_date.apply(lambda dt: dt if dt.find('00:00:00') == -1 else dt[:dt.find('00:00:00')-1])
pq_df.sitting_date = pq_df.sitting_date.apply(lambda dt: dt if dt.find('0:00') == -1 else dt[:dt.find('0:00')-1])
pq_df.asker_party = pq_df.asker_party.apply(lambda x: x[0])
pq_df

Unnamed: 0,asker_name,asker_party,asker_parliaments,askee,question,sitting_date,parliament_no,report_section


In [10]:
# pq_df.to_csv('pqs.csv', index=False)

In [11]:
assert all(pq_df.parliament_no < 15) and all(pq_df.parliament_no >= 12)
assert all(map(lambda x: not x[0].isupper(), pq_df.question.values)) 
assert all(map(lambda x: not x[:3] == 'and', pq_df.question.values)) 
print(set(pq_df.asker_party.values))
print()
print(set([askee for sublist in pq_df.askee for askee in sublist]))

set()

set()


In [12]:
len(set(pq_df.asker_name.values))

0

In [13]:
min_count = 999
min_mp = None
max_count = 0
max_mp = None
less_than_ten = 0
for name in pq_df.asker_name.values:
    count_here = pq_df[pq_df.asker_name == name]['asker_name'].count()
    if count_here < min_count:
        min_count = count_here
        min_mp = name
    if count_here > max_count:
        max_count = count_here
        max_mp = name
    if count_here < 10:
        less_than_ten += 1
        
min_count, min_mp, max_count, max_mp, less_than_ten

(999, None, 0, None, 0)