run the mp scraper one and the actual hansard scraper one. then you'll have some html files of hansard and a csv file of mps. then run this nb.

if u wanna find code that deals w edge cases, find comments that start with "edge case". examples are given. i started writing those comments q l8 tho so i don't guarantee that i commented on all of them. 

**the csv file you get from this is delimited by '|'. if reading with code make sure u account for that. if opening in excel, follow these instrns (https://support.affinity.co/hc/en-us/articles/360044453711-How-to-open-CSV-files-with-the-correct-delimiter-separator).**

In [1]:
from bs4 import BeautifulSoup as bs
import ast
import os
import os.path
from datetime import datetime
from enum import Enum
import re
import pandas as pd
import numpy as np
from pdb import set_trace as st

**for merging with mps.csv**

In [2]:
mp_df = pd.read_csv('mps.csv')
mp_df.Parliaments = mp_df.Parliaments.apply(ast.literal_eval)
mps = dict(
    zip(mp_df.Name.apply(lambda x: x.replace('.', '').replace(',', '').lower()), # keys
    zip(mp_df.Name, mp_df.Party, mp_df.Parliaments))) # values
mp_names = list(mps.keys())

In [3]:
alr_matched = set() # honorific+names that have alr been matched to names so we don't spam the print
ministers_found = set() # minister titles that have alr been found (to be used for future searches in case of typos)

In [4]:
honorific_regex = r'(mrs|mr|ms|miss|mdm|er dr|er|assoc prof dr|assoc prof|asst prof|prof|dr|inche|encik)'

# for matching honorific+name in report to actual mp data.
# cannot simply remove honorific as the programmer doesn't have an exhaustive list
# of honorifics, and some are quite rare in everyday use (e.g. Inche Rahamat Bin Kenap).
def honorific_name_to_mp_data(honorific_name):
    honorific_name = honorific_name.replace('.','').replace(',','').replace(':','').lower().strip()
    honorific_name = re.sub('\(.+\)', '', honorific_name)
    
    # try the easy way first (find and remove honorific)
    honorific_match = re.match(honorific_regex, honorific_name)
    if honorific_match:
        name = honorific_name[honorific_match.span()[1]+1:]
        if name in mps.keys():
            return mps[name]
        
        # seems quite common for them to write "asked" twice in the hansard proceedings
        last_asked = name.rfind(' asked')
        if last_asked and name[:last_asked] in mps.keys():
            return mps[name[:last_asked]]

        # slightly harder way (rearranging words)
        for mp_name in mp_names:
            mp_name_words = set(mp_name.split(' '))
            name_words = set(name.split(' '))
            if mp_name_words == name_words:
                if (honorific_name, mp_name) not in alr_matched:
                    #print(f'rearranging matched {honorific_name} to {mp_name}')
                    alr_matched.add((honorific_name, mp_name))
                return mps[mp_name]
            
            # for omission of chinese name
            if len(mp_name_words) - len(name_words) <= 2 and len(name_words) >= 2 and name_words.issubset(mp_name_words):
                if (honorific_name, mp_name) not in alr_matched:
                    print(f'allowing omitted words in name matched {honorific_name} to {mp_name}')
                    alr_matched.add((honorific_name, mp_name))
                return mps[mp_name]
                
    digit_match = re.search('\d+', honorific_name)
    if digit_match:
        # names shldn't have digits
        honorific_name = honorific_name[digit_match.span()[1]:]
        return honorific_name_to_mp_data(honorific_name)
        
    # the hard way (levenshtein)
    closest_name = levenshtein_best_match(honorific_name, mp_names)
    
    if (honorific_name, closest_name) not in alr_matched:
        print(f'levenshtein matched {honorific_name} to {closest_name}')
        alr_matched.add((honorific_name, closest_name))
    return mps[closest_name]

def levenshtein_best_match(value, options):
    min_levenshtein = 99999
    min_val = None
    for option in options:
        l_dist = levenshtein(option, value)
        if l_dist < min_levenshtein:
            min_levenshtein = l_dist
            min_val = option
    return min_val
            

# borrowed from: https://blog.paperspace.com/implementing-levenshtein-distance-word-autocomplete-autocorrect/
# we use levenshtein as it helps to protect against typos too, like the "asked asked" in:
# https://sprs.parl.gov.sg/search/sprs3topic?reportid=oral-answer-2822
def levenshtein(token1, token2):
    distances = np.zeros((len(token1) + 1, len(token2) + 1))
    for t1 in range(len(token1) + 1):
        distances[t1][0] = t1
    for t2 in range(len(token2) + 1):
        distances[0][t2] = t2
        
    a = 0
    b = 0
    c = 0
    
    for t1 in range(1, len(token1) + 1):
        for t2 in range(1, len(token2) + 1):
            if (token1[t1-1] == token2[t2-1]):
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                a = distances[t1][t2 - 1]
                b = distances[t1 - 1][t2]
                c = distances[t1 - 1][t2 - 1]
                
                if (a <= b and a <= c):
                    distances[t1][t2] = a + 1
                elif (b <= a and b <= c):
                    distances[t1][t2] = b + 1
                else:
                    distances[t1][t2] = c + 1

    return distances[len(token1)][len(token2)]

In [5]:
pqs = []

In [6]:
class ReportSection(Enum):
    WRITTEN = 'Written Answers to Questions'
    WRITTEN_NA = 'Written Answers to Questions for Oral Answer Not Answered by End of Question Time'
    ORAL = 'Oral Answers to Questions'
    BUDGET = 'Budget'

we assume that all pqs are prefaced with #. (non sprs) or # (sprs). ignore follow up qns since we are only interested in mapping mps to topics, and the follow up qns will always be from the same mp and on the same topic.

notes regarding minister titles:
* Minister for Culture, Community and Youth is the only minister title with a comma
* but there used to be Minister for Information, Communication and the Arts and Minister for Community Development, Youth and Sports
* no questions were ever directed to minister mentor

In [7]:
cap_word = r'[A-Z][a-z]+'
cap_words = f'({cap_word})( {cap_word})*'
mccy = '(Acting )?Minister for Culture, Community and Youth'
mica = '(Acting )?Minister for Information, Communications and the Arts'
mcdys = '(Acting )?Minister for Community Development, Youth( and|,) Sports'
micma = 'Minister-in-charge of Muslim Affairs'
minister_for_something = f'({cap_words} )?Minister( of State)? (for|of) (the )?{cap_words}( and (the )?{cap_words})?( \({cap_words}( and {cap_words})?\))?'
something_minister = f'{cap_words} Minister'
one_minister_regex = f'(({mccy})|({mica})|({mcdys})|({micma})|({minister_for_something})|({something_minister}))'
minister_regex = re.compile(f'{one_minister_regex}( and (the )?{one_minister_regex})?') # can have multiple targets

def first_two_capitalized(words):
    return words[0][0].isupper() and words[1][0].isupper() and words[0][1] and words[0][1].islower()

def trim_off_non_pq_content_at_start(para):
    para_words = para.split(' ')
    if first_two_capitalized(para_words):
        return para
    
    # i assume honorific+name has at least two words capitalized and non-numbers
    while not first_two_capitalized(para_words):
        para_words = para_words[1:]
    
    return ' '.join(para_words)

# extracts the first substring which is a substring of ministers
def extract_first_ministers(para):
    minister_match = re.search(minister_regex, para)
    if not minister_match:
        # report might've been in the wrong case; try to match to existing ministers
        minister_match = re.search(
            '(' + '|'.join(list(map(lambda s: s.lower().replace(' for ', '.{1,5}'), ministers_found))) + ')',
            para.lower()
        )
        if not minister_match:
            minister_match = re.search(
            '(' + '|'.join(list(map(lambda s: s.lower().replace(' for ', '(\s)*(for|of)(\s)*'), ministers_found))) + ')',
            para.lower()
        )
        if not minister_match:
            for existing_minister in ministers_found:
                if existing_minister.replace(' ', '') in para.replace(' ', ''):
                    minister_match = re.search('(\s)?'.join(c for c in existing_minister.replace(' ', '').lower()), para.lower())
                    break
        minister = levenshtein_best_match(minister_match.group(), ministers_found)
        print(f'found minister: {str(minister_match.group())}; matched to {minister}')
    else:
        minister = para[:minister_match.span()[1]].replace(' of ', ' for ')
        ministers_found.add(minister)

    para = para.replace(minister_match.group(), '').strip()

    if ' and Leader' in minister:
        minister = minister[:-11]

    if minister[:4] == 'The ':
        minister = minister[4:]
        
    return minister, para

def parse_speaker_title_honorific_name(speaker):
    honorific_bracket_regex = f'\({honorific_regex} .+\)'
    honorific_bracket_search = re.search(honorific_bracket_regex, speaker.lower())
    honorific_name = speaker[honorific_bracket_search.span()[0]+1 : honorific_bracket_search.span()[1]-1].strip()
    responder_title = re.sub(honorific_bracket_regex, '', speaker, flags=re.IGNORECASE).replace(' of ', ' for ').strip()
    if responder_title[:4] == 'The ':
        responder_title = responder_title[4:].strip()
    return honorific_name, responder_title

def get_ministers_and_question(para):
    askee, question = extract_first_ministers(para)
    
    if not re.search('and (the )?Minister', askee):
        return (askee,), question   
    else:
        askee = askee.replace('and the Minister', 'and Minister')
        askees = askee.split(' and Minister')
        return (askees[0], 'Minister' + askees[1]), question

def get_section_name(section_name_raw):
    if 'answered' in section_name_raw:
        return ReportSection.WRITTEN_NA
    elif 'written' in section_name_raw:
        return ReportSection.WRITTEN
    elif 'oral' in section_name_raw:
        return ReportSection.ORAL
    elif 'budget' in section_name_raw:
        return ReportSection.BUDGET
    else:
        raise f'no section name??? {section_name_raw}'

ministry_keywords_dict = {
    'MCCY': ['Muslim Affairs', 'Culture', 'Youth', 'Sports','Community'], 
    'MOT':['Transport'], 
    'MINDEF':['Defence'],
    'MinLaw':['Law'], 
    'MTI':['Trade and Industry'], 
    'MOM':['Manpower'], 
    'MND':['National Development'], 
    'MHA':['Home Affairs'], 
    'MOH':['Health'], 
    'MFA':['Foreign Affairs'], 
    'MSF':['Social and Family Development'], 
    'MOF':['Finance'],  
    'MOE':['Education'], 
    'MSE':['the Environment and Water Resources','Sustainability and the Environment'],
    'MCI':['Information','Communications and Information','Information, Communications and the Arts'],
    'PMO':['Coordinating Minister for National Security','Prime Minister'] 
    #"Deputy Prime Minister" dropping this for now because DPM may also have specific portfolios, rather than having the issue fall under PMO
    #e.g. HSK holding MOF portfolio when he was DPM
}

def identify_portfolios(titles):
    return tuple(set(map(lambda title: identify_portfolio(title), titles)))

def identify_portfolio(title): 
    for k, v in ministry_keywords_dict.items():
        words_re = re.compile('|'.join(v))
        if words_re.search(title):
            return k
    print(f'no portfolio? {title}')
    
def soup_to_pqs(soup, file):
    # print(file)
    # seems to happen quite often sadly
    if soup.get_text() == '':
        print(f'empty text {file}')
        return
    
    stripped_strings = list(map(
        lambda text: re.sub(r'\s+', ' ', text),
        filter(
            lambda text: not re.match(r'Page:\s+\d+', text) and not re.match(r'Column:\s+\d+', text),
            [text for text in soup.stripped_strings])))
    if len(stripped_strings) < 20: # the table at the top of the page alr accounts for most of this.
        return
    parl_no = int(stripped_strings[3])
    sess_no = int(stripped_strings[5])
    vol_no = int(stripped_strings[7])
    sitting_no = int(stripped_strings[9])
    sitting_date = datetime.strptime(stripped_strings[11], '%d-%m-%Y')
    section_name = get_section_name(stripped_strings[13].lower())
    title = stripped_strings[15]
    the_rest = stripped_strings[19:]

    # trim off useless preamble stuff
    while len(the_rest) > 0 and (
        (section_name == ReportSection.BUDGET and not re.match(honorific_regex, the_rest[0].lower()) and the_rest[0][:4] != 'The ') or
        (section_name != ReportSection.BUDGET and not re.match(r'\d\d?', the_rest[0]))):
        the_rest = the_rest[1:]
        
    if len(the_rest) == 0:
        return
    
    if section_name == ReportSection.BUDGET:
        st()
        return
        
    indices_corresponding_to_pqs = []
    indices_corresponding_to_speakers = []
    maybe_more_pqs = True
    for i in range(len(the_rest)):
        if the_rest[i][0] == ':' or (i-1 >= 0 and the_rest[i-1][-1] == ':' and the_rest[i-1] in list(map(lambda s: s.get_text().strip(), soup.select('strong')))): # edge case: (sprs3topic_reportid=oral-answer-2239.html), Ong Ye Kung's first response has the colon bolded, whereas it's normally not bolded. this throws us off. extra check in the condition is to resolve this.
            actual_index_to_append = i-1
            # edge case: (sprs3topic_reportid=oral-answer-1632.html), "The Senior Minister of State for Home Affairs (Mr Desmond Lee) (for the Minister for Home Affairs)" is broken up into multiple entries for some reason. this loop is to ensure the full name and title gets saved.
            while the_rest[actual_index_to_append][0] == '(' and the_rest[actual_index_to_append][-1] == ')':
                actual_index_to_append -= 1
            # edge case: (sprs3topic_reportid=oral-answer-2760.html), "The Minister of State for Home Affairs (Mr Desmond Tan) (for the  Minister for Home Affairs)" is also cut in the middle for some reason zzz
            bracket_count = the_rest[actual_index_to_append].count('(') - the_rest[actual_index_to_append].count(')')
            while bracket_count != 0:
                actual_index_to_append -= 1
                if actual_index_to_append == 0:
                    return
                bracket_count += the_rest[actual_index_to_append].count('(') - the_rest[actual_index_to_append].count(')')
            indices_corresponding_to_speakers.append(actual_index_to_append)
            maybe_more_pqs = False
        elif re.match(r'\d\d?', the_rest[i]) and maybe_more_pqs:
            indices_corresponding_to_pqs.append(i)
            
    if len(indices_corresponding_to_pqs) == 0:
        print(f'no pqs? {file}')
        return
    if len(indices_corresponding_to_speakers) == 0:
        print(f'no speakers? {file}')
        return

    pq_sublists = []
    pq_qn_indices = []
    while len(indices_corresponding_to_pqs) > 1:
        pq_qn_indices.append(the_rest[indices_corresponding_to_pqs[0]])
        pq_sublists.append(the_rest[indices_corresponding_to_pqs[0]+1:indices_corresponding_to_pqs[1]])
        indices_corresponding_to_pqs = indices_corresponding_to_pqs[1:]
        
    pq_qn_indices.append(the_rest[indices_corresponding_to_pqs[0]])
    pq_sublists.append(the_rest[indices_corresponding_to_pqs[0]+1:indices_corresponding_to_speakers[0]])
    
    speaking_sublists = []
    
    while len(indices_corresponding_to_speakers) > 1:
        speaking_sublists.append(the_rest[indices_corresponding_to_speakers[0]:indices_corresponding_to_speakers[1]])
        indices_corresponding_to_speakers = indices_corresponding_to_speakers[1:]
    
    speaking_sublists.append(the_rest[indices_corresponding_to_speakers[0]:])
    
    new_pqs = []
    new_pq_indices = []
    
    for pq_i, sl in zip(pq_qn_indices, pq_sublists):
        pq_para = ' '.join(sl)
        
        #pq_para = trim_off_non_pq_content_at_start(pq_para) # sometimes we end up mistaking other numbers in the text as being the pq numbers. so we deal w that here.
        if ' asked the ' not in pq_para:
            continue
        asker_honorific_name, pq_para = pq_para.split(' asked the ', 1)    
        ministers, question = get_ministers_and_question(pq_para)

        if question[0] == ',':
            question = question[1:].strip()
        if len(asker_honorific_name.strip()) == 0:
            return
        asker, asker_party, asker_parls = honorific_name_to_mp_data(asker_honorific_name.strip())
        new_pq_indices.append(int(pq_i))
        new_pqs.append([asker, asker_party, asker_parls, ministers, question, parl_no, sess_no, vol_no, sitting_no, sitting_date, section_name, title])
    
    # find out what's said after the pqs have been asked, and who says it
    speakers_and_spokens = []
    for sl in speaking_sublists:
        text = ' '.join(sl)
        split_result = text.split(':', 1)
        if len(split_result) < 2:
            return
        speaker, spoken = split_result
        speaker = speaker.strip()
        spoken = spoken.strip()
        while len(spoken) > 0 and not spoken[0].isalpha():
            spoken = spoken[1:].strip()
        if len(spoken) == 0: # edge case (sprs3topic_reportid=written-answer-4142.html). sometimes people are just lost for words i guess.
            return
        if spoken[:len('Question No')] == 'Question No':
            continue
        speaker = re.sub(f'\(for .*\)', '', speaker)
        speaker = re.sub(f'\(on behalf of .*\)', '', speaker)
        speakers_and_spokens.append([speaker, spoken])

    # speaker never says anyth useful
    while 'speaker' in speakers_and_spokens[0][0].lower():
        speakers_and_spokens = speakers_and_spokens[1:]
    while 'speaker' in speakers_and_spokens[-1][0].lower() or 'leader' in speakers_and_spokens[-1][0].lower():
        speakers_and_spokens = speakers_and_spokens[:-1]
    
    # if minister title is provided then we take. else, just take the name.
    first_responder, first_response = speakers_and_spokens[0]
    if 'Minister' in first_responder:
        first_responder_honorific_name, first_responder_title = parse_speaker_title_honorific_name(first_responder)
        first_responder_name = honorific_name_to_mp_data(first_responder_honorific_name)[0]
    else:
        first_responder_title = ''
        first_responder_name = first_responder
    
    speakers_and_spokens = speakers_and_spokens[1:]
    
    # if there's more than 1 pq, the responder will ask the speaker for permission to hit all the qns at once,
    # and the speaker will grant permission. and then there may be a bit more admin back and forth.
    # we wanna remove that.
    if len(new_pqs) > 1:
        while len(speakers_and_spokens) > 0 and 'speaker' in speakers_and_spokens[0][0].lower():
            first_response = speakers_and_spokens[1][1]
            speakers_and_spokens = speakers_and_spokens[2:]

    # the new pqs now have their responses ready, we can save them.
    for new_pq in new_pqs:
        pqs.append(new_pq + [first_responder_name, first_responder_title, first_response, True])
        
    # pqs settled. now move on to followup (sqs)
    
    if section_name in (ReportSection.WRITTEN, ReportSection.WRITTEN_NA):
        return # there are no sqs in written responses
    
    new_sqs = []
    responder_names_to_titles = dict()
    responder_names_to_titles[first_responder_name] = first_responder_title

    for speaker, spoken in speakers_and_spokens:
        if 'speaker' in speaker.lower() or 'leader' in speaker.lower():
            continue # speaker says nothing useful
        
        if '(' in speaker: # if there's a brack8 then we've never seen this person speak before
            honorific_bracket_search = re.search(f'\({honorific_regex} .+\)', speaker.lower())
            if honorific_bracket_search: # honorific and name occur inside brackets for responders, outside for askers
                is_response = True
                honorific_name, responder_title = parse_speaker_title_honorific_name(speaker)
                speaker_data = honorific_name_to_mp_data(honorific_name)
                responder_name = speaker_data[0]
                responder_names_to_titles[responder_name] = responder_title
            else: # if there's no honorific inside brack8, then honorific must be outside brack8. this only happens for asker.
                is_response = False
                honorific_name = re.sub('\(.+\)', '', speaker)
                speaker_data = honorific_name_to_mp_data(honorific_name)
        else: # if there's no brack8 then we've seen the person speak before
            speaker_data = honorific_name_to_mp_data(speaker)
            if speaker_data[0] not in responder_names_to_titles.keys(): # check whether the person is a known responder
                is_response = False
                asker_name = speaker_data[0]
            else:
                is_response = True
                responder_name = speaker_data[0]
                responder_title = responder_names_to_titles[responder_name]
    
        if is_response:
            for new_sq in new_sqs:
                new_sq = new_sq + [responder_name, responder_title, spoken, False]
                new_sq[3] = (responder_title,) # backfill the missing askee title
                pqs.append(new_sq)
            new_sqs = []
        else:
            new_sqs.append([
                speaker_data[0],
                speaker_data[1],
                speaker_data[2],
                None, # instead of ner to find out who the target of the qn is, we just backfill it l8r when we get the response
                spoken,
                parl_no,
                sess_no,
                vol_no,
                sitting_no,
                sitting_date,
                section_name,
                title
            ])
            
    return

In [8]:
pqs = []
files_and_exceptions = []
files_to_run_through = os.listdir('scraped_content')[:25]

for i in range(len(files_to_run_through)):
    file = files_to_run_through[i]
    print(file)
    filepath = os.path.join('scraped_content', file)
    if os.stat(filepath).st_size < 200000: # the html elements alr take up more than 300kb, so if a file is this small then someth's wrong
        continue
    try:
        with open(filepath, 'r', encoding='utf-8-sig', errors='ignore') as f:
            soup = bs(f, 'html.parser')
        soup_to_pqs(soup, file)
    except Exception as e:
        #import pdb
        #pdb.set_trace()
        raise e
        #print(f'exception: {str(e)} - {file}')
        #files_and_exceptions.append([file, e])
        #continue
        
    if i%25==0:
        print(f'{i}/{len(files_to_run_through)}')
        
print('=====DONE==================================================')
print(f'total pqs: {len(pqs)}')
print(f'total files: {len(files_to_run_through)}')
print(f'avg pqs per file: {len(pqs)/len(files_to_run_through)}')
print(f'files with exceptions: {files_and_exceptions}')

sprs3topic_reportid=budget-1037.html
> [1;32mc:\users\thisi\appdata\local\temp\ipykernel_1936\1403815685.py[0m(157)[0;36msoup_to_pqs[1;34m()[0m



ipdb>  the_rest[0]


'The Deputy Prime Minister and Minister for Finance (Mr Tharman Shanmugaratnam)'


ipdb>  c


0/25
sprs3topic_reportid=budget-1038.html
sprs3topic_reportid=budget-1039.html
> [1;32mc:\users\thisi\appdata\local\temp\ipykernel_1936\1403815685.py[0m(157)[0;36msoup_to_pqs[1;34m()[0m



ipdb>  the_rest[0]


'Er Dr Lee Bee Wah'


ipdb>  c


sprs3topic_reportid=budget-1040.html
> [1;32mc:\users\thisi\appdata\local\temp\ipykernel_1936\1403815685.py[0m(157)[0;36msoup_to_pqs[1;34m()[0m



ipdb>  the_rest[0]


'Ms Tin Pei Ling (MacPherson)'


ipdb>  the_rest[1]


': Mdm Chair, I beg to move, "That the total sum to be allocated for Head I of the Estimates be reduced by $100."'


ipdb>  c


sprs3topic_reportid=budget-1041.html
> [1;32mc:\users\thisi\appdata\local\temp\ipykernel_1936\1403815685.py[0m(157)[0;36msoup_to_pqs[1;34m()[0m



ipdb>  the_rest[0]


'The Minister for Social and Family Development (Mr Tan Chuan-Jin)'


ipdb>  c


sprs3topic_reportid=budget-108.html
> [1;32mc:\users\thisi\appdata\local\temp\ipykernel_1936\1403815685.py[0m(157)[0;36msoup_to_pqs[1;34m()[0m



ipdb>  the_rest[0]


'The Deputy Prime Minister and Minister for Finance (Mr Tharman Shanmugaratnam)'


ipdb>  c


sprs3topic_reportid=budget-1081.html
> [1;32mc:\users\thisi\appdata\local\temp\ipykernel_1936\1403815685.py[0m(157)[0;36msoup_to_pqs[1;34m()[0m



ipdb>  the_rest


['The Minister for Finance (Mr Heng Swee Keat)', ': Mr Speaker, I beg to move, "That Parliament approves the financial policy of the Government for the financial year 1 April 2019 to 31 March 2020."', 'This year marks 200 years since Sir Stamford Raffles landed in Singapore. Archaeological finds and records show that Singapore’s history stretches back at least 700 years, serving as a trading emporium in the region.', 'Eighteen nineteen was a key turning point in Singapore’s development. The British decision to declare Singapore a free port plugged us into an emerging network of global trade. This, and subsequent developments, transformed Singapore into a global node.', 'In our bicentennial year, let us reflect on the twists and turns in our history, so as to chart a path forward for an even better future for our people.', 'Today, we are in a different phase of globalisation, with new forces reshaping the global environment. In last year’s Budget Statement, I mentioned three major shift

ipdb>  the_rest


['The Minister for Finance (Mr Heng Swee Keat)', ': Mr Speaker, I beg to move, "That Parliament approves the financial policy of the Government for the financial year 1 April 2019 to 31 March 2020."', 'This year marks 200 years since Sir Stamford Raffles landed in Singapore. Archaeological finds and records show that Singapore’s history stretches back at least 700 years, serving as a trading emporium in the region.', 'Eighteen nineteen was a key turning point in Singapore’s development. The British decision to declare Singapore a free port plugged us into an emerging network of global trade. This, and subsequent developments, transformed Singapore into a global node.', 'In our bicentennial year, let us reflect on the twists and turns in our history, so as to chart a path forward for an even better future for our people.', 'Today, we are in a different phase of globalisation, with new forces reshaping the global environment. In last year’s Budget Statement, I mentioned three major shift

ipdb>  c


sprs3topic_reportid=budget-1099.html
> [1;32mc:\users\thisi\appdata\local\temp\ipykernel_1936\1403815685.py[0m(157)[0;36msoup_to_pqs[1;34m()[0m



ipdb>  the_rest[0


*** SyntaxError: '[' was never closed


ipdb>  the_rest[0]


'Mr Liang Eng Hwa (Holland-Bukit Timah)'


ipdb>  the_rest[1]


': Mr Speaker, Sir, I would like to touch on four areas of Budget 2019; namely, Growing our Enterprises; Supporting our Workers; Healthcare Support; and Fiscal Management.'


ipdb>  exit()


BdbQuit: 

In [None]:
pq_df = pd.DataFrame(pqs, columns=['asker_name', 'asker_party', 'asker_parliaments', 'askees', 'question', 'parliament_no', 'session_no', 'volume_no', 'sitting_no', 'sitting_date', 'report_section', 'title', 'responder_name', 'responder_title', 'response', 'is_pq'])
pq_df.insert(4, 'askees_portfolios', pq_df.askees.apply(identify_portfolios))
pq_df.insert(13, 'responder_portfolio', pq_df.responder_title.apply(identify_portfolio))
pq_df

In [None]:
pq_df.to_csv('pqs.csv', index=False, sep='|') # sep=',' gives formatting issues 

In [None]:
assert all(pq_df.parliament_no < 15) and all(pq_df.parliament_no >= 12)
# assert all(map(lambda x: not x[0].isupper(), pq_df.question.values)) 
assert all(map(lambda x: not x[:3] == 'and', pq_df.question.values)) 
parties_set = set(pq_df.asker_party.values)
print(f'parties: {parties_set} (len: {len(parties_set)})')
print()
askee_set = set([askees for sublist in pq_df.askees for askees in sublist])
print(f'askees: {askee_set} (len: {len(askee_set)})')

In [None]:
len(set(pq_df.asker_name.values))

In [None]:
min_count = 999
min_mp = None
max_count = 0
max_mp = None
less_than_ten = 0
just_one = 0
for name in set(pq_df.asker_name.values):
    count_here = pq_df[pq_df.asker_name == name]['asker_name'].count()
    if count_here < min_count:
        min_count = count_here
        min_mp = name
    if count_here > max_count:
        max_count = count_here
        max_mp = name
    if count_here < 10:
        less_than_ten += 1
    if count_here == 1:
        just_one += 1
        
min_count, min_mp, max_count, max_mp, less_than_ten, just_one