run the mp scraper one and the actual hansard scraper one. then you'll have some html files of hansard and a csv file of mps. then run this nb.

if u wanna find code that deals w edge cases, find comments that start with "edge case". examples are given. i started writing those comments q l8 tho so i don't guarantee that i commented on all of them. 

**the csv file you get from this is delimited by '|'. if reading with code make sure u account for that. if opening in excel, follow these instrns (https://support.affinity.co/hc/en-us/articles/360044453711-How-to-open-CSV-files-with-the-correct-delimiter-separator).**

In [1]:
from bs4 import BeautifulSoup as bs
import ast
import os
import os.path
from datetime import datetime
from enum import Enum
import re
import pandas as pd
import numpy as np
from pdb import set_trace as st

**for merging with mps.csv**

In [2]:
mp_df = pd.read_csv('mps.csv')
mp_df.Party = mp_df.Party.apply(ast.literal_eval)
mp_df.Parliaments = mp_df.Parliaments.apply(ast.literal_eval)
mps = dict(
    zip(mp_df.Name.apply(lambda x: x.replace('.', '').replace(',', '').lower()), # keys
    zip(mp_df.Name, mp_df.Party, mp_df.Parliaments))) # values
mp_names = list(mps.keys())

In [3]:
alr_matched = set() # honorific+names that have alr been matched to names so we don't spam the print
ministers_found = set() # minister titles that have alr been found (to be used for future searches in case of typos)

In [4]:
honorific_regex = r'(mr|mrs|ms|miss|mdm|dr|er dr|prof|assoc prof|er|asst prof|assoc prof dr|inche|encik)'

# for matching honorific+name in report to actual mp data.
# cannot simply remove honorific as the programmer doesn't have an exhaustive list
# of honorifics, and some are quite rare in everyday use (e.g. Inche Rahamat Bin Kenap).
def honorific_name_to_mp_data(honorific_name):
    honorific_name = honorific_name.replace('.','').replace(',','').replace(':','').lower().strip()
    honorific_name = re.sub('\(.+\)', '', honorific_name)
    
    # try the easy way first (find and remove honorific)
    honorific_match = re.match(honorific_regex, honorific_name)
    if honorific_match:
        name = honorific_name[honorific_match.span()[1]+1:]
        if name in mps.keys():
            return mps[name]
        
        # seems quite common for them to write "asked" twice in the hansard proceedings
        last_asked = name.rfind(' asked')
        if last_asked and name[:last_asked] in mps.keys():
            return mps[name[:last_asked]]

        # slightly harder way (rearranging words)
        for mp_name in mp_names:
            mp_name_words = set(mp_name.split(' '))
            name_words = set(name.split(' '))
            if mp_name_words == name_words:
                if (honorific_name, mp_name) not in alr_matched:
                    #print(f'rearranging matched {honorific_name} to {mp_name}')
                    alr_matched.add((honorific_name, mp_name))
                return mps[mp_name]
            
            # for omission of chinese name
            if len(mp_name_words) - len(name_words) <= 2 and len(name_words) >= 2 and name_words.issubset(mp_name_words):
                if (honorific_name, mp_name) not in alr_matched:
                    print(f'allowing omitted words in name matched {honorific_name} to {mp_name}')
                    alr_matched.add((honorific_name, mp_name))
                return mps[mp_name]
                
    digit_match = re.search('\d+', honorific_name)
    if digit_match:
        # names shldn't have digits
        honorific_name = honorific_name[digit_match.span()[1]:]
        return honorific_name_to_mp_data(honorific_name)
        
    # the hard way (levenshtein)
    closest_name = levenshtein_best_match(honorific_name, mp_names)
    
    if (honorific_name, closest_name) not in alr_matched:
        print(f'levenshtein matched {honorific_name} to {closest_name}')
        alr_matched.add((honorific_name, closest_name))
    return mps[closest_name]

def levenshtein_best_match(value, options):
    min_levenshtein = 99999
    min_val = None
    for option in options:
        l_dist = levenshtein(option, value)
        if l_dist < min_levenshtein:
            min_levenshtein = l_dist
            min_val = option
    return min_val
            

# borrowed from: https://blog.paperspace.com/implementing-levenshtein-distance-word-autocomplete-autocorrect/
# we use levenshtein as it helps to protect against typos too, like the "asked asked" in:
# https://sprs.parl.gov.sg/search/sprs3topic?reportid=oral-answer-2822
def levenshtein(token1, token2):
    distances = np.zeros((len(token1) + 1, len(token2) + 1))
    for t1 in range(len(token1) + 1):
        distances[t1][0] = t1
    for t2 in range(len(token2) + 1):
        distances[0][t2] = t2
        
    a = 0
    b = 0
    c = 0
    
    for t1 in range(1, len(token1) + 1):
        for t2 in range(1, len(token2) + 1):
            if (token1[t1-1] == token2[t2-1]):
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                a = distances[t1][t2 - 1]
                b = distances[t1 - 1][t2]
                c = distances[t1 - 1][t2 - 1]
                
                if (a <= b and a <= c):
                    distances[t1][t2] = a + 1
                elif (b <= a and b <= c):
                    distances[t1][t2] = b + 1
                else:
                    distances[t1][t2] = c + 1

    return distances[len(token1)][len(token2)]

In [5]:
pqs = []

In [6]:
class ReportSection(Enum):
    WRITTEN = 'Written Answers to Questions'
    WRITTEN_NA = 'Written Answers to Questions for Oral Answer Not Answered by End of Question Time'
    ORAL = 'Oral Answers to Questions'
    BUDGET = 'Budget'

we assume that all pqs are prefaced with #. (non sprs) or # (sprs). ignore follow up qns since we are only interested in mapping mps to topics, and the follow up qns will always be from the same mp and on the same topic.

notes regarding minister titles:
* Minister for Culture, Community and Youth is the only minister title with a comma
* but there used to be Minister for Information, Communication and the Arts and Minister for Community Development, Youth and Sports
* no questions were ever directed to minister mentor

In [7]:
cap_word = r'[A-Z][a-z]+'
cap_words = f'({cap_word})( {cap_word})*'
mccy = '(Acting )?Minister for Culture, Community and Youth'
mica = '(Acting )?Minister for Information, Communications and the Arts'
mcdys = '(Acting )?Minister for Community Development, Youth( and|,) Sports'
micma = 'Minister-in-charge of Muslim Affairs'
minister_for_something = f'({cap_words} )?Minister( of State)? (for|of) (the )?{cap_words}( and (the )?{cap_words})?'
something_minister = f'{cap_words} Minister'
one_minister_regex = f'(({mccy})|({mica})|({mcdys})|({micma})|({minister_for_something})|({something_minister}))'
minister_regex = re.compile(f'{one_minister_regex}( and (the )?{one_minister_regex})?') # can have multiple targets

def first_two_capitalized(words):
    return words[0][0].isupper() and words[1][0].isupper() and words[0][1] and words[0][1].islower()

def trim_off_non_pq_content_at_start(para):
    para_words = para.split(' ')
    if first_two_capitalized(para_words):
        return para
    
    # i assume honorific+name has at least two words capitalized and non-numbers
    while not first_two_capitalized(para_words):
        para_words = para_words[1:]
    
    return ' '.join(para_words)

# extracts the first substring which is a substring of ministers
def extract_first_ministers(para):
    minister_match = re.search(minister_regex, para)
    if not minister_match:
        # report might've been in the wrong case; try to match to existing ministers
        minister_match = re.search(
            '(' + '|'.join(list(map(lambda s: s.lower().replace(' for ', '.{1,5}'), ministers_found))) + ')',
            para.lower()
        )
        if not minister_match:
            minister_match = re.search(
            '(' + '|'.join(list(map(lambda s: s.lower().replace(' for ', '(\s)*(for|of)(\s)*'), ministers_found))) + ')',
            para.lower()
        )
        if not minister_match:
            for existing_minister in ministers_found:
                if existing_minister.replace(' ', '') in para.replace(' ', ''):
                    minister_match = re.search('(\s)?'.join(c for c in existing_minister.replace(' ', '').lower()), para.lower())
                    break
        minister = levenshtein_best_match(minister_match.group(), ministers_found)
        print(f'found minister: {str(minister_match.group())}; matched to {minister}')
    else:
        minister = para[:minister_match.span()[1]].replace(' of ', ' for ')
        ministers_found.add(minister)

    para = para.replace(minister_match.group(), '').strip()

    if ' and Leader' in minister:
        minister = minister[:-11]

    return minister, para

def get_ministers_and_question(para):
    askee, question = extract_first_ministers(para)
    
    if not re.search('and (the )?Minister', askee):
        return (askee,), question   
    else:
        askee = askee.replace('and the Minister', 'and Minister')
        askees = askee.split(' and Minister')
        return (askees[0], 'Minister' + askees[1]), question

def soup_to_pqs(soup, file):
    # print(file)
    # seems to happen quite often sadly
    if soup.get_text() == '':
        print(f'empty text {file}')
        return
    
    stripped_strings = list(map(
        lambda text: re.sub(r'\s+', ' ', text),
        filter(
            lambda text: not re.match(r'Page:\s+\d+', text) and not re.match(r'Column:\s+\d+', text),
            [text for text in soup.stripped_strings])))
    if len(stripped_strings) < 20: # the table at the top of the page alr accounts for most of this.
        return
    parl_no = int(stripped_strings[4])
    sess_no = int(stripped_strings[6])
    vol_no = int(stripped_strings[8])
    sitting_no = int(stripped_strings[10])
    sitting_date = datetime.strptime(stripped_strings[12], '%d-%m-%Y')
    section_name_raw = stripped_strings[14].lower()
    
    if 'answered' in section_name_raw:
        section_name = ReportSection.WRITTEN_NA
    elif 'written' in section_name_raw:
        section_name = ReportSection.WRITTEN
    elif 'oral' in section_name_raw:
        section_name = ReportSection.ORAL
    elif 'budget' in section_name_raw:
        section_name = ReportSection.BUDGET
    else:
        raise f'no section name??? {section_name_raw}'
    
    title = stripped_strings[16]
    the_rest = stripped_strings[20:]
    
    if section_name == ReportSection.BUDGET:
        return # TODO TODO TODOTODOTODO TODO TODO ============================================================================
    
    while len(the_rest) > 0 and not re.match(r'\d\d?', the_rest[0]):
        the_rest = the_rest[1:]
        
    if len(the_rest) == 0:
        return
        
    indices_corresponding_to_pqs = []
    indices_corresponding_to_speakers = []
    maybe_more_pqs = True
    for i in range(len(the_rest)):
        if the_rest[i][0] == ':' or (i-1 >= 0 and the_rest[i-1][-1] == ':' and the_rest[i-1] in list(map(lambda s: s.get_text().strip(), soup.select('strong')))): # edge case: (sprs3topic_reportid=oral-answer-2239.html), Ong Ye Kung's first response has the colon bolded, whereas it's normally not bolded. this throws us off. extra check in the condition is to resolve this.
            actual_index_to_append = i-1
            # edge case: (sprs3topic_reportid=oral-answer-1632.html), "The Senior Minister of State for Home Affairs (Mr Desmond Lee) (for the Minister for Home Affairs)" is broken up into multiple entries for some reason. this loop is to ensure the full name and title gets saved.
            while the_rest[actual_index_to_append][0] == '(' and the_rest[actual_index_to_append][-1] == ')':
                actual_index_to_append -= 1
            # edge case: (sprs3topic_reportid=oral-answer-2760.html), "The Minister of State for Home Affairs (Mr Desmond Tan) (for the  Minister for Home Affairs)" is also cut in the middle for some reason zzz
            bracket_count = the_rest[actual_index_to_append].count('(') - the_rest[actual_index_to_append].count(')')
            while bracket_count != 0:
                actual_index_to_append -= 1
                bracket_count += the_rest[actual_index_to_append].count('(') - the_rest[actual_index_to_append].count(')')
            indices_corresponding_to_speakers.append(actual_index_to_append)
            maybe_more_pqs = False
        elif re.match(r'\d\d?', the_rest[i]) and maybe_more_pqs:
            indices_corresponding_to_pqs.append(i)
            
    if len(indices_corresponding_to_pqs) == 0:
        print(f'no pqs? {file}')
        return
    if len(indices_corresponding_to_speakers) == 0:
        print(f'no speakers? {file}')
        return
        
    pq_sublists = []
    pq_qn_indices = []
    while len(indices_corresponding_to_pqs) > 1:
        pq_qn_indices.append(the_rest[indices_corresponding_to_pqs[0]])
        pq_sublists.append(the_rest[indices_corresponding_to_pqs[0]+1:indices_corresponding_to_pqs[1]])
        indices_corresponding_to_pqs = indices_corresponding_to_pqs[1:]
        
    pq_qn_indices.append(the_rest[indices_corresponding_to_pqs[0]])
    pq_sublists.append(the_rest[indices_corresponding_to_pqs[0]+1:indices_corresponding_to_speakers[0]])
    
    speaking_sublists = []
    
    while len(indices_corresponding_to_speakers) > 1:
        speaking_sublists.append(the_rest[indices_corresponding_to_speakers[0]:indices_corresponding_to_speakers[1]])
        indices_corresponding_to_speakers = indices_corresponding_to_speakers[1:]
    
    speaking_sublists.append(the_rest[indices_corresponding_to_speakers[0]:])
    
    new_pqs = []
    new_pq_indices = []
    
    for pq_i, sl in zip(pq_qn_indices, pq_sublists):
        pq_para = ' '.join(sl)
        
        #pq_para = trim_off_non_pq_content_at_start(pq_para) # sometimes we end up mistaking other numbers in the text as being the pq numbers. so we deal w that here.
        if ' asked the ' not in pq_para:
            continue
        asker_honorific_name, pq_para = pq_para.split(' asked the ', 1)    
        ministers, question = get_ministers_and_question(pq_para)

        if question[0] == ',':
            question = question[1:].strip()
        if len(asker_honorific_name.strip()) == 0:
            return
        asker, asker_party, asker_parls = honorific_name_to_mp_data(asker_honorific_name.strip())
        new_pq_indices.append(int(pq_i))
        new_pqs.append([asker, asker_party[0], asker_parls, ministers, question, parl_no, sess_no, vol_no, sitting_no, sitting_date, section_name, title])
    
    speakers_and_spokens = []
    for sl in speaking_sublists:
        text = ' '.join(sl)
        speaker, spoken = text.split(':', 1)
        speaker = speaker.strip()
        spoken = spoken.strip()
        while len(spoken) > 0 and not spoken[0].isalpha():
            spoken = spoken[1:]
        if len(spoken) == 0: # edge case (sprs3topic_reportid=written-answer-4142.html). sometimes people are just lost for words i guess.
            return
        if spoken[:len('Question No')] == 'Question No':
            continue
        speaker = re.sub(f'\(for the .*\)', '', speaker)
        speaker = re.sub(f'\(on behalf of the .*\)', '', speaker)
        if len(speakers_and_spokens) == 0:
            speaker_title_match = re.search(minister_regex, speaker)
            if not speaker_title_match:
                responder_title = ''
                if section_name == ReportSection.ORAL: # only oral answers include speaker title. 
                    print(f'no title for this speaker {speaker} in this file {file}')
            else:
                responder_title = speaker_title_match.group()
        speaker = re.sub(minister_regex, '', speaker)
        in_bracket_honorific_match = re.search(f'\({honorific_regex}.+\)', speaker.lower())
        if in_bracket_honorific_match:
            honorific_match_group = in_bracket_honorific_match.group()[1:-1]
            if len(honorific_match_group) == 0:
                return
            speaker = honorific_name_to_mp_data(honorific_match_group)[0]
        else:
            speaker = re.sub(f'\(.+\)', '', speaker)
            if len(speaker) == 0:
                return
            speaker = honorific_name_to_mp_data(speaker)[0]
        speakers_and_spokens.append([speaker, spoken])
        
    assert len(new_pqs) == len(new_pq_indices)
        
    if len(new_pqs) == 0:
        return
        
    # only 1 pq was asked. so everything else in the file must be related to that pq.
    if len(new_pqs) == 1:
        responder, response = speakers_and_spokens[0]
        new_pqs[0].append(responder)
        new_pqs[0].append(responder_title)
        new_pqs[0].append(response)
        follow_ups = speakers_and_spokens[1:]
        new_pqs[0].append(follow_ups)
        pqs.append(new_pqs[0])
        return
    
    if len(speakers_and_spokens) == 0:
        print(f'found file with no responses {file}')
        for new_pq in new_pqs:
            new_pq.append('')
            new_pq.append('')
            new_pq.append('')
            new_pq.append([])
            pqs.append(new_pq)
        return
    
    # only 1 person spoke after all the pqs were asked. so this person must be responding to all the pqs.
    if len(speakers_and_spokens) == 1:
        responder, response = speakers_and_spokens[0]
        for new_pq in new_pqs:
            new_pq.append(responder)
            new_pq.append(responder_title)
            new_pq.append(response)
            new_pq.append([]) # no follow-ups after main response
            pqs.append(new_pq)
        return

    # past this point, the html file has more than one pq, and more than one response to those pqs. 
    assert len(new_pqs) > 1
    assert len(speakers_and_spokens) > 1
    
    relevant_followups = dict() # stores which spoken thing is relevant to which pq
    for new_pq_i in new_pq_indices:
        relevant_followups[new_pq_i] = []
    first_responder, first_response = speakers_and_spokens[0]
    
    # usually the responder responds to all the qns at once. we just wanna confirm that.
    qn_indices_covered_by_first_response = set()
    range_matches = re.findall(r'\d+ to \d+', first_response) # e.g. "i wanna cover qns x to y and z to w"
    if range_matches and len(range_matches) > 0:
        for range_match in range_matches:
            qn_indices_covered_by_first_response = qn_indices_covered_by_first_response.union(set(range(*list(map(int, range_match.split(' to '))))))
    range_matches = re.findall(r'\d+( )?-( )?\d+', first_response) # e.g. "i wanna cover qns x-y and z-w"
    if range_matches and len(range_matches) > 0:
        for range_match in range_matches:
            qn_indices_covered_by_first_response = qn_indices_covered_by_first_response.union(set(range(*list(map(int, range_match.split('-'))))))
    qn_indices_covered_by_first_response = qn_indices_covered_by_first_response.union(set(map(int, re.findall(r'\d+', first_response))))
    if set(new_pq_indices).issubset(qn_indices_covered_by_first_response) or 'all' in first_response or 'together' in first_response or 'every' in first_response:
        # the first responder is responding to all the questions at once. 
        for pq_i in new_pq_indices:
            # speakers_and_spokens[0] is asking for permission to answer all qns at once. speakers_and_spokens[1] is the actual response.
            relevant_followups[pq_i].append(speakers_and_spokens[1])
        speakers_and_spokens = speakers_and_spokens[2:]
    else:
        print("i genuinely don't think we'll reach this point. but if we do, find out which questions this current response is addressing")
        print('rmb to remove the consumed entries from speakers_and_spokens')
        print(f'file is {file}')
        import pdb
        pdb.set_trace()

    # no follow-ups after main response
    if len(speakers_and_spokens) == 0:
        for new_pq_i, new_pq in zip(new_pq_indices, new_pqs):
            new_pq.append(relevant_followups[new_pq_i][0][0]) # responder
            new_pq.append(responder_title)
            new_pq.append(relevant_followups[new_pq_i][0][1]) # response
            new_pq.append([])
            pqs.append(new_pq)
        return
    
    # gotta map everything that is said, to the relevant pqs. 
    asker_to_new_pqi = dict()
    for new_pq_i, new_pq in zip(new_pq_indices, new_pqs):
        asker_to_new_pqi[new_pq[0]] = new_pq_i

    pqs_with_new_responses_since_last_time_first_responder_spoke = set()
    pqs_that_first_responder_covered_with_last_reply = set(new_pq_indices)
    
    for i in range(len(speakers_and_spokens)):
        speaker, spoken = speakers_and_spokens[i]
        #print(f'pqs w new responses: {pqs_with_new_responses_since_last_time_first_responder_spoke}')
        #print(f'pqs last covered: {pqs_that_first_responder_covered_with_last_reply}')
        
        if speaker in asker_to_new_pqi.keys(): # something is said by someone who asked a pq. so it's relevant to that pq. 
            #print(f'said by someone who asked a pq. {asker_to_new_pqi[speaker]} {[speaker, spoken[:100]]}')
            relevant_followups[asker_to_new_pqi[speaker]].append([speaker, spoken])
            pqs_with_new_responses_since_last_time_first_responder_spoke.add(asker_to_new_pqi[speaker])
            continue
        
        # something is said by someone who did not ask any pqs. 
        
        if speaker == first_responder:
            # it's the original responder, ofc responding to some followup qns. 
            # find out which qns these are by seeing who said stuff since the last time this guy spoke.
            #print(f'said by responder. adding to pqs: {pqs_with_new_responses_since_last_time_first_responder_spoke}. {[speaker, spoken[:100]]}')
            for pq_with_new_response in pqs_with_new_responses_since_last_time_first_responder_spoke:
                relevant_followups[pq_with_new_response].append([speaker, spoken])
            pqs_that_first_responder_covered_with_last_reply = pqs_with_new_responses_since_last_time_first_responder_spoke
            pqs_with_new_responses_since_last_time_first_responder_spoke = set()
            continue
        else:
            # otherwise, it's a follow-up qn from someone originally unrelated. naturally the qn would be targeted at the first responder, and the
            # content has to be rel8ed to whatever the responder last said. so we assign it to the same set of pqs.
            #print(f'new followup qn. adding to pqs: {pqs_that_first_responder_covered_with_last_reply}. {[speaker, spoken[:100]]}')
            for pq_covered in pqs_that_first_responder_covered_with_last_reply:
                relevant_followups[pq_covered].append([speaker, spoken])
                pqs_with_new_responses_since_last_time_first_responder_spoke.add(pq_covered)
            continue
    
    for new_pq_i, new_pq in zip(new_pq_indices, new_pqs):
        new_pq.append(relevant_followups[new_pq_i][0][0]) # responder
        new_pq.append(responder_title)
        new_pq.append(relevant_followups[new_pq_i][0][1]) # response
        new_pq.append(relevant_followups[new_pq_i][1:])
        pqs.append(new_pq)
    return


In [8]:
pqs = []
files_and_exceptions = []
files_to_run_through = os.listdir('scraped_content')[500:550]

for i in range(len(files_to_run_through)):
    file = files_to_run_through[i]
    print(file)
    filepath = os.path.join('scraped_content', file)
    if os.stat(filepath).st_size < 200000: # the html elements alr take up more than 300kb, so if a file is this small then someth's wrong
        continue
    try:
        with open(filepath, 'r') as f:
            soup = bs(f, 'html.parser')
        soup_to_pqs(soup, file)
    except Exception as e:
        #import pdb
        #pdb.set_trace()
        #raise e
        print(f'exception: {str(e)} - {file}')
        files_and_exceptions.append([file, e])
        continue
        
    if i%25==0:
        print(f'{i}/{len(files_to_run_through)}')
        
print('=====DONE==================================================')
print(f'total pqs: {len(pqs)}')
print(f'total files: {len(files_to_run_through)}')
print(f'avg pqs per file: {len(pqs)/len(files_to_run_through)}')
print(f'files with exceptions: {files_and_exceptions}')

sprs3topic_reportid=oral-answer-2535.html
levenshtein matched mr deputy speaker to abdul samad
0/50
sprs3topic_reportid=oral-answer-2536.html
sprs3topic_reportid=oral-answer-2537.html
sprs3topic_reportid=oral-answer-2538.html
sprs3topic_reportid=oral-answer-2539.html
sprs3topic_reportid=oral-answer-2540.html
no title for this speaker Mr Deputy Speaker in this file sprs3topic_reportid=oral-answer-2540.html
sprs3topic_reportid=oral-answer-2541.html
sprs3topic_reportid=oral-answer-2542.html
sprs3topic_reportid=oral-answer-2543.html
levenshtein matched mr masagos zulkifli b m m to masagos zulkifli bin masagos mohamad
sprs3topic_reportid=oral-answer-2544.html
sprs3topic_reportid=oral-answer-2545.html
levenshtein matched mdm deputy speaker to lim wee kiak
sprs3topic_reportid=oral-answer-2546.html
sprs3topic_reportid=oral-answer-2547.html
sprs3topic_reportid=oral-answer-2548.html
sprs3topic_reportid=oral-answer-2549.html
sprs3topic_reportid=oral-answer-2550.html
sprs3topic_reportid=oral-answe

In [9]:
pq_df = pd.DataFrame(pqs, columns=['asker_name', 'asker_party', 'asker_parliaments', 'askees', 'question', 'parliament_no', 'session_no', 'volume_no', 'sitting_no', 'sitting_date', 'report_section', 'title', 'responder_name', 'responder_title', 'response', 'discussion'])
pq_df

Unnamed: 0,asker_name,asker_party,asker_parliaments,askees,question,parliament_no,session_no,volume_no,sitting_no,sitting_date,report_section,title,responder_name,responder_title,response,discussion
0,Tin Pei Ling,People's Action Party,"(12, 13, 14)","(Minister for National Development,)",whether the proportion of units under the Marr...,14,1,95,34,2021-07-27,ReportSection.ORAL,Increasing Proportion of BTO Flats within Matu...,Desmond Lee,The Minister for National Development,"Mr Deputy Speaker, the Married Child Priority ...","[[Abdul Samad, Ms Tin Pei Ling.], [Tin Pei Lin..."
1,Louis Ng Kok Kwang,People's Action Party,"(13, 14)","(Minister for Manpower,)",(a) what are the top three ways by which the M...,14,1,95,34,2021-07-27,ReportSection.ORAL,Number and Actions against Salary Kickback Off...,Koh Poh Koon,The Senior Minister of State for Manpower,"Sir, between 2016 and 2020, MOM looked into an...","[[Abdul Samad, Mr Louis Ng.], [Louis Ng Kok Kw..."
2,Tin Pei Ling,People's Action Party,"(12, 13, 14)","(Minister for Manpower,)",to what extent should an employer be made resp...,14,1,95,34,2021-07-27,ReportSection.ORAL,Extent of Employer's Responsibility for Migran...,Zaqy Mohamad,The Senior Minister of State for Manpower,"Mr Deputy Speaker, employers are responsible f...","[[Abdul Samad, Ms Tin.], [Tin Pei Ling, Thank ..."
3,Kwek Hian Chuan Henry,People's Action Party,"(13, 14)","(Minister for Manpower,)",whether the Government can strongly encourage ...,14,1,95,34,2021-07-27,ReportSection.ORAL,Raising of Re-employment Age at Organisations ...,Tan See Leng,The Minister for Manpower,"Mr Deputy Speaker, Sir, in 2019, the Tripartit...","[[Abdul Samad, Mr Henry Kwek.], [Kwek Hian Chu..."
4,Foo Mee Har,People's Action Party,"(12, 13, 14)","(Minister for Manpower,)",with an increasing number of Singaporeans in P...,14,1,95,34,2021-07-27,ReportSection.ORAL,Permitting Higher Voluntary Savings Amounts un...,Tan See Leng,The Minister for Manpower,"Mr Deputy Speaker, Sir, Singaporeans already h...","[[Abdul Samad, Ms Foo Mee Har.], [Foo Mee Har,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Liang Eng Hwa,People's Action Party,"(11, 12, 13, 14)","(Minister for Manpower,)",(a) what transitory support will the Governmen...,14,1,95,38,2021-09-14,ReportSection.ORAL,Impact of Local Qualifying Salary Requirement ...,Amrin Amin,The Senior Minister of State for Manpower,Please do.,[]
62,Edward Chia Bing Hui,People's Action Party,"(14,)","(Minister for Manpower,)",(a) how is the Local Qualifying Salary being d...,14,1,95,38,2021-09-14,ReportSection.ORAL,Impact of Local Qualifying Salary Requirement ...,Amrin Amin,The Senior Minister of State for Manpower,Please do.,[]
63,Edward Chia Bing Hui,People's Action Party,"(14,)","(Minister for Manpower,)",(a) how does the Ministry plan to inform and s...,14,1,95,38,2021-09-14,ReportSection.ORAL,Impact of Local Qualifying Salary Requirement ...,Amrin Amin,The Senior Minister of State for Manpower,Please do.,"[[Edward Chia Bing Hui, Mr Speaker, I would li..."
64,Desmond Choo,People's Action Party,"(13, 14)","(Minister for Manpower,)",in light of the tightened Local Qualifying Sal...,14,1,95,38,2021-09-14,ReportSection.ORAL,Impact of Local Qualifying Salary Requirement ...,Amrin Amin,The Senior Minister of State for Manpower,Please do.,[]


In [10]:
pq_df.to_csv('pqs.csv', index=False, sep='|') # sep=',' gives formatting issues 

In [11]:
assert all(pq_df.parliament_no < 15) and all(pq_df.parliament_no >= 12)
assert all(map(lambda x: not x[0].isupper(), pq_df.question.values)) 
assert all(map(lambda x: not x[:3] == 'and', pq_df.question.values)) 
parties_set = set(pq_df.asker_party.values)
print(f'parties: {parties_set} (len: {len(parties_set)})')
print()
askee_set = set([askees for sublist in pq_df.askees for askees in sublist])
print(f'askees: {askee_set} (len: {len(askee_set)})')

parties: {"People's Action Party", "Workers' Party", 'Nominated Member of Parliament', 'Progress Singapore Party'} (len: 4)

askees: {'Minister-in-charge for Muslim Affairs', 'Minister for Transport', 'Minister for Trade and Industry', 'Minister for Social and Family Development', 'Minister for Sustainability and the Environment', 'Minister for Foreign Affairs', 'Minister for Culture, Community and Youth', 'Minister for Home Affairs', 'Minister for Education', 'Minister for National Development', 'Minister for Communications and Information', 'Minister for Health', 'Minister for Manpower'} (len: 13)


In [12]:
len(set(pq_df.asker_name.values))

34

In [13]:
min_count = 999
min_mp = None
max_count = 0
max_mp = None
less_than_ten = 0
just_one = 0
for name in set(pq_df.asker_name.values):
    count_here = pq_df[pq_df.asker_name == name]['asker_name'].count()
    if count_here < min_count:
        min_count = count_here
        min_mp = name
    if count_here > max_count:
        max_count = count_here
        max_mp = name
    if count_here < 10:
        less_than_ten += 1
    if count_here == 1:
        just_one += 1
        
min_count, min_mp, max_count, max_mp, less_than_ten, just_one

(1, 'Sylvia Lim', 6, 'Louis Ng Kok Kwang', 34, 16)