In [1]:
from bs4 import BeautifulSoup as bs
import os
import os.path
from datetime import datetime
from enum import Enum
import re

# Directory (relative to the notebook's working directory) whose files are
# listed and parsed as Hansard HTML reports by the extraction loop below.
RAW_HANSARD_DIR = 'scraped_content'

In [2]:
class ReportSection(Enum):
    """Report section that a parliamentary question (PQ) was extracted from.

    Each member's value is a human-readable section title string.
    """

    # questions answered in writing
    WRITTEN = 'Written Answers to Questions'

    # questions filed for oral answer that were not answered by the end of question time
    WRITTEN_NA = (
        'Written Answers to Questions for Oral Answer '
        'Not Answered by End of Question Time'
    )

    # questions answered orally
    ORAL = 'Oral Answers to Questions'

We assume that all PQs are prefaced with `#.` (non-SPRS files) or `#` (SPRS files), where `#` is the question number. Follow-up questions are ignored: we are only interested in mapping MPs to topics, and a follow-up question will always come from the same MP and be on the same topic.

In [3]:
# Accumulates one tuple per parliamentary question (PQ) found across all files:
#   (question text with its leading number removed, sitting date, parliament no., report section)
pq_paras = []

# os.listdir returns entries in arbitrary order; sort so that the order of
# pq_paras (and of the progress output) is reproducible across runs and machines.
for file in sorted(os.listdir(RAW_HANSARD_DIR)):
    filepath = os.path.join(RAW_HANSARD_DIR, file)
    if not os.path.isfile(filepath):
        continue

    print(f'\nProcessing: {file}')

    # the files don't have consistent html formatting, but they fall into two major groups.
    is_sprs = file.startswith('sprs')

    with open(filepath, 'r') as f:
        soup = bs(f, 'html.parser')

    if is_sprs:
        # extract metadata from the table at the top of the page
        topic_table = soup.find('table', {'class': 'topic'})
        topic_text = topic_table.get_text()

        # search (rather than match at position 0) so that leading whitespace or
        # line breaks in the table text do not break the extraction.
        parl_no_match = re.search(r'Parliament No:\s*(\d+)', topic_text)
        sit_date_match = re.search(r'Sitting Date:\s*(\d+-\d+-\d+)', topic_text)
        if parl_no_match is None or sit_date_match is None:
            raise ValueError(f'missing parliament no. or sitting date (sprs): {file}')

        parl_no = int(parl_no_match.group(1))
        sit_date = datetime.strptime(sit_date_match.group(1), '%d-%m-%Y')

        # extract report section via filename.
        # 'written-answer-na' must be tested before 'written-answer', which is a prefix of it.
        if 'oral-answer' in file:
            section = ReportSection.ORAL
        elif 'written-answer-na' in file:
            section = ReportSection.WRITTEN_NA
        elif 'written-answer' in file:
            section = ReportSection.WRITTEN
        else:
            # raising a plain string is itself a TypeError in Python 3; raise a real exception.
            raise ValueError(f'unidentified section (sprs): {file}')

        # extract paragraphs from report in the form of rows of table
        report_table = soup.find('div', {'class': 'reportTable'})
        # find_all(True) yields every descendant tag (same result as the legacy findChildren()).
        # NOTE(review): because nested tags are included, a PQ wrapped in more than one
        # tag could be captured more than once - confirm against the HTML.
        report_table_rows = report_table.find_all(True)

        # find out which rows correspond to pqs and which rows don't.
        for row in report_table_rows:
            paragraph = row.get_text()
            paragraph = paragraph.replace('\xa0', ' ')
            # a pq starts with its number, then 8-75 characters, then "asked the ..."
            if not re.match(r'\d+.{8,75} asked the .+', paragraph):
                continue
            # drop the leading question number
            paragraph_no_num = paragraph[re.match(r'\d+', paragraph).end():]
            pq_paras.append((paragraph_no_num.strip(), sit_date, parl_no, section))
    else:
        # extract metadata from meta elements directly
        sit_date_str = soup.find('meta', {'name': 'Sit_Date'})['content']
        parl_no_str = soup.find('meta', {'name': 'Parl_No'})['content']
        section_str = soup.find('meta', {'name': 'Sect_Name'})['content']

        sit_date = datetime.strptime(sit_date_str, '%Y-%m-%d')
        parl_no = int(parl_no_str)
        if 'NOT REACHED' in section_str:
            section = ReportSection.WRITTEN_NA
        elif 'ORAL' in section_str:
            section = ReportSection.ORAL
        elif 'WRITTEN' in section_str:
            section = ReportSection.WRITTEN
        else:
            # raising a plain string is itself a TypeError in Python 3; raise a real exception.
            raise ValueError(f'unidentified section (non sprs): {file}')

        # extract paragraphs from table in the form of raw text (since each paragraph doesn't have its own element)
        report_table = soup.find('div', {'class': 'reportTable'})
        raw_text = report_table.get_text()

        # look for the first numbered question, i.e. things like "1. <name> asked the "
        first_number_dot_occurrence = re.search(r'\d+\. .{8,75} asked the ', raw_text)

        # no pq in this file: nothing to collect
        if not first_number_dot_occurrence:
            continue

        # cut off everything before the actual body of the text
        raw_text = raw_text[first_number_dot_occurrence.start():]

        # formatting stuff: join lines, then split into paragraphs on runs of four non-breaking spaces
        raw_text = raw_text.replace('\n\n', ' ')
        raw_paras = raw_text.split('\xa0\xa0\xa0\xa0')

        new_pq_paras = []
        for raw_para in raw_paras:
            # keep only paragraphs that are pqs.
            if not re.match(r'\d+\. .{8,75} asked the ', raw_para):
                continue
            # remove column indicators and extra spaces
            cleaned_para = re.sub(r'Column: \d+', '', raw_para.replace('  ', ' ').strip())
            # remove index ("12."), add sitting date, parl no, section
            question_text = cleaned_para[re.search(r'\d+\.', cleaned_para).end():].strip()
            new_pq_paras.append((question_text, sit_date, parl_no, section))

        pq_paras += new_pq_paras



Processing: sprs3topic_reportid=oral-answer-2802.html

Processing: sprs3topic_reportid=oral-answer-2806.html

Processing: sprs3topic_reportid=oral-answer-2809.html

Processing: sprs3topic_reportid=oral-answer-2812.html

Processing: sprs3topic_reportid=oral-answer-2818.html

Processing: sprs3topic_reportid=oral-answer-2819.html

Processing: sprs3topic_reportid=oral-answer-2822.html

Processing: sprs3topic_reportid=oral-answer-2825.html

Processing: sprs3topic_reportid=oral-answer-2828.html

Processing: sprs3topic_reportid=oral-answer-2833.html

Processing: sprs3topic_reportid=written-answer-10486.html

Processing: sprs3topic_reportid=written-answer-na-10315.html

Processing: sprs3topic_reportid=written-answer-na-10460.html

Processing: sprs3topic_reportid=written-answer-na-10587.html

Processing: sprs3topic_reportid=written-answer-na-10628.html

Processing: topic_reportid=003_19641116_S0003_T0011.html

Processing: topic_reportid=003_19641118_S0004_T0013.html

Processing: topic_reportid