In [4]:
# Uncomment to install the PDF parsing dependency into the kernel's environment:
# %pip install pdfplumber

In [31]:
import os
import re

import pandas as pd
import pdfplumber

In [108]:
# Directory layout used by the notebook: input PDFs, the plain-text dump of
# each PDF, and the individual statements cut out of those dumps.
pdf_path = "data/speeches/pdf/"
txt_path = "data/speeches/txt/"
extracted_path = "data/speeches/extracted/"

# File-name ending appended to a record's file prefix for its full-text dump.
speech_suffix = "speeches.txt"


def extracted_speech_suffix(x, numb=True):
    """Return the file-name ending for an extracted speech.

    Parameters
    ----------
    x : object
        Identifier embedded in the name when ``numb`` is true (any value that
        can be formatted into a string).
    numb : bool, default True
        When true the result is ``speech_<x>.txt``; otherwise ``x`` is ignored
        and the un-numbered ``speech.txt`` is returned.

    Returns
    -------
    str
        The file-name ending.
    """
    if numb:
        return f"speech_{x}.txt"
    return "speech.txt"

In [7]:
# Load the speech metadata table. Later cells read each row's `year`,
# `file_prefix` and `file_name_pdf` to locate the files belonging to a record.
metadata = pd.read_csv('data/speech_records.csv')

In [12]:
# Show the first record to check which columns are available.
metadata.head(1)

Unnamed: 0,record_id,speaker,speaker_organization,document_symbol,document_symbol_searchable,date,sg_number,year,month,day,body,sub_body,doc_number,doc_type,add_part,file_prefix,file_name_pdf
0,4090177,"Guterres, António, 1949-",UN. Secretary-General,S/PV.9988,S/PV.9988,2025-08-28,9,2025,8,28,S,,,PV.9988,,S_2025_PV.9988_,S_2025_PV.9988_speeches.pdf


In [11]:
# metadata[metadata.year >= 2025]

In [126]:
# Dump the text of every PDF from 2025 onwards into one .txt file per record.
test_subset = metadata[metadata.year >= 2025].copy()

# Heights (in page-coordinate units) cropped off each page:
#   header      - top strip removed from every page except the first
#   init_header - top strip removed from the first page of each document
#                 (0 keeps the document's first header in the output)
#   footer      - bottom strip removed from every page
header, init_header, footer = 70, 0, 70

# open(..., "w") does not create missing directories, so make sure the
# output directory exists before writing into it.
os.makedirs(txt_path, exist_ok=True)

for row in test_subset.itertuples():
    pdf_file = pdf_path + row.file_name_pdf
    txt_file = txt_path + row.file_prefix + speech_suffix

    with pdfplumber.open(pdf_file) as pdf, open(txt_file, "w", encoding="utf-8") as f:

        for page_idx, page in enumerate(pdf.pages):
            # get page size
            x0, y0, x1, y1 = page.bbox
            # Pick the top margin per document: the first page of *each* PDF
            # keeps its header. (A single running variable would only spare
            # the first page of the very first PDF in the subset.)
            top_margin = init_header if page_idx == 0 else header
            # cut the header and footer off
            content_area = page.crop((x0, y0 + top_margin, x1, y1 - footer))
            t = content_area.extract_text()
            # skip pages for which no text could be extracted
            if t:
                f.write(t + '\n')

In [94]:
# Building blocks shared by both speaker-header patterns.
# (?m) switches on multi-line mode, so ^ matches at the start of every line.
_LINE_START = r"(?m)^\s*"            # line start plus any leading whitespace
_OPTIONAL_NUMBERING = r"(?:\d+\.\s*)?"  # optional paragraph number, e.g. "5. "
_BRACKETED = r"(?:\s*\([^)]*\))"     # one "(...)" group, e.g. "(spoke in Spanish)"
_COLON = r"\s*:\s*"                  # the colon ending the header, with surrounding whitespace

# Header that opens a statement by the Secretary-General, matched without
# regard to case. The title is matched literally (single space, hyphen);
# a looser variant r"The\sSecretary-?General" is intentionally not used.
start_sg = re.compile(
    _LINE_START
    + _OPTIONAL_NUMBERING
    + r"The Secretary-General"
    + _BRACKETED + r"?"              # at most one bracketed remark
    + _COLON,
    flags=re.IGNORECASE,
)

# Header of any speaker; used to find where the current statement ends.
next_speaker = re.compile(
    _LINE_START
    + _OPTIONAL_NUMBERING
    + r"(?:The|[A-Z]*[A-Za-z]*\.)"            # "The" or an abbreviated title such as "Mr."/"Mrs."
    + r"(?:\s+[A-Z][A-Za-z\'\.\-]*){1,5}"     # 1 to 5 capitalised name parts
    + _BRACKETED + r"{0,3}"                   # up to 3 bracket groups, e.g. "(Algeria) (spoke in Spanish)"
    + _COLON
)

In [130]:
# Cut every Secretary-General statement out of the text dumps and write each
# one to its own file.

# open(..., "w") does not create missing directories, so make sure the
# output directory exists before writing into it.
os.makedirs(extracted_path, exist_ok=True)

for row in test_subset.itertuples():
    text_file = f"{txt_path+row.file_prefix}{speech_suffix}"

    with open(text_file, "r", encoding="utf-8") as f:
        text = f.read()

    sg_matches = list(start_sg.finditer(text))
    # Passed to extracted_speech_suffix together with the match index, so the
    # file name depends on whether the document has more than one statement.
    multiple_matches = len(sg_matches) > 1

    if not sg_matches:
        # Report the gap instead of skipping the document silently.
        print(f"No Secretary-General statement found in {text_file}")
        continue

    for idx, sg_match in enumerate(sg_matches):
        # The statement runs from the end of the matched header to the start
        # of the next speaker header, or to the end of the text if none follows.
        start_idx = sg_match.end()
        stop_match = next_speaker.search(text, start_idx)
        end_idx = stop_match.start() if stop_match else len(text)
        speech = text[start_idx:end_idx].strip()

        out_file = f"{extracted_path+row.file_prefix}{extracted_speech_suffix(idx, multiple_matches)}"
        with open(out_file, "w", encoding="utf-8") as file:
            # Keep the matched header as the first line, followed by the statement.
            file.write(sg_match.group())
            file.write("\n")
            file.write(speech)