In [72]:
# ! pip install pdfplumber
# ! pip install PyMuPDF

In [71]:
import pdfplumber
import pandas as pd
import re
import os

import fitz

In [169]:
# Directory layout; every path is relative to the notebook's working directory.
data_dir = "data/"
dataframe_dir = data_dir + "dataframes/"
speeches_dir = data_dir + "speeches/"
speeches_pdf_dir = speeches_dir + "pdf/"
speeches_txt_dir = speeches_dir + "txt/"
speeches_extracted_dir = speeches_dir + "extracted/"

# Suffix appended to a record's file prefix for the full-document text dump.
speech_suffix = "speeches.txt"


def extracted_speech_suffix(x, numb=True):
    """Return the file-name suffix for one extracted speech.

    Args:
        x: Index of the speech within its document.
        numb: When truthy the index is embedded in the name; otherwise it is ignored.

    Returns:
        ``"speech_{x}.txt"`` if ``numb`` is truthy, else ``"speech.txt"``.
    """
    return f"speech_{x}.txt" if numb else "speech.txt"

In [31]:
# One row per speech record; built from data_dir so the path follows the config cell.
metadata = pd.read_csv(f"{data_dir}speech_records.csv")

## Preprocessing
Check which PDFs are image-based and which are text-based.

Then add rules for how to handle each kind.

In [172]:
# Per-document processing rules (layout flags, starting page); show the first row only.
doc_info = pd.read_csv(dataframe_dir + 'documents_processing_info.csv')
doc_info.head(1)

Unnamed: 0,body,year,month,day,doc_number,sub_body,two_column_layout,left_column_french,starting_page
0,S,1964,,,,,True,True,3.0


In [166]:
# Look up the metadata row(s) belonging to one document prefix.
metadata.loc[metadata["file_prefix"] == 'A_2024_79_PV.1_']

Unnamed: 0,record_id,speaker,speaker_organization,document_symbol,document_symbol_searchable,date,sg_number,year,month,day,body,sub_body,doc_number,doc_type,add_part,file_prefix,file_name_pdf,is_in_dir
23,4075008,"Guterres, António, 1949-",UN. Secretary-General,A/79/PV.1,A/79/PV.1,2024-09-10,9,2024,9,10,A,,79.0,PV.1,,A_2024_79_PV.1_,A_2024_79_PV.1_speeches.pdf,True


In [None]:
# Boilerplate sentences kept as constants. None of them is referenced by any cell
# visible in this notebook section.
# NOTE(review): presumably these are page-footer / end-matter lines of the meeting
# records that should be stripped from the extracted text -- confirm intended use.
footer1 = "This record contains the original text of speeches delivered in English and interpretations of speeches delivered in the other languages."
footer2 = "This record contains the text of speeches delivered in English and of the translation of speeches delivered in other languages."
c_footer1 = "This record is subject to correction."
# The trailing space inside the next literal is part of the value.
e_footer1 = "Corrections to this record should be submitted in one of the working languages. "
trailing = "HOW TO OBTAIN UNITED NATIONS PUBLICATIONS"

In [77]:
def classifier(filename):
    """Classify the pages of a PDF as text-only or image-only.

    Args:
        filename: Name of a PDF file located in ``speeches_pdf_dir``.

    Returns:
        A list of ints: ``1`` for each page whose blocks are all text and ``0``
        for each page whose blocks are all images. A page that contains both
        kinds of blocks, or no blocks at all, adds no entry, so the list can be
        shorter than the number of pages.
    """
    res = []
    # Open by path and let the context manager close the document; previously a
    # separately opened file handle was passed in and the document stayed open.
    with fitz.open(f"{speeches_pdf_dir+filename}") as pdf:
        for page in pdf:
            image_area = 0.0
            text_area = 0.0
            for b in page.get_text("blocks"):
                # b[:4] is the block's bounding box, b[4] its text content;
                # abs() of a Rect gives its area.
                area = abs(fitz.Rect(b[:4]))
                if '<image:' in b[4]:
                    image_area += area
                else:
                    text_area += area
            if image_area == 0.0 and text_area != 0.0:
                res.append(1)
            elif text_area == 0.0 and image_area != 0.0:
                res.append(0)
    return res
    
# Example run on one document; the other sample files are kept commented out.
# classifier('S_1981_PV.2321_speeches.pdf')
classifier('A_2011_C.5_66_SR.13_speeches.pdf')
# classifier('S_1964_PV.1097_speeches.pdf')

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [136]:
# Fixed seed so the 40-row sample (and everything derived from it) is reproducible.
submeta = metadata.sample(40, random_state=42).copy()

In [161]:
def get_text_pdf(filename):
    """Return the concatenated text of the first (up to) three pages of a PDF.

    Args:
        filename: Name of a PDF file located in ``speeches_pdf_dir``.

    Returns:
        The page texts joined without a separator. Pages that do not exist or
        yield no text contribute an empty string, so documents with fewer than
        three pages no longer raise a ``TypeError``.
    """
    with pdfplumber.open(f"{speeches_pdf_dir+filename}") as pdf:
        # `or ""` guards against extract_text() returning None for a page
        page_texts = [page.extract_text() or "" for page in pdf.pages[:3]]
    return "".join(page_texts)

# Print the leading pages of one sample document; other samples are kept commented out.
# print(get_text_pdf('A_1957_PV.690_speeches.pdf'))
# print(get_text_pdf('E_1970_SR.1696_speeches.pdf'))
# print(get_text_pdf('E_1946_SR.17_18_speeches.pdf'))
print(get_text_pdf('A_2022_77_PV.4_speeches.pdf'))

A
United Nations
/77/PV.4*
General Assembly
Official Records
Seventy-seventh session
4
th plenary meeting
Tuesday, 20 September 2022, 9 a.m.
New York
President: Mr. Kőrösi ........................................... (Hungary)
The meeting was called to order at 9 a.m. It is loaded with Ukrainian grain destined for the
people of the Horn of Africa, millions of whom are
Agenda item 113 on the edge of famine. It navigated its way through a
war zone, guided by the very parties to the conflict,
Report of the Secretary-General on the work of the
as part of an unprecedented comprehensive initiative to
Organization (A/77/1)
get more food and fertilizer out of Ukraine and Russia,
The President: Before proceeding to the bring desperately needed relief to those in need, calm
general debate, as announced in the Journal of the commodity markets, secure future harvests and lower
United Nations, the General Assembly will hear a prices for consumers everywhere.
presentation by the Secretary-General of 

In [None]:
def is_text_based(filename):
    """Heuristically decide whether a PDF carries a clean text layer.

    The third and fourth page (indices 2 and 3) are sampled. Within their text,
    a letter/digit-or-punctuation/letter triple such as ``y3s`` is counted as a
    likely OCR error.

    Args:
        filename: Name of a PDF file located in ``speeches_pdf_dir``.

    Returns:
        A tuple ``(is_text, tex_len, ntn_len)``: the verdict, the number of
        sampled characters and the number of suspicious triples. The verdict is
        true only if more than 8000 characters were sampled and the triple count
        is below the allowance of one per 1000 characters, capped at 10.
    """
    with pdfplumber.open(f"{speeches_pdf_dir+filename}") as pdf:
        # Missing pages and pages without text contribute "" instead of None,
        # so short or image-only documents are classified instead of crashing.
        sampled_texts = [page.extract_text() or "" for page in pdf.pages[2:4]]
    text = "".join(sampled_texts)

    # check for bad scanned words like 'y3s'
    num_tex_num = re.findall(r"[A-Za-z][0-9,.!?~][A-Za-z]", text)
    tex_len, ntn_len = len(text), len(num_tex_num)
    # one error allowed per 1000 characters, but never more than 10 in total,
    # otherwise old, badly auto-scanned documents would pass
    ntn_threshold = min(tex_len // 1000, 10)
    is_text = (tex_len > 8000) and (ntn_len < ntn_threshold)
    return (is_text, tex_len, ntn_len)

# Expand the (is_text, text_size, typo_count) tuples into three columns in one step;
# building a single frame avoids creating one pd.Series per row via .apply(pd.Series).
text_check_columns = ['is_text_based', 'text_size', 'text_typos']
submeta[text_check_columns] = pd.DataFrame(
    submeta.file_name_pdf.map(is_text_based).tolist(),
    index=submeta.index,
    columns=text_check_columns,
)

# is_text_based('S_1981_PV.2321_speeches.pdf')
# print('\n\n oooooooooooooooo \n\n')
# is_text_based('A_2011_C.5_66_SR.13_speeches.pdf')
# is_text_based('S_1964_PV.1097_speeches.pdf')

In [143]:
# Show the whole sample ordered by date.
submeta.sort_values('date')

Unnamed: 0,record_id,speaker,speaker_organization,document_symbol,document_symbol_searchable,date,sg_number,year,month,day,...,sub_body,doc_number,doc_type,add_part,file_prefix,file_name_pdf,is_in_dir,is_text_based,text_size,text_typos
1201,3983784,"Lie, Trygve, 1896-1968",UN. Secretary-General,E/SR.14,E/SR.14,1946-09-30,1,1946,9,30,...,,,SR.14,,E_1946_SR.14_,E_1946_SR.14_speeches.pdf,True,True,10192,8
1187,3928178,"Hammarskjöld, Dag, 1905-1961",UN. Secretary-General,E/SR.791,E/SR.791,1954-04-30,2,1954,4,30,...,,,SR.791,,E_1954_SR.791_,E_1954_SR.791_speeches.pdf,True,True,13638,8
1183,3926574,"Hammarskjöld, Dag, 1905-1961",UN. Secretary-General,E/SR.806,E/SR.806,1954-07-14,2,1954,7,14,...,,,SR.806,,E_1954_SR.806_,E_1954_SR.806_speeches.pdf,True,False,14447,54
1152,3874304,"Thant, U, 1909-1974",UN. Secretary-General,E/SR.1274,ESR1274,1963-07-09,3,1963,7,9,...,,,SR.1274,,E_1963_SR.1274_,E_1963_SR.1274_speeches.pdf,True,True,13523,2
1143,1289692,"Thant, U, 1909-1974",UN. Secretary-General,S/PV.1144,SPV1144,1964-09-09,3,1964,9,9,...,,,PV.1144,,S_1964_PV.1144_,S_1964_PV.1144_speeches.pdf,True,False,10636,11
1130,1288226,"Thant, U, 1909-1974",UN. Secretary-General,S/PV.1223,SPV1223,1965-06-11,3,1965,6,11,...,,,PV.1223,,S_1965_PV.1223_,S_1965_PV.1223_speeches.pdf,True,False,8673,18
1120,852532,"Thant, U, 1909-1974",UN. Secretary-General,S/PV.1275,SPV1275,1966-03-16,3,1966,3,16,...,,,PV.1275,,S_1966_PV.1275_,S_1966_PV.1275_speeches.pdf,True,True,10680,7
1114,853903,"Thant, U, 1909-1974",UN. Secretary-General,S/PV.1320,S/PV.1320,1966-11-16,3,1966,11,16,...,,,PV.1320,,S_1966_PV.1320_,S_1966_PV.1320_speeches.pdf,True,True,10215,6
1073,832101,"Waldheim, Kurt, 1918-2007",UN. Secretary-General,S/PV.1678,SPV1678,1972-11-28,4,1972,11,28,...,,,PV.1678,,S_1972_PV.1678_,S_1972_PV.1678_speeches.pdf,True,False,6543,0
1022,1469125,"Waldheim, Kurt, 1918-2007",UN. Secretary-General,E/SR.2006,ESR2006,1976-06-30,4,1976,6,30,...,,,SR.2006,,E_1976_SR.2006_,E_1976_SR.2006_speeches.pdf,True,True,12982,0


## PDF to txt

Iterate over the PDF documents and extract their text into .txt files.

In [None]:
test_subset = metadata[metadata.year >= 2025].copy()
# Heights cropped off each page: `init_header` from the top of a document's first
# page (0 keeps that page's header), `header` from the top of every later page,
# `footer` from the bottom of every page.
header, init_header, footer = 70, 0, 70

# Make sure the output directory exists before any file is written.
os.makedirs(speeches_txt_dir, exist_ok=True)

for row in test_subset.itertuples():
    pdf_path = f"{speeches_pdf_dir+row.file_name_pdf}"
    txt_path = f"{speeches_txt_dir+row.file_prefix}{speech_suffix}"

    with pdfplumber.open(pdf_path) as pdf, open(txt_path, "w", encoding="utf-8") as f:
        for page_idx, page in enumerate(pdf.pages):
            # get page size
            x0, y0, x1, y1 = page.bbox
            # Choose the top margin per document: previously the first-page margin
            # was switched off for good after the very first page processed, so
            # only the first document kept its first-page header.
            top_cut = init_header if page_idx == 0 else header
            # cut the header and footer off
            content_area = page.crop((x0, y0 + top_cut, x1, y1 - footer))
            t = content_area.extract_text()
            if t:
                f.write(t + '\n')

## Extract speech from txt

the regex:

In [94]:
# Matches the line that opens an intervention by the Secretary-General,
# e.g. "5. The Secretary-General (spoke in Spanish): ". Case-insensitive.
start_sg = re.compile(
    r"(?m)" # multiline mode: ^ matches at the start of every line
    r"^\s*" # leading whitespace
    r"(?:\d+\.\s*)?" # optional paragraph number, e.g. the "5. " in "5. The Secretary..."
    # r"The\sSecretary-?General"
    r"The Secretary-General"
    r"(?:\s*\([^)]*\))?" # optional parenthesised note, e.g. "(spoke in spanish)"
    r"\s*:" # the colon that ends the speaker label
    r"\s*", # trailing whitespace, if any
    re.IGNORECASE
)

# Matches the line that introduces any speaker, e.g. "The President: " or
# "Mr. Kőrösi (Hungary) (spoke in French): ". It is used to find where a speech ends.
# Name tokens accept letters beyond ASCII, so speakers whose names carry diacritics
# are recognised as well; with the ASCII-only classes such labels never matched.
next_speaker = re.compile(
    r"(?m)" # multiline mode: ^ matches at the start of every line
    r"^\s*" # leading whitespace
    r"(?:\d+\.\s*)?" # optional paragraph number, e.g. "12. "
    r"(?:The|[A-Z]*[A-Za-z]*\.)" # "The" or a title ending in a dot: "Mr."/"Mrs."/...
    # 1 to 5 name tokens: first character is any letter except ASCII lowercase,
    # followed by letters, apostrophes, dots or hyphens
    r"(?:\s+[^\W\d_a-z](?:[^\W\d_]|['.\-])*){1,5}"
    r"(?:\s*\([^)]*\)){0,3}" # up to 3 bracketed notes, e.g. (Algeria) (spoke in spanish)
    r"\s*:" # the colon that ends the speaker label
    r"\s*", # trailing whitespace, if any
)

Iterate over each text file and extract the speeches with the regexes:

In [None]:
# Make sure the output directory exists before any file is written.
os.makedirs(speeches_extracted_dir, exist_ok=True)

for row in test_subset.itertuples():
    txt_path = f"{speeches_txt_dir+row.file_prefix}{speech_suffix}"
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()

    sg_matches = list(start_sg.finditer(text))
    # Output files are only numbered when a document has more than one SG speech.
    multiple_matches = len(sg_matches) > 1

    if not sg_matches:
        # Surface documents the regex could not handle instead of skipping them silently.
        print(f"no Secretary-General speech found in {txt_path}")
        continue

    for idx, sg_match in enumerate(sg_matches):
        # The speech runs from the end of the SG label to the next speaker label,
        # or to the end of the document when no further speaker follows.
        start_idx = sg_match.end()
        stop_match = next_speaker.search(text, start_idx)
        end_idx = stop_match.start() if stop_match else len(text)
        speech = text[start_idx:end_idx].strip()

        out_path = f"{speeches_extracted_dir+row.file_prefix}{extracted_speech_suffix(idx, multiple_matches)}"
        with open(out_path, "w", encoding="utf-8") as file:
            file.write(sg_match.group())
            file.write("\n")
            file.write(speech)

## Testing corner

In [32]:
# Names of all PDF files actually present in the speeches PDF directory
# (in the order returned by os.listdir).
actual_filenames = [
    filename
    for filename in os.listdir(speeches_pdf_dir)
    if filename.endswith(".pdf")
]

In [33]:
# First file name, number of PDFs on disk, and number of metadata rows -- one per line.
print(actual_filenames[0], len(actual_filenames), len(metadata), sep="\n")

A_1950_PV.289_speeches.pdf
1205
1205


In [34]:
def is_in_actual_filenames(filename):
    """Return whether ``filename`` (converted to ``str``) is listed in ``actual_filenames``."""
    return str(filename) in actual_filenames

# Flag the records whose PDF exists on disk. A vectorised lookup against a set
# replaces the per-row scan of the whole file list.
metadata['is_in_dir'] = metadata.file_name_pdf.astype(str).isin(set(actual_filenames))

In [24]:
# metadata[metadata.add_part.notna()]

In [36]:
# metadata[metadata.is_in_dir == False]

In [162]:
# Look up the metadata row(s) for one PDF file name.
metadata.loc[metadata["file_name_pdf"] == 'E_2011_2011_SR.20_A_speeches.pdf']

Unnamed: 0,record_id,speaker,speaker_organization,document_symbol,document_symbol_searchable,date,sg_number,year,month,day,body,sub_body,doc_number,doc_type,add_part,file_prefix,file_name_pdf,is_in_dir
351,729398,"Ban, Ki-moon, 1944-",UN. Secretary-General,E/2011/SR.20(A),E2011SR20A,2011-07-07,8,2011,7,7,E,,2011.0,SR.20,A,E_2011_2011_SR.20_A_,E_2011_2011_SR.20_A_speeches.pdf,True


In [163]:
# Look up the metadata row(s) for one document symbol.
metadata.loc[metadata["document_symbol"] == 'E/1984/SR.23']

Unnamed: 0,record_id,speaker,speaker_organization,document_symbol,document_symbol_searchable,date,sg_number,year,month,day,body,sub_body,doc_number,doc_type,add_part,file_prefix,file_name_pdf,is_in_dir
938,302213,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,E/1984/SR.23,E1984SR23,1984-07-04,5,1984,7,4,E,,1984.0,SR.23,,E_1984_1984_SR.23_,E_1984_1984_SR.23_speeches.pdf,True


In [164]:
# All records that carry an additional part (resumptions, corrigenda, combined meetings, ...).
metadata.loc[metadata["add_part"].notna()]

Unnamed: 0,record_id,speaker,speaker_organization,document_symbol,document_symbol_searchable,date,sg_number,year,month,day,body,sub_body,doc_number,doc_type,add_part,file_prefix,file_name_pdf,is_in_dir
42,4060295,"Guterres, António, 1949-",UN. Secretary-General,A/78/PV.50 (Resumption 1),A/78/PV.50 (Resumption 1),2023-12-20,9,2023,12,20,A,,78.0,PV.50,RESUMPTION_1,A_2023_78_PV.50_RESUMPTION_1_,A_2023_78_PV.50_RESUMPTION_1_speeches.pdf,True
351,729398,"Ban, Ki-moon, 1944-",UN. Secretary-General,E/2011/SR.20(A),E2011SR20A,2011-07-07,8,2011,7,7,E,,2011.0,SR.20,A,E_2011_2011_SR.20_A_,E_2011_2011_SR.20_A_speeches.pdf,True
689,461573,"Annan, Kofi, 1938-2018",UN. Secretary-General,S/PV.4506(Resumption2),SPV4506RESUMPTION2,2002-04-04,7,2002,4,4,S,,,PV.4506,RESUMPTION2,S_2002_PV.4506_RESUMPTION2_,S_2002_PV.4506_RESUMPTION2_speeches.pdf,True
741,409004,"Annan, Kofi, 1938-2018",UN. Secretary-General,S/PV.4105(Resumption1),SPV4105RESUMPTION1,2000-02-28,7,2000,2,28,S,,,PV.4105,RESUMPTION1,S_2000_PV.4105_RESUMPTION1_,S_2000_PV.4105_RESUMPTION1_speeches.pdf,True
843,344523,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,S/PV.2977(PartII)(closed-resumption3),SPV2977PARTIICLOSEDRESUMPTION3,1991-02-23,5,1991,2,23,S,,,PV.2977,PARTIICLOSED-RESUMPTION3,S_1991_PV.2977_PARTIICLOSED-RESUMPTION3_,S_1991_PV.2977_PARTIICLOSED-RESUMPTION3_speech...,True
949,801242,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,S/PV.2399andCorr.1,SPV2399ANDCORR1,1982-10-04,5,1982,10,4,S,,,PV.2399,ANDCORR.1,S_1982_PV.2399_ANDCORR.1_,S_1982_PV.2399_ANDCORR.1_speeches.pdf,True
982,807487,"Waldheim, Kurt, 1918-2007",UN. Secretary-General,S/PV.2191andAdd.1,SPV2191ANDADD1,1980-01-11,4,1980,1,11,S,,,PV.2191,ANDADD.1,S_1980_PV.2191_ANDADD.1_,S_1980_PV.2191_ANDADD.1_speeches.pdf,True
1198,3984117,"Lie, Trygve, 1896-1968",UN. Secretary-General,E/SR.19-20,E/SR.19-20,1946-10-03,1,1946,10,3,E,,,SR.19,20,E_1946_SR.19_20_,E_1946_SR.19_20_speeches.pdf,True
1199,3984033,"Lie, Trygve, 1896-1968",UN. Secretary-General,E/SR.17-18,E/SR.17-18,1946-10-02,1,1946,10,2,E,,,SR.17,18,E_1946_SR.17_18_,E_1946_SR.17_18_speeches.pdf,True
1200,3983935,"Lie, Trygve, 1896-1968",UN. Secretary-General,E/SR.15-16,E/SR.15-16,1946-10-01,1,1946,10,1,E,,,SR.15,16,E_1946_SR.15_16_,E_1946_SR.15_16_speeches.pdf,True
