In [1]:
# Dependencies - uncomment to install them into the kernel's environment:
# %pip install pdfplumber
# %pip install PyMuPDF

In [2]:
import pdfplumber
import pandas as pd
import logging

In [3]:
# Set up root logging at INFO level, then fetch the logger used by the cells below.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

In [4]:
# Directory layout: every path is derived from the top-level data folder and
# keeps its trailing "/" so file names can be appended directly.
data_dir = "data/"
dataframe_dir = f"{data_dir}dataframes/"
speeches_dir = f"{data_dir}speeches/"
speeches_pdf_dir = f"{speeches_dir}pdf/"
speeches_txt_dir = f"{speeches_dir}txt/"

# Metadata snapshot names: the first is read as input below; the second is
# presumably the name under which this notebook stores its result - TODO confirm.
last_metadata_step, current_metadata_step = "metadata_s03", "metadata_s04"

# Appended to a document's file prefix to name its extracted text file.
speech_suffix = "speeches.txt"

In [5]:
# Load the metadata table written by the previous step.
# dataframe_dir already ends with "/", so the file name is appended directly
# (as the speech paths are built) instead of inserting a second separator.
metadata = pd.read_csv(f'{dataframe_dir}{last_metadata_step}.csv')

# Text-based extraction: PDF to txt

In [6]:
# Select the rows whose text_based flag equals True, then show how many there are.
is_text_based = metadata["text_based"] == True
tb_metadata = metadata[is_text_based]
len(tb_metadata)

725

In [7]:
# Full wording of the boilerplate sentences (kept for reference; the visible
# cells only match on the shorter openings collected in footer_list).
footer1 = "This record contains the original text of speeches delivered in English and interpretations of speeches delivered in the other languages."
footer2 = "This record contains the text of speeches delivered in English and of the translation of speeches delivered in other languages."
c_footer1 = "This record is subject to correction."
e_footer1 = "Corrections to this record should be submitted in one of the working languages."
trailing = "HOW TO OBTAIN UNITED NATIONS PUBLICATIONS"

# Opening words of each boilerplate sentence; page text is cut where one occurs.
footer_list = [
    "This record contains the original text",
    "This record contains the text of",
    "This record is subject to correction",
    "Corrections to this record should be",
    trailing,
]

Iterate over the PDF documents and extract the text of each one to a txt file.

In [8]:
def _normalize_layout_flag(value):
    """Turn a raw ``two_column_layout`` cell into ``True``, ``False`` or ``'unknown'``."""
    if isinstance(value, bool):
        return value
    # Non-bool cells are compared as text: 'unknown' is kept as a marker that is
    # resolved per document, anything else counts as True only if it equals 'True'.
    # (Original note: 'unknown' only occurred for SR documents and is mostly obsolete.)
    return value == 'True' if value != 'unknown' else value


def _text_of(region, **kwargs):
    """Extract text from a pdfplumber page or crop, mapping a ``None`` result to ``''``."""
    # NOTE(review): extract_text() is believed to return None for regions without
    # characters in older pdfplumber releases; the guard is harmless otherwise.
    return region.extract_text(**kwargs) or ""


def _strip_first_page_footers(text):
    """Cut ``text`` before each ``footer_list`` entry it contains and strip whitespace."""
    for sentence_foot in footer_list:
        if sentence_foot in text:
            text = text.split(sentence_foot)[0].strip()
            log.debug(f'stripped with {sentence_foot}')
    return text


def extract_text_based_pdf(df):
    """Extract the text of every PDF listed in ``df`` into a UTF-8 txt file.

    For each row the PDF ``speeches_pdf_dir + row.file_name_pdf`` is read page by
    page and the text is written to
    ``speeches_txt_dir + row.file_prefix + speech_suffix``.

    Per page:
      * the bottom ``footer`` (70) units are cropped away on every page and the
        top ``header`` (70) units on every page after the first;
      * if the document uses a two-column layout and the page number is at least
        ``row.starting_page``, the left and right halves are extracted
        separately and joined with a space; otherwise the page is extracted as
        a single column;
      * on the first page the text is cut at the boilerplate sentences listed
        in ``footer_list``.

    A layout of ``'unknown'`` is resolved on the first page for text-based rows
    with ``body == 'E'``: the document is treated as two-column if the top of
    the page contains 'Economic and Social Council'. For any other row the
    document is skipped with an error (the already opened txt file stays on
    disk, empty).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``two_column_layout``, ``file_name_pdf``,
        ``file_prefix``, ``document_symbol``, ``text_based``, ``body`` and
        ``starting_page``.

    Returns
    -------
    None
        Results are written to disk; failures are logged (with traceback) and
        do not stop the processing of the remaining rows.
    """
    header, footer = 70, 70
    y_special_E_header = 150
    e_header = 'Economic and Social Council'

    for row in df.itertuples():
        try:
            two_column_layout = _normalize_layout_flag(row.two_column_layout)
            # Set to False when the layout cannot be decided, so that no success
            # message is logged for a document that was actually skipped.
            layout_resolved = True

            with pdfplumber.open(f"{speeches_pdf_dir+row.file_name_pdf}") as pdf, open(f"{speeches_txt_dir+row.file_prefix}{speech_suffix}", "w", encoding="utf-8") as f:

                for page in pdf.pages:
                    x0, y0, x1, y1 = page.bbox

                    if two_column_layout == 'unknown':
                        if (row.text_based and row.body == 'E'):
                            special_E_header = page.crop((x0, y0, x1, y_special_E_header))
                            two_column_layout = e_header in _text_of(special_E_header)
                            log.info(f"document {row.document_symbol} was decided as two columnt layout {two_column_layout}")
                        else:
                            log.error(f"encountered unknown column layout for {row.document_symbol}")
                            layout_resolved = False
                            break

                    # The first page keeps its top area; later pages lose the running header.
                    y0 = (y0 + header) if (page.page_number > 1) else y0
                    y1 = y1 - footer
                    if two_column_layout and (page.page_number >= row.starting_page):
                        # Split at the horizontal midpoint of the page box; this equals
                        # x1/2 whenever the box starts at x0 == 0.
                        x_mid = int((x0 + x1) / 2)
                        content_area_l = page.crop((x0, y0, x_mid, y1))
                        content_area_r = page.crop((x_mid, y0, x1, y1))
                        tex = (_text_of(content_area_l, x_tolerance=1, y_tolerance=2)
                               + ' '
                               + _text_of(content_area_r, x_tolerance=1, y_tolerance=2))
                    else:
                        content_area = page.crop((x0, y0, x1, y1))
                        tex = _text_of(content_area, x_tolerance=1, y_tolerance=2)

                    if page.page_number == 1:
                        tex = _strip_first_page_footers(tex)

                    if tex:
                        f.write(tex + '\n')

            # Reported only after both files are closed and the layout was resolved.
            if layout_resolved:
                log.info(f"successfully saved {row.document_symbol} from file {row.file_name_pdf}")
        except Exception:
            # Best effort: log the failure with its traceback at ERROR level and
            # continue with the next document.
            log.exception(f"when trying to read pdf {row.file_name_pdf}")

In [9]:
# Run the extraction for every text-based document.
# (For a quick trial run, pass tb_metadata.sample(20) instead.)
extract_text_based_pdf(df=tb_metadata)

INFO:__main__:successfully saved A/49/PV.28 from file A_1994_49_PV.28_speeches.pdf
INFO:__main__:successfully saved A/C.1/49/PV.3 from file A_1994_C.1_49_PV.3_speeches.pdf
INFO:__main__:successfully saved A/49/PV.35 from file A_1994_49_PV.35_speeches.pdf
INFO:__main__:successfully saved A/49/PV.39 from file A_1994_49_PV.39_speeches.pdf
INFO:__main__:successfully saved A/C.1/49/PV.19 from file A_1994_C.1_49_PV.19_speeches.pdf
INFO:__main__:successfully saved A/50/PV.1 from file A_1995_50_PV.1_speeches.pdf
INFO:__main__:successfully saved A/50/PV.20 from file A_1995_50_PV.20_speeches.pdf
INFO:__main__:successfully saved A/50/PV.35 from file A_1995_50_PV.35_speeches.pdf
INFO:__main__:successfully saved A/50/PV.40 from file A_1995_50_PV.40_speeches.pdf
INFO:__main__:successfully saved A/50/PV.105 from file A_1996_50_PV.105_speeches.pdf
INFO:__main__:successfully saved A/C.1/51/PV.3 from file A_1996_C.1_51_PV.3_speeches.pdf
INFO:__main__:successfully saved A/51/PV.33 from file A_1996_51_PV.