In [1]:
import logging
import os
import re

import pandas as pd

In [2]:
# Set up INFO-level logging on the root logger, then take a logger named after this module.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

In [3]:
# Directory layout. Every value is a plain string ending in "/", so a file
# name can be appended by simple concatenation.
data_dir = "data/"
dataframe_dir = f"{data_dir}dataframes/"
speeches_dir = f"{data_dir}speeches/"
speeches_pdf_dir = f"{speeches_dir}pdf/"
speeches_txt_dir = f"{speeches_dir}txt/"
speeches_extracted_dir = f"{speeches_dir}extracted/"

# Metadata CSV stems. `last_metadata_step` is the one loaded below;
# `current_metadata_step` is presumably the one this notebook will write
# (it is not used in the cells shown here -- TODO confirm).
last_metadata_step = 'metadata_s03'
current_metadata_step = 'metadata_s04'

# Suffix appended to a file prefix to locate the full transcript text file.
speech_suffix = "speeches.txt"


def extracted_speech_suffix(x, numb=True):
    """Return the file-name suffix for an extracted speech.

    Args:
        x: Index of the speech; only used when ``numb`` is true.
        numb: When true the suffix is numbered (``speech_<x>.txt``),
            otherwise the fixed suffix ``speech.txt`` is returned.

    Returns:
        The suffix string to append to a file prefix.
    """
    if numb:
        return f"speech_{x}.txt"
    return "speech.txt"

In [4]:
# Read the CSV named by `last_metadata_step` and keep only the rows flagged as text based.
metadata = (
    pd.read_csv(f'{dataframe_dir}/{last_metadata_step}.csv')
    .loc[lambda frame: frame.text_based==True]  # TODO change after image based pipeline is done
)

# Extract speeches from the txt files
The regular expressions:

In [43]:
# Matches the header line that introduces a statement by the Secretary-General,
# e.g. "5. The Secretary-General (spoke in Spanish): ". Matching is
# case-insensitive (re.IGNORECASE). The trailing whitespace is part of the
# match, so `match.end()` points at the first character of the speech itself.
start_sg = re.compile(
    r"(?m)" # multiline mode: "^" matches at the start of every line, not only at the start of the text
    r"^\s*" # leading whitespace (\s also matches newlines, so preceding blank lines are included)
    r"(?:\d+\.?\s*)?" # optional paragraph number with optional dot, e.g. the "5. " in "5. The Secretary..."
    # r"The\sSecretary-?General"
    r"(The\s+)?Secretary-General" # optional "The" (captured as group 1) followed by the literal title
    r"(?:\s*\([^)]*\))?" # at most one optional parenthesised note, e.g. "(spoke in Spanish)"
    r"[\s\n]*:" # any whitespace (including line breaks) followed by the colon
    r"\s*", # whitespace after the colon, if any
    re.IGNORECASE
)

# Matches the header line of any speaker, e.g. "Mr. Smith (Algeria) (spoke in
# Spanish): " or "The President: ". Unlike `start_sg` this pattern is
# case-sensitive: no flags are passed besides the inline "(?m)".
next_speaker = re.compile(
    r"(?m)" # multiline mode: "^" matches at the start of every line
    r"^\s*" # leading whitespace (may span preceding blank lines)
    r"(?:\d+\.\s*)?" # optional paragraph number; here the dot is mandatory
    r"(?:The|[A-Z]*[A-Za-z]*\.)" # "The", or a run of ASCII letters ending in a dot ("Mr.", "Mrs.", ...)
    r"(?:\s+[A-Z][A-Za-z\'\.\-]*){1,5}"  # 1 to 5 capitalised words; apostrophes, dots and hyphens allowed inside
    r"(?:\s*\([^)]*\)){0,3}" # up to 3 parenthesised notes, e.g. "(Algeria) (spoke in Spanish)"
    r"\s*:" # optional whitespace followed by the colon
    r"\s*", # whitespace after the colon, if any
)

Iterate over each file and extract the speeches matched by the regular expressions:

In [46]:
def extract_speeches(df):
    """Extract the Secretary-General's speech(es) from each transcript and save them.

    For every row of ``df`` the transcript file
    ``speeches_txt_dir + file_prefix + speech_suffix`` is read. Each match of
    ``start_sg`` marks the beginning of a speech, which runs until the next
    match of ``next_speaker`` (or to the end of the text when there is none).
    Every speech is written to
    ``speeches_extracted_dir + file_prefix + extracted_speech_suffix(...)``,
    preceded by the matched header text. Output files are numbered only when
    a transcript contains more than one match.

    Processing is best-effort: a failure while handling one row is logged
    with its traceback and the loop continues with the next row.

    Args:
        df: DataFrame whose rows provide ``file_prefix`` and
            ``document_symbol`` attributes.

    Returns:
        None. Results are written to disk and progress is logged.
    """
    for row in df.itertuples():
        try:
            with open(f"{speeches_txt_dir+row.file_prefix}{speech_suffix}", "r", encoding="utf-8") as f:
                text = f.read()

            sg_matches = list(start_sg.finditer(text))
            multiple_matches = len(sg_matches) > 1

            # Without this, every write fails when the output directory has
            # not been created yet. Kept inside the try so that a failure is
            # logged like any other per-row error instead of raising.
            os.makedirs(speeches_extracted_dir, exist_ok=True)

            for idx, sg_match in enumerate(sg_matches):
                # The speech starts right after the header match and ends
                # where the next speaker's header begins.
                start_idx = sg_match.end()
                stop_match = next_speaker.search(text, start_idx)
                end_idx = stop_match.start() if stop_match else len(text)
                speech = text[start_idx:end_idx].strip()

                with open(f"{speeches_extracted_dir+row.file_prefix}{extracted_speech_suffix(idx, multiple_matches)}", "w", encoding="utf-8") as file:
                    file.write(sg_match.group())
                    file.write("\n")
                    file.write(speech)
            log.info(f"saved {len(sg_matches)} speech(es) for {row.document_symbol} where file prefix {row.file_prefix}")
        except Exception:
            # log.exception records the message at ERROR level together with
            # the traceback, so the cause is not lost in a separate INFO line.
            log.exception(f"while saving {row.document_symbol} where file prefix {row.file_prefix}")


In [48]:
# Run the extraction on the first 20 metadata rows only.
extract_speeches(metadata.head(20))

INFO:__main__:saved 1 speech(es) for A/49/PV.28 where file prefix A_1994_49_PV.28_
INFO:__main__:saved 1 speech(es) for A/C.1/49/PV.3 where file prefix A_1994_C.1_49_PV.3_
INFO:__main__:saved 1 speech(es) for A/49/PV.35 where file prefix A_1994_49_PV.35_
INFO:__main__:saved 1 speech(es) for A/49/PV.39 where file prefix A_1994_49_PV.39_
INFO:__main__:saved 1 speech(es) for A/C.1/49/PV.19 where file prefix A_1994_C.1_49_PV.19_
INFO:__main__:saved 1 speech(es) for A/50/PV.1 where file prefix A_1995_50_PV.1_
INFO:__main__:saved 1 speech(es) for A/50/PV.20 where file prefix A_1995_50_PV.20_
INFO:__main__:saved 1 speech(es) for A/50/PV.35 where file prefix A_1995_50_PV.35_
INFO:__main__:saved 1 speech(es) for A/50/PV.40 where file prefix A_1995_50_PV.40_
INFO:__main__:saved 1 speech(es) for A/50/PV.105 where file prefix A_1996_50_PV.105_
INFO:__main__:saved 1 speech(es) for A/C.1/51/PV.3 where file prefix A_1996_C.1_51_PV.3_
INFO:__main__:saved 1 speech(es) for A/51/PV.33 where file prefix A

## Add speech metadata

Add information such as tokens, sentences, ...