code by Jakob

In [8]:
# ! pip install spacy

In [9]:
import logging
import os
import re

import pandas as pd
import spacy

In [10]:
# Set up INFO-level logging first, then fetch this notebook's named logger.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

In [11]:
# Download the small English pipeline only when it is not installed yet, so a
# "Restart & Run All" does not hit the network (and ask for a restart) every time.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Load the small English pipeline with NER, lemmatizer and attribute ruler
# switched off; all other pipeline components stay enabled.
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "attribute_ruler"])

In [13]:
# --- directory layout (every path is relative and ends with "/") ---
data_dir = "data/"
dataframe_dir = f"{data_dir}dataframes/"
speeches_dir = f"{data_dir}speeches/"
speeches_pdf_dir = f"{speeches_dir}pdf/"
speeches_txt_dir = f"{speeches_dir}txt/"
speeches_extracted_dir = f"{speeches_dir}extracted/"

# --- pipeline steps: metadata CSV read by / written by this notebook ---
last_metadata_step = 'metadata_s03'
current_metadata_step = 'metadata_s04'

# suffix appended to a file prefix to name the full-text input file
speech_suffix = "speeches.txt"


def extracted_speech_suffix(x, numb=True):
    """Return the file-name suffix for an extracted speech.

    Parameters
    ----------
    x : int
        Zero-based index of the speech within its document.
    numb : bool, default True
        When true the suffix carries the one-based speech number
        (``speech_<x+1>.txt``); otherwise the plain ``speech.txt`` is returned.
    """
    return f"speech_{x+1}.txt" if numb else "speech.txt"

In [14]:
# Load the metadata written by the previous pipeline step.
# `dataframe_dir` already ends with "/", so no extra separator is inserted.
metadata = pd.read_csv(f'{dataframe_dir}{last_metadata_step}.csv')
# Keep only the text-based documents.
# TODO: drop this filter once the image-based pipeline is done.
metadata = metadata[metadata.text_based == True]  # noqa: E712 (element-wise comparison)

# Extract speeches from txt
The regular expressions used to locate the speeches:

In [15]:
# Header line that opens a Secretary-General intervention, e.g.
# "5. The Secretary-General (spoke in Spanish):". Matched case-insensitively.
start_sg = re.compile(
    r"(?m)" # multiline mode: ^ matches at the start of every line
    r"^\s*" # leading whitespace (\s also matches line breaks)
    r"(?:\d+\.?\s*)?" # optional paragraph number, e.g. "5. The Secretary..."
    # r"The\sSecretary-?General"   (earlier variant, kept for reference)
    r"(The\s+)?Secretary-General"
    r"(?:\s*\([^)]*\))?" # optional parenthesised note, e.g. "(spoke in Spanish)"
    r"[\s\n]*:" # the colon, possibly preceded by whitespace / line breaks
    r"\s*", # trailing whitespace, if any
    re.IGNORECASE
)

# Header line of any speaker, e.g. "Mr. Smith (Algeria) (spoke in Spanish):".
# The first such match after a Secretary-General header marks the end of that speech.
# NOTE(review): compiled without re.IGNORECASE and restricted to ASCII letters, so
# every name word must start with an ASCII capital; names with accented letters or
# lower-case particles will not match - confirm this is acceptable for the corpus.
next_speaker = re.compile(
    r"(?m)" # multiline mode: ^ matches at the start of every line
    r"^\s*" # leading whitespace (\s also matches line breaks)
    r"(?:\d+\.\s*)?" # optional paragraph number followed by a period
    r"(?:The|[A-Z]*[A-Za-z]*\.)" # "The" or an abbreviated title ending in a period ("Mr."/"Mrs."/...)
    r"(?:\s+[A-Z][A-Za-z\'\.\-]*){1,5}"  # 1 to 5 capitalised name words
    r"(?:\s*\([^)]*\)){0,3}" # up to 3 parenthesised notes, e.g. "(Algeria) (spoke in Spanish)"
    r"\s*:" # the colon
    r"\s*", # trailing whitespace, if any
)

### Prepare speech metadata
Count tokens, types and sentences of a speech with the spaCy pipeline:

In [77]:
def nlp_counting(text, nlp):
    """Count tokens, types and sentences of ``text``.

    ``text`` is passed through the callable ``nlp``; tokens flagged as
    whitespace are ignored.

    Returns a tuple ``(tokens, types, sentences)`` where ``types`` is the
    number of distinct lower-cased token texts.
    """
    parsed = nlp(text)

    # one pass over the tokens: lower-cased surface form of every non-space token
    word_forms = [tok.text.lower() for tok in parsed if not tok.is_space]
    n_sentences = len(list(parsed.sents))

    return len(word_forms), len(set(word_forms)), n_sentences

Iterate over each file and extract the speeches located by the regular expressions:

In [110]:
speech_metadata = []

# Columns copied unchanged from a document row into each of its speech records.
_CARRIED_COLUMNS = (
    "record_id", "speaker", "speaker_organization", "document_symbol", "date",
    "sg_number", "year", "month", "day", "body", "sub_body", "doc_number",
    "doc_type", "add_part", "lang_field", "doc_url", "date_document", "text_based",
)


def extract_speeches(df):
    """Extract the Secretary-General speeches of every document in ``df``.

    For each row the full-text file ``<speeches_txt_dir><file_prefix><speech_suffix>``
    is read. Every ``start_sg`` match opens a speech, which runs up to the next
    ``next_speaker`` match (or to the end of the text). Each speech is written
    to ``speeches_extracted_dir`` (the matched header line first, then the
    stripped speech) and one record per speech is appended to the module-level
    list ``speech_metadata``.

    Parameters
    ----------
    df : pandas.DataFrame
        Document metadata; must provide ``file_prefix`` and the columns listed
        in ``_CARRIED_COLUMNS``.

    Raises
    ------
    TypeError
        If ``speech_metadata`` is no longer a list (e.g. it was already turned
        into a DataFrame); re-run the cell defining it first.

    Notes
    -----
    Processing is best effort: a failure on one document is logged with its
    traceback and the remaining documents are still processed. Records are
    accumulated, so calling the function twice without resetting
    ``speech_metadata`` duplicates them.
    """
    if not isinstance(speech_metadata, list):
        # fail fast instead of logging one error per document
        raise TypeError(
            "speech_metadata is not a list anymore; re-run the cell that "
            "defines it before extracting again"
        )

    # make sure the output folder exists, otherwise every write below fails
    os.makedirs(speeches_extracted_dir, exist_ok=True)

    for row in df.itertuples():
        try:
            with open(f"{speeches_txt_dir+row.file_prefix}{speech_suffix}", "r", encoding="utf-8") as f:
                text = f.read()

            sg_matches = list(start_sg.finditer(text))
            if not sg_matches:
                # make documents without any detected speech easy to spot in the log
                log.warning(f"no Secretary-General speech found in {row.document_symbol} where file prefix {row.file_prefix}")
                continue
            multiple_matches = len(sg_matches) > 1

            for idx, sg_match in enumerate(sg_matches):
                # the speech runs from the end of the header to the next speaker header
                start_idx = sg_match.end()
                stop_match = next_speaker.search(text, start_idx)
                end_idx = stop_match.start() if stop_match else len(text)
                speech = text[start_idx:end_idx].strip()

                file_name = f"{row.file_prefix}{extracted_speech_suffix(idx, multiple_matches)}"
                with open(f"{speeches_extracted_dir+file_name}", "w", encoding="utf-8") as file:
                    file.write(sg_match.group())
                    file.write("\n")
                    file.write(speech)

                # a run of dot leaders hints at a wrongly cropped header
                # (simple heuristic; candidate for a more precise check)
                if ('. . . . . .') in speech:
                    log.warning(f"{row.document_symbol} may contain a wrongly cropped header")

                # count tokens, types and sentences
                tokens, n_types, sentences = nlp_counting(speech, nlp)

                # one metadata record per speech: document columns + speech-level fields
                record = {column: getattr(row, column) for column in _CARRIED_COLUMNS}
                record.update({
                    "speech_index": idx,
                    "multiple_matches": multiple_matches,
                    "tokens": tokens,
                    "types": n_types,
                    "sentences": sentences,
                    "file_name": file_name,
                })
                speech_metadata.append(record)
            log.info(f"saved {len(sg_matches)} speech(es) for {row.document_symbol} where file prefix {row.file_prefix}")
        except Exception:
            # best effort: keep going with the next document, but keep the traceback
            log.exception(f"while saving {row.document_symbol} where file prefix {row.file_prefix}")


In [111]:
# Run the extraction for every row of `metadata`; the per-speech records are
# appended to the module-level `speech_metadata` list.
extract_speeches(metadata)

INFO:__main__:saved 1 speech(es) for A/49/PV.28 where file prefix A_1994_49_PV.28_
INFO:__main__:saved 1 speech(es) for A/C.1/49/PV.3 where file prefix A_1994_C.1_49_PV.3_
INFO:__main__:saved 1 speech(es) for A/49/PV.35 where file prefix A_1994_49_PV.35_
INFO:__main__:saved 1 speech(es) for A/49/PV.39 where file prefix A_1994_49_PV.39_
INFO:__main__:saved 1 speech(es) for A/C.1/49/PV.19 where file prefix A_1994_C.1_49_PV.19_
INFO:__main__:saved 1 speech(es) for A/50/PV.1 where file prefix A_1995_50_PV.1_
INFO:__main__:saved 1 speech(es) for A/50/PV.20 where file prefix A_1995_50_PV.20_
INFO:__main__:saved 1 speech(es) for A/50/PV.35 where file prefix A_1995_50_PV.35_
INFO:__main__:saved 1 speech(es) for A/50/PV.40 where file prefix A_1995_50_PV.40_
INFO:__main__:saved 1 speech(es) for A/50/PV.105 where file prefix A_1996_50_PV.105_
INFO:__main__:saved 1 speech(es) for A/C.1/51/PV.3 where file prefix A_1996_C.1_51_PV.3_
INFO:__main__:saved 1 speech(es) for A/51/PV.33 where file prefix A

6317 is indeed empty

## speech metadata

In [112]:
# NOTE: this rebinds `speech_metadata` from the list of records to a DataFrame.
# `extract_speeches` appends to the list, so re-run the cell that resets the
# list before running the extraction again.
speech_metadata = pd.DataFrame(speech_metadata)

In [113]:
# Show the first speech record as a quick sanity check.
speech_metadata.head(1)

Unnamed: 0,record_id,speaker,speaker_organization,document_symbol,date,sg_number,year,month,day,body,...,lang_field,doc_url,date_document,text_based,speech_index,multiple_matches,tokens,types,sentences,file_name
0,363235,"Boutros-Ghali, Boutros, 1922-2016",UN. Secretary-General,A/49/PV.28,1994-10-12,6,1994,10,12,A,...,English,https://digitallibrary.un.org/record/169004/fi...,1994-10-12,True,0,False,1356,434,71,A_1994_49_PV.28_speech.txt


In [115]:
# Renumber the rows 0..n-1 (by assignment rather than `inplace=True`) and store
# the result as this step's metadata file; the index is not written to the CSV.
speech_metadata = speech_metadata.reset_index(drop=True)
speech_metadata.to_csv(f'{dataframe_dir}{current_metadata_step}.csv', index=False)