code by Jakob

In [1]:
import pandas as pd
import logging

In [2]:
# Configure root logging at INFO level, then create this notebook's logger.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

In [3]:
# Directory layout. Every directory string ends with "/", so a file name can
# be appended directly without adding another separator.
data_dir = "data/"
dataframe_dir = data_dir + "dataframes/"
speeches_dir = data_dir + "speeches/"
speeches_pdf_dir = speeches_dir + "pdf/"
speeches_txt_dir = speeches_dir + "txt/"
speeches_extracted_dir = speeches_dir + "extracted/"

# File stems (without ".csv") of the metadata table read and written by this notebook.
last_metadata_step = 'metadata_s02'
current_metadata_step = 'metadata_s03'

speech_suffix = "speeches.txt"


def extracted_speech_suffix(x, numb=True):
    """Return the file-name suffix for an extracted speech.

    Parameters
    ----------
    x : Any
        Value interpolated into the numbered suffix; ignored when ``numb`` is falsy.
    numb : bool, default True
        If truthy, return ``"speech_{x}.txt"``; otherwise return ``"speech.txt"``.

    Returns
    -------
    str
        The suffix string.
    """
    return f"speech_{x}.txt" if numb else "speech.txt"

## load data

In [4]:
# dataframe_dir already ends with "/", so file names are appended directly
# (an extra "/" would produce "data/dataframes//...").
# sr = pd.read_csv(f'{dataframe_dir}speech_records.csv')
metadata = pd.read_csv(f'{dataframe_dir}{last_metadata_step}.csv')
dpi = pd.read_csv(f'{dataframe_dir}documents_processing_info.csv')

Fill missing day and month values with 1 and convert the dates to datetime.

In [5]:
# Parse the metadata dates into datetime values.
metadata["date"] = pd.to_datetime(metadata["date"])

# Default a missing day/month to 1, assemble one `date` column from the
# year/month/day parts, then drop the parts.
dpi = (
    dpi
    .assign(day=dpi["day"].fillna(1), month=dpi["month"].fillna(1))
    .assign(date=lambda frame: pd.to_datetime(frame[["year", "month", "day"]]))
    .drop(columns=["year", "month", "day"])
)

## Preprocessing
Check which PDFs are image-based and which are text-based. We did that by introducing documents_processing_info, where we store the classification of documents; now we map this info back into the metadata. 
Therefore we create sub-dataframes, once with and once without sub_body, and map each of them to documents_processing_info to obtain the text_based column.

In [6]:
# Split documents_processing_info by whether a sub_body is given.
# Both halves are sorted with `date` as the primary key for the as-of merge.
has_sub_body = dpi["sub_body"].notna()
dpi_sb = dpi.loc[has_sub_body].sort_values(["date", "body", "sub_body"])
dpi_nsb = dpi.loc[~has_sub_body].sort_values(["date", "body"])

# Sub-bodies that documents_processing_info lists explicitly.
dpi_listed_subbodies = dpi.loc[has_sub_body, "sub_body"].unique()

# Split the metadata the same way: rows whose sub_body is listed vs. all others.
in_listed_sub_body = metadata["sub_body"].isin(dpi_listed_subbodies)
md_sb = metadata.loc[in_listed_sub_body].sort_values(["date", "body", "sub_body"])
md_nsb = metadata.loc[~in_listed_sub_body].sort_values(["date", "body"])

In [7]:
# Options shared by both as-of merges: each metadata row takes the most recent
# documents_processing_info row dated on or before it ("backward"); overlapping
# right-hand columns get the "_todrop" suffix.
asof_options = dict(
    left_on="date",
    right_on="date",
    direction="backward",
    suffixes=("", "_todrop"),
)

# Rows with a listed sub_body are matched on (body, sub_body) ...
md_merg_sb = pd.merge_asof(md_sb, dpi_sb, by=["body", "sub_body"], **asof_options)
# ... all remaining rows are matched on body alone.
md_merg_nsb = pd.merge_asof(md_nsb, dpi_nsb, by=["body"], **asof_options)

In [8]:
# Stack both merge results, order by body and date, and remove the
# suffixed sub_body column that came from the right-hand frame.
merged_parts = [md_merg_sb, md_merg_nsb]
metadata = (
    pd.concat(merged_parts)
    .sort_values(["body", "date"])
    .drop(columns=["sub_body_todrop"])
)

notable exceptions

In [9]:
# Manual override: force text_based to False for this one document symbol.
is_e_2000_sr_44 = metadata["document_symbol"] == 'E/2000/SR.44'
metadata.loc[is_e_2000_sr_44, "text_based"] = False

In [None]:
# Show the first row of every (body, sub_body, text_based) combination;
# dropna=False keeps groups whose key contains NaN.
classification_keys = ["body", "sub_body", "text_based"]
metadata.groupby(classification_keys, dropna=False).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,record_id,speaker,speaker_organization,document_symbol,date,sg_number,year,month,day,doc_number,...,file_prefix,file_name_pdf,record_id_document,document_symbol_found,lang_field,doc_url,date_document,two_column_layout,left_column_french,starting_page
body,sub_body,text_based,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
A,C.1,False,288722,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,A/C.1/38/PV.11,1983-10-24,5,1983,10,24,38.0,...,A_1983_C.1_38_PV.11_,A_1983_C.1_38_PV.11_speeches.pdf,57551.0,A/C.1/38/PV.11,English,http://digitallibrary.un.org/record/57551/file...,1983-10-24,False,,2
A,C.1,True,365914,"Boutros-Ghali, Boutros, 1922-2016",UN. Secretary-General,A/C.1/49/PV.3,1994-10-17,6,1994,10,17,49.0,...,A_1994_C.1_49_PV.3_,A_1994_C.1_49_PV.3_speeches.pdf,170075.0,A/C.1/49/PV.3,English,https://digitallibrary.un.org/record/170075/fi...,1994-10-17,True,,1
A,C.2,False,328834,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,A/C.2/43/SR.16,1988-10-19,5,1988,10,19,43.0,...,A_1988_C.2_43_SR.16_,A_1988_C.2_43_SR.16_speeches.pdf,59199.0,A/C.2/43/SR.16,English,https://digitallibrary.un.org/record/59199/fil...,1988-10-19,False,,2
A,C.2,True,393116,"Annan, Kofi, 1938-2018",UN. Secretary-General,A/C.2/52/SR.45,1997-12-01,7,1997,12,1,52.0,...,A_1997_C.2_52_SR.45_,A_1997_C.2_52_SR.45_speeches.pdf,252978.0,A/C.2/52/SR.45,English,https://digitallibrary.un.org/record/252978/fi...,1997-12-01,True,,2
A,C.5,False,292642,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,A/C.5/38/SR.7,1983-10-07,5,1983,10,7,38.0,...,A_1983_C.5_38_SR.7_,A_1983_C.5_38_SR.7_speeches.pdf,57598.0,A/C.5/38/SR.7,English,https://digitallibrary.un.org/record/57598/fil...,1983-12-07,False,,2
A,C.5,True,356413,"Boutros-Ghali, Boutros, 1922-2016",UN. Secretary-General,A/C.5/47/SR.71,1993-08-26,6,1993,8,26,47.0,...,A_1993_C.5_47_SR.71_,A_1993_C.5_47_SR.71_speeches.pdf,172932.0,A/C.5/47/SR.71,English,https://digitallibrary.un.org/record/172932/fi...,1993-08-26,False,,2
A,ES-10,True,428414,"Annan, Kofi, 1938-2018",UN. Secretary-General,A/ES-10/PV.14,2000-10-20,7,2000,10,20,,...,A_2000_ES-10_PV.14_,A_2000_ES-10_PV.14_speeches.pdf,425381.0,A/ES-10/PV.14,English,https://digitallibrary.un.org/record/425381/fi...,2000-10-20,True,,1
A,ES-11,True,3967632,"Guterres, António, 1949-",UN. Secretary-General,A/ES-11/PV.1,2022-02-28,9,2022,2,28,,...,A_2022_ES-11_PV.1_,A_2022_ES-11_PV.1_speeches.pdf,3967063.0,A/ES-11/PV.1,English,https://digitallibrary.un.org/record/3967063/f...,2022-02-28,True,,1
A,S-13,False,311611,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,A/S-13/PV.1,1986-05-27,5,1986,5,27,,...,A_1986_S-13_PV.1_,A_1986_S-13_PV.1_speeches.pdf,858165.0,A/S-13/PV.1-8 and Annexes,English,https://digitallibrary.un.org/record/858165/fi...,1988,False,,2
A,S-14,False,312127,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,A/S-14/PV.1,1986-09-17,5,1986,9,17,,...,A_1986_S-14_PV.1_,A_1986_S-14_PV.1_speeches.pdf,120798.0,A/S-14/PV.1,English,https://digitallibrary.un.org/record/120798/fi...,1986-09-17,False,,2


# Filter the Summary Records
The dataset contains a lot of summary records. We decided to filter them out at this point, since summary records do not reproduce the actual words used, and from here on we only want to extract the speeches themselves rather than the general agenda of what was discussed. We kept them up to this point so that we already have the PDFs if the research question gets broader in the future.

In [None]:
# Match the literal text "/SR." in the found document symbol.
# regex=False: otherwise "." is a regex wildcard and matches any character.
# na=False: a missing symbol counts as "not a summary record" instead of
# producing NaN, which cannot be used as a boolean mask.
sr_mask = metadata["document_symbol_found"].str.contains('/SR.', regex=False, na=False)
sr_speeches = metadata[sr_mask]
print(f'there are {len(sr_speeches)} speeches that are a summary record')

there are 182 speeches that are a summary record


In [None]:
# Remove summary records, identified by the literal text "/SR." in the found
# document symbol. A bare "SR" substring would also match symbols that merely
# contain those two letters.
# na=False: rows without a found symbol are kept, and the mask is strictly
# boolean so it can be negated with ~.
is_summary_record = metadata["document_symbol_found"].str.contains('/SR.', regex=False, na=False)
metadata = metadata[~is_summary_record]

In [None]:
# Renumber the rows from 0 and write the result as the current metadata step.
metadata = metadata.reset_index(drop=True)
output_csv = f'{dataframe_dir}{current_metadata_step}.csv'
metadata.to_csv(output_csv, index=False)

### quick info

In [None]:
# Count the remaining rows per text_based value.
metadata["text_based"].value_counts()

text_based
True     725
False    285
Name: count, dtype: int64