In [None]:
import pandas as pd
import numpy as np  
import requests #! pip install requests
import xml.etree.ElementTree as ET
import time
import logging

In [69]:
base_url = "https://digitallibrary.un.org/search"
namespace = {'nmsp': 'http://www.loc.gov/MARC21/slim'}
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
# logging.basicConfig(filename='logs/example.log', encoding='utf-8', level=logging.INFO)
crawl_delay = 5 # from robots.txt
log.info(f"log with crawl delay {crawl_delay}s")

INFO:__main__:log with crawl delay 5s


In [85]:
filter_only_doc = "fct__1=Documents+and+Publications"
filter_marcxml = "of=xm"
result_max = "rg=100"
search_url = lambda param: f"{base_url}?ln=en&p={param}&f=&rm=&sf=&so=d&rg=50&c=United+Nations+Digital+Library+System&{filter_marcxml}&fti=0&{filter_only_doc}&fti=0"

# minimal search would be: (I currently don't know what the other filters are for)
search_url = lambda param: f"{base_url}?p={param}&ln=en&{filter_marcxml}&{filter_only_doc}&{result_max}"

In [156]:
def search_doc_for_speech_code(session, data, failed_requests, speech_code, speech_code_searchable, crawl_delay=crawl_delay, just_match_first=False): 
    try:
        # pause between requests
        time.sleep(crawl_delay)
        search_res = session.get(search_url(speech_code_searchable))
        if search_res.status_code != 200:
            log.error(f"Web page returned status code: {search_res.status_code} for search of {speech_code}")
            failed_requests.append(speech_code_searchable)

        root = ET.fromstring(search_res.text)
        log.info(f"Searching for: {speech_code}")
        current_results = len(data)

        for record in root.findall('nmsp:record', namespace):
            record_id = record.find("nmsp:controlfield[@tag='001']", namespace)
            field = record.find("nmsp:datafield[@tag='191']", namespace)
            speech_code_found = field.find("nmsp:subfield[@code='a']", namespace)

            exact_match = speech_code_found.text == speech_code
            starts_same = (speech_code_found.text.startswith(str(speech_code + ' '))
                           or 
                           speech_code_found.text.startswith(str(speech_code + '(')))
            is_resumption = "resumption" in speech_code_found.text.lower()
            if exact_match or (starts_same and is_resumption) or just_match_first:
                just_match_first=False
                field = record.find("nmsp:datafield[@tag='992']", namespace)
                if field is None:
                    field = record.find("nmsp:datafield[@tag='269']", namespace) # only year
                    logging.warning(f"The record {speech_code_found.text} will only saved with date as year")
                date = field.find("nmsp:subfield[@code='a']", namespace)
            
                for datafield in record.findall("nmsp:datafield[@tag='856']", namespace):
                    lang_field = datafield.find("nmsp:subfield[@code='y']", namespace)
                    if lang_field.text.lower() == 'english':
                        document_url = datafield.find("nmsp:subfield[@code='u']", namespace)
                        break
                
                data.append([
                    record_id.text, 
                    speech_code_found.text, 
                    lang_field.text,
                    document_url.text,
                    date.text])
                log.info(f"\tMatched {speech_code} to the found: {speech_code_found.text}")
            else:
                log.debug(f"\tcould not match {speech_code} to the found: {speech_code_found.text}")
                continue
    except Exception as e:
        log.error(f"While searching {speech_code_searchable} when last id: {record_id.text} last speech code: {speech_code_found.text}")
        log.exception(e)
        failed_requests.append(speech_code_searchable)
    
    if (len(data)-current_results) == 0:
        log.warning(f"No results found for {speech_code} when searching {speech_code_searchable}")


In [157]:
def create_session():
    session = requests.Session()
    session.headers.update({
        "User-Agent": "UNSG_speech_corpus/1.0",
    })
    log.info('Session created')
    return session

In [None]:
data = []
failed_requests = []
speech_records = pd.read_csv('data/speech_records.csv')
session = create_session()
log.info(f'Has {len(speech_records)} records to search the url of')

for speech_record in speech_records.itertuples():
    search_doc_for_speech_code(session, data, failed_requests, speech_record.speech_code, speech_record.speech_code_searchable)

if len(failed_requests)>0:
    log.warning(f"The following searches failed: {failed_requests}")

INFO:__main__:Session created


INFO:__main__:Searching for: A/S-15/PV.1
INFO:__main__:	Matched A/S-15/PV.1 to the found: A/S-15/PV.1
INFO:__main__:	Matched A/S-15/PV.1 to the found: A/S-15/PV.1
INFO:__main__:Searching for: A/42/PV.110
INFO:__main__:	Matched A/42/PV.110 to the found: A/42/PV.110
INFO:__main__:Searching for: A/42/PV.105
INFO:__main__:	Matched A/42/PV.105 to the found: A/42/PV.105
INFO:__main__:Searching for: A/42/PV.100
INFO:__main__:	Matched A/42/PV.100 to the found: A/42/PV.100
INFO:__main__:Searching for: A/C.5/42/SR.56
INFO:__main__:	Matched A/C.5/42/SR.56 to the found: A/C.5/42/SR.56
INFO:__main__:Searching for: A/C.5/42/SR.55
INFO:__main__:	Matched A/C.5/42/SR.55 to the found: A/C.5/42/SR.55
INFO:__main__:Searching for: A/42/PV.64
INFO:__main__:	Matched A/42/PV.64 to the found: A/42/PV.64
INFO:__main__:Searching for: A/42/PV.48
INFO:__main__:	Matched A/42/PV.48 to the found: A/42/PV.48
INFO:__main__:Searching for: A/C.1/42/PV.20
INFO:__main__:	Matched A/C.1/42/PV.20 to the found: A/C.1/42/PV.20


In [129]:
len(data)

1062

first round of trying to download again

In [158]:
no_results = [ # add code and new searchable
    # list of searches we got no result
    ['A/78/PV.50(Resumption1)', 'A\\/78\\/PV.50 (Resumption1)'],
    ['A/77/PV.74', 'A\\/77\\/PV.74'],
    ['S/PV.2399andCorr.1', 'S\\/PV.2399andCorr.1'],
    ['S/PV.2191andAdd.1', 'S\\/PV.2191andAdd.1'],
    ['S/PV.2179andCorr.1', 'S\\/PV.2179andCorr.1'],
    ['E/1978/SR.17', 'E\\/1978\\/SR.17'],
    ['S/PV.2051andCorr.1', 'S\\/PV.2051andCorr.1'],
    ['S/PV.1782andCorr.1', 'S\\/PV.1782andCorr.1'],
    ['E/SR.21[1946, 3rd sess.]', 'E\\/SR.21 \\[1946, 3rd sess.\\]'],
    ['E/SR.20[1946, 3rd sess.]', 'E\\/SR.20 \\[1946, 3rd sess.\\]'],
    ['E/SR.19[1946, 3rd sess.]', 'E\\/SR.19 \\[1946, 3rd sess.\\]'],
    ['E/SR.17[1946, 3rd sess.]', 'E\\/SR.17 \\[1946, 3rd sess.\\]'],
    ['E/SR.15[1946, 3rd sess.]', 'E\\/SR.15 \\[1946, 3rd sess.\\]'],
    ['E/SR.14[1946, 3rd sess.]', 'E\\/SR.14 \\[1946, 3rd sess.\\]'],
    ['E/SR.3[1946, 3rd sess.]', 'E\\/SR.3 \\[1946, 3rd sess.\\]'],
    ['E/SR.5[1946, 3rd sess.]', 'E\\/SR.5 \\[1946, 3rd sess.\\]'],
    ['E/SR.1[1946, 3rd sess.]', 'E\\/SR.1 \\[1946, 3rd sess.\\]'],
    # failed requests
    ['A/77/PV.74', 'A\\/77\\/PV.74']
]

session = create_session()
failed_requests = []
for no_res in no_results:
    # pause between requests
    search_doc_for_speech_code(session, data, failed_requests, no_res[0], no_res[1], just_match_first=True)


INFO:__main__:Session created
INFO:__main__:Searching for: A/78/PV.50(Resumption1)
INFO:__main__:	Matched A/78/PV.50(Resumption1) to the found: A/78/PV.50 (Resumption 1)
INFO:__main__:Searching for: A/77/PV.74
INFO:__main__:	Matched A/77/PV.74 to the found: A/77/PV.74
INFO:__main__:Searching for: S/PV.2399andCorr.1
INFO:__main__:	Matched S/PV.2399andCorr.1 to the found: S/PV.2399
INFO:__main__:Searching for: S/PV.2191andAdd.1
INFO:__main__:	Matched S/PV.2191andAdd.1 to the found: S/PV.2191ANDAdd.1
INFO:__main__:Searching for: S/PV.2179andCorr.1
INFO:__main__:Searching for: E/1978/SR.17
ERROR:__main__:While searching E\/1978\/SR.17 when last id: 3908382 last speech code: E/1978/SR.17(OR)
ERROR:__main__:cannot access local variable 'lang_field' where it is not associated with a value
Traceback (most recent call last):
  File "C:\Users\schmi\AppData\Local\Temp\ipykernel_7676\1085290260.py", line 41, in search_doc_for_speech_code
    lang_field.text,
    ^^^^^^^^^^
UnboundLocalError: canno

second round of downloading things again

In [160]:
no_results = [ # add code and new searchable
    # list of searches we got no result
    ['S/PV.2179andCorr.1', 'S\\/PV.2179'],
    # ['E/1978/SR.17', 'E\\/1978\\/SR.17'], # lang field error # is empty on web page
    ['S/PV.2051andCorr.1', 'S\\/PV.2051'],
    ['S/PV.1782andCorr.1', 'S\\/PV.1782'],
    # ['E/SR.20[1946, 3rd sess.]', 'E\\/SR.20 \\[1946, 3rd sess.\\]'], # in 19
    # ['E/SR.14[1946, 3rd sess.]', 'E\\/SR.14 \\[1946, 3rd sess.\\]'], # is hidden in result nr 114
    ['E/SR.1[1946, 3rd sess.]', 'E\\/SR.1-2'],
]

session = create_session()
failed_requests = []
for no_res in no_results:
    # pause between requests
    search_doc_for_speech_code(session, data, failed_requests, no_res[0], no_res[1], just_match_first=True)


  ['E/SR.1[1946, 3rd sess.]', 'E\/SR.1-2'],
INFO:__main__:Session created
INFO:__main__:Searching for: S/PV.2179andCorr.1
INFO:__main__:	Matched S/PV.2179andCorr.1 to the found: S/PV.2179
INFO:__main__:Searching for: E/1978/SR.17
ERROR:__main__:While searching E\/1978\/SR.17 when last id: 3908382 last speech code: E/1978/SR.17(OR)
ERROR:__main__:cannot access local variable 'lang_field' where it is not associated with a value
Traceback (most recent call last):
  File "C:\Users\schmi\AppData\Local\Temp\ipykernel_7676\1085290260.py", line 41, in search_doc_for_speech_code
    lang_field.text,
    ^^^^^^^^^^
UnboundLocalError: cannot access local variable 'lang_field' where it is not associated with a value
INFO:__main__:Searching for: S/PV.2051andCorr.1
INFO:__main__:	Matched S/PV.2051andCorr.1 to the found: S/PV.2051
INFO:__main__:Searching for: S/PV.1782andCorr.1
INFO:__main__:	Matched S/PV.1782andCorr.1 to the found: S/PV.1782
INFO:__main__:Searching for: E/SR.1[1946, 3rd sess.]
INFO:

In [198]:
doc_records = pd.DataFrame(data, columns=['record_id', 'speech_code', 'lang_field', 'doc_url', 'date'])
records_len = len(doc_records)
doc_records

Unnamed: 0,record_id,speech_code,lang_field,doc_url,date
0,4089910,S/PV.9988,English,https://digitallibrary.un.org/record/4089910/f...,2025-08-28
1,4086749,S/PV.9962 (Resumption 2),English,https://digitallibrary.un.org/record/4086749/f...,2025-07-24
2,4086712,S/PV.9962 (Resumption 1),English,https://digitallibrary.un.org/record/4086712/f...,2025-07-22
3,4086244,S/PV.9962,English,https://digitallibrary.un.org/record/4086244/f...,2025-07-22
4,4085074,S/PV.9941,English,https://digitallibrary.un.org/record/4085074/f...,2025-06-22
...,...,...,...,...,...
1379,4022905,A/77/PV.74,English,https://digitallibrary.un.org/record/4022905/f...,2023-06-01
1380,10486,S/PV.2179,English,https://digitallibrary.un.org/record/10486/fil...,1979-12-14
1381,224283,S/PV.2051,English,https://digitallibrary.un.org/record/224283/fi...,1977-11-30
1382,47134,S/PV.1782,English,https://digitallibrary.un.org/record/47134/fil...,1974-07-22


In [199]:
wrong_matches = [ 
    'A/77/PV.49 (Resumption 1)',
    'A/76/PV.56 (Resumption 1)', # twice
    'A/76/PV.54 (Resumption 1)',
    'A/75/PV.48 (Resumption 1)',
    'A/74/PV.32 (Resumption 2)',
    'A/74/PV.32 (Resumption 1)',
    'A/S-24/PV.10(Resumption1)/Corr.1',
    'A/S-24/PV.10(Resumption1)',
    'A/S-23/PV.10(Resumption1)',
]
clean_doc_records = doc_records[~doc_records.speech_code.isin(wrong_matches)]
len(clean_doc_records)

1374

In [200]:
display(clean_doc_records[clean_doc_records.duplicated()])
clean_doc_records = clean_doc_records.drop_duplicates()
len(clean_doc_records)

Unnamed: 0,record_id,speech_code,lang_field,doc_url,date
1005,159828,A/47/PV.33,English,https://digitallibrary.un.org/record/159828/fi...,1992-10-12
1378,817254,E/SR.19-20,English,https://digitallibrary.un.org/record/817254/fi...,1946-10-03
1379,4022905,A/77/PV.74,English,https://digitallibrary.un.org/record/4022905/f...,2023-06-01


1371

There is a document duplication for A/S-15/PV.1 on UN site

In [None]:
# keep only one A/S-15/PV.1 -  there is a document duplication on UN site
display(clean_doc_records[clean_doc_records.speech_code.isin(['A/S-15/PV.1'])])
clean_doc_records = clean_doc_records.drop(index=1063)
len(clean_doc_records)

Unnamed: 0,record_id,speech_code,lang_field,doc_url,date
1062,761784,A/S-15/PV.1,English,https://digitallibrary.un.org/record/761784/fi...,1989
1063,39664,A/S-15/PV.1,English,https://digitallibrary.un.org/record/39664/fil...,1988-05-31


1370

I wrongfully downloaded all the resumptions, but the UNSG doesn't speak there so I'll remove them again

In [None]:
keep_values = ['A/78/PV.50 (Resumption 1)', 'S/PV.4506(Resumption2)', 'S/PV.4105(Resumption1)']
contains_resumption = clean_doc_records.speech_code.str.contains("resumption", case=False, na=False)
keep_row = clean_doc_records.speech_code.isin(keep_values)
print(len(clean_doc_records[(contains_resumption & ~keep_row)]))

clean_doc_records = clean_doc_records[~(contains_resumption & ~keep_row)]
len(clean_doc_records)

167


1203

In [None]:
clean_doc_records.reset_index(drop=True, inplace=True)
# clean_doc_records.to_csv('cleaned_doc_records.csv', index=False)

In [207]:
def make_saveable(code):
    code = code.replace('/', '')
    code = code.replace('.', '')
    code = code.replace('[', '')
    code = code.replace(']', '')
    return code

In [223]:
def download_pdf(session, failed_doc_requests, doc_url, speech_code, crawl_delay=crawl_delay):
    try:
        # pause between requests
        time.sleep(crawl_delay)

        response = session.get(doc_url)

        if response.status_code != 200:
            log.error(f"Web page returned status code: {response.status_code} for search of {speech_code}")
            failed_doc_requests.append(doc_url)
            return

        # Write content in pdf file
        pdf = open("data/speeches/pdf/"+make_saveable(speech_code)+".pdf", 'wb')
        pdf.write(response.content)
        pdf.close()
        log.info(f"Downloaded file: {speech_code} to {make_saveable(speech_code)}")
    except Exception as e:
        log.error(f"When processing: {speech_code} with url {doc_url} last status code: {response.status_code} ")
        log.exception(e)
        failed_doc_requests.append(doc_url)
    # finally:
    #     pdf.close()

In [226]:
failed_doc_requests = []
session = create_session()

for doc_record in clean_doc_records.itertuples():
    download_pdf(session, 
                 failed_doc_requests, 
                 doc_record.doc_url, 
                 doc_record.speech_code,
                 crawl_delay=crawl_delay*3)
    if len(failed_doc_requests) > 60:
        break

log.info("All PDF files downloaded")

INFO:__main__:Session created
INFO:__main__:Downloaded file: S/PV.9988 to SPV9988
INFO:__main__:Downloaded file: S/PV.9962 to SPV9962
INFO:__main__:Downloaded file: S/PV.9941 to SPV9941
INFO:__main__:Downloaded file: S/PV.9939 to SPV9939
INFO:__main__:Downloaded file: S/PV.9938 to SPV9938
INFO:__main__:Downloaded file: A/79/PV.72 to A79PV72
INFO:__main__:Downloaded file: S/PV.9919 to SPV9919
INFO:__main__:Downloaded file: A/79/PV.67 to A79PV67
INFO:__main__:Downloaded file: S/PV.9907 to SPV9907
INFO:__main__:Downloaded file: S/PV.9884 to SPV9884
INFO:__main__:Downloaded file: S/PV.9861 to SPV9861
INFO:__main__:Downloaded file: S/PV.9841 to SPV9841
INFO:__main__:Downloaded file: S/PV.9821 to SPV9821
INFO:__main__:Downloaded file: S/PV.9761 to SPV9761
INFO:__main__:Downloaded file: A/C.5/79/SR.5 to AC579SR5
INFO:__main__:Downloaded file: S/PV.9738 to SPV9738
INFO:__main__:Downloaded file: S/PV.9734 to SPV9734
INFO:__main__:Downloaded file: S/PV.9733 to SPV9733
INFO:__main__:Downloaded fi