Search data obtained from
https://digitallibrary.un.org/search?ln=en&as=1&rm=&sf=year&so=d&rg=100&c=United+Nations+Digital+Library+System&of=hb&fti=0&fct__1=Speeches&fti=0&as_query=JTdCJTIyZGF0ZV9zZWxlY3RvciUyMiUzQSU3QiUyMmRhdGVUeXBlJTIyJTNBJTIyY3JlYXRpb25fZGF0ZSUyMiUyQyUyMmRhdGVQZXJpb2QlMjIlM0ElMjJhbGx5ZWFycyUyMiUyQyUyMmRhdGVGcm9tJTIyJTNBJTIyJTIyJTJDJTIyZGF0ZVRvJTIyJTNBJTIyJTIyJTdEJTJDJTIyY2xhdXNlcyUyMiUzQSU1QiU3QiUyMnNlYXJjaEluJTIyJTNBJTIyYXV0aG9yJTIyJTJDJTIyY29udGFpbiUyMiUzQSUyMmV4YWN0LW1hdGNoJTIyJTJDJTIydGVybSUyMiUzQSUyMlVOLiUyMFNlY3JldGFyeS1HZW5lcmFsJTIyJTJDJTIyb3BlcmF0b3IlMjIlM0ElMjJBTkQlMjIlN0QlNUQlN0Q%3D&action_search=placeholder#searchresultsbox

In [370]:
import json
import xmltodict
from bs4 import BeautifulSoup  
import pandas as pd
import xml.etree.ElementTree as ET
import unicodedata
from datetime import date
import numpy as np  
import re


### load data

In [371]:
# Parse the saved search export and keep its root element; the cells below
# query `root` for the individual records.
tree = ET.parse('data/all_SG_speeches_search.xml')
root = tree.getroot()

In [372]:
# NOTE: `root.tag` is a bare expression that is not the last line of the cell,
# so nothing is displayed for it.
root.tag
# Prefix -> URI mapping passed to find()/findall() so that the 'nmsp:' prefix
# used in the paths below resolves to the MARC21 slim namespace.
namespace = {'nmsp': 'http://www.loc.gov/MARC21/slim'}

In [373]:
# Number of <record> elements directly under the root (findall already returns a list).
len(root.findall("nmsp:record", namespace))

1212

### transform into dataframe

In [374]:
def _subfield_text(record, tag, code, required=True):
    """Return the text of subfield `code` inside the first datafield `tag` of `record`.

    Parameters
    ----------
    record : xml.etree.ElementTree.Element
        One <record> element of the export.
    tag, code : str
        Values of the datafield's `tag` attribute and the subfield's `code` attribute.
    required : bool
        If True (default) a missing datafield/subfield raises ValueError;
        if False, None is returned instead.

    Raises
    ------
    ValueError
        When the subfield is required but absent, naming the offending record.
    """
    field = record.find(f"nmsp:datafield[@tag='{tag}']", namespace)
    subfield = None
    if field is not None:
        subfield = field.find(f"nmsp:subfield[@code='{code}']", namespace)
    if subfield is not None:
        return subfield.text
    if required:
        record_id = record.findtext("nmsp:controlfield[@tag='001']", namespaces=namespace)
        raise ValueError(f"record {record_id!r} has no datafield {tag} / subfield {code}")
    return None


# One row per <record>; the order of the values matches the column list used
# when the DataFrame is built from `data`.
data = []
for record in root.findall('nmsp:record', namespace):
    id_field = record.find("nmsp:controlfield[@tag='001']", namespace)
    if id_field is None:
        raise ValueError("found a record without controlfield 001 (record id)")
    # The value is named `speech_date` rather than `date` so the notebook-level
    # `date` imported from `datetime` is not overwritten by this loop.
    speech_date = _subfield_text(record, '992', 'a')
    data.append([
        id_field.text,
        _subfield_text(record, '700', 'a'),                  # speaker
        _subfield_text(record, '710', 'a'),                  # speaker_organization
        _subfield_text(record, '089', 'a'),                  # doc_type
        _subfield_text(record, '791', 'a'),                  # speech_code
        _subfield_text(record, '791', 'q', required=False),  # speech_code_searchable (may be absent)
        speech_date,
    ])

In [375]:
# Column names, in the same order as the values appended to each row of `data`.
record_columns = [
    'record_id',
    'speaker',
    'speaker_organization',
    'doc_type',
    'speech_code',
    'speech_code_searchable',
    'date',
]
records = pd.DataFrame(data, columns=record_columns)
# Remember the row count so later cells can report how many rows they removed.
records_len = records.shape[0]
records

Unnamed: 0,record_id,speaker,speaker_organization,doc_type,speech_code,speech_code_searchable,date
0,4090177,"Guterres, António, 1949-",UN. Secretary-General,Speech index record,S/PV.9988,,2025-08-28
1,4087505,"Guterres, António, 1949-",UN. Secretary-General,Speech index record,S/PV.9962,,2025-07-22
2,4085871,"Guterres, António, 1949-",UN. Secretary-General,Speech index record,S/PV.9941,SPV9941,2025-06-22
3,4084965,"Guterres, António, 1949-",UN. Secretary-General,Speech index record,S/PV.9939,SPV9939,2025-06-20
4,4084667,"Guterres, António, 1949-",UN. Secretary-General,Speech index record,S/PV.9938,SPV9938,2025-06-19
...,...,...,...,...,...,...,...
1207,3983935,"Lie, Trygve, 1896-1968",UN. Secretary-General,Speech index record,"E/SR.15[1946, 3rd sess.]",,1946-10-01
1208,3983784,"Lie, Trygve, 1896-1968",UN. Secretary-General,Speech index record,"E/SR.14[1946, 3rd sess.]",,1946-09-30
1209,3983385,"Lie, Trygve, 1896-1968",UN. Secretary-General,Speech index record,"E/SR.3[1946, 3rd sess.]",,1946-09-12
1210,3985925,"Lie, Trygve, 1896-1968",UN. Secretary-General,Speech index record,"E/SR.5[1946, 2nd sess.]",,1946-05-31


In [376]:
# Print every speech code that contains a '[' (codes carrying a bracketed suffix).
bracketed_codes = (code for code in records.speech_code if '[' in code)
for code in bracketed_codes:
    print(code)

E/SR.21[1946, 3rd sess.]
E/SR.20[1946, 3rd sess.]
E/SR.19[1946, 3rd sess.]
E/SR.17[1946, 3rd sess.]
E/SR.15[1946, 3rd sess.]
E/SR.14[1946, 3rd sess.]
E/SR.3[1946, 3rd sess.]
E/SR.5[1946, 2nd sess.]
E/SR.1[1946, 2nd sess.]


### transform specific columns

In [377]:
# Unicode-normalize the speaker names: the source uses different encodings of
# the accented "o" in "Guterres, António". NFKD is a *decomposed* form, so any
# later comparison against a literal name must normalize that literal the same way.
records['speaker'] = records['speaker'].map(lambda name: unicodedata.normalize('NFKD', name))
# Parse the date strings into datetime64 values.
records['date'] = pd.to_datetime(records['date'])

In [378]:
# Check the column dtypes after the conversions above.
records.dtypes

record_id                         object
speaker                           object
speaker_organization              object
doc_type                          object
speech_code                       object
speech_code_searchable            object
date                      datetime64[ns]
dtype: object

In [379]:
# List the distinct speaker strings present in the data.
records.speaker.unique()

array(['Guterres, António, 1949-', 'Ban, Ki-moon, 1944-',
       'Annan, Kofi, 1938-2018', 'Boutros-Ghali, Boutros, 1922-2016',
       'Pérez de Cuéllar, Javier, 1920-2020',
       'Waldheim, Kurt, 1918-2007', 'Thant, U, 1909-1974',
       'Hammarskjöld, Dag, 1905-1961', 'Lie, Trygve, 1896-1968'],
      dtype=object)

### remove speeches that were held outside the speaker's term of office

In [380]:
# Placeholder end date for a Secretary-General whose term has not ended yet.
still_active_placeholder = '9999-12-31'

# Every Secretary-General with the first and last day of the term (ISO dates).
# Each name is the leading part of the corresponding value in `records.speaker`.
_sg_terms = [
    ('Guterres, António', '2017-01-01', still_active_placeholder),
    ('Ban, Ki-moon', '2007-01-01', '2016-12-31'),
    ('Annan, Kofi', '1997-01-01', '2006-12-31'),
    ('Boutros-Ghali, Boutros', '1992-01-01', '1996-12-31'),
    ('Pérez de Cuéllar, Javier', '1982-01-01', '1991-12-31'),
    ('Waldheim, Kurt', '1972-01-01', '1981-12-31'),
    ('Thant, U', '1961-11-03', '1971-12-31'),
    ('Hammarskjöld, Dag', '1953-04-10', '1961-09-18'),
    ('Lie, Trygve', '1946-02-02', '1952-11-10'),
]

# name -> {'start': ..., 'end': ...}, in the order listed above.
all_sgs = {
    name: {'start': term_start, 'end': term_end}
    for name, term_start, term_end in _sg_terms
}

In [381]:
def _nfkd(text):
    """Return `text` in NFKD form so composed and decomposed accents compare equal."""
    return unicodedata.normalize('NFKD', text)


# Normalize both the speaker column and the dictionary keys before comparing;
# otherwise names with accents can fail to match when one side is composed and
# the other decomposed.
speakers_nfkd = records['speaker'].map(_nfkd)

# Boolean mask over `records`: True for speeches dated outside the speaker's term.
out_of_term = pd.Series(False, index=records.index)
for sg, term in all_sgs.items():
    # Plain substring match (regex=False): the name is not a regular expression.
    is_sg = speakers_nfkd.str.contains(_nfkd(sg), regex=False)

    start_date = pd.Timestamp(term['start'])
    outside_term = records['date'] < start_date
    # The incumbent has no real end date; the placeholder is not turned into a
    # timestamp and simply means "no upper bound".
    if term['end'] != still_active_placeholder:
        end_date = pd.Timestamp(term['end'])
        outside_term = outside_term | (records['date'] > end_date)

    out_of_term = out_of_term | (is_sg & outside_term)

# Index *labels* of the rows to remove (so .loc / drop are used, not .iloc).
indices_to_drop = records.index[out_of_term]
print('will drop: ')
display(records.loc[indices_to_drop])
records = records.drop(indices_to_drop)

print(f'{records_len - len(records)} indices were dropped because they were speeches out of term times')
records_len = len(records)

will drop: 


Unnamed: 0,record_id,speaker,speaker_organization,doc_type,speech_code,speech_code_searchable,date
537,592461,"Ban, Ki-moon, 1944-",UN. Secretary-General,Speech index record,A/61/PV.78,A61PV78,2006-12-14
538,588393,"Ban, Ki-moon, 1944-",UN. Secretary-General,Speech index record,A/61/PV.31,A61PV31,2006-10-13
798,388390,"Annan, Kofi, 1938-2018",UN. Secretary-General,Speech index record,A/C.3/51/SR.30,AC351SR30,1996-11-08
841,353838,"Boutros-Ghali, Boutros, 1922-2016",UN. Secretary-General,Speech index record,A/47/PV.83,A47PV83,1991-12-10
843,346704,"Boutros-Ghali, Boutros, 1922-2016",UN. Secretary-General,Speech index record,A/46/PV.59,A46PV59,1991-12-03


5 indices were dropped because they were speeches out of term times


### fill in missing searchable speech codes

In [384]:
def make_searchable(code):
    """Derive a searchable form of a speech code.

    '/' and '.' are removed and '[' / ']' are replaced by a space, e.g.
    'S/PV.9988' -> 'SPV9988'. Leading/trailing whitespace is stripped so that a
    code ending in ']' does not produce a key with a trailing space.

    Parameters
    ----------
    code : str
        Speech code as found in the `speech_code` column.

    Returns
    -------
    str
        The searchable form of `code`.
    """
    replacements = str.maketrans({'/': None, '.': None, '[': ' ', ']': ' '})
    return code.translate(replacements).strip()

# Keep the searchable code supplied by the source where there is one; otherwise
# derive it from the plain speech code.
new_searchable = [
    existing if pd.notna(existing) else make_searchable(code)
    for existing, code in zip(records.speech_code_searchable, records.speech_code)
]

records.speech_code_searchable = new_searchable

In [383]:
# Display the frame to inspect the current state of the columns.
records

Unnamed: 0,record_id,speaker,speaker_organization,doc_type,speech_code,speech_code_searchable,date
0,4090177,"Guterres, António, 1949-",UN. Secretary-General,Speech index record,S/PV.9988,SPV 9988,2025-08-28
1,4087505,"Guterres, António, 1949-",UN. Secretary-General,Speech index record,S/PV.9962,SPV 9962,2025-07-22
2,4085871,"Guterres, António, 1949-",UN. Secretary-General,Speech index record,S/PV.9941,SPV9941,2025-06-22
3,4084965,"Guterres, António, 1949-",UN. Secretary-General,Speech index record,S/PV.9939,SPV9939,2025-06-20
4,4084667,"Guterres, António, 1949-",UN. Secretary-General,Speech index record,S/PV.9938,SPV9938,2025-06-19
...,...,...,...,...,...,...,...
1207,3983935,"Lie, Trygve, 1896-1968",UN. Secretary-General,Speech index record,"E/SR.15[1946, 3rd sess.]","ESR 15 1946, 3rd sess",1946-10-01
1208,3983784,"Lie, Trygve, 1896-1968",UN. Secretary-General,Speech index record,"E/SR.14[1946, 3rd sess.]","ESR 14 1946, 3rd sess",1946-09-30
1209,3983385,"Lie, Trygve, 1896-1968",UN. Secretary-General,Speech index record,"E/SR.3[1946, 3rd sess.]","ESR 3 1946, 3rd sess",1946-09-12
1210,3985925,"Lie, Trygve, 1896-1968",UN. Secretary-General,Speech index record,"E/SR.5[1946, 2nd sess.]","ESR 5 1946, 2nd sess",1946-05-31


In [None]:
# Write the cleaned records to CSV; index=False leaves the DataFrame index out of the file.
records.to_csv('data/speech_records.csv', index=False)