code by Huan Li

In [23]:
from pathlib import Path
# The project root is the current working directory, so the notebook has to be
# started from the project folder for the relative paths below to resolve.
PROJECT_ROOT = Path.cwd()
# Folder scanned for the "*.txt" files that get cleaned and spell-corrected.
input_dir = PROJECT_ROOT / "data" / "speeches" / "extracted_from_image"
# Folder that receives the cleaned and corrected text files.
out_dir = PROJECT_ROOT / "data" / "speeches" / "text_cleaned_from_image"

## Clean the extracted text

In [24]:
import re

# 1) Sentence-level punctuation detector. Nothing is removed with this regex:
#    clean_txt_lines only uses it to tell whether a line contains any of these
#    characters; lines without them go through extra "non-body" checks.
punct_re = re.compile(r"[.,:;!?()\-'\"]")

# 2) Boilerplate patterns; a line matching any of them is dropped.
DROP_PATTERNS = [
    r"^Digitized by",
    r"^This record contains",
    r"^Corrections should",
    r"^The meeting (was|rose)",
    r"^Held at Headquarters",
    r"^PROVISIONAL",
    r"^VERBATIM RECORD",
    r"^Security Council",
    r"^UNITED NATIONS",
    r"^\(?Union of Soviet Socialist Republics\)?",
]

# All drop patterns combined into one case-insensitive alternation.
drop_re = re.compile("|".join(DROP_PATTERNS), re.IGNORECASE)

In [25]:
# (A) Editor signature lines, e.g. JP/PLJ, AE/gt, RM/5:
#     1-4 letters, a slash, then 1-6 letters or digits, and nothing else.
editor_sig_re = re.compile(r"^\s*[A-Za-z]{1,4}\s*/\s*[A-Za-z0-9]{1,6}\s*$")

# (B) UN document number lines, e.g. S/PV. 2918, A/S-17/PV.1, S/1986/2705.
# NOTE(review): the pattern matches ANY line that contains a "/" and ends in a
#     digit, so it is broader than document-number-only lines.
slash_number_end_re = re.compile(r"/.*\d\s*$")

# (C) ALL-CAPS titles/headers, which are common in UN docs but not body text:
#     at least 6 characters, at least 6 capital letters, and made up only of
#     capitals, digits, whitespace and the punctuation ( ) - , ' . / : ;
upper_title_re = re.compile(
    r"^(?=.{6,}$)(?=.*[A-Z].*[A-Z].*[A-Z].*[A-Z].*[A-Z].*[A-Z])[A-Z0-9\s\(\)\-,'\.\/:;]+$"
)

# (D) Boilerplate keywords (case-insensitive, whole words); clean_txt_lines
#     applies them only to lines that contain no sentence-level punctuation.
non_punct_noise_kw_re = re.compile(
    r"\b(UNITED|NATIONS|Security Council|AGENDA|ITEMS?|Members?|President|PROVISIONAL|VERBATIM|RECORD)\b",
    re.IGNORECASE
)

def clean_txt_lines(text: str, debug_n: int = 50):
    """
    Clean OCR text line by line and return the surviving lines as one string.

    Every line is stripped of surrounding whitespace; blank lines are skipped.
    A line is always dropped when it
      - is an editor signature (editor_sig_re),
      - contains a slash and ends in a digit (slash_number_end_re),
      - is an ALL-CAPS title/header (upper_title_re), or
      - matches a fixed boilerplate pattern (drop_re).
    A line with NO sentence-level punctuation (punct_re) is dropped only when
    it is clearly non-body: all digits, at most 4 characters long, all
    upper-case, or containing a boilerplate keyword (non_punct_noise_kw_re).

    Args:
      text: raw OCR text, possibly spanning many lines.
      debug_n: accepted for backward compatibility only. Earlier versions
        collected up to `debug_n` dropped samples but never returned them, so
        the parameter has no effect on the result.

    Returns:
      cleaned_text (str): the kept lines joined with newline characters.
    """
    cleaned = []

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Rules A-D: unconditional drops (signatures, doc numbers, ALL-CAPS
        # headers, fixed boilerplate).
        if (
            editor_sig_re.match(line)
            or slash_number_end_re.search(line)
            or upper_title_re.match(line)
            or drop_re.search(line)
        ):
            continue

        # Lines without sentence-level punctuation are kept unless they look
        # like page numbers, fragments, headers or keyworded boilerplate.
        if not punct_re.search(line) and (
            line.isdigit()
            or len(line) <= 4
            or line.isupper()
            or non_punct_noise_kw_re.search(line)
        ):
            continue

        cleaned.append(line)

    return "\n".join(cleaned)


## OCR post-processing

In [26]:
from wordfreq import zipf_frequency, top_n_list
from symspellpy import SymSpell, Verbosity

# -------- Lexicon check --------
def in_general_lexicon(word: str) -> bool:
    """Return True when *word* has an English Zipf frequency of at least 2.0."""
    return zipf_frequency(word, "en") >= 2.0

# -------- Build SymSpell dictionary --------
# Spell-checker index allowing suggestions up to 2 edits away; it is filled
# with the 200000 entries returned by wordfreq's top_n_list for English.
symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
for w in top_n_list("en", 200000):
    # Scale the Zipf frequency to an integer count and keep it at least 1.
    freq = int(zipf_frequency(w, "en") * 1000)
    symspell.create_dictionary_entry(w, max(freq, 1))

# -------- Protection rules --------
DOCNO_RE = re.compile(r"(A/\d+|S/\d+|S/PV\.|A/\d+/PV\.|\d{4})")

def is_protected(token: str) -> bool:
    """
    Tell whether *token* must be left untouched by the spelling corrector.

    A token is protected when it is Title Case (Contingents, Redeployment),
    carries a possessive marker (Hugo's / Hugo’s), is a short ALL-CAPS word
    (at most 6 characters), contains a digit or a slash, or matches DOCNO_RE
    (document-number-like text such as S/PV.2918, A/S-17/PV.1, S/1986/2705).
    """
    # Title Case words are treated as names/proper nouns.
    if token.istitle():
        return True

    has_possessive = any(marker in token for marker in ("'s", "’s"))
    short_all_caps = token.isupper() and len(token) <= 6
    has_digit = any(ch.isdigit() for ch in token)

    return (
        has_possessive
        or short_all_caps
        or has_digit
        or "/" in token
        or DOCNO_RE.search(token) is not None
    )

# Candidate tokens for correction: a letter followed by at least two more
# letters, hyphens or apostrophes (so only tokens of 3+ characters match).
WORD_RE = re.compile(r"[A-Za-z][A-Za-z\-']{2,}")

# -------- Risk filters to avoid bad "prefix stripping" --------
# Prefixes whose removal usually yields another real word with a different
# meaning, so a "correction" that merely strips one of them is rejected.
COMMON_PREFIXES = {
    "re","de","un","im","in","en","sub","pre","pro","con","dis","mis","non",
    "over","under","inter","trans","anti","auto","post","fore","counter"
}

def is_plural_flip(orig: str, cand: str) -> bool:
    """
    Return True when *orig* and *cand* differ only by a trailing "s" or "es".

    The comparison is case-insensitive and symmetric (singular -> plural and
    plural -> singular both count).
    """
    o, c = orig.lower(), cand.lower()
    return o in (c + "s", c + "es") or c in (o + "s", o + "es")

def is_prefix_stripping_risk(orig: str, cand: str) -> bool:
    """
    Return True when *cand* is just *orig* with a common prefix removed.

    Examples: entrusting -> trusting ("en"), redeployment -> deployment ("re"),
    overestimate -> estimate ("over"). The comparison is case-insensitive.

    The stripped part is checked against every entry of COMMON_PREFIXES; the
    previous implementation only considered differences of 1-3 characters, so
    the longer prefixes in that set could never match.

    Args:
      orig: the token as found in the text.
      cand: the replacement proposed by the spell checker.

    Returns:
      True if *orig* ends with *cand* and the leading remainder is a common
      prefix; False otherwise (including identical or empty candidates).
    """
    o, c = orig.lower(), cand.lower()

    # An empty or identical candidate strips nothing; a candidate that is not
    # a suffix of the original cannot be the result of removing a prefix.
    if not c or c == o or not o.endswith(c):
        return False

    stripped_prefix = o[: len(o) - len(c)]
    return stripped_prefix in COMMON_PREFIXES

# -------- Correct one token --------
def correct_token(token: str):
    """
    Propose a spelling correction for a single token.

    Returns a pair (text, changed). The token comes back unchanged when it is
    protected, already in the general lexicon, has no SymSpell suggestion, or
    when the closest suggestion is itself out of lexicon, a mere plural flip,
    or a prefix-stripped variant. Otherwise the closest suggestion is returned,
    upper-cased if the original token was all upper-case.
    """
    unchanged = (token, False)

    if is_protected(token):
        return unchanged

    lowered = token.lower()
    if in_general_lexicon(lowered):
        return unchanged

    candidates = symspell.lookup(lowered, Verbosity.CLOSEST, max_edit_distance=2)
    if not candidates:
        return unchanged

    top = candidates[0].term
    if not in_general_lexicon(top):
        return unchanged

    # Reject replacements that only add/remove a plural ending or a prefix.
    if is_plural_flip(lowered, top) or is_prefix_stripping_risk(lowered, top):
        return unchanged

    replacement = top.upper() if token.isupper() else top
    return replacement, True

def correct_text_fast(text: str):
    """
    Spell-correct every WORD_RE token in *text*.

    Returns (corrected_text, changes) where changes lists the
    (original, replacement) pair of each token that was altered, in order of
    appearance. Text between tokens is copied through untouched.
    """
    changes = []
    pieces = []
    cursor = 0

    for found in WORD_RE.finditer(text):
        original = found.group(0)
        replacement, was_changed = correct_token(original)
        if was_changed:
            changes.append((original, replacement))

        # Copy the gap before this token, then the (possibly corrected) token.
        pieces.append(text[cursor:found.start()])
        pieces.append(replacement)
        cursor = found.end()

    pieces.append(text[cursor:])
    return "".join(pieces), changes

In [27]:

# Create the output folder up front; write_text below fails if it is missing.
out_dir.mkdir(parents=True, exist_ok=True)

# Sort so files are processed (and reported) in a deterministic order.
files = sorted(input_dir.glob("*.txt"))
summary_stats = []

# iterate over all txt files in input_dir, clean and correct them, save to out_dir, and collect summary stats
for in_path in files:

    fname = in_path.name
    out_path = out_dir / fname

    print(f"\n=== Processing: {fname} ===")

    # Undecodable bytes are replaced rather than aborting the whole batch.
    raw_text = in_path.read_text(encoding="utf-8", errors="replace")

    # clean text lines conservatively
    cleaned_text = clean_txt_lines(raw_text)

    # spelling correction
    corrected_text, changes = correct_text_fast(cleaned_text)

    # summary stats
    num_changes = len(changes)
    summary_stats.append((fname, num_changes))

    print(f"Corrected tokens: {num_changes}")

    if num_changes > 0:
        print("First 10 corrections:")
        for a, b in changes[:10]:
            print(f"  {a} → {b}")
    else:
        print("No corrections made.")

    # save cleaned and corrected text to out_dir
    out_path.write_text(corrected_text, encoding="utf-8")

print("\n====== SUMMARY ======")

# Only files with at least one correction are listed.
any_changes = False

for fname, count in summary_stats:
    if count > 0:
        print(f"{fname}: {count}")
        any_changes = True

if not any_changes:
    print("No files needed corrections.")

# Report completion regardless of whether any token was corrected.
print("\nAll done.")



=== Processing: A_1983_C.1_38_PV.11_speeches.txt ===
Corrected tokens: 0
No corrections made.

=== Processing: A_1984_C.1_39_PV.12_speeches.txt ===
Corrected tokens: 0
No corrections made.

=== Processing: A_1985_40_PV.117_speeches.txt ===
Corrected tokens: 0
No corrections made.

=== Processing: A_1985_40_PV.122_speeches.txt ===
Corrected tokens: 2
First 10 corrections:
  biennia → biennial
  triplications → replications

=== Processing: A_1985_40_PV.122_speeches__02.txt ===
Corrected tokens: 0
No corrections made.

=== Processing: A_1985_40_PV.36_speeches.txt ===
Corrected tokens: 0
No corrections made.

=== Processing: A_1985_40_PV.49_speeches.txt ===
Corrected tokens: 1
First 10 corrections:
  rampancy → rampant

=== Processing: A_1985_40_PV.66_speeches.txt ===
Corrected tokens: 0
No corrections made.

=== Processing: A_1985_40_PV.75_speeches.txt ===
Corrected tokens: 0
No corrections made.

=== Processing: A_1985_C.1_40_PV.21_speeches.txt ===
Corrected tokens: 0
No corrections ma

## Extract Speech metadata

In [28]:
import pandas as pd

# Path of the metadata CSV file (a file path, despite the "_dir" suffix).
meta_dir = PROJECT_ROOT / "data" / "dataframes" / "metadata_s03.csv"
# "utf-8-sig" also accepts a file that starts with a UTF-8 byte-order mark.
meta = pd.read_csv(meta_dir,encoding="utf-8-sig")

In [29]:
# Preview the first rows of the metadata table.
meta.head()

Unnamed: 0,record_id,speaker,speaker_organization,document_symbol,date,sg_number,year,month,day,body,...,file_name_pdf,record_id_document,document_symbol_found,lang_field,doc_url,date_document,text_based,two_column_layout,left_column_french,starting_page
0,380162,"Lie, Trygve, 1896-1968",UN. Secretary-General,A/PV.289,1950-09-28,1,1950,9,28,A,...,A_1950_PV.289_speeches.pdf,646684.0,A/PV.289,English,https://digitallibrary.un.org/record/646684/fi...,1950-09-28,False,True,,1
1,380215,"Lie, Trygve, 1896-1968",UN. Secretary-General,A/PV.348,1951-11-16,1,1951,11,16,A,...,A_1951_PV.348_speeches.pdf,735242.0,A/PV.348,English,http://digitallibrary.un.org/record/735242/fil...,1951-11-16,False,True,,1
2,4089214,"Hammarskjöld, Dag, 1905-1961",UN. Secretary-General,A/PV.690,1957-09-26,2,1957,9,26,A,...,A_1957_PV.690_speeches.pdf,698477.0,A/PV.690,English,http://digitallibrary.un.org/record/698477/fil...,1957-09-26,False,True,,1
3,4086528,"Hammarskjöld, Dag, 1905-1961",UN. Secretary-General,A/PV.871,1960-09-26,2,1960,9,26,A,...,A_1960_PV.871_speeches.pdf,709690.0,A/PV.871,English,https://digitallibrary.un.org/record/709690/fi...,1960-09-26,False,True,,1
4,381101,"Hammarskjöld, Dag, 1905-1961",UN. Secretary-General,A/PV.883,1960-10-03,2,1960,10,3,A,...,A_1960_PV.883_speeches.pdf,698629.0,A/PV.883,English,https://digitallibrary.un.org/record/698629/fi...,1960-10-03,False,True,,1


In [36]:
import re

# Derive, for every cleaned text file, the PDF file name used in the metadata
# table ("<base>_speeches.pdf"). No file is renamed or created here; the loop
# only collects the distinct PDF names in `created_names`.
# Sorted so the resulting list has a deterministic order.
files = sorted(out_dir.glob("*.txt"))

created_names = []

for in_path in files:
    name = in_path.name

    # Keep everything up to "_speeches" (drops suffixes such as "__02.txt")
    match = re.match(r"(.+?_speeches)", name)
    if not match:
        continue

    new_name = match.group(1) + ".pdf"
    new_path = out_dir / new_name
    print(f"Processing: {name} → {new_name}")

    # Skip if already collected in this loop (several txt parts, one PDF)
    if new_name in created_names:
        continue

    # Skip if a file with that name already exists in the folder
    # NOTE(review): such names are then missing from created_names and from
    # the metadata subset built from it; confirm this is intended.
    if new_path.exists():
        print(f"Skip (already exists): {new_name}")
        continue

    created_names.append(new_name)

    # Nothing on disk is touched, so report a mapping rather than a rename.
    print(f"Mapped: {in_path.name} → {new_name}")

print("Done.")

Processing: A_1983_C.1_38_PV.11_speeches.txt → A_1983_C.1_38_PV.11_speeches.pdf
Renamed: A_1983_C.1_38_PV.11_speeches.txt → A_1983_C.1_38_PV.11_speeches.pdf
Processing: A_1984_C.1_39_PV.12_speeches.txt → A_1984_C.1_39_PV.12_speeches.pdf
Renamed: A_1984_C.1_39_PV.12_speeches.txt → A_1984_C.1_39_PV.12_speeches.pdf
Processing: A_1985_40_PV.117_speeches.txt → A_1985_40_PV.117_speeches.pdf
Renamed: A_1985_40_PV.117_speeches.txt → A_1985_40_PV.117_speeches.pdf
Processing: A_1985_40_PV.122_speeches.txt → A_1985_40_PV.122_speeches.pdf
Renamed: A_1985_40_PV.122_speeches.txt → A_1985_40_PV.122_speeches.pdf
Processing: A_1985_40_PV.122_speeches__02.txt → A_1985_40_PV.122_speeches.pdf
Processing: A_1985_40_PV.36_speeches.txt → A_1985_40_PV.36_speeches.pdf
Renamed: A_1985_40_PV.36_speeches.txt → A_1985_40_PV.36_speeches.pdf
Processing: A_1985_40_PV.49_speeches.txt → A_1985_40_PV.49_speeches.pdf
Renamed: A_1985_40_PV.49_speeches.txt → A_1985_40_PV.49_speeches.pdf
Processing: A_1985_40_PV.66_speeches

In [37]:
# Show the collected PDF file names.
created_names

['A_1983_C.1_38_PV.11_speeches.pdf',
 'A_1984_C.1_39_PV.12_speeches.pdf',
 'A_1985_40_PV.117_speeches.pdf',
 'A_1985_40_PV.122_speeches.pdf',
 'A_1985_40_PV.36_speeches.pdf',
 'A_1985_40_PV.49_speeches.pdf',
 'A_1985_40_PV.66_speeches.pdf',
 'A_1985_40_PV.75_speeches.pdf',
 'A_1985_C.1_40_PV.21_speeches.pdf',
 'A_1986_40_PV.124_speeches.pdf',
 'A_1986_40_PV.127_speeches.pdf',
 'A_1986_41_PV.102_speeches.pdf',
 'A_1986_41_PV.33_speeches.pdf',
 'A_1986_41_PV.42_speeches.pdf',
 'A_1986_41_PV.54_speeches.pdf',
 'A_1986_C.1_41_PV.23_speeches.pdf',
 'A_1986_S-13_PV.1_speeches.pdf',
 'A_1986_S-13_PV.5_speeches.pdf',
 'A_1986_S-14_PV.1_speeches.pdf',
 'A_1987_42_PV.34_speeches.pdf',
 'A_1987_42_PV.41_speeches.pdf',
 'A_1987_42_PV.44_speeches.pdf',
 'A_1987_42_PV.48_speeches.pdf',
 'A_1987_42_PV.64_speeches.pdf',
 'A_1987_C.1_42_PV.20_speeches.pdf',
 'A_1988_42_PV.100_speeches.pdf',
 'A_1988_42_PV.105_speeches.pdf',
 'A_1988_42_PV.110_speeches.pdf',
 'A_1988_42_PV.114_speeches.pdf',
 'A_1988_42

In [38]:
# Number of distinct PDF file names collected.
len(created_names)

80

In [39]:
# Keep only the metadata rows whose "file_name_pdf" is one of the PDF names
# derived from the cleaned text files.
meta_sub = meta[meta['file_name_pdf'].isin(created_names)]

In [40]:
# Inspect the filtered metadata.
meta_sub

Unnamed: 0,record_id,speaker,speaker_organization,document_symbol,date,sg_number,year,month,day,body,...,file_name_pdf,record_id_document,document_symbol_found,lang_field,doc_url,date_document,text_based,two_column_layout,left_column_french,starting_page
9,288722,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,A/C.1/38/PV.11,1983-10-24,5,1983,10,24,A,...,A_1983_C.1_38_PV.11_speeches.pdf,57551.0,A/C.1/38/PV.11,English,http://digitallibrary.un.org/record/57551/file...,1983-10-24,False,False,,2
11,295415,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,A/C.1/39/PV.12,1984-10-24,5,1984,10,24,A,...,A_1984_C.1_39_PV.12_speeches.pdf,69799.0,A/C.1/39/PV.12,English,http://digitallibrary.un.org/record/69799/file...,1984-10-24,False,False,,2
15,304596,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,A/40/PV.36,1985-10-16,5,1985,10,16,A,...,A_1985_40_PV.36_speeches.pdf,100056.0,A/40/PV.36,English,https://digitallibrary.un.org/record/100056/fi...,1985-10-16,False,False,,2
16,304835,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,A/40/PV.49,1985-10-24,5,1985,10,24,A,...,A_1985_40_PV.49_speeches.pdf,101376.0,A/40/PV.49,English,https://digitallibrary.un.org/record/101376/fi...,1985-10-24,False,False,,2
17,305254,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,A/C.1/40/PV.21,1985-10-31,5,1985,10,31,A,...,A_1985_C.1_40_PV.21_speeches.pdf,103770.0,A/C.1/40/PV.21,English,http://digitallibrary.un.org/record/103770/fil...,1985-10-31,False,False,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
582,341121,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,S/PV.2963,1990-11-29,5,1990,11,29,S,...,S_1990_PV.2963_speeches.pdf,105331.0,S/PV.2963,English,https://digitallibrary.un.org/record/105331/fi...,1990-11-29,False,False,,2
583,344523,"Pérez de Cuéllar, Javier, 1920-2020",UN. Secretary-General,S/PV.2977(PartII)(closed-resumption3),1991-02-23,5,1991,2,23,S,...,S_1991_PV.2977_PARTIICLOSED-RESUMPTION3_speech...,110472.0,S/PV.2977(PartII)(closed-resumption3),English,https://digitallibrary.un.org/record/110472/fi...,1991-02-23,False,False,,2
584,348619,"Boutros-Ghali, Boutros, 1922-2016",UN. Secretary-General,S/PV.3046,1992-01-31,6,1992,1,31,S,...,S_1992_PV.3046_speeches.pdf,196999.0,S/PV.3046,English,https://digitallibrary.un.org/record/196999/fi...,1992-01-31,False,False,,2
585,349813,"Boutros-Ghali, Boutros, 1922-2016",UN. Secretary-General,S/PV.3057,1992-02-28,6,1992,2,28,S,...,S_1992_PV.3057_speeches.pdf,139573.0,S/PV.3057,English,https://digitallibrary.un.org/record/139573/fi...,1992-02-28,False,False,,2


In [41]:
# Save the filtered metadata without the DataFrame index column.
meta_sub.to_csv(PROJECT_ROOT / "data" / "dataframes" / "metadata_s06.csv", index=False, encoding="utf-8-sig")

In [44]:
# Sanity check: PDF names derived from the text files that have no matching
# row in the filtered metadata.
set(created_names) - set(meta_sub['file_name_pdf'].unique())

{'A_1986_S-13_PV.5_speeches.pdf'}