code by Huan Li

In [None]:
from pathlib import Path

In [27]:
BASE_DIR = Path.cwd()

SPEECHES_DIR = BASE_DIR / "data" / "speeches"
EXTRACTED_OCR_DIR = SPEECHES_DIR / "extracted_ocr"
CLEANED_OCR_DIR = SPEECHES_DIR / "cleaned_ocr"
CLEANED_OCR_DIR.mkdir(exist_ok=True)

txt_path = EXTRACTED_OCR_DIR / "A_1950_PV.289_speeches.txt"


In [28]:
# Load the txt file
text = txt_path.read_text(encoding="utf-8")


In [29]:
text

"34. The SECRETARY-GENERAL:\n\n34. The SECRETARY-GENERAL: We are all, I\nam sure, ready and eager, after seven days of general\ndebate, to begin work here‘and in the committees upon\nthe seventy items of our agenda. What I have to say\ntoday will therefore be very, very brief. _\n35. First, allow me to express my gratitude for the\nreferences made during the debate to the Secretariat.\n8 See Oficial Records of the General Assembly, Fourth Ses-\nsion, Plenary Meetings, 227th meeting.\n\nWe shall continue to do all we can to help make this\nsession of the General Assembly a fruitful one.\n36. Ata moment when the world is more sharply and\ndangerously divided than at any time since the United\nNations was founded, I have been greatly impressed by\nthe strong support for the principles of the Charter and\nthe faith in the United Nations approach to the problem\nof peace that have been expressed by so many speakers\nin the general debate.\n37. It must be admitted, of course, that the basic\

In [30]:
import re

def normalize_paragraph_numbers(text: str) -> str:
    """
    Normalize paragraph numbering at paragraph starts ONLY.

    Rules:
    - Only affect numbers at line start or after newline
    - Remove stray symbols before the number
    - Ensure paragraph number ends with '.'
    """

    # remove quotes before number： "' 46." → "46."
    text = re.sub(
        r"(^|\n)\s*[\'‘’]+\s*(\d+)\.", 
        r"\1\2.",
        text
    )

    #  remove quotes after number → period： "47," → "47."
    text = re.sub(
        r"(^|\n)\s*(\d+)\s*,",
        r"\1\2.",
        text
    )

    return text


In [31]:
text_normalize = normalize_paragraph_numbers(text)

In [32]:
def clean_ocr_symbols(text: str) -> str:
    """
    Clean common OCR artifacts:
    - ._  -> .
    - remove stray quotes (` ‘ ’ ´)
    - ..  -> .
    """

    # ---------- 1. ._ -> .
    text = re.sub(r"\.\s*_", ".", text)

    # ---------- 2. remove stray quote-like characters
    text = re.sub(r"[‘’`´]", " ", text)

    # ---------- 3. collapse multiple dots
    text = re.sub(r"\.{2,}", ".", text)


    return text


In [33]:
txt_cleaned = clean_ocr_symbols(text_normalize)

In [34]:
lines = txt_cleaned.splitlines()

for i, line in enumerate(lines[:3]):
    print(f"LINE {i}: {repr(line)}")


LINE 0: '34. The SECRETARY-GENERAL:'
LINE 1: ''
LINE 2: '34. The SECRETARY-GENERAL: We are all, I'


In [None]:
# Remove redundant "The SECRETARY-GENERAL:" headers 
PURE_SG = re.compile(r"^\s*\d+\.\s*The\s+SECRETARY-?GENERAL\s*:\s*$", re.I)
SG_WITH_TEXT = re.compile(r"^\s*\d+\.\s*The\s+SECRETARY-?GENERAL\s*:\s*\S+", re.I)

def drop_redundant_sg_header(lines, lookahead_nonempty=2):
    out = []
    i = 0
    while i < len(lines):
        cur = lines[i].strip()

        if PURE_SG.match(cur):
            # look ahead next non-empty lines
            seen = 0
            j = i + 1
            found_repeat = False
            while j < len(lines) and seen < lookahead_nonempty:
                nxt = lines[j].strip()
                if nxt:
                    seen += 1
                    if SG_WITH_TEXT.match(nxt):
                        found_repeat = True
                        break
                j += 1

            if found_repeat:
                # drop this pure header line
                i += 1
                continue

        out.append(lines[i])
        i += 1
    return out


In [36]:
lines = txt_cleaned.splitlines()
lines = drop_redundant_sg_header(lines)
text_drop_dup = "\n".join(lines)


In [37]:
text_drop_dup

'\n34. The SECRETARY-GENERAL: We are all, I\nam sure, ready and eager, after seven days of general\ndebate, to begin work here and in the committees upon\nthe seventy items of our agenda. What I have to say\ntoday will therefore be very, very brief.\n35. First, allow me to express my gratitude for the\nreferences made during the debate to the Secretariat.\n8 See Oficial Records of the General Assembly, Fourth Ses-\nsion, Plenary Meetings, 227th meeting.\n\nWe shall continue to do all we can to help make this\nsession of the General Assembly a fruitful one.\n36. Ata moment when the world is more sharply and\ndangerously divided than at any time since the United\nNations was founded, I have been greatly impressed by\nthe strong support for the principles of the Charter and\nthe faith in the United Nations approach to the problem\nof peace that have been expressed by so many speakers\nin the general debate.\n37. It must be admitted, of course, that the basic\ndifferences between the two s

In [None]:
# Remove "See Official Records" reference blocks
REF_START = re.compile(
    r"(?:^|\b)(?:\d+|[A-Z])?\s*See\s+Official\s+Records\b",
    re.I
)

# Not a sentence-ending dot: No. / o.
NON_TERMINAL_DOT = re.compile(
    r"\b(?:No|no|o)\.\s*\d+\.\s*$"
)

# True sentence-ending dot (end of line)
SENTENCE_END = re.compile(r"\.\s*(?:$|[\"”'])")

def remove_reference_blocks(text: str) -> str:
    lines = text.splitlines()
    out = []

    skipping = False
    saw_terminal_dot = False  

    for line in lines:
        l = line.strip()

        if not skipping and REF_START.search(l):
            skipping = True
            saw_terminal_dot = False
            continue

        if skipping:
            # Check if a sentence-ending dot appears
            if SENTENCE_END.search(l) and not NON_TERMINAL_DOT.search(l):
                saw_terminal_dot = True
                continue

            # Already saw the ending dot, next line → exit reference block
            if saw_terminal_dot:
                skipping = False
                saw_terminal_dot = False
                # This line is the main text, keep it
                out.append(line)
            # Otherwise continue skipping
            continue

        # ---------- Main text ----------
        out.append(line)

    return "\n".join(out)


In [39]:
txt_drop_reference = remove_reference_blocks(text_drop_dup)
txt_drop_reference

'\n34. The SECRETARY-GENERAL: We are all, I\nam sure, ready and eager, after seven days of general\ndebate, to begin work here and in the committees upon\nthe seventy items of our agenda. What I have to say\ntoday will therefore be very, very brief.\n35. First, allow me to express my gratitude for the\nreferences made during the debate to the Secretariat.\n8 See Oficial Records of the General Assembly, Fourth Ses-\nsion, Plenary Meetings, 227th meeting.\n\nWe shall continue to do all we can to help make this\nsession of the General Assembly a fruitful one.\n36. Ata moment when the world is more sharply and\ndangerously divided than at any time since the United\nNations was founded, I have been greatly impressed by\nthe strong support for the principles of the Charter and\nthe faith in the United Nations approach to the problem\nof peace that have been expressed by so many speakers\nin the general debate.\n37. It must be admitted, of course, that the basic\ndifferences between the two s

In [None]:
def fix_hyphenation(text):
    # match: part of a word + - + newline + another part of the word
    # e.g., mem-\nbership -> membership
    return re.sub(r"([a-z])-\n\s*([a-z])", r"\1\2", text)

# Call immediately before or after merging full_text

In [41]:
txt_drop_reference = fix_hyphenation(txt_drop_reference)

In [42]:
txt_drop_reference

'\n34. The SECRETARY-GENERAL: We are all, I\nam sure, ready and eager, after seven days of general\ndebate, to begin work here and in the committees upon\nthe seventy items of our agenda. What I have to say\ntoday will therefore be very, very brief.\n35. First, allow me to express my gratitude for the\nreferences made during the debate to the Secretariat.\n8 See Oficial Records of the General Assembly, Fourth Session, Plenary Meetings, 227th meeting.\n\nWe shall continue to do all we can to help make this\nsession of the General Assembly a fruitful one.\n36. Ata moment when the world is more sharply and\ndangerously divided than at any time since the United\nNations was founded, I have been greatly impressed by\nthe strong support for the principles of the Charter and\nthe faith in the United Nations approach to the problem\nof peace that have been expressed by so many speakers\nin the general debate.\n37. It must be admitted, of course, that the basic\ndifferences between the two side

In [None]:
def aggressive_un_cleaner(text):
    # 1. handle typical UN meeting reference patterns (cross-line matching)
    # match blocks like "See Official Records... meeting."
    un_ref_pattern = re.compile(
        r"\d*\s*See\s+Official\s+Records.*?meeting\.?", 
        re.IGNORECASE | re.DOTALL
    )
    text = un_ref_pattern.sub("", text)

    # 2. Remove common header/footer keyword lines
    lines = text.splitlines()
    cleaned_lines = []
    
    # Common UN noise patterns
    noise_patterns = [
        r"General\s+Assembly", 
        r"Plenary\s+Meetings", 
        r"Supplement\s+No\.",
        r"Security\s+Council"
    ]
    
    for line in lines:
        # skip lines matching noise patterns or page numbers
        is_noise = any(re.search(p, line, re.I) for p in noise_patterns) and len(line) < 100
        is_page_num = re.match(r"^\s*\d+\s*$", line)
        
        if not (is_noise or is_page_num):
            cleaned_lines.append(line)
            
    return "\n".join(cleaned_lines)

In [44]:
txt_drop_reference = aggressive_un_cleaner(txt_drop_reference)

In [45]:
txt_drop_reference

'\n34. The SECRETARY-GENERAL: We are all, I\nam sure, ready and eager, after seven days of general\ndebate, to begin work here and in the committees upon\nthe seventy items of our agenda. What I have to say\ntoday will therefore be very, very brief.\n35. First, allow me to express my gratitude for the\nreferences made during the debate to the Secretariat.\n\nWe shall continue to do all we can to help make this\n36. Ata moment when the world is more sharply and\ndangerously divided than at any time since the United\nNations was founded, I have been greatly impressed by\nthe strong support for the principles of the Charter and\nthe faith in the United Nations approach to the problem\nof peace that have been expressed by so many speakers\nin the general debate.\n37. It must be admitted, of course, that the basic\ndifferences between the two sides in the world conflict\nhave not been diminished by a few days of general\nebate,\n38. At the same time I do not recall at any previous\nsession 

In [None]:
import re
from functools import lru_cache

from wordfreq import zipf_frequency
from rapidfuzz.distance import Levenshtein
from symspellpy import SymSpell, Verbosity

# --- 0) merge ：Ses-\nsion -> Session, mem.\nbership -> membership ---
# def join_hyphen_linebreak(text: str) -> str:
#     text = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", text)      # hyphen linebreak
#     text = re.sub(r"(\w)\.\s*\n\s*(\w)", r"\1\2", text)     # dot linebreak (memo.\nrandum)
#     return text

# --- 1) token extraction: match only English words (including don't)---
WORD_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")

# --- 2) SymSpell initialization (ensure dictionary path is correct)---
sym = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym.load_dictionary("data/frequency_dictionary_en_82_765.txt", term_index=0, count_index=1)

# --- 3) zipf frequency (general English corpus) + cache acceleration ---
@lru_cache(maxsize=200000)
def z(word: str) -> float:
    return zipf_frequency(word, "en")

def looks_like_proper_noun(word: str) -> bool:
    return word[0].isupper()

def is_all_caps(word: str) -> bool:
    return word.isupper() and len(word) > 1

# --- 4) judge: is this word "like an OCR error" (non-word/very rare word)---
def suspicious_nonword(word: str, low=2.0) -> bool:
    if len(word) <= 2:
        return False
    if is_all_caps(word) or looks_like_proper_noun(word):
        return False
    return z(word.lower()) < low

# --- 5) candidate generation: candidates with edit distance ≤ 2 (top k)---
def gen_candidates(word: str, max_edit=2, topk=5):
    sug = sym.lookup(word.lower(), Verbosity.TOP, max_edit_distance=max_edit)
    return [s.term for s in sug][:topk]

# --- 6) candidate selection: replace only if "significantly more common" (frequency gain >= gain) to avoid false positives ---
def pick_by_frequency(original: str, cands, gain=1.5):
    o = original.lower()
    oz = z(o)
    best = None
    bestz = oz

    for c in cands:
        if Levenshtein.distance(o, c.lower()) > 2:
            continue
        cz = z(c.lower())
        if cz > bestz:
            best, bestz = c, cz

    if best and (bestz - oz) >= gain:
        return best
    return original

# --- 7) word-level correction (Type 1)---
def correct_nonword_token(tok: str) -> str:
    cap = tok[0].isupper()

    if not suspicious_nonword(tok):
        return tok

    cands = gen_candidates(tok)
    new = pick_by_frequency(tok, cands)

    if cap:
        new = new[:1].upper() + new[1:]
    return new

# --- 8) apply to entire text ---
def correct_text_nonword(text: str) -> str:
    # text = join_hyphen_linebreak(text)

    def repl(m):
        return correct_nonword_token(m.group(0))

    return WORD_RE.sub(repl, text)


In [63]:
print("DICT LOADED =", sym.load_dictionary("data/frequency_dictionary_en_82_765.txt", term_index=0, count_index=1))
print("DICT SIZE   =", sym.word_count)


DICT LOADED = True
DICT SIZE   = 82834


In [64]:
txt_fixed = correct_text_nonword(txt_drop_reference)
print(txt_fixed)


34. The SECRETARY-GENERAL: We are all, I
am sure, ready and eager, after seven days of general
debate, to begin work here and in the committees upon
the seventy items of our agenda. What I have to say
today will therefore be very, very brief.
35. First, allow me to express my gratitude for the
references made during the debate to the Secretariat.

We shall continue to do all we can to help make this
36. Ata moment when the world is more sharply and
dangerously divided than at any time since the United
Nations was founded, I have been greatly impressed by
the strong support for the principles of the Charter and
the faith in the United Nations approach to the problem
of peace that have been expressed by so many speakers
in the general debate.
37. It must be admitted, of course, that the basic
differences between the two sides in the world conflict
have not been diminished by a few days of general
debate,
38. At the same time I do not recall at any previous
session such a demonstration a

In [61]:
PDF_SINGLE_COLUMN_DIR = SPEECHES_DIR / "pdf_single_column"

pdf_path = PDF_SINGLE_COLUMN_DIR / "A_1950_PV.289_speeches.pdf"

out_txt= CLEANED_OCR_DIR/ f"{pdf_path.stem}.txt"
txt_fixed = correct_text_nonword(txt_drop_reference)
out_txt.write_text(txt_fixed, encoding="utf-8")

3400