In [3]:
import os
import shutil
import subprocess
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import re
import re
from collections import Counter
import nltk
import pandas as pd
import spacy

from functools import lru_cache
from pathlib import Path

#  Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


# 1. Read single image - based pdf

In [4]:

BASE_DIR = Path("D:/uppsala/15. ds projects")
RAW_PDF_DIR = BASE_DIR / "PDS_UNSG_TEXT_CORPUS" / "data" / "speeches" / "pdf"
TMP_DIR = BASE_DIR / "tmp_data"
TMP_DIR.mkdir(exist_ok=True)  # auto create if not exists


# 2. Convert two columns to one column

In [None]:


def convert_to_single_column(pdf_name, input_dir=RAW_PDF_DIR, output_dir=TMP_DIR, k2_path=None, timeout=600):
    """Convert a two-column PDF to single-column and save it in the tmp_data folder"""

    # prepare input and output paths
    input_pdf = input_dir / pdf_name
    output_pdf = output_dir / pdf_name

    # find k2pdfopt executable
    exe = k2_path or shutil.which("k2pdfopt") or shutil.which("k2pdfopt.exe")
    if not exe:
        raise FileNotFoundError("k2pdfopt not found, please ensure it is installed.")

    # build command
    cmd = [
        exe, str(input_pdf),
        "-o", str(output_pdf),
        "-mode", "copy",# keep original quality
        "-col", "2", #  input = two columns
        "-dev", "pdf",#  output format =  PDF
        "-ui-", # close interactive UI mode
        "-p", "1-9999", #  deal with pages 1 to 9999
        "-x", #  removing white margins
        "-wrap-", #  no  wrapping
        "-autorotate-", #  no auto-rotation
        "-rt", "0" #  no auto-rotation
        "-verbose"


    ]

    # execute
    res = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    if res.returncode != 0:
        raise RuntimeError(f"k2pdfopt error: {res.stderr or res.stdout}")

    if not output_pdf.exists() or output_pdf.stat().st_size < 5000:
        raise RuntimeError(f"Output file error, please check: {output_pdf}")

    print(f"✅ Conversion successful: {output_pdf.name} → Size {output_pdf.stat().st_size//1024} KB")
    return output_pdf


In [None]:
convert_to_single_column("A_1983_38_PV.81_speeches.pdf",k2_path=r"D:\LeStoreDownload\k2pdfopt.exe")

# 3. Extract text using different techniques and calculate accuracy of each model

## Define the function of how to calculate text recognition accuracy
* Using spaCy’s vocab to identify whether a word is valid (including verb tenses, plural forms, and common proper nouns)
* If a word is capitalized and not at the beginning of a sentence, it is treated as a proper noun — no lemmatization and no validity checking

In [None]:
# load spacy model for text evaluation
nlp = spacy.load("en_core_web_lg", disable=["parser", "tagger"])  # only used for lemmatization and vocab, no need for parser and tagger


def evaluate_fast(text):
    """
    Efficient text quality evaluation:
    - complexity: O(n)
    - leverage caching for validity checks,avoid lookups for every word
    - output: accuracy, invalid-word dataframe
    """

    doc = nlp(text)

    words = []
    is_proper_flags = []
    append_word = words.append
    append_flag = is_proper_flags.append

    # -----------❗ Query cache (greatly speeds up) -----------
    valid_cache = {}  # word -> True/False

    for token in doc:
        if not token.is_alpha:
            continue

        txt = token.text
        lemma = token.lemma_

        # Define "proper noun": capitalized and not sentence start
        is_proper = txt[0].isupper() and not token.is_sent_start

        # Normalize word form
        if is_proper:
            final = txt.lower()
        else:
            final = lemma.lower()

        append_word(final)
        append_flag(is_proper)

    total = len(words)
    if total == 0:
        return 0.0, pd.DataFrame(columns=["invalid_word", "frequency"])

    invalid_words = []

    # -----------❗ Core optimization: O(n), dictionary lookup with caching -----------
    for w, is_proper in zip(words, is_proper_flags):

        if is_proper:
            # Proper nouns are automatically valid
            continue

        # Use cache to improve efficiency
        if w in valid_cache:
            is_valid = valid_cache[w]
        else:
            is_valid = (w in nlp.vocab)
            valid_cache[w] = is_valid

        if not is_valid:
            invalid_words.append(w)

    # ----------- Output results -----------
    invalid_freq = Counter(invalid_words)
    df_invalid = pd.DataFrame(
        invalid_freq.items(), columns=["invalid_word", "frequency"]
    ).sort_values("frequency", ascending=False)

    accuracy = (total - len(invalid_words)) / total

    return accuracy, df_invalid


## 3.1 Baseline: Extract text directly by PyMuPDF

In [10]:
def extract_text_direct_original(pdf_name,
                                 pdf_dir=Path("data/speeches/pdf")):
    """
    for text-based PDFs, directly extract using PyMuPDF without going through k2pdfopt or OCR.
    """
    pdf_path = pdf_dir / pdf_name
    doc = fitz.open(pdf_path)
    all_text = []

    for i, page in enumerate(doc, start=1):
        text = page.get_text("text")   # already machine-readable text
        all_text.append(text)
    doc.close()

    full_text = "\n".join(all_text)
    return full_text

In [22]:
direct_txt = extract_text_direct_original("A_1983_38_PV.81_speeches.pdf")
save_path = TMP_DIR / "A_1983_38_PV.81_speeches_direct.txt"

# with open(save_path, "w", encoding="utf-8") as f:
#     f.write(direct_txt)

In [23]:
accuracy, df_invalid = evaluate_fast(direct_txt)

print("Accuracy:", accuracy)
print("Invalid words frequency:")
print(df_invalid)



Accuracy: 0.9983934477870531
Invalid words frequency:
                    invalid_word  frequency
6                        thearab          2
19                     theunited          2
0                             li          1
37                      dyseized          1
28                         ajust          1
29                      thechair          1
30                 aggressionsor          1
31                        ofarab          1
32                        peapie          1
33                   whichisrael          1
34                itsimpotencein          1
35                      thegolan          1
36                tochallengeits          1
38                          wiii          1
26                     thosearab          1
39                      ofturkey          1
40                        theplo          1
41     attheconferenceemphasized          1
42                 ofpalestinian          1
43                chosepalestine          1
44                thes

# 