 code by Huan Li

In [1]:
import shutil
import subprocess
import time
from pathlib import Path

import pandas as pd
import pytesseract
# Tesseract executable path.
# Keep using the standard Windows install location when it is present; on a
# machine where it is absent, fall back to a `tesseract` found on PATH so the
# notebook is not tied to a single workstation. If neither is found the
# original path is kept unchanged.
_TESSERACT_DEFAULT = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
pytesseract.pytesseract.tesseract_cmd = (
    _TESSERACT_DEFAULT
    if Path(_TESSERACT_DEFAULT).exists()
    else (shutil.which("tesseract") or _TESSERACT_DEFAULT)
)

# Read single image-based PDF

In [None]:

# Directory layout, anchored at the current working directory (project root).
BASE_DIR = Path.cwd()
SPEECHES_DIR = BASE_DIR.joinpath("data", "speeches")

# Input: the original PDFs.
PDF_DIR = SPEECHES_DIR.joinpath("pdf")

# Output: the single-column versions written by the conversion step.
PDF_SINGLE_COLUMN_DIR = SPEECHES_DIR.joinpath("pdf_single_column")

In [None]:
# Folder holding the tabular artefacts, and the metadata CSV inside it.
DATAFRAMES_DIR = BASE_DIR.joinpath("data", "dataframes")
META_PATH = DATAFRAMES_DIR.joinpath("metadata_s03.csv")

# Read the metadata table into memory.
df = pd.read_csv(filepath_or_buffer=META_PATH)

In [None]:
# Keep the rows where text_based is False and two_column_layout is True.
is_not_text_based = df["text_based"].eq(False)
is_two_column = df["two_column_layout"].eq(True)
trial_df = df.loc[is_not_text_based & is_two_column]

Convert two columns into one column

In [None]:
def _discard_partial_output(path):
    """Best-effort removal of an incomplete output file; OS errors are ignored."""
    try:
        path.unlink()
    except OSError:
        # Already gone, or locked by another process: nothing more to do here.
        pass


def convert_to_single_column(
    pdf_name: str,
    input_dir=PDF_DIR,
    output_dir=PDF_SINGLE_COLUMN_DIR,
    k2_path=None,
    timeout=600,
    debug=False,
    min_output_bytes=5000,
):
    """
    Convert a two-column PDF to single-column using k2pdfopt.

    An existing output file is reused (conversion skipped) only when it passes
    the same minimum-size check that a fresh conversion must pass. A smaller
    leftover file is removed and the conversion is run again, so a previously
    failed run is never silently reported as done.

    Parameters
    ----------
    pdf_name : str
        File name of the PDF; used both under `input_dir` and `output_dir`.
    input_dir : str or pathlib.Path
        Directory containing the source PDF.
    output_dir : str or pathlib.Path
        Directory receiving the converted PDF. Created if it does not exist.
    k2_path : str or None
        Explicit path to the k2pdfopt executable. When falsy, the executable
        is looked up on PATH.
    timeout : int or float
        Seconds allowed for the k2pdfopt process before it is aborted.
    debug : bool
        When True, print the command line and the captured stdout/stderr.
    min_output_bytes : int
        Outputs smaller than this many bytes are treated as failed.

    Returns
    -------
    pathlib.Path
        Path of the (existing or newly written) single-column PDF.

    Raises
    ------
    FileNotFoundError
        If the input PDF or the k2pdfopt executable cannot be found.
    subprocess.TimeoutExpired
        If k2pdfopt does not finish within `timeout` seconds.
    RuntimeError
        If k2pdfopt exits with a non-zero status, or the output file is
        missing or smaller than `min_output_bytes`.
    """
    # Accept plain strings as well as Path objects.
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)

    input_pdf = input_dir / pdf_name
    if not input_pdf.exists():
        raise FileNotFoundError(f"Input PDF not found: {input_pdf}")

    output_pdf = output_dir / pdf_name

    if output_pdf.exists():
        if output_pdf.stat().st_size >= min_output_bytes:
            print(f"⏭️  Skipped: {pdf_name} already exists.")
            return output_pdf
        # A too-small file is the residue of an earlier failed run: redo it.
        print(f"Re-converting: existing {pdf_name} is smaller than {min_output_bytes} bytes.")
        _discard_partial_output(output_pdf)

    exe = k2_path or shutil.which("k2pdfopt") or shutil.which("k2pdfopt.exe")
    if not exe:
        raise FileNotFoundError("k2pdfopt not found, please ensure it is installed.")

    # k2pdfopt is not relied upon to create the output directory itself.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Option order is preserved from the original command line.
    cmd = [
        exe, str(input_pdf),
        "-o", str(output_pdf),
        "-mode",  "copy",
        "-col", "2",
        "-ui-",
        "-p", "3-9999",  # pages 3 onward; presumably skips front matter, TODO confirm
        "-x",
        "-wrap-",
        "-autorotate-",
        "-rt", "0",
        "-verbose"
    ]

    try:
        # errors="replace": tool output that is not valid in the locale encoding
        # must not abort the conversion with a UnicodeDecodeError.
        res = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=timeout,
        )
    except BaseException:
        # Timeout or interrupt: a half-written file must not satisfy the
        # "already exists" check on the next run.
        _discard_partial_output(output_pdf)
        raise

    if debug:
        print("\n========== k2pdfopt DEBUG ==========")
        print(f"PDF: {pdf_name}")
        print("CMD:")
        print(" ".join(cmd))
        print("\n----- STDOUT -----")
        print(res.stdout)
        print("\n----- STDERR -----")
        print(res.stderr)
        print("===================================\n")

    if res.returncode != 0:
        _discard_partial_output(output_pdf)
        raise RuntimeError(f"k2pdfopt error: {res.stderr or res.stdout}")

    # A too-small file is left in place for inspection; the size check in the
    # skip path above guarantees it is re-converted on the next call.
    if not output_pdf.exists() or output_pdf.stat().st_size < min_output_bytes:
        raise RuntimeError(f"Output file error, please check: {output_pdf}")

    print(f"✅ Conversion successful: {output_pdf.name} → {output_pdf.stat().st_size // 1024} KB")
    return output_pdf


Run the code on the whole corpus

In [15]:
# Machine-specific location of the k2pdfopt binary. When it is absent on this
# machine, pass None so convert_to_single_column() looks k2pdfopt up on PATH
# instead of failing for every file.
_K2_LOCAL = r"D:\LeStoreDownload\k2pdfopt.exe"
K2_PATH = _K2_LOCAL if Path(_K2_LOCAL).exists() else None

ok, fail = [], []

# perf_counter is monotonic, so the elapsed time is immune to clock changes.
t0 = time.perf_counter()
# Only the file name is needed, so iterate the column rather than whole rows.
for pdf_name in trial_df["file_name_pdf"]:
    try:
        convert_to_single_column(
            pdf_name,
            input_dir=PDF_DIR,
            output_dir=PDF_SINGLE_COLUMN_DIR,
            k2_path=K2_PATH,
            debug=False,
        )
    except Exception as e:
        # Best-effort batch: record the failure and carry on with the next file.
        fail.append((pdf_name, str(e)))
        print(f"❌ FAIL: {pdf_name}\n   {e}\n")
    else:
        ok.append(pdf_name)

print("====== TRIAL SUMMARY ======")
print("OK:", len(ok), ok)
print("FAIL:", len(fail))
for failed_name, message in fail:
    # Truncate long tool output so the summary stays readable.
    print(" -", failed_name, "=>", message[:160])

total_sec = time.perf_counter() - t0
print(f"\n⏱️ Total runtime: {total_sec/60:.1f} minutes")

⏭️  Skipped: A_1950_PV.289_speeches.pdf already exists.
⏭️  Skipped: A_1951_PV.348_speeches.pdf already exists.
⏭️  Skipped: A_1957_PV.690_speeches.pdf already exists.
⏭️  Skipped: A_1960_PV.871_speeches.pdf already exists.
⏭️  Skipped: A_1960_PV.883_speeches.pdf already exists.
⏭️  Skipped: A_1960_PV.904_speeches.pdf already exists.
⏭️  Skipped: A_1960_PV.906_speeches.pdf already exists.
⏭️  Skipped: A_1979_34_PV.17_speeches.pdf already exists.
⏭️  Skipped: A_1983_38_PV.81_speeches.pdf already exists.
⏭️  Skipped: A_1983_38_PV.90_speeches.pdf already exists.
⏭️  Skipped: A_1984_39_PV.44_speeches.pdf already exists.
⏭️  Skipped: A_1984_39_PV.83_speeches.pdf already exists.
⏭️  Skipped: A_1984_39_PV.96_speeches.pdf already exists.
⏭️  Skipped: A_1993_48_PV.71_speeches.pdf already exists.
⏭️  Skipped: A_1993_48_PV.74_speeches.pdf already exists.
⏭️  Skipped: A_1994_48_PV.95_speeches.pdf already exists.
⏭️  Skipped: S_1964_PV.1097_speeches.pdf already exists.
⏭️  Skipped: S_1964_PV.1102_s

In [16]:
# Show the (pdf_name, error message) pairs collected by the batch run; an empty list means no failures.
fail

[]

# Evaluate the OCR output via lexical accuracy

In [None]:
# from collections import Counter
# import pandas as pd
# import spacy
# from wordfreq import zipf_frequency


# nlp = spacy.load("en_core_web_lg", disable=["parser", "tagger"])  # only used for lemmatization and vocab, no need for parser and tagger

# def evaluate_precise(text, nlp):
#     """
#     Lightweight OCR lexical quality evaluation.

#     Logic:
#     - Tokenize text using spaCy
#     - Ignore non-alphabetic tokens
#     - Treat proper nouns as valid
#     - For other words, check lemma frequency using wordfreq (Zipf scale)
#     - Words with extremely low frequency are considered likely OCR errors

#     Parameters
#     ----------
#     text : str
#         OCR-extracted text
#     nlp : spacy.Language
#         Loaded spaCy English model

#     Returns
#     -------
#     accuracy : float
#         Ratio of lexically valid words
#     df_invalid : pd.DataFrame
#         Frequency table of likely incorrect words
#     """

#     doc = nlp(text)

#     total = 0
#     invalid_words = []

#     # Cache to avoid repeated frequency lookup
#     valid_cache = {}  # word -> True / False

#     for token in doc:

#         # Skip punctuation, numbers, symbols
#         if not token.is_alpha:
#             continue

#         total += 1

#         txt = token.text
#         lemma = token.lemma_

#         # ---------- Proper noun rule ----------
#         # Capitalized words inside sentences are assumed to be valid
#         is_proper = txt[0].isupper() and not token.is_sent_start
#         if is_proper:
#             continue

#         # ---------- Frequency check ----------
#         w = lemma.lower()

#         if w in valid_cache:
#             ok = valid_cache[w]
#         else:
#             ok = zipf_frequency(w, "en") > 1.5  # empirical threshold
#             valid_cache[w] = ok

#         if not ok:
#             invalid_words.append(w)

#     # ---------- Aggregate invalid words ----------
#     invalid_freq = Counter(invalid_words)

#     df_invalid = (
#         pd.DataFrame(invalid_freq.items(),
#                      columns=["invalid_word", "frequency"])
#         .sort_values("frequency", ascending=False)
#     )

#     accuracy = (total - len(invalid_words)) / total if total > 0 else 0.0

#     return accuracy, df_invalid


In [None]:
# accuracy, df_invalid = evaluate_precise(text, nlp)
# print(f"OCR Lexical Accuracy: {accuracy*100:.2f}%")
# df_invalid



OCR Lexical Accuracy: 99.04%


Unnamed: 0,invalid_word,frequency
29,nesia,14
30,ations,4
71,pendence,3
70,donesia,3
24,ebate,2
...,...,...
32,wislies,1
31,tative,1
28,bership,1
27,consicer,1
