# Language Model Integration for OCR

This notebook explores the impact of statistical language models on handwritten text recognition accuracy using CTC decoding.


## Motivation
CTC-based OCR models often produce character-level predictions without linguistic context. Language models help correct improbable character sequences by incorporating word-level probabilities.


## Dataset Preparation for Language Modeling


In [None]:
from datasets import load_dataset
import re

print("üì• Loading Teklia IAM-line dataset...")
dataset = load_dataset("teklia/IAM-line")

texts = []
for split in dataset.keys():
    for sample in dataset[split]:
        text = sample["text"].strip()
        if len(text) > 3:
            # ‚úÖ Fixed regex: move '-' to the end of the character set
            text = re.sub(r"[^a-zA-Z0-9.,!?;:'\"()\- ]+", " ", text)
            text = re.sub(r"\s+", " ", text).lower()
            texts.append(text)

print(f"‚úÖ Extracted {len(texts)} lines from IAM-line dataset.")

# Save to corpus file
with open(corpus_path, "w", encoding="utf-8") as f:
    f.write("\n".join(texts))

print(f"‚úÖ Corpus saved to: {corpus_path}")
!head -n 10 {corpus_path}

## 3-Gram Language Model


In [None]:
# Train an ARPA language model with KenLM's lmplz, then convert it to KenLM's
# binary format.
# NOTE(review): `corpus_path`, `lm_arpa_path` and `lm_binary_path` are not
# assigned by any cell that precedes this one in the notebook as shown, and the
# relative `kenlm/build/bin` tools are only built by a later cell — confirm the
# intended execution order.
# NOTE(review): the section heading says "3-Gram", but `-o 5` requests order 5.
!kenlm/build/bin/lmplz --discount_fallback -o 5 < {corpus_path} > {lm_arpa_path}
!kenlm/build/bin/build_binary {lm_arpa_path} {lm_binary_path}

print(f"\n‚úÖ KenLM models created successfully:")
# List both artifacts with human-readable sizes.
!ls -lh {lm_arpa_path} {lm_binary_path}


In [None]:
from datasets import load_dataset
import re

print("üì• Loading Teklia IAM-line dataset...")
dataset = load_dataset("teklia/IAM-line")

texts = []
for split in dataset.keys():
    for sample in dataset[split]:
        text = sample["text"].strip()
        if len(text) > 3:
            # ‚úÖ Fixed regex: move '-' to the end of the character set
            text = re.sub(r"[^a-zA-Z0-9.,!?;:'\"()\- ]+", " ", text)
            text = re.sub(r"\s+", " ", text).lower()
            texts.append(text)

print(f"‚úÖ Extracted {len(texts)} lines from IAM-line dataset.")

# Save to corpus file
with open(corpus_path, "w", encoding="utf-8") as f:
    f.write("\n".join(texts))

print(f"‚úÖ Corpus saved to: {corpus_path}")
!head -n 10 {corpus_path}

In [None]:
# ===========================================
# 1. Install dependencies
# ===========================================
# System build tools needed to compile KenLM, then the Python packages used below.
!apt-get -qq install build-essential cmake
!pip install datasets joblib

# ===========================================
# 2. Clone & build KenLM
# ===========================================
# Remove any previous checkout first so `git clone` does not fail on re-runs.
!rm -rf /content/kenlm
!git clone https://github.com/kpu/kenlm.git /content/kenlm
!mkdir -p /content/kenlm/build
# %cd changes the kernel's working directory, so cmake/make below run inside build/.
%cd /content/kenlm/build
!cmake /content/kenlm
!make -j4
# List the built executables; lmplz and build_binary from this directory are used further down.
!ls -l /content/kenlm/build/bin/

# ===========================================
# üß† 3Ô∏è‚É£  Create training corpus from Teklia IAM dataset
# ===========================================
%cd /content
from datasets import load_dataset
import re

print("üì• Loading Teklia IAM dataset ...")
ds = load_dataset("teklia/iam-line", split="train")

texts = []
for ex in ds:
    t = ex["text"]
    if len(t) > 3:
        t = re.sub(r"[^A-Za-z0-9.,!?;:'\"()\- ]+", " ", t)
        t = re.sub(r"\s+", " ", t).strip().lower()
        texts.append(t)

corpus_path = "/content/iam_corpus.txt"
with open(corpus_path, "w") as f:
    f.write("\n".join(texts))
print(f"‚úÖ Corpus saved: {corpus_path} ({len(texts)} lines)")

# ===========================================
# 4. Train a 3-gram KenLM language model
# ===========================================
# lmplz reads the corpus on stdin and writes an ARPA file; `-o 3` sets the n-gram order.
!cd /content && /content/kenlm/build/bin/lmplz -o 3 < /content/iam_corpus.txt > /content/iam_lm.arpa
# Convert the ARPA text model into KenLM's binary format.
!cd /content && /content/kenlm/build/bin/build_binary /content/iam_lm.arpa /content/iam_lm.binary
print("‚úÖ KenLM binary built: /content/iam_lm.binary")

# ===========================================
# 5. Mount Drive & save LM model there
# ===========================================
# NOTE(review): google.colab is presumably only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Copy both the ARPA text model and its binary form into a Drive folder.
!mkdir -p /content/drive/MyDrive/kenlm_iam_lm
!cp /content/iam_lm.arpa /content/iam_lm.binary /content/drive/MyDrive/kenlm_iam_lm/
print("‚úÖ Saved KenLM model to: /content/drive/MyDrive/kenlm_iam_lm")


In [None]:
# KenLM-backed CTC decoder with the library's default weights.
# NOTE(review): `build_ctcdecoder`, `vocab_list` and `KENLM_BINARY` are not
# defined by any earlier cell shown in this notebook; they come from the cell
# that follows, so this cell only works after that one has run.
decoder = build_ctcdecoder(vocab_list, kenlm_model_path=KENLM_BINARY)
print("‚úÖ KenLM decoder built successfully!")


In [None]:
# ============================================
# 🧠 1️⃣ Import dependencies
# ============================================
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
import joblib
import cv2
from PIL import Image
import gradio as gr
import tempfile
from jiwer import wer, cer
from pyctcdecode import build_ctcdecoder
import pyctcdecode.decoder as decoder_module
import pyctcdecode.language_model as language_model_module
import kenlm

# Expose the imported kenlm module as an attribute of both pyctcdecode
# modules (workaround the notebook notes as needed on Colab).
setattr(decoder_module, "kenlm", kenlm)
setattr(language_model_module, "kenlm", kenlm)
print("‚úÖ KenLM successfully linked to pyctcdecode")

# ============================================
# 2. Paths - update if needed
# ============================================
KENLM_BINARY = "/content/drive/MyDrive/kenlm_iam_lm/iam_lm.binary"
MODEL_DIR = "/content/drive/MyDrive/htr_final_model_20251110_110833"

# ============================================
# 3. Load HTR model + vocabulary
# ============================================
print("üîÑ Loading HTR model and vocab...")
_model_file = os.path.join(MODEL_DIR, "htr_model.keras")
_vocab_file = os.path.join(MODEL_DIR, "vocab_list.pkl")
# compile=False: the model is only used for inference here.
model = keras.models.load_model(_model_file, compile=False)
# joblib.load unpickles the file, so only point it at vocabulary files you trust.
vocab_list = joblib.load(_vocab_file)
print(f"‚úÖ Model and vocabulary loaded successfully! (Vocab size: {len(vocab_list)})")

# ============================================
# 4. Build tuned KenLM decoder
# ============================================
print("üîÑ Building KenLM decoder (Œ±=0.5, Œ≤=1.5, beam_width=100)...")
# alpha / beta are the two tuned decoder parameters.
# NOTE(review): these values look identical to pyctcdecode's defaults — confirm.
_lm_weights = {"alpha": 0.5, "beta": 1.5}
decoder = build_ctcdecoder(vocab_list, kenlm_model_path=KENLM_BINARY, **_lm_weights)
print("‚úÖ KenLM decoder initialized with tuned parameters!")

# ============================================
# 5. Line segmentation (projection method)
# ============================================
def _find_text_row_spans(hist, threshold, min_height=10):
    """Return (start, end) row spans where ``hist`` stays above ``threshold``.

    Spans shorter than ``min_height`` rows are discarded. ``end`` is exclusive.
    """
    spans, start = [], None
    for y, val in enumerate(hist):
        if val > threshold:
            if start is None:
                start = y
        elif start is not None:
            if y - start >= min_height:
                spans.append((start, y))
            start = None
    # A run that reaches the last row never meets a closing row; close it here
    # so a text line touching the bottom edge is not lost.
    if start is not None and len(hist) - start >= min_height:
        spans.append((start, len(hist)))
    return spans


def segment_lines_projection(image_path):
    """Split a page image into text-line crops using a horizontal projection profile.

    The image is converted to grayscale, blurred, thresholded (Otsu, inverted)
    and morphologically closed with a 2x50 kernel; the result is summed per
    row. Runs of rows whose sum exceeds 10% of the largest row sum, and that
    are at least 10 rows tall, are treated as text lines.

    Args:
        image_path: Path of an image file readable by ``cv2.imread``.

    Returns:
        List of PIL RGB images, one per detected line from top to bottom,
        each padded by up to 10 rows above and below (clamped to the image).

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None.
        raise ValueError(f"Could not read image: {image_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    _, binary = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # Wide, flat kernel: closes gaps along a row far more than across rows.
    kernel = np.ones((2, 50), np.uint8)
    closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    hist = np.sum(closed, axis=1)
    threshold = np.max(hist) * 0.1

    lines = _find_text_row_spans(hist, threshold, min_height=10)

    line_imgs = []
    for y1, y2 in lines:
        y1 = max(y1 - 10, 0)
        y2 = min(y2 + 10, img.shape[0])
        cropped = img[y1:y2, :]
        # OpenCV arrays are BGR; PIL expects RGB.
        pil_img = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
        line_imgs.append(pil_img)
    return line_imgs

# ============================================
# 6. Preprocess for HTR model
# ============================================
IMG_HEIGHT, IMG_WIDTH = 64, 800

def preprocess_image(img):
    """Convert a PIL image into a single-item float32 batch for the HTR model.

    The image is converted to grayscale, resized to IMG_WIDTH x IMG_HEIGHT,
    scaled by 1/255, and given a leading batch axis and a trailing channel axis.
    """
    resized = img.convert("L").resize((IMG_WIDTH, IMG_HEIGHT))
    pixels = np.array(resized, dtype=np.float32) / 255.0
    return np.expand_dims(pixels, axis=(0, -1))

# ============================================
# 7. Decoding functions
# ============================================
def decode_greedy(pred, vocab=None):
    """Best-path (greedy) CTC decoding.

    Takes the arg-max class of every time step, merges consecutive repeats of
    the same class, then drops every index outside the vocabulary (treated as
    the CTC blank). Without the merge step a character emitted over several
    frames would be repeated in the output.

    Args:
        pred: Model output for one sample; singleton axes are squeezed away,
            leaving (time, classes). A single time step is also accepted.
        vocab: Optional label list; defaults to the module-level ``vocab_list``.

    Returns:
        The decoded string.
    """
    labels = vocab_list if vocab is None else vocab
    # atleast_1d keeps a one-frame input iterable after squeeze + argmax.
    best_path = np.atleast_1d(np.argmax(np.squeeze(pred), axis=-1))
    chars, previous = [], None
    for idx in best_path:
        idx = int(idx)
        # Emit only on a class change; blanks reset `previous`, so a genuine
        # double letter separated by a blank is still kept.
        if idx != previous and idx < len(labels):
            chars.append(labels[idx])
        previous = idx
    text = ''.join(chars)
    return text

def decode_with_kenlm(pred):
    """Decode ``pred`` with the module-level KenLM ``decoder`` (beam width 100)."""
    squeezed = np.squeeze(pred)
    text = decoder.decode(squeezed, beam_width=100)  # tuned beam width
    return text

# ============================================
# 8. Gradio prediction function
# ============================================
def recognize_text(uploaded_image, ground_truth):
    """Segment an uploaded page into lines and transcribe each line twice.

    Args:
        uploaded_image: PIL image from the Gradio upload widget, or None.
        ground_truth: Optional reference transcription; may be empty or None.

    Returns:
        A 4-tuple ``(gallery_items, greedy_text, lm_text, message)`` where
        ``gallery_items`` is a list of ``(line_image, caption)`` pairs, the two
        texts are the per-line predictions joined by newlines, and ``message``
        is either the WER/CER comparison or a status note.
    """
    if uploaded_image is None:
        return [], "", "", "‚ö†Ô∏è Please upload a handwriting image."

    # Reserve a temp path, then close the handle before writing to it.
    # delete=False keeps the path valid; the file is removed in `finally`.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        image_path = tmp.name
    try:
        uploaded_image.save(image_path)
        line_images = segment_lines_projection(image_path)
    finally:
        # The crops are held in memory, so the file is no longer needed.
        if os.path.exists(image_path):
            os.remove(image_path)

    if not line_images:
        return [], "", "", "‚ùå No text lines detected."

    results = []
    greedy_lines, lm_lines = [], []

    for line_img in line_images:
        img_input = preprocess_image(line_img)
        preds = model.predict(img_input, verbose=0)

        greedy_text = decode_greedy(preds)
        lm_text = decode_with_kenlm(preds)

        results.append((line_img, f"üßæ Greedy: {greedy_text}\nüìñ LM: {lm_text}"))
        greedy_lines.append(greedy_text)
        lm_lines.append(lm_text)

    greedy_final = "\n".join(greedy_lines)
    lm_final = "\n".join(lm_lines)

    # Treat a missing reference the same as an empty one.
    if ground_truth and ground_truth.strip():
        return results, greedy_final, lm_final, (
            f"WER (Greedy): {wer(ground_truth, greedy_final):.2%} | CER: {cer(ground_truth, greedy_final):.2%}\n"
            f"WER (KenLM): {wer(ground_truth, lm_final):.2%} | CER: {cer(ground_truth, lm_final):.2%}"
        )
    else:
        return results, greedy_final, lm_final, "‚ÑπÔ∏è No ground truth provided."

# ============================================
# 9. Gradio interface
# ============================================
# Components are declared inside the Blocks context in the order written here.
with gr.Blocks(title="HTR + Tuned KenLM Handwriting Recognition") as interface:
    gr.Markdown("## üìù Handwritten Text Recognition (Greedy vs Tuned KenLM)")
    gr.Markdown("Upload a handwritten image ‚Äî the app will segment text lines and show both raw CTC and tuned LM predictions.")

    # Inputs: the uploaded image (requested as a PIL image via type="pil")
    # and an optional reference transcription.
    upload = gr.Image(type="pil", label="üì§ Upload Handwritten Image")
    ground_truth_input = gr.Textbox(label="‚úÖ Ground Truth (optional for accuracy)", lines=4)
    recognize_btn = gr.Button("üß† Recognize Text")

    # Outputs: per-line gallery, the two joined transcriptions, and the metrics text.
    gallery = gr.Gallery(label="üì∏ Line Predictions", columns=1, preview=True)
    greedy_output = gr.Textbox(label="üîπ Greedy Decode (No LM)")
    lm_output = gr.Textbox(label="üîπ Tuned KenLM Decode (Œ±=0.5, Œ≤=1.5, beam=100)")
    accuracy_output = gr.Textbox(label="üìä Accuracy Comparison (WER / CER)")

    # The four outputs line up, in order, with the 4-tuple recognize_text returns.
    recognize_btn.click(
        fn=recognize_text,
        inputs=[upload, ground_truth_input],
        outputs=[gallery, greedy_output, lm_output, accuracy_output]
    )

# share=True requests a shareable link from Gradio.
# NOTE(review): confirm that exposing the app this way is intended.
interface.launch(share=True)


## 3-Gram Evaluation


In [None]:
# ============================================
# 1. Install dependencies
# ============================================
# Build tools and Boost for compiling the kenlm Python package from source.
!apt-get install -y cmake build-essential libboost-all-dev
# kenlm is installed from the upstream GitHub archive; versions are not pinned.
!pip install pyctcdecode https://github.com/kpu/kenlm/archive/master.zip jiwer datasets joblib matplotlib tqdm

# ============================================
# 🧠 2️⃣ Import libraries
# ============================================
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
import joblib
from pyctcdecode import build_ctcdecoder
import pyctcdecode.decoder as decoder_module
import pyctcdecode.language_model as language_model_module
import kenlm
from jiwer import wer, cer
import matplotlib.pyplot as plt
from tqdm import tqdm
from datasets import load_dataset
from PIL import Image

# Make the imported kenlm module available as an attribute of both
# pyctcdecode modules.
for _pyctc_module in (decoder_module, language_model_module):
    _pyctc_module.kenlm = kenlm
print("‚úÖ KenLM successfully linked to pyctcdecode")

# ============================================
# 3. Paths (same as Gradio setup)
# ============================================
KENLM_BINARY = "/content/drive/MyDrive/kenlm_iam_lm/iam_lm.binary"
MODEL_DIR = "/content/drive/MyDrive/htr_final_model_20251110_110833"

# ============================================
# 4. Load model + vocab
# ============================================
print("üîÑ Loading model and vocab...")
_model_file = os.path.join(MODEL_DIR, "htr_model.keras")
_vocab_file = os.path.join(MODEL_DIR, "vocab_list.pkl")
model = keras.models.load_model(_model_file, compile=False)
# joblib.load unpickles the file, so only point it at vocabulary files you trust.
vocab_list = joblib.load(_vocab_file)
print(f"‚úÖ Model loaded successfully! Vocab size: {len(vocab_list)}")

# ============================================
# 5. Build KenLM decoder
# ============================================
print("üîÑ Building KenLM decoder...")
decoder = build_ctcdecoder(vocab_list, kenlm_model_path=KENLM_BINARY)
print("‚úÖ KenLM decoder ready!")

# ============================================
# 6. Load IAM Line dataset
# ============================================
print("üìÇ Loading IAM Line dataset (test split)...")
_eval_split = "test"
dataset = load_dataset("Teklia/IAM-line", split=_eval_split)
print(f"‚úÖ Loaded {len(dataset)} samples.")

# ============================================
# 7. Preprocessing
# ============================================
IMG_HEIGHT, IMG_WIDTH = 64, 800

def preprocess_image(example):
    """Build the model input for one dataset example.

    ``example["image"]`` is used directly as a PIL image: it is converted to
    grayscale, resized to IMG_WIDTH x IMG_HEIGHT and scaled by 1/255.

    Returns:
        ``(batch, text)`` where ``batch`` has a leading batch axis and a
        trailing channel axis, and ``text`` is ``example["text"]``.
    """
    line_image = example["image"].convert("L").resize((IMG_WIDTH, IMG_HEIGHT))
    pixels = np.array(line_image, dtype=np.float32) / 255.0
    batch = pixels[np.newaxis, ..., np.newaxis]
    return batch, example["text"]

# ============================================
# 8. Evaluate both Greedy + KenLM decoding
# ============================================
def decode_greedy(pred, vocab=None):
    """Best-path (greedy) CTC decoding.

    Takes the arg-max class of every time step, merges consecutive repeats of
    the same class, then drops every index outside the vocabulary (treated as
    the CTC blank). Without the merge step a character emitted over several
    frames would be repeated in the output, inflating the greedy error rates.

    Args:
        pred: Model output for one sample; singleton axes are squeezed away,
            leaving (time, classes). A single time step is also accepted.
        vocab: Optional label list; defaults to the module-level ``vocab_list``.

    Returns:
        The decoded string.
    """
    labels = vocab_list if vocab is None else vocab
    # atleast_1d keeps a one-frame input iterable after squeeze + argmax.
    best_path = np.atleast_1d(np.argmax(np.squeeze(pred), axis=-1))
    chars, previous = [], None
    for idx in best_path:
        idx = int(idx)
        # Emit only on a class change; blanks reset `previous`, so a genuine
        # double letter separated by a blank is still kept.
        if idx != previous and idx < len(labels):
            chars.append(labels[idx])
        previous = idx
    return ''.join(chars)

def decode_with_kenlm(pred):
    """Decode ``pred`` with the module-level KenLM ``decoder`` using its default settings."""
    squeezed = np.squeeze(pred)
    text = decoder.decode(squeezed)
    return text

# Parallel lists: references, greedy predictions and LM predictions share an index.
gts = []
greedy_preds = []
lm_preds = []

# One forward pass per test line; both decoders read the same model output.
for sample in tqdm(dataset, desc="Evaluating"):
    batch, reference = preprocess_image(sample)
    probs = model.predict(batch, verbose=0)
    gts.append(reference)
    greedy_preds.append(decode_greedy(probs))
    lm_preds.append(decode_with_kenlm(probs))

# ============================================
# 9. Compute metrics
# ============================================
# Macro average: every line's error rate counts equally, whatever its length.
greedy_wer = np.mean([wer(gt, p) for gt, p in zip(gts, greedy_preds)])
greedy_cer = np.mean([cer(gt, p) for gt, p in zip(gts, greedy_preds)])
lm_wer = np.mean([wer(gt, p) for gt, p in zip(gts, lm_preds)])
lm_cer = np.mean([cer(gt, p) for gt, p in zip(gts, lm_preds)])

# Corpus-level rates: the whole reference/prediction lists are scored in one
# call, the same `wer(gts, preds)` form the 5-gram evaluation uses. Reporting
# both keeps the 3-gram and 5-gram summaries comparable.
greedy_wer_corpus = wer(gts, greedy_preds)
greedy_cer_corpus = cer(gts, greedy_preds)
lm_wer_corpus = wer(gts, lm_preds)
lm_cer_corpus = cer(gts, lm_preds)

print("\nüìä Evaluation Summary:")
print(f"üßÆ Greedy Decode ‚Üí WER: {greedy_wer:.4f}, CER: {greedy_cer:.4f}")
print(f"üìñ KenLM Decode  ‚Üí WER: {lm_wer:.4f}, CER: {lm_cer:.4f}")
print(f"Corpus-level Greedy -> WER: {greedy_wer_corpus:.4f}, CER: {greedy_cer_corpus:.4f}")
print(f"Corpus-level KenLM  -> WER: {lm_wer_corpus:.4f}, CER: {lm_cer_corpus:.4f}")

# ============================================
# 10. Show sample predictions
# ============================================
# First five test lines, each titled with the reference and both predictions.
for sample_idx in range(5):
    caption = (
        f"GT: {gts[sample_idx]}\n"
        f"Greedy: {greedy_preds[sample_idx]}\n"
        f"LM: {lm_preds[sample_idx]}"
    )
    line_image = dataset[sample_idx]["image"]
    plt.imshow(line_image, cmap="gray")
    plt.axis("off")
    plt.title(caption)
    plt.show()


In [None]:
import pandas as pd
import os
from jiwer import wer, cer

# Requires `gts`, `greedy_preds` and `lm_preds` from the evaluation cell.

def _per_line_scores(metric, predictions):
    """Score each prediction against the reference at the same index in ``gts``."""
    return [metric(reference, hypothesis) for reference, hypothesis in zip(gts, predictions)]

# One row per evaluated line; column order is fixed by the dict below.
_columns = {
    "GroundTruth": gts,
    "GreedyPrediction": greedy_preds,
    "KenLMPrediction": lm_preds,
    "Greedy_WER": _per_line_scores(wer, greedy_preds),
    "Greedy_CER": _per_line_scores(cer, greedy_preds),
    "LM_WER": _per_line_scores(wer, lm_preds),
    "LM_CER": _per_line_scores(cer, lm_preds),
}
results_df = pd.DataFrame(_columns)

# Written next to the model files on Google Drive.
save_path = "/content/drive/MyDrive/htr_final_model_20251110_110833/evaluation_results.csv"
results_df.to_csv(save_path, index=False)

print(f"‚úÖ Evaluation results saved successfully to:\n{save_path}")
print(f"üìÑ Total lines saved: {len(results_df)}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Per-line results written by the evaluation cell.
df = pd.read_csv("/content/drive/MyDrive/htr_final_model_20251110_110833/evaluation_results.csv")

# Overlaid per-line WER histograms for the two decoders.
_fig_hist, _ax_hist = plt.subplots(figsize=(8, 4))
sns.histplot(df["Greedy_WER"], color="red", label="Greedy", kde=True, ax=_ax_hist)
sns.histplot(df["LM_WER"], color="blue", label="KenLM", kde=True, ax=_ax_hist)
_ax_hist.set_title("WER Distribution: Greedy vs KenLM")
_ax_hist.set_xlabel("WER per line")
_ax_hist.set_ylabel("Count")
_ax_hist.legend()
plt.show()

# Mean of the per-line rates, one bar per (decoder, metric) pair.
_mean_rates = {
    "Greedy WER": df["Greedy_WER"].mean(),
    "KenLM WER": df["LM_WER"].mean(),
    "Greedy CER": df["Greedy_CER"].mean(),
    "KenLM CER": df["LM_CER"].mean(),
}
_fig_bar, _ax_bar = plt.subplots(figsize=(6, 4))
bars = _ax_bar.bar(
    list(_mean_rates),
    list(_mean_rates.values()),
    color=["salmon", "skyblue", "salmon", "skyblue"],
)
_ax_bar.set_title("Average WER & CER Comparison")
plt.show()


## 5-Gram Language Model


In [None]:
# ============================================
# STEP 0: Install dependencies
# ============================================
# Build tools and Boost for compiling KenLM.
!apt install -y build-essential libboost-all-dev cmake
# kenlm Python package from the upstream GitHub archive (unpinned).
!pip install https://github.com/kpu/kenlm/archive/master.zip
!pip install pyctcdecode jiwer datasets joblib matplotlib tqdm

# ============================================
# STEP 1: Load IAM Line training data
# ============================================
from datasets import load_dataset
import os

print("üîÑ Loading IAM Line dataset...")
dataset = load_dataset("teklia/iam-line")

# Keep every non-empty training transcription, stripped but otherwise unmodified.
train_texts = []
for row in dataset["train"]:
    stripped = row["text"].strip()
    if stripped:
        train_texts.append(stripped)
print(f"‚úÖ Loaded {len(train_texts)} training lines.")

# Save corpus to Drive, one transcription per newline-terminated line.
os.makedirs("/content/drive/MyDrive/", exist_ok=True)
corpus_path = "/content/drive/MyDrive/iam_corpus.txt"

with open(corpus_path, "w") as corpus_file:
    corpus_file.writelines(f"{line}\n" for line in train_texts)

print(f"‚úÖ Saved corpus to: {corpus_path}")

# ============================================
# STEP 2: Build 5-gram KenLM
# ============================================
# Clone and compile KenLM under ./kenlm, relative to the current working directory.
!git clone https://github.com/kpu/kenlm.git
!mkdir -p kenlm/build && cd kenlm/build && cmake .. && make -j4

lm_path = "/content/drive/MyDrive/iam_lm_5gram.arpa"
# `-o 5` sets the n-gram order; the input is the corpus file written in STEP 1.
# Only the ARPA text model is produced here (no build_binary step).
!kenlm/build/bin/lmplz -o 5 < /content/drive/MyDrive/iam_corpus.txt > {lm_path}

print(f"‚úÖ 5-gram KenLM saved to: {lm_path}")

# ============================================
# 🧱 STEP 3: Load HTR model + vocab
# ============================================
import tensorflow as tf
from tensorflow import keras
import joblib
from pyctcdecode import build_ctcdecoder
import pyctcdecode.decoder as decoder_module
import pyctcdecode.language_model as language_model_module
import kenlm

# Make the imported kenlm module available as an attribute of both
# pyctcdecode modules.
setattr(decoder_module, "kenlm", kenlm)
setattr(language_model_module, "kenlm", kenlm)
print("‚úÖ KenLM successfully linked to pyctcdecode")

MODEL_DIR = "/content/drive/MyDrive/htr_final_model_20251110_110833"

print("üîÑ Loading model and vocab...")
_model_file = os.path.join(MODEL_DIR, "htr_model.keras")
_vocab_file = os.path.join(MODEL_DIR, "vocab_list.pkl")
model = keras.models.load_model(_model_file, compile=False)
# joblib.load unpickles the file, so only point it at vocabulary files you trust.
vocab_list = joblib.load(_vocab_file)
print(f"‚úÖ Model loaded successfully! Vocab size: {len(vocab_list)}")

# ============================================
# STEP 4: Build 5-gram LM decoder
# ============================================
# The decoder is given the ARPA file at `lm_path` directly.
decoder = build_ctcdecoder(vocab_list, kenlm_model_path=lm_path)
print("‚úÖ 5-gram KenLM decoder ready!")

## WER Comparison and Analysis


In [None]:
# ============================================
# 🧮 STEP 5: Evaluate model on IAM Line test set
# ============================================
from jiwer import wer, cer
from tqdm import tqdm
import numpy as np
from PIL import Image
from PIL import Image
import numpy as np

def preprocess_image(example):
    """Build the model input for one dataset example.

    ``example["image"]`` is used directly as a PIL image: it is converted to
    grayscale, resized to 800x64 (width x height) and scaled by 1/255.

    Returns:
        ``(batch, text)`` where ``batch`` has a leading batch axis and a
        trailing channel axis, and ``text`` is ``example["text"]``.
    """
    line_image = example["image"].convert("L").resize((800, 64))
    pixels = np.array(line_image, dtype=np.float32) / 255.0
    batch = pixels[np.newaxis, ..., np.newaxis]
    return batch, example["text"]

def decode_greedy(pred, vocab=None):
    """Best-path (greedy) CTC decoding.

    Takes the arg-max class of every time step, merges consecutive repeats of
    the same class, then drops every index outside the vocabulary (treated as
    the CTC blank). Without the merge step a character emitted over several
    frames would be repeated in the output, inflating the greedy error rates.

    Args:
        pred: Model output for one sample; singleton axes are squeezed away,
            leaving (time, classes). A single time step is also accepted.
        vocab: Optional label list; defaults to the module-level ``vocab_list``.

    Returns:
        The decoded string.
    """
    labels = vocab_list if vocab is None else vocab
    # atleast_1d keeps a one-frame input iterable after squeeze + argmax.
    best_path = np.atleast_1d(np.argmax(np.squeeze(pred), axis=-1))
    chars, previous = [], None
    for idx in best_path:
        idx = int(idx)
        # Emit only on a class change; blanks reset `previous`, so a genuine
        # double letter separated by a blank is still kept.
        if idx != previous and idx < len(labels):
            chars.append(labels[idx])
        previous = idx
    return "".join(chars)

print("üìÇ Loading IAM test split...")
test_data = dataset["test"]
print(f"‚úÖ Loaded {len(test_data)} samples.")

# Parallel lists: references, greedy predictions and LM predictions share an index.
gts, greedy_preds, lm_preds = [], [], []

# One forward pass per test line; both decoders read the same model output.
for sample in tqdm(test_data, desc="Evaluating"):
    batch, reference = preprocess_image(sample)
    probs = model.predict(batch, verbose=0)
    gts.append(reference)
    greedy_preds.append(decode_greedy(probs))
    lm_preds.append(decoder.decode(np.squeeze(probs)))

# ============================================
# STEP 6: Compute WER/CER
# ============================================
# Each metric is called once on the full reference/prediction lists.
greedy_wer, greedy_cer = (metric(gts, greedy_preds) for metric in (wer, cer))
lm_wer, lm_cer = (metric(gts, lm_preds) for metric in (wer, cer))

print(
    "\nüìä Evaluation Summary:",
    f"üßÆ Greedy Decode ‚Üí WER: {greedy_wer:.4f}, CER: {greedy_cer:.4f}",
    f"üìñ 5-gram LM Decode ‚Üí WER: {lm_wer:.4f}, CER: {lm_cer:.4f}",
    sep="\n",
)

# ============================================
# STEP 7: Save all artifacts
# ============================================
# Nothing is written here; this only reports where earlier steps saved their files.
print(
    "‚úÖ All artifacts saved:",
    f"  üìú Corpus: {corpus_path}",
    f"  üß† 5-gram LM: {lm_path}",
    sep="\n",
)
