In [1]:
import os, csv, re
import datetime
from pdf2image import convert_from_path
from PIL import Image
from kraken import binarization, blla, rpred
from kraken.lib import models
from dateutil import parser
import pandas as pd

In [2]:
# Create base dirs if they don't exist
os.makedirs("images", exist_ok=True)
os.makedirs("ocr", exist_ok=True)

# Create timestamped run dirs
timestamp = datetime.datetime.now().strftime("run_%Y%m%d_%H%M%S")
img_run_dir = os.path.join("images", timestamp)
ocr_run_dir = os.path.join("ocr", timestamp)
os.makedirs(img_run_dir, exist_ok=True)
os.makedirs(ocr_run_dir, exist_ok=True)

print(f"[INFO] Saving images to {img_run_dir}")
print(f"[INFO] Saving OCR text to {ocr_run_dir}")

# CSV output file
csv_path = os.path.join(ocr_run_dir, "ocr_output.csv")

# Load OCR model
model = models.load_any("models/arabic_best.mlmodel")

# Convert page 11 from PDF to images
pages = convert_from_path("books/attacks.pdf", dpi=300, first_page=11, last_page=11)

[INFO] Saving images to images/run_20250828_162751
[INFO] Saving OCR text to ocr/run_20250828_162751


In [3]:
# Convert Arabic-Indic digits → ASCII digits
def normalize_digits(s: str) -> str:
    trans = str.maketrans("٠١٢٣٤٥٦٧٨٩", "0123456789")
    return s.translate(trans)

# Extract dates from text
def extract_dates(text: str):
    text_norm = normalize_digits(text)
    date_pattern = re.compile(r'(\d{2,4})[/-](\d{1,3})[/-](\d{1,4})')  # allow OCR glitches
    matches = date_pattern.findall(text_norm)

    raw_dates = []
    clean_dates = []

    for y, m, d in matches:
        raw_dates.append(f"{y}/{m}/{d}")  # save raw fragment

        try:
            y = int(y)
            m = int(m)
            d = int(d)

            # Fix 2-digit years (assume 1900s for this corpus)
            if y < 100:
                y = 1900 + y

            # Validate ranges
            if not (1 <= m <= 12):
                continue
            if not (1 <= d <= 31):
                continue

            clean_dates.append(f"{y:04d}-{m:02d}-{d:02d}")
        except Exception:
            continue

    # Return semicolon-separated
    return ";".join(raw_dates), ";".join(sorted(set(clean_dates)))

with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["page", "side", "text", "dates_raw", "dates_normalized"])  # header

    for i, page in enumerate(pages, start=11):
        w, h = page.size
        halves = {
            "right": page.crop((w // 2, 0, w, h)),  # RTL order: right first
            "left": page.crop((0, 0, w // 2, h)),
        }

        for side, img in halves.items():
            img_path = os.path.join(img_run_dir, f"page_{i}_{side}.png")
            img.save(img_path)

            # OCR
            bin_img = binarization.nlbin(img)
            seg = blla.segment(bin_img)
            pred = rpred.rpred(model, bin_img, seg)
            text = "\n".join([line.prediction for line in pred])

            # Dates
            dates_raw, dates_norm = extract_dates(text)

            # Write to CSV
            writer.writerow([i, side, text, dates_raw, dates_norm])



In [4]:
df = pd.read_csv(csv_path)

In [14]:
# explode normalized_date into rows
df = df.assign(normalized_date=df['dates_normalized'].str.split(';')).explode('normalized_date')
df = df[df['normalized_date'].notna() & (df['normalized_date'] != "")]

In [19]:
def validate_or_nat(date_str):
    try:
        y, m, d = map(int, date_str.split("-"))

        # year must be 1900–1960 (adjust window for your data)
        if not (1900 <= y <= 1960):
            return pd.NaT

        # month/day sanity
        if not (1 <= m <= 12):
            return pd.NaT
        if not (1 <= d <= 31):
            return pd.NaT

        return pd.to_datetime(f"{y:04d}-{m:02d}-{d:02d}", errors="coerce")
    except:
        return pd.NaT


df['dt'] = df['normalized_date'].apply(validate_or_nat)

# anchors
df['prev_valid'] = df['dt'].ffill()
df['next_valid'] = df['dt'].bfill()

def contextual_fix(row):
    if pd.notna(row['dt']):
        return row['dt']  # already valid
    # fallback to prev/next
    if pd.notna(row['prev_valid']) and pd.notna(row['next_valid']):
        # pick whichever is closer in days
        prev_gap = abs((row['prev_valid'] - row['next_valid']).days)
        return row['prev_valid'] if prev_gap <= 15 else row['next_valid']
    if pd.notna(row['prev_valid']):
        return row['prev_valid']
    if pd.notna(row['next_valid']):
        return row['next_valid']
    return pd.NaT

df['fixed_date'] = df.apply(contextual_fix, axis=1)
df['fixed_date'] = df['fixed_date'].dt.strftime("%Y-%m-%d")

In [20]:
print(df[['page','side','raw_norm','normalized_date','fixed_date']].head(10))

   page   side    raw_norm normalized_date  fixed_date
0    11  right  1949-08-01      1949-08-01  1949-08-01
0    11  right  1949-10-01      1949-10-01  1949-10-01
0    11  right  1999-10-01      1999-10-01  1949-08-01
0    11  right  1949-08-01      1949-08-01  1949-08-01
0    11  right  1949-10-01      1949-10-01  1949-10-01
0    11  right  1999-10-01      1999-10-01  1949-08-01
0    11  right  1949-08-01      1949-08-01  1949-08-01
0    11  right  1949-10-01      1949-10-01  1949-10-01
0    11  right  1999-10-01      1999-10-01  1949-11-20
1    11   left  1949-11-20      1949-11-20  1949-11-20


In [21]:
audit = df[df['raw_norm'] != df['fixed_date']]
print(audit[['page','side','raw_norm','fixed_date']])

   page   side    raw_norm  fixed_date
0    11  right  1999-10-01  1949-08-01
0    11  right  1999-10-01  1949-08-01
0    11  right  1999-10-01  1949-11-20


In [22]:
def make_reading_order(df):
    # Ensure sort: page asc, then side (right before left)
    df['side_order'] = df['side'].map({'right': 0, 'left': 1})
    return df.sort_values(['page', 'side_order']).reset_index(drop=True)

def split_across(df):
    results = []
    full_text = ""
    meta = []

    # Build continuous reading stream
    for _, row in df.iterrows():
        full_text += f"\n{row['text']}"
        meta.append((row['page'], row['side']))

    # Regex split into blocks
    matches = list(re.finditer(r'(\d{2,4}[/-]\d{1,2}[/-]\d{1,4})', normalize_digits(full_text)))

    for i, m in enumerate(matches):
        raw_date = m.group(1)
        start = m.end()
        end = matches[i+1].start() if i+1 < len(matches) else len(full_text)
        block_text = full_text[start:end].strip()

        results.append({
            "raw_date": raw_date,
            "text_block": block_text
            # optionally: earliest (page, side) from meta in this span
        })

    return pd.DataFrame(results)

In [None]:
# CSV output file
clean_csv_path = os.path.join(ocr_run_dir, "ocr_output_clean.csv")

# Save cleaned dataset
df.to_csv(clean_csv_path, index=False)