<a href="https://colab.research.google.com/github/JustinStec/research-library-colab/blob/main/reocr_marker_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Re-OCR Library with Marker + Citation Metadata Enrichment

**What this does:**
1. Downloads PDFs from Supabase Storage
2. Re-OCRs all PDFs with marker (GPU-accelerated)
3. Cleans extracted text (ligatures, smart quotes, OCR artifacts)
4. Enriches bibliographic metadata via CrossRef API
5. Uploads everything back to Supabase (sets `embedding = NULL` to trigger re-embedding)

**Before running:**
1. **Runtime → Change runtime type → A100 GPU + High RAM**
2. Run `add_citation_metadata.sql` in **Supabase SQL Editor** to add citation columns (already in clipboard)
3. Run cells in order

**After running:**
- Run `embed_library_colab.ipynb` to re-embed all texts with GTE-Qwen2-1.5B

In [1]:
# Cell 1: Install dependencies
!pip install -q marker-pdf requests tqdm
!pip install -q supabase==2.11.0 postgrest==0.19.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.7/195.7 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.2/223.2 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.0/50.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m948.6/948.6 kB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Cell 2: Configuration
import os

SUPABASE_URL = "https://zknmvifnbrycjwckkggy.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Inprbm12aWZuYnJ5Y2p3Y2trZ2d5Iiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImlhdCI6MTc2OTcwMjM3OSwiZXhwIjoyMDg1Mjc4Mzc5fQ.NbKHG7-VKYSdCbvhSW9a1v-5OoLSVevyEKin_RI4pvQ"

BUCKET_NAME = "pdf-library"
PDF_DIR = "/content/pdfs"
OUTPUT_DIR = "/content/output"
CHECKPOINT_PATH = "/content/reocr_checkpoint.json"
CROSSREF_CHECKPOINT = "/content/crossref_checkpoint.json"

# Polite CrossRef usage: include your email
CROSSREF_MAILTO = "jts3et@virginia.edu"

# Set CUDA for marker
os.environ["TORCH_DEVICE"] = "cuda"

os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Configuration set.")

Configuration set.


In [3]:
# Cell 3: Connect to Supabase + add citation metadata columns
from supabase import create_client
import requests

supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
print("Connected to Supabase")

# --- Add citation metadata columns if they don't exist ---
# We use the Supabase REST endpoint to execute SQL via PostgREST's RPC.
# First, create a helper function, then call it, then drop it.

MIGRATION_SQL = """
DO $$
BEGIN
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS author_full TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS title_full TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS source_type TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS publisher TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS place TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS journal TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS volume TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS issue TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS pages TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS editors TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS book_title TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS edition TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS doi TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS url TEXT;
  ALTER TABLE library_texts ADD COLUMN IF NOT EXISTS isbn TEXT;
END
$$;
"""

# Try to select one of the new columns to check if migration already ran
try:
    test = supabase.table("library_texts").select("doi").limit(1).execute()
    print("Citation metadata columns already exist.")
except Exception:
    print("Citation columns not found. Running migration...")
    # Execute via Supabase SQL endpoint (requires service key)
    resp = requests.post(
        f"{SUPABASE_URL}/rest/v1/rpc/",
        headers={
            "apikey": SUPABASE_KEY,
            "Authorization": f"Bearer {SUPABASE_KEY}",
            "Content-Type": "application/json"
        },
        json={}
    )
    # If the RPC approach doesn't work, print SQL for manual execution
    print("\n" + "="*60)
    print("MANUAL STEP NEEDED: Paste this SQL into your Supabase SQL Editor")
    print("Dashboard > SQL Editor > New Query > Paste > Run")
    print("="*60)
    print(MIGRATION_SQL)
    print("="*60)
    print("\nAfter running the SQL, re-run this cell.")
    raise SystemExit("Run the SQL above in Supabase SQL Editor, then re-run this cell.")

Connected to Supabase
Citation metadata columns already exist.


In [4]:
# Cell 4: Fetch PDF inventory (only texts not yet OCR'd with marker)
from collections import Counter

all_pdf_rows = []
batch_size = 100
offset = 0

while True:
    response = supabase.table("library_texts") \
        .select("id, file_name, storage_path, category, has_pdf, ocr_status") \
        .eq("has_pdf", True) \
        .neq("ocr_status", "complete") \
        .range(offset, offset + batch_size - 1) \
        .execute()

    if not response.data:
        break

    all_pdf_rows.extend(response.data)
    offset += batch_size

print(f"Texts needing OCR: {len(all_pdf_rows)}")

if all_pdf_rows:
    cat_counts = Counter(r["category"] for r in all_pdf_rows)
    print("\nBy category:")
    for cat, count in sorted(cat_counts.items()):
        print(f"  {cat}: {count}")
else:
    print("All PDFs already processed! Nothing to do.")

# Optional: filter to a specific category (set to None to process all)
FILTER_CATEGORY = None  # e.g. "Eliot" or None for all

if FILTER_CATEGORY and all_pdf_rows:
    all_pdf_rows = [r for r in all_pdf_rows if r["category"] == FILTER_CATEGORY]
    print(f"\nFiltered to {FILTER_CATEGORY}: {len(all_pdf_rows)} texts")

Texts needing OCR: 79

By category:
  Cognitive: 2
  Early_Modern: 3
  Eliot: 10
  Medieval: 8
  Modernism: 4
  Other: 1
  Philosophy: 12
  Poetics: 9
  Semiotics: 5
  Theory_Method: 25


In [5]:
# Cell 5: Download PDFs from Supabase Storage
from tqdm import tqdm
import json

download_errors = []
downloaded = []

print(f"Downloading {len(all_pdf_rows)} PDFs from Supabase Storage...\n")

for row in tqdm(all_pdf_rows):
    storage_path = row.get("storage_path")
    if not storage_path:
        download_errors.append({"file_name": row["file_name"], "error": "no storage_path"})
        continue

    local_path = os.path.join(PDF_DIR, os.path.basename(storage_path))

    # Skip if already downloaded
    if os.path.exists(local_path) and os.path.getsize(local_path) > 0:
        downloaded.append({"row": row, "local_path": local_path})
        continue

    try:
        data = supabase.storage.from_(BUCKET_NAME).download(storage_path)
        with open(local_path, "wb") as f:
            f.write(data)
        downloaded.append({"row": row, "local_path": local_path})
    except Exception as e:
        download_errors.append({"file_name": row["file_name"], "error": str(e)})

print(f"\nDownloaded: {len(downloaded)}")
print(f"Errors: {len(download_errors)}")
if download_errors:
    for err in download_errors[:10]:
        print(f"  - {err['file_name']}: {err['error']}")

Downloading 79 PDFs from Supabase Storage...



100%|██████████| 79/79 [02:47<00:00,  2.12s/it]


Downloaded: 64
Errors: 15
  - Bell_Interactional_Metalepsis_and_Unnatural_Narratology: no storage_path
  - Bell_and_Alber_Ontological_Metalepsis: no storage_path
  - Bianchi_The_Feminine_Symptom: no storage_path
  - Cushman_The_Fallacy_of_Imitative_Form_Revisited: no storage_path
  - Eliot-Phillip_Massinger: no storage_path
  - Eliot_Clark_Lectures: no storage_path
  - Fludernik_Scene_Shift,_Metalepsis,_and_the_Metaleptic_Mode: no storage_path
  - Grierson_Introduction: no storage_path
  - Kuhn-Treichel_The_Metalepsis_of_Killing_a_Character_in_Diachronic_Perspective: no storage_path
  - Malabou_Plasticity_at_the_Dusk_of_Writing: no storage_path





In [6]:
# Cell 6: Initialize marker
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_mem / 1e9:.1f} GB")
else:
    print("WARNING: No GPU detected! Go to Runtime > Change runtime type > GPU")

print("\nLoading marker models (this takes 1-2 minutes)...")

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict

model_dict = create_model_dict()
converter = PdfConverter(artifact_dict=model_dict)

print("Marker models loaded and ready.")

PyTorch version: 2.10.0+cu128
CUDA available: True
GPU: NVIDIA A100-SXM4-80GB


AttributeError: 'torch._C._CudaDeviceProperties' object has no attribute 'total_mem'

In [None]:
# Cell 7: Run marker on all PDFs
import re
import unicodedata
import json
import gc

# --- Text cleaning functions (from reocr_pipeline.py) ---

def clean_text(text):
    """Thorough UTF-8 cleaning for OCR output."""
    if not text:
        return ""

    text = unicodedata.normalize("NFC", text)

    ligature_map = {
        "\ufb00": "ff", "\ufb01": "fi", "\ufb02": "fl",
        "\ufb03": "ffi", "\ufb04": "ffl", "\ufb05": "st", "\ufb06": "st",
    }
    for lig, replacement in ligature_map.items():
        text = text.replace(lig, replacement)

    quote_map = {
        "\u2018": "'", "\u2019": "'", "\u201a": "'", "\u201b": "'",
        "\u201c": '"', "\u201d": '"', "\u201e": '"', "\u201f": '"',
        "\u2032": "'", "\u2033": '"',
        "\u2013": "-", "\u2014": "--", "\u2015": "--",
        "\u2026": "...", "\u00a0": " ",
        "\u200b": "", "\u200c": "", "\u200d": "", "\ufeff": "", "\u00ad": "",
    }
    for char, replacement in quote_map.items():
        text = text.replace(char, replacement)

    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
    text = re.sub(r'[\ud800-\udfff]', '', text)
    text = re.sub(r'\\u[0-9a-fA-F]{0,3}[^0-9a-fA-F]', '', text)
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)
    text = re.sub(r'\n\s*\d{1,4}\s*\n', '\n', text)
    text = re.sub(r'\n[A-Z][A-Z\s&]+(\(\d{4}\))\s+\d+[:\.]?\d*\s*\n', '\n', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]+\n', '\n', text)
    text = text.strip()

    return text


def strip_markdown_for_txt(md_text):
    """Convert markdown to clean plain text."""
    text = md_text
    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', text)
    text = re.sub(r'_{1,3}([^_]+)_{1,3}', r'\1', text)
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'\1', text)
    text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
    text = re.sub(r'`([^`]+)`', r'\1', text)
    return text.strip()


# --- Subject extraction ---

SUBJECT_KEYWORDS = {
    "phenomenology": ["phenomenology", "phenomenological", "husserl", "merleau-ponty", "heidegger"],
    "embodiment": ["embodied", "embodiment", "corporeal", "somatic", "body"],
    "enactivism": ["enactive", "enactivism", "autopoiesis"],
    "cognition": ["cognitive", "cognition", "mental representation", "working memory"],
    "emotion": ["emotion", "affect", "feeling", "mood", "sentiment"],
    "attention": ["attention", "attentional", "awareness", "salience"],
    "perception": ["perception", "perceptual", "sensory", "multisensory"],
    "metaphysics": ["metaphysics", "metaphysical", "ontology", "ontological"],
    "aesthetics": ["aesthetic", "beauty", "sublime", "taste"],
    "poetics": ["poetic", "poetry", "verse", "prosody", "meter", "rhyme"],
    "rhetoric": ["rhetoric", "rhetorical", "trope", "metaphor"],
    "modernism": ["modernism", "modernist", "eliot", "pound", "woolf"],
    "philosophy": ["philosophy", "philosophical", "epistemology", "ethics"],
    "linguistics": ["linguistic", "syntax", "grammar", "morphology", "phonology"],
    "semiotics": ["semiotic", "sign", "signification", "peirce", "saussure"],
    "dissociation": ["dissociation", "dissociated", "unified sensibility"],
    "imagination": ["imagination", "imaginative", "imaginal", "fancy"],
    "narrative": ["narrative", "narration", "narrator", "diegesis", "story"],
    "medieval": ["medieval", "middle ages", "chaucer", "dante"],
    "early_modern": ["early modern", "renaissance", "milton", "shakespeare", "donne"],
    "neuroscience": ["neuroscience", "neural", "brain", "fmri", "cortex", "hippocampus"],
    "social_cognition": ["social cognition", "theory of mind", "intersubjectivity", "joint attention"],
    "kant": ["kant", "kantian", "critique of", "transcendental"],
    "disability": ["disability", "disabled", "crip", "ableism", "accommodation"],
}

def extract_subjects(text):
    lower = text.lower()
    subjects = []
    for subject, keywords in SUBJECT_KEYWORDS.items():
        for kw in keywords:
            if kw in lower:
                subjects.append(subject)
                break
    return list(set(subjects))


# --- Process all PDFs ---

# Load checkpoint if exists (resume after crash)
completed_files = set()
results = {}
if os.path.exists(CHECKPOINT_PATH):
    with open(CHECKPOINT_PATH) as f:
        checkpoint = json.load(f)
    results = checkpoint.get("results", {})
    completed_files = set(results.keys())
    print(f"Resuming from checkpoint: {len(completed_files)} already processed")

processing_errors = []
total = len(downloaded)

print(f"\nProcessing {total} PDFs with marker...\n")

for i, item in enumerate(downloaded):
    row = item["row"]
    local_path = item["local_path"]
    file_name = row["file_name"]

    if file_name in completed_files:
        continue

    print(f"[{i+1}/{total}] {file_name}", end=" ... ")

    try:
        result = converter(local_path)
        md_text = result.markdown

        if not md_text or len(md_text.strip()) < 50:
            processing_errors.append({"file_name": file_name, "error": "empty or near-empty output"})
            print("EMPTY")
            continue

        plain_text = strip_markdown_for_txt(md_text)
        cleaned = clean_text(plain_text)
        word_count = len(cleaned.split())
        subjects = extract_subjects(cleaned)

        output_path = os.path.join(OUTPUT_DIR, file_name + ".txt")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(cleaned)

        results[file_name] = {
            "id": row["id"],
            "file_name": file_name,
            "word_count": word_count,
            "subjects": subjects,
            "output_path": output_path,
        }
        completed_files.add(file_name)

        print(f"{word_count} words")

    except Exception as e:
        processing_errors.append({"file_name": file_name, "error": str(e)})
        print(f"ERROR: {e}")

    # Checkpoint every 25 files
    if (i + 1) % 25 == 0:
        with open(CHECKPOINT_PATH, "w") as f:
            json.dump({"results": results, "errors": processing_errors}, f)
        torch.cuda.empty_cache()
        gc.collect()
        print(f"  [checkpoint saved: {len(results)} done]")

# Final checkpoint
with open(CHECKPOINT_PATH, "w") as f:
    json.dump({"results": results, "errors": processing_errors}, f)

print(f"\n{'='*60}")
print(f"Done. {len(results)} processed, {len(processing_errors)} errors.")
if processing_errors:
    print("\nErrors:")
    for err in processing_errors:
        print(f"  - {err['file_name']}: {err['error']}")

In [None]:
# Cell 8: Upload cleaned text to Supabase
from tqdm import tqdm

upload_errors = []
uploaded_count = 0

print(f"Uploading {len(results)} processed texts to Supabase...\n")

for file_name, info in tqdm(results.items()):
    try:
        with open(info["output_path"], "r", encoding="utf-8") as f:
            content = f.read()

        update_data = {
            "content": content,
            "ocr_status": "complete",
            "word_count": info["word_count"],
            "subjects": info["subjects"],
            "embedding": None,  # Force re-embedding
        }

        supabase.table("library_texts") \
            .update(update_data) \
            .eq("file_name", file_name) \
            .execute()

        uploaded_count += 1

    except Exception as e:
        upload_errors.append({"file_name": file_name, "error": str(e)})

print(f"\nUploaded: {uploaded_count}")
print(f"Errors: {len(upload_errors)}")
if upload_errors:
    for err in upload_errors[:10]:
        print(f"  - {err['file_name']}: {err['error']}")

---
## Citation Metadata Enrichment via CrossRef

The cells below query the CrossRef API using each text's author surname + title keywords to retrieve full bibliographic metadata: full author names, full title, journal/publisher, volume/issue/pages, DOI, etc.

This covers journal articles and most academic books reliably. Primary texts (poems, literary works) and some older monographs may not resolve and will need manual entry later.

Rate: ~1 request/second (polite pool). ~1,060 texts takes ~18 minutes.

In [None]:
# Cell 9: CrossRef metadata enrichment
import requests
import time
import json
from tqdm import tqdm

def query_crossref(author, title, year=None):
    """
    Query CrossRef for bibliographic metadata.
    Returns the best-matching work or None.
    """
    params = {"rows": 3, "mailto": CROSSREF_MAILTO}

    # Build query from available fields
    if author:
        params["query.author"] = author
    if title:
        params["query.bibliographic"] = title

    if not params.get("query.author") and not params.get("query.bibliographic"):
        return None

    try:
        resp = requests.get(
            "https://api.crossref.org/works",
            params=params,
            timeout=15
        )
        if resp.status_code != 200:
            return None

        data = resp.json()
        items = data.get("message", {}).get("items", [])
        if not items:
            return None

        # Score candidates: prefer year match and author match
        best = None
        best_score = -1

        for item in items:
            score = item.get("score", 0)

            # Bonus for year match
            if year:
                pub_year = None
                for date_field in ["published-print", "published-online", "created"]:
                    if date_field in item:
                        parts = item[date_field].get("date-parts", [[]])[0]
                        if parts:
                            pub_year = parts[0]
                            break
                if pub_year and abs(pub_year - year) <= 1:
                    score += 50

            # Bonus for author surname match
            if author:
                cr_authors = item.get("author", [])
                for a in cr_authors:
                    if a.get("family", "").lower() == author.lower():
                        score += 30
                        break

            if score > best_score:
                best_score = score
                best = item

        # Require minimum confidence: author surname must appear in results
        if best and author:
            cr_authors = best.get("author", [])
            author_found = any(
                a.get("family", "").lower() == author.lower()
                for a in cr_authors
            )
            if not author_found:
                return None

        return best

    except Exception:
        return None


def extract_metadata_from_crossref(item):
    """
    Extract citation metadata from a CrossRef work item.
    Returns a dict of fields ready for Supabase update.
    """
    meta = {}

    # Full author name(s)
    authors = item.get("author", [])
    if authors:
        author_names = []
        for a in authors:
            given = a.get("given", "")
            family = a.get("family", "")
            if given and family:
                author_names.append(f"{given} {family}")
            elif family:
                author_names.append(family)
        if author_names:
            meta["author_full"] = "; ".join(author_names)

    # Full title
    titles = item.get("title", [])
    subtitles = item.get("subtitle", [])
    if titles:
        full_title = titles[0]
        if subtitles:
            full_title += ": " + subtitles[0]
        meta["title_full"] = full_title

    # Source type
    cr_type = item.get("type", "")
    type_map = {
        "journal-article": "journal_article",
        "book": "book",
        "book-chapter": "book_chapter",
        "edited-book": "edited_volume",
        "monograph": "book",
        "proceedings-article": "journal_article",
        "dissertation": "dissertation",
        "reference-entry": "book_chapter",
    }
    meta["source_type"] = type_map.get(cr_type, cr_type)

    # Publisher
    if item.get("publisher"):
        meta["publisher"] = item["publisher"]

    # Journal / container title
    container = item.get("container-title", [])
    if container:
        if meta.get("source_type") == "journal_article":
            meta["journal"] = container[0]
        elif meta.get("source_type") in ("book_chapter", "edited_volume"):
            meta["book_title"] = container[0]

    # Volume / Issue
    if item.get("volume"):
        meta["volume"] = item["volume"]
    if item.get("issue"):
        meta["issue"] = item["issue"]

    # Pages
    if item.get("page"):
        meta["pages"] = item["page"]

    # Editors
    editors = item.get("editor", [])
    if editors:
        editor_names = []
        for e in editors:
            given = e.get("given", "")
            family = e.get("family", "")
            if given and family:
                editor_names.append(f"{given} {family}")
            elif family:
                editor_names.append(family)
        if editor_names:
            meta["editors"] = "; ".join(editor_names)

    # DOI
    if item.get("DOI"):
        meta["doi"] = item["DOI"]

    # ISBN
    isbns = item.get("ISBN", [])
    if isbns:
        meta["isbn"] = isbns[0]

    # URL (DOI link)
    if item.get("DOI"):
        meta["url"] = f"https://doi.org/{item['DOI']}"

    # Year (update if we got a more precise one)
    for date_field in ["published-print", "published-online", "created"]:
        if date_field in item:
            parts = item[date_field].get("date-parts", [[]])[0]
            if parts:
                meta["year"] = parts[0]
                break

    # Publication place (CrossRef rarely has this, but check)
    if item.get("publisher-location"):
        meta["place"] = item["publisher-location"]

    return meta


# --- Fetch all library texts that need enrichment ---
# Uses direct REST API to avoid postgrest version issues

print("Fetching library texts for metadata enrichment...")
all_texts = []
offset = 0

headers = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
}

while True:
    resp = requests.get(
        f"{SUPABASE_URL}/rest/v1/library_texts",
        headers=headers,
        params={
            "select": "id,file_name,author,title,year,author_full,doi",
            "author_full": "is.null",
            "offset": offset,
            "limit": 100,
        },
        timeout=30,
    )
    if resp.status_code != 200:
        print(f"Error fetching texts: {resp.status_code} {resp.text}")
        break
    batch = resp.json()
    if not batch:
        break
    all_texts.extend(batch)
    offset += 100

print(f"Texts needing metadata enrichment: {len(all_texts)}")

# Load CrossRef checkpoint
cr_completed = set()
if os.path.exists(CROSSREF_CHECKPOINT):
    with open(CROSSREF_CHECKPOINT) as f:
        cr_data = json.load(f)
    cr_completed = set(cr_data.get("completed", []))
    print(f"Resuming: {len(cr_completed)} already looked up")

# --- Query CrossRef for each text ---

enriched = 0
not_found = 0
cr_errors = []

for i, text in enumerate(tqdm(all_texts)):
    file_name = text["file_name"]

    if file_name in cr_completed:
        continue

    author = text.get("author")
    title = text.get("title")
    year = text.get("year")

    if not author and not title:
        cr_completed.add(file_name)
        not_found += 1
        continue

    try:
        cr_item = query_crossref(author, title, year)

        if cr_item:
            meta = extract_metadata_from_crossref(cr_item)
            if meta:
                # Update via REST API directly
                patch_resp = requests.patch(
                    f"{SUPABASE_URL}/rest/v1/library_texts",
                    headers={
                        **headers,
                        "Content-Type": "application/json",
                        "Prefer": "return=minimal",
                    },
                    params={"id": f"eq.{text['id']}"},
                    json=meta,
                    timeout=15,
                )
                if patch_resp.status_code in (200, 204):
                    enriched += 1
                else:
                    cr_errors.append({"file_name": file_name, "error": f"Update failed: {patch_resp.status_code}"})
            else:
                not_found += 1
        else:
            not_found += 1

        cr_completed.add(file_name)

    except Exception as e:
        cr_errors.append({"file_name": file_name, "error": str(e)})

    # Rate limit: ~1 request/sec (CrossRef polite pool)
    time.sleep(1.0)

    # Checkpoint every 50
    if (i + 1) % 50 == 0:
        with open(CROSSREF_CHECKPOINT, "w") as f:
            json.dump({"completed": list(cr_completed)}, f)
        print(f"  [checkpoint: {enriched} enriched, {not_found} not found]")

# Final checkpoint
with open(CROSSREF_CHECKPOINT, "w") as f:
    json.dump({"completed": list(cr_completed)}, f)

print(f"\n{'='*60}")
print(f"CrossRef enrichment complete.")
print(f"  Enriched: {enriched}")
print(f"  Not found: {not_found}")
print(f"  Errors: {len(cr_errors)}")
if cr_errors:
    for err in cr_errors[:10]:
        print(f"  - {err['file_name']}: {err['error']}")

In [None]:
# Cell 10: Verification + metadata coverage report
import requests

headers = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
    "Prefer": "count=exact",
}

def count_query(params=None):
    """Query library_texts with exact count via REST."""
    resp = requests.get(
        f"{SUPABASE_URL}/rest/v1/library_texts",
        headers=headers,
        params={"select": "id", **(params or {})},
        timeout=15,
    )
    # Count is in the content-range header
    cr = resp.headers.get("content-range", "")
    # Format: "0-99/1234" or "*/1234"
    if "/" in cr:
        return int(cr.split("/")[1])
    return len(resp.json())

total = count_query()
ocr_complete = count_query({"ocr_status": "eq.complete"})
null_emb = count_query({"embedding": "is.null"})
has_author_full = count_query({"author_full": "not.is.null"})
has_doi = count_query({"doi": "not.is.null"})
has_journal = count_query({"journal": "not.is.null"})
has_publisher = count_query({"publisher": "not.is.null"})
has_source_type = count_query({"source_type": "not.is.null"})

print(f"Total library texts: {total}")
print(f"Texts with ocr_status='complete': {ocr_complete}")
print(f"Texts needing re-embedding: {null_emb}")

print(f"\n--- Citation Metadata Coverage ---")
print(f"  author_full:  {has_author_full}/{total} ({100*has_author_full//total}%)")
print(f"  doi:          {has_doi}/{total} ({100*has_doi//total}%)")
print(f"  source_type:  {has_source_type}/{total} ({100*has_source_type//total}%)")
print(f"  journal:      {has_journal}/{total} ({100*has_journal//total}%)")
print(f"  publisher:    {has_publisher}/{total} ({100*has_publisher//total}%)")

# Show texts still missing author_full
resp = requests.get(
    f"{SUPABASE_URL}/rest/v1/library_texts",
    headers={
        "apikey": SUPABASE_KEY,
        "Authorization": f"Bearer {SUPABASE_KEY}",
    },
    params={
        "select": "file_name,author,title,year,category",
        "author_full": "is.null",
        "order": "category",
        "limit": 30,
    },
    timeout=15,
)
missing = resp.json() if resp.status_code == 200 else []

if missing:
    print(f"\n--- Sample texts still needing manual metadata ({len(missing)} shown) ---")
    for row in missing:
        print(f"  [{row.get('category','?')}] {row.get('author','?')} - {row.get('title','?')} ({row.get('year','?')})")

print(f"\n--- Next Steps ---")
print(f"1. Run embed_library_colab.ipynb to re-embed all {null_emb} texts")
print(f"2. Manually fill metadata for texts CrossRef couldn't resolve (primary texts, older monographs)")