# DocuMind: End-to-End Document Understanding Pipeline

This notebook runs the **full DocuMind pipeline** on Google Colab:

1. **Setup** — Clone repo, install dependencies, check GPU
2. **Data** — Download CORD v2, prepare splits, visualize samples
3. **Prompt Engineering** — Run 7 strategies on the base model
4. **Fine-Tuning** — QLoRA fine-tune Qwen2-VL-2B on receipts
5. **Evaluation** — Compare base vs prompted vs fine-tuned

**Requirements:** Google Colab with a T4 GPU (free tier) or better.

In [None]:
# ════════════════════════════════════════════════════════════════════
# Cell 1: Clone repo and install dependencies
# ════════════════════════════════════════════════════════════════════
import os

# Reset to a known-good directory (handles re-runs after rm -rf)
try:
    os.getcwd()
except OSError:
    os.chdir("/content")

# ── Clone the repo ─────────────────────────────────────────────────
REPO_URL = "https://github.com/NaveenPrasanth/DocuLLM-Finetune.git"
REPO_DIR = "/content/DocuLLM-Finetune"

if not os.path.exists(REPO_DIR):
    os.chdir("/content")
    !git clone {REPO_URL} {REPO_DIR}
    print(f"Cloned repo to {REPO_DIR}")
else:
    !cd {REPO_DIR} && git pull
    print(f"Repo already exists, pulled latest")

os.chdir(REPO_DIR)
print(f"Working directory: {os.getcwd()}")

# ── Install dependencies ───────────────────────────────────────────
!pip install -q \
    torch \
    "transformers>=4.45.0" \
    "accelerate>=0.34.0" \
    "peft>=0.13.0" \
    "bitsandbytes>=0.43.0" \
    "qwen-vl-utils>=0.0.2" \
    "datasets>=2.20.0" \
    "omegaconf>=2.3" \
    "pydantic>=2.5" \
    "rapidfuzz>=3.5" \
    "python-dotenv>=1.0" \
    wandb rich tqdm matplotlib seaborn Pillow

# Install the project itself
!pip install -q -e .

print("\nAll dependencies installed!")

In [None]:
# ════════════════════════════════════════════════════════════════════
# Cell 2: Imports and GPU check
# ════════════════════════════════════════════════════════════════════
import json
import random
import numpy as np
import torch
import matplotlib.pyplot as plt

# Project imports — this is the whole point of having a codebase!
from src.config import load_base_config, load_training_config, load_data_config
from src.data.cord_loader import load_cord_dataset, parse_cord_ground_truth, get_cord_schema
from src.data.format_converter import sample_to_chatml, sample_to_inference_chatml
from src.data.dataset_builder import build_cord_splits, get_dataset_stats
from src.training.model_loader import load_quantized_model, load_processor, print_model_info
from src.training.lora_config import build_lora_config, apply_lora

# ── Reproducibility: seed every RNG used by the notebook ───────────
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
cuda_ok = torch.cuda.is_available()
if cuda_ok:
    torch.cuda.manual_seed_all(SEED)

# ── Fail fast when no GPU is attached ──────────────────────────────
if not cuda_ok:
    raise RuntimeError(
        "No GPU detected! Go to Runtime > Change runtime type > GPU.\n"
        "A T4 (free tier) with 15GB VRAM is sufficient."
    )

# ── Report the device ──────────────────────────────────────────────
props = torch.cuda.get_device_properties(0)
gpu_name = torch.cuda.get_device_name(0)
# The attribute is looked up under both names ('total_memory', then
# 'total_mem'), defaulting to 0 when neither exists.
total_bytes = getattr(props, 'total_memory', getattr(props, 'total_mem', 0))
gpu_mem = total_bytes / (1024**3)  # bytes -> GiB
print(f"GPU: {gpu_name} ({gpu_mem:.1f} GB)")
print(f"CUDA: {torch.version.cuda}")
print(f"PyTorch: {torch.__version__}")

vram_message = (
    "VRAM sufficient for QLoRA."
    if gpu_mem >= 14
    else "WARNING: <14GB VRAM — training may fail."
)
print(vram_message)

# ── Load the YAML-backed project configs ───────────────────────────
base_config = load_base_config()
training_config = load_training_config()
data_config = load_data_config("cord")
print(f"\nProject: {base_config.project.name}")
print(f"Model: {base_config.model.name}")
print(f"Dataset: {data_config.dataset.name} ({data_config.dataset.hf_path})")

In [None]:
# ════════════════════════════════════════════════════════════════════
# Cell 3: Load and explore the CORD dataset
# ════════════════════════════════════════════════════════════════════
import gc

# ── Colab free-tier RAM budget ─────────────────────────────────────
# Colab free tier has ~12.7GB system RAM. Loading 720 PIL images +
# model weights + tokenizer easily exceeds this. We cap training
# samples to keep RAM under control while still getting meaningful
# fine-tuning results.
MAX_TRAIN = 200   # 200 samples is plenty for QLoRA on a small domain
MAX_VAL = 30
MAX_TEST = 20

# Use our data pipeline
print("Loading CORD v2 dataset...")
splits = build_cord_splits(seed=SEED)

# Index each split by name rather than unpacking `.items()`: a loop variable
# bound to the sample list would survive the loop as a notebook global and
# keep the last full split alive after `del splits` below.
for split_name in splits:
    stats = get_dataset_stats(splits[split_name])
    print(f"  {split_name}: {stats['num_samples']} samples, "
          f"avg {stats['avg_fields']:.1f} fields/sample "
          f"(range: {stats['min_fields']}-{stats['max_fields']})")

# Cap splits to fit in Colab RAM (slices are new lists holding only the
# first MAX_* samples)
train_samples = splits["train"][:MAX_TRAIN]
val_samples = splits["val"][:MAX_VAL]
test_samples = splits["test"][:MAX_TEST]

# Drop the only remaining reference to the full splits so the samples that
# were not kept can actually be reclaimed.
del splits
gc.collect()

print(f"\nUsing: {len(train_samples)} train | {len(val_samples)} val | {len(test_samples)} test")
print("(Capped for Colab free-tier 12.7GB RAM budget)")

In [None]:
# ════════════════════════════════════════════════════════════════════
# Cell 4: Visualize a training data point (input → output)
# ════════════════════════════════════════════════════════════════════
from src.data.format_converter import DEFAULT_INSTRUCTION

sample_idx = 0
sample = train_samples[sample_idx]

# Pretty-print the target JSON once; only the first 60 lines are rendered so
# the text panel stays legible.
gt_json = json.dumps(sample["ground_truth"], indent=2, ensure_ascii=False)
lines = gt_json.split("\n")
display_text = "\n".join(lines[:60]) + ("\n... (truncated)" if len(lines) > 60 else "")

fig, (ax_img, ax_txt) = plt.subplots(
    1, 2,
    figsize=(16, 8),
    gridspec_kw={"width_ratios": [1, 1.2]},
)

# Left panel: the receipt image the model receives
ax_img.imshow(sample["image"])
ax_img.set_title("INPUT: Receipt Image", fontsize=14, fontweight="bold")
ax_img.axis("off")

# Right panel: the JSON the model is trained to produce
ax_txt.axis("off")
ax_txt.set_title("OUTPUT: Expected JSON", fontsize=14, fontweight="bold")
ax_txt.text(
    0.02, 0.98, display_text,
    transform=ax_txt.transAxes,
    fontsize=8,
    fontfamily="monospace",
    verticalalignment="top",
    bbox={"boxstyle": "round,pad=0.5", "facecolor": "#f0f0f0", "alpha": 0.9},
)

fig.suptitle(f"Training Sample #{sample_idx}", fontsize=12, y=1.02)
fig.tight_layout()
plt.show()

# Textual summary of the same sample
summary_lines = [
    f"INSTRUCTION: {DEFAULT_INSTRUCTION}",
    f"\nImage size: {sample['image'].size}",
    f"Top-level keys: {list(sample['ground_truth'].keys())}",
    f"Flat fields: {sample['metadata']['num_fields']}",
    f"JSON length: {len(gt_json)} chars",
]
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# ════════════════════════════════════════════════════════════════════
# Cell 5: Load Qwen2-VL with 4-bit quantization
# ════════════════════════════════════════════════════════════════════

# Shorthand for the `training` section of the training config; later cells
# read the LoRA settings from it too.
tc = training_config.training

print("Loading model with 4-bit quantization...")
model = load_quantized_model(
    model_name=base_config.model.name,
    quantization_config=tc.quantization,
)
processor = load_processor(processor_name=base_config.model.processor_name)

print_model_info(model)

# Bytes currently allocated by PyTorch on GPU 0, converted to GiB
allocated_bytes = torch.cuda.memory_allocated(0)
allocated = allocated_bytes / (1024**3)
print(f"\nGPU memory after model load: {allocated:.2f} GB")

In [None]:
# ════════════════════════════════════════════════════════════════════
# Cell 6: Quick base model test (before any fine-tuning)
# ════════════════════════════════════════════════════════════════════
from qwen_vl_utils import process_vision_info
from src.inference.postprocessor import postprocess_prediction

def run_inference(model, processor, image, instruction=None):
    """Generate the model's text response for one image.

    Args:
        model: Model exposing ``device``, ``eval()`` and ``generate()``.
        processor: Processor providing ``apply_chat_template``, tensor
            encoding via ``__call__`` and ``batch_decode``.
        image: Image placed in the user turn of the chat message.
        instruction: Prompt text; ``DEFAULT_INSTRUCTION`` is used when None.

    Returns:
        The decoded continuation only (prompt tokens are sliced off), with
        special tokens removed.

    Side effects:
        Puts ``model`` into eval mode.
    """
    prompt = DEFAULT_INSTRUCTION if instruction is None else instruction

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt},
        ],
    }
    messages = [user_turn]

    chat_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    encoded = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Move every tensor to the model's device.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        # Sampling disabled; at most 1024 new tokens are generated.
        output_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)

    # generate() returns prompt + continuation; keep the continuation only.
    prompt_len = inputs["input_ids"].shape[1]
    decoded = processor.batch_decode(output_ids[:, prompt_len:], skip_special_tokens=True)
    return decoded[0]


# Single-sample smoke test of the un-tuned model
test_item = test_samples[0]
divider = "=" * 60
print("Base model prediction (before fine-tuning):")
print(divider)
base_prediction = run_inference(model, processor, test_item["image"])
print(base_prediction[:500])  # first 500 characters only
print(divider)

# Run the raw text through the project's postprocessor and report the outcome
base_processed = postprocess_prediction(base_prediction)
is_valid_json = base_processed['valid']
print(f"Valid JSON: {'Yes' if is_valid_json else 'No'}")
if base_processed['errors']:
    print(f"Notes: {base_processed['errors']}")
if is_valid_json:
    print(f"Keys: {list(base_processed['parsed'].keys())}")
    print("\nNote: Base model outputs free-form JSON, not the CORD schema.")
    print("Fine-tuning teaches the model the exact schema structure.")

In [None]:
# ════════════════════════════════════════════════════════════════════
# Cell 7: Apply LoRA adapters
# ════════════════════════════════════════════════════════════════════
from peft import LoraConfig as PeftLoraConfig

# Build the LoRA config from the training config and wrap the model with it.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this
# cell presumably wraps it a second time — confirm apply_lora's behaviour.
lora_config = build_lora_config(tc.lora)
model = apply_lora(model, lora_config)

# Bytes currently allocated by PyTorch on GPU 0, converted to GiB
allocated_bytes = torch.cuda.memory_allocated(0)
allocated = allocated_bytes / (1024**3)
print(f"\nGPU memory after LoRA: {allocated:.2f} GB")

In [None]:
# ════════════════════════════════════════════════════════════════════
# Cell 8: Prepare data and train
# ════════════════════════════════════════════════════════════════════
import gc
from datasets import Dataset
from transformers import TrainingArguments, Trainer
from src.data.format_converter import DEFAULT_INSTRUCTION

# ── Optional W&B ──────────────────────────────────────────────────
USE_WANDB = False  # Set True and login first
if USE_WANDB:
    # Imported lazily so the dependency is only touched when tracking is on
    import wandb

    wandb.init(
        project="documind",
        tags=["qwen2-vl", "qlora", "cord"],
    )

# ── Convert samples to ChatML training format ─────────────────────
# Prompt text attached to every training example
INSTRUCTION = DEFAULT_INSTRUCTION

def sample_to_training_dict(sample):
    """Wrap one sample as a ``{"messages": ...}`` training record.

    Args:
        sample: Mapping with an ``"image"`` entry and a JSON-serializable
            ``"ground_truth"`` entry.

    Returns:
        A dict whose single ``"messages"`` key holds the result of
        ``sample_to_chatml`` for the image, the indented ground-truth JSON
        string, and the module-level ``INSTRUCTION``.
    """
    target_json = json.dumps(sample["ground_truth"], indent=2, ensure_ascii=False)
    chat_messages = sample_to_chatml(
        image=sample["image"],
        ground_truth_json=target_json,
        instruction=INSTRUCTION,
    )
    return {"messages": chat_messages}

print("Converting to ChatML format...")
train_data = list(map(sample_to_training_dict, train_samples))
val_data = list(map(sample_to_training_dict, val_samples))
print(f"Train: {len(train_data)} | Val: {len(val_data)}")

# The raw sample lists are no longer needed in this cell; the images they
# held were passed into the converted records above.
del train_samples
del val_samples
gc.collect()

# Wrap the converted records as datasets for the Trainer
train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(val_data)

# ── Custom collator for Qwen2-VL multimodal training ──────────────
def find_assistant_tokens(input_ids, tokenizer):
    r"""Locate assistant-response token spans for label masking.

    Args:
        input_ids: 1-D sequence of token ids exposing ``.tolist()``.
        tokenizer: Object whose ``encode(text, add_special_tokens=False)``
            returns a list of token ids.

    Returns:
        List of ``(start, end)`` half-open index pairs. ``start`` is the
        first token after an ``"assistant\n"`` marker; ``end`` is one past
        the closing ``"<|im_end|>"`` marker, or ``len(ids)`` when the
        response is not terminated (in which case scanning stops).
    """
    assistant_start = tokenizer.encode("assistant\n", add_special_tokens=False)
    end_marker = tokenizer.encode("<|im_end|>", add_special_tokens=False)
    ids = input_ids.tolist()
    n_start, n_end = len(assistant_start), len(end_marker)

    regions = []
    i = 0
    while i < len(ids) - n_start:
        if ids[i:i + n_start] != assistant_start:
            i += 1
            continue

        start = i + n_start
        # Index one past the first end marker at or after `start`, if any.
        end = next(
            (
                j + n_end
                for j in range(start, len(ids) - n_end + 1)
                if ids[j:j + n_end] == end_marker
            ),
            None,
        )
        if end is None:
            # Unterminated response: label everything to the end.
            regions.append((start, len(ids)))
            break

        regions.append((start, end))
        # Resume scanning at the token right after the end marker. Previously
        # one extra token was skipped here, so a marker starting immediately
        # after an end marker was missed. max() guarantees forward progress.
        i = max(end, i + 1)
    return regions


def collate_fn(examples):
    """Build a padded training batch with loss restricted to assistant text.

    Args:
        examples: List of records, each with a ``"messages"`` chat.

    Returns:
        The processor's batch with an added ``"labels"`` tensor: a copy of
        ``input_ids`` inside the spans found by ``find_assistant_tokens`` and
        ``-100`` everywhere else (including pad-token positions).

    Relies on the notebook-level ``processor`` and ``process_vision_info``.
    NOTE(review): inputs are truncated to 2048 tokens — confirm that image
    placeholder tokens for large receipts always fit within that limit.
    """
    texts = []
    all_images = []
    for example in examples:
        chat = example["messages"]
        texts.append(
            processor.apply_chat_template(
                chat, tokenize=False, add_generation_prompt=False
            )
        )
        images, _ = process_vision_info(chat)
        if images:
            all_images.extend(images)

    batch = processor(
        text=texts,
        images=all_images or None,
        padding=True,
        truncation=True,
        max_length=2048,
        return_tensors="pt",
    )

    input_ids = batch["input_ids"]
    tokenizer = processor.tokenizer

    # Start fully masked, then copy token ids back in for assistant spans.
    labels = input_ids.new_full(input_ids.shape, -100)
    for row in range(len(examples)):
        for start, end in find_assistant_tokens(input_ids[row], tokenizer):
            labels[row, start:end] = input_ids[row, start:end]

    # Never compute loss on pad tokens, even inside a labeled span.
    pad_id = tokenizer.pad_token_id
    if pad_id is not None:
        labels[labels == pad_id] = -100

    batch["labels"] = labels
    return batch

# ── Sanity check the collator ─────────────────────────────────────
# Collate one raw record and count the label positions that will
# contribute to the loss (everything not set to -100).
print("\nTesting collator...")
test_batch = collate_fn([train_data[0]])
n_labeled = (test_batch['labels'][0] != -100).sum().item()
n_total = test_batch['input_ids'].shape[1]
print(f"  input_ids: {test_batch['input_ids'].shape}")
print(f"  Labeled tokens: {n_labeled}/{n_total} ({100*n_labeled/n_total:.1f}%)")
del test_batch
gc.collect()
torch.cuda.empty_cache()

# Fail fast: with zero labeled tokens there is nothing to compute loss on,
# so training would run without learning anything.
assert n_labeled > 0, (
    "Collator produced no labeled tokens: the assistant response was not "
    "found (check the chat template markers and the max_length truncation)."
)

# ── Training args ─────────────────────────────────────────────────
# Optimizer steps per epoch = samples / (per-device batch * grad accumulation).
# With 200 train samples and batch_size=1, grad_accum=4:
#   steps_per_epoch = 200 / (1 * 4) = 50
#   total_steps = 50 * 3 = 150
#   warmup_steps = 15 (10% of 150)
OUTPUT_DIR = "./outputs/qlora_qwen2vl_cord"
BATCH_SIZE = 1
GRAD_ACCUM = 4  # reduced from 8 since we have fewer samples
NUM_EPOCHS = 3
# Account for the batch size and clamp to >= 1 so that very small datasets
# do not produce eval_steps/save_steps of 0.
STEPS_PER_EPOCH = max(1, len(train_dataset) // (BATCH_SIZE * GRAD_ACCUM))
TOTAL_STEPS = STEPS_PER_EPOCH * NUM_EPOCHS
WARMUP_STEPS = max(1, int(TOTAL_STEPS * 0.1))

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_steps=WARMUP_STEPS,
    weight_decay=0.01,
    # NOTE(review): this notebook targets a T4; as far as I know that GPU has
    # no native bf16 support — confirm this setting works (and is not slow)
    # on the intended runtime.
    bf16=True,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    logging_steps=5,
    eval_strategy="steps",
    eval_steps=STEPS_PER_EPOCH,       # eval once per epoch
    save_steps=STEPS_PER_EPOCH,       # save once per epoch
    save_total_limit=2,
    dataloader_num_workers=0,
    remove_unused_columns=False,      # the collator needs the raw "messages" column
    report_to="wandb" if USE_WANDB else "none",
    logging_first_step=True,
    optim="paged_adamw_8bit",
    dataloader_pin_memory=False,
)

# ── Train ─────────────────────────────────────────────────────────
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "data_collator": collate_fn,
}
trainer = Trainer(**trainer_kwargs)

# One-shot run summary before the (long) training call
run_summary = [
    "\nStarting training...",
    f"  {len(train_dataset)} samples, batch=1, grad_accum={GRAD_ACCUM}",
    f"  {STEPS_PER_EPOCH} steps/epoch x {NUM_EPOCHS} epochs = {TOTAL_STEPS} total steps",
    f"  Warmup: {WARMUP_STEPS} steps | Eval every {STEPS_PER_EPOCH} steps",
    "  Logging every 5 steps — first output in ~1 min\n",
]
print("\n".join(run_summary))

trainer.train()
print("\nTraining complete!")

In [None]:
# ════════════════════════════════════════════════════════════════════
# Cell 9: Plot training curves
# ════════════════════════════════════════════════════════════════════
log_history = trainer.state.log_history

# Split the log into (step, value) pairs: training entries carry "loss" and
# no "eval_loss"; evaluation entries carry "eval_loss".
train_points = [
    (entry["step"], entry["loss"])
    for entry in log_history
    if "loss" in entry and "eval_loss" not in entry
]
eval_points = [
    (entry["step"], entry["eval_loss"])
    for entry in log_history
    if "eval_loss" in entry
]
train_steps = [step for step, _ in train_points]
train_losses = [loss for _, loss in train_points]
eval_steps = [step for step, _ in eval_points]
eval_losses = [loss for _, loss in eval_points]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_train, ax_eval = axes

# Left: training loss
ax_train.plot(train_steps, train_losses, "b-", lw=1.5, label="Train")
ax_train.set_xlabel("Step")
ax_train.set_ylabel("Loss")
ax_train.set_title("Training Loss")
ax_train.legend()
ax_train.grid(alpha=0.3)

# Right: eval loss, or a placeholder when no evaluation was logged
if eval_losses:
    ax_eval.plot(eval_steps, eval_losses, "r-o", lw=1.5, label="Eval")
    ax_eval.set_xlabel("Step")
    ax_eval.set_ylabel("Loss")
    ax_eval.set_title("Eval Loss")
    ax_eval.legend()
    ax_eval.grid(alpha=0.3)
else:
    ax_eval.text(0.5, 0.5, "No eval data yet", ha="center", va="center")

fig.tight_layout()
plt.show()

if train_losses:
    print(f"Final train loss: {train_losses[-1]:.4f}")
if eval_losses:
    print(f"Final eval loss:  {eval_losses[-1]:.4f}")
    print(f"Best eval loss:   {min(eval_losses):.4f}")

In [None]:
# ════════════════════════════════════════════════════════════════════
# Cell 10: Evaluate the fine-tuned model on held-out test samples
# ════════════════════════════════════════════════════════════════════
from src.data.cord_loader import flatten_cord_fields
from src.inference.postprocessor import postprocess_prediction
from src.evaluation.metrics import compute_field_f1, compute_json_validity

NUM_EVAL = 10
eval_samples = test_samples[:NUM_EVAL]
# The test split may hold fewer than NUM_EVAL samples; report the real count.
n_eval = len(eval_samples)
if n_eval == 0:
    raise RuntimeError("No test samples available for evaluation.")

# Per-sample scores: micro field F1 and a 0/1 JSON-validity flag
ft_results = {"f1": [], "json_valid": []}

print(f"Evaluating fine-tuned model on {n_eval} test samples...\n")

for i, sample in enumerate(eval_samples):
    prediction = run_inference(model, processor, sample["image"])
    processed = postprocess_prediction(prediction)

    # JSON validity
    ft_results["json_valid"].append(1.0 if processed["valid"] else 0.0)

    # Field F1 (an unparseable prediction scores 0)
    if processed["valid"]:
        pred_flat = flatten_cord_fields(processed["parsed"])
        gt_flat = sample["ground_truth_flat"]
        f1_result = compute_field_f1(pred_flat, gt_flat)
        ft_results["f1"].append(f1_result["micro"]["f1"])
    else:
        ft_results["f1"].append(0.0)

    if i < 3:  # Show first 3
        print(f"── Sample {i} ──")
        print(f"  Valid JSON: {processed['valid']}")
        print(f"  Field F1:   {ft_results['f1'][-1]:.3f}")
        print(f"  Prediction: {prediction[:200]}...\n")

avg_f1 = sum(ft_results['f1']) / n_eval
avg_valid = sum(ft_results['json_valid']) / n_eval

print("\n" + "=" * 50)
print(f"Fine-Tuned Model Results ({n_eval} samples)")
print("=" * 50)
print(f"  Avg Field F1:     {avg_f1:.4f}")
print(f"  JSON Valid Rate:  {avg_valid:.1%}")

In [None]:
# ════════════════════════════════════════════════════════════════════
# Cell 11: Save adapter to Google Drive
# ════════════════════════════════════════════════════════════════════
from google.colab import drive
drive.mount("/content/drive")

DRIVE_SAVE_DIR = "/content/drive/MyDrive/documind/adapters/qlora_qwen2vl_cord"
os.makedirs(DRIVE_SAVE_DIR, exist_ok=True)

print(f"Saving adapter to {DRIVE_SAVE_DIR}...")
model.save_pretrained(DRIVE_SAVE_DIR)
processor.save_pretrained(DRIVE_SAVE_DIR)

# Save training config + results.
# Hyperparameters are read from `training_args` (the object the Trainer
# actually ran with), not from the YAML config, whose values this notebook
# overrides with hard-coded settings.
config_to_save = {
    "model_name": base_config.model.name,
    "lora": tc.lora.model_dump(),
    "training_args": {
        "epochs": training_args.num_train_epochs,
        "lr": training_args.learning_rate,
        "batch_size": training_args.per_device_train_batch_size,
        "grad_accum": training_args.gradient_accumulation_steps,
    },
    "results": {
        "avg_field_f1": avg_f1,
        "json_valid_rate": avg_valid,
        "final_train_loss": train_losses[-1] if train_losses else None,
        "final_eval_loss": eval_losses[-1] if eval_losses else None,
    },
    "dataset": {
        "train": len(train_data),
        "val": len(val_data),
        # Number of test samples actually scored in the evaluation cell
        "test_eval": len(ft_results["f1"]),
    },
}
with open(f"{DRIVE_SAVE_DIR}/training_config.json", "w", encoding="utf-8") as f:
    json.dump(config_to_save, f, indent=2)

# List what ended up in the target directory, with sizes
saved_files = os.listdir(DRIVE_SAVE_DIR)
print(f"\nSaved {len(saved_files)} files:")
for fn in sorted(saved_files):
    size = os.path.getsize(os.path.join(DRIVE_SAVE_DIR, fn))
    print(f"  {fn:40s} {size / 1024:.1f} KB")

# Close the W&B run if one is active
if USE_WANDB:
    import wandb
    if wandb.run:
        wandb.finish()

print("\nDone! Adapter saved to Google Drive.")