SETUP

In [21]:
# =============================================================================
# IMPORTS AND DEPENDENCIES
# =============================================================================

import json
import os
import csv
import numpy as np
from pathlib import Path
from tqdm import tqdm
import string
import pandas as pd

import matplotlib.pyplot as plt
from PIL import Image
from dotenv import load_dotenv

# LLM imports
from langchain_groq import ChatGroq

# Docling imports for VRDU OCR
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel import vlm_model_specs
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
from langchain_docling.loader import DoclingLoader

DOC VQA

In [22]:
# Dataset loading: paths for the DocVQA sample set.
data_dir = "docvqa_samples_300"
image_dir = os.path.join(data_dir, "images")
metadata_file = os.path.join(data_dir, "metadata.json")
# Build the results path with os.path.join instead of a "results\..." literal:
# "\O" is an invalid escape sequence (it triggers a SyntaxWarning), and a
# backslash is not a path separator outside Windows.
output_csv = os.path.join("results", "OCR_VRDU_results.csv")

  output_csv = "results\OCR_VRDU_results.csv"


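NEW DATASET (this cell reassigns the same path variables as the DocVQA cell above — run only the cell for the dataset you want to evaluate)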

In [None]:
# Configuration: paths for the new dataset.
data_dir = "NewDataset"
image_dir = os.path.join(data_dir, "images")
metadata_file = os.path.join(data_dir, "metadata.json")
# Build the results path with os.path.join instead of a backslash literal:
# "\O" is an invalid escape sequence (it triggers a SyntaxWarning), and a
# backslash is not a path separator outside Windows.
output_csv = os.path.join("results_NEWDATA", "OCR_VRDU_RESULTS_NEWDATASET.csv")

In [16]:
# Runtime configuration

# Identifier of the chat model; it is passed to ChatGroq as `model_name`.
GEN_MODEL_ID = "llama-3.1-8b-instant"

# Set TOKENIZERS_PARALLELISM to "false" for this process (presumably to silence
# the Hugging Face tokenizers parallelism warning — TODO confirm).
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Evaluation metrics functions
def normalize(text):
    """Return `text` lower-cased, with ASCII punctuation removed and outer whitespace trimmed.

    Internal whitespace is left as-is; only characters listed in
    `string.punctuation` are deleted.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    without_punctuation = lowered.translate(drop_punctuation)
    return without_punctuation.strip()

def exact_match(pred, ground_truths):
    """Return True when the normalized prediction equals any normalized ground truth.

    Both sides go through `normalize` before comparison. An empty
    `ground_truths` iterable yields False.
    """
    target = normalize(pred)
    for reference in ground_truths:
        if normalize(reference) == target:
            return True
    return False

def f1_score(pred, ground_truths):
    """Return the best token-level F1 between a prediction and any ground truth.

    Tokens are the whitespace-separated pieces of the `normalize`d strings.
    The overlap is counted as a multiset, so repeated tokens count once per
    matched occurrence. A set-based overlap would under-count them and score
    identical answers such as "a a" vs "a a" below 1.0.

    Args:
        pred: Predicted answer string.
        ground_truths: Iterable of acceptable answer strings.

    Returns:
        The maximum F1 in [0.0, 1.0] over all ground truths, or 0.0 when
        `ground_truths` is empty (consistent with `exact_match`, which
        returns False in that case).
    """
    # Local import keeps this function self-contained within its cell.
    from collections import Counter

    # The prediction is the same for every ground truth, so tokenize it once.
    pred_tokens = normalize(pred).split()
    pred_counts = Counter(pred_tokens)

    def score(gt):
        gt_tokens = normalize(gt).split()
        # Counter & Counter keeps the minimum count of each shared token.
        overlap = sum((pred_counts & Counter(gt_tokens)).values())
        if overlap == 0:
            return 0.0
        # overlap > 0 implies both token lists are non-empty.
        precision = overlap / len(pred_tokens)
        recall = overlap / len(gt_tokens)
        return 2 * precision * recall / (precision + recall)

    return max((score(gt) for gt in ground_truths), default=0.0)



# Read the dataset metadata (a JSON document) from `metadata_file`.
with Path(metadata_file).open("r", encoding="utf-8") as f:
    docvqa_metadata = json.load(f)

# Report how many entries were loaded.
print(f"Loaded {len(docvqa_metadata)} samples from DocVQA dataset")

Loaded 10 samples from DocVQA dataset


In [17]:
# VRDU OCR setup: configure Docling to run images through its VLM pipeline.
VLM_MODEL = vlm_model_specs.SMOLDOCLING_TRANSFORMERS
pipeline_options = VlmPipelineOptions(vlm_options=VLM_MODEL)

# Format option that routes image inputs to VlmPipeline with the options above.
image_format_option = PdfFormatOption(
    pipeline_cls=VlmPipeline,
    pipeline_options=pipeline_options,
)

doc_converter = DocumentConverter(
    format_options={InputFormat.IMAGE: image_format_option}
)

print("VRDU Document converter initialized")

VRDU Document converter initialized


Document loading, question answering, and evaluation

In [None]:
# LLM Initialization
# Load variables from a local .env file (if one exists) into the environment.
# `load_dotenv` is imported at the top of the notebook but was never called, so
# a key stored only in .env was not visible to os.getenv.
load_dotenv()

groq_api_key = os.getenv('GROQ_API_KEY')
if not groq_api_key:
    # Fail fast with a clear message: without a key every LLM call made later
    # in the notebook would fail.
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it or add it to a .env file before running this cell."
    )

llm = ChatGroq(
    groq_api_key=groq_api_key,  # Read from the environment, never hard-coded
    model_name=GEN_MODEL_ID,
    temperature=0,
    max_tokens=1024,
    timeout=60
)

print("Language Model initialized for QA")

# Main evaluation pipeline
# For every metadata entry: convert the image to markdown, ask the LLM the
# question, score the answer against the ground truths, and append one CSV row.
# Failures are logged through tqdm.write (so the progress bar stays intact) and
# counted, instead of being silently discarded.
processed_count = 0
failed_count = 0

# open(..., "w") does not create missing parent directories, so create the
# results folder up front when the path has one.
output_parent = os.path.dirname(output_csv)
if output_parent:
    os.makedirs(output_parent, exist_ok=True)

with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ["id", "image_filename", "question", "ground_truth", "ocr_content", "predicted_answer", "exact_match", "f1_score"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    em_scores = []
    f1_scores = []

    for i, sample in enumerate(tqdm(docvqa_metadata, desc="Processing documents")):
        try:
            doc_id = sample['id']
            image_filename = sample['image_filename']
            question = sample['question']
            ground_truth = sample['answers']

            image_path = os.path.join(image_dir, image_filename)
            if not os.path.exists(image_path):
                tqdm.write(f"[skip] sample {i} ({doc_id}): image not found at {image_path}")
                failed_count += 1
                continue

            # Extract OCR content using VRDU
            loader = DoclingLoader(
                file_path=[str(image_path)],
                converter=doc_converter,
                export_type="markdown"
            )

            documents = loader.load()
            if not documents or not documents[0].page_content.strip():
                tqdm.write(f"[skip] sample {i} ({doc_id}): document conversion returned no content")
                failed_count += 1
                continue

            ocr_content = documents[0].page_content.strip()

            # Improved LLM QA with better VRDU utilization and one-shot example
            prompt = f"""You are analyzing structured document content that preserves layout, tables, headers, and formatting. Use the document structure to find the precise answer.

EXAMPLE:
Document: "## Invoice Details\n| Item | Quantity | Price |\n|------|----------|-------|\n| Laptop | 2 | $1,200 |\n| Mouse | 5 | $25 |\n\n**Total: $2,525**"
Question: What is the total amount?
Answer: $2,525

Now answer this question:

DOCUMENT:
{ocr_content}

QUESTION: {question}

INSTRUCTIONS: Look at the document structure including headers, tables, lists, and formatting. Find the relevant section and extract the precise answer. Return ONLY the answer - no explanations or extra text.

ANSWER:"""

            try:
                response = llm.invoke(prompt)
                predicted_answer = str(response.content).strip()
            except Exception as llm_error:
                # Deliberate best-effort: an LLM failure is scored as an empty
                # answer, but the cause is reported so it can be diagnosed.
                tqdm.write(f"[warn] sample {i} ({doc_id}): LLM call failed: {llm_error!r}")
                predicted_answer = ""

            # Evaluation
            em = exact_match(predicted_answer, ground_truth)
            f1_val = f1_score(predicted_answer, ground_truth)

            em_scores.append(int(em))
            f1_scores.append(f1_val)

            writer.writerow({
                "id": doc_id,
                "image_filename": image_filename,
                "question": question,
                "ground_truth": " | ".join(ground_truth),
                "ocr_content": ocr_content,
                "predicted_answer": predicted_answer,
                "exact_match": em,
                "f1_score": round(f1_val, 2)
            })

            processed_count += 1

        except Exception as sample_error:
            # Keep the run going, but record which sample failed and why.
            tqdm.write(f"[error] sample {i} failed: {sample_error!r}")
            failed_count += 1

# Summary of the run; em_scores / f1_scores hold one entry per scored sample.
print(f"Successfully processed: {processed_count}/{len(docvqa_metadata)}")
print(f"Failed or skipped: {failed_count}")
if em_scores:
    print(f"Average Exact Match: {100 * sum(em_scores) / len(em_scores):.2f}%")
    print(f"Average F1 Score: {100 * sum(f1_scores) / len(f1_scores):.2f}%")
print(f"Results saved to: {output_csv}")


Language Model initialized for QA


Processing documents:   0%|          | 0/10 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Processing documents:  10%|█         | 1/10 [03:32<31:53, 212.58s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Processing documents:  20%|██        | 2/10 [03:38<12:05, 90.73s/it] The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Processing documents:  30%|

Successfully processed: 7/10
Average Exact Match: 28.57%
Average F1 Score: 45.24%
Results saved to: OCR_VRDU_RESULTS_NEWDATASET.csv





## Results Analysis

In [24]:
# Read the per-sample results back from `output_csv` and report the column
# means of the exact-match and F1 scores.
results_df = pd.read_csv(output_csv)

avg_em = results_df['exact_match'].mean()
avg_f1 = results_df['f1_score'].mean()

print(f"Average F1 Score: {avg_f1:.3f}")
print(f"Average Exact Match (EM) Score: {avg_em:.3f}")

Average F1 Score: 0.554
Average Exact Match (EM) Score: 0.494
