# Load Requrements

In [9]:
import json
from IPython.display import Markdown, display

In [10]:
# read json file
with open('json_res/benchmark_data.jsonl') as f:
    benchmark_data = json.load(f)

with open('json_res/llamaparse_res.jsonl') as f:
    llamaparse_res = json.load(f)

with open('json_res/mistralocr_res.jsonl') as f:
    mistralocr_res = json.load(f)

In [11]:
!pip install torchmetrics jiwer



In [12]:
# Character Error rate (CER)
from torchmetrics.text import CharErrorRate
import jiwer


cer = CharErrorRate()

transforms = jiwer.Compose(
    [
        jiwer.ExpandCommonEnglishContractions(),
        jiwer.RemoveEmptyStrings(),
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.RemovePunctuation(),
        jiwer.ReduceToListOfListOfWords(),
    ]
)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# benchmarking function
def benchmark(benchmark_data, parser_res):
    cer_scores = []
    wer_scores = []
    for i, page in enumerate(benchmark_data['pages']):
        print(f"Page {i+1}")
        target = page['markdown']
        preds = parser_res['pages'][i]['markdown']

        # cer score
        cer_score = cer(preds=preds, target=target).item()

        # wer score
        wer_score = jiwer.wer(reference=target, hypothesis=preds, truth_transform=transforms, hypothesis_transform=transforms)

        cer_scores.append(cer_score)
        wer_scores.append(wer_score)

        print(f"CER: {cer_score:.4f}")
        print(f"WER: {wer_score:.4f}")
        print()
    return cer_scores, wer_scores

In [14]:
# benchmarking function
def benchmark_text(benchmark_data, parser_res):
    cer_scores = []
    wer_scores = []
    for i, page in enumerate(benchmark_data['pages']):
        print(f"Page {i+1}")
        target = page['markdown']
        preds = parser_res['pages'][i]['text']

        # cer score
        cer_score = cer(preds=preds, target=target).item()

        # wer score
        wer_score = jiwer.wer(reference=target, hypothesis=preds, truth_transform=transforms, hypothesis_transform=transforms)

        cer_scores.append(cer_score)
        wer_scores.append(wer_score)

        print(f"CER: {cer_score:.4f}")
        print(f"WER: {wer_score:.4f}")
        print()
    return cer_scores, wer_scores

# LlamaParse

In [25]:
# benchmarking llamaparse
llamaparse_cer, llamaparse_wer = benchmark(benchmark_data, llamaparse_res)

print(f"CER List: {llamaparse_cer}")
print(f"WER List: {llamaparse_wer}")
print()
print(f"Average CER: {sum(llamaparse_cer)/len(llamaparse_cer):.4f}")
print(f"Average WER: {sum(llamaparse_wer)/len(llamaparse_wer):.4f}")

Page 1
CER: 0.0111
WER: 0.0074

Page 2
CER: 0.0015
WER: 0.0000

Page 3
CER: 0.6966
WER: 0.5180

CER List: [0.011148648336529732, 0.0015455950051546097, 0.6966218948364258]
WER List: [0.007444168734491315, 0.0, 0.5180327868852459]

Average CER: 0.2364
Average WER: 0.1752


# Mistral OCR

In [26]:
# benchmarking mistralocr
mistralocr_cer, mistralocr_wer = benchmark(benchmark_data, mistralocr_res)

print(f"CER List: {mistralocr_cer}")
print(f"WER List: {mistralocr_wer}")
print()
print(f"Average CER: {sum(mistralocr_cer)/len(mistralocr_cer):.4f}")
print(f"Average WER: {sum(mistralocr_wer)/len(mistralocr_wer):.4f}")

Page 1
CER: 0.2666
WER: 0.2184

Page 2
CER: 0.1339
WER: 0.1566

Page 3
CER: 0.2679
WER: 0.2328

CER List: [0.266554057598114, 0.1338871717453003, 0.2678648829460144]
WER List: [0.21836228287841192, 0.15664335664335666, 0.23278688524590163]

Average CER: 0.2228
Average WER: 0.2026


# Pipeline 1 (Yolo + GOT-OCR2 + Gemma 3)

In [28]:
with open('json_res/pipeline_1_res.jsonl') as f:
    pipeline_1_res = json.load(f)

pipeline_1_cer, pipeline_1_wer = benchmark(benchmark_data, pipeline_1_res)

print(f"CER List: {pipeline_1_cer}")
print(f"WER List: {pipeline_1_wer}")
print()
print(f"Average CER: {sum(pipeline_1_cer)/len(pipeline_1_cer):.4f}")
print(f"Average WER: {sum(pipeline_1_wer)/len(pipeline_1_wer):.4f}")

Page 1
CER: 0.1932
WER: 0.2035

Page 2
CER: 0.3752
WER: 0.5091

Page 3
CER: 0.2150
WER: 0.2902

CER List: [0.19324325025081635, 0.37519320845603943, 0.21502815186977386]
WER List: [0.20347394540942929, 0.509090909090909, 0.2901639344262295]

Average CER: 0.2612
Average WER: 0.3342


# Pipeline 2 (Yolo + Universal.io + Gemma 3) - With Formating VLM

In [15]:
with open('json_res/pipeline_2_res.jsonl') as f:
    pipeline_2_res = json.load(f)

pipeline_2_cer, pipeline_2_wer = benchmark(benchmark_data, pipeline_2_res)

print(f"CER List: {pipeline_2_cer}")
print(f"WER List: {pipeline_2_wer}")
print()
print(f"Average CER: {sum(pipeline_2_cer)/len(pipeline_2_cer):.4f}")
print(f"Average WER: {sum(pipeline_2_wer)/len(pipeline_2_wer):.4f}")

Page 1
CER: 0.3922
WER: 0.3970

Page 2
CER: 0.0330
WER: 0.0392

Page 3
CER: 0.8010
WER: 0.9000

CER List: [0.39222973585128784, 0.03303709253668785, 0.8009961247444153]
WER List: [0.3970223325062035, 0.039160839160839164, 0.9]

Average CER: 0.4088
Average WER: 0.4454


# Pipeline 2 (Yolo + Universal.io + Gemma 3) - Only Text

In [16]:
with open('json_res/pipeline_2_res.jsonl') as f:
    pipeline_2_res = json.load(f)

pipeline_2_cer, pipeline_2_wer = benchmark_text(benchmark_data, pipeline_2_res)

print(f"CER List: {pipeline_2_cer}")
print(f"WER List: {pipeline_2_wer}")
print()
print(f"Average CER: {sum(pipeline_2_cer)/len(pipeline_2_cer):.4f}")
print(f"Average WER: {sum(pipeline_2_wer)/len(pipeline_2_wer):.4f}")

Page 1
CER: 0.2608
WER: 0.2779

Page 2
CER: 0.0195
WER: 0.0587

Page 3
CER: 0.1849
WER: 0.1459

CER List: [0.26081082224845886, 0.019513137638568878, 0.1849285364151001]
WER List: [0.27791563275434245, 0.05874125874125874, 0.14590163934426228]

Average CER: 0.1551
Average WER: 0.1609
