# Load Requirements

In [None]:
# !pip install torchmetrics jiwer

In [1]:
import json

In [12]:
# Read the reference pages that every parser is scored against.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (e.g. cp1252 on Windows).
with open('res/benchmark_data.json', encoding='utf-8') as f:
    benchmark_data = json.load(f)
 
# benchmark_data

In [13]:
# Character Error rate (CER)
from torchmetrics.text import CharErrorRate
import jiwer


# Shared character-error-rate metric; `benchmark` calls it once per page with
# the raw predicted and reference strings.
cer = CharErrorRate()

# Text normalisation pipeline handed to `jiwer.wer` for BOTH the reference and
# the hypothesis, so the two sides are always normalised identically.
# The steps are listed in the order they are given to `jiwer.Compose`; keep
# that order in mind when adding or moving a step.
# NOTE(review): none of the steps below obviously turns single newlines into
# word boundaries, so words separated only by a line break in the markdown may
# be counted as one token — confirm against the jiwer transform docs.
transforms = jiwer.Compose(
    [
        jiwer.ExpandCommonEnglishContractions(),
        jiwer.RemoveEmptyStrings(),
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        # Runs after the whitespace clean-up above — presumably any gaps it
        # leaves are dropped by the final word split; verify if scores look off.
        jiwer.RemovePunctuation(),
        # Final step: produce the list-of-list-of-words form.
        jiwer.ReduceToListOfListOfWords(),
    ]
)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# benchmarking function
def benchmark(benchmark_data, parser_res):
    """Score a parser's per-page markdown against the benchmark reference.

    Pages are paired by position: the i-th benchmark page is compared with the
    i-th parser page. For each pair the character error rate (via the
    module-level ``cer`` metric) and the word error rate (via ``jiwer.wer``
    with the module-level ``transforms`` pipeline) are computed and printed.

    Args:
        benchmark_data: Mapping with a ``'pages'`` list; each page holds the
            reference text under ``'markdown'``.
        parser_res: Mapping with the same layout holding the parser output.
            Pages beyond the number of benchmark pages are ignored.

    Returns:
        A ``(cer_scores, wer_scores)`` tuple of two lists with one entry per
        benchmark page, each value rounded to 4 decimal places.

    Raises:
        IndexError: If ``parser_res`` contains fewer pages than
            ``benchmark_data``.
    """
    reference_pages = benchmark_data['pages']
    parsed_pages = parser_res['pages']

    # Fail fast with a clear message instead of crashing half-way through the
    # loop (after partial output) when the parser result is missing pages.
    if len(parsed_pages) < len(reference_pages):
        raise IndexError(
            f"parser_res has {len(parsed_pages)} page(s) but benchmark_data has "
            f"{len(reference_pages)}; every benchmark page needs a parsed counterpart"
        )

    cer_scores = []
    wer_scores = []
    for page_number, (reference_page, parsed_page) in enumerate(
        zip(reference_pages, parsed_pages), start=1
    ):
        print(f"Page {page_number}")
        target = reference_page['markdown']
        preds = parsed_page['markdown']

        # Character error rate, computed on the raw (un-normalised) strings.
        cer_score = cer(preds=preds, target=target).item()

        # Word error rate, with the same normalisation applied to both sides.
        # `reference_transform` replaces the deprecated `truth_transform`
        # keyword and matches the `reference=` argument already used here.
        wer_score = jiwer.wer(
            reference=target,
            hypothesis=preds,
            reference_transform=transforms,
            hypothesis_transform=transforms,
        )

        # The returned lists hold rounded values; the printout below formats
        # the unrounded scores to the same 4 decimal places.
        cer_scores.append(round(cer_score, 4))
        wer_scores.append(round(wer_score, 4))

        print(f"CER: {cer_score:.4f}")
        print(f"WER: {wer_score:.4f}")
        print()
    return cer_scores, wer_scores

# LlamaParse

In [18]:
# Read the LlamaParse output to be scored.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (e.g. cp1252 on Windows).
with open('res/llamaparse_res.json', encoding='utf-8') as f:
    llamaparse_res = json.load(f)
 
# llamaparse_res

In [19]:
# Score LlamaParse page by page; `benchmark` prints the per-page metrics.
llamaparse_cer, llamaparse_wer = benchmark(benchmark_data, llamaparse_res)

# Per-page score lists first, then the unweighted mean over all pages.
print(f"CER List: {llamaparse_cer}")
print(f"WER List: {llamaparse_wer}")

llamaparse_avg_cer = sum(llamaparse_cer) / len(llamaparse_cer)
print(f"Average CER: {llamaparse_avg_cer:.4f}")

llamaparse_avg_wer = sum(llamaparse_wer) / len(llamaparse_wer)
print(f"Average WER: {llamaparse_avg_wer:.4f}")

Page 1
CER: 0.0122
WER: 0.0099

Page 2
CER: 0.0006
WER: 0.0014

Page 3
CER: 0.6014
WER: 0.5201

Page 4
CER: 0.0163
WER: 0.0030

Page 5
CER: 0.6048
WER: 0.6636

Page 6
CER: 0.0059
WER: 0.0167

CER List: [0.0122, 0.0006, 0.6014, 0.0163, 0.6048, 0.0059]
WER List: [0.0099, 0.0014, 0.5201, 0.003, 0.6636, 0.0167]
Average CER: 0.2069
Average WER: 0.2024


# Mistral OCR

In [20]:
# Read the Mistral OCR output to be scored.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (e.g. cp1252 on Windows).
with open('res/mistralocr_res.json', encoding='utf-8') as f:
    mistralocr_res = json.load(f)
 
# mistralocr_res

In [21]:
# Score Mistral OCR page by page; `benchmark` prints the per-page metrics.
mistralocr_cer, mistralocr_wer = benchmark(benchmark_data, mistralocr_res)

# Per-page score lists first, then the unweighted mean over all pages.
print(f"CER List: {mistralocr_cer}")
print(f"WER List: {mistralocr_wer}")

mistralocr_avg_cer = sum(mistralocr_cer) / len(mistralocr_cer)
print(f"Average CER: {mistralocr_avg_cer:.4f}")

mistralocr_avg_wer = sum(mistralocr_wer) / len(mistralocr_wer)
print(f"Average WER: {mistralocr_avg_wer:.4f}")

Page 1
CER: 0.2643
WER: 0.2184

Page 2
CER: 0.1322
WER: 0.1580

Page 3
CER: 0.2798
WER: 0.2488

Page 4
CER: 0.0442
WER: 0.0838

Page 5
CER: 0.1493
WER: 0.3822

Page 6
CER: 0.6077
WER: 0.6517

CER List: [0.2643, 0.1322, 0.2798, 0.0442, 0.1493, 0.6077]
WER List: [0.2184, 0.158, 0.2488, 0.0838, 0.3822, 0.6517]
Average CER: 0.2462
Average WER: 0.2905


# Pipeline 1 (Yolo + GOT-OCR2 + Gemma 3)

In [22]:
# Read the Pipeline 1 output to be scored.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (e.g. cp1252 on Windows).
with open('res/pipeline_1_res.json', encoding='utf-8') as f:
    pipeline_1_res = json.load(f)
 
# pipeline_1_res

In [23]:
# Score Pipeline 1 page by page; `benchmark` prints the per-page metrics.
pipeline_1_cer, pipeline_1_wer = benchmark(benchmark_data, pipeline_1_res)

# Per-page score lists first, then the unweighted mean over all pages.
print(f"CER List: {pipeline_1_cer}")
print(f"WER List: {pipeline_1_wer}")

pipeline_1_avg_cer = sum(pipeline_1_cer) / len(pipeline_1_cer)
print(f"Average CER: {pipeline_1_avg_cer:.4f}")

pipeline_1_avg_wer = sum(pipeline_1_wer) / len(pipeline_1_wer)
print(f"Average WER: {pipeline_1_avg_wer:.4f}")

Page 1
CER: 0.1884
WER: 0.2159

Page 2
CER: 0.3745
WER: 0.5091

Page 3
CER: 0.2224
WER: 0.2809

Page 4
CER: 0.4662
WER: 0.4656

Page 5
CER: 0.6646
WER: 0.7483

Page 6
CER: 0.9830
WER: 0.9950

CER List: [0.1884, 0.3745, 0.2224, 0.4662, 0.6646, 0.983]
WER List: [0.2159, 0.5091, 0.2809, 0.4656, 0.7483, 0.995]
Average CER: 0.4832
Average WER: 0.5358


# Pipeline 2 (Yolo + Universal.io + Gemma 3 + Gemma 3)

In [30]:
# Read the Pipeline 2 output to be scored.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (e.g. cp1252 on Windows).
with open('res/pipeline_2_res.json', encoding='utf-8') as f:
    pipeline_2_res = json.load(f)
 
# pipeline_2_res

In [31]:
# Score Pipeline 2 page by page; `benchmark` prints the per-page metrics.
pipeline_2_cer, pipeline_2_wer = benchmark(benchmark_data, pipeline_2_res)

# Per-page score lists first, then the unweighted mean over all pages.
print(f"CER List: {pipeline_2_cer}")
print(f"WER List: {pipeline_2_wer}")

pipeline_2_avg_cer = sum(pipeline_2_cer) / len(pipeline_2_cer)
print(f"Average CER: {pipeline_2_avg_cer:.4f}")

pipeline_2_avg_wer = sum(pipeline_2_wer) / len(pipeline_2_wer)
print(f"Average WER: {pipeline_2_avg_wer:.4f}")

Page 1
CER: 0.2914
WER: 0.2978

Page 2
CER: 0.1032
WER: 0.0657

Page 3
CER: 0.2228
WER: 0.1862

Page 4
CER: 0.3530
WER: 0.3039

Page 5
CER: 0.6337
WER: 0.7918

Page 6
CER: 0.2132
WER: 0.1900

CER List: [0.2914, 0.1032, 0.2228, 0.353, 0.6337, 0.2132]
WER List: [0.2978, 0.0657, 0.1862, 0.3039, 0.7918, 0.19]
Average CER: 0.3029
Average WER: 0.3059


# Pipeline 3 (Yolo + Universal.io + Gemma 3 + phi-4)

In [26]:
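# TODO: Pipeline 3 has not been benchmarked yet. The template below was copied from
# the Pipeline 2 cell: it still uses the pipeline_2_* names, a different path
# ('json_res/...jsonl') and a `benchmark_text` helper that is not defined in the
# cells above. Adapt all three before enabling it.
# with open('json_res/pipeline_2_res.jsonl') as f: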
#     pipeline_2_res = json.load(f)

# pipeline_2_cer, pipeline_2_wer = benchmark_text(benchmark_data, pipeline_2_res)

# print(f"CER List: {pipeline_2_cer}")
# print(f"WER List: {pipeline_2_wer}")
# print()
# print(f"Average CER: {sum(pipeline_2_cer)/len(pipeline_2_cer):.4f}")
# print(f"Average WER: {sum(pipeline_2_wer)/len(pipeline_2_wer):.4f}")