# Load Requirements

In [1]:
# !pip install torchmetrics

In [35]:
import json

# read json
with open('res/benchmark_data.json') as f:
    benchmark_data = json.load(f)
 
# benchmark_data

In [36]:
# Character Error rate (CER)
from torchmetrics.text import CharErrorRate
from torchmetrics.text import WordErrorRate
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.text import BLEUScore

cer = CharErrorRate()
wer = WordErrorRate()
rouge = ROUGEScore()
bleu = BLEUScore()


In [4]:
from torchmetrics.text import BLEUScore
preds = ['CIRP Journal of Manufacturing Science and Technology 48 (2024) 19-27\n\nContents lists available at ScienceDirect\n\nCIRP Journal of Manufacturing Science and Technology']
target = [['CIRP Journal of Manufacturing Science and Technology 48 (2024) 19-27\n\nContents lists available at ScienceDirect\n\n# CIRP Journal of Manufacturing Science and Technology']]
bleu = BLEUScore()
bleu(preds, target)

tensor(0.8808)

In [37]:
# Order Score
from difflib import SequenceMatcher
from scipy.stats import kendalltau, spearmanr
import re

def split_into_blocks(text):
    # Basic split: by double newlines (paragraph/section boundaries)
    blocks = [block.strip() for block in text.strip().split('\n\n') if block.strip()]
    # remove empty blocks and blocks with only whitespace
    # blocks = [block for block in blocks if block and not re.match(r'^\s*$', block)]
    return blocks

def get_block_order_indices(pred_blocks, target_blocks):
    indices = []
    indices_score = []
    for j, t_block in enumerate(target_blocks):
        sequence_score = []
        for i, p_block in enumerate(pred_blocks):
            score = SequenceMatcher(None, t_block, p_block).ratio()
            sequence_score.append(score)
        if sequence_score:
            # if max(sequence_score) > 0.5:  # 50% similar
                max_index = sequence_score.index(max(sequence_score))
                indices.append(max_index)
                indices_score.append(max(sequence_score))
    return indices, indices_score

def order_score_kendall_tau(pred_blocks, target_blocks):
    target_indices = list(range(len(target_blocks)))
    pred_indices, _ = get_block_order_indices(pred_blocks, target_blocks)
    print("Target indices:", target_indices)
    print("Pred indices:", pred_indices)
    print("Pred indices score:", _)
    print()
    
    if len(pred_indices) < 2:
        return 0.0, 0.0  # not enough data to compare order
    
    tau, _ = kendalltau(target_indices, pred_indices)
    spearman = spearmanr(target_indices, pred_indices)[0]
    return max(tau, 0), max(spearman, 0)  # ensure non-negative

def order_score_predict(pred_text, target_text):
    pred_blocks = split_into_blocks(pred_text)
    target_blocks = split_into_blocks(target_text)

    tau_score, spearman_score = order_score_kendall_tau(pred_blocks, target_blocks)

    return tau_score, spearman_score

In [38]:
# benchmarking function
def benchmark(benchmark_data, parser_res):
    cer_scores = []
    wer_scores = []
    rouge_scores = []
    bleu_scores = []
    tau_scores = []
    spearman_scores = []
    for i, page in enumerate(benchmark_data['pages']):
        print(f"Page {i+1}")
        target = page['markdown']
        preds = parser_res['pages'][i]['markdown']

        # cer score
        cer_score = cer(preds=preds, target=target).item()

        # wer score
        wer_score = wer(preds=preds, target=target).item()

        # rouge score
        rouge_score = rouge(preds=preds, target=target)
        rougeL_fmeasure = rouge_score["rougeL_fmeasure"].item()
        
        # bleu score
        bleu_score = bleu(preds=[preds], target=[[target]]).item()

        # order score
        tau_score, spearman_score = order_score_predict(preds, target)
        tau_score = float(tau_score)
        spearman_score = float(spearman_score)

        cer_scores.append(round(cer_score, 4))
        wer_scores.append(round(wer_score, 4))
        rouge_scores.append(round(rougeL_fmeasure, 4))
        bleu_scores.append(round(bleu_score, 4))
        tau_scores.append(round(tau_score, 4))
        spearman_scores.append(round(spearman_score, 4))

        print(f"CER: {cer_score:.4f}")
        print(f"WER: {wer_score:.4f}")
        print(f"ROUGE: {rougeL_fmeasure:.4f}")
        print(f"BLEU: {bleu_score:.4f}")
        print(f"Order Score (Kendall Tau): {tau_score:.4f}")
        print(f"Order Score (Spearman): {spearman_score:.4f}")
        print()

    # calculate average scores
    avg_cer = sum(cer_scores) / len(cer_scores)
    avg_wer = sum(wer_scores) / len(wer_scores)
    avg_rouge = sum(rouge_scores) / len(rouge_scores)
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_tau = sum(tau_scores) / len(tau_scores)
    avg_spearman = sum(spearman_scores) / len(spearman_scores)
    
    print(f"CER List: {cer_scores}")
    print(f"Average CER: {avg_cer:.4f}")
    print(f"WER List: {wer_scores}")
    print(f"Average WER: {avg_wer:.4f}")
    print(f"ROUGE List: {rouge_scores}")
    print(f"Average ROUGE: {avg_rouge:.4f}")
    print(f"BLEU List: {bleu_scores}")
    print(f"Average BLEU: {avg_bleu:.4f}")
    print(f"Order Score (Kendall Tau) List: {tau_scores}")
    print(f"Average Order Score (Kendall Tau): {avg_tau:.4f}")
    print(f"Order Score (Spearman) List: {spearman_scores}")
    print(f"Average Order Score (Spearman): {avg_spearman:.4f}")

# LlamaParse

In [7]:
# read json
with open('res/llamaparse_res.json') as f:
    llamaparse_res = json.load(f)

In [8]:
# benchmarking llamaparse
benchmark(benchmark_data, llamaparse_res)

Page 1
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Pred indices: [7, 0, 1, 2, 3, 7, 4, 7, 5, 6, 0, 7, 8, 9, 10, 0]
Pred indices score: [0.37037037037037035, 1.0, 1.0, 0.9787234042553191, 1.0, 0.5079365079365079, 0.9162790697674419, 0.35294117647058826, 0.958904109589041, 1.0, 0.35051546391752575, 0.9310344827586207, 1.0, 1.0, 0.9046454767726161, 0.35051546391752575]

CER: 0.0389
WER: 0.0457
ROUGE: 0.9785
BLEU: 0.9336
Order Score (Kendall Tau): 0.3726
Order Score (Spearman): 0.3492

Page 2
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Pred indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Pred indices score: [1.0, 1.0, 0.9811320754716981, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9966666666666667]

CER: 0.0006
WER: 0.0027
ROUGE: 1.0000
BLEU: 0.9932
Order Score (Kendall Tau): 1.0000
Order Score (Spearman): 1.0000

Page 3
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
Pred indices: [5,

# Mistral OCR

In [9]:
# read json
with open('res/mistralocr_res.json') as f:
    mistralocr_res = json.load(f)
 
benchmark(benchmark_data, mistralocr_res)

Page 1
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Pred indices: [10, 0, 1, 2, 4, 6, 7, 6, 9, 14, 14, 10, 12, 13, 14, 14]
Pred indices score: [0.35714285714285715, 1.0, 0.9965277777777778, 1.0, 0.9206349206349206, 0.6274509803921569, 0.9330143540669856, 0.41025641025641024, 0.8383233532934131, 0.22442244224422442, 1.0, 1.0, 0.969626168224299, 0.9649289099526066, 0.15212527964205816, 1.0]

CER: 0.2753
WER: 0.2329
ROUGE: 0.9112
BLEU: 0.7354
Order Score (Kendall Tau): 0.7073
Order Score (Spearman): 0.8027

Page 2
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Pred indices: [0, 8, 0, 8, 0, 8, 1, 2, 3, 4, 5, 6, 7, 8, 8]
Pred indices score: [0.32786885245901637, 0.2328767123287671, 0.3888888888888889, 0.19736842105263158, 1.0, 0.2810810810810811, 0.7380675203725262, 0.7692307692307693, 1.0, 0.8148148148148148, 0.9984726161902684, 1.0, 0.9979135618479881, 0.8854166666666666, 0.16790123456790124]

CER: 0.1322
WER: 0.1562
ROUGE: 0.9147
BLEU: 0.8409

# Pipeline 1 (Yolo + GOT-OCR2 + Gemma 3)

In [10]:
# read json
with open('res/pipeline_1_res.json') as f:
    pipeline_1_res = json.load(f)

# Benchmarking pipeline_1
benchmark(benchmark_data, pipeline_1_res)

Page 1
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Pred indices: [0, 0, 1, 2, 2, 2, 4, 7, 4, 10, 7, 7, 5, 6, 8, 7]
Pred indices score: [0.24444444444444444, 0.9921259842519685, 0.9169329073482428, 0.4691358024691358, 0.24444444444444444, 0.38235294117647056, 0.47990255785627284, 0.375, 0.7164790174002047, 0.6274509803921569, 0.5866666666666667, 0.3380281690140845, 0.9551912568306011, 0.9506545820745217, 0.37774524158125916, 0.5866666666666667]

CER: 0.2011
WER: 0.2192
ROUGE: 0.9167
BLEU: 0.8486
Order Score (Kendall Tau): 0.7082
Order Score (Spearman): 0.8393

Page 2
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Pred indices: [0, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 10]
Pred indices score: [0.3252032520325203, 0.25, 0.3853211009174312, 0.19047619047619047, 0.9908256880733946, 0.9182389937106918, 0.9240121580547113, 0.6, 0.7, 0.2, 0.3512513601741023, 0.2857142857142857, 0.2594142259414226, 0.3453815261044177, 0.7144866385372715]

CER: 0.

# Pipeline 2 (Yolo + Universal.io + Gemma 3 + Gemma 3)

In [11]:
# read json
with open('res/pipeline_2_res.json') as f:
    pipeline_2_res = json.load(f)

# Benchmarking pipeline_2
benchmark(benchmark_data, pipeline_2_res)

Page 1
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Pred indices: [0, 1, 2, 3, 5, 7, 8, 7, 13, 19, 11, 14, 16, 18, 9, 11]
Pred indices score: [0.7659574468085106, 0.9133858267716536, 0.9965397923875432, 0.9896907216494846, 0.9160997732426304, 0.6274509803921569, 0.937799043062201, 0.41025641025641024, 0.450354609929078, 0.47593582887700536, 1.0, 0.8852459016393442, 0.9672131147540983, 0.338368580060423, 0.463768115942029, 1.0]

CER: 0.2987
WER: 0.3470
ROUGE: 0.8886
BLEU: 0.7561
Order Score (Kendall Tau): 0.7059
Order Score (Spearman): 0.8262

Page 2
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Pred indices: [0, 2, 0, 4, 6, 7, 8, 10, 12, 11, 13, 14, 15, 17, 20]
Pred indices score: [0.8524590163934426, 0.9647058823529412, 1.0, 0.6615384615384615, 0.9454545454545454, 0.9, 0.9907692307692307, 0.7741935483870968, 0.7777777777777778, 0.6956521739130435, 0.9984726161902684, 1.0, 0.9994029850746269, 0.84, 0.336734693877551]

CER: 0.1032
WER: 0.07

# Pipeline 3 (Yolo + Universal.io + Gemma 3 + phi-4)

In [12]:
# with open('res/pipeline_4_res.json') as f:
#     pipeline_4_res = json.load(f)

# pipeline_4_cer, pipeline_4_wer = benchmark(benchmark_data, pipeline_4_res)

# print(f"CER List: {pipeline_4_cer}")
# print(f"WER List: {pipeline_4_wer}")
# print()
# print(f"Average CER: {sum(pipeline_4_cer)/len(pipeline_4_cer):.4f}")
# print(f"Average WER: {sum(pipeline_4_wer)/len(pipeline_4_wer):.4f}")

# Pipeline 4 (Yolo + Universal.io + Qwen 2.5 VL + Qwen 2.5)

In [13]:
with open('res/pipeline_4_res.json') as f:
    pipeline_4_res = json.load(f)

benchmark(benchmark_data, pipeline_4_res)

Page 1
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Pred indices: [0, 1, 2, 3, 5, 6, 8, 9, 11, 11, 9, 13, 15, 17, 10, 9]
Pred indices score: [0.8, 0.967741935483871, 0.8102564102564103, 0.967741935483871, 0.9029345372460497, 0.5882352941176471, 0.7728459530026109, 0.9565217391304348, 0.717948717948718, 0.1156773211567732, 0.38596491228070173, 0.9473684210526315, 0.9694835680751174, 0.3620689655172414, 0.16044260027662519, 0.38596491228070173]

CER: 0.4095
WER: 0.4475
ROUGE: 0.8435
BLEU: 0.6339
Order Score (Kendall Tau): 0.7628
Order Score (Spearman): 0.8517

Page 2
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Pred indices: [0, 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 11, 12, 1, 5]
Pred indices score: [1.0, 1.0, 0.9818181818181818, 0.9038461538461539, 0.9818181818181818, 0.6938775510204082, 0.9506726457399103, 0.967741935483871, 0.7611940298507462, 0.9166666666666666, 0.9943231441048035, 0.972972972972973, 0.8648170011806375, 0.234375, 0.125]

CER: 

# Pipeline 5 (Yolo + Universal.io + Qwen 2.5 VL + Gemma 3)

In [40]:
with open('res/pipeline_5_res.json') as f:
    pipeline_5_res = json.load(f)

benchmark(benchmark_data, pipeline_5_res)

Page 1
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
Pred indices: [0, 15, 1, 2, 3, 4, 5, 6, 7, 8, 7, 13, 19, 11, 14, 15, 16, 17, 18, 19, 11]
Pred indices score: [0.926829268292683, 0.08695652173913043, 0.9838709677419355, 1.0, 0.9896907216494846, 0.9206349206349206, 0.9975429975429976, 0.8484848484848485, 1.0, 0.9425837320574163, 0.41025641025641024, 0.450354609929078, 0.5411764705882353, 1.0, 0.9836065573770492, 0.8947368421052632, 0.9975961538461539, 0.9459459459459459, 0.7953830010493179, 0.3585237258347979, 1.0]

CER: 0.2942
WER: 0.3059
ROUGE: 0.8883
BLEU: 0.8160
Order Score (Kendall Tau): 0.7116
Order Score (Spearman): 0.7724

Page 2
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
Pred indices: [0, 1, 0, 3, 4, 5, 7, 9, 11, 10, 12, 13, 14, 15, 16, 17, 18, 19]
Pred indices score: [0.8524590163934426, 0.7671232876712328, 1.0, 0.5, 0.9818181818181818, 0.8888888888888888, 0.5826086956521739, 0.9375, 0.77777777

In [56]:
from difflib import SequenceMatcher
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics.cluster import adjusted_rand_score
import re

def split_into_blocks(text):
    # Basic split: by double newlines (paragraph/section boundaries)
    blocks = [block.strip() for block in text.strip().split('\n\n') if block.strip()]
    return blocks

def block_content_recall(pred_blocks, target_blocks):
    matched = 0
    for t_block in target_blocks:
        for p_block in pred_blocks:
            if SequenceMatcher(None, t_block, p_block).ratio() > 0.75:  # 85% similar
                matched += 1
                break
    return matched / len(target_blocks) if target_blocks else 1.0

def get_block_order_indices(pred_blocks, target_blocks):
    indices = []
    indices_score = []
    for i, p_block in enumerate(pred_blocks):
        sequence_score = []
        for j, t_block in enumerate(target_blocks):
            score = SequenceMatcher(None, t_block, p_block).ratio()
            sequence_score.append(score)
            # print()
            # print("Target block:", t_block)
            # print("Pred blocks:", p_block)
            # print("Score:", score)
            # print("Pred block index:", j)
            # print()
        print("Sequence score:", sequence_score)
        print("Max sequence score:", max(sequence_score))
        print("Max sequence score index:", sequence_score.index(max(sequence_score)))
        print()
        if sequence_score:
            # if max(sequence_score) > 0.5:  # 50% similar
                max_index = sequence_score.index(max(sequence_score))
                indices.append(max_index)
                indices_score.append(max(sequence_score))
    return indices, indices_score

def order_score_kendall_tau(pred_blocks, target_blocks):
    target_indices = list(range(len(target_blocks)))
    len_indices = min(len(pred_blocks), len(target_blocks))
    pred_indices, _ = get_block_order_indices(pred_blocks[:len_indices], target_blocks[:len_indices])
    # pred_indices, _ = get_block_order_indices(pred_blocks, target_blocks)
    print("Target indices:", target_indices)
    print("Pred indices:", pred_indices)
    print("Pred indices score:", _)
    print()
    
    if len(pred_indices) < 2:
        return 0.0  # not enough data to compare order
    tau, _ = kendalltau(target_indices[:len(pred_indices)], pred_indices)
    return max(tau, 0)  # tau can be negative

    # stat = spearmanr(target_indices[:len(pred_indices)], pred_indices)
    # return max(stat.statistic, 0)  # stat.correlation can be negative
    # print("Spearman statistic:", stat.statistic)

    # score = adjusted_rand_score(target_indices[:len(pred_indices)], pred_indices)
    # return max(score, 0)  # score can be negative

def evaluate_prediction(pred_text, target_text):
    pred_blocks = split_into_blocks(pred_text)
    target_blocks = split_into_blocks(target_text)
    # print("Length of target blocks:", len(target_blocks))
    # print("Length of pred blocks:", len(pred_blocks))
    print("Target blocks:", target_blocks[11])
    print("Pred blocks:", pred_blocks[17])
    print(SequenceMatcher(None, target_blocks[1], pred_blocks[0]).ratio())

    recall = block_content_recall(pred_blocks, target_blocks)
    order = order_score_kendall_tau(pred_blocks, target_blocks)

    final_score = 0.5 * (recall +  order)
    return {
        # "content_recall": round(recall, 3),
        "order_score": round(order, 3),
        # "final_score": round(final_score, 3)
    }

# Example usage:
idx = 2
prediction = pipeline_5_res['pages'][idx]['markdown']
target = benchmark_data['pages'][idx]['markdown']

results = evaluate_prediction(prediction, target)
print("Evaluation Results:")
for k, v in results.items():
    print(f"{k}: {v}")

Target blocks: *p < 0.05
**p < 0.01
Pred blocks: knowledge reusability had a statistically significant positive correlation with short-term retention score (rpb = 0.672, n = 13, p = 0.012), but not for long-term retention scores (rpb = 0.466, n = 13, p = 0.108). However, when knowledge retention between short- and long-term was analyzed, Pearsonâ€™s correlation showed that there was a positive cor- relation between both, which was statistically significant (rp = 0.717, n= 13, p = 0.006). Fig. 6 illustrates the comparison for the success rate of wiring a second sensor by comparing the traditional AR, Enhanced AR and Paper based approaches, achieving 50%, 80% and 25% respectively.
0.11494252873563218
Sequence score: [1.0, 0.11494252873563218, 0.07194244604316546, 0.33962264150943394, 0.07563025210084033, 0.35294117647058826, 0.16129032258064516, 0.24, 0.11162790697674418, 0.17721518987341772, 0.01639344262295082, 0.15384615384615385, 0.021406727828746176, 0.22857142857142856, 0.014825796

In [39]:
with open('res/pipeline_5_2_res.json') as f:
    pipeline_5_2_res = json.load(f)

benchmark(benchmark_data, pipeline_5_2_res)

Page 1
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
Pred indices: [0, 15, 1, 2, 3, 4, 5, 6, 7, 8, 7, 13, 9, 11, 14, 15, 16, 17, 18, 9, 11]
Pred indices score: [0.95, 0.08695652173913043, 0.9838709677419355, 1.0, 1.0, 0.9206349206349206, 0.9975429975429976, 0.8484848484848485, 1.0, 0.9425837320574163, 0.41025641025641024, 0.450354609929078, 0.5411764705882353, 1.0, 1.0, 0.8947368421052632, 0.9975961538461539, 0.9459459459459459, 0.7953830010493179, 0.3585237258347979, 1.0]

CER: 0.2297
WER: 0.2260
ROUGE: 0.9134
BLEU: 0.8233
Order Score (Kendall Tau): 0.6923
Order Score (Spearman): 0.7347

Page 2
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
Pred indices: [0, 1, 0, 3, 4, 5, 7, 9, 11, 10, 12, 13, 14, 15, 16, 17, 18, 19]
Pred indices score: [0.8524590163934426, 0.7671232876712328, 1.0, 0.5, 0.9818181818181818, 0.8888888888888888, 0.5826086956521739, 0.9375, 0.7777777777777778, 0.6956521739130435, 0.9984732824427