# Load Requirements

In [2]:
# !pip install torchmetrics python-Levenshtein

In [3]:
import json

# read json
with open('res/benchmark_data.json') as f:
    benchmark_data = json.load(f)
 
# benchmark_data

In [4]:
# Character Error rate (CER)
from torchmetrics.text import CharErrorRate
from torchmetrics.text import WordErrorRate
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.text import BLEUScore
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import single_meteor_score
import Levenshtein

cer = CharErrorRate()
wer = WordErrorRate()
rouge = ROUGEScore()
bleu = BLEUScore()


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
hypothesis = 'CIRP Journal of Manufacturing Science and Technology 48 (2024) 19-27\n\nContents lists available at ScienceDirect\n\nCIRP Journal of Manufacturing Science and Technology'
reference = 'CIRP Journal of Manufacturing Science and Technology 48 (2024) 19-27\n\nContents lists available at ScienceDirect\n\n# CIRP Journal of Manufacturing Science and Technology'

# rouge score
rouge_score= rouge(preds=hypothesis, target=reference)

# bleu score
bleu_score = bleu(preds=[hypothesis], target=[[reference]]).item()

# meteor score
meteor_score = single_meteor_score(word_tokenize(reference), word_tokenize(hypothesis))

# print scores
print(f"Rouge Score: ")
for key, value in rouge_score.items():
    print(f"{key}: {value}")

print(f"\nBLEU Score: {bleu_score}")
print(f"\nMETEOR Score: {meteor_score}")

Rouge Score: 
rouge1_fmeasure: 1.0
rouge1_precision: 1.0
rouge1_recall: 1.0
rouge2_fmeasure: 1.0
rouge2_precision: 1.0
rouge2_recall: 1.0
rougeL_fmeasure: 1.0
rougeL_precision: 1.0
rougeL_recall: 1.0
rougeLsum_fmeasure: 1.0
rougeLsum_precision: 1.0
rougeLsum_recall: 1.0

BLEU Score: 0.8807735443115234

METEOR Score: 0.9635765283355645


In [6]:
hypothesis = "My country economy at this season keeps escaping from the shadow of business though holds a crude oil high, so on an unstable element that continues still, and recovering gradually and well."
reference = "My country economy at this season keeps escaping from Odoba of business though holds a crude oil high so on unstable element that continues still, and recovering gradually and well."

# rouge score
rouge_score= rouge(preds=hypothesis, target=reference)

# bleu score
bleu_score = bleu(preds=[hypothesis], target=[[reference]]).item()

# meteor score
meteor_score = single_meteor_score(word_tokenize(reference), word_tokenize(hypothesis))

# print scores
print(f"Rouge Score: ")
for key, value in rouge_score.items():
    print(f"{key}: {value}")

print(f"\nBLEU Score: {bleu_score}")
print(f"\nMETEOR Score: {meteor_score}")

Rouge Score: 
rouge1_fmeasure: 0.9354838728904724
rouge1_precision: 0.90625
rouge1_recall: 0.9666666388511658
rouge2_fmeasure: 0.8666666746139526
rouge2_precision: 0.8387096524238586
rouge2_recall: 0.8965517282485962
rougeL_fmeasure: 0.9354838728904724
rougeL_precision: 0.90625
rougeL_recall: 0.9666666388511658
rougeLsum_fmeasure: 0.9354838728904724
rougeLsum_precision: 0.90625
rougeLsum_recall: 0.9666666388511658

BLEU Score: 0.7173057794570923

METEOR Score: 0.9587214041101407


In [7]:
hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party"
reference = "It is a guide to action that ensures that the military will forever heed Party commands"

# rouge score
rouge_score= rouge(preds=hypothesis, target=reference)

# bleu score
bleu_score = bleu(preds=[hypothesis], target=[[reference]]).item()

# meteor score
meteor_score = single_meteor_score(word_tokenize(reference), word_tokenize(hypothesis))

# print scores
print(f"Rouge Score: ")
for key, value in rouge_score.items():
    print(f"{key}: {value}")

print(f"\nBLEU Score: {bleu_score}")
print(f"\nMETEOR Score: {meteor_score}")

Rouge Score: 
rouge1_fmeasure: 0.7058823704719543
rouge1_precision: 0.6666666865348816
rouge1_recall: 0.75
rouge2_fmeasure: 0.5
rouge2_precision: 0.47058823704719543
rouge2_recall: 0.5333333611488342
rougeL_fmeasure: 0.6470588445663452
rougeL_precision: 0.6111111044883728
rougeL_recall: 0.6875
rougeLsum_fmeasure: 0.6470588445663452
rougeLsum_precision: 0.6111111044883728
rougeLsum_recall: 0.6875

BLEU Score: 0.4118037521839142

METEOR Score: 0.6944444444444445


In [8]:
preds = ['CIRP Journal of Manufacturing Science and Technology 48 (2024) 19-27\n\nContents lists available at ScienceDirect\n\nCIRP Journal of Manufacturing Science and Technology']
target = ['CIRP Journal of Manufacturing Science and Technology 48 (2024) 19-27\n\nContents lists available at ScienceDirect\n\n# CIRP Journal of Manufacturing Science and Technology']

rouge(preds=preds, target=[target])

{'rouge1_fmeasure': tensor(1.),
 'rouge1_precision': tensor(1.),
 'rouge1_recall': tensor(1.),
 'rouge2_fmeasure': tensor(1.),
 'rouge2_precision': tensor(1.),
 'rouge2_recall': tensor(1.),
 'rougeL_fmeasure': tensor(1.),
 'rougeL_precision': tensor(1.),
 'rougeL_recall': tensor(1.),
 'rougeLsum_fmeasure': tensor(1.),
 'rougeLsum_precision': tensor(1.),
 'rougeLsum_recall': tensor(1.)}

In [9]:
def get_normalized_edit_distance(s1, s2):
    edit_distance = Levenshtein.distance(s1, s2)
    return edit_distance / max(len(s1), len(s2))

normalized_edit_distance = Levenshtein.distance(target, preds)
print(f"Normalized Edit Distance: {normalized_edit_distance:.4f}")

Normalized Edit Distance: 1.0000


In [10]:
# Order Score
from difflib import SequenceMatcher
from scipy.stats import kendalltau, spearmanr
import re

def split_into_blocks(text):
    # Basic split: by double newlines (paragraph/section boundaries)
    blocks = [block.strip() for block in text.strip().split('\n') if block.strip()]
    # remove empty blocks and blocks with only whitespace
    blocks = [block for block in blocks if block and not re.match(r'^\s*$', block)]
    return blocks

def get_block_order_indices(pred_blocks, target_blocks):
    indices = []
    indices_score = []
    for j, t_block in enumerate(target_blocks):
        sequence_score = []
        for i, p_block in enumerate(pred_blocks):
            score = SequenceMatcher(None, t_block, p_block).ratio()
            sequence_score.append(score)
        if sequence_score:
            max_index = sequence_score.index(max(sequence_score))
            indices.append(max_index)
            indices_score.append(max(sequence_score))
    return indices, indices_score

def order_score_predict(pred_text, target_text):
    pred_blocks = split_into_blocks(pred_text)
    target_blocks = split_into_blocks(target_text)

    target_indices = list(range(len(target_blocks)))
    pred_indices, _ = get_block_order_indices(pred_blocks, target_blocks)
    # print("Target indices:", target_indices)
    # print("Pred indices:", pred_indices)
    # print("Pred indices score:", _)
    # print()
    
    if len(pred_indices) < 2:
        return 0.0, 0.0  # not enough data to compare order
    
    tau, _ = kendalltau(target_indices, pred_indices)
    spearman = spearmanr(target_indices, pred_indices)[0]

    return max(tau, 0), max(spearman, 0)  # ensure non-negative

In [11]:
# benchmarking function
def benchmark(benchmark_data, parser_res):
    cer_scores = []
    wer_scores = []
    rouge_scores = []
    bleu_scores = []
    tau_scores = []
    spearman_scores = []
    meteor_scores = []
    ned_scores = []
    for i, page in enumerate(benchmark_data['pages']):
        print(f"Page {i+1}")
        target = page['markdown']
        preds = parser_res['pages'][i]['markdown']

        # cer score
        cer_score = cer(preds=preds, target=target).item()
        cer_scores.append(round(cer_score, 4))

        # wer score
        wer_score = wer(preds=preds, target=target).item()
        wer_scores.append(round(wer_score, 4))

        # rouge score
        rouge_score = rouge(preds=preds, target=target)
        rougeL_recall = rouge_score["rougeL_recall"].item()
        rouge_scores.append(round(rougeL_recall, 4))

        # bleu score
        bleu_score = bleu(preds=[preds], target=[[target]]).item()
        bleu_scores.append(round(bleu_score, 4))

        # order score
        tau_score, spearman_score = order_score_predict(preds, target)
        tau_score = float(tau_score)
        spearman_score = float(spearman_score)
        tau_scores.append(round(tau_score, 4))
        spearman_scores.append(round(spearman_score, 4))

        # meteor score
        # tokenize the text into words
        target_words = word_tokenize(target)
        preds_words = word_tokenize(preds)
        meteor_score = single_meteor_score(target_words, preds_words)
        meteor_scores.append(round(meteor_score, 4))
    
        # normalized edit distance score
        ned_score = Levenshtein.distance(target, preds) / max(len(target), len(preds))
        ned_score = float(ned_score)
        ned_scores.append(round(ned_score, 4))

        # Print scores
        print(f"CER: {cer_score:.4f}")
        print(f"WER: {wer_score:.4f}")
        print(f"METEOR: {meteor_score:.4f}")
        # print(f"ROUGE: {rougeL_recall:.4f}")
        print(f"BLEU: {bleu_score:.4f}")
        print(f"Order Score (Kendall Tau): {tau_score:.4f}")
        print(f"Order Score (Spearman): {spearman_score:.4f}")
        print(f"Normalized Edit Distance: {ned_score:.4f}")
        print()

    # calculate average scores
    avg_cer = sum(cer_scores) / len(cer_scores)
    avg_wer = sum(wer_scores) / len(wer_scores)
    # avg_rouge = sum(rouge_scores) / len(rouge_scores)
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_tau = sum(tau_scores) / len(tau_scores)
    avg_spearman = sum(spearman_scores) / len(spearman_scores)
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    avg_ned = sum(ned_scores) / len(ned_scores)
    
    print(f"CER List: {cer_scores}")
    print(f"Average CER: {avg_cer:.4f}")
    print(f"WER List: {wer_scores}")
    print(f"Average WER: {avg_wer:.4f}")
    print(f"Meteor List: {meteor_scores}")
    print(f"Average Meteor: {avg_meteor:.4f}")
    # print(f"ROUGE List: {rouge_scores}")
    # print(f"Average ROUGE: {avg_rouge:.4f}")
    print(f"BLEU List: {bleu_scores}")
    print(f"Average BLEU: {avg_bleu:.4f}")
    print(f"Order Score (Kendall Tau) List: {tau_scores}")
    print(f"Average Order Score (Kendall Tau): {avg_tau:.4f}")
    print(f"Order Score (Spearman) List: {spearman_scores}")
    print(f"Average Order Score (Spearman): {avg_spearman:.4f}")
    print(f"Normalized Edit Distance List: {ned_scores}")
    print(f"Average Normalized Edit Distance: {avg_ned:.4f}")

# LlamaParse

In [45]:
# read json
with open('res/llamaparse_res.json') as f:
    llamaparse_res = json.load(f)

In [46]:
# benchmarking llamaparse
benchmark(benchmark_data, llamaparse_res)

Page 1
CER: 0.0408
WER: 0.0457
METEOR: 0.9635
BLEU: 0.9336
Order Score (Kendall Tau): 0.8333
Order Score (Spearman): 0.8772
Normalized Edit Distance: 0.0408

Page 2
CER: 0.0012
WER: 0.0027
METEOR: 0.9977
BLEU: 0.9932
Order Score (Kendall Tau): 1.0000
Order Score (Spearman): 1.0000
Normalized Edit Distance: 0.0012

Page 3
CER: 0.6015
WER: 0.5682
METEOR: 0.7486
BLEU: 0.8433
Order Score (Kendall Tau): 0.0366
Order Score (Spearman): 0.0000
Normalized Edit Distance: 0.6015

Page 4
CER: 0.0907
WER: 0.0903
METEOR: 0.8609
BLEU: 0.9028
Order Score (Kendall Tau): 0.3503
Order Score (Spearman): 0.3581
Normalized Edit Distance: 0.0907

Page 5
CER: 0.6032
WER: 0.6334
METEOR: 0.9292
BLEU: 0.7368
Order Score (Kendall Tau): 0.5829
Order Score (Spearman): 0.6594
Normalized Edit Distance: 0.5591

Page 6
CER: 0.0448
WER: 0.0439
METEOR: 0.9742
BLEU: 0.9634
Order Score (Kendall Tau): 0.9305
Order Score (Spearman): 0.9666
Normalized Edit Distance: 0.0448

CER List: [0.0408, 0.0012, 0.6015, 0.0907, 0.6032, 0

# Mistral OCR

In [47]:
# read json
with open('res/mistralocr_res.json') as f:
    mistralocr_res = json.load(f)
 
benchmark(benchmark_data, mistralocr_res)

Page 1
CER: 0.2748
WER: 0.2329
METEOR: 0.7566
BLEU: 0.7354
Order Score (Kendall Tau): 0.1259
Order Score (Spearman): 0.0416
Normalized Edit Distance: 0.2748

Page 2
CER: 0.1325
WER: 0.1562
METEOR: 0.8749
BLEU: 0.8409
Order Score (Kendall Tau): 0.6448
Order Score (Spearman): 0.6799
Normalized Edit Distance: 0.1325

Page 3
CER: 0.2800
WER: 0.2712
METEOR: 0.7383
BLEU: 0.7017
Order Score (Kendall Tau): 0.5257
Order Score (Spearman): 0.6164
Normalized Edit Distance: 0.2800

Page 4
CER: 0.1239
WER: 0.1429
METEOR: 0.7957
BLEU: 0.8285
Order Score (Kendall Tau): 0.1863
Order Score (Spearman): 0.2015
Normalized Edit Distance: 0.1239

Page 5
CER: 0.1461
WER: 0.3238
METEOR: 0.8331
BLEU: 0.6098
Order Score (Kendall Tau): 0.7385
Order Score (Spearman): 0.8032
Normalized Edit Distance: 0.1461

Page 6
CER: 0.6114
WER: 0.6384
METEOR: 0.5565
BLEU: 0.3958
Order Score (Kendall Tau): 0.3298
Order Score (Spearman): 0.3601
Normalized Edit Distance: 0.6114

CER List: [0.2748, 0.1325, 0.28, 0.1239, 0.1461, 0.6

# Pipeline 1 (Yolo + GOT-OCR2 + Gemma 3)

In [None]:
# read json
with open('res/pipeline_1_res.json') as f:
    pipeline_1_res = json.load(f)

# Benchmarking pipeline_1
benchmark(benchmark_data, pipeline_1_res)

Page 1
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Pred indices: [0, 0, 1, 2, 2, 2, 4, 7, 4, 10, 7, 7, 5, 6, 8, 7]
Pred indices score: [0.24444444444444444, 0.9921259842519685, 0.9169329073482428, 0.4691358024691358, 0.24444444444444444, 0.38235294117647056, 0.47990255785627284, 0.375, 0.7164790174002047, 0.6274509803921569, 0.5866666666666667, 0.3380281690140845, 0.9551912568306011, 0.9506545820745217, 0.37774524158125916, 0.5866666666666667]

CER: 0.2011
WER: 0.2192
ROUGE: 0.9167
BLEU: 0.8486
Order Score (Kendall Tau): 0.7082
Order Score (Spearman): 0.8393

Page 2
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Pred indices: [0, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 10]
Pred indices score: [0.3252032520325203, 0.25, 0.3853211009174312, 0.19047619047619047, 0.9908256880733946, 0.9182389937106918, 0.9240121580547113, 0.6, 0.7, 0.2, 0.3512513601741023, 0.2857142857142857, 0.2594142259414226, 0.3453815261044177, 0.7144866385372715]

CER: 0.

# Pipeline 2 (Yolo + Universal.io + Gemma 3 + Gemma 3)

In [None]:
# read json
with open('res/pipeline_2_res.json') as f:
    pipeline_2_res = json.load(f)

# Benchmarking pipeline_2
benchmark(benchmark_data, pipeline_2_res)

Page 1
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Pred indices: [0, 1, 2, 3, 5, 7, 8, 7, 13, 19, 11, 14, 16, 18, 9, 11]
Pred indices score: [0.7659574468085106, 0.9133858267716536, 0.9965397923875432, 0.9896907216494846, 0.9160997732426304, 0.6274509803921569, 0.937799043062201, 0.41025641025641024, 0.450354609929078, 0.47593582887700536, 1.0, 0.8852459016393442, 0.9672131147540983, 0.338368580060423, 0.463768115942029, 1.0]

CER: 0.2987
WER: 0.3470
ROUGE: 0.8886
BLEU: 0.7561
Order Score (Kendall Tau): 0.7059
Order Score (Spearman): 0.8262

Page 2
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Pred indices: [0, 2, 0, 4, 6, 7, 8, 10, 12, 11, 13, 14, 15, 17, 20]
Pred indices score: [0.8524590163934426, 0.9647058823529412, 1.0, 0.6615384615384615, 0.9454545454545454, 0.9, 0.9907692307692307, 0.7741935483870968, 0.7777777777777778, 0.6956521739130435, 0.9984726161902684, 1.0, 0.9994029850746269, 0.84, 0.336734693877551]

CER: 0.1032
WER: 0.07

# Pipeline 3 (Yolo + Universal.io + Gemma 3 + phi-4)

In [None]:
# with open('res/pipeline_4_res.json') as f:
#     pipeline_4_res = json.load(f)

# pipeline_4_cer, pipeline_4_wer = benchmark(benchmark_data, pipeline_4_res)

# print(f"CER List: {pipeline_4_cer}")
# print(f"WER List: {pipeline_4_wer}")
# print()
# print(f"Average CER: {sum(pipeline_4_cer)/len(pipeline_4_cer):.4f}")
# print(f"Average WER: {sum(pipeline_4_wer)/len(pipeline_4_wer):.4f}")

# Pipeline 4 (Yolo + Universal.io + Qwen 2.5 VL + Qwen 2.5)

In [None]:
with open('res/pipeline_4_2_res.json') as f:
    pipeline_4_res = json.load(f)

benchmark(benchmark_data, pipeline_4_res)

Page 1
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
Pred indices: [0, 28, 1, 2, 3, 4, 5, 6, 7, 5, 9, 16, 23, 11, 13, 13, 14, 15, 23, 17, 18, 19, 21, 16, 24, 25, 26, 27, 28, 28, 29, 29, 32, 23]
Pred indices score: [0.9473684210526315, 0.1111111111111111, 0.8888888888888888, 0.2811881188118812, 0.7608695652173914, 0.8235294117647058, 0.4816753926701571, 0.7368421052631579, 0.7777777777777778, 0.6329787234042553, 0.8, 0.3950617283950617, 0.7380952380952381, 0.64, 0.6933333333333334, 0.72, 0.72, 0.64, 0.8378378378378378, 0.7457627118644068, 0.7272727272727273, 0.3824959481361426, 0.6493506493506493, 0.19704433497536947, 0.7733333333333333, 0.8, 0.7466666666666667, 0.7733333333333333, 0.7733333333333333, 0.7466666666666667, 0.7466666666666667, 0.7466666666666667, 0.7466666666666667, 0.8378378378378378]

CER: 0.5755
WER: 0.7648
ROUGE: 0.3607
BLEU: 0.1377
Order Score (Kendall Tau): 0.7680
Order S

# Pipeline 5 (Yolo + Universal.io + Qwen 2.5 VL + Gemma 3)

In [49]:
with open('res/pipeline_5_2_res.json') as f:
    pipeline_5_res = json.load(f)

benchmark(benchmark_data, pipeline_5_res)

Page 1
CER: 0.2297
WER: 0.2260
METEOR: 0.8994
BLEU: 0.8233
Order Score (Kendall Tau): 0.7061
Order Score (Spearman): 0.8168
Normalized Edit Distance: 0.2297

Page 2
CER: 0.0391
WER: 0.0611
METEOR: 0.9648
BLEU: 0.9051
Order Score (Kendall Tau): 0.8202
Order Score (Spearman): 0.9222
Normalized Edit Distance: 0.0386

Page 3
CER: 0.1823
WER: 0.1652
METEOR: 0.8784
BLEU: 0.8043
Order Score (Kendall Tau): 0.6384
Order Score (Spearman): 0.6557
Normalized Edit Distance: 0.1823

Page 4
CER: 0.0396
WER: 0.0728
METEOR: 0.9587
BLEU: 0.8937
Order Score (Kendall Tau): 0.8587
Order Score (Spearman): 0.9425
Normalized Edit Distance: 0.0394

Page 5
CER: 0.6308
WER: 0.7393
METEOR: 0.9344
BLEU: 0.7767
Order Score (Kendall Tau): 0.5057
Order Score (Spearman): 0.5700
Normalized Edit Distance: 0.6308

Page 6
CER: 0.1890
WER: 0.1889
METEOR: 0.7903
BLEU: 0.8284
Order Score (Kendall Tau): 0.8002
Order Score (Spearman): 0.8291
Normalized Edit Distance: 0.1890

CER List: [0.2297, 0.0391, 0.1823, 0.0396, 0.6308, 0

In [12]:
with open('res/pipeline_5_3_res.json') as f:
    pipeline_5_res = json.load(f)

benchmark(benchmark_data, pipeline_5_res)

Page 1
CER: 0.2297
WER: 0.2260
METEOR: 0.8994
BLEU: 0.8233
Order Score (Kendall Tau): 0.7061
Order Score (Spearman): 0.8168
Normalized Edit Distance: 0.2297

Page 2
CER: 0.0178
WER: 0.0462
METEOR: 0.9657
BLEU: 0.9212
Order Score (Kendall Tau): 0.8443
Order Score (Spearman): 0.9339
Normalized Edit Distance: 0.0178

Page 3
CER: 0.1557
WER: 0.1303
METEOR: 0.8750
BLEU: 0.8365
Order Score (Kendall Tau): 0.8464
Order Score (Spearman): 0.8863
Normalized Edit Distance: 0.1557

Page 4
CER: 0.0392
WER: 0.0728
METEOR: 0.9587
BLEU: 0.8937
Order Score (Kendall Tau): 0.8587
Order Score (Spearman): 0.9425
Normalized Edit Distance: 0.0390

Page 5
CER: 0.6271
WER: 0.7393
METEOR: 0.9242
BLEU: 0.7712
Order Score (Kendall Tau): 0.5034
Order Score (Spearman): 0.5665
Normalized Edit Distance: 0.6271

Page 6
CER: 0.1890
WER: 0.1889
METEOR: 0.7903
BLEU: 0.8284
Order Score (Kendall Tau): 0.8002
Order Score (Spearman): 0.8291
Normalized Edit Distance: 0.1890

CER List: [0.2297, 0.0178, 0.1557, 0.0392, 0.6271, 0

# Test Code

In [None]:
from difflib import SequenceMatcher
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics.cluster import adjusted_rand_score
import re

def split_into_blocks(text):
    # Basic split: by double newlines (paragraph/section boundaries)
    blocks = [block.strip() for block in text.strip().split('\n\n') if block.strip()]
    return blocks

def block_content_recall(pred_blocks, target_blocks):
    matched = 0
    for t_block in target_blocks:
        for p_block in pred_blocks:
            if SequenceMatcher(None, t_block, p_block).ratio() > 0.75:  # 85% similar
                matched += 1
                break
    return matched / len(target_blocks) if target_blocks else 1.0

def get_block_order_indices(pred_blocks, target_blocks):
    indices = []
    indices_score = []
    for j, t_block in enumerate(target_blocks):
        sequence_score = []
        for i, p_block in enumerate(pred_blocks):
            score = SequenceMatcher(None, t_block, p_block).ratio()
            sequence_score.append(score)
            # print()
            # print("Target block:", t_block)
            # print("Pred blocks:", p_block)
            # print("Score:", score)
            # print("Pred block index:", j)
            # print()
        print("Sequence score:", sequence_score)
        print("Max sequence score:", max(sequence_score))
        print("Max sequence score index:", sequence_score.index(max(sequence_score)))
        print()
        if sequence_score:
            # if max(sequence_score) > 0.5:  # 50% similar
                max_index = sequence_score.index(max(sequence_score))
                indices.append(max_index)
                indices_score.append(max(sequence_score))
    return indices, indices_score

def order_score_kendall_tau(pred_blocks, target_blocks):
    target_indices = list(range(len(target_blocks)))
    len_indices = min(len(pred_blocks), len(target_blocks))
    pred_indices, _ = get_block_order_indices(pred_blocks, target_blocks)
    # pred_indices, _ = get_block_order_indices(pred_blocks, target_blocks)
    print("Target indices:", target_indices)
    print("Pred indices:", pred_indices)
    print("Pred indices score:", _)
    print()
    
    if len(pred_indices) < 2:
        return 0.0  # not enough data to compare order
    tau, _ = kendalltau(target_indices, pred_indices)
    return max(tau, 0)  # tau can be negative

    # stat = spearmanr(target_indices[:len(pred_indices)], pred_indices)
    # return max(stat.statistic, 0)  # stat.correlation can be negative
    # print("Spearman statistic:", stat.statistic)

    # score = adjusted_rand_score(target_indices[:len(pred_indices)], pred_indices)
    # return max(score, 0)  # score can be negative

def evaluate_prediction(pred_text, target_text):
    pred_blocks = split_into_blocks(pred_text)
    target_blocks = split_into_blocks(target_text)
    # print("Length of target blocks:", len(target_blocks))
    # print("Length of pred blocks:", len(pred_blocks))
    print("Target blocks:", target_blocks[17])
    print("Pred blocks:", pred_blocks[1])
    print(SequenceMatcher(None, target_blocks[17], pred_blocks[12]).ratio())

    recall = block_content_recall(pred_blocks, target_blocks)
    order = order_score_kendall_tau(pred_blocks, target_blocks)

    final_score = 0.5 * (recall +  order)
    return {
        # "content_recall": round(recall, 3),
        "order_score": round(order, 3),
        # "final_score": round(final_score, 3)
    }

# Example usage:
idx = 0
prediction = pipeline_5_res['pages'][idx]['markdown']
target = benchmark_data['pages'][idx]['markdown']

results = evaluate_prediction(prediction, target)
print("Evaluation Results:")
for k, v in results.items():
    print(f"{k}: {v}")

Target blocks: ### Enhancement of system of management
Pred blocks: New customer's development and increasing the sale of product
0.41509433962264153
Sequence score: [0.926829268292683, 0.275, 0.013029315960912053, 0.23529411764705882, 0.24489795918367346, 0.026905829596412557, 0.47058823529411764, 0.2857142857142857, 0.0, 0.020202020202020204, 0.21978021978021978, 0.3018867924528302, 0.23255813953488372, 0.008695652173913044, 0.4, 0.2222222222222222, 0.009174311926605505, 0.2222222222222222, 0.004032258064516129, 0.05714285714285714, 0.3018867924528302]
Max sequence score: 0.926829268292683
Max sequence score index: 0

Sequence score: [0.07142857142857142, 0.05970149253731343, 0.006802721088435374, 0.03636363636363636, 0.05555555555555555, 0.009523809523809525, 0.0, 0.0, 0.00881057268722467, 0.014084507042253521, 0.02564102564102564, 0.05, 0.0273972602739726, 0.009216589861751152, 0.05405405405405406, 0.08695652173913043, 0.009456264775413711, 0.04878048780487805, 0.008281573498964804

In [None]:
from difflib import SequenceMatcher
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics.cluster import adjusted_rand_score
import re

def split_into_blocks(text):
    # Basic split: by double newlines (paragraph/section boundaries)
    blocks = [block.strip() for block in text.strip().split('\n\n') if block.strip()]
    return blocks

def block_content_recall(pred_blocks, target_blocks):
    matched = 0
    for t_block in target_blocks:
        for p_block in pred_blocks:
            if SequenceMatcher(None, t_block, p_block).ratio() > 0.75:  # 85% similar
                matched += 1
                break
    return matched / len(target_blocks) if target_blocks else 1.0

def get_block_order_indices(pred_blocks, target_blocks):
    indices = []
    indices_score = []
    for i, p_block in enumerate(pred_blocks):
        sequence_score = []
        for j, t_block in enumerate(target_blocks):
            score = SequenceMatcher(None, t_block, p_block).ratio()
            sequence_score.append(score)
            # print()
            # print("Target block:", t_block)
            # print("Pred blocks:", p_block)
            # print("Score:", score)
            # print("Pred block index:", j)
            # print()
        print("Sequence score:", sequence_score)
        print("Max sequence score:", max(sequence_score))
        print("Max sequence score index:", sequence_score.index(max(sequence_score)))
        print()
        if sequence_score:
            # if max(sequence_score) > 0.5:  # 50% similar
                max_index = sequence_score.index(max(sequence_score))
                indices.append(max_index)
                indices_score.append(max(sequence_score))
    return indices, indices_score

def order_score_kendall_tau(pred_blocks, target_blocks):
    target_indices = list(range(len(target_blocks)))
    len_indices = min(len(pred_blocks), len(target_blocks))
    pred_indices = list(range(len(pred_blocks)))
    pred_indices_2, _ = get_block_order_indices(pred_blocks, target_blocks)
    # pred_indices, _ = get_block_order_indices(pred_blocks, target_blocks)
    print("Target indices:", pred_indices)
    print("Pred indices:", pred_indices_2)
    print("Pred indices score:", _)
    print()
    
    if len(pred_indices) < 2:
        return 0.0  # not enough data to compare order
    tau, _ = kendalltau(pred_indices, pred_indices_2)
    return max(tau, 0)  # tau can be negative

    # stat = spearmanr(target_indices[:len(pred_indices)], pred_indices)
    # return max(stat.statistic, 0)  # stat.correlation can be negative
    # print("Spearman statistic:", stat.statistic)

    # score = adjusted_rand_score(target_indices[:len(pred_indices)], pred_indices)
    # return max(score, 0)  # score can be negative

def evaluate_prediction(pred_text, target_text):
    pred_blocks = split_into_blocks(pred_text)
    target_blocks = split_into_blocks(target_text)
    # print("Length of target blocks:", len(target_blocks))
    # print("Length of pred blocks:", len(pred_blocks))
    print("Target blocks:", target_blocks[0])
    print("Pred blocks:", pred_blocks[0])
    print(SequenceMatcher(None, target_blocks[0], pred_blocks[0]).ratio())

    recall = block_content_recall(pred_blocks, target_blocks)
    order = order_score_kendall_tau(pred_blocks, target_blocks)

    final_score = 0.5 * (recall +  order)
    return {
        # "content_recall": round(recall, 3),
        "order_score": round(order, 3),
        # "final_score": round(final_score, 3)
    }

# Example usage:
idx = 0
prediction = llamaparse_res['pages'][idx]['markdown']
target = benchmark_data['pages'][idx]['markdown']

results = evaluate_prediction(prediction, target)
print("Evaluation Results:")
for k, v in results.items():
    print(f"{k}: {v}")

Target blocks: PFU Business report
Pred blocks: # New customer's development and increasing the sale of product
0.2682926829268293
Sequence score: [0.2682926829268293, 0.057971014492753624, 1.0, 0.19943019943019943, 0.25225225225225223, 0.3333333333333333, 0.12030075187969924, 0.24691358024691357, 0.22784810126582278, 0.2, 0.13953488372093023, 0.1971153846153846, 0.10843373493975904, 0.35051546391752575, 0.15053763440860216, 0.21428571428571427, 0.13389121338912133, 0.3333333333333333, 0.137291280148423, 0.07563025210084033, 0.35051546391752575]
Max sequence score: 1.0
Max sequence score index: 2

Sequence score: [0.013029315960912053, 0.006802721088435374, 0.017094017094017096, 1.0, 0.017857142857142856, 0.006230529595015576, 0.06517311608961303, 0.013071895424836602, 0.0, 0.0288659793814433, 0.006430868167202572, 0.0124804992199688, 0.00718132854578097, 0.012422360248447204, 0.018867924528301886, 0.025889967637540454, 0.01991465149359886, 0.0, 0.010471204188481676, 0.0057061340941512

In [None]:
with open('res/pipeline_5_2_res.json') as f:
    pipeline_5_2_res = json.load(f)

benchmark(benchmark_data, pipeline_5_2_res)

Page 1
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
Pred indices: [0, 15, 1, 2, 3, 4, 5, 6, 7, 8, 7, 13, 9, 11, 14, 15, 16, 17, 18, 9, 11]
Pred indices score: [0.95, 0.08695652173913043, 0.9838709677419355, 1.0, 1.0, 0.9206349206349206, 0.9975429975429976, 0.8484848484848485, 1.0, 0.9425837320574163, 0.41025641025641024, 0.450354609929078, 0.5411764705882353, 1.0, 1.0, 0.8947368421052632, 0.9975961538461539, 0.9459459459459459, 0.7953830010493179, 0.3585237258347979, 1.0]

CER: 0.2297
WER: 0.2260
ROUGE: 0.9134
BLEU: 0.8233
Order Score (Kendall Tau): 0.6923
Order Score (Spearman): 0.7347

Page 2
Target indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
Pred indices: [0, 1, 0, 3, 4, 5, 7, 9, 11, 10, 12, 13, 14, 15, 16, 17, 18, 19]
Pred indices score: [0.8524590163934426, 0.7671232876712328, 1.0, 0.5, 0.9818181818181818, 0.8888888888888888, 0.5826086956521739, 0.9375, 0.7777777777777778, 0.6956521739130435, 0.9984732824427