# Language Model-based Quality Classifier

## Requirements

In [1]:
%pip install pagexml-tools py7zr nltk pandas tqdm textdistance
#%pip install pip install https://github.com/kpu/kenlm/archive/master.zip   # kenlm

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk

nltk.download("punkt")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/carstenschnober/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Download and Extract Ground Truth Data

In [3]:
GROUND_TRUTH_URL="https://zenodo.org/record/6414086/files/VOC%20Ground%20truths%20of%20the%20trainingset%20in%20PAGE%20xml.7z"

!/opt/homebrew/bin/wget -c $GROUND_TRUTH_URL

--2023-07-07 15:20:20--  https://zenodo.org/record/6414086/files/VOC%20Ground%20truths%20of%20the%20trainingset%20in%20PAGE%20xml.7z
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [4]:
import logging
import os
from pathlib import Path
import py7zr

GROUND_TRUTH_DIR: Path = Path("VOC Ground truths of the trainingset in PAGE xml")

try:
    GROUND_TRUTH_DIR.mkdir()

    with py7zr.SevenZipFile(
        "VOC Ground truths of the trainingset in PAGE xml.7z", mode="r"
    ) as archive:
        archive.extractall(GROUND_TRUTH_DIR)
except OSError:
    logging.info("Directory '%s' already exists", GROUND_TRUTH_DIR)


## Read Text from Ground Truth

In [5]:
import re
from nltk.tokenize import word_tokenize

LANGUAGE: str = "dutch"


def _preprocess(line: str) -> str:
    return re.sub(r"\d", "0", line).strip().lower()


def tokenize(line: str) -> list[str]:
    return word_tokenize(_preprocess(line), language=LANGUAGE)


In [6]:
from random import random
from sklearn.model_selection import train_test_split
from pagexml.parser import parse_pagexml_file

from tqdm import tqdm


page_xml_files = list(GROUND_TRUTH_DIR.glob("*.xml"))

# train_pages, test_pages = train_test_split(
#     page_xml_files, test_size=0.2, random_state=42
# )
train_pages = page_xml_files

training_lines: list[list[str]] = []
"""Tokenized lines of the ground truth."""

for file in tqdm(train_pages, desc="Reading", unit="file"):
    for line1 in parse_pagexml_file(file).get_lines():
        if text := line1.text:
            training_lines.append(tokenize(text))

training_lines


Reading:   0%|          | 0/4735 [00:00<?, ?file/s]

Reading: 100%|██████████| 4735/4735 [02:14<00:00, 35.08file/s]


[['00', '.'],
 ['van',
  'sumat',
  '„',
  's',
  'w',
  '„',
  '.t',
  'cust',
  ',',
  'den',
  '00',
  '„',
  '.',
  'jann',
  ':',
  'a',
  '„',
  'o',
  '0000',
  '.'],
 ['slaan', ',', 'dese', 'schult', 'ligtelijk', 'te', 'sullen', 'konnen'],
 ['afgekort', 'werden', ',', 'van', 'sijne', 'aan', 'te', 'groeijene'],
 ['jnkomsten', '.'],
 ['uyt',
  'de',
  'sapoulonboabandhaars',
  'sijn',
  'van',
  'wegens',
  'de',
  'volmagten',
  'uyt'],
 ['de', 'thienhandelsteden'],
 ['de', 'hooft', 'regenten', 'sedert', 'hier', 'ook', 'aangekomen'],
 ['komen', 'wegens', 'hare'],
 ['hare',
  'volmagten',
  ',',
  'namentlijk',
  'radja',
  'antosso',
  ',',
  'principalen',
  'den'],
 ['pligt', 'van', 'onderdanig'],
 ['wegens',
  'den',
  'radja',
  'van',
  'serantij',
  ',',
  'radja',
  'nang',
  '„',
  'heijt',
  'afleggen',
  'en'],
 ['moeda', ',', 'wegens', 'den', 'radja', 'van', 'cambang', 'en'],
 ['radja', 'seleman', ',', 'wegens', 'den', 'radja', 'van', 'pe', '„'],
 ['„', 'langit', ',',

## Train Language Model

In [7]:
from nltk.lm.preprocessing import padded_everygram_pipeline

N = 3

train, vocab = padded_everygram_pipeline(N, training_lines)
train, vocab


(<generator object padded_everygram_pipeline.<locals>.<genexpr> at 0x7fad19296d50>,
 <itertools.chain at 0x7fad23f6cc10>)

In [8]:
from nltk.lm import MLE, KneserNeyInterpolated

# model = MLE(N)

model = KneserNeyInterpolated(N)


In [9]:
model.fit(train, vocab)


## Score Pages

### Define Functions

In [10]:
from functools import lru_cache


def lru_cache_list_to_tuple(func):
    @lru_cache(maxsize=None)
    def wrapper(*args):
        args = tuple(arg if not isinstance(arg, Hashable) else tuple(arg) for arg in args)
        return func(*args)
    return wrapper

In [19]:
from functools import lru_cache
from statistics import harmonic_mean, mean, median
from typing import Optional


def aggregate_scores(scores: list[float]) -> float:
    return median([score for score in scores if score is not None])


@lru_cache(maxsize=1024)
def score_lines(lines: tuple["PageXMLTextLine"]) -> list[Optional[float]]:
    if len(lines) < 2:
        logging.warning("%d line(s), skipping.", len(lines))
        return None

    line_scores = [None]
    """Score for first line is always None."""

    # print(region_lines[0].text)
    for previous_line, line in zip(lines, lines[1:]):
        line_score = None
        if previous_line.text and line.text:
            previous_line_tokens = tokenize(previous_line.text)
            line_tokens = tokenize(line.text)

            token_scores = []
            for i, token in enumerate(line_tokens[: N - 1]):
                context = previous_line_tokens + line_tokens[:i]
                token_score = model.logscore(token, context)
                # print(f"Context: {_context}\tToken: {token}\tScore: {_score}")
                token_scores.append(token_score)

            if token_scores:
                line_score = aggregate_scores(token_scores)

            # print(f"{line.text}\t{line_score}\t{token_scores}")
        line_scores.append(line_score)

    assert len(line_scores) == len(lines)
    return aggregate_scores(line_scores)


def score_page(page_xml_file: Path) -> Optional[float]:
    """Score a PAGE XML file by scoring each line and aggregating the scores."""
    if lines := parse_pagexml_file(page_xml_file).get_lines():
        return score_lines(tuple(lines))
    else:
        return None


In [20]:
def score_regions(region1, region2) -> float:
    if region1.get_lines() and region2.get_lines():
        score = score_lines((region1.get_lines()[-1], region2.get_lines()[0]))
        # TODO: include more lines if less than N tokens?
    else:
        score = 0.0
    return score


def score_reading_order(page_xml_file: Path) -> float:
    regions = parse_pagexml_file(page_xml_file).text_regions

    if len(regions) <= 1:
        return 1.0
    else:
        scores = []
        for previous_region, region in zip(regions, regions[1:]):
            scores.append(score_regions(previous_region, region))
        return aggregate_scores(scores)


In [21]:
from functools import cmp_to_key


def compare_regions(region1, region2) -> float:
    return score_regions(region1, region2) - score_regions(region2, region1)


def best_reading_order(page_xml_file: Path):
    regions = parse_pagexml_file(page_xml_file).text_regions

    regions = sorted(regions, key=cmp_to_key(compare_regions))
    return regions


In [65]:
from typing import Hashable


def ordering_similarity(seq1: list[Hashable], seq2: list[Hashable]):
    """Compare two sequences for ordering similarity.

    All elements must be unique and hashable. Both sequences have to contain the same elements.

    Args:
        seq1 (list[Hashable]): First sequence.
        seq2 (list[Hashable]): Second sequence.
    Returns:
        float: Similarity score between 0.0 and 1.0.
    Raises:
        ValueError: If elements are not unique or if both sequences do not contain the same elements.
    """
    if set(seq1) != set(seq2):
        raise ValueError("Elements are not the same in both sequences.")
    if len(set(seq1)) != len(seq1):
        raise ValueError("Elements are not unique.")

    if len(seq1) < 2:
        return 1.0

    max_i = len(seq1) - 1

    similarity = 0
    for i1, region1 in enumerate(seq1):
        i2 = seq2.index(region1)

        # Do both sequences start with the same element?
        similarity += i1 == i2 == 0

        # Does the element have the same successor in both sequences?
        try:
            similarity += seq1[i1 + 1] == seq2[i2 + 1]
        except IndexError:
            # Last element in sequence(s)
            similarity += i1 == i2 == max_i

    score = similarity / (len(seq1) + 1)
    assert 0.0 <= score <= 1.0, f"{score} not in [0.0, 1.0]"
    return score


def ordering_similarity_regions(
    regions1: list["PageXMLTextRegion"], regions2: list["PageXMLTextRegion"]
) -> float:
    """Compare reading orders of two text regions.

    Args:
        regions1 (list[PageXMLTextRegion]): First reading order.
        regions2 (list[PageXMLTextRegion]): Second reading order, with identical elements as regions1.
    Returns:
        float: 1.0 if the reading orders are the same.
    """
    return ordering_similarity(
        [region.id for region in regions1], [region.id for region in regions2]
    )


def score_reading_order_similarity(page_xml_file: Path) -> float:
    try:
        regions = parse_pagexml_file(page_xml_file).text_regions
    except TypeError:
        logging.error("Could not parse PAGE XML file '%s'.", page_xml_file)
        return 1.0

    if len(regions) <= 1:
        return 1.0
    else:
        return ordering_similarity_regions(regions, best_reading_order(page_xml_file))


In [23]:
ordering_similarity([1,2,3,4], [4, 1, 2, 3])

0.4

In [24]:
import random

test_page = random.choice(train_pages)
print(test_page)


VOC Ground truths of the trainingset in PAGE xml/NL-HaNA_1.04.02_8769_0023.xml


In [25]:
regions = parse_pagexml_file(test_page).text_regions
print(regions)


[PageXMLTextRegion(
	id=r1, 
	type=['pagexml_doc', 'text_region'], 
	stats={"lines": 37, "words": 233, "text_regions": 0}
), PageXMLTextRegion(
	id=r2, 
	type=['pagexml_doc', 'text_region'], 
	stats={"lines": 1, "words": 1, "text_regions": 0}
)]


In [26]:
score_reading_order(test_page)


-7.636313556324984

In [27]:
ordered_regions = best_reading_order(test_page)
print(ordered_regions)

[PageXMLTextRegion(
	id=r2, 
	type=['pagexml_doc', 'text_region'], 
	stats={"lines": 1, "words": 1, "text_regions": 0}
), PageXMLTextRegion(
	id=r1, 
	type=['pagexml_doc', 'text_region'], 
	stats={"lines": 37, "words": 233, "text_regions": 0}
)]


In [28]:
score_page(test_page)


-12.528897009142241

In [29]:
ordering_similarity_regions(regions, ordered_regions)

0.0

### Score Test Set

In [30]:
TEST_SET_DIR = Path("limited2backup/949632/limited2_duplicated/page")
assert TEST_SET_DIR.is_dir()

TEST_DATA = Path("text_quality/data/limited2 review overview 20220713.csv")
assert TEST_DATA.is_file()


In [31]:
import random

#test_page = random.choice(list(TEST_SET_DIR.glob("*.xml")))
test_page = TEST_SET_DIR / "NL-HaNA_1.04.02_1110_0777.xml"
print(test_page)


limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_1110_0777.xml


In [32]:
regions = parse_pagexml_file(test_page).text_regions
print(regions)

[PageXMLTextRegion(
	id=492f662d-de6d-45b0-a697-648983c79e3c, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 2, "words": 8, "text_regions": 0}
), PageXMLTextRegion(
	id=fca39e24-2203-4154-956b-85b09038b663, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 3, "text_regions": 0}
), PageXMLTextRegion(
	id=8272ec42-2daf-4a2c-839d-71bbda6016bb, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 6, "text_regions": 0}
), PageXMLTextRegion(
	id=3a05fd89-63b1-4b82-932a-1a0d4c4179aa, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 6, "text_regions": 0}
), PageXMLTextRegion(
	id=2b8639e4-3670-4899-a616-ac467730ac99, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 2, "words": 7, "text_regions": 0}
), PageXMLTextRegion(
	id=4aa25208-1e14-42c2-b6bd-a9e7c5b64d8b, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 14, "words": 74, "text_regions": 0}
), PageXMLTextRegion(


In [33]:
for region in regions:
    print(f"Region:\t'{region.id}', '{region}'")
    print(region.coords)
    for line in region.get_lines():
        print(line.text)
    print("------------------------")


Region:	'492f662d-de6d-45b0-a697-648983c79e3c', 'PageXMLTextRegion(
	id=492f662d-de6d-45b0-a697-648983c79e3c, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 2, "words": 8, "text_regions": 0}
)'
Coords(points="539,3049 878,3049 878,3203 539,3203")
ƒ 3 67. 7: 8.
15. . Cittk:s.
------------------------
Region:	'fca39e24-2203-4154-956b-85b09038b663', 'PageXMLTextRegion(
	id=fca39e24-2203-4154-956b-85b09038b663, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 3, "text_regions": 0}
)'
Coords(points="763,3358 2616,3358 2616,3525 763,3525")
hoorn Raetvouchtn Heercommenis
------------------------
Region:	'8272ec42-2daf-4a2c-839d-71bbda6016bb', 'PageXMLTextRegion(
	id=8272ec42-2daf-4a2c-839d-71bbda6016bb, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 6, "text_regions": 0}
)'
Coords(points="792,2369 2441,2369 2441,2484 792,2484")
Ende menderlijck passie te lzetten, ende
------------------------
Region:	'3a05fd89-63b1-4b

In [34]:
ordered_regions = best_reading_order(test_page)
print(ordered_regions)

[PageXMLTextRegion(
	id=03d62e14-7119-4eb5-9b87-16601d859a72, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 5, "text_regions": 0}
), PageXMLTextRegion(
	id=492f662d-de6d-45b0-a697-648983c79e3c, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 2, "words": 8, "text_regions": 0}
), PageXMLTextRegion(
	id=d79a4f6e-49ef-4097-9db7-5b8f435d0d6f, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 4, "text_regions": 0}
), PageXMLTextRegion(
	id=cfcf33c8-1c3d-4f28-ac10-8471fed792f2, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 1, "text_regions": 0}
), PageXMLTextRegion(
	id=fca39e24-2203-4154-956b-85b09038b663, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 3, "text_regions": 0}
), PageXMLTextRegion(
	id=8272ec42-2daf-4a2c-839d-71bbda6016bb, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 6, "text_regions": 0}
), PageXMLTextRegion(
	i

In [35]:
ordering_similarity_regions(regions, ordered_regions)


0.5333333333333333

In [36]:
score_reading_order_similarity(test_page)

0.5333333333333333

In [37]:
for region in ordered_regions:
    print(f"Region:\t'{region.id}', '{region}'")
    print(region.coords)
    for line in region.get_lines():
        print(line.text)
    print("------------------------")


Region:	'03d62e14-7119-4eb5-9b87-16601d859a72', 'PageXMLTextRegion(
	id=03d62e14-7119-4eb5-9b87-16601d859a72, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 5, "text_regions": 0}
)'
Coords(points="2476,761 2653,761 2653,826 2476,826")
6 4/0  „ .
------------------------
Region:	'492f662d-de6d-45b0-a697-648983c79e3c', 'PageXMLTextRegion(
	id=492f662d-de6d-45b0-a697-648983c79e3c, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 2, "words": 8, "text_regions": 0}
)'
Coords(points="539,3049 878,3049 878,3203 539,3203")
ƒ 3 67. 7: 8.
15. . Cittk:s.
------------------------
Region:	'd79a4f6e-49ef-4097-9db7-5b8f435d0d6f', 'PageXMLTextRegion(
	id=d79a4f6e-49ef-4097-9db7-5b8f435d0d6f, 
	type=['pagexml_doc', 'text_region', 'Text'], 
	stats={"lines": 1, "words": 4, "text_regions": 0}
)'
Coords(points="1050,1199 2522,1199 2522,1396 1050,1396")
Mer Commissaris Mr. Anthonij
------------------------
Region:	'cfcf33c8-1c3d-4f28-ac10-8471fed792f2', 'PageXMLT

In [38]:
score_reading_order(test_page)


-inf

### Read Annotated Data

In [39]:
import pandas as pd

test_data = pd.read_csv(TEST_DATA, index_col=0).drop("Unnamed: 11", axis="columns")
test_data


Unnamed: 0,Pagina,Beoordeling,Opmerkingen Kay,Ranking Kay,Oordeel Maartje,Opmerkingen Maartje,Ranking Maartje,Som,Filename,Invno
0,1,Medium,Tekst,2,medium,,2,4,NL-HaNA_1.04.02_1069_0506,1069
1,2,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_1108_1273,1108
2,3,Slecht,Leesvolgorde en tekst,3,slecht,,3,6,NL-HaNA_1.04.02_1110_0301,1110
3,4,Medium,Leesvolgorde,2,medium,,2,4,NL-HaNA_1.04.02_1110_0777,1110
4,5,Leeg,,0,goed,leeg,1,1,NL-HaNA_1.04.02_1110_0782,1110
...,...,...,...,...,...,...,...,...,...,...
495,496,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_10899_0130,10899
496,497,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_10953_0290,10953
497,498,Medium,Regelherkenning,2,slecht,leesvolgorde,3,5,NL-HaNA_1.04.02_10969_0121,10969
498,499,Goed,,1,goed,moeite met getallen,1,2,NL-HaNA_1.04.02_10975_0260,10975


In [40]:
test_data["pagexml_path"] = test_data["Filename"].apply(
    lambda f: TEST_SET_DIR / (f + ".xml")
)

assert (
    test_data["pagexml_path"].apply(lambda f: f.is_file()).all()
), "Could not find files in test set."

test_data


Unnamed: 0,Pagina,Beoordeling,Opmerkingen Kay,Ranking Kay,Oordeel Maartje,Opmerkingen Maartje,Ranking Maartje,Som,Filename,Invno,pagexml_path
0,1,Medium,Tekst,2,medium,,2,4,NL-HaNA_1.04.02_1069_0506,1069,limited2backup/949632/limited2_duplicated/page...
1,2,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_1108_1273,1108,limited2backup/949632/limited2_duplicated/page...
2,3,Slecht,Leesvolgorde en tekst,3,slecht,,3,6,NL-HaNA_1.04.02_1110_0301,1110,limited2backup/949632/limited2_duplicated/page...
3,4,Medium,Leesvolgorde,2,medium,,2,4,NL-HaNA_1.04.02_1110_0777,1110,limited2backup/949632/limited2_duplicated/page...
4,5,Leeg,,0,goed,leeg,1,1,NL-HaNA_1.04.02_1110_0782,1110,limited2backup/949632/limited2_duplicated/page...
...,...,...,...,...,...,...,...,...,...,...,...
495,496,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_10899_0130,10899,limited2backup/949632/limited2_duplicated/page...
496,497,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_10953_0290,10953,limited2backup/949632/limited2_duplicated/page...
497,498,Medium,Regelherkenning,2,slecht,leesvolgorde,3,5,NL-HaNA_1.04.02_10969_0121,10969,limited2backup/949632/limited2_duplicated/page...
498,499,Goed,,1,goed,moeite met getallen,1,2,NL-HaNA_1.04.02_10975_0260,10975,limited2backup/949632/limited2_duplicated/page...


In [49]:
from tqdm.auto import tqdm

tqdm.pandas(unit="file", desc="Counting Regions")

def n_regions(page_xml_file: Path) -> int:
    try:
        pagexml = parse_pagexml_file(page_xml_file)
    except Exception as e:
        logging.warning(f"Could not read '{page_xml_file}': {e}")
        return 0
    return len(pagexml.text_regions)


test_data["regions"] = test_data["pagexml_path"].progress_apply(n_regions)
test_data


Scoring:   0%|          | 0/500 [00:00<?, ?page/s]



Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_1269_1040.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_1443_0212.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_1548_1453.xml




Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_1578_0938.xml




Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_2275_0078.xml




Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_2646_0018.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_2768_0375.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_2908_0211.xml




Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_3111_0999.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_3389_0782.xml




Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_4405_0237.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_4418_0340.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_4426_0769.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_7725_0189.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_8517_0004.xml




Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_9342_0377.xml




Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_9467_0563.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_9534_0129.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_9594_0147.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_9940_0348.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_10124_0230.xml




Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_10451_0626.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_10567_0010.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_10633_0002.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_10675_0578.xml
Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_10890_0901.xml


Unnamed: 0,Pagina,Beoordeling,Opmerkingen Kay,Ranking Kay,Oordeel Maartje,Opmerkingen Maartje,Ranking Maartje,Som,Filename,Invno,pagexml_path,score,regions
0,1,Medium,Tekst,2,medium,,2,4,NL-HaNA_1.04.02_1069_0506,1069,limited2backup/949632/limited2_duplicated/page...,0.400000,4
1,2,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_1108_1273,1108,limited2backup/949632/limited2_duplicated/page...,0.285714,6
2,3,Slecht,Leesvolgorde en tekst,3,slecht,,3,6,NL-HaNA_1.04.02_1110_0301,1110,limited2backup/949632/limited2_duplicated/page...,0.400000,4
3,4,Medium,Leesvolgorde,2,medium,,2,4,NL-HaNA_1.04.02_1110_0777,1110,limited2backup/949632/limited2_duplicated/page...,0.533333,14
4,5,Leeg,,0,goed,leeg,1,1,NL-HaNA_1.04.02_1110_0782,1110,limited2backup/949632/limited2_duplicated/page...,0.500000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_10899_0130,10899,limited2backup/949632/limited2_duplicated/page...,,4
496,497,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_10953_0290,10953,limited2backup/949632/limited2_duplicated/page...,,2
497,498,Medium,Regelherkenning,2,slecht,leesvolgorde,3,5,NL-HaNA_1.04.02_10969_0121,10969,limited2backup/949632/limited2_duplicated/page...,,6
498,499,Goed,,1,goed,moeite met getallen,1,2,NL-HaNA_1.04.02_10975_0260,10975,limited2backup/949632/limited2_duplicated/page...,,15


In [66]:
from tqdm.auto import tqdm

n = 50

tqdm.pandas(desc="Scoring", unit="page")

test_data["score"] = test_data.iloc[:n]["pagexml_path"].progress_apply(
    score_reading_order_similarity
)
test_data.head(n)


Scoring:   0%|          | 0/50 [00:00<?, ?page/s]

ERROR:root:Could not parse PAGE XML file 'limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_1269_1040.xml'.


Error parsing file limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_1269_1040.xml


Unnamed: 0,Pagina,Beoordeling,Opmerkingen Kay,Ranking Kay,Oordeel Maartje,Opmerkingen Maartje,Ranking Maartje,Som,Filename,Invno,pagexml_path,score,regions
0,1,Medium,Tekst,2,medium,,2,4,NL-HaNA_1.04.02_1069_0506,1069,limited2backup/949632/limited2_duplicated/page...,0.4,4
1,2,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_1108_1273,1108,limited2backup/949632/limited2_duplicated/page...,0.285714,6
2,3,Slecht,Leesvolgorde en tekst,3,slecht,,3,6,NL-HaNA_1.04.02_1110_0301,1110,limited2backup/949632/limited2_duplicated/page...,0.4,4
3,4,Medium,Leesvolgorde,2,medium,,2,4,NL-HaNA_1.04.02_1110_0777,1110,limited2backup/949632/limited2_duplicated/page...,0.533333,14
4,5,Leeg,,0,goed,leeg,1,1,NL-HaNA_1.04.02_1110_0782,1110,limited2backup/949632/limited2_duplicated/page...,0.5,5
5,6,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_1111_0725,1111,limited2backup/949632/limited2_duplicated/page...,0.3,9
6,7,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_1113_0025,1113,limited2backup/949632/limited2_duplicated/page...,0.181818,10
7,8,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_1119_1476,1119,limited2backup/949632/limited2_duplicated/page...,0.105263,18
8,9,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_1123_0200,1123,limited2backup/949632/limited2_duplicated/page...,0.5,5
9,10,Goed,,1,goed,,1,2,NL-HaNA_1.04.02_1131_0054,1131,limited2backup/949632/limited2_duplicated/page...,0.285714,6


In [68]:
test_data.head(n).loc[
    (
        (
            test_data["Opmerkingen Kay"].notna()
            & test_data["Opmerkingen Kay"].str.lower().str.contains("volgorde")
        )
        | (
            test_data["Opmerkingen Maartje"].notna()
            & test_data["Opmerkingen Kay"].str.lower().str.contains("volgorde")
        )
    )
]


Unnamed: 0,Pagina,Beoordeling,Opmerkingen Kay,Ranking Kay,Oordeel Maartje,Opmerkingen Maartje,Ranking Maartje,Som,Filename,Invno,pagexml_path,score,regions
2,3,Slecht,Leesvolgorde en tekst,3,slecht,,3,6,NL-HaNA_1.04.02_1110_0301,1110,limited2backup/949632/limited2_duplicated/page...,0.4,4
3,4,Medium,Leesvolgorde,2,medium,,2,4,NL-HaNA_1.04.02_1110_0777,1110,limited2backup/949632/limited2_duplicated/page...,0.533333,14
19,20,Medium,Leesvolgorde,2,goed,,1,3,NL-HaNA_1.04.02_1234_0623,1234,limited2backup/949632/limited2_duplicated/page...,0.4375,15
21,22,Medium,leesvolgorde,2,slecht,layout,3,5,NL-HaNA_1.04.02_1246_0629,1246,limited2backup/949632/limited2_duplicated/page...,0.25,27
25,26,Medium,Leesvolgorde,2,slecht,leesvolgorde,3,5,NL-HaNA_1.04.02_1274_0688,1274,limited2backup/949632/limited2_duplicated/page...,0.107143,27
38,39,Medium,Regiovolgorde,2,medium,leesvolgorde (marginalia),2,4,NL-HaNA_1.04.02_1351_1506,1351,limited2backup/949632/limited2_duplicated/page...,0.5,11


In [42]:
for i1, row in test_data.head(n).iterrows():
    print(row["pagexml_path"])
    for line1 in parse_pagexml_file(row["pagexml_path"]).get_lines():
        print(line1.text)
    print("-----------------------------------")


limited2backup/949632/limited2_duplicated/page/NL-HaNA_1.04.02_1069_0506.xml
5 % o R2 16. -
Sodat bijden extroordonarijebeeden ret
Gen
hierop gelet zijnde wat onder dese andelinghe
mochte schuijten en oock sootmogelijck was m
de fuwele meninghe van engelsche eenste
becomen, Is Gereplueert aen haer teantwoorde
en schriren als volcht
Alsoo den coninck van faccatra legen het accortlussen
hem en dengel n ontgemaeckt den eerste stantij
nt alleen ongevangene vinden pden eijschanden
pangoram van Banthan derwaers vsonden, maer
oock dengelsche haer bootpeftende tide omme ons
gelijck gecontrateert was  eijland tebringhen
tesende hndert heeften daer vyjt byj on
waerschynelijcke presumtie genomen is, dat de selve
Coningen eenen Lijn srecken de vernoemde engelsche
soo veel hen doenelijcken waer, oock sorden soecken te
hnderen In alle andere belooffde en geaccordeerde
poincten te volbrengen, enwy dienvolgenslichelijck
ondegeraeken van en elew andere, en ook
wijders opgisteren aonst tot en binnen ons