In [1]:
%load_ext autoreload
%autoreload now

# Download and convert data

In [2]:
from tqdm import tqdm
from document_segmentation.pagexml.generale_missiven import GeneraleMissiven
from document_segmentation.settings import (
    GENERALE_MISSIVEN_DOCUMENT_DIR,
    GENERALE_MISSIVEN_SHEET,
)

N = None

GENERALE_MISSIVEN_DOCUMENT_DIR.mkdir(parents=True, exist_ok=True)

sheet = GeneraleMissiven(GENERALE_MISSIVEN_SHEET)

existing_docs = {
    path.stem
    for path in GENERALE_MISSIVEN_DOCUMENT_DIR.glob("*.json")
    if path.is_file()
}

for document in tqdm(
    sheet.to_documents(n=N, skip_ids=existing_docs),
    total=(N or len(sheet)) - len(existing_docs),
    desc="Writing documents",
    unit="doc",
):
    document_file = GENERALE_MISSIVEN_DOCUMENT_DIR / f"{document.id}.json"

    with document_file.open("xt") as f:
        f.write(document.model_dump_json())
        f.write("\n")

Writing documents:   0%|          | 0/5 [00:00<?, ?doc/s]

Skipping row with inventory number 1171 due to status message: Niet gedigitaliseerd.
Skipping row with inventory number 2770 due to status message: Niet gedigitaliseerd.
Skipping row with inventory number 2770 due to status message: Niet gedigitaliseerd.
Skipping row with inventory number 2770 due to status message: Niet gedigitaliseerd.
Skipping row with inventory number 2911 due to status message: Niet gedigitaliseerd.





In [3]:
from document_segmentation.model.dataset import PageDataset

dataset = PageDataset.from_dir(GENERALE_MISSIVEN_DOCUMENT_DIR)
len(dataset)

Reading JSON files: 100%|██████████| 909/909 [01:10<00:00, 12.95file/s]


191146

In [4]:
dataset[5000]

Page(label=<Label.IN: 2>, regions=[Region(id='region_c62b09b5-3b73-455f-bb44-2c07ece8fe82_3', types=(<RegionType.PHYSICAL_STRUCTURE_DOC: 'physical_structure_doc'>, <RegionType.TEXT_REGION: 'text_region'>, <RegionType.PAGE_NUMBER: 'page-number'>, <RegionType.PAGEXML_DOC: 'pagexml_doc'>), coordinates=((66, 671), (63, 674), (66, 677), (70, 677), (73, 674), (70, 671)), lines=()), Region(id='region_72e9d1bd-256c-4b08-a65a-bafa26c4d572_4', types=(<RegionType.PHYSICAL_STRUCTURE_DOC: 'physical_structure_doc'>, <RegionType.TEXT_REGION: 'text_region'>, <RegionType.PAGE_NUMBER: 'page-number'>, <RegionType.PAGEXML_DOC: 'pagexml_doc'>), coordinates=((2550, 244), (2544, 237), (2544, 234), (2541, 234), (2534, 228), (2531, 228), (2528, 225), (2493, 225), (2490, 228), (2477, 228), (2474, 231), (2462, 231), (2458, 234), (2455, 231), (2389, 231), (2386, 234), (2364, 234), (2360, 237), (2357, 237), (2338, 256), (2338, 259), (2335, 263), (2335, 266), (2332, 269), (2332, 272), (2329, 275), (2329, 285), (232

In [5]:
from document_segmentation.model.page_sequence_tagger import PageSequenceTagger

tagger = PageSequenceTagger()

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
training_dataset = dataset[:1000]

In [7]:
test_dataset = dataset[1000:1200]

In [8]:
tagger.train_(training_dataset, epochs=3, weights=dataset.class_weights())

  0%|          | 0/3 [00:00<?, ?epoch/s]

In [None]:
tagger.precision(test_dataset)

MulticlassPrecision:  16%|█▌        | 1/6.25 [00:31<02:44, 31.31s/batch]

[MulticlassPrecision: {'BEGIN': 0.0, 'IN': 1.0, 'END': 0.0}]


MulticlassPrecision:  32%|███▏      | 2/6.25 [00:59<02:06, 29.67s/batch]

[MulticlassPrecision: {'BEGIN': 0.0, 'IN': 1.0, 'END': 0.0}]


MulticlassPrecision:  48%|████▊     | 3/6.25 [01:37<01:47, 33.10s/batch]

[MulticlassPrecision: {'BEGIN': 0.0, 'IN': 1.0, 'END': 0.0}]


MulticlassPrecision:  64%|██████▍   | 4/6.25 [02:20<01:24, 37.35s/batch]

[MulticlassPrecision: {'BEGIN': 0.0, 'IN': 1.0, 'END': 0.0}]


MulticlassPrecision:  80%|████████  | 5/6.25 [02:57<00:46, 37.07s/batch]

[MulticlassPrecision: {'BEGIN': 0.0181818176060915, 'IN': 0.9809523820877075, 'END': 0.0}]


MulticlassPrecision:  96%|█████████▌| 6/6.25 [03:21<00:08, 32.52s/batch]

[MulticlassPrecision: {'BEGIN': 0.014925372786819935, 'IN': 0.984000027179718, 'END': 0.0}]


MulticlassPrecision: 7batch [03:26, 29.56s/batch]                       


[MulticlassPrecision: {'BEGIN': 0.014285714365541935, 'IN': 0.9846153855323792, 'END': 0.0}]


{'BEGIN': 0.014285714365541935, 'IN': 0.9846153855323792, 'END': 0.0}

In [None]:
tagger.recall(test_dataset)

MulticlassRecall: 7batch [00:00, 55.70batch/s]                       

[MulticlassRecall: {'BEGIN': 0.0, 'IN': 0.90625, 'END': 0.0}]
[MulticlassRecall: {'BEGIN': 0.0, 'IN': 0.625, 'END': 0.0}]
[MulticlassRecall: {'BEGIN': 0.0, 'IN': 0.6770833134651184, 'END': 0.0}]
[MulticlassRecall: {'BEGIN': 0.0, 'IN': 0.6875, 'END': 0.0}]
[MulticlassRecall: {'BEGIN': 0.5, 'IN': 0.6602563858032227, 'END': 0.0}]
[MulticlassRecall: {'BEGIN': 0.5, 'IN': 0.6542553305625916, 'END': 0.0}]
[MulticlassRecall: {'BEGIN': 0.5, 'IN': 0.6530612111091614, 'END': 0.0}]





{'BEGIN': 0.5, 'IN': 0.6530612111091614, 'END': 0.0}

In [None]:
tagger.f1_score(test_dataset)

MulticlassF1Score:   0%|          | 0/6.25 [00:00<?, ?batch/s]

[MulticlassF1Score: {'BEGIN': 0.0, 'IN': 0.9508196711540222, 'END': 0.0}]


MulticlassF1Score:   0%|          | 0/6.25 [00:00<?, ?batch/s]

[MulticlassF1Score: {'BEGIN': 0.0, 'IN': 0.7692307829856873, 'END': 0.0}]


MulticlassF1Score: 7batch [00:00, 49.06batch/s]                       


[MulticlassF1Score: {'BEGIN': 0.0, 'IN': 0.807453453540802, 'END': 0.0}]
[MulticlassF1Score: {'BEGIN': 0.0, 'IN': 0.8148148059844971, 'END': 0.0}]
[MulticlassF1Score: {'BEGIN': 0.035087719559669495, 'IN': 0.7892720699310303, 'END': 0.0}]
[MulticlassF1Score: {'BEGIN': 0.028985507786273956, 'IN': 0.7859424948692322, 'END': 0.0}]
[MulticlassF1Score: {'BEGIN': 0.0277777761220932, 'IN': 0.7852760553359985, 'END': 0.0}]


{'BEGIN': 0.0277777761220932, 'IN': 0.7852760553359985, 'END': 0.0}

In [None]:
tagger.accuracy(test_dataset)

MulticlassAccuracy:   0%|          | 0/6.25 [00:00<?, ?batch/s]

[MulticlassAccuracy: {'BEGIN': nan, 'IN': 0.90625, 'END': nan}]
[MulticlassAccuracy: {'BEGIN': nan, 'IN': 0.625, 'END': nan}]
[MulticlassAccuracy: {'BEGIN': nan, 'IN': 0.6770833134651184, 'END': nan}]


MulticlassAccuracy:   0%|          | 0/6.25 [00:00<?, ?batch/s]

[MulticlassAccuracy: {'BEGIN': nan, 'IN': 0.6875, 'END': nan}]
[MulticlassAccuracy: {'BEGIN': 0.5, 'IN': 0.6602563858032227, 'END': 0.0}]


MulticlassAccuracy: 7batch [00:00, 58.23batch/s]                       


[MulticlassAccuracy: {'BEGIN': 0.5, 'IN': 0.6542553305625916, 'END': 0.0}]
[MulticlassAccuracy: {'BEGIN': 0.5, 'IN': 0.6530612111091614, 'END': 0.0}]


tensor(0.6531)

In [None]:
from document_segmentation.pagexml.datamodel.page import Label


preds = tagger(test_dataset)

print(
    "\t".join(
        ("Page ID", "True Label", "Predicted Label", "Correct?", "Predicted Scores")
    )
)
for page_id, true_label, pred, pred_label in zip(
    test_dataset.page_ids(),
    test_dataset.labels(),
    preds,
    preds.argmax(dim=1),
    strict=True,
):
    print(
        "\t".join(
            (
                page_id,
                true_label.name,
                Label(pred_label.item() + 1).name,
                str(Label(pred_label.item() + 1) == true_label),
                str(pred.tolist()),
            )
        )
    )

Page ID	True Label	Predicted Label	Correct?	Predicted Scores


AttributeError: 'PageDataset' object has no attribute 'page_ids'

In [None]:
from torchview import draw_graph

model_graph = draw_graph(tagger)
print(model_graph.visual_graph)

RuntimeError: Only one of (input_data, input_size) should be specified.