In [1]:
# Load IPython's autoreload extension so that edits to already-imported
# modules can be picked up without restarting the kernel.
%load_ext autoreload
# Reload all imported modules once, immediately.
%autoreload now

In [2]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-08137aa2-e69b-5e74-8390-7997329b1336"

import torch

# Download and Convert Data

In [3]:
from tqdm import tqdm

from document_segmentation.pagexml.generale_missiven import GeneraleMissiven
from document_segmentation.settings import (
    GENERALE_MISSIVEN_DOCUMENT_DIR,
    GENERALE_MISSIVEN_SHEET,
)

# Upper bound handed to sheet.to_documents(n=...). When it is None the
# progress total below falls back to len(sheet).
N = None

GENERALE_MISSIVEN_DOCUMENT_DIR.mkdir(parents=True, exist_ok=True)

sheet = GeneraleMissiven(GENERALE_MISSIVEN_SHEET)

# File stems of the JSON documents already present in the output directory.
# They are passed as skip_ids so that they are not produced again.
existing_docs = {
    path.stem
    for path in GENERALE_MISSIVEN_DOCUMENT_DIR.glob("*.json")
    if path.is_file()
}

# Progress-bar estimate only. Clamp at zero: when N is smaller than the number
# of files already on disk the plain difference would be negative.
remaining_docs = max((N or len(sheet)) - len(existing_docs), 0)

for document in tqdm(
    sheet.to_documents(n=N, skip_ids=existing_docs),
    total=remaining_docs,
    desc="Writing documents",
    unit="doc",
):
    document_file = GENERALE_MISSIVEN_DOCUMENT_DIR / f"{document.id}.json"

    # Serialise BEFORE creating the file. If serialisation raised inside the
    # `with` block, an empty <id>.json would be left behind; later runs would
    # then find it in existing_docs and silently skip the document.
    document_json = document.model_dump_json()

    # "x" = exclusive creation: fail instead of overwriting an existing file.
    with document_file.open("xt") as f:
        f.write(document_json)
        f.write("\n")

Writing documents:   0%|          | 0/5 [00:00<?, ?doc/s]

Skipping row with inventory number 1171 due to status message: 'Niet gedigitaliseerd.'
Skipping row with inventory number 2770 due to status message: 'Niet gedigitaliseerd.'
Skipping row with inventory number 2770 due to status message: 'Niet gedigitaliseerd.'
Skipping row with inventory number 2770 due to status message: 'Niet gedigitaliseerd.'
Skipping row with inventory number 2911 due to status message: 'Niet gedigitaliseerd.'





# Load Data

In [4]:
# Reload imported modules once more so that code edited since the previous
# cells is used from here on.
%autoreload now

In [5]:
# Fraction of the dataset used for training; the remaining items form the
# test split.
TRAINING_DATA = 0.8

In [6]:
from document_segmentation.model.dataset import PageDataset
from document_segmentation.settings import MIN_REGION_TEXT_LENGTH

# Build the dataset in three explicit steps: read the serialised documents,
# filter regions using MIN_REGION_TEXT_LENGTH (presumably dropping regions
# with less text -- see PageDataset.remove_short_regions), then shuffle.
# NOTE(review): shuffle() is called without a visible seed, so the item order
# and the train/test split derived from it may differ between runs -- confirm.
dataset = PageDataset.from_dir(GENERALE_MISSIVEN_DOCUMENT_DIR)
dataset = dataset.remove_short_regions(MIN_REGION_TEXT_LENGTH)
dataset = dataset.shuffle()
# TODO: move region filtering to training/classification step?

# Number of items (pages) in the prepared dataset.
len(dataset)

Reading JSON files: 100%|██████████| 909/909 [01:07<00:00, 13.44file/s]


191146

In [7]:
# Show how many items carry each label in the full dataset.
# Note: this calls a private (underscore-prefixed) helper of the dataset.
dataset._class_counts()

Counter({<Label.IN: 2>: 189343, <Label.BEGIN: 1>: 905, <Label.END: 3>: 898})

In [8]:
# Per-label weights for the full dataset; these are what the commented-out
# WEIGHTS alternative would pass to training for an imbalanced dataset.
# NOTE(review): the printed values look like total / (count + 1) per label,
# including a label that never occurs -- verify in PageDataset.class_weights.
dataset.class_weights()

[210.97792494481237, 1.0095170694608755, 212.6206896551724, 191146.0]

In [9]:
# Spot check: display a single item to see what a Page looks like.
# The index is arbitrary.
dataset[5000]

Page(label=<Label.IN: 2>, regions=[Region(id='region_c50d90ce-7c08-4c50-b52c-0f17352866c8_2', types=(<RegionType.PARAGRAPH: 'paragraph'>, <RegionType.PHYSICAL_STRUCTURE_DOC: 'physical_structure_doc'>, <RegionType.TEXT_REGION: 'text_region'>, <RegionType.PAGEXML_DOC: 'pagexml_doc'>), coordinates=((657, 1139), (645, 1160), (648, 1178), (690, 1196), (750, 1205), (756, 1211), (747, 1233), (732, 1242), (657, 1263), (603, 1272), (566, 1296), (560, 1317), (587, 1329), (618, 1326), (747, 1341), (783, 1353), (801, 1374), (789, 1401), (762, 1419), (771, 1447), (765, 1489), (750, 1525), (726, 1558), (741, 1567), (753, 1585), (726, 1636), (717, 1670), (717, 1724), (726, 1745), (723, 1838), (747, 1905), (759, 1995), (759, 2164), (777, 2248), (783, 2354), (789, 2375), (789, 2471), (798, 2489), (816, 2498), (925, 2492), (1030, 2501), (1085, 2498), (1145, 2507), (1449, 2507), (1530, 2531), (1567, 2531), (1624, 2513), (1762, 2495), (1829, 2498), (2181, 2489), (2422, 2498), (2443, 2489), (2458, 2474), (

In [10]:
# Index of the first test item: the leading TRAINING_DATA share of the
# (already shuffled) dataset is used for training.
split = int(TRAINING_DATA * len(dataset))

# Everything before the split index trains the model, the rest evaluates it.
training_data, test_data = dataset[:split], dataset[split:]

In [11]:
# Label distribution of the training split only (private helper).
training_data._class_counts()

Counter({<Label.IN: 2>: 151475, <Label.BEGIN: 1>: 726, <Label.END: 3>: 715})

# Train Model

In [12]:
# Items per batch; used for training and again for batching the evaluation.
BATCH_SIZE = 32
# Number of epochs passed to model.train_().
EPOCHS = 10
# Class weights passed to model.train_(). Swap the two lines below to weight
# the classes instead of relying on a balanced training sample.
# WEIGHTS = torch.Tensor(dataset.class_weights())   # For an imbalanced dataset
WEIGHTS = None  # For a balanced dataset

# NOTE(review): not referenced by any visible cell -- possibly a leftover.
TRAINING_DATA_MAX = None

In [13]:
# Reload imported modules once more before training so that recent code edits
# are picked up.
%autoreload now

In [14]:
from document_segmentation.pagexml.datamodel.label import Label

# Number of BEGIN pages in the training split; passed to balance() as the
# sample size when the training data is balanced before training.
sample_size = training_data._class_counts()[Label.BEGIN]
sample_size

726

In [15]:
from document_segmentation.model.page_classifier import PageClassifier

# Create the classifier with its default constructor arguments; the bare name
# on the last line displays the module structure as the cell output.
model = PageClassifier()
model



PageClassifier(
  (_embedding): PageEmbedding(
    (_region_model): RegionEmbedding(
      (_transformer_model): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(42774, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)
                  (value): Linear(in_features=768, out_features=768, bias=True)
                  (dropout): Dropout(p=0.1, inplace=False)
              

In [16]:
# Show which device the model reports (private attribute).
model._device

'mps'

In [17]:
# Only on the MPS backend: release cached allocator memory, then print how
# much memory is still allocated, converted from bytes to MiB.
if model._device == "mps":
    torch.mps.empty_cache()
    allocated_mib = torch.mps.current_allocated_memory() / 1024**2
    print(allocated_mib)

454.147705078125


In [18]:
# Train on a balanced, reshuffled sample of the training split.
# NOTE(review): balance(sample_size) presumably caps every label at
# sample_size items -- verify in PageDataset.balance.
model.train_(training_data.balance(sample_size).shuffle(), EPOCHS, BATCH_SIZE, WEIGHTS)

  full_bar = Bar(frac,
100%|██████████| 68/67.71875 [03:31<00:00,  3.11s/batch]


Current allocated memory (MPS): 13141 MB
Driver allocated memory (MPS): 14500 MB
[Loss:	1.158]


100%|██████████| 68/67.71875 [00:06<00:00, 11.22batch/s]


Current allocated memory (MPS): 13141 MB
Driver allocated memory (MPS): 14366 MB
[Loss:	1.068]


100%|██████████| 68/67.71875 [00:06<00:00, 10.06batch/s]


Current allocated memory (MPS): 13141 MB
Driver allocated memory (MPS): 14366 MB
[Loss:	0.989]


100%|██████████| 68/67.71875 [00:07<00:00,  9.37batch/s]


Current allocated memory (MPS): 13141 MB
Driver allocated memory (MPS): 14366 MB
[Loss:	0.890]


100%|██████████| 68/67.71875 [00:07<00:00,  9.06batch/s]


Current allocated memory (MPS): 13141 MB
Driver allocated memory (MPS): 14366 MB
[Loss:	0.988]


100%|██████████| 68/67.71875 [00:07<00:00,  8.91batch/s]


Current allocated memory (MPS): 13141 MB
Driver allocated memory (MPS): 14366 MB
[Loss:	0.912]


100%|██████████| 68/67.71875 [00:07<00:00,  8.68batch/s]


Current allocated memory (MPS): 13141 MB
Driver allocated memory (MPS): 14366 MB
[Loss:	0.858]


100%|██████████| 68/67.71875 [00:07<00:00,  8.88batch/s]


Current allocated memory (MPS): 13141 MB
Driver allocated memory (MPS): 14366 MB
[Loss:	0.848]


100%|██████████| 68/67.71875 [00:07<00:00,  9.04batch/s]


Current allocated memory (MPS): 13141 MB
Driver allocated memory (MPS): 14366 MB
[Loss:	0.893]


100%|██████████| 68/67.71875 [00:07<00:00,  8.86batch/s]

Current allocated memory (MPS): 13141 MB
Driver allocated memory (MPS): 14366 MB
[Loss:	0.871]





# Evaluation

In [19]:
import csv
import sys

from torcheval.metrics import (
    MulticlassAccuracy,
    MulticlassF1Score,
    MulticlassPrecision,
    MulticlassRecall,
)
from tqdm import tqdm

from document_segmentation.pagexml.datamodel.label import Label

# Number of test items to evaluate. The slice is taken once so that the loop
# and the progress-bar total are derived from the same data.
EVAL_SAMPLES = 1000
eval_data = test_data[:EVAL_SAMPLES]

# Tab-separated report on stdout with one row per "interesting" page (see the
# filter inside the loop).
writer = csv.DictWriter(
    sys.stdout,
    fieldnames=("Predicted", "Actual", "Page ID", "Text", "Scores"),
    delimiter="\t",
)

writer.writeheader()

# Per-class metrics (average=None) plus overall accuracy, updated per batch.
accuracy = MulticlassAccuracy(num_classes=len(Label))
precision = MulticlassPrecision(average=None, num_classes=len(Label))
recall = MulticlassRecall(average=None, num_classes=len(Label))
f1_score = MulticlassF1Score(average=None, num_classes=len(Label))

# Inference only: disable gradient tracking so no autograd graph is built.
# NOTE(review): the model is not explicitly switched to eval mode here;
# confirm whether PageClassifier handles train/eval mode itself.
with torch.no_grad():
    for batch in tqdm(
        eval_data.batches(BATCH_SIZE),
        # Integer ceiling division: number of batches in the evaluated slice,
        # not in the whole test set.
        total=(len(eval_data) + BATCH_SIZE - 1) // BATCH_SIZE,
        unit="batch",
    ):
        predicted = model(batch)
        labels = batch.labels()

        # Label values start at 1 while class indices start at 0.
        _labels = torch.tensor(
            [label.value - 1 for label in labels], dtype=torch.long
        )
        accuracy.update(predicted, _labels)
        precision.update(predicted, _labels)
        recall.update(predicted, _labels)
        f1_score.update(predicted, _labels)

        for page, pred, label in zip(batch.pages, predicted, labels):
            # Inverse of the shift above: class index back to Label value.
            pred_label = Label(pred.argmax().item() + 1)
            # Skip the dominant case (IN correctly predicted as IN) to keep
            # the report short.
            if pred_label != Label.IN or label != Label.IN:
                writer.writerow(
                    {
                        "Predicted": pred_label.name,
                        "Actual": label.name,
                        "Page ID": page.doc_id,
                        "Text": page.text(delimiter="; ")[:50],
                        "Scores": str(pred.tolist()),
                    }
                )

Predicted	Actual	Page ID	Text	Scores


  0%|          | 1/1194.6875 [00:04<1:37:33,  4.90s/batch]

END	IN	NL-HaNA_1.04.02_3712_0150.jpg	de waarde, van het voorwerp der bescher„; ming, de	[5.863048045284813e-06, 0.059085000306367874, 0.9409074783325195, 1.673855308581551e-06]


  0%|          | 2/1194.6875 [00:08<1:23:57,  4.22s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_1082_0287.jpg	Het onse brieven vanden 28en december, 1623: p d'e	[0.9999990463256836, 7.214325705717783e-07, 2.554999980475259e-07, 6.420295961540035e-10]
END	IN	NL-HaNA_1.04.02_1816_0337.jpg	off men haer behoorl: rang wel; soude observeren, 	[3.417672996874899e-05, 0.007159017026424408, 0.9922771453857422, 0.0005295872106216848]


  0%|          | 3/1194.6875 [00:11<1:13:10,  3.68s/batch]

END	IN	NL-HaNA_1.04.02_1903_0454.jpg		[0.03907805681228638, 0.013840816915035248, 0.9424195885658264, 0.00466154795140028]
BEGIN	IN	NL-HaNA_1.04.02_1245_0250.jpg	Op dat dese retourvloot uEd=e in goede ordre; mogt	[0.9870861172676086, 0.0036741276271641254, 0.009190519340336323, 4.927271947963163e-05]


  0%|          | 4/1194.6875 [00:14<1:09:47,  3.52s/batch]

END	IN	NL-HaNA_1.04.02_3712_0153.jpg		[0.03907805681228638, 0.013840816915035248, 0.9424195885658264, 0.00466154795140028]
END	IN	NL-HaNA_1.04.02_3679_0410.jpg	Den Gezaghebber te Gale Cornelis Dionijsius Kraije	[8.100091690721456e-07, 8.468030898711731e-08, 0.9999972581863403, 1.7733493677951628e-06]


  0%|          | 5/1194.6875 [00:17<1:06:13,  3.34s/batch]

END	IN	NL-HaNA_1.04.02_1490_0262.jpg	welk gunst bewijs, indien het van u Ed:e hoog agtb	[0.0014739584876224399, 0.0013731232611462474, 0.996614396572113, 0.0005385656841099262]
BEGIN	IN	NL-HaNA_1.04.02_1573_0600.jpg	tegens den schipper Cornelis de mees; terwwegens v	[0.9999935626983643, 4.349090431787772e-06, 2.059779490082292e-06, 7.405255519898901e-09]
END	IN	NL-HaNA_1.04.02_1179B_0237.jpg	coopmanschappen noch voor het merendeel bij de; pa	[3.255553338021855e-08, 1.4746814880339798e-09, 0.9999998807907104, 1.6942381364515313e-07]


  1%|          | 6/1194.6875 [00:21<1:04:03,  3.23s/batch]

BEGIN	IN	NL-HaNA_1.04.02_2484_0192.jpg	der onkost reecq: van de gem: vier scheepen; Concs	[0.9930364489555359, 0.005756630562245846, 0.0011777796316891909, 2.9128355890861712e-05]
BEGIN	IN	NL-HaNA_1.04.02_3873_0415.jpg	gehad, wel te ontvangen uwelEd: Hoog Achte; nadere	[0.9999688863754272, 2.7408536880102474e-06, 2.8330739951343276e-05, 2.241546681602813e-08]
BEGIN	IN	NL-HaNA_1.04.02_2392_0593.jpg	jongste eerbiedige missive van den 2:' april; dese	[0.9995936751365662, 0.00013383489567786455, 0.00027110264636576176, 1.370459358440712e-06]
BEGIN	IN	NL-HaNA_1.04.02_1862_0268.jpg	N:o 24: Nog nader vervolg van het acteboek der ver	[0.999860405921936, 4.682922372012399e-05, 9.252452582586557e-05, 2.2423694190365495e-07]


  1%|          | 7/1194.6875 [00:24<1:03:43,  3.22s/batch]

END	END	NL-HaNA_1.04.02_3713_0365.jpg	Dog dat ten agteren staan of blijven; Zeeland. . .	[1.1244992492720485e-08, 2.129478399170992e-10, 1.0, 3.673353887734265e-08]
END	IN	NL-HaNA_1.04.02_2636_0773.jpg	Mallabaar 455. Persoonen Per Transport; Inlandse d	[3.972269951191265e-06, 1.0484299309609924e-05, 0.9999349117279053, 5.0627415475901216e-05]


  1%|          | 8/1194.6875 [00:26<1:00:52,  3.08s/batch]

END	END	NL-HaNA_1.04.02_7529_0897.jpg	Soo hebben wy goedt gevonden; desen te sluyten ond	[2.494989814749715e-07, 5.397574609489197e-10, 0.9999997615814209, 4.7795559510177554e-08]
END	IN	NL-HaNA_1.04.02_7588_0232.jpg		[0.007511676289141178, 0.001985306851565838, 0.9891741275787354, 0.001328896265476942]


  1%|          | 9/1194.6875 [00:30<1:04:07,  3.25s/batch]

BEGIN	IN	NL-HaNA_1.04.02_2419_1110.jpg	sittinge van den 22.:' aug:s J:o lee„; den ingevol	[0.9999793767929077, 9.000498721434269e-06, 1.1554059710761067e-05, 2.7266107593959532e-08]
BEGIN	IN	NL-HaNA_1.04.02_1086_0088.jpg	van Volck versien, soo dat sonder ander ontseth; q	[0.9999980926513672, 1.1456986612756737e-06, 6.870217248433619e-07, 1.2852175990119008e-09]
END	IN	NL-HaNA_1.04.02_3251_1535.jpg	het meeste respect,; Wel-Edele, Hoog Agtbare, wijz	[4.195796154959908e-09, 8.19595780132687e-12, 1.0, 3.168572959566518e-09]
BEGIN	IN	NL-HaNA_1.04.02_3945_1181.jpg	Bij onze eerbiedige van den 40 Julij deeses; Iaars	[0.999995231628418, 3.7799568417540286e-06, 9.186329066324106e-07, 4.108411832959291e-09]


  1%|          | 10/1194.6875 [00:33<1:03:52,  3.23s/batch]

BEGIN	IN	NL-HaNA_1.04.02_1934_0078.jpg	twee predicanten; als Iohannes wen„; „ting en phil	[0.9432636499404907, 0.05347616225481033, 0.0031261183321475983, 0.00013402526383288205]
BEGIN	BEGIN	NL-HaNA_1.04.02_3187_0039.jpg	De Heer Thomas Hope; Representant Van zijn Doorlug	[0.9999914169311523, 3.0535361474903766e-06, 5.432278612715891e-06, 1.1481344763808465e-08]


  1%|          | 11/1194.6875 [00:37<1:08:08,  3.45s/batch]

BEGIN	IN	NL-HaNA_1.04.02_2422_0585.jpg	Persia ons laeste schrijvens aen uwel Ed=le; hoog 	[0.9929220676422119, 0.003058923641219735, 0.0039904313161969185, 2.8508415198302828e-05]
END	END	NL-HaNA_1.04.02_1075_0454.jpg	die wij voor seecker houden dar becomen sullen, ti	[4.2141778067161795e-06, 2.7674774173647165e-05, 0.9999048709869385, 6.333187775453553e-05]
BEGIN	IN	NL-HaNA_1.04.02_1147_0014.jpg	van Julij passato gecombineert de havenen onser ha	[0.9998660087585449, 7.65245349612087e-05, 5.714639701182023e-05, 3.3674626820356934e-07]
END	IN	NL-HaNA_1.04.02_1802_0173.jpg	Batavia van wien wij onder dato 30:e 9ber: 1711.; 	[0.0004409916582517326, 0.45363715291023254, 0.5430079698562622, 0.002913794247433543]


  1%|          | 12/1194.6875 [00:40<1:03:59,  3.25s/batch]

BEGIN	IN	NL-HaNA_1.04.02_3530_0307.jpg	wij sullen over den inhoud van het een; en ander b	[0.8414695858955383, 0.0007727883639745414, 0.15766596794128418, 9.158164175460115e-05]


  1%|          | 13/1194.6875 [00:43<1:04:20,  3.27s/batch]

BEGIN	IN	NL-HaNA_1.04.02_7587_1016.jpg	gem. Visitateur der soloien bevoolen; te sorgen, d	[0.9697682857513428, 0.029365550726652145, 0.0008244561031460762, 4.164839265285991e-05]


  1%|          | 14/1194.6875 [00:46<1:02:00,  3.15s/batch]

END	IN	NL-HaNA_1.04.02_2452_0172.jpg	Ragia; groot vader den ouden; Singa /:bij zijne on	[2.0926111119479174e-06, 1.9411938865232514e-06, 0.9999781847000122, 1.7730120816850103e-05]
BEGIN	BEGIN	NL-HaNA_1.04.02_1245_1119.jpg	Mette schepen walcheren, oijevaer en Diemermeer; v	[0.770165205001831, 0.1885010004043579, 0.040057774633169174, 0.0012759632663801312]
END	END	NL-HaNA_1.04.02_3819_0519.jpg	Hoogagting in respect,; WelEdele Hoog Agtbare Wijz	[2.398023113414638e-08, 4.165540135048218e-10, 0.9999998807907104, 6.99478661658759e-08]


  1%|▏         | 15/1194.6875 [00:49<1:00:33,  3.08s/batch]

END	IN	NL-HaNA_1.04.02_3339_0414.jpg	hoogsten benoodigt opgegeeven hebben, en wij Uw we	[0.0006963554187677801, 0.40175408124923706, 0.5939170718193054, 0.0036324812099337578]


  1%|▏         | 16/1194.6875 [00:52<1:00:02,  3.06s/batch]

END	IN	NL-HaNA_1.04.02_2971_0383.jpg		[0.03907805681228638, 0.013840816915035248, 0.9424195885658264, 0.00466154795140028]
BEGIN	IN	NL-HaNA_1.04.02_2753_0054.jpg	paradicins derhalven dat 't UE wel Edele; de Gener	[0.9827498197555542, 0.003498725825920701, 0.013699306175112724, 5.222002073423937e-05]


  1%|▏         | 17/1194.6875 [00:55<59:41,  3.04s/batch]  

END	IN	NL-HaNA_1.04.02_2450_0117.jpg	Aan sijn Hoog Edelheijt den; wel Edele Gestrenge H	[0.0001243518927367404, 1.1170465441523447e-08, 0.9998754262924194, 1.911695903800137e-07]
END	IN	NL-HaNA_1.04.02_1441_0851.jpg	Batavia Hoog Ed. s goedvinden zal konnen zijn; -; 	[0.0005416481872089207, 0.00028018560260534286, 0.9989012479782104, 0.000276892795227468]


  2%|▏         | 18/1194.6875 [00:58<1:00:16,  3.07s/batch]

BEGIN	IN	NL-HaNA_1.04.02_3218_0414.jpg	man geniet, behoudens hare presente; rang Dat verm	[0.9596244692802429, 0.002514767227694392, 0.037762343883514404, 9.833381045609713e-05]


  2%|▏         | 19/1194.6875 [01:02<1:01:33,  3.14s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_2018_0041.jpg	Aan d' Edele Hoog agtb: Heeren; Bewinthebberen Ter	[0.9999982118606567, 1.2388182994982344e-06, 5.722354217141401e-07, 1.6351472398312694e-09]
END	IN	NL-HaNA_1.04.02_1613_0251.jpg	wij hebben na Sirrelon geordonneerd te; besorgen, 	[4.013634580246617e-08, 6.068252123014872e-09, 0.9999995231628418, 4.717089439054689e-07]


  2%|▏         | 20/1194.6875 [01:05<1:01:28,  3.14s/batch]

END	IN	NL-HaNA_1.04.02_1251_0383.jpg	Wij sullen daerop nu te meer acht geven, ende op't	[0.0006341387052088976, 0.0049803960137069225, 0.9931138157844543, 0.0012716982746496797]
BEGIN	IN	NL-HaNA_1.04.02_3338_0456.jpg	Voor de Kamer Hoorn.; de Jonkvrouwe Maria Jacoba, 	[0.9999881982803345, 6.5898643697437365e-06, 5.241326107352506e-06, 1.9200708578637204e-08]
END	IN	NL-HaNA_1.04.02_2019_0289.jpg	Sumat:s W„t Cust; den Coopman; Arend van Broyel; t	[0.06965924054384232, 0.05153330788016319, 0.8749170303344727, 0.0038904959801584482]
IN	BEGIN	NL-HaNA_1.04.02_1208_0013.jpg	voorleden say soen hebben wij u Eed e toe„; gesond	[0.0005803724634461105, 0.9990234375, 0.0002538298722356558, 0.00014233555702958256]


  2%|▏         | 21/1194.6875 [01:08<1:02:48,  3.21s/batch]

BEGIN	IN	NL-HaNA_1.04.02_3530_0321.jpg	§ 191. De Dankbaarheid is bestemd tot; een Contra 	[0.961074709892273, 0.038403864949941635, 0.0004944694810546935, 2.6902678655460477e-05]
END	IN	NL-HaNA_1.04.02_1261_0210.jpg	getelt, omme in 't vaderlandt aen haer ofte haere;	[0.0012635620078071952, 2.4761845907050883e-07, 0.9987348914146423, 1.264903858100297e-06]


  2%|▏         | 23/1194.6875 [01:16<1:06:32,  3.41s/batch]

END	IN	NL-HaNA_1.04.02_2657_0140.jpg		[0.03907805681228638, 0.013840816915035248, 0.9424195885658264, 0.00466154795140028]
END	IN	NL-HaNA_1.04.02_2145_0110.jpg		[0.03907805681228638, 0.013840816915035248, 0.9424195885658264, 0.00466154795140028]


  2%|▏         | 24/1194.6875 [01:19<1:04:29,  3.31s/batch]

END	IN	NL-HaNA_1.04.02_1859_1092.jpg	nevens desen gaat tot uw Ed: Hoog agtb:; onsen Eij	[0.0014979689149186015, 5.993101149215363e-05, 0.9983595013618469, 8.254900603787974e-05]


  2%|▏         | 25/1194.6875 [01:21<1:01:14,  3.14s/batch]

BEGIN	IN	NL-HaNA_1.04.02_7533_0358.jpg	rieden van verschooning wegens het lang aenhouden;	[0.9688984155654907, 0.02999374270439148, 0.001067204400897026, 4.068030830239877e-05]


  2%|▏         | 26/1194.6875 [01:24<59:28,  3.05s/batch]  

END	IN	NL-HaNA_1.04.02_2150_0276.jpg		[0.007511676289141178, 0.001985306851565838, 0.9891741275787354, 0.001328896265476942]
END	IN	NL-HaNA_1.04.02_3679_0216.jpg	Iavas N=t P=t C=t tezenden, van de swaarste en Gaa	[2.903817585320212e-05, 0.0003144772199448198, 0.9993720650672913, 0.0002844264090526849]


  2%|▏         | 27/1194.6875 [01:28<1:03:12,  3.25s/batch]

BEGIN	IN	NL-HaNA_1.04.02_1234_0396.jpg	over laten gaen, ende devoir doen, dat bij provisi	[0.9997461438179016, 0.00012616081221494824, 0.00012688440619967878, 7.096328431543952e-07]


  2%|▏         | 29/1194.6875 [01:34<59:28,  3.06s/batch]  

END	END	NL-HaNA_1.04.02_1134_0047.jpg	tot voorigen florisanten standt laeten comen, ons 	[8.782621080172248e-06, 0.00014797977928537875, 0.9997205138206482, 0.00012269943545106798]
BEGIN	IN	NL-HaNA_1.04.02_3914_0166.jpg	uit dien hoofde, dat wij uwelEdele Hoog Achtb; ver	[0.9982395172119141, 0.0016130213625729084, 0.00014673433906864375, 7.31905231532437e-07]


  3%|▎         | 30/1194.6875 [01:37<1:00:19,  3.11s/batch]

BEGIN	IN	NL-HaNA_1.04.02_1238_0459.jpg	weder te rugh gecomen de Commandeur; ijsbrandt God	[0.9905852675437927, 0.004420632496476173, 0.004910458344966173, 8.366243855562061e-05]
END	IN	NL-HaNA_1.04.02_7536_0295.jpg	op vervolgens schriftelijk berigt is; het daer op 	[0.00012508872896432877, 0.25609561800956726, 0.7418179512023926, 0.0019613259937614202]


  3%|▎         | 31/1194.6875 [01:40<1:00:59,  3.15s/batch]

END	IN	NL-HaNA_1.04.02_2480_0053.jpg	bewesen, met afsmeking; van sijne dierbare genade;	[9.449372555536684e-07, 1.37597213623053e-08, 0.9999984502792358, 5.828183589073888e-07]
BEGIN	IN	NL-HaNA_1.04.02_1653_0085.jpg	Watter op VEd:e Hoog agtb:; Eijs van 250 a 300000 	[0.9998195767402649, 9.451647929381579e-05, 8.532209903933108e-05, 5.58678834750026e-07]
BEGIN	IN	NL-HaNA_1.04.02_7532_0612.jpg	en die vande generale Jaarlijxe; verpagtinge voor 	[0.9982746839523315, 0.0016700949054211378, 5.302402496454306e-05, 2.0991035398765234e-06]
BEGIN	IN	NL-HaNA_1.04.02_3280_0505.jpg	te rug gezonden waaren, en de overige; in 't hospi	[0.9999629259109497, 1.1913843081856612e-05, 2.5062397980946116e-05, 6.872914326550017e-08]


  3%|▎         | 32/1194.6875 [01:41<1:01:21,  3.17s/batch]


In [21]:
# Tab-separated summary table on stdout: one row per metric and one column
# per label, in the iteration order of Label.
writer = csv.DictWriter(
    sys.stdout,
    fieldnames=["Metric", "Average", *(label.name for label in Label)],
    delimiter="\t",
)
writer.writeheader()

for metric in (precision, recall, f1_score):
    # Start with the descriptive columns, then add one formatted score per
    # label. compute() yields one value per class because average=None.
    row = {"Metric": metric.__class__.__name__, "Average": str(metric.average)}
    for label, score in zip(Label, metric.compute().tolist()):
        row[label.name] = f"{score:.4f}"
    writer.writerow(row)

# Accuracy is a single scalar, so it is printed on its own line.
print(f"Accuracy ({accuracy.average} average):\t{accuracy.compute().item():.4f}")



Metric	Average	BEGIN	IN	END	OUT
MulticlassPrecision	None	0.1379	0.9989	0.1562	0.0000
MulticlassRecall	None	0.8000	0.9475	1.0000	0.0000
MulticlassF1Score	None	0.2353	0.9725	0.2703	0.0000
Accuracy (micro average):	0.9470
