In [1]:
%load_ext autoreload
%autoreload now

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-08137aa2-e69b-5e74-8390-7997329b1336"
# os.environ["WORLD_SIZE"] = "1"

# Download and convert data

In [3]:
from tqdm import tqdm

from document_segmentation.pagexml.annotations.renate_analysis import RenateAnalysis
from document_segmentation.settings import RENATE_ANALYSIS_DIR

N = None

RENATE_ANALYSIS_DIR.mkdir(parents=True, exist_ok=True)

sheet = RenateAnalysis()


existing_docs = {
    path.stem for path in RENATE_ANALYSIS_DIR.glob("Globdoc_*.json") if path.is_file()
}

for document in tqdm(
    sheet.to_documents(n=N, skip_ids=existing_docs),
    total=(N or len(sheet)) - len(existing_docs),
    desc="Writing documents",
    unit="doc",
):
    document_file = RENATE_ANALYSIS_DIR / f"{document.id}.json"

    with document_file.open("xt") as f:
        f.write(document.model_dump_json())
        f.write("\n")

Writing documents: 0doc [00:00, ?doc/s]


In [4]:
import logging

from tqdm import tqdm

from document_segmentation.pagexml.annotations.renate_analysis import RenateAnalysisInv
from document_segmentation.settings import RENATE_ANALYSIS_DIR, RENATE_ANALYSIS_SHEETS

N = None


sheet = RenateAnalysisInv(RENATE_ANALYSIS_SHEETS[0])  # TODO: use both sheets

for document in tqdm(
    sheet.to_documents(n=N), desc="Writing documents", unit="doc", total=26
):
    document_file = RENATE_ANALYSIS_DIR / f"{document.id}.json"

    if document_file.exists():
        logging.info(f"Document {document.id} already exists, skipping")
    else:
        with document_file.open("xt") as f:
            f.write(document.model_dump_json())
            f.write("\n")

Writing documents:   0%|          | 0/26 [00:00<?, ?doc/s]

Writing documents: 100%|██████████| 26/26 [00:10<00:00,  2.54doc/s]


# Load Data

In [5]:
%autoreload now

In [6]:
TRAINING_DATA = 0.8

In [7]:
from document_segmentation.model.dataset import DocumentDataset

dataset: DocumentDataset = DocumentDataset.from_dir(RENATE_ANALYSIS_DIR)
dataset.shuffle()

len(dataset)

Reading JSON files: 100%|██████████| 104/104 [00:00<00:00, 179.17file/s]


104

In [8]:
dataset._class_counts()

Counter({<Label.IN: 1>: 1907,
         <Label.BEGIN: 0>: 104,
         <Label.END: 2>: 100,
         <Label.OUT: 3>: 73})

In [9]:
dataset.class_weights()

[0.9904761904761905,
 0.05450733752620545,
 1.0297029702970297,
 1.4054054054054055]

In [10]:
training_data, test_data = dataset.split(TRAINING_DATA)

In [11]:
training_data._class_counts()

Counter({<Label.IN: 1>: 1642,
         <Label.BEGIN: 0>: 83,
         <Label.END: 2>: 80,
         <Label.OUT: 3>: 68})

In [12]:
test_data._class_counts()

Counter({<Label.IN: 1>: 265,
         <Label.BEGIN: 0>: 21,
         <Label.END: 2>: 20,
         <Label.OUT: 3>: 5})

# Train Model

In [13]:
import torch

BATCH_SIZE = 64
EPOCHS = 5
WEIGHTS = torch.Tensor(dataset.class_weights())  # For an imbalanced dataset

In [14]:
%autoreload now

In [15]:
from document_segmentation.model.page_sequence_tagger import PageSequenceTagger

tagger = PageSequenceTagger()

In [16]:
tagger._device

'mps'

In [17]:
tagger

PageSequenceTagger(
  (_page_embedding): PageEmbedding(
    (_region_model): RegionEmbeddingSentenceTransformer(
      (_transformer_model): SentenceTransformer(
        (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: RobertaModel 
        (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
      )
      (_region_type): Embedding(9, 16)
      (_linear): Linear(in_features=784, out_features=512, bias=True)
    )
    (_rnn): LSTM(512, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
    (_linear): Linear(in_features=512, out_features=256, bias=True)
  )
  (_rnn): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
  (_linear): Linear(in_features=512, out_features=4, bias=True)
  (_soft

In [18]:
tagger.train_(training_data, EPOCHS, BATCH_SIZE, WEIGHTS.to(tagger._device))

Training: 100%|██████████| 95/95 [01:43<00:00,  1.09s/batch]


[Loss:	0.860]


Training: 100%|██████████| 95/95 [00:05<00:00, 17.30batch/s]


[Loss:	0.594]


Training: 100%|██████████| 95/95 [00:05<00:00, 17.59batch/s]


[Loss:	0.532]


Training: 100%|██████████| 95/95 [00:05<00:00, 17.30batch/s]


[Loss:	0.519]


Training: 100%|██████████| 95/95 [00:05<00:00, 17.27batch/s]

[Loss:	0.517]





In [24]:
with open("page_sequence_tagger.pt", "xb") as f:
    torch.save(tagger, f)

# Evaluate Model

In [19]:
import csv
import sys

from torcheval.metrics import (
    MulticlassAccuracy,
    MulticlassF1Score,
    MulticlassPrecision,
    MulticlassRecall,
)
from tqdm import tqdm

from document_segmentation.pagexml.datamodel.label import Label

writer = csv.DictWriter(
    sys.stdout,
    fieldnames=("Predicted", "Actual", "Page ID", "Text", "Scores"),
    delimiter="\t",
)

writer.writeheader()

accuracy = MulticlassAccuracy(num_classes=len(Label))
precision = MulticlassPrecision(average=None, num_classes=len(Label))
recall = MulticlassRecall(average=None, num_classes=len(Label))
f1_score = MulticlassF1Score(average=None, num_classes=len(Label))

for batch in tqdm(
    test_data.batches(BATCH_SIZE), total=test_data.n_batches(BATCH_SIZE), unit="batch"
):
    predicted = tagger(batch)
    labels = batch.labels()

    _labels = torch.Tensor([label.value for label in labels]).to(int)
    accuracy.update(predicted, _labels)
    precision.update(predicted, _labels)
    recall.update(predicted, _labels)
    f1_score.update(predicted, _labels)

    for page, pred, label in zip(batch.pages, predicted, labels):
        pred_label = Label(pred.argmax().item())
        writer.writerow(
            {
                "Predicted": pred_label.name,
                "Actual": label.name,
                "Page ID": page.doc_id,
                "Text": page.text(delimiter="; ")[:50],
                "Scores": str(pred.tolist()),
            }
        )

Predicted	Actual	Page ID	Text	Scores


  5%|▍         | 1/21 [00:00<00:15,  1.29batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_2334_0029.jpg	N„o 1. Originele missive door en aan als boven; „ 	[0.9976346492767334, 0.0014405688270926476, 0.0003242988314013928, 0.0006004812312312424]
IN	IN	NL-HaNA_1.04.02_2334_0030.jpg	bodemapart; inhandigt.; Cognossement; Onkost reek„	[0.0210682712495327, 0.9577849507331848, 0.01492651179432869, 0.0062202634289860725]
END	END	NL-HaNA_1.04.02_2334_0031.jpg	stuagtelijst; lijste de Maarte: haarteken; den sch	[0.0026382452342659235, 0.002028851304203272, 0.9950413107872009, 0.0002916694793384522]


 10%|▉         | 2/21 [00:01<00:11,  1.60batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_2542_0114.jpg	Van Mallabaar onder dato; 9. stux te weeten; 6. ma	[0.9847745299339294, 0.0008813795284368098, 0.014121505431830883, 0.00022256876400206238]


 14%|█▍        | 3/21 [00:03<00:22,  1.26s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_8864_0017.jpg	Van Cormandel onder dato ultimo: Junij 1734.; Vrij	[0.9975234866142273, 0.0014467928558588028, 0.00033440531115047634, 0.0006953740958124399]
IN	IN	NL-HaNA_1.04.02_8864_0018.jpg	Van Cormandel onder dato uetim Junij 1734.; versee	[0.011600962840020657, 0.9796565771102905, 0.00385815417394042, 0.0048843626864254475]
IN	IN	NL-HaNA_1.04.02_8864_0019.jpg	Van Cormandel, onder dato ultimo Junij 1734.; Dond	[0.007064731325954199, 0.9856823086738586, 0.003302427940070629, 0.003950604237616062]
IN	IN	NL-HaNA_1.04.02_8864_0020.jpg	Van Cormandel onder dato uiltimo Julij 1739; tinge	[0.005827696528285742, 0.9860939979553223, 0.003772510215640068, 0.004305807873606682]
IN	IN	NL-HaNA_1.04.02_8864_0021.jpg	Van Cormandel onder dato 20. ' Junij 17; 1734; 1Be	[0.005992822349071503, 0.9864352345466614, 0.003241727128624916, 0.0043302676640450954]
IN	IN	NL-HaNA_1.04.02_8864_0022.jpg	Van Cormandel onder Dato ultimo Junij A:o 1734.; b	[0.006368376314640045, 0.9866471

 19%|█▉        | 4/21 [00:05<00:25,  1.50s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_1506_1034.jpg	-; d; E; 7.; decken; 8; 2; van de; 5; ƒ; 3; E; E; 	[0.9969460368156433, 0.0018056592671200633, 0.00042282947106286883, 0.00082543765893206]
IN	IN	NL-HaNA_1.04.02_1506_1035.jpg	binnewater; een lamme; D95; same; uijt; ies; Cas; 	[0.01352662406861782, 0.9774552583694458, 0.004115343559533358, 0.004902746062725782]
IN	IN	NL-HaNA_1.04.02_1506_1036.jpg	30 vrs; o; Janor; 6; 5o; e; x; 116; E; 6.; 1.; :; 	[0.009668956510722637, 0.973635733127594, 0.01236887089908123, 0.004326431546360254]
END	END	NL-HaNA_1.04.02_1506_1037.jpg	k; „noortvelt; rogons; „1; rsame; uijt; eruijt; ƒ;	[0.0024677864275872707, 0.0024721757508814335, 0.9947494864463806, 0.0003105464274995029]


 24%|██▍       | 5/21 [00:06<00:22,  1.40s/batch]

OUT	OUT	NL-HaNA_1.04.02_1547_0236.jpg		[0.0037416855338960886, 0.010984438471496105, 0.0018611939158290625, 0.9834127426147461]
BEGIN	BEGIN	NL-HaNA_1.04.02_1547_0237.jpg	aan d'E: Heer Alexander; Wigmans Coopman en P„r; g	[0.9960887432098389, 0.0019016050500795245, 0.0004171580949332565, 0.0015924684703350067]
IN	IN	NL-HaNA_1.04.02_1547_0238.jpg	eenige misnoegentheijt, in die natie ligt aan„; „w	[0.0089103439822793, 0.9821773767471313, 0.003539677243679762, 0.005372707732021809]
IN	IN	NL-HaNA_1.04.02_1547_0239.jpg	een vast voorneemen, 'tsij het leven aldaar te; ve	[0.005985654890537262, 0.9871158003807068, 0.0031074557919055223, 0.003791088005527854]
IN	IN	NL-HaNA_1.04.02_1547_0240.jpg	„gemelte schrijvens tomtwaaren, ook hoe uEE:; goet	[0.005530355032533407, 0.9879321455955505, 0.0029731637332588434, 0.0035642797593027353]
IN	IN	NL-HaNA_1.04.02_1547_0241.jpg	Heest in alle mijne brieven heb ik voor uEE:; mijn	[0.005561021622270346, 0.9879786968231201, 0.0029682733584195375, 0.00349209154

 29%|██▊       | 6/21 [00:07<00:21,  1.42s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_1188_0433.jpg	Decembr. 1651. In't schip den Drommedaris; Daghreg	[0.9973642230033875, 0.0015967696672305465, 0.00032504432601854205, 0.0007139550871215761]
IN	IN	NL-HaNA_1.04.02_1188_0434.jpg	Decemb. 1651. In't schip den Drommedaris; weijnigh	[0.010861136019229889, 0.9799608588218689, 0.0038541662506759167, 0.005323844961822033]
IN	IN	NL-HaNA_1.04.02_1188_0435.jpg	Decemb. 1651 In't schip den Drommedavis.; Omtrent 	[0.007163504604250193, 0.985739529132843, 0.003259476972743869, 0.003837501397356391]
IN	IN	NL-HaNA_1.04.02_1188_0436.jpg	Decemb. 1651. In't schip den Drommedaris.; 25=en p	[0.0070573389530181885, 0.9861974120140076, 0.003103547729551792, 0.00364164006896317]
IN	IN	NL-HaNA_1.04.02_1188_0437.jpg	Decemb. 1651. In't schip den Drommedadris; heen, I	[0.006566377356648445, 0.986577033996582, 0.0031291290652006865, 0.0037275529466569424]
IN	IN	NL-HaNA_1.04.02_1188_0438.jpg	Decemb: 1651.; In't schip den Rremmedaris; Officie	[0.006231188308447599, 0.9866524

 33%|███▎      | 7/21 [00:15<00:48,  3.50s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_3095_0015.jpg	Register der Papieren; werdende versonden per het 	[0.9973479509353638, 0.001519732060842216, 0.00034496470470912755, 0.0007873836439102888]
IN	IN	NL-HaNA_1.04.02_3095_0016.jpg	4.; orig: in genaagt, a:o p„o; d'Edele Groot Agtba	[0.011002984829246998, 0.9798586368560791, 0.0039549823850393295, 0.005183406639844179]
IN	IN	NL-HaNA_1.04.02_3095_0017.jpg	N:o 7. Copia Generale Resolutien des Casteels; Bat	[0.007079663686454296, 0.9857373237609863, 0.0032304590567946434, 0.003952573519200087]
IN	IN	NL-HaNA_1.04.02_3095_0018.jpg	Commissien, Memorien,; Jnstructien en z:, welke; v	[0.00658723758533597, 0.9862956404685974, 0.0031796118710190058, 0.003937535919249058]
IN	IN	NL-HaNA_1.04.02_3095_0019.jpg	No 14. Thien. Gesloten Pacquetten, houdende; de ad	[0.006393721327185631, 0.9865872263908386, 0.0031691219191998243, 0.00384982256218791]
IN	IN	NL-HaNA_1.04.02_3095_0020.jpg	Commissien, Memorien; Jnstructien en z:, weg; van 	[0.006373302545398474, 0.9868044

 38%|███▊      | 8/21 [00:17<00:36,  2.82s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_2356_0028.jpg	Mocha den 15 augustus 1734:; Van; Mocha onder Dato	[0.997373104095459, 0.00151030661072582, 0.0003403750597499311, 0.0007762020104564726]
IN	IN	NL-HaNA_1.04.02_2356_0029.jpg	Van Bovia onder dato 15 augustus 1734; Van; Colba 	[0.01098336186259985, 0.9798691868782043, 0.003955919295549393, 0.005191654898226261]
IN	IN	NL-HaNA_1.04.02_2356_0030.jpg	Sacha Onder Dato 15. Aug: 1734; Van; Cocha onder D	[0.00709175830706954, 0.9857333898544312, 0.0032318143639713526, 0.003943051211535931]
IN	IN	NL-HaNA_1.04.02_2356_0031.jpg	Van Mocha onder dato 15 aug:s 1734; Van; Mocha ond	[0.006620791740715504, 0.9862863421440125, 0.0031834428664296865, 0.00390944629907608]
IN	IN	NL-HaNA_1.04.02_2356_0032.jpg	Cocha onder dato 18 Augutus 1744; Van; Augustus; V	[0.0063929990865290165, 0.9865872263908386, 0.0031675901263952255, 0.0038521841634064913]
IN	IN	NL-HaNA_1.04.02_2356_0033.jpg	Van Norsa onder dato 15 Aug 1734:; Van Mocha onder	[0.00638764351606369, 0.98679822683

 48%|████▊     | 10/21 [00:17<00:16,  1.54s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_1432_1553.jpg	van d' Eel Ed=le Heeren d' Respective; heeren Bewi	[0.9969754219055176, 0.0018071829108521342, 0.0004107239656150341, 0.0008066385635174811]
IN	IN	NL-HaNA_1.04.02_1432_1554.jpg	deductie der Complotterie quam te bemelk; deselve 	[0.012651054188609123, 0.978402316570282, 0.0039916327223181725, 0.004955015145242214]
IN	IN	NL-HaNA_1.04.02_1432_1555.jpg	off tot verantwoording vant gene ik van; ter sijde	[0.008803027682006359, 0.9773945808410645, 0.009584642946720123, 0.004217817913740873]
END	END	NL-HaNA_1.04.02_1432_1556.jpg	niet te importimeeren, desen eijndige; ende, naar 	[0.002474740846082568, 0.002480891766026616, 0.9947323799133301, 0.0003119643952231854]
BEGIN	BEGIN	NL-HaNA_1.04.02_1105_0667.jpg	Erntseste, wise, voor sinnige, ende seer discrete 	[0.9976498484611511, 0.001440392923541367, 0.0003288624284323305, 0.0005809283466078341]
IN	IN	NL-HaNA_1.04.02_1105_0668.jpg	ende nae gissinge 280 mijllen, vant Eijlant St Pau	[0.016568144783377647, 

 57%|█████▋    | 12/21 [00:18<00:07,  1.16batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_3952_0232.jpg	Moegle in Bengale; den 2e January 1793.; Hemels aa	[0.997748076915741, 0.0013548285933211446, 0.00027585981297306716, 0.000621267594397068]
IN	IN	NL-HaNA_1.04.02_3952_0233.jpg	geweezene Ffiscaal is opgelegd de vergoe¬; „ding v	[0.011073697358369827, 0.9799070358276367, 0.003965537529438734, 0.005053671542555094]
IN	IN	NL-HaNA_1.04.02_3952_0234.jpg	dat het daar voor meerder, opgebragte ook; waarach	[0.008179116994142532, 0.984310507774353, 0.0036651084665209055, 0.003845332656055689]
IN	IN	NL-HaNA_1.04.02_3952_0235.jpg	Maatschappy als een waarborg voor; de hem opgelegd	[0.006699105724692345, 0.9863920211791992, 0.0031960601918399334, 0.003712857374921441]
IN	IN	NL-HaNA_1.04.02_3952_0236.jpg	zwaar drukkende restitutie, zo wel als die; hem in	[0.00706248776987195, 0.9859027862548828, 0.0033831908367574215, 0.0036514527164399624]
IN	IN	NL-HaNA_1.04.02_3952_0237.jpg	zeer kostbaar is, en hem de middelen ont„; „breeke	[0.006279861554503441, 0.98592990

 62%|██████▏   | 13/21 [00:20<00:09,  1.25s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_1509_1538.jpg	Monsterolle van alle sComp:s Loontreckende; Monste	[0.9974421262741089, 0.0015362566336989403, 0.00031527093960903585, 0.0007063784869387746]
IN	IN	NL-HaNA_1.04.02_1509_1539.jpg	dienaren dewelcke in't Cormandelse Gouvernement bi	[0.010951544158160686, 0.980018675327301, 0.0038943837862461805, 0.005135370884090662]
IN	IN	NL-HaNA_1.04.02_1509_1540.jpg	339. en 35. persoonen p=r Transport. —; Namen, Toe	[0.007080558221787214, 0.9856894016265869, 0.0032154121436178684, 0.004014668520539999]
IN	IN	NL-HaNA_1.04.02_1509_1541.jpg	90; d=o.. . ..; Adsistent; Chirurgijn. . . . ƒ 36.	[0.007097168359905481, 0.986040472984314, 0.0031846105121076107, 0.0036777949426323175]
IN	IN	NL-HaNA_1.04.02_1509_1542.jpg	339 en 74. persoonen P=r Transport; Namen, Toename	[0.0062438612803816795, 0.9863752722740173, 0.0034163198433816433, 0.003964453469961882]
IN	IN	NL-HaNA_1.04.02_1509_1543.jpg	siekevaar. . . ƒ 20. walcheren. . . . . 1662. Zeel	[0.006895636674016714, 0.9861

 67%|██████▋   | 14/21 [00:21<00:07,  1.03s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_1139_1877.jpg	Anthonij Poulo gedoot; Gerrit Cornelisen van St. p	[0.9970325231552124, 0.0017765837255865335, 0.000405643047997728, 0.0007852613343857229]
IN	IN	NL-HaNA_1.04.02_1139_1878.jpg	van t' Schip Schiedam inde boeijen,; Aris Claesz C	[0.013074134476482868, 0.9779625535011292, 0.00406275037676096, 0.004900532774627209]
IN	IN	NL-HaNA_1.04.02_1139_1879.jpg	In t' Bosch genomen als nu als; dan in t leven end	[0.009204166941344738, 0.9753046631813049, 0.011189388111233711, 0.004301808774471283]
END	END	NL-HaNA_1.04.02_1139_1880.jpg	Inde Boeijen sijn wij noch; 32 personen.; 8 uijt d	[0.002469070954248309, 0.00247715855948627, 0.9947426319122314, 0.000311090872855857]
BEGIN	BEGIN	NL-HaNA_1.04.02_1547_0610.jpg	J=o W:t Namen toenamen, en geboorte plaatsen... ..	[0.9970205426216125, 0.0017972049536183476, 0.00041347419028170407, 0.0007687166216783226]
IN	IN	NL-HaNA_1.04.02_1547_0611.jpg	der naarvolgende Personen; ultimo Iunij (agtervolg	[0.013077722862362862, 0.

 81%|████████  | 17/21 [00:21<00:02,  1.79batch/s]

OUT	OUT	NL-HaNA_1.04.02_1547_0662.jpg		[0.003701368346810341, 0.011153489351272583, 0.002482645446434617, 0.9826624393463135]
BEGIN	BEGIN	NL-HaNA_1.04.02_1547_0663.jpg	46025: I=s mogta kleeden off chiomen Namentlijk.; 	[0.9964752793312073, 0.0017677899450063705, 0.0003624224627856165, 0.001394449034705758]
IN	IN	NL-HaNA_1.04.02_1547_0664.jpg	2921. P=s dubb: hiomas off tesserse pantsia, alle 	[0.008381685242056847, 0.9826903939247131, 0.003722737543284893, 0.005205182824283838]
IN	IN	NL-HaNA_1.04.02_1547_0665.jpg	Transport - - - ƒ157645. 1. 4; no; n.; 222: p=s be	[0.005613864865154028, 0.9872298836708069, 0.0032014944590628147, 0.003954755142331123]
IN	IN	NL-HaNA_1.04.02_1547_0666.jpg	P„r Transport . ƒ252638. 9. 6; 1069. 14. 11 „ 4567	[0.0056681036949157715, 0.9878899455070496, 0.002919183112680912, 0.003522861050441861]
IN	IN	NL-HaNA_1.04.02_1547_0667.jpg	79: stx: 9 Moors duk: den 20=en 9b=r met 't fluijt	[0.00527207599952817, 0.9881505966186523, 0.00300728902220726, 0.0035700318403542

 90%|█████████ | 19/21 [00:22<00:00,  2.20batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_1072_0179.jpg	Legatten ende clein vaestngen nooducs; voor Banda.	[0.9972061514854431, 0.0016327535267919302, 0.0003721415705513209, 0.0007889929111115634]
IN	IN	NL-HaNA_1.04.02_1072_0180.jpg	Jacht van Banda, van daer bij oostlycke; monssor n	[0.011204724200069904, 0.9797572493553162, 0.004032177850604057, 0.005005925428122282]
IN	IN	NL-HaNA_1.04.02_1072_0181.jpg	insolentien vande Bandanesen.; Moordadus feit van 	[0.007464708760380745, 0.9854023456573486, 0.0032674826215952635, 0.0038654152303934097]
IN	IN	NL-HaNA_1.04.02_1072_0182.jpg	voornemen van den Goum: Generel; tegens Banda.; pa	[0.006870764307677746, 0.9862992167472839, 0.0032101941760629416, 0.003619859926402569]
IN	IN	NL-HaNA_1.04.02_1072_0183.jpg	vermeerden & versterckt.; met weymis costen voorde	[0.006249531172215939, 0.9864282011985779, 0.0036167390644550323, 0.0037054994609206915]
IN	IN	NL-HaNA_1.04.02_1072_0184.jpg	voor gamnisoeren.; Langewers dierstus tot tochten.	[0.00841083936393261, 0.97711

 95%|█████████▌| 20/21 [00:23<00:00,  1.78batch/s]

OUT	OUT	NL-HaNA_1.04.02_1547_0097.jpg		[0.0024918601848185062, 0.010693737305700779, 0.002333863405510783, 0.9844805598258972]
OUT	OUT	NL-HaNA_1.04.02_1547_0098.jpg		[0.001568082720041275, 0.00829284731298685, 0.0018177498131990433, 0.9883213639259338]
BEGIN	BEGIN	NL-HaNA_1.04.02_1547_0099.jpg	uwe; Hoog Edele agtb: gebiedende Heere en Heeren; 	[0.995026171207428, 0.002068228553980589, 0.00045953557128086686, 0.0024460754357278347]
IN	IN	NL-HaNA_1.04.02_1547_0100.jpg	heen mne  dn d n n lel een w en de e sou co  an d;	[0.00914937723428011, 0.9708470702171326, 0.004106663167476654, 0.01589682325720787]
IN	IN	NL-HaNA_1.04.02_1547_0101.jpg	te Nambedle, den prince Cartadavile, en Moerianato	[0.006321579217910767, 0.9867390394210815, 0.0029649375937879086, 0.003974375780671835]
IN	IN	NL-HaNA_1.04.02_1547_0102.jpg	werden; war op de ehien anmandijn lae telkens tot 	[0.004644239787012339, 0.9876202940940857, 0.003116086358204484, 0.004619344137609005]
IN	IN	NL-HaNA_1.04.02_1547_0103.jpg	41; Mall

100%|██████████| 21/21 [00:27<00:00,  1.33s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_1490_0583.jpg	Copije Secrete Resolutien; genomen bij de Ho: Rege	[0.9973623156547546, 0.0015661944635212421, 0.00033882528077811003, 0.0007326914346776903]
IN	IN	NL-HaNA_1.04.02_1490_0584.jpg	in presentie; van zijn Ed. t te; volbrengen, soo s	[0.010986301116645336, 0.9798687100410461, 0.003987658303231001, 0.005157343577593565]
IN	IN	NL-HaNA_1.04.02_1490_0585.jpg	dat 's Comp. s dienaren en wel voorna„; mentlijk p	[0.007038549054414034, 0.985831081867218, 0.0032854059245437384, 0.0038450032006949186]
IN	IN	NL-HaNA_1.04.02_1490_0586.jpg	bij ons te boeck staen voor niet wel; geintentione	[0.006331078242510557, 0.9866373538970947, 0.0031721466220915318, 0.003859366523101926]
IN	IN	NL-HaNA_1.04.02_1490_0587.jpg	goetgevonden dat dese ontbiedinge; bij ons gemeen 	[0.006249660160392523, 0.986785352230072, 0.003204730339348316, 0.003760277759283781]
IN	IN	NL-HaNA_1.04.02_1490_0588.jpg	schaap harder herwaerts aen te doen; overkomen, ge	[0.006481838878244162, 0.9867219




In [20]:
training_data._class_counts()

Counter({<Label.IN: 1>: 1642,
         <Label.BEGIN: 0>: 83,
         <Label.END: 2>: 80,
         <Label.OUT: 3>: 68})

In [21]:
writer = csv.DictWriter(
    sys.stdout,
    fieldnames=["Metric"] + [label.name for label in Label],
    delimiter="\t",
)
writer.writeheader()

for metric in (precision, recall, f1_score):
    scores = {
        label.name: f"{score:.4f}"
        for label, score in zip(Label, metric.compute().tolist())
    }
    writer.writerow({"Metric": metric.__class__.__name__} | scores)

print(f"Accuracy ({accuracy.average} average):\t{accuracy.compute().item():.4f}")

Metric	BEGIN	IN	END	OUT
MulticlassPrecision	1.0000	1.0000	1.0000	1.0000
MulticlassRecall	1.0000	1.0000	1.0000	1.0000
MulticlassF1Score	1.0000	1.0000	1.0000	1.0000
Accuracy (micro average):	1.0000
