In [1]:
# Load IPython's autoreload extension, then reload already-imported modules once, right now.
# NOTE(review): "now" is a one-shot reload, which is presumably why the magic is repeated
# further down before the data is loaded.
%load_ext autoreload
%autoreload now

In [2]:
import os

# Restrict CUDA to one specific device, identified by its MIG UUID. This is set before
# torch is imported further down in the notebook.
# NOTE(review): the UUID is machine-specific; the recorded run below reports device 'mps',
# where this setting presumably has no effect.
os.environ.update({"CUDA_VISIBLE_DEVICES": "MIG-08137aa2-e69b-5e74-8390-7997329b1336"})

# Download and convert data

In [3]:
from tqdm import tqdm

from document_segmentation.pagexml.annotations.renate_analysis import RenateAnalysis
from document_segmentation.settings import RENATE_ANALYSIS_DIR

# Upper bound handed to sheet.to_documents(); None falls back to len(sheet) for the
# progress-bar estimate below.
N = None

RENATE_ANALYSIS_DIR.mkdir(parents=True, exist_ok=True)

sheet = RenateAnalysis()

# File stems of the "Globdoc_*.json" files already present in the output directory.
# They are passed as skip_ids so those documents are not produced again.
existing_docs = {
    path.stem for path in RENATE_ANALYSIS_DIR.glob("Globdoc_*.json") if path.is_file()
}

# Progress-bar estimate only. Clamped at zero so a small N combined with many existing
# files never yields a negative total.
remaining_docs = max(0, (N or len(sheet)) - len(existing_docs))

for document in tqdm(
    sheet.to_documents(n=N, skip_ids=existing_docs),
    total=remaining_docs,
    desc="Writing documents",
    unit="doc",
):
    document_file = RENATE_ANALYSIS_DIR / f"{document.id}.json"

    # Serialize BEFORE creating the file: if serialization fails, no empty
    # "<id>.json" is left behind to be mistaken for a finished document (and
    # skipped via existing_docs) on the next run.
    payload = document.model_dump_json()

    # "x" mode refuses to overwrite an existing file. The encoding is explicit so the
    # JSON, which holds non-ASCII transcription text, does not depend on the
    # platform's locale encoding.
    with document_file.open("xt", encoding="utf-8") as f:
        f.write(payload)
        f.write("\n")

Writing documents: 0doc [00:00, ?doc/s]


In [4]:
import logging

from tqdm import tqdm

from document_segmentation.pagexml.annotations.renate_analysis import RenateAnalysisInv
from document_segmentation.settings import RENATE_ANALYSIS_DIR, RENATE_ANALYSIS_SHEETS

# Upper bound handed to sheet.to_documents(); None passes no limit on.
N = None

# Number of documents the progress bar expects from the first sheet (the recorded run
# processed 26/26). NOTE(review): hard-coded; update if the sheet changes.
SHEET_DOCUMENTS = 26

sheet = RenateAnalysisInv(RENATE_ANALYSIS_SHEETS[0])  # TODO: use both sheets

for document in tqdm(
    sheet.to_documents(n=N),
    desc="Writing documents",
    unit="doc",
    # Progress-bar estimate only: honour N when it is smaller than the sheet size.
    total=min(N, SHEET_DOCUMENTS) if N else SHEET_DOCUMENTS,
):
    document_file = RENATE_ANALYSIS_DIR / f"{document.id}.json"

    if document_file.exists():
        # Lazy %-formatting: the message is only built if INFO logging is enabled.
        logging.info("Document %s already exists, skipping", document.id)
        continue

    # Serialize BEFORE creating the file, so a serialization error cannot leave an
    # empty "<id>.json" behind that later runs would treat as already converted.
    payload = document.model_dump_json()

    # "x" mode refuses to overwrite. The encoding is explicit so the JSON does not
    # depend on the platform's locale encoding.
    with document_file.open("xt", encoding="utf-8") as f:
        f.write(payload)
        f.write("\n")

Writing documents: 100%|██████████| 26/26 [00:11<00:00,  2.33doc/s]


# Load Data

In [5]:
# One-shot module reload before the data is loaded (the autoreload extension itself is
# loaded in the first cell).
%autoreload now

In [6]:
# Argument passed to dataset.split() further down; presumably the fraction of the
# dataset assigned to the training split -- confirm against DocumentDataset.split.
TRAINING_DATA = 0.8

In [7]:
from document_segmentation.model.dataset import DocumentDataset

# Build the dataset from the JSON documents in RENATE_ANALYSIS_DIR (written by the
# conversion cells above).
dataset: DocumentDataset = DocumentDataset.from_dir(RENATE_ANALYSIS_DIR)
# NOTE(review): no random seed is set anywhere in this notebook, so the shuffle -- and
# with it the train/test split made later -- presumably differs from run to run.
dataset.shuffle()

# Displayed as the cell result (104 in the recorded run).
len(dataset)

Reading JSON files: 100%|██████████| 104/104 [00:00<00:00, 160.10file/s]


104

In [8]:
# Label distribution over the whole dataset (a Counter keyed by Label in the recorded
# output). NOTE(review): this calls a private (underscore-prefixed) method.
dataset._class_counts()

Counter({<Label.IN: 1>: 1907,
         <Label.BEGIN: 0>: 104,
         <Label.END: 2>: 100,
         <Label.OUT: 3>: 73})

In [9]:
# Per-label weights for the whole dataset; the recorded output is a plain list of four
# floats, presumably in Label order (BEGIN, IN, END, OUT) -- confirm.
dataset.class_weights()

[0.9904761904761905,
 0.05450733752620545,
 1.0297029702970297,
 1.4054054054054055]

In [10]:
# Split the shuffled dataset into a training part and a test part, using the
# TRAINING_DATA setting defined above.
training_data, test_data = dataset.split(TRAINING_DATA)

In [11]:
# Label distribution of the training split (private method, see the note on the full dataset).
training_data._class_counts()

Counter({<Label.IN: 1>: 1613,
         <Label.BEGIN: 0>: 83,
         <Label.END: 2>: 79,
         <Label.OUT: 3>: 54})

In [12]:
# Label distribution of the test split, for comparison with the training split.
test_data._class_counts()

Counter({<Label.IN: 1>: 294,
         <Label.BEGIN: 0>: 21,
         <Label.END: 2>: 21,
         <Label.OUT: 3>: 19})

# Train Model

In [13]:
import torch

# Settings passed to tagger.train_() and tagger.eval_() below.
BATCH_SIZE = 64
EPOCHS = 10
# Per-label weights as a tensor. The training cell calls WEIGHTS.to(device), but
# class_weights() is displayed above as a plain Python list, which has no .to().
# torch.as_tensor converts a list and passes an existing tensor through unchanged.
# NOTE(review): the weights come from the full dataset (training + test), not from
# training_data alone -- confirm this is intended.
WEIGHTS = torch.as_tensor(dataset.class_weights())

In [14]:
from document_segmentation.model.page_sequence_tagger import PageSequenceTagger

# Create the model with its default constructor arguments.
tagger = PageSequenceTagger()

In [15]:
# Show which device the tagger uses ('mps' in the recorded run).
# NOTE(review): this reads a private attribute.
tagger._device

'mps'

In [16]:
# Display the module structure of the tagger.
tagger

PageSequenceTagger(
  (_page_embedding): PageEmbedding(
    (_region_model): RegionEmbeddingSentenceTransformer(
      (_transformer_model): SentenceTransformer(
        (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: RobertaModel 
        (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
      )
      (_region_type): Embedding(9, 16)
      (_linear): Linear(in_features=784, out_features=512, bias=True)
    )
    (_rnn): LSTM(512, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
    (_linear): Linear(in_features=512, out_features=256, bias=True)
  )
  (_rnn): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
  (_linear): Linear(in_features=512, out_features=4, bias=True)
  (_soft

In [17]:
# Train on the training split, passing EPOCHS, BATCH_SIZE and the class weights moved to
# the tagger's device. WEIGHTS must be tensor-like here, since .to() is called on it.
tagger.train_(training_data, EPOCHS, BATCH_SIZE, WEIGHTS.to(tagger._device))

Training: 100%|██████████| 93/93 [01:46<00:00,  1.15s/batch]


[Loss:	0.124]


Training: 100%|██████████| 93/93 [00:05<00:00, 17.93batch/s]


[Loss:	0.123]


Training: 100%|██████████| 93/93 [00:05<00:00, 17.81batch/s]


[Loss:	0.122]


Training: 100%|██████████| 93/93 [00:05<00:00, 17.96batch/s]


[Loss:	0.075]


Training: 100%|██████████| 93/93 [00:05<00:00, 17.92batch/s]


[Loss:	0.075]


Training: 100%|██████████| 93/93 [00:05<00:00, 17.86batch/s]


[Loss:	0.074]


Training: 100%|██████████| 93/93 [00:05<00:00, 17.96batch/s]


[Loss:	0.074]


Training: 100%|██████████| 93/93 [00:05<00:00, 17.98batch/s]


[Loss:	0.074]


Training: 100%|██████████| 93/93 [00:05<00:00, 18.06batch/s]


[Loss:	0.074]


Training: 100%|██████████| 93/93 [00:05<00:00, 18.10batch/s]

[Loss:	0.074]





In [18]:
# Persist the trained tagger in the current working directory. Mode "xb" creates the file
# exclusively, so re-running this cell raises FileExistsError instead of overwriting an
# earlier checkpoint.
# NOTE(review): this saves the whole module object rather than a state_dict; loading it
# presumably requires the same document_segmentation code to be importable.
with open("page_sequence_tagger.pt", "xb") as f:
    torch.save(tagger, f)

# Evaluate Model

In [19]:
import sys

# Evaluate on the test split. Per-page rows (predicted label, actual label, page ID, text,
# scores) are written to sys.stdout, and four metric objects are returned for the summary
# cell that follows.
precision, recall, f1, accuracy = tagger.eval_(test_data, BATCH_SIZE, sys.stdout)

Predicted	Actual	Page ID	Text	Scores


Evaluating:   0%|          | 0/23 [00:00<?, ?batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_8737_0089.jpg	Van Bengale, onder dato 28„' 8ber: A. o 1705.; voo	[0.9998795986175537, 4.716290277428925e-05, 5.450770186143927e-05, 1.8789738533087075e-05]


Evaluating:   4%|▍         | 1/23 [00:00<00:04,  5.00batch/s]

END	END	NL-HaNA_1.04.02_8737_0090.jpg	Van Bengale onder dato 28:' xber: A:o 1705.; waar 	[0.0003245954285375774, 0.00012509663065429777, 0.9995437264442444, 6.612697234231746e-06]


Evaluating:   9%|▊         | 2/23 [00:02<00:31,  1.48s/batch]

OUT	OUT	NL-HaNA_1.04.02_1547_0020.jpg		[6.891964585520327e-05, 4.436900417204015e-05, 1.554836126160808e-05, 0.9998711347579956]
OUT	OUT	NL-HaNA_1.04.02_1547_0021.jpg		[3.485719935270026e-05, 2.1129219021531753e-05, 9.011316251417156e-06, 0.9999350309371948]
OUT	OUT	NL-HaNA_1.04.02_1547_0022.jpg		[3.9889069739729166e-05, 1.9674509530887008e-05, 9.324101483798586e-06, 0.9999310970306396]
OUT	OUT	NL-HaNA_1.04.02_1547_0023.jpg	1.; Hrigineele briev; No	[0.08336205035448074, 0.00025207537692040205, 0.0003549853863660246, 0.9160308837890625]
OUT	OUT	NL-HaNA_1.04.02_1547_0024.jpg		[0.00012575231085065752, 0.00010892363206949085, 2.3612075892742723e-05, 0.9997417330741882]
OUT	OUT	NL-HaNA_1.04.02_1547_0025.jpg		[8.848142897477373e-05, 5.1348499255254865e-05, 1.5468025594600476e-05, 0.9998446702957153]
OUT	OUT	NL-HaNA_1.04.02_1547_0026.jpg		[0.00013648535241372883, 4.937762059853412e-05, 1.9503198927850462e-05, 0.9997946619987488]
BEGIN	BEGIN	NL-HaNA_1.04.02_1547_0027.jpg	Aan d' Edele Hog Agbaa

Evaluating:  17%|█▋        | 4/23 [00:03<00:14,  1.28batch/s]

BEGIN	IN	NL-HaNA_1.04.02_1547_0084.jpg	hebbende konnen gesleeten werden, en; 'twelke haar	[0.9998729228973389, 6.188813858898357e-05, 4.6217315684771165e-05, 1.898204027384054e-05]
IN	IN	NL-HaNA_1.04.02_1547_0085.jpg	11602¼. lb= bengaalse sijden, teweeten; 4651¾ lb C	[0.0004650719929486513, 0.9992179870605469, 0.0003147173556499183, 2.272505753353471e-06]
IN	IN	NL-HaNA_1.04.02_1547_0086.jpg	54534. p:s mogta kleeden of Chiomas. —; 5697. —. t	[0.00026869200519286096, 0.999468982219696, 0.00026091045583598316, 1.4025550854057656e-06]
IN	IN	NL-HaNA_1.04.02_1547_0087.jpg	4. packen med gebleekte gerras. —; 14. d=os mallem	[0.00022760876163374633, 0.9994956254959106, 0.0002756057947408408, 1.2111519254176528e-06]
IN	IN	NL-HaNA_1.04.02_1547_0088.jpg	2210. lb Ceijlonse caneel; 6826:- Jappanse Camphur	[0.00022158713545650244, 0.999498724937439, 0.00027853541541844606, 1.1722523822754738e-06]
IN	IN	NL-HaNA_1.04.02_1547_0089.jpg	in heede van den overleeden capitain hend:k; Reijn	[0.000239886023337

Evaluating:  26%|██▌       | 6/23 [00:03<00:07,  2.40batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_1239_0447.jpg	N=a 1: Origineele brieff door d' h=ren Rijckloff v	[0.9998810291290283, 4.634555079974234e-05, 5.37945561518427e-05, 1.8800501493387856e-05]
END	END	NL-HaNA_1.04.02_1239_0448.jpg	casje den schipper ter; met opschrift aen d'Ede:; 	[0.00032319853198714554, 0.00012580906332004815, 0.9995444416999817, 6.615167421841761e-06]
BEGIN	BEGIN	NL-HaNA_1.04.02_1072_0529.jpg	Erntfeste wijse voorsinnighe seer discrete heeren.	[0.9998761415481567, 5.658320151269436e-05, 4.91199862153735e-05, 1.8194217773270793e-05]
IN	IN	NL-HaNA_1.04.02_1072_0530.jpg	houden want comen alle Jaren hier expresselijck me	[0.0008431379101239145, 0.9976963400840759, 0.001456424011848867, 4.008799805887975e-06]
END	END	NL-HaNA_1.04.02_1072_0531.jpg	alle mogelijcke middelen naersticht te doen, om so	[0.0001756351557560265, 0.00011641135643003508, 0.9997041821479797, 3.6990197713748785e-06]


Evaluating:  30%|███       | 7/23 [00:04<00:07,  2.02batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_2438_0087.jpg	Van Timor onder dato 7:' September A„o 1738: —.; d	[0.9998769760131836, 6.052240860299207e-05, 4.4100837840233e-05, 1.839783726609312e-05]
IN	IN	NL-HaNA_1.04.02_2438_0088.jpg	dog niet anders als onder behoorlijke verband schr	[0.00046211786684580147, 0.9991857409477234, 0.00034984314697794616, 2.207261104558711e-06]
IN	IN	NL-HaNA_1.04.02_2438_0089.jpg	Van Timor onder dato 7:' September A„o 170: —.; vo	[0.00039823708357289433, 0.9980746507644653, 0.0015246078837662935, 2.558035021138494e-06]
END	END	NL-HaNA_1.04.02_2438_0090.jpg	Van Timor onder dato 7:' September A„o 1738: —; /:	[0.00015663965314161032, 0.00014568098413292319, 0.9996938705444336, 3.856774583255174e-06]
OUT	OUT	NL-HaNA_1.04.02_1547_0603.jpg		[3.62904611392878e-05, 2.180961746489629e-05, 1.3792511708743405e-05, 0.9999281167984009]
OUT	OUT	NL-HaNA_1.04.02_1547_0604.jpg		[1.5617306416970678e-05, 6.994436716922792e-06, 5.732850695494562e-06, 0.9999716281890869]
OUT	OUT	NL-HaNA_1.04.0

Evaluating:  39%|███▉      | 9/23 [00:05<00:08,  1.72batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_1218_0951.jpg	Beschrijvinge den Bormosaansche; Docken, huijsen e	[0.9998676776885986, 6.654661410721019e-05, 4.618970342562534e-05, 1.9472785425023176e-05]
IN	IN	NL-HaNA_1.04.02_1218_0952.jpg	Tarrouquan.-; Takkais; Poalij - - - -; Balis; Baba	[0.0004196111985947937, 0.999277651309967, 0.00030072772642597556, 2.0250317902537063e-06]
IN	IN	NL-HaNA_1.04.02_1218_0953.jpg	dalivo; Goumol:..; Doubale Bajan; Ballebais; dubal	[0.0002543299051467329, 0.99947589635849, 0.00026847433764487505, 1.342199425380386e-06]
IN	IN	NL-HaNA_1.04.02_1218_0954.jpg	Katsileij Rebel.; . .; Todorouw; Barbaras . . .; C	[0.00022546703985426575, 0.9994694590568542, 0.00030388354207389057, 1.243108954440686e-06]
IN	IN	NL-HaNA_1.04.02_1218_0955.jpg	96. Dorpen.; Saradick.; Schitsiringh..; De Volgend	[0.0002316978498129174, 0.9994916915893555, 0.00027536010020412505, 1.2073046491423156e-06]
IN	IN	NL-HaNA_1.04.02_1218_0956.jpg	Equintouwante dorpen; Appassouangh; Taramimissan; 	[0.00026630665524

Evaluating:  48%|████▊     | 11/23 [00:06<00:06,  1.89batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_8526_0329.jpg	Sumatras West cust onder dato 6„' Jan: 1723.; Jan;	[0.9998862743377686, 5.509870243258774e-05, 4.048737173434347e-05, 1.806340333132539e-05]
IN	IN	NL-HaNA_1.04.02_8526_0330.jpg	Van Sumatras West uust onder dato 6:' jan: 1733.; 	[0.00047383917262777686, 0.999228835105896, 0.00029496592469513416, 2.349859641981311e-06]
IN	IN	NL-HaNA_1.04.02_8526_0331.jpg	O; Jan; Sumatras West cust onder dato 6„' Janu: 17	[0.0002546056639403105, 0.9994675517082214, 0.0002765245153568685, 1.3048694427197916e-06]
IN	IN	NL-HaNA_1.04.02_8526_0332.jpg	Van Sumatras Westcust onder dato 6„' Janu: 1733.; 	[0.00024465785827487707, 0.9994930028915405, 0.00026107302983291447, 1.1916645235032775e-06]
IN	IN	NL-HaNA_1.04.02_8526_0333.jpg	Sumatras West Cust onder dato 6„' Janu: 1733.; Van	[0.00023226681514643133, 0.9994962215423584, 0.0002703368372749537, 1.2037071428494528e-06]
IN	IN	NL-HaNA_1.04.02_8526_0334.jpg	Van Sumatras West Cust onder dato 6„' Janu: 1733.;	[0.000226237156

Evaluating:  57%|█████▋    | 13/23 [00:08<00:05,  1.78batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_3973_0739.jpg	Monster Retle; Van den; Vier Compagnien; des; Regi	[0.999751627445221, 0.0001526404230389744, 6.21325962129049e-05, 3.3646701922407374e-05]
IN	IN	NL-HaNA_1.04.02_3973_0740.jpg		[0.0005547282635234296, 0.9991912245750427, 0.00023758310999255627, 1.6462368876091205e-05]
IN	IN	NL-HaNA_1.04.02_3973_0741.jpg	ƒ2873; DVos. und Kleiner Staad; Geburts observa; V	[0.00022789867944084108, 0.9995496869087219, 0.00022106988762971014, 1.3496833162207622e-06]
IN	IN	NL-HaNA_1.04.02_3973_0742.jpg		[0.0002122478181263432, 0.9994423985481262, 0.00034211968886666, 3.251393536629621e-06]
IN	IN	NL-HaNA_1.04.02_3973_0743.jpg	16 Buxenmacker; 14 Corporaals; Ludwigsburg; Lorrac	[0.00023518933448940516, 0.9994947910308838, 0.00026866100961342454, 1.341317670267017e-06]
IN	IN	NL-HaNA_1.04.02_3973_0744.jpg	Friederich Maase; George Monold; „  Christoph Mart	[0.00023754093854222447, 0.9994704127311707, 0.0002907926682382822, 1.2689502000284847e-06]
IN	IN	NL-HaNA_1.04.02_3973

Evaluating:  61%|██████    | 14/23 [00:10<00:08,  1.11batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_1506_1034.jpg	-; d; E; 7.; decken; 8; 2; van de; 5; ƒ; 3; E; E; 	[0.9998776912689209, 6.0226044297451153e-05, 4.376927245175466e-05, 1.8321543393540196e-05]
IN	IN	NL-HaNA_1.04.02_1506_1035.jpg	binnewater; een lamme; D95; same; uijt; ies; Cas; 	[0.0004626619047485292, 0.9991897940635681, 0.0003453302779234946, 2.2273372906056466e-06]
IN	IN	NL-HaNA_1.04.02_1506_1036.jpg	30 vrs; o; Janor; 6; 5o; e; x; 116; E; 6.; 1.; :; 	[0.0003593015717342496, 0.9985028505325317, 0.0011355736060068011, 2.2701908619637834e-06]
END	END	NL-HaNA_1.04.02_1506_1037.jpg	k; „noortvelt; rogons; „1; rsame; uijt; eruijt; ƒ;	[0.00016161671373993158, 0.00016525697719771415, 0.999669075012207, 4.135435119678732e-06]


Evaluating:  65%|██████▌   | 15/23 [00:10<00:06,  1.20batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_1432_1553.jpg	van d' Eel Ed=le Heeren d' Respective; heeren Bewi	[0.9998749494552612, 6.184548692544922e-05, 4.444837031769566e-05, 1.8674552848096937e-05]
IN	IN	NL-HaNA_1.04.02_1432_1554.jpg	deductie der Complotterie quam te bemelk; deselve 	[0.0004607670125551522, 0.9991875290870667, 0.0003495339478831738, 2.1985129023960326e-06]
IN	IN	NL-HaNA_1.04.02_1432_1555.jpg	off tot verantwoording vant gene ik van; ter sijde	[0.0004050038696732372, 0.9980037808418274, 0.0015885611064732075, 2.5996307613240788e-06]
END	END	NL-HaNA_1.04.02_1432_1556.jpg	niet te importimeeren, desen eijndige; ende, naar 	[0.00015616961172781885, 0.0001445325033273548, 0.9996955394744873, 3.840385943476576e-06]


Evaluating:  70%|██████▉   | 16/23 [00:11<00:04,  1.46batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_2548_1310.jpg	Van Iavas o:t Cust onder Dato 25:' aug:s 1741.; Re	[0.9923582673072815, 0.003696547821164131, 0.0007193058845587075, 0.003225902561098337]
IN	IN	NL-HaNA_1.04.02_2548_1311.jpg	Jan; Iavaso:t Cust onder Dato 25 aug:s 1741.; afge	[0.20379382371902466, 0.7193674445152283, 0.07488834857940674, 0.0019503687508404255]
IN	IN	NL-HaNA_1.04.02_2548_1312.jpg	Van Java so:t Cust onder Dato 25:' ug:s 1741. /; s	[0.05997362360358238, 0.7786190509796143, 0.16088823974132538, 0.0005190082592889667]
IN	IN	NL-HaNA_1.04.02_2548_1313.jpg	Cust onder dato 25 aug:s 1741.; Van Javas; rondsoe	[0.031973011791706085, 0.5004207491874695, 0.46730318665504456, 0.0003030599909834564]
END	END	NL-HaNA_1.04.02_2548_1314.jpg	Van Java so:t Cust onder Dato 25:' ug:s 1741. /; s	[0.0007415168802253902, 0.003432970028370619, 0.9957906603813171, 3.4878510632552207e-05]


Evaluating:  74%|███████▍  | 17/23 [00:11<00:03,  1.56batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_1210_0135.jpg	Intleger van d' heer nabab alhier ofte aen sijn; E	[0.9998032450675964, 0.00011883557453984395, 4.821622496820055e-05, 2.963235237984918e-05]
IN	IN	NL-HaNA_1.04.02_1210_0136.jpg	eijndelijk met sijne mede gebrachte juweelen bij; 	[0.0004538159992080182, 0.9992780089378357, 0.0002657740842550993, 2.3844631868996657e-06]
IN	IN	NL-HaNA_1.04.02_1210_0137.jpg	Den koninck van Carnaticas; swager blijft tot noch	[0.0002505024604033679, 0.9994860887527466, 0.00026195382815785706, 1.446069973098929e-06]
IN	IN	NL-HaNA_1.04.02_1210_0138.jpg	madure ende tansjouwer heeft becomen hoe; sij luij	[0.00023582503490615636, 0.9995161294937134, 0.000246759329456836, 1.3433766525849933e-06]
IN	IN	NL-HaNA_1.04.02_1210_0139.jpg	Den koninck van Carnatica; die eenige juweelen 2 o	[0.00022716056264471263, 0.9995033740997314, 0.00026826595421880484, 1.2067547459082562e-06]
IN	IN	NL-HaNA_1.04.02_1210_0140.jpg	sulcx niet als schade sullen behaelen soo dat; dag	[0.000237882457

Evaluating:  78%|███████▊  | 18/23 [00:12<00:03,  1.66batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_1268_1095.jpg	Is nae aen roepingh van godes Heijsige naeme; vers	[0.9998844861984253, 5.652234540320933e-05, 4.075343531440012e-05, 1.816506846807897e-05]
IN	IN	NL-HaNA_1.04.02_1268_1096.jpg	beraemen op dat achtervolgens d'ordre Ind Indische	[0.00044357767910696566, 0.9992609620094299, 0.0002933080540969968, 2.1616494905174477e-06]
IN	IN	NL-HaNA_1.04.02_1268_1097.jpg	dwijl bevonden wort dat Eenige particoliere predic	[0.0002592344826553017, 0.9994524121284485, 0.0002870506723411381, 1.2969296676601516e-06]
IN	IN	NL-HaNA_1.04.02_1268_1098.jpg	d'wijl ondervonden wert dat sonder d' Inlandse tae	[0.00023611904180143028, 0.9995099306106567, 0.00025260806432925165, 1.2784214504790725e-06]
IN	IN	NL-HaNA_1.04.02_1268_1099.jpg	wij nader aen schrijvens Expatria vercregen hebben	[0.00022568582789972425, 0.9995027780532837, 0.00027032417710870504, 1.1946415270358557e-06]
IN	IN	NL-HaNA_1.04.02_1268_1100.jpg	naet vaderlandt ofte ook aen andre Conrespondeeren	[0.0002371471

Evaluating:  83%|████████▎ | 19/23 [00:12<00:01,  2.00batch/s]

BEGIN	BEGIN	NL-HaNA_1.04.02_1637_0144.jpg	Van Banda de dato 25=en maij Ao 1700; Van banda on	[0.9998792409896851, 5.466859511216171e-05, 4.8415960918646306e-05, 1.7667336578597315e-05]
IN	IN	NL-HaNA_1.04.02_1637_0145.jpg	Van banda onder dato 25„e maij 1700; van banda ond	[0.0008113057119771838, 0.9978908896446228, 0.0012938608415424824, 3.925445980712539e-06]
END	END	NL-HaNA_1.04.02_1637_0146.jpg	Van banda onder dato 25 maij 1700; Van banda onder	[0.00017904503329191357, 0.0001201531122205779, 0.9996970891952515, 3.7806139516760595e-06]


Evaluating:  87%|████████▋ | 20/23 [00:18<00:06,  2.24s/batch]

OUT	OUT	NL-HaNA_1.04.02_1547_0489.jpg		[0.00019818692817352712, 0.00012456582044251263, 3.611252395785414e-05, 0.9996411800384521]
OUT	OUT	NL-HaNA_1.04.02_1547_0490.jpg		[0.00021097721764817834, 7.218324026325718e-05, 3.716944047482684e-05, 0.999679684638977]
BEGIN	OUT	NL-HaNA_1.04.02_1547_0491.jpg	Generale Monster Rolle van Alle; des EComp. s zoon	[0.9883937835693359, 0.0002698586031328887, 0.00013766804477199912, 0.011198733933269978]
IN	OUT	NL-HaNA_1.04.02_1547_0492.jpg		[0.010556105524301529, 0.9363459348678589, 0.002419911790639162, 0.05067805200815201]
IN	OUT	NL-HaNA_1.04.02_1547_0493.jpg		[0.0006934978882782161, 0.9977061748504639, 0.0006513927364721894, 0.0009488693322055042]
IN	OUT	NL-HaNA_1.04.02_1547_0494.jpg		[0.00021360030223149806, 0.9994304776191711, 0.00030492982477881014, 5.093546860734932e-05]
IN	BEGIN	NL-HaNA_1.04.02_1547_0495.jpg		[0.00022827929933555424, 0.9994704127311707, 0.00028148863930255175, 1.9773126041400246e-05]
IN	IN	NL-HaNA_1.04.02_1547_0496.jpg	1729„ Pe

Evaluating:  91%|█████████▏| 21/23 [00:23<00:05,  2.85s/batch]

BEGIN	IN	NL-HaNA_1.04.02_1547_0553.jpg	. . . . . . . . soldaet. . . . . . . . . . . . . 1	[0.9951327443122864, 0.0022832206450402737, 0.000586451671551913, 0.0019975262694060802]
IN	IN	NL-HaNA_1.04.02_1547_0554.jpg	807. Persoonen P„r Transport; Naemen Toenaem en ge	[0.20820622146129608, 0.682439386844635, 0.10669571906328201, 0.0026586188469082117]
IN	IN	NL-HaNA_1.04.02_1547_0555.jpg	te staats presente qualiteijt, en winnende 7 't sc	[0.07234015315771103, 0.7741838693618774, 0.1525031477212906, 0.0009728281875140965]
IN	IN	NL-HaNA_1.04.02_1547_0556.jpg	838. Persoonen P„r Transport; Naemen Toenaemen geb	[0.05923589691519737, 0.7543302774429321, 0.18561431765556335, 0.0008195725968107581]
IN	IN	NL-HaNA_1.04.02_1547_0557.jpg	Plaatsd presente qualiteijt, en winnenden 't schip	[0.05670130252838135, 0.7429326176643372, 0.19957104325294495, 0.0007950470899231732]
IN	IN	NL-HaNA_1.04.02_1547_0558.jpg	Naemen Toenaem en geboorte Plaats; 861. Persoonen 	[0.05579344928264618, 0.7378417253494263, 0.

Evaluating:  96%|█████████▌| 22/23 [00:24<00:02,  2.41s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_1081_0045.jpg	Saterdach den 22 octob 1622; Alsoo de scheepen gro	[0.9946178793907166, 0.0024721540976315737, 0.0006270447629503906, 0.0022829489316791296]
IN	IN	NL-HaNA_1.04.02_1081_0046.jpg	Mede is goedt geuonden dat Sr. Cornelis van Neyenr	[0.21198444068431854, 0.6825459599494934, 0.10268929600715637, 0.0027802332770079374]
IN	IN	NL-HaNA_1.04.02_1081_0047.jpg	Sondach den 6 novemb. 1622„; -; Alsoo goedt geuond	[0.07224947959184647, 0.7775067090988159, 0.14925150573253632, 0.0009923357283696532]
IN	IN	NL-HaNA_1.04.02_1081_0048.jpg	Achter de Eijlanden aende zuijtzijde vande baeij; 	[0.05883494019508362, 0.7577874064445496, 0.1825484335422516, 0.0008292103884741664]
IN	IN	NL-HaNA_1.04.02_1081_0049.jpg	Herwaerts spoeije om onse Instructie ende goet voo	[0.05623100325465202, 0.7462183237075806, 0.19674964249134064, 0.0008010139572434127]
IN	IN	NL-HaNA_1.04.02_1081_0050.jpg	Saterdach den 19 Novemb 1622; Alsoo op gisteren go	[0.05531591176986694, 0.740911662578582

Evaluating: 100%|██████████| 23/23 [00:25<00:00,  1.09s/batch]

BEGIN	BEGIN	NL-HaNA_1.04.02_1647_0632.jpg	Van Macassar Anno 1701—; Vervolgens quam by den He	[0.9998655319213867, 6.913366087246686e-05, 4.472258660825901e-05, 2.0562354620778933e-05]
IN	IN	NL-HaNA_1.04.02_1647_0633.jpg	Van Macassar Anno 1701; Van Macassar Anno 1701.; a	[0.0004996329662390053, 0.9992245435714722, 0.0002733964647632092, 2.3458528630726505e-06]
IN	IN	NL-HaNA_1.04.02_1647_0634.jpg	Van Macassar Anno 1701; Van Macassar Anno 1701; va	[0.00025200346135534346, 0.9994682669639587, 0.00027840209077112377, 1.3643231113746879e-06]
IN	IN	NL-HaNA_1.04.02_1647_0635.jpg	Van Macassar Anno 1701; Van Macassar Anno 1701; ve	[0.00023022093228064477, 0.9994932413101196, 0.0002751544816419482, 1.252047582056548e-06]
IN	IN	NL-HaNA_1.04.02_1647_0636.jpg	Van Macassar Anno 1701; Van Macassar A„o 1701; het	[0.00023508621961809695, 0.9994787573814392, 0.0002849498705472797, 1.2003226856904803e-06]
IN	IN	NL-HaNA_1.04.02_1647_0637.jpg	Van Macassar Anno 1701; Van Macassar A„o 1701.; mo	[0.00023212775




In [20]:
import csv

from document_segmentation.pagexml.datamodel.label import Label

# Tab-separated report on stdout: one row per metric, one column per label.
writer = csv.DictWriter(
    sys.stdout, fieldnames=["Metric"] + [label.name for label in Label], delimiter="\t"
)

writer.writeheader()

for metric in precision, recall, f1:
    # These rows list one score per label, so the metric must not be averaged.
    assert metric.average is None

    scores: list[float] = metric.compute().tolist()
    writer.writerow(
        {"Metric": metric.__class__.__name__}
        | {label.name: f"{score:.4f}" for label, score in zip(Label, scores)}
    )

# Accuracy is reported as a single aggregated number, so it must use an averaging mode.
assert accuracy.average is not None

# Use the accuracy object's own class name here. The loop variable `metric` still points
# at `f1` after the loop, which used to mislabel this line as "MulticlassF1Score".
print(
    f"{accuracy.__class__.__name__} ({accuracy.average} average):\t{accuracy.compute().item():.4f}",
    file=sys.stdout,
)

Metric	BEGIN	IN	END	OUT
MulticlassPrecision	0.8261	0.9863	0.9130	0.9375
MulticlassRecall	0.9048	0.9830	1.0000	0.7895
MulticlassF1Score	0.8636	0.9847	0.9545	0.8571
MulticlassF1Score (micro average):	0.9690
