In [1]:
%load_ext autoreload

In [2]:
%autoreload now

In [3]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-08137aa2-e69b-5e74-8390-7997329b1336"
os.environ["WANDB_API_KEY"] = ""

# Download and convert data

In [4]:
from document_segmentation.pagexml.annotations.renate_analysis import (
    RenateAnalysis,
    RenateAnalysisInv,
)
from document_segmentation.pagexml.annotations.sheet import Sheet
from document_segmentation.settings import RENATE_ANALYSIS_DIR, RENATE_ANALYSIS_SHEETS

sheets: list[Sheet] = [RenateAnalysis()] + [
    RenateAnalysisInv(sheet) for sheet in RENATE_ANALYSIS_SHEETS
]

In [5]:
N = None

for sheet in sheets:
    sheet.download(RENATE_ANALYSIS_DIR, N)

# Load Data

In [6]:
TRAINING_DATA = 0.8

In [7]:
from document_segmentation.model.dataset import DocumentDataset

dataset: DocumentDataset = DocumentDataset.from_dir(RENATE_ANALYSIS_DIR)
dataset.shuffle()

len(dataset)

Reading JSON files: 100%|██████████| 54/54 [00:00<00:00, 133.18file/s]


54

In [8]:
dataset._class_counts()

Counter({<Label.IN: 1>: 1207,
         <Label.END: 2>: 53,
         <Label.BEGIN: 0>: 52,
         <Label.OUT: 3>: 52})

In [9]:
dataset.class_weights()

[1.0188679245283019, 0.04470198675496689, 1.0, 1.0188679245283019]

In [10]:
training_data, test_data = dataset.split(TRAINING_DATA)

In [11]:
training_data._class_counts()

Counter({<Label.IN: 1>: 1032,
         <Label.BEGIN: 0>: 42,
         <Label.END: 2>: 42,
         <Label.OUT: 3>: 41})

In [12]:
test_data._class_counts()

Counter({<Label.IN: 1>: 175,
         <Label.END: 2>: 11,
         <Label.OUT: 3>: 11,
         <Label.BEGIN: 0>: 10})

# Train Model

In [20]:
BATCH_SIZE = 128
EPOCHS = 50
WEIGHTS = dataset.class_weights()

In [14]:
%autoreload now

In [15]:
from document_segmentation.model.page_sequence_tagger import PageSequenceTagger

tagger = PageSequenceTagger()

In [16]:
tagger._device

'mps'

In [17]:
tagger

PageSequenceTagger(
  (_page_embedding): PageEmbedding(
    (_region_model): RegionEmbeddingSentenceTransformer(
      (_transformer_model): SentenceTransformer(
        (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: RobertaModel 
        (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
      )
      (_region_type): Embedding(9, 16)
      (_linear): Linear(in_features=784, out_features=512, bias=True)
    )
    (_rnn): LSTM(512, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
    (_linear): Linear(in_features=512, out_features=256, bias=True)
  )
  (_rnn): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
  (_linear): Linear(in_features=512, out_feature

In [21]:
tagger.train_(
    training_data, test_data, epochs=EPOCHS, batch_size=BATCH_SIZE, weights=WEIGHTS
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training: 100%|██████████| 44/44 [00:02<00:00, 16.84batch/s]
Evaluating: 100%|██████████| 11/11 [00:00<00:00, 56.15batch/s]
        [1],
        [2]]) classes have zero instances in both the predictions and the ground truth labels. Precision is still logged as zero.
Training: 100%|██████████| 44/44 [00:02<00:00, 20.64batch/s]
Evaluating: 100%|██████████| 11/11 [00:00<00:00, 63.69batch/s]
        [1],
        [2]]) classes have zero instances in both the predictions and the ground truth labels. Precision is still logged as zero.
Training: 100%|██████████| 44/44 [00:02<00:00, 21.52batch/s]
Evaluating: 100%|██████████| 11/11 [00:00<00:00, 62.92batch/s]
        [1],
        [2]]) classes have zero instances in both the predictions and the ground truth labels. Precision is still logged as zero.
Training: 100%|██████████| 44/44 [00:02<00:00, 20.94batch/s]
Evaluating: 100%|██████████| 11/11 [00:00<00:00, 66.44batch/s]
        [1],
        [2]]) classes have zero instances in both the predicti

KeyboardInterrupt: 

In [19]:
import torch

with open("page_sequence_tagger.pt", "xb") as f:
    torch.save(tagger, f)

FileExistsError: [Errno 17] File exists: 'page_sequence_tagger.pt'

# Evaluate Model

In [None]:
import sys

precision, recall, f1, accuracy = tagger.eval_(test_data, BATCH_SIZE, sys.stdout)

Predicted	Actual	Page ID	Text	Scores


Evaluating:   2%|▏         | 1/57 [00:00<00:07,  7.76batch/s]

OUT	BEGIN	NL-HaNA_1.04.02_1150_0469.jpg	Journael ofte daerregister noopende; tgepasseerde 	[6.5575097435432994e-12, 2.960837477972156e-11, 1.9213875282475534e-11, 1.0]
OUT	IN	NL-HaNA_1.04.02_1150_0470.jpg	Jaus deo Aij 8=e Augusto din 1643 Inden wegh naer 	[4.194419334427324e-13, 1.6945055384895436e-12, 9.229505180952113e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1150_0471.jpg	194 e; Jaus deo Aij 17 Augusto A 1643 Inde Stadt C	[2.6644168100130317e-13, 1.0244267456896439e-12, 5.074889618782163e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1150_0472.jpg	Paus deo Aij 20 Augusto Anno 1643 Inde Stadt Coasb	[2.388203838247366e-13, 8.787256946390931e-13, 4.3208340551326163e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1150_0473.jpg	195 5; Daus deo Aij 21 Augusto Anno 1643 In stadt 	[2.567378818221777e-13, 9.156702089271884e-13, 4.703749325804529e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1150_0474.jpg	Conia; Laus deo Dij 23 Augusto Anno 1643 Inde stad	[2.4582395038823257e-13, 8.892839806554087e-13, 4.232843188369667e-13, 1.0]
OUT	IN	NL-Ha

Evaluating:  12%|█▏        | 7/57 [00:00<00:01, 34.52batch/s]

OUT	END	NL-HaNA_1.04.02_1557_0964.jpg	Van Sumatras W=t Cust onder dato 43 ' Maart A=o 16	[9.05413678059519e-12, 2.109048352627596e-11, 8.618516490754846e-12, 1.0]
OUT	BEGIN	NL-HaNA_1.04.02_1557_0146.jpg	Van Palembang Den 10=en April A:o 1694.; Register 	[1.0332519562172848e-11, 4.1162469266442514e-11, 2.4568217252274316e-11, 1.0]
OUT	END	NL-HaNA_1.04.02_1557_0147.jpg	Van Palembang Den 10=en April A:o 1694.; N:o 9. fa	[1.3365639324469658e-11, 3.3816466987746097e-11, 1.490828641093067e-11, 1.0]
OUT	BEGIN	NL-HaNA_1.04.02_1060_0435.jpg	Alsoo het schip der Goes, als t'Jacht Cleijn Enckh	[6.399621284292056e-12, 2.7995189499918638e-11, 1.893270436259531e-11, 1.0]
OUT	IN	NL-HaNA_1.04.02_1060_0436.jpg	Op heden den 5 feb. @ 1614, door beroep vanden E. 	[3.8340828450479647e-13, 1.521072981111593e-12, 7.537530170333151e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1060_0437.jpg	@ 1615. Dondergeschreven te soonen; Soo Is bij d' 	[2.764716895090752e-13, 9.172644198016111e-13, 4.60060971523707e-13, 1.0]
OUT	IN	NL

Evaluating:  21%|██        | 12/57 [00:00<00:01, 34.86batch/s]

OUT	BEGIN	NL-HaNA_1.04.02_1557_1408.jpg	Van Bengale onder dato 18„en Januarij Ao 1944.; Al	[1.0332342620378299e-11, 4.115980126173646e-11, 2.4565828538047896e-11, 1.0]
OUT	END	NL-HaNA_1.04.02_1557_1409.jpg	van; Bengale den 15=en Februarij A:o 1694.; heere 	[1.3365230797091066e-11, 3.38157592205679e-11, 1.490828641093067e-11, 1.0]
OUT	BEGIN	NL-HaNA_1.04.02_1557_1450.jpg	Van Bengale onder dato 15=en Februarij A=o 1694.; 	[1.0330116102796882e-11, 4.114237769914375e-11, 2.4550089391950358e-11, 1.0]
OUT	END	NL-HaNA_1.04.02_1557_1451.jpg	Van Bengale onder dato 15=en Februarij A=o 1694; '	[1.3368672488467404e-11, 3.3804084531574574e-11, 1.4910048889982264e-11, 1.0]
OUT	BEGIN	NL-HaNA_1.04.02_1557_0376.jpg	Van Siam onder dato 16=en Januarij A„o 1694.; Inst	[6.835868242988097e-12, 2.897942823210098e-11, 2.4495832445792232e-11, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0377.jpg	Van Siam onder dato 16:en Januarij A:o 1694.; Leve	[4.26347948915759e-13, 1.5603372543679384e-12, 9.606835726233554e-13, 1.0]
OUT

Evaluating:  32%|███▏      | 18/57 [00:00<00:00, 41.29batch/s]

OUT	BEGIN	NL-HaNA_1.04.02_1557_0941.jpg	Van Sumatras W„t Cust onder dato 25:' Januarij A:o	[6.812075643153337e-12, 2.9787311506268566e-11, 2.4060467160302856e-11, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0942.jpg	Van Sumatras Wt Cust onder dato 25„e Januarij A:o 	[4.2875365801123277e-13, 1.6408211594987065e-12, 8.738653247200578e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0943.jpg	Van SumatrasW:t ust onder dato 25:e' Januarij Pnno	[3.051414643965439e-13, 9.893356750154125e-13, 5.017210063205935e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0944.jpg	Van Sumatras W„t Cust onder dato 25:' Januarij A=o	[4.613482989732076e-13, 1.51103316057416e-12, 7.411929685459395e-13, 1.0]
OUT	END	NL-HaNA_1.04.02_1557_0945.jpg	Van Sumatras W„t Cust onder dato 25:e' Januarij A=	[9.64289672417129e-12, 2.1110203862750865e-11, 8.57349521238282e-12, 1.0]
OUT	BEGIN	NL-HaNA_1.04.02_1557_1602.jpg	Van Bengale onder dato 29=en october A=o 1694.; on	[6.399121250250106e-12, 2.8266791685105375e-11, 1.9870653733544685e-11, 1.0]
OUT	IN	NL-H

Evaluating:  40%|████      | 23/57 [00:00<00:00, 35.50batch/s]

OUT	BEGIN	NL-HaNA_1.04.02_1557_1085.jpg	Van Sumatras W:t Cust onder dato 24:en Ileij a:o 1	[6.959361906200279e-12, 3.2064455257607705e-11, 2.630948237047903e-11, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_1086.jpg	Van Sumatras W„t Cust onder dato 21=en Meij a=o 16	[4.3828710192400644e-13, 1.8300040302565712e-12, 1.2336786323410842e-12, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_1087.jpg	Van Sumatras W:t Cust onder dato 21=en Meij 1694.;	[2.930439365559506e-13, 1.1221722336085538e-12, 5.698619202389177e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_1088.jpg	Van Sumatras W:t Cust onder dato 241:en Meij a:o 1	[2.480361293959177e-13, 9.293805959195711e-13, 4.586013643489983e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_1089.jpg	Van Sumatras Wt Cust onder dato 21=en Iei ao 1699.	[2.4739406486937177e-13, 9.288029330020708e-13, 5.175792520267786e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_1090.jpg	Van Sumatras w:t Cust onder dato 21=en Meij A:o 16	[2.5203875987124535e-13, 8.986791887911905e-13, 4.898631413702281e-13, 1.0]
OUT	IN	NL-Ha

Evaluating:  47%|████▋     | 27/57 [00:00<00:00, 36.48batch/s]

OUT	IN	NL-HaNA_1.04.02_1557_1430.jpg	Van Bengale Den 15=en Februarij A=o 1694; 't smppl	[2.6002049363475777e-13, 9.315599507064842e-13, 4.328951205747472e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_1431.jpg	Van Bengale Den 14=en Februarij A=o 1694.; ons adv	[2.958853051893834e-13, 9.798444607772572e-13, 4.3279854526623307e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_1432.jpg	Van Bengale Den 15=en Februarij A=o 1694.; arriven	[5.278660538191038e-13, 1.5594922271947032e-12, 6.500260941490199e-13, 1.0]
OUT	END	NL-HaNA_1.04.02_1557_1433.jpg	Van Bengale Den 15=en Februarij A:o 1694.; de noor	[1.005517891172758e-11, 2.089173972708025e-11, 8.441476684328819e-12, 1.0]
OUT	BEGIN	NL-HaNA_1.04.02_1557_0946.jpg	Van Sumatas Wt Cust onder dato 25: Januarij A=o 16	[6.760985001019737e-12, 2.8030452958738294e-11, 2.3316112929272492e-11, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0947.jpg	Van Sumatas Wt Cust onder dato 25=en Januarij A=o 	[4.2343532109453963e-13, 1.5478461611387329e-12, 9.283636142817797e-13, 1.0]
OUT	IN	NL-H

Evaluating:  56%|█████▌    | 32/57 [00:00<00:00, 38.78batch/s]

OUT	IN	NL-HaNA_1.04.02_1547_0270.jpg	'twelk doende een groote affschrick in de; herten 	[2.845500798962647e-13, 9.72230325760326e-13, 4.830971235027237e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1547_0271.jpg	den markt vermeestert, en konnen ook op dien; voer	[4.3907363639003605e-13, 1.448508522329528e-12, 6.765883968536113e-13, 1.0]
OUT	END	NL-HaNA_1.04.02_1547_0272.jpg	behulpsaamh. t te bewijsen, op dat dien Corl in; g	[9.5685482107144e-12, 2.1250870854694348e-11, 8.832361057808935e-12, 1.0]
OUT	BEGIN	NL-HaNA_1.04.02_1557_0289.jpg	Van Palembang onder dato 21=en October a:o 694; ul	[6.5088867455542765e-12, 2.7491441415006257e-11, 2.012767383319236e-11, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0290.jpg	october A=o 1694.; Van Palembang Den 21=en; missiv	[4.174243416199541e-13, 1.6329129888525973e-12, 8.420663797324701e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0291.jpg	Van Palembang Den 21=en October A=o 1694.; gering 	[2.962399206149491e-13, 9.877875427333205e-13, 5.080012474047158e-13, 1.0]
OUT	IN	NL-HaNA_1

Evaluating:  65%|██████▍   | 37/57 [00:01<00:00, 33.18batch/s]

OUT	BEGIN	NL-HaNA_1.04.02_8526_0329.jpg	Sumatras West cust onder dato 6„' Jan: 1723.; Jan;	[6.6376808563473055e-12, 3.039591148223941e-11, 2.1654225287881523e-11, 1.0]
OUT	IN	NL-HaNA_1.04.02_8526_0330.jpg	Van Sumatras West uust onder dato 6:' jan: 1733.; 	[3.904739758527759e-13, 1.5876501502365414e-12, 7.948321905162925e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_8526_0331.jpg	O; Jan; Sumatras West cust onder dato 6„' Janu: 17	[2.8028821668149573e-13, 9.410410818644355e-13, 4.82632922342574e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_8526_0332.jpg	Van Sumatras Westcust onder dato 6„' Janu: 1733.; 	[2.6028148820272934e-13, 9.288029330020708e-13, 4.3668166955709853e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_8526_0333.jpg	Sumatras West Cust onder dato 6„' Janu: 1733.; Van	[2.625491512565914e-13, 8.840649566577152e-13, 4.3475276547202957e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_8526_0334.jpg	Van Sumatras West Cust onder dato 6„' Janu: 1733.;	[2.433115557989862e-13, 8.870596856884461e-13, 4.257734030795046e-13, 1.0]
OUT	IN	NL-HaN

Evaluating:  77%|███████▋  | 44/57 [00:01<00:00, 40.97batch/s]

OUT	OUT	NL-HaNA_1.04.02_1547_0360.jpg		[7.550426429914214e-12, 3.3771992147269e-11, 2.963746956186064e-11, 1.0]
OUT	OUT	NL-HaNA_1.04.02_1547_0361.jpg		[4.324890868611514e-13, 1.6880086738113587e-12, 9.72148793756955e-13, 1.0]
OUT	OUT	NL-HaNA_1.04.02_1547_0362.jpg		[2.769815355806865e-13, 1.0498110627338297e-12, 5.668266420469359e-13, 1.0]
OUT	BEGIN	NL-HaNA_1.04.02_1547_0363.jpg	Jnstructie voor den onder„; Sonsbeek, vertreckende	[2.5043872141014556e-13, 8.979458344417213e-13, 4.632431591100605e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1547_0364.jpg	op te wakkeren sijn, soo meede te Ervaaren wat; Ef	[2.554410133935592e-13, 8.572214661196897e-13, 4.200808266779238e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1547_0365.jpg	Van ons gering vermoogen, voort te setten onder an	[2.4171111076207175e-13, 8.752748417543976e-13, 4.096747897516795e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1547_0366.jpg	aanslaagen der picaten en roovers, en des noods; s	[2.459233717274495e-13, 8.502553587412531e-13, 3.9904516323241435e-13, 1.0]
O

Evaluating:  86%|████████▌ | 49/57 [00:01<00:00, 36.34batch/s]

OUT	BEGIN	NL-HaNA_1.04.02_1557_1270.jpg	Van Sumatras W=t Cust onder dato 4:e Decemb: a:o 1	[6.396497480992691e-12, 2.7917125208776206e-11, 1.7314543895863288e-11, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_1271.jpg	Van Sumatras W=t Cust, onder dato 4=en December A=	[4.57153412347644e-13, 1.675517905842805e-12, 8.164809973953968e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_1272.jpg	Van Sumatras W=t Cust onder dato 4:en December A:o	[5.146721807416932e-13, 1.6682600396597524e-12, 7.660976345490178e-13, 1.0]
OUT	END	NL-HaNA_1.04.02_1557_1273.jpg	Van Sumatras W=t Cust onder dato 4:e December A:o 	[9.022296798555374e-12, 2.107343466395406e-11, 8.621081279414078e-12, 1.0]
OUT	BEGIN	NL-HaNA_1.04.02_1557_0272.jpg	Van Palembang den 21=en October A=o 1694.; to; Pan	[7.0676888820619954e-12, 3.1239347914047144e-11, 2.256257333743683e-11, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0273.jpg	Van Palembung onder dato 21=en October A=o 1694—; 	[4.072782421645632e-13, 1.5487113544723763e-12, 8.944246710461401e-13, 1.0]
OUT	IN	NL-

Evaluating: 100%|██████████| 57/57 [00:01<00:00, 37.77batch/s]

OUT	BEGIN	NL-HaNA_1.04.02_1557_0553.jpg	Van Japan Den 7=en; November Ao 1694; „heeft voorg	[6.54278020650878e-12, 2.7607463190526538e-11, 1.946106470418485e-11, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0554.jpg	Van Japan onder dato 7=en November A=o; 1694.; ove	[4.0716716565199207e-13, 1.5482979481840076e-12, 8.206277454445021e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0555.jpg	Van Japan onder dato 7=en November A:o 1694; te ve	[2.8265847236592923e-13, 9.883246564895698e-13, 5.063374307508195e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0556.jpg	Van Japan onder dato 7=en November A=o 1694; uijt 	[2.5287660420508784e-13, 9.299657398320615e-13, 4.4650042127156175e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0557.jpg	Van Japan onder dato 7=en; November A:o 1694; een 	[2.594309858085231e-13, 9.080142779165079e-13, 4.479813059139054e-13, 1.0]
OUT	IN	NL-HaNA_1.04.02_1557_0558.jpg	Van Japan onder dato 7=en November A=o 1694.; De r	[2.619189316387799e-13, 8.899236599371751e-13, 4.3783675144661027e-13, 1.0]
OUT	IN	NL-HaN




In [None]:
import csv
import sys

from document_segmentation.pagexml.datamodel.label import Label

writer = csv.DictWriter(
    sys.stdout, fieldnames=["Metric"] + [label.name for label in Label], delimiter="\t"
)

writer.writeheader()

for metric in precision, recall, f1:
    assert metric.average is None

    scores: list[float] = metric.compute().tolist()
    writer.writerow(
        {"Metric": metric.__class__.__name__}
        | {label.name: f"{score:.4f}" for label, score in zip(Label, scores)}
    )

assert accuracy.average is not None

print(
    f"{accuracy.__class__.__name__} ({accuracy.average} average):\t{accuracy.compute().item():.4f}",
    file=sys.stdout,
)

        [1],
        [2]]) classes have zero instances in both the predictions and the ground truth labels. Precision is still logged as zero.


Metric	BEGIN	IN	END	OUT
MulticlassPrecision	0.0000	0.0000	0.0000	0.0304
MulticlassRecall	0.0000	0.0000	0.0000	1.0000
MulticlassF1Score	0.0000	0.0000	0.0000	0.0590
MulticlassAccuracy (micro average):	0.0304
