In [53]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
import os

# Pin the GPU/MIG slice only when the launcher has not already chosen one, so
# this machine-specific UUID never overrides a scheduler-provided value.
os.environ.setdefault(
    "CUDA_VISIBLE_DEVICES", "MIG-08137aa2-e69b-5e74-8390-7997329b1336"
)

# The recorded training output shows the huggingface/tokenizers warning about
# forking after parallelism was used; the warning asks for this variable to be
# set explicitly. It must be set before the tokenizer is first used, hence here.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Download and convert data

In [55]:
# Reload edited modules right away so the data-loading imports below pick up
# the latest code (the comment sits on its own line: a line magic treats the
# rest of its line as its argument).
%autoreload now

In [56]:
from document_segmentation.pagexml.annotations.generale_missiven import GeneraleMissiven
from document_segmentation.pagexml.datamodel.inventory import Inventory
from document_segmentation.settings import GENERALE_MISSIVEN_SHEET

# N is forwarded to all_annotated_inventories; the recorded progress bar counts
# 5 inventories, yet the recorded run ends up with 16 Inventory objects
# (12 training + 4 validation) -- presumably one per annotated document; confirm.
N = 5

# Materialise the result into a list so it can be measured, shuffled and
# sliced later. skip_errors=True is passed through unchanged.
inventories: list[Inventory] = [
    *GeneraleMissiven(GENERALE_MISSIVEN_SHEET).all_annotated_inventories(
        N, skip_errors=True
    )
]

Loading Inventories:   0%|          | 0/5 [00:00<?, ?inventory/s]

Loading Inventories: 100%|██████████| 5/5 [00:01<00:00,  2.88inventory/s]


# Split Data into Training and Validation Sets

In [57]:
# Reload edited modules right away before the split cells run.
%autoreload now

In [58]:
# Fraction of the loaded inventories assigned to the training split; the
# remainder becomes the validation split.
TRAINING_DATA = 0.8

In [59]:
import random

# Seed the global RNG so the shuffle, and therefore the split, is reproducible.
random.seed(0)

# Shuffle a copy instead of `inventories` itself: shuffling in place made this
# cell non-idempotent, because re-running it permuted an already-permuted list
# and silently produced a different train/validation split.
shuffled_inventories: list[Inventory] = list(inventories)
random.shuffle(shuffled_inventories)

# Index of the first validation item; int() truncates towards zero.
split = int(len(shuffled_inventories) * TRAINING_DATA)

training_data: list[Inventory] = shuffled_inventories[:split]
validation_data: list[Inventory] = shuffled_inventories[split:]

In [60]:
# Inspect which inventories ended up in the training split.
training_data

[Inventory(inv_nr=1072, inventory_part=, pages=12 pages),
 Inventory(inv_nr=1073, inventory_part=, pages=20 pages),
 Inventory(inv_nr=1070, inventory_part=, pages=31 pages),
 Inventory(inv_nr=1068, inventory_part=, pages=35 pages),
 Inventory(inv_nr=1072, inventory_part=, pages=16 pages),
 Inventory(inv_nr=1068, inventory_part=, pages=24 pages),
 Inventory(inv_nr=1068, inventory_part=, pages=7 pages),
 Inventory(inv_nr=1072, inventory_part=, pages=2 pages),
 Inventory(inv_nr=1073, inventory_part=, pages=11 pages),
 Inventory(inv_nr=1071, inventory_part=, pages=21 pages),
 Inventory(inv_nr=1071, inventory_part=, pages=21 pages),
 Inventory(inv_nr=1070, inventory_part=, pages=9 pages)]

In [61]:
# Inspect which inventories ended up in the validation split.
validation_data

[Inventory(inv_nr=1068, inventory_part=, pages=19 pages),
 Inventory(inv_nr=1070, inventory_part=, pages=52 pages),
 Inventory(inv_nr=1073, inventory_part=, pages=10 pages),
 Inventory(inv_nr=1072, inventory_part=, pages=3 pages)]

# Train Model

In [62]:
# Number of epochs passed to tagger.train_ in the training cell below.
EPOCHS = 3

In [63]:
# Reload edited modules right away so the model import below uses current code.
%autoreload now

In [64]:
from document_segmentation.model.page_sequence_tagger import PageSequenceTagger

# Build the tagger with its default constructor arguments.
tagger = PageSequenceTagger()

In [65]:
# Debug check of the device the tagger selected. `_device` is a private
# attribute, so this may break if the class internals change.
tagger._device

'mps'

In [66]:
# Display the tagger's repr to inspect its layer structure.
tagger

PageSequenceTagger(
  (_page_embedding): PageEmbedding(
    (_region_model): RegionEmbeddingSentenceTransformer(
      (_transformer_model): SentenceTransformer(
        (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: RobertaModel 
        (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
      )
      (_region_type): Embedding(10, 16)
      (_linear): Linear(in_features=784, out_features=512, bias=True)
    )
    (_rnn): LSTM(512, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
    (_linear): Linear(in_features=512, out_features=256, bias=True)
  )
  (_rnn): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
  (_linear): Linear(in_features=512, out_featur

In [67]:
# Train for EPOCHS epochs; the recorded output shows a training pass followed
# by an evaluation pass on validation_data for each epoch. shuffle=True is
# forwarded to train_ (presumably reshuffles the training inventories -- confirm).
tagger.train_(training_data, validation_data, epochs=EPOCHS, shuffle=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training: 100%|██████████| 12/12 [00:09<00:00,  1.24inventory/s]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.26inventory/s]
        [4]]) classes have zero instances in both the predictions and the ground truth labels. Precision is still logged as zero.
Training: 100%|██████████| 12/12 [00:00<00:00, 25.46inventory/s]
Evaluating: 100%|██████████| 4/4 [00:00<00:00, 32.70inventory/s]
        [4]]) classes have zero instances in both the predictions and the ground truth labels. Precision is still logged as zero.
Training: 100%|██████████| 12/12 [00:00<00:00, 22.76inventory/s]
Evaluating: 100%|██████████| 4/4 [00:00<00:00, 24.95inventory/s]
        [4]]) classes have zero instances in both the predictions and the ground truth labels. Precision is still logged as zero.


VBox(children=(Label(value='0.089 MB of 0.106 MB uploaded\r'), FloatProgress(value=0.8375127621845277, max=1.0…



0,1
MulticlassAccuracy,█▄▁
epoch,▁▅█
inventory length,█▃▅▂▆▃▁▅▂▅▇▄▃▇▅█▅▆▂▅▂▄▁▃▁▅█▄▅▂▃▆▅▃▂▇
loss,█▆▄▆▇▄▆▃▄▃▂▃▅▂▂▅▂▆▂▁▂▁▂▂▁▁▅▁▁▂▁▅▁▂▂▁

0,1
MulticlassAccuracy,0.85714
epoch,2.0
inventory length,31.0
loss,5.287


# Evaluate Model

In [68]:
import sys

# eval_ returns four metric objects plus a results object; the cells below
# print the metrics as a table and display `results`.
precision, recall, f1, accuracy, results = tagger.eval_(validation_data)

Evaluating: 100%|██████████| 4/4 [00:00<00:00, 14.83inventory/s]


In [69]:
import csv

from document_segmentation.pagexml.datamodel.label import Label

# Column layout: the metric name followed by one column per Label member.
label_names = [label.name for label in Label]
writer = csv.DictWriter(
    sys.stdout, fieldnames=["Metric", *label_names], delimiter="\t"
)

writer.writeheader()

# One tab-separated row per metric. Each metric must be unaveraged
# (average is None) so that compute() can be paired with the labels one-to-one.
for metric in (precision, recall, f1):
    assert metric.average is None

    scores: list[float] = metric.compute().tolist()

    row = {"Metric": metric.__class__.__name__}
    row.update((label.name, f"{score:.4f}") for label, score in zip(Label, scores))
    writer.writerow(row)

# Accuracy is reported as a single averaged number on its own line.
assert accuracy.average is not None

accuracy_value = accuracy.compute().item()
print(
    f"{accuracy.__class__.__name__} ({accuracy.average} average):\t{accuracy_value:.4f}"
)

        [4]]) classes have zero instances in both the predictions and the ground truth labels. Precision is still logged as zero.


Metric	UNK	BEGIN	IN	END	OUT
MulticlassPrecision	0.0000	0.3636	0.9697	0.5714	0.0000
MulticlassRecall	0.0000	0.8000	0.8649	0.8000	0.0000
MulticlassF1Score	0.0000	0.5000	0.9143	0.6667	0.0000
MulticlassAccuracy (micro average):	0.8571


In [70]:
# Display the results returned by eval_; the recorded output lists predicted
# and actual label, page ID, text and scores for each row.
results

Unnamed: 0,Predicted,Actual,Page ID,Text,Scores
0,BEGIN,BEGIN,NL-HaNA_1.04.02_1068_0021.jpg,427 Cockels foly ende; schepen Int landt heeft...,"[5.69479088881053e-05, 0.9961917400360107, 0.0..."
1,BEGIN,IN,NL-HaNA_1.04.02_1068_0022.jpg,maer Een schip sel; Molucor banda; Ende Amboin...,"[0.00012934765254613012, 0.9722077250480652, 0..."
2,BEGIN,IN,NL-HaNA_1.04.02_1068_0023.jpg,Nota opt gebreck van; timmerluyden; overichgew...,"[0.00035468023270368576, 0.574393093585968, 0...."
3,IN,END,NL-HaNA_1.04.02_1068_0024.jpg,daeroverdicht dat haer Capt; dat met pratyck v...,"[0.0002640753809828311, 0.12895315885543823, 0..."
4,IN,BEGIN,NL-HaNA_1.04.02_1068_0025.jpg,Erntfeste Wijse voorsinnige seer Discrete; mau...,"[0.00012983818305656314, 0.03972013294696808, ..."
...,...,...,...,...,...
79,END,IN,NL-HaNA_1.04.02_1073_0311.jpg,de swaere garnisoenen die nu niet doen dan de ...,"[0.00046679293154738843, 0.0036710628774017096..."
80,END,END,NL-HaNA_1.04.02_1073_0312.jpg,"van d' oude, ende opmaeckinge vande nieuwe leg...","[0.00012853230873588473, 9.528837108518928e-05..."
81,BEGIN,BEGIN,NL-HaNA_1.04.02_1072_0737.jpg,zedert myne Jonchte met Schip Walcheren syn hi...,"[8.282011549454182e-05, 0.996722400188446, 0.0..."
82,BEGIN,IN,NL-HaNA_1.04.02_1072_0738.jpg,wel syn te becomen tot penvelatie Harwaerts ae...,"[0.0025710472837090492, 0.6415371298789978, 0...."
