## Setup

In [None]:
!pip install transformers==4.48.3 datasets seqeval evaluate

Collecting transformers==4.48.3
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading transformers-4.48.3-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (8

In [None]:
from google.colab import userdata
import wandb

hf_token = userdata.get('huggingface')
!huggingface-cli login --token {hf_token}
wandb.login(key=userdata.get('wandb'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `GbatToken` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `GbatToken`


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdjelassimedhani[0m ([33mdjelassimedhani-isimg-tn[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Imports

In [None]:
import re
import json
import xml.etree.ElementTree as ET
import os
import random
from google.colab import drive
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments
from datasets import ClassLabel, Dataset
import evaluate
import numpy as np

dataset_path = "/gdrive/MyDrive/invoice_dataset"
base_model = 'xlm-roberta-base'

## Mount drive

In [None]:
drive.mount('/gdrive')

Mounted at /gdrive


## Set data

In [None]:
if not os.path.exists(dataset_path):
    !git clone https://github.com/mouadhamri/invoice_dataset.git {dataset_path}
    print("✅ Dataset cloned.")
else:
    print("📁 Dataset already exists, skipping clone.")

📁 Dataset already exists, skipping clone.


In [None]:
label_map = {
    "supplier": "SUPPLIER",
    "invoice_number": "INVOICE_NO",
    "po_number": "PO_NO",
    "invoice_date": "DATE",
    "invoice_date_due": "DUE_DATE",
    "address": "ADDRESS",
    "total_untaxed": "TOTAL_UNTAXED",
    "tax_amount": "TAX_AMOUNT",
    "total_amount": "TOTAL_AMOUNT"
}

def tokenize(text):
    return re.findall(r'\w+|[^\w\s]', text, re.UNICODE)

def tag_tokens(tokens, label):
    tags = []
    for i, token in enumerate(tokens):
        prefix = "B-" if i == 0 else "I-"
        tags.append(f"{prefix}{label}")
    return tags

def parse_xml_to_entry(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()

    all_tokens = []
    all_tags = []

    for tag, label in label_map.items():
        elem = root.find(tag)
        if elem is not None and elem.text:
            tokens = tokenize(elem.text.strip())
            tags = tag_tokens(tokens, label)
            all_tokens.extend(tokens)
            all_tags.extend(tags)

    for line in root.findall(".//line/description"):
        if line is not None and line.text:
            tokens = tokenize(line.text.strip())
            tags = tag_tokens(tokens, "PRODUCT_DESC")
            all_tokens.extend(tokens)
            all_tags.extend(tags)


    if len(all_tokens) == len(all_tags):
        return {
            "tokens": all_tokens,
            "labels": all_tags,
        }
    return None

def browse_and_convert(base_folder):
    output_file = os.path.join(base_folder, "converted_data.jsonl")
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for i in range(1, 10):
            xml_folder = os.path.join(base_folder, f"invoice_dataset_model_{i}", "xml")
            for filename in os.listdir(xml_folder):
                if filename.endswith(".xml"):
                    full_path = os.path.join(xml_folder, filename)
                    entry = parse_xml_to_entry(full_path)
                    if entry:
                        outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")

In [None]:
if not os.path.exists(f'{dataset_path}/converted_data.jsonl'):
  browse_and_convert(f'{dataset_path}')
else:
  print('Data already converted')

In [None]:
file_path = f'{dataset_path}/converted_data.jsonl'

with open(file_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i == 5:
            break
        entry = json.loads(line)
        print(json.dumps(entry, indent=2, ensure_ascii=False))
        print("="*50)


{
  "tokens": [
    "Marc",
    "Demo",
    "FA02",
    "/",
    "2015",
    "/",
    "020059",
    "BC06263",
    "2015",
    "-",
    "02",
    "-",
    "02",
    "2015",
    "-",
    "02",
    "-",
    "02",
    "3575",
    "Buena",
    "Vista",
    "Avenue",
    "Eugene",
    "COR",
    "97401",
    "États",
    "Unis",
    "75974",
    ".",
    "0",
    "6029",
    ".",
    "3",
    "6029",
    ".",
    "3",
    "Service",
    "Client",
    "(",
    "Heures",
    "Prépayées",
    ")",
    "Flipover",
    "Combinaison",
    "de",
    "bureau",
    "Boîte",
    "de",
    "rangement",
    "Tiroir",
    "noir"
  ],
  "labels": [
    "B-SUPPLIER",
    "I-SUPPLIER",
    "B-INVOICE_NO",
    "I-INVOICE_NO",
    "I-INVOICE_NO",
    "I-INVOICE_NO",
    "I-INVOICE_NO",
    "B-PO_NO",
    "B-DATE",
    "I-DATE",
    "I-DATE",
    "I-DATE",
    "I-DATE",
    "B-DUE_DATE",
    "I-DUE_DATE",
    "I-DUE_DATE",
    "I-DUE_DATE",
    "I-DUE_DATE",
    "B-ADDRESS",
    "I-ADDRESS",
    "I-ADDRESS",


In [None]:
with open(f'{dataset_path}/converted_data.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

random.seed(42)
random.shuffle(data)

train_ratio = 0.85
val_ratio = 0.15

n = len(data)
train_end = int(train_ratio * n)
val_end = train_end + int(val_ratio * n)

train_data = data[:train_end]
val_data = data[train_end:]

def save_jsonl(data_split, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for entry in data_split:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

save_jsonl(train_data, f'{dataset_path}/train.jsonl')
save_jsonl(val_data, f'{dataset_path}/valid.jsonl')

print(f"Saved: {len(train_data)} train, {len(val_data)} valid")


Saved: 765 train, 135 valid


In [None]:
ds_train = Dataset.from_list(train_data)
ds_val = Dataset.from_list(val_data)

In [None]:
label_list = ClassLabel(
    num_classes=21,
    names=[
        "O",
        "B-SUPPLIER", "I-SUPPLIER",
        "B-INVOICE_NO", "I-INVOICE_NO",
        "B-PO_NO", "I-PO_NO",
        "B-DATE", "I-DATE",
        "B-DUE_DATE", "I-DUE_DATE",
        "B-ADDRESS", "I-ADDRESS",
        "B-TOTAL_UNTAXED", "I-TOTAL_UNTAXED",
        "B-TAX_AMOUNT", "I-TAX_AMOUNT",
        "B-TOTAL_AMOUNT", "I-TOTAL_AMOUNT",
        "B-PRODUCT_DESC", "I-PRODUCT_DESC"
    ]
)
id2label = {i:label_list.int2str(i) for i in range(label_list.num_classes)}
label2id = {c:label_list.str2int(c) for c in label_list.names}
model = AutoModelForTokenClassification.from_pretrained(base_model,
                                                        id2label=id2label,
                                                        label2id=label2id,
                                                        num_labels=len(id2label),
                                                        )
tokenizer = AutoTokenizer.from_pretrained(base_model)
data_collator = DataCollatorForTokenClassification(tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
ds_train = ds_train.map(
    lambda x: tokenizer(x["tokens"], truncation=True, is_split_into_words=True)
)
ds_val = ds_val.map(
    lambda x: tokenizer(x["tokens"], truncation=True, is_split_into_words=True)
)

ds_train = ds_train.map(lambda y: {"labels": label_list.str2int(y["labels"])})
ds_val = ds_val.map(lambda y: {"labels": label_list.str2int(y["labels"])})

Map:   0%|          | 0/765 [00:00<?, ? examples/s]

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

Map:   0%|          | 0/765 [00:00<?, ? examples/s]

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

In [None]:
def align_labels(examples):
    labels = []
    for i, label in enumerate(examples["labels"]):
        if isinstance(label[0], str):
            label = [label2id[l] for l in label]
        labels.append(label)
    return {"labels": labels}

ds_train = ds_train.map(align_labels, batched=True)
ds_val = ds_val.map(align_labels, batched=True)

Map:   0%|          | 0/765 [00:00<?, ? examples/s]

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

In [None]:
ds_train.push_to_hub("Gbat/ds_train")
ds_val.push_to_hub("Gbat/ds_val")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/396 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/395 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


## Model evaluation

In [None]:
metric = evaluate.load("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list.names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list.names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return metric.compute(predictions=true_predictions, references=true_labels)

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

## Initialize trainer

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_overall_f1",
)



In [None]:
trainer = Trainer(
    model=model,
    args = training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


## Train the model

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Address,Date,Due Date,Invoice No,Po No,Product Desc,Supplier,Tax Amount,Total Amount,Total Untaxed,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,1.7527,0.603364,"{'precision': 0.2733812949640288, 'recall': 0.2814814814814815, 'f1': 0.2773722627737226, 'number': 135}","{'precision': 0.4473684210526316, 'recall': 0.5037037037037037, 'f1': 0.4738675958188153, 'number': 135}","{'precision': 0.46218487394957986, 'recall': 0.4074074074074074, 'f1': 0.4330708661417323, 'number': 135}","{'precision': 0.45185185185185184, 'recall': 0.45185185185185184, 'f1': 0.45185185185185184, 'number': 135}","{'precision': 0.4714285714285714, 'recall': 0.24444444444444444, 'f1': 0.32195121951219513, 'number': 135}","{'precision': 0.05517241379310345, 'recall': 0.009768009768009768, 'f1': 0.016597510373443983, 'number': 819}","{'precision': 0.9777777777777777, 'recall': 0.9777777777777777, 'f1': 0.9777777777777777, 'number': 135}","{'precision': 0.37037037037037035, 'recall': 0.37037037037037035, 'f1': 0.37037037037037035, 'number': 135}","{'precision': 0.362962962962963, 'recall': 0.362962962962963, 'f1': 0.36296296296296304, 'number': 135}","{'precision': 0.21641791044776118, 'recall': 0.21481481481481482, 'f1': 0.21561338289962825, 'number': 135}",0.402617,0.257129,0.313831,0.772203
2,0.5906,0.421068,"{'precision': 0.5540540540540541, 'recall': 0.6074074074074074, 'f1': 0.5795053003533568, 'number': 135}","{'precision': 0.7279411764705882, 'recall': 0.7333333333333333, 'f1': 0.7306273062730627, 'number': 135}","{'precision': 0.6557377049180327, 'recall': 0.5925925925925926, 'f1': 0.622568093385214, 'number': 135}","{'precision': 0.9777777777777777, 'recall': 0.9777777777777777, 'f1': 0.9777777777777777, 'number': 135}","{'precision': 1.0, 'recall': 0.9703703703703703, 'f1': 0.9849624060150376, 'number': 135}","{'precision': 0.07913669064748201, 'recall': 0.013431013431013432, 'f1': 0.02296450939457203, 'number': 819}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 135}","{'precision': 0.5704225352112676, 'recall': 0.6, 'f1': 0.5848375451263538, 'number': 135}","{'precision': 0.7555555555555555, 'recall': 0.7555555555555555, 'f1': 0.7555555555555555, 'number': 135}","{'precision': 0.5407407407407407, 'recall': 0.5407407407407407, 'f1': 0.5407407407407407, 'number': 135}",0.681885,0.455261,0.545991,0.84584
3,0.4448,0.352237,"{'precision': 0.7692307692307693, 'recall': 0.8148148148148148, 'f1': 0.7913669064748201, 'number': 135}","{'precision': 0.8740740740740741, 'recall': 0.8740740740740741, 'f1': 0.8740740740740742, 'number': 135}","{'precision': 0.8346456692913385, 'recall': 0.7851851851851852, 'f1': 0.8091603053435116, 'number': 135}","{'precision': 0.9925925925925926, 'recall': 0.9925925925925926, 'f1': 0.9925925925925926, 'number': 135}","{'precision': 1.0, 'recall': 0.9925925925925926, 'f1': 0.996282527881041, 'number': 135}","{'precision': 0.07913669064748201, 'recall': 0.013431013431013432, 'f1': 0.02296450939457203, 'number': 819}","{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 135}","{'precision': 0.8029197080291971, 'recall': 0.8148148148148148, 'f1': 0.8088235294117646, 'number': 135}","{'precision': 0.8222222222222222, 'recall': 0.8222222222222222, 'f1': 0.8222222222222222, 'number': 135}","{'precision': 0.7555555555555555, 'recall': 0.7555555555555555, 'f1': 0.7555555555555555, 'number': 135}",0.790406,0.526549,0.632045,0.870292


Trainer is attempting to log a value of "{'precision': 0.2733812949640288, 'recall': 0.2814814814814815, 'f1': 0.2773722627737226, 'number': 135}" of type <class 'dict'> for key "eval/ADDRESS" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.4473684210526316, 'recall': 0.5037037037037037, 'f1': 0.4738675958188153, 'number': 135}" of type <class 'dict'> for key "eval/DATE" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.46218487394957986, 'recall': 0.4074074074074074, 'f1': 0.4330708661417323, 'number': 135}" of type <class 'dict'> for key "eval/DUE_DATE" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.45185185185185184, 'recall': 0.4518518

TrainOutput(global_step=288, training_loss=0.7676014734639062, metrics={'train_runtime': 191.7048, 'train_samples_per_second': 11.972, 'train_steps_per_second': 1.502, 'total_flos': 128620922361696.0, 'train_loss': 0.7676014734639062, 'epoch': 3.0})

In [None]:
eval_results = trainer.evaluate()

Trainer is attempting to log a value of "{'precision': 0.7692307692307693, 'recall': 0.8148148148148148, 'f1': 0.7913669064748201, 'number': 135}" of type <class 'dict'> for key "eval/ADDRESS" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8740740740740741, 'recall': 0.8740740740740741, 'f1': 0.8740740740740742, 'number': 135}" of type <class 'dict'> for key "eval/DATE" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8346456692913385, 'recall': 0.7851851851851852, 'f1': 0.8091603053435116, 'number': 135}" of type <class 'dict'> for key "eval/DUE_DATE" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9925925925925926, 'recall': 0.992592592

In [None]:
eval_results

{'eval_loss': 0.35223740339279175,
 'eval_ADDRESS': {'precision': 0.7692307692307693,
  'recall': 0.8148148148148148,
  'f1': 0.7913669064748201,
  'number': 135},
 'eval_DATE': {'precision': 0.8740740740740741,
  'recall': 0.8740740740740741,
  'f1': 0.8740740740740742,
  'number': 135},
 'eval_DUE_DATE': {'precision': 0.8346456692913385,
  'recall': 0.7851851851851852,
  'f1': 0.8091603053435116,
  'number': 135},
 'eval_INVOICE_NO': {'precision': 0.9925925925925926,
  'recall': 0.9925925925925926,
  'f1': 0.9925925925925926,
  'number': 135},
 'eval_PO_NO': {'precision': 1.0,
  'recall': 0.9925925925925926,
  'f1': 0.996282527881041,
  'number': 135},
 'eval_PRODUCT_DESC': {'precision': 0.07913669064748201,
  'recall': 0.013431013431013432,
  'f1': 0.02296450939457203,
  'number': 819},
 'eval_SUPPLIER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 135},
 'eval_TAX_AMOUNT': {'precision': 0.8029197080291971,
  'recall': 0.8148148148148148,
  'f1': 0.8088235294117646,
  'num

In [None]:
trainer.model.save_pretrained("./models/ner-model")
trainer.tokenizer.save_pretrained("./models/ner-model")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('./models/ner-model/tokenizer_config.json',
 './models/ner-model/special_tokens_map.json',
 './models/ner-model/sentencepiece.bpe.model',
 './models/ner-model/added_tokens.json',
 './models/ner-model/tokenizer.json')

In [None]:
# !cp -r ./models/ner-model /gdrive/MyDrive/