## Import Libraries

In [2]:
import json
import datasets
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import pipeline

In [3]:
import os
# Set the WANDB_DISABLED flag before any Trainer is created.
# NOTE(review): the recorded training output warns that this environment
# variable is deprecated and recommends `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"

## Load Dataset

In [4]:
# Alternative location of the dataset file:
# DATA_FILE = '../data/invoice_ner_dataset.jsonl'
DATA_FILE = './invoice_ner_dataset_testing.jsonl'

# The loader's result is indexed by split name; all rows sit under 'train'.
raw_splits = load_dataset("json", data_files=DATA_FILE)
dataset = raw_splits['train']


In [5]:
dataset

Dataset({
    features: ['tokens', 'ner_tags', 'id', 'file_name'],
    num_rows: 1515
})

## Preprocessing Dataset

In [6]:
# Two chained splits sharing one seed: the first carves off the test set,
# the second takes the validation rows out of what is left for training.
SPLIT_SEED = 42
HOLDOUT_FRACTION = 0.2

tmp = dataset.train_test_split(test_size=HOLDOUT_FRACTION, seed=SPLIT_SEED)
train_valid = tmp['train'].train_test_split(test_size=HOLDOUT_FRACTION, seed=SPLIT_SEED)

train_dataset, valid_dataset, test_dataset = (
    train_valid['train'],
    train_valid['test'],
    tmp['test'],
)

In [7]:
train_dataset

Dataset({
    features: ['tokens', 'ner_tags', 'id', 'file_name'],
    num_rows: 969
})

In [8]:
# Build the tag vocabulary from the whole dataset rather than only the
# training split: tokenize_and_align_labels looks every tag up in label2id
# for the validation and test splits too, so a tag that happens to be absent
# from the training rows would otherwise raise a KeyError there.
unique_labels = {tag for row in dataset['ner_tags'] for tag in row}

# Sorting makes the tag -> id assignment deterministic across runs.
label_list = sorted(unique_labels)
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [9]:
label2id

{'B-CLIENT_ADDRESS': 0,
 'B-CLIENT_NAME': 1,
 'B-INVOICE_DATE': 2,
 'B-INVOICE_NUMBER': 3,
 'B-ITEM_DESC': 4,
 'B-NET_WORTH': 5,
 'B-PRICE': 6,
 'B-QUANTITY': 7,
 'B-SELLER_NAME': 8,
 'B-TOTAL': 9,
 'B-VAT': 10,
 'I-CLIENT_ADDRESS': 11,
 'I-CLIENT_NAME': 12,
 'I-ITEM_DESC': 13,
 'I-NET_WORTH': 14,
 'I-PRICE': 15,
 'I-SELLER_NAME': 16,
 'I-TOTAL': 17,
 'I-VAT': 18,
 'O': 19}

In [10]:
id2label

{0: 'B-CLIENT_ADDRESS',
 1: 'B-CLIENT_NAME',
 2: 'B-INVOICE_DATE',
 3: 'B-INVOICE_NUMBER',
 4: 'B-ITEM_DESC',
 5: 'B-NET_WORTH',
 6: 'B-PRICE',
 7: 'B-QUANTITY',
 8: 'B-SELLER_NAME',
 9: 'B-TOTAL',
 10: 'B-VAT',
 11: 'I-CLIENT_ADDRESS',
 12: 'I-CLIENT_NAME',
 13: 'I-ITEM_DESC',
 14: 'I-NET_WORTH',
 15: 'I-PRICE',
 16: 'I-SELLER_NAME',
 17: 'I-TOTAL',
 18: 'I-VAT',
 19: 'O'}

## Tokenizer

In [11]:
# Checkpoint name shared by the tokenizer and by every model loaded in this notebook.
MODEL_NAME = 'google-bert/bert-base-cased'
# Alternative checkpoint:
# MODEL_NAME = 'microsoft/deberta-v3-base'
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

In [12]:
# Token-classification model sized to label_list, with the tag <-> id maps
# stored in its config.
# NOTE(review): MODEL is not referenced by any later cell shown here; the
# Trainers below build their own models through model_init. Confirm whether
# this eager load is still needed.
MODEL = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split words and attach per-token label ids.

    Args:
        examples: batch with "tokens" (one list of words per row) and
            "ner_tags" (one list of string tags per row).

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        ids per row. A position gets -100 when it has no source word, when its
        word index has no matching tag, or when it is a continuation piece of
        a word whose tag does not start with "I-". Every other position gets
        ``label2id`` of its word's tag.
    """
    encoded = TOKENIZER(examples["tokens"], truncation=True, is_split_into_words=True)

    all_label_ids = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        tag_count = len(word_tags)
        row_label_ids = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            target = -100  # default: position is ignored
            if word is not None and word < tag_count:
                tag = word_tags[word]
                # The first piece of a word always carries the tag; later
                # pieces carry it only for "I-" tags.
                if word != last_word or tag.startswith("I-"):
                    target = label2id[tag]
            row_label_ids.append(target)
            last_word = word
        all_label_ids.append(row_label_ids)

    encoded["labels"] = all_label_ids
    return encoded

In [14]:
# Apply the same batched tokenize-and-align step to each of the three splits.
tokenized_train, tokenized_valid, tokenized_test = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

In [15]:
# Metric object whose compute() is called inside compute_metrics.
seqeval = evaluate.load('seqeval')

In [16]:
def compute_metrics(p):
    """Turn a (predictions, labels) pair into overall seqeval scores.

    Args:
        p: two-item sequence ``(predictions, labels)``; ``predictions`` is
            reduced with ``argmax`` over its last axis.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", taken from the
        matching ``overall_*`` entries of the seqeval result.
    """
    scores_per_class, label_ids = p
    predicted_ids = scores_per_class.argmax(axis=-1)

    gold_tags, predicted_tags = [], []
    for predicted_row, gold_row in zip(predicted_ids, label_ids):
        # Positions labelled -100 are dropped from both sequences.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        gold_tags.append([id2label[gold] for _, gold in kept])
        predicted_tags.append([id2label[pred] for pred, _ in kept])

    results = seqeval.compute(predictions=predicted_tags, references=gold_tags)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [17]:
import optuna
from transformers import DataCollatorForTokenClassification

# Batch collator handed to every Trainer in this notebook, built around the
# same tokenizer that is used for preprocessing.
data_collator = DataCollatorForTokenClassification(tokenizer=TOKENIZER)

def hyperparameter_tuning_optuna(tokenized_train, tokenized_valid, n_trials=20,
                                 seed=42, keep_trial_checkpoints=False):
    """Search fine-tuning hyperparameters with Optuna, maximising validation F1.

    Every trial fine-tunes a freshly loaded ``MODEL_NAME`` checkpoint on
    ``tokenized_train`` and is scored by the ``eval_f1`` value returned by
    ``Trainer.evaluate`` on ``tokenized_valid``.

    Args:
        tokenized_train: dataset passed to ``Trainer`` as ``train_dataset``.
        tokenized_valid: dataset passed to ``Trainer`` as ``eval_dataset``.
        n_trials: number of Optuna trials to run.
        seed: seed given to the Optuna sampler and to ``TrainingArguments`` so
            that a rerun proposes the same trials.
        keep_trial_checkpoints: when False (default) the ``./tmp_trial_<n>``
            directory of each trial is deleted once the trial has been scored.

    Returns:
        tuple: ``(study.best_params, study)``.
    """
    import shutil  # local import: only needed for per-trial checkpoint cleanup

    def model_init():
        # Fresh pretrained weights for every trial.
        return AutoModelForTokenClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(label_list),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )

    def objective(trial):
        # NOTE(review): in the recorded run the trial with
        # learning_rate ~= 6.4e-4 finished with F1 = 0.0; consider lowering
        # the upper bound of this range.
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
        batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
        num_epochs = trial.suggest_int("num_train_epochs", 2, 6)
        weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
        warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.3)

        trial_dir = f"./tmp_trial_{trial.number}"
        training_args = TrainingArguments(
            output_dir=trial_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_epochs,
            weight_decay=weight_decay,
            warmup_ratio=warmup_ratio,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            logging_dir=None,
            # The string "none" is what disables reporting integrations;
            # passing None falls back to the library default instead.
            report_to="none",
            dataloader_pin_memory=False,
            seed=seed,
        )

        # NOTE(review): the recorded output shows a warning pointing at this
        # call, presumably about the `tokenizer=` argument - confirm against
        # the installed transformers version before renaming it.
        trainer = Trainer(
            model_init=model_init,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_valid,
            tokenizer=TOKENIZER,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        try:
            trainer.train()
            eval_result = trainer.evaluate()
        finally:
            # Checkpoints are only needed while the trial runs; without this
            # every trial leaves a full checkpoint directory on disk.
            if not keep_trial_checkpoints:
                shutil.rmtree(trial_dir, ignore_errors=True)

        return eval_result["eval_f1"]

    # A seeded sampler makes the sequence of proposed trials reproducible.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective, n_trials=n_trials)

    print("OPTUNA RESULTS")
    print(f"Best trial: {study.best_trial.number}")
    print(f"Best F1 score: {study.best_value:.4f}")
    print(f"Best params: {study.best_params}")

    return study.best_params, study

In [18]:
# Run the search with 10 trials instead of the function's default of 20.
# The recorded run below took roughly 16 minutes (20:03 -> 20:19).
best_params_optuna, study = hyperparameter_tuning_optuna(
    tokenized_train, tokenized_valid, n_trials=10
)

[I 2025-09-12 20:03:24,037] A new study created in memory with name: no-name-665ab870-36b2-4ea8-8ba3-dafb3ac8befd
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.083897,0.800278,0.869565,0.833484,0.973428
2,No log,0.066595,0.866667,0.938752,0.90127,0.978967
3,No log,0.055314,0.892086,0.937618,0.914286,0.981883


[I 2025-09-12 20:04:43,089] Trial 0 finished with value: 0.9142857142857144 and parameters: {'learning_rate': 0.00018169775059983996, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.07436348443701805, 'warmup_ratio': 0.01221689575246696}. Best is trial 0 with value: 0.9142857142857144.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.114467,0.776311,0.844991,0.809196,0.968008
2,No log,0.072872,0.862566,0.927788,0.893989,0.977906
3,No log,0.064045,0.87779,0.936862,0.906364,0.97932


[I 2025-09-12 20:06:00,229] Trial 1 finished with value: 0.9063643013899049 and parameters: {'learning_rate': 5.456526544696671e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.28679051327152755, 'warmup_ratio': 0.07144776535138031}. Best is trial 0 with value: 0.9142857142857144.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias'

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.134711,0.764072,0.83138,0.796306,0.964326
2,No log,0.077934,0.8662,0.934972,0.899273,0.976286
3,No log,0.065623,0.881551,0.928544,0.904437,0.977818
4,No log,0.06221,0.884533,0.938374,0.910659,0.979143
5,0.261400,0.061813,0.896626,0.944423,0.919904,0.98041


  _warn_prf(average, modifier, msg_start, len(result))


[I 2025-09-12 20:08:11,337] Trial 2 finished with value: 0.9199042533603388 and parameters: {'learning_rate': 1.969979451014911e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'weight_decay': 0.008878432408507374, 'warmup_ratio': 0.13454680728578822}. Best is trial 2 with value: 0.9199042533603388.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias'

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.075293,0.854973,0.893762,0.873937,0.975726
2,No log,0.055108,0.893841,0.932703,0.912858,0.981795
3,No log,0.051983,0.896639,0.937996,0.916851,0.982767


[I 2025-09-12 20:09:31,023] Trial 3 finished with value: 0.9168514412416852 and parameters: {'learning_rate': 0.00011195674393457663, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.21726475899385367, 'warmup_ratio': 0.1541574751665541}. Best is trial 2 with value: 0.9199042533603388.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias',

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.101441,0.834176,0.872968,0.853131,0.970571
2,No log,0.060397,0.886249,0.930813,0.907985,0.980911


[I 2025-09-12 20:10:24,467] Trial 4 finished with value: 0.9079845104185875 and parameters: {'learning_rate': 0.0003003457439007372, 'per_device_train_batch_size': 16, 'num_train_epochs': 2, 'weight_decay': 0.036176343885972115, 'warmup_ratio': 0.2941792596084894}. Best is trial 2 with value: 0.9199042533603388.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias'

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.114137,0.737833,0.859735,0.794133,0.96518
2,No log,0.06595,0.854844,0.924008,0.888081,0.97876


[I 2025-09-12 20:11:18,779] Trial 5 finished with value: 0.8880813953488372 and parameters: {'learning_rate': 0.00046339243553897546, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.18423015753254587, 'warmup_ratio': 0.07195445776889865}. Best is trial 2 with value: 0.9199042533603388.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias'

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.746154,0.0,0.0,0.0,0.870471
2,No log,0.744345,0.0,0.0,0.0,0.870471
3,No log,0.744899,0.0,0.0,0.0,0.870471
4,No log,0.741903,0.0,0.0,0.0,0.870471
5,0.808000,0.74142,0.0,0.0,0.0,0.870471


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[I 2025-09-12 20:13:27,926] Trial 6 finished with value: 0.0 and parameters: {'learning_rate': 0.0006389428212558982, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'weight_decay': 0.08700273744842954, 'warmup_ratio': 0.00022478834174675109}. Best is trial 2 with value: 0.9199042533603388.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.091339,0.82576,0.9138,0.867552,0.972397
2,No log,0.056501,0.893563,0.923629,0.908347,0.981471
3,No log,0.057898,0.872616,0.916824,0.894174,0.979909
4,No log,0.055042,0.903238,0.938752,0.920653,0.982855
5,0.178500,0.054379,0.898041,0.935728,0.916497,0.983032
6,0.178500,0.054358,0.90409,0.944423,0.923817,0.983768


[I 2025-09-12 20:16:01,653] Trial 7 finished with value: 0.9238165680473372 and parameters: {'learning_rate': 0.00010331202260173673, 'per_device_train_batch_size': 8, 'num_train_epochs': 6, 'weight_decay': 0.06379565292976189, 'warmup_ratio': 0.1784694315218944}. Best is trial 7 with value: 0.9238165680473372.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias',

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.249958,0.609848,0.608696,0.609272,0.933954
2,No log,0.125302,0.793656,0.879773,0.834499,0.968538
3,No log,0.101098,0.814208,0.901323,0.855554,0.97172


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2025-09-12 20:17:18,161] Trial 8 finished with value: 0.855553561815898 and parameters: {'learning_rate': 5.2050515856247006e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.011932147629118229, 'warmup_ratio': 0.1401146494663357}. Best is trial 7 with value: 0.9238165680473372.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias'

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.114986,0.732091,0.861626,0.791594,0.965858
2,No log,0.102333,0.802847,0.85293,0.827131,0.96901
3,No log,0.073252,0.842365,0.905104,0.872608,0.977287
4,No log,0.0644,0.862522,0.917958,0.889377,0.978584
5,No log,0.060571,0.890011,0.933081,0.911037,0.981176


[I 2025-09-12 20:19:25,233] Trial 9 finished with value: 0.9110372831303064 and parameters: {'learning_rate': 0.0005666264161156186, 'per_device_train_batch_size': 16, 'num_train_epochs': 5, 'weight_decay': 0.14948176255717752, 'warmup_ratio': 0.2532284593470268}. Best is trial 7 with value: 0.9238165680473372.


OPTUNA RESULTS
Best trial: 7
Best F1 score: 0.9238
Best params: {'learning_rate': 0.00010331202260173673, 'per_device_train_batch_size': 8, 'num_train_epochs': 6, 'weight_decay': 0.06379565292976189, 'warmup_ratio': 0.1784694315218944}


In [19]:
# Retrain from the pretrained checkpoint with the best hyperparameters found
# by the Optuna study, keeping the same evaluation and checkpoint policy the
# trials used.
best_params = study.best_params

training_args = TrainingArguments(
    output_dir="./final_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    # Mirror the tuning setup, where the eval batch size followed the train batch size.
    per_device_eval_batch_size=best_params["per_device_train_batch_size"],
    num_train_epochs=best_params["num_train_epochs"],
    weight_decay=best_params["weight_decay"],
    warmup_ratio=best_params["warmup_ratio"],
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Explicitly disable every reporting integration; leaving this unset
    # falls back to the library default rather than to "no reporting".
    report_to="none",
)

trainer = Trainer(
    # model_init (rather than a prebuilt model) gives this run freshly loaded
    # pretrained weights.
    model_init=lambda: AutoModelForTokenClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    ),
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=TOKENIZER,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.091339,0.82576,0.9138,0.867552,0.972397
2,No log,0.056636,0.893207,0.929679,0.911078,0.981854
3,No log,0.058019,0.880419,0.921361,0.900425,0.979762
4,No log,0.05246,0.903156,0.941399,0.921881,0.983444
5,0.178900,0.052422,0.903994,0.932703,0.918124,0.983474
6,0.178900,0.051904,0.906023,0.944045,0.924644,0.983798


TrainOutput(global_step=732, training_loss=0.13221080511645542, metrics={'train_runtime': 155.8002, 'train_samples_per_second': 37.317, 'train_steps_per_second': 4.698, 'total_flos': 1266637985385600.0, 'train_loss': 0.13221080511645542, 'epoch': 6.0})

In [20]:
# Score the model held by `trainer` on its default eval_dataset, i.e. the
# validation split it was constructed with.
metrics_valid = trainer.evaluate()
print("VALID:", metrics_valid)

VALID: {'eval_loss': 0.05190393328666687, 'eval_precision': 0.9060232220609579, 'eval_recall': 0.9440453686200379, 'eval_f1': 0.924643584521385, 'eval_accuracy': 0.9837977965003241, 'eval_runtime': 1.9043, 'eval_samples_per_second': 127.606, 'eval_steps_per_second': 16.279, 'epoch': 6.0}


In [21]:
# Score the same trainer on the test split, which was not passed to the
# Trainer as either train_dataset or eval_dataset.
metrics_test = trainer.evaluate(eval_dataset=tokenized_test)   
print("TEST :", metrics_test)

TEST : {'eval_loss': 0.05229390785098076, 'eval_precision': 0.9072615923009624, 'eval_recall': 0.9432989690721649, 'eval_f1': 0.924929389029285, 'eval_accuracy': 0.9839322369818088, 'eval_runtime': 2.4446, 'eval_samples_per_second': 123.948, 'eval_steps_per_second': 15.545, 'epoch': 6.0}


In [22]:
# Write the trainer's model and the tokenizer files into the same directory
# (presumably so the folder can later be loaded as one unit - confirm with
# the inference code).
trainer.save_model("./ner_model_best_last")
TOKENIZER.save_pretrained("./ner_model_best_last")

('./ner_model_best_last/tokenizer_config.json',
 './ner_model_best_last/special_tokens_map.json',
 './ner_model_best_last/vocab.txt',
 './ner_model_best_last/added_tokens.json',
 './ner_model_best_last/tokenizer.json')