## Import Library

In [1]:
import json
import datasets
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import pipeline

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

## Load Dataset

In [3]:
# dataset = load_dataset("json", data_files='../data/invoice_ner_dataset.jsonl')['train']
dataset = load_dataset("json", data_files='workspace/invoice_ner_dataset_testing.jsonl')['train']


Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
dataset

Dataset({
    features: ['tokens', 'ner_tags', 'id', 'file_name'],
    num_rows: 1515
})

## Preprocessing Dataset

In [5]:
tmp = dataset.train_test_split(test_size=0.2, seed=42)
train_valid = tmp['train'].train_test_split(test_size=0.2, seed=42)

train_dataset = train_valid['train']
valid_dataset = train_valid['test']    
test_dataset  = tmp['test']     

In [6]:
train_dataset

Dataset({
    features: ['tokens', 'ner_tags', 'id', 'file_name'],
    num_rows: 969
})

In [7]:
unique_labels = set(l for row in train_dataset['ner_tags'] for l in row)
label_list = sorted(list(unique_labels))
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [8]:
label2id

{'B-CLIENT_ADDRESS': 0,
 'B-CLIENT_NAME': 1,
 'B-INVOICE_DATE': 2,
 'B-INVOICE_NUMBER': 3,
 'B-ITEM_DESC': 4,
 'B-NET_WORTH': 5,
 'B-PRICE': 6,
 'B-QUANTITY': 7,
 'B-SELLER_NAME': 8,
 'B-TOTAL': 9,
 'B-VAT': 10,
 'I-CLIENT_ADDRESS': 11,
 'I-CLIENT_NAME': 12,
 'I-ITEM_DESC': 13,
 'I-NET_WORTH': 14,
 'I-PRICE': 15,
 'I-SELLER_NAME': 16,
 'I-TOTAL': 17,
 'I-VAT': 18,
 'O': 19}

In [9]:
id2label

{0: 'B-CLIENT_ADDRESS',
 1: 'B-CLIENT_NAME',
 2: 'B-INVOICE_DATE',
 3: 'B-INVOICE_NUMBER',
 4: 'B-ITEM_DESC',
 5: 'B-NET_WORTH',
 6: 'B-PRICE',
 7: 'B-QUANTITY',
 8: 'B-SELLER_NAME',
 9: 'B-TOTAL',
 10: 'B-VAT',
 11: 'I-CLIENT_ADDRESS',
 12: 'I-CLIENT_NAME',
 13: 'I-ITEM_DESC',
 14: 'I-NET_WORTH',
 15: 'I-PRICE',
 16: 'I-SELLER_NAME',
 17: 'I-TOTAL',
 18: 'I-VAT',
 19: 'O'}

## Tokenizer

In [10]:
MODEL_NAME = 'google-bert/bert-base-cased'
# MODEL_NAME = 'microsoft/deberta-v3-base'
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [11]:
MODEL = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = TOKENIZER(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  
            elif word_idx != previous_word_idx:
                if word_idx < len(label):
                    label_ids.append(label2id[label[word_idx]])
                else:
                    label_ids.append(-100)
            else:
                if word_idx < len(label):
                    current_label = label[word_idx]
                    label_ids.append(label2id[current_label] if current_label.startswith("I-") else -100)
                else:
                    label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [13]:
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_valid = valid_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test  = test_dataset.map(tokenize_and_align_labels,  batched=True)

Map:   0%|          | 0/969 [00:00<?, ? examples/s]

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

Map:   0%|          | 0/303 [00:00<?, ? examples/s]

In [14]:
seqeval = evaluate.load('seqeval')

Downloading builder script: 0.00B [00:00, ?B/s]

In [15]:
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_preds = [[id2label[p] for (p, l) in zip(pred, label) if l != -100]
                  for pred, label in zip(predictions, labels)]

    results = seqeval.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [16]:
import optuna
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=TOKENIZER)

def hyperparameter_tuning_optuna(tokenized_train, tokenized_test, n_trials=20):
    def objective(trial):
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
        batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
        num_epochs = trial.suggest_int("num_train_epochs", 2, 6)
        weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
        warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.3)
        
        
        def model_init():
            return AutoModelForTokenClassification.from_pretrained(
                    MODEL_NAME,
                    num_labels=len(label_list),
                    id2label=id2label,
                    label2id=label2id,
                    ignore_mismatched_sizes=True
            )
        
        training_args = TrainingArguments(
            output_dir=f"./tmp_trial_{trial.number}",
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,  
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_epochs,
            weight_decay=weight_decay,
            warmup_ratio=warmup_ratio,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            logging_dir=None,  
            report_to=None,   
            dataloader_pin_memory=False  
        )
        
        trainer = Trainer(
            model_init=model_init,   
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_valid,
            tokenizer=TOKENIZER,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )
        
        trainer.train()
        eval_result = trainer.evaluate()
        
        return eval_result["eval_f1"]
    
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)
    
    print("OPTUNA RESULTS")
    print(f"Best trial: {study.best_trial.number}")
    print(f"Best F1 score: {study.best_value:.4f}")
    print(f"Best params: {study.best_params}")
    
    return study.best_params, study

In [17]:
best_params_optuna, study = hyperparameter_tuning_optuna(
    tokenized_train, tokenized_valid, n_trials=10
)

[I 2025-09-08 14:31:08,140] A new study created in memory with name: no-name-d724aacd-a81c-4210-bf50-c7c99365307f
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.156259,0.73783,0.802268,0.768701,0.962234
2,No log,0.101397,0.822746,0.910775,0.864525,0.972338
3,No log,0.086552,0.85247,0.926276,0.887842,0.975667


  _warn_prf(average, modifier, msg_start, len(result))


[I 2025-09-08 14:32:17,954] Trial 0 finished with value: 0.8878420003623845 and parameters: {'learning_rate': 1.3842558475722774e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.2971551891966312, 'warmup_ratio': 0.1368994542283596}. Best is trial 0 with value: 0.8878420003623845.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.111656,0.791364,0.824575,0.807628,0.967006
2,No log,0.076653,0.835202,0.929301,0.879742,0.976551
3,No log,0.059498,0.883495,0.928922,0.90564,0.981117


[I 2025-09-08 14:33:24,924] Trial 1 finished with value: 0.9056395134537412 and parameters: {'learning_rate': 0.0003416175910879105, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.29855038329375283, 'warmup_ratio': 0.031036039830328287}. Best is trial 1 with value: 0.9056395134537412.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.22091,0.604374,0.710397,0.653111,0.945325
2,No log,0.084798,0.856738,0.913422,0.884172,0.976816
3,No log,0.071075,0.868198,0.928922,0.897534,0.97658
4,No log,0.061128,0.892009,0.936862,0.913885,0.980204
5,0.350200,0.060702,0.891328,0.936484,0.913348,0.98041
6,0.350200,0.062831,0.895378,0.944802,0.919426,0.981412


  _warn_prf(average, modifier, msg_start, len(result))


[I 2025-09-08 14:35:41,403] Trial 2 finished with value: 0.9194260485651214 and parameters: {'learning_rate': 1.826939526517531e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 6, 'weight_decay': 0.24680147617585238, 'warmup_ratio': 0.2664694183722041}. Best is trial 2 with value: 0.9194260485651214.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.104851,0.82254,0.869187,0.845221,0.97116
2,No log,0.063919,0.88135,0.937996,0.908791,0.979379
3,No log,0.056877,0.897112,0.939509,0.917821,0.981971


[I 2025-09-08 14:36:48,710] Trial 3 finished with value: 0.9178208679593721 and parameters: {'learning_rate': 9.336713249767051e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.17440170408619782, 'warmup_ratio': 0.257813256986401}. Best is trial 2 with value: 0.9194260485651214.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.220563,0.682653,0.688847,0.685736,0.946297
2,No log,0.097335,0.82297,0.91569,0.866858,0.973311
3,No log,0.071424,0.867657,0.927032,0.896363,0.977847
4,No log,0.068494,0.875221,0.936106,0.90464,0.977965


  _warn_prf(average, modifier, msg_start, len(result))


[I 2025-09-08 14:38:17,979] Trial 4 finished with value: 0.9046401169163317 and parameters: {'learning_rate': 2.9617084276259848e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4, 'weight_decay': 0.10618491537246698, 'warmup_ratio': 0.24124679567016322}. Best is trial 2 with value: 0.9194260485651214.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.741551,0.0,0.0,0.0,0.870471
2,No log,0.742532,0.0,0.0,0.0,0.870471
3,No log,0.740509,0.0,0.0,0.0,0.870471
4,No log,0.739864,0.0,0.0,0.0,0.870471


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[I 2025-09-08 14:39:45,694] Trial 5 finished with value: 0.0 and parameters: {'learning_rate': 0.0007112084017163691, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.07657639990673076, 'warmup_ratio': 0.02331138643573568}. Best is trial 2 with value: 0.9194260485651214.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from th

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.075075,0.858222,0.901701,0.879425,0.976816
2,No log,0.055156,0.893978,0.93724,0.915098,0.981147


[I 2025-09-08 14:40:33,466] Trial 6 finished with value: 0.9150978220745662 and parameters: {'learning_rate': 0.00014097013296856327, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.13262623873304286, 'warmup_ratio': 0.19416637220753102}. Best is trial 2 with value: 0.9194260485651214.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias'

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.511228,1.0,0.083554,0.154222,0.876981
2,No log,0.206582,0.664297,0.777316,0.716376,0.951099
3,No log,0.126921,0.775844,0.859735,0.815638,0.96683
4,No log,0.109533,0.80203,0.89603,0.846429,0.97119
5,No log,0.102751,0.813265,0.90397,0.856222,0.971808


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2025-09-08 14:42:22,360] Trial 7 finished with value: 0.8562220232766339 and parameters: {'learning_rate': 2.2259375645496603e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 5, 'weight_decay': 0.11294517033944859, 'warmup_ratio': 0.21122577563213846}. Best is trial 2 with value: 0.9194260485651214.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.139069,0.767296,0.830246,0.79753,0.964915
2,No log,0.087644,0.8442,0.930057,0.885051,0.974548
3,No log,0.06982,0.869766,0.926654,0.897309,0.976875
4,No log,0.065491,0.8815,0.942155,0.910819,0.978319
5,0.243900,0.063729,0.892639,0.939887,0.915654,0.979379
6,0.243900,0.065321,0.891026,0.945936,0.91766,0.979644


  _warn_prf(average, modifier, msg_start, len(result))


[I 2025-09-08 14:44:39,293] Trial 8 finished with value: 0.9176600036677058 and parameters: {'learning_rate': 1.4750168574538808e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 6, 'weight_decay': 0.2829334954913983, 'warmup_ratio': 0.05467208128838431}. Best is trial 2 with value: 0.9194260485651214.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias',

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.443822,0.616402,0.088091,0.154152,0.879897
2,No log,0.241978,0.593209,0.686957,0.63665,0.93905
3,No log,0.174593,0.71726,0.807561,0.759737,0.960143
4,No log,0.16175,0.733936,0.829112,0.778626,0.962264


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
[I 2025-09-08 14:46:07,318] Trial 9 finished with value: 0.7786259541984734 and parameters: {'learning_rate': 1.656852682615038e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.26485249674704087, 'warmup_ratio': 0.09008725105015478}. Best is trial 2 with value: 0.9194260485651214.


OPTUNA RESULTS
Best trial: 2
Best F1 score: 0.9194
Best params: {'learning_rate': 1.826939526517531e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 6, 'weight_decay': 0.24680147617585238, 'warmup_ratio': 0.2664694183722041}


In [18]:
best_params = study.best_params

training_args = TrainingArguments(
    output_dir="./final_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    num_train_epochs=best_params["num_train_epochs"],
    weight_decay=best_params["weight_decay"],
    warmup_ratio=best_params["warmup_ratio"],
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model_init=lambda: AutoModelForTokenClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    ),
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=TOKENIZER,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.229601,0.603095,0.697089,0.646695,0.942099
2,No log,0.088111,0.868368,0.918132,0.892557,0.976571
3,No log,0.071999,0.873651,0.93299,0.902346,0.977019
4,No log,0.060084,0.893176,0.94057,0.916261,0.981431
5,0.350200,0.058483,0.89829,0.939964,0.918655,0.981644
6,0.350200,0.059314,0.897325,0.946028,0.921033,0.981974


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=732, training_loss=0.2548117429180875, metrics={'train_runtime': 137.9335, 'train_samples_per_second': 42.151, 'train_steps_per_second': 5.307, 'total_flos': 1266637985385600.0, 'train_loss': 0.2548117429180875, 'epoch': 6.0})

In [19]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.05931407958269119, 'eval_precision': 0.8973252804141502, 'eval_recall': 0.9460278956943602, 'eval_f1': 0.9210332103321033, 'eval_accuracy': 0.981973904631574, 'eval_runtime': 2.4293, 'eval_samples_per_second': 124.728, 'eval_steps_per_second': 15.642, 'epoch': 6.0}


In [20]:
trainer.save_model("workspace/ner_model_best_last")
TOKENIZER.save_pretrained("workspace/ner_model_best_last")

('workspace/ner_model_best_last/tokenizer_config.json',
 'workspace/ner_model_best_last/special_tokens_map.json',
 'workspace/ner_model_best_last/vocab.txt',
 'workspace/ner_model_best_last/added_tokens.json',
 'workspace/ner_model_best_last/tokenizer.json')