# This notebook fine-tunes a Danish BERT model to predict named entities in invoice data

import os
# NOTE(review): absolute, machine-specific project root; the relative paths used later resolve against it.
os.chdir("/bachelor_project")

# Directory holding the Danish BERT checkpoint files referenced by the conversion command below.
MODEL_DIR = "/bachelor_project/danish_bert_uncased_v2"

!transformers-cli convert --model_type bert \
  --tf_checkpoint $MODEL_DIR/bert_model.ckpt \
  --config $MODEL_DIR/bert_config.json \
  --pytorch_dump_output $MODEL_DIR/pytorch_model.bin

In [None]:
# Loading packages
## Standard packages
import os

import math

import pandas as pd

import numpy as np

## pyTorch
import torch
import torch.nn.functional as F
from torch import nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

## Transformers
from transformers import AutoTokenizer, BertTokenizer
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup

## Other ML utils
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score
from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences


from tqdm import tqdm,trange


In [None]:
# Make the project root the working directory so the relative data/model paths below resolve,
# then show the resulting directory as the cell output.
os.chdir("/bachelor_project")
os.getcwd()

## Loading and inspecting data

In [None]:
def load_ner_split(csv_path):
    """Read one NER split from CSV and normalise its sentence-id column.

    The CSV must contain a 'countcolumn' column. Its values are converted to
    strings, prefixed with 'Sentence: ' and the column is renamed to
    'SentenceNo'. All other columns are returned unchanged.

    Args:
        csv_path: path of the UTF-8 encoded CSV file.

    Returns:
        A new DataFrame with the 'SentenceNo' column in place of 'countcolumn'.
    """
    frame = pd.read_csv(csv_path, encoding="UTF-8")
    frame["countcolumn"] = "Sentence: " + frame["countcolumn"].astype(str)
    return frame.rename(columns={"countcolumn": "SentenceNo"})


# The three splits share one schema, so they go through the same loader.
train = load_ner_split("data/KMD/NER_data_split/NERtrain.csv")
val = load_ner_split("data/KMD/NER_data_split/NERval.csv")
test = load_ner_split("data/KMD/NER_data_split/NERtest.csv")

# Example of training set
train[0:10]

In [None]:
# Sanity check: compare the number of rows (one row per word) in each split
# with the count recorded for this dataset.
split_row_counts = [len(split.labels) for split in (train, val, test)]
for row_count, expected_count in zip(split_row_counts, (121865, 6792, 7416)):
    print(row_count == expected_count)

# Show the actual counts as well.
for row_count in split_row_counts:
    print(row_count)

In [None]:
# Sanity check: number of sentences, distinct words and distinct labels per split.
split_stats = [
    (split.SentenceNo.nunique(), split.word.nunique(), split.labels.nunique())
    for split in (train, val, test)
]
expected_stats = [(30733, 26773, 5), (1707, 3350, 5), (1707, 2633, 5)]

# One line of three booleans per split: does each observed count match the expected one?
for observed, expected in zip(split_stats, expected_stats):
    print(*(seen == wanted for seen, wanted in zip(observed, expected)))

# One line of the three observed counts per split.
for observed in split_stats:
    print(*observed)

In [None]:
# Sanity check: label distribution of the training split
train.labels.value_counts()

In [None]:
# Sanity check: label distribution of the validation split
val.labels.value_counts()

In [None]:
# Sanity check: label distribution of the test split
# (this cell previously repeated the training split, leaving the test split unchecked)
test.labels.value_counts()

## Getting the sentences from the data

Creating a function that extracts each sentence and its labels from a dataframe

In [None]:
class SentenceGetter(object):
    """Group a token-per-row dataframe into sentences of (word, label) pairs.

    The dataframe must provide the columns "SentenceNo", "word" and "labels".

    Attributes:
        n_sent: 1-based number of the sentence that `get_next` looks up next.
        data: the dataframe passed to the constructor.
        empty: set to False in the constructor; not read by this class.
        grouped: result of grouping `data` by "SentenceNo"; one list of
            (word, label) tuples per sentence id.
        sentences: the entries of `grouped` as a plain list, in the order
            produced by the groupby.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair every word of one sentence with its label, keeping row order.
            return list(zip(s["word"].values.tolist(), s["labels"].values.tolist()))

        self.grouped = self.data.groupby("SentenceNo").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence keyed "Sentence: <n_sent>" and advance the counter.

        Returns:
            The list of (word, label) tuples for that sentence, or None when no
            sentence with that key exists (the counter is then left unchanged).
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error propagates.
            return None
        self.n_sent += 1
        return s

Creating variable "sentences" that lists all sentences from the dataframe and the variable "labels" that lists all the corresponding labels for each word in the sentences.

In [None]:
# For every split, build two parallel structures from the grouped sentences:
# a list of word lists and a list of label lists.

# Training split
getter = SentenceGetter(train)
tr_sentences = [[word for word, _ in sent] for sent in getter.sentences]
tr_labels = [[tag for _, tag in sent] for sent in getter.sentences]

# Validation split
getter = SentenceGetter(val)
val_sentences = [[word for word, _ in sent] for sent in getter.sentences]
val_labels = [[tag for _, tag in sent] for sent in getter.sentences]

# Test split
getter = SentenceGetter(test)
test_sentences = [[word for word, _ in sent] for sent in getter.sentences]
test_labels = [[tag for _, tag in sent] for sent in getter.sentences]


The BERT model requires input data to be in a specific format. One requirement is to have special tokens that mark the beginning ([CLS]) and the separation/end of sentences ([SEP]). These tokens are added to the list of label values below. Furthermore, the label [PAD] is added to mark the padded positions that are appended to the sentences later in the process.

In [None]:
# Label vocabulary: the labels found in the training data followed by the
# labels used for padding and for BERT's special tokens.
tag_values = [*train.labels.unique(), "[PAD]", "[CLS]", "[SEP]"]
print(tag_values)

# Lookup tables between label strings and their integer ids.
tag2idx = {}
for position, tag in enumerate(tag_values):
    tag2idx[tag] = position
idx2tag = {position: tag for tag, position in tag2idx.items()}
print(tag2idx)
print(idx2tag)

## Making the training data using the vocabulary from danish BERT

In [None]:
# Select the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises when no CUDA device exists, so only query it when
# a GPU is present; the cell output is the GPU name or "cpu".
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"


BERT provides its own tokenizer which is imported below. The tokenizer is created with a Wordpiece model and it creates a vocabulary of whole words, subwords and individual characters.

In [None]:
# Load the tokenizer from the local Danish BERT vocabulary file (the path is relative to the
# working directory). Lower-casing is switched on and accent stripping is switched off.
tokenizer = BertTokenizer.from_pretrained("danish_bert_uncased_v2/vocab.txt", do_lower_case = True, strip_accents = False)

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Split every word into sub-word tokens and repeat its label per token.

    Uses the module-level `tokenizer`.

    Args:
        sentence: iterable of words.
        text_labels: iterable of labels, one per word.

    Returns:
        A tuple (tokens, labels) of two equally long flat lists: all sub-word
        tokens in order, and for each token the label of the word it came from.
    """
    pieces, piece_labels = [], []
    for word, label in zip(sentence, text_labels):
        word_pieces = tokenizer.tokenize(word)
        pieces += word_pieces
        # Every sub-word piece inherits the label of the full word.
        piece_labels += [label] * len(word_pieces)
    return pieces, piece_labels

In [None]:
# Tokenize every sentence of every split; each entry is a (tokens, labels) pair.
tr_tokenized_texts_and_labels = list(
    map(tokenize_and_preserve_labels, tr_sentences, tr_labels)
)

val_tokenized_texts_and_labels = list(
    map(tokenize_and_preserve_labels, val_sentences, val_labels)
)

test_tokenized_texts_and_labels = list(
    map(tokenize_and_preserve_labels, test_sentences, test_labels)
)



In [None]:
tr_tokenized_texts_and_labels[10]

In [None]:
# Wrap every token sequence and every label sequence in [CLS] ... [SEP].
tr_tokenized_texts = [["[CLS]", *tokens, "[SEP]"] for tokens, _ in tr_tokenized_texts_and_labels]
tr_labels = [["[CLS]", *tags, "[SEP]"] for _, tags in tr_tokenized_texts_and_labels]

val_tokenized_texts = [["[CLS]", *tokens, "[SEP]"] for tokens, _ in val_tokenized_texts_and_labels]
val_labels = [["[CLS]", *tags, "[SEP]"] for _, tags in val_tokenized_texts_and_labels]

test_tokenized_texts = [["[CLS]", *tokens, "[SEP]"] for tokens, _ in test_tokenized_texts_and_labels]
test_labels = [["[CLS]", *tags, "[SEP]"] for _, tags in test_tokenized_texts_and_labels]

# Example of word-piece tokenizations:
print(tr_tokenized_texts[10])
print(tr_labels[10])

Note that the number of tokens in all datasets increases because of the word-piece tokenization. For the test dataset this means that there are more tokens, and therefore more labels, during evaluation and in the comparison with the rule-based classification.

In [None]:
# Total number of labels in the test set after tokenization
# (number of words increased from 7,416 to 22,424)
tmp = sum(len(sentence_labels) for sentence_labels in test_labels)
tmp

In [None]:
# A sequence must not be longer than the model's position-embedding table
# (see the model's 'max_position_embeddings' = 512).
BERT_MAX_POSITIONS = 512

# Pad/truncate to the longest training sequence, but never beyond the model limit;
# without the cap a longer sequence would only fail later, inside the forward pass.
MAX_LEN = min(max(len(tokens) for tokens in tr_tokenized_texts), BERT_MAX_POSITIONS)
print(MAX_LEN)

# Batch size used by all data loaders
bs = 32

Indexing tokens in sentences

In [None]:
# Convert tokens to vocabulary ids, then pad/truncate every sequence at the end to MAX_LEN.
padding_options = dict(maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

tr_input_ids = pad_sequences(
    list(map(tokenizer.convert_tokens_to_ids, tr_tokenized_texts)), **padding_options
)

val_input_ids = pad_sequences(
    list(map(tokenizer.convert_tokens_to_ids, val_tokenized_texts)), **padding_options
)

test_input_ids = pad_sequences(
    list(map(tokenizer.convert_tokens_to_ids, test_tokenized_texts)), **padding_options
)

# Example of indexing
print(tr_input_ids[10])

Indexing labels

In [None]:
# Convert labels to their integer ids and pad/truncate at the end to MAX_LEN,
# filling padded positions with the id of the "[PAD]" label.
tag_padding_options = dict(
    maxlen=MAX_LEN, value=tag2idx["[PAD]"], padding="post", dtype="long", truncating="post"
)

tr_tags = pad_sequences([list(map(tag2idx.get, lab)) for lab in tr_labels], **tag_padding_options)

val_tags = pad_sequences([list(map(tag2idx.get, lab)) for lab in val_labels], **tag_padding_options)

test_tags = pad_sequences([list(map(tag2idx.get, lab)) for lab in test_labels], **tag_padding_options)

# Example of indexing
print(tr_tags[10])


Creating attention masks that indicate which elements in each sequence are real tokens and which are padding, so that the padded elements are ignored.

In [None]:
# 1.0 for every position holding a token, 0.0 for every position whose id is 0
# (the value the sequences were padded with).
tr_attention_masks = [[1.0 if token_id != 0 else 0.0 for token_id in row] for row in tr_input_ids]

val_attention_masks = [[1.0 if token_id != 0 else 0.0 for token_id in row] for row in val_input_ids]

test_attention_masks = [[1.0 if token_id != 0 else 0.0 for token_id in row] for row in test_input_ids]

# Example of attention masks
print(tr_attention_masks[10])


PyTorch requires the datasets to be torch tensors (multidimensional arrays). The input IDs, tags and attention masks for the training, validation and test data are converted to tensors here; they are moved to the GPU batch by batch with .to(device) inside the training and evaluation loops.

In [None]:
# Token ids
tr_inputs, val_inputs, test_inputs = map(
    torch.tensor, (tr_input_ids, val_input_ids, test_input_ids)
)

# Label ids (the padded arrays are replaced by their tensor versions)
tr_tags, val_tags, test_tags = map(torch.tensor, (tr_tags, val_tags, test_tags))

# Attention masks
tr_masks, val_masks, test_masks = map(
    torch.tensor, (tr_attention_masks, val_attention_masks, test_attention_masks)
)


Creating training, validation and test tensor datasets and defining data loaders. The training data is shuffled with RandomSampler; the validation and test data are passed sequentially with SequentialSampler.

In [None]:
# Bundle ids, masks and labels of each split into one dataset.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
test_data = TensorDataset(test_inputs, test_masks, test_tags)

# Random order for training, fixed order for validation and test.
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)
test_sampler = SequentialSampler(test_data)

train_dataloader, valid_dataloader, test_dataloader = (
    DataLoader(dataset, sampler=sampler, batch_size=bs)
    for dataset, sampler in (
        (train_data, train_sampler),
        (valid_data, valid_sampler),
        (test_data, test_sampler),
    )
)

## Train model

We load the pre-trained Danish BERT model (danish_bert_uncased_v2) and provide the number of possible labels.

In [None]:
# Directory passed to from_pretrained() below.
# NOTE(review): absolute, machine-specific path.
pt_model_dir = "/bachelor_project/danish_bert_uncased_v2/"

# Load the token-classification model from that directory with one output class per entry
# in tag2idx; attention weights and hidden states are not returned.
model = BertForTokenClassification.from_pretrained(pt_model_dir,num_labels=len(tag2idx), output_attentions = False, output_hidden_states = False)

In [None]:
# Move the model to the device selected earlier (GPU when available, otherwise CPU);
# an unconditional .cuda() fails on a machine without CUDA.
model.to(device);

In [None]:
# Count only parameters that require gradients, so the number matches the label.
print(f'Number of trainable parameters: {model.num_parameters(only_trainable=True)}')

Setting full finetuning to true because we have capacity to fine tune all layers / update all weights. Before we can start the fine-tuning process, we have to setup the optimizer and add the parameters it should update. A common choice is the AdamW optimizer. We also add some weight_decay as regularization to the main weight matrices. If you have limited resources, you can also try to just train the linear classifier on top of BERT and keep all other weights fixed. This will still give you a good performance.

### Fine-tuning 

In [None]:
# Vocabulary ids of the special tokens; used to exclude those positions from the metrics.
pad_tok, sep_tok, cls_tok = (
    tokenizer.vocab[special] for special in ("[PAD]", "[SEP]", "[CLS]")
)

In [None]:
def flat_accuracy(valid_tags, pred_tags):
    """Return the fraction of positions where the true and predicted tags agree.

    Args:
        valid_tags: sequence of true tags.
        pred_tags: sequence of predicted tags of the same length.

    Returns:
        The mean of the element-wise equality of the two sequences.
    """
    truth = np.array(valid_tags)
    guess = np.array(pred_tags)
    matches = truth == guess
    return matches.mean()

In [None]:
def annot_confusion_matrix(valid_tags, pred_tags, labels=None):
    """Return sklearn's confusion matrix as a string annotated with label names.

    The header and the row annotations are taken from the same label list
    that is passed to `confusion_matrix`, so every row and column is named
    after the label it actually counts. Deriving them from the tags that
    happen to occur mislabels rows, or raises IndexError, whenever those tags
    differ from the matrix labels.

    Args:
        valid_tags: sequence of true tags.
        pred_tags: sequence of predicted tags.
        labels: labels to include, in matrix order. Defaults to
            ['B-LOC', 'B-PER', 'I-LOC', 'I-PER'].

    Returns:
        A string whose first line lists the labels and whose following lines
        each hold one label, a tab, and that label's matrix row.
    """
    if labels is None:
        labels = ['B-LOC', 'B-PER', 'I-LOC', 'I-PER']
    header = list(labels)

    # Calculate the actual confusion matrix, restricted to `header`
    matrix = confusion_matrix(valid_tags, pred_tags, labels=header)

    # Final formatting touches for the string output
    mat_formatted = [label + "\t" + str(row) for label, row in zip(header, matrix)]
    content = "\t" + " ".join(header) + "\n" + "\n".join(mat_formatted)

    return content

In [None]:
# True: update all weights of the model; False: train only the classification head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters whose names contain one of these substrings get no weight decay.
    # 'gamma'/'beta' are kept from the original list; 'LayerNorm.weight' is the name
    # the LayerNorm scale carries in PyTorch BERT models.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # The per-group option must be called 'weight_decay': the optimizer does not read
    # a 'weight_decay_rate' key, so with that name no decay was applied at all.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5 #The authors of BERT uses 3e-5 as lr for BERT-base
    )

In [None]:
# Train a maximum of 3-4 epochs. More will simply result in overfitting the training data.
epochs = 4
# Upper bound used for gradient clipping in the training loop.
max_grad_norm = 1.0

# One training step per batch, repeated for every epoch.
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule over all training steps, without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [None]:
# Fine-tune for `epochs` epochs; after every epoch evaluate on the validation set.
# Changes compared with the first version of this cell:
#   * scheduler.step() is called after every optimizer step (the schedule was never
#     advanced, so the learning rate stayed constant);
#   * the boolean mask is no longer squeezed (squeezing drops the batch dimension when
#     a batch holds a single sentence and the indexing then fails);
#   * the per-epoch list of true training labels is named `tr_true_labels` so that it
#     no longer overwrites the dataset-level `tr_labels`.
epoch = 0

torch.manual_seed(1)
np.random.seed(1)

# Labels included in the validation reports; labels not listed here are left out.
entity_labels = ['B-LOC', 'B-PER', 'I-LOC', 'I-PER']

tr_loss_values, eval_loss_values = [], []
for _ in trange(epochs, desc="Epoch"):
    epoch += 1

    # Training loop
    print("\nStarting training loop.")
    model.train()
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_true_labels = [], []

    for step, batch in enumerate(train_dataloader):

        # Cast every tensor of the batch to int64 and move it to the device
        batch = tuple(t.to(torch.int64).to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass; passing labels makes the model return the loss first
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        loss, tr_logits = outputs[:2]

        # Backward pass
        loss.backward()

        # Compute train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # Subset out unwanted predictions on CLS/PAD/SEP tokens
        preds_mask = (
            (b_input_ids != cls_tok)
            & (b_input_ids != pad_tok)
            & (b_input_ids != sep_tok)
        )

        tr_logits = tr_logits.detach().cpu().numpy()
        tr_label_ids = torch.masked_select(b_labels, preds_mask)
        # The mask keeps its (batch, sequence) shape so it indexes the first two axes
        # of the logits for any batch size.
        tr_batch_preds = np.argmax(tr_logits[preds_mask.detach().cpu().numpy()], axis=1)
        tr_batch_labels = tr_label_ids.to("cpu").numpy()
        tr_preds.extend(tr_batch_preds)
        tr_true_labels.extend(tr_batch_labels)

        # Compute training accuracy
        tmp_tr_accuracy = flat_accuracy(tr_batch_labels, tr_batch_preds)
        tr_accuracy += tmp_tr_accuracy

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=max_grad_norm
        )

        # Update parameters, advance the learning-rate schedule, reset gradients
        optimizer.step()
        scheduler.step()
        model.zero_grad()

    tr_loss = tr_loss / nb_tr_steps
    tr_loss_values.append(tr_loss)
    tr_accuracy = tr_accuracy / nb_tr_steps

    # Print training loss and accuracy per epoch
    print(f"Train loss: {tr_loss}")
    print(f"Train accuracy: {tr_accuracy}")

    # Validation loop
    print("Starting validation loop.")

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []

    for batch in valid_dataloader:

        batch = tuple(t.to(torch.int64).to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            tmp_eval_loss, logits = outputs[:2]

        # Subset out unwanted predictions on CLS/PAD/SEP tokens
        preds_mask = (
            (b_input_ids != cls_tok)
            & (b_input_ids != pad_tok)
            & (b_input_ids != sep_tok)
        )

        logits = logits.to("cpu").numpy()
        label_ids = torch.masked_select(b_labels, preds_mask)
        val_batch_preds = np.argmax(logits[preds_mask.detach().cpu().numpy()], axis=1)
        val_batch_labels = label_ids.to("cpu").numpy()
        predictions.extend(val_batch_preds)
        true_labels.extend(val_batch_labels)

        tmp_eval_accuracy = flat_accuracy(val_batch_labels, val_batch_preds)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    # Evaluate loss, acc, conf. matrix, and class. report on validation set
    pred_tags = [idx2tag[i] for i in predictions]
    valid_tags = [idx2tag[i] for i in true_labels]
    cl_report = classification_report(valid_tags, pred_tags, labels=entity_labels)
    conf_mat = annot_confusion_matrix(valid_tags, pred_tags)
    eval_loss = eval_loss / nb_eval_steps
    eval_loss_values.append(eval_loss)
    eval_accuracy = eval_accuracy / nb_eval_steps
    f1score = f1_score(valid_tags, pred_tags, labels=entity_labels, average="micro")

    # Report metrics
    print(f"Validation loss: {eval_loss}\n")
    print(f"Validation Accuracy: {eval_accuracy}\n")
    print(f"F1-Score: {f1score}\n")
    print(f"Classification Report:\n {cl_report}")
    print(f"Confusion Matrix:\n {conf_mat}")

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Seaborn dark-grid styling with an enlarged font, and a wider default figure.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12, 6)

# Plot the learning curve: one line per recorded loss series, one point per epoch.
for loss_series, line_format, series_name in (
    (tr_loss_values, 'b-o', "training loss"),
    (eval_loss_values, 'r-o', "validation loss"),
):
    plt.plot(loss_series, line_format, label=series_name)

# Label the plot.
plt.title("Learning curve")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.show()

In [None]:
ner_model_path = 'research/KMD/NER/models/danish_BERT'

In [None]:
# Create the output directory, including missing parents; does nothing if it already exists
os.makedirs(ner_model_path, exist_ok=True)

In [None]:
# Save the fine-tuned model and the tokenizer to the same directory.

# If the model is wrapped (it then exposes the real model as `.module`, as with
# distributed/parallel training), save the wrapped model instead of the wrapper.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(ner_model_path)
tokenizer.save_pretrained(ner_model_path)

In [None]:
# Reload the saved model and tokenizer from disk; they are used for the test-set
# evaluation and for tagging the example sentence below
model = BertForTokenClassification.from_pretrained(ner_model_path)
tokenizer = BertTokenizer.from_pretrained(ner_model_path)

In [None]:
# Move the reloaded model to the device selected earlier (works with or without a GPU)
model.to(device);

In [None]:
# Evaluate the reloaded model on the test set, print the metrics and append them to
# the TESTMETRICS file next to the saved model.
# Changes compared with the first version of this cell:
#   * the boolean mask is no longer squeezed (that breaks a batch holding one sentence);
#   * `tr_loss_values` from training is no longer reset here;
#   * the appended report ends with a newline so repeated runs stay separated.
pad_tok = tokenizer.vocab["[PAD]"]
sep_tok = tokenizer.vocab["[SEP]"]
cls_tok = tokenizer.vocab["[CLS]"]

torch.manual_seed(1)
np.random.seed(1)

# Labels included in the reports; labels not listed here are left out.
entity_labels = ['B-LOC', 'B-PER', 'I-LOC', 'I-PER']

# Make sure dropout is disabled regardless of which cells ran before this one
model.eval()

test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0
predictions, true_labels = [], []
test_loss_values = []

for batch in test_dataloader:

    # Cast every tensor of the batch to int64 and move it to the device
    batch = tuple(t.to(torch.int64).to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        tmp_test_loss, logits = outputs[:2]

    # Subset out unwanted predictions on CLS/PAD/SEP tokens
    preds_mask = (
        (b_input_ids != cls_tok)
        & (b_input_ids != pad_tok)
        & (b_input_ids != sep_tok)
    )

    logits = logits.to("cpu").numpy()
    label_ids = torch.masked_select(b_labels, preds_mask)
    # The mask keeps its (batch, sequence) shape so it indexes the first two axes
    # of the logits for any batch size.
    test_batch_preds = np.argmax(logits[preds_mask.detach().cpu().numpy()], axis=1)
    test_batch_labels = label_ids.to("cpu").numpy()
    predictions.extend(test_batch_preds)
    true_labels.extend(test_batch_labels)

    tmp_test_accuracy = flat_accuracy(test_batch_labels, test_batch_preds)

    test_loss += tmp_test_loss.mean().item()
    test_accuracy += tmp_test_accuracy

    nb_test_examples += b_input_ids.size(0)
    nb_test_steps += 1

# Evaluate loss, acc, conf. matrix, and class. report on the test set
pred_tags = [idx2tag[i] for i in predictions]
valid_tags = [idx2tag[i] for i in true_labels]
cl_report = classification_report(valid_tags, pred_tags, labels=entity_labels)
conf_mat = annot_confusion_matrix(valid_tags, pred_tags)
test_loss = test_loss / nb_test_steps
test_loss_values.append(test_loss)
test_accuracy = test_accuracy / nb_test_steps
f1score_micro = f1_score(valid_tags, pred_tags, labels=entity_labels, average="micro")
f1score_macro = f1_score(valid_tags, pred_tags, labels=entity_labels, average="macro")

# The same report lines are printed and written to the metrics file
report_lines = [
    f"Number of Epochs: {epochs}\n",
    f"Test loss: {test_loss}\n",
    f"Test Accuracy: {test_accuracy}\n",
    f"F1-Score Micro: {f1score_micro}\n",
    f"F1-Score Macro: {f1score_macro}\n",
    f"Classification Report:\n {cl_report}",
    f"Confusion Matrix:\n {conf_mat}",
]

# Report metrics
for report_line in report_lines:
    print(report_line)

with open(f'{ner_model_path}/TESTMETRICS','a+') as f:
    f.writelines(report_lines)
    # Terminate the report so that the next appended run starts on its own line
    f.write("\n")

# Testing the model's capabilities on specific tokens only

In [None]:
# Collapse the tags to entity type only: every tag containing "PER" becomes "B-PER",
# then every tag containing "LOC" becomes "B-LOC"; all other tags are kept.
valid_series = pd.Series(valid_tags)
valid_series = valid_series.mask(valid_series.str.contains("PER"), "B-PER")
valid_series = valid_series.mask(valid_series.str.contains("LOC"), "B-LOC")
valid_tags = valid_series.tolist()

pred_series = pd.Series(pred_tags)
pred_series = pred_series.mask(pred_series.str.contains("PER"), "B-PER")
pred_series = pred_series.mask(pred_series.str.contains("LOC"), "B-LOC")
pred_tags = pred_series.tolist()

In [None]:
# Metrics on the collapsed tags; the report and the F1 score cover the two entity tags,
# the confusion matrix is computed over all tags that occur.
collapsed_entity_labels = ['B-LOC', 'B-PER']
cl_report = classification_report(valid_tags, pred_tags, labels=collapsed_entity_labels)
conf_mat = confusion_matrix(valid_tags, pred_tags)
f1score = f1_score(valid_tags, pred_tags, labels=collapsed_entity_labels, average="micro")

# Report metrics
print(f"F1-Score: {f1score}\n")
print(f"Classification Report:\n {cl_report}")
print(f"Confusion Matrix:\n {conf_mat}")

In [None]:
test_sentence = "Indkøb af Melon 1 kg, 2 slags Karen Volf 200g, 2 poser Chili og Timian fra Santa Maria, Arla 1L. Vores referencer: Karen Volf, Chili Jensen, Timian Hansen og Arla Kristoffersen. Kontaktperson er Melon Andersen. Levering til Timianvej 12"
test_sentence2 = "Timian Nielsen har bestilt 10 kasser Lego til levering på Hc. Andersensvej 13 A første Sal tv og han har købt det til sin datter chili som går med Åben ble fra Abena og hun elsker i øvrigt elsker at spise chili, så derfor har de 10 kg chili derhjemme, men hvad chili ikke ved er at hendes far har købt en hvid 3 hjulet cykel fra Toys R Us ved Toppen Nr. 3 Aarhus- helt specifikt er det en 3 hjulet nr 30 fra kataloget og han har husket Toppen beskyttelseshjelm, str 35 og Far Timian kan godt lide chiLi men han elsker at spise en Tivoli stang, derfor bestilte han 20 stk Toms tivoli stang så han kan dele med sin ven Sebsatian i stedet for at få Melon i Grøn Box."

In [None]:
# Convert the first example sentence to a list of token ids with the tokenizer and print it
tokenized_sentence = tokenizer.encode(test_sentence)
print(tokenized_sentence)

In [None]:
# Batch of one sentence, placed on the same device as the model (works with or without a GPU)
input_ids = torch.tensor([tokenized_sentence]).to(device)

In [None]:
# Run the model on the example sentence without tracking gradients
with torch.no_grad():
    model_output = model(input_ids)

# Turn the first element of the model output into per-token probabilities over the labels
logits = F.softmax(model_output[0], dim = 2)

# Most probable label id for every token of the (single) sentence
logits_label = torch.argmax(logits, dim = 2)[0].detach().cpu().tolist()

# Probability the model assigned to each chosen label
logits_confidence = [
    logits[0][position][label].item() for position, label in enumerate(logits_label)
]
len(logits_confidence)

In [None]:
# Join split tokens: a token starting with "##" continues the previous token, so its
# text is appended to that token. Each joined token keeps the label and the
# confidence of its first piece.
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels, new_probs = [], [], []
for token, label_idx, probs in zip(tokens, logits_label, logits_confidence):
    if not token.startswith("##"):
        new_labels.append(tag_values[label_idx])
        new_tokens.append(token)
        new_probs.append(probs)
        continue
    new_tokens[-1] += token[2:]

In [None]:
# One line per joined token: label, token and confidence separated by tabs
for token, label, prob in zip(new_tokens, new_labels, new_probs):
    print(f"{label}\t{token}\t{prob}")


In [None]:
# The predictions as a list of records, one per joined token
dict_predictions = [
    dict(Word=token, Label=label, Confidence=prob)
    for token, label, prob in zip(new_tokens, new_labels, new_probs)
]
dict_predictions