In [None]:
import os
import tensorflow_datasets as tfds
import datasets
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import DataCollatorForTokenClassification
from transformers import pipeline
import evaluate
import numpy as np

In [None]:
from utils_display import pc

In [None]:
# Load the "seqeval" metric through the `evaluate` library; compute_metrics() calls
# seqeval.compute() on the decoded tag sequences.
seqeval = evaluate.load("seqeval")

# CoNLL-2003 dataset

In [None]:
# The train and test splits are read from datasets previously saved to disk under the
# relative directory local_datasets/conll2003 (sub-directories "train.hf" and "test.hf").
path_to_conll2003_dataset = os.path.join("local_datasets", "conll2003")
dataset_train = datasets.load_from_disk(os.path.join(path_to_conll2003_dataset, "train.hf"))
dataset_test = datasets.load_from_disk(os.path.join(path_to_conll2003_dataset, "test.hf"))

In [None]:
# Index of the training example that the inspection/printing cells of this notebook display.
sample_index = 2

In [None]:
# Tag string -> integer index lookup tables, one per annotation layer of a sample
# ("pos", "chunks" and "ner").
# NOTE(review): presumably these indices match the integer encoding stored in the local
# dataset files — confirm against the code that produced them.

# Part-of-speech tags.
pos_tags2indices = {
    '"': 0, "''": 1, '#': 2, '$': 3, '(': 4, ')': 5, ',': 6, '.': 7, ':': 8, '``': 9, 'CC': 10, 'CD': 11, 'DT': 12,
    'EX': 13, 'FW': 14, 'IN': 15, 'JJ': 16, 'JJR': 17, 'JJS': 18, 'LS': 19, 'MD': 20, 'NN': 21, 'NNP': 22, 'NNPS': 23,
    'NNS': 24, 'NN|SYM': 25, 'PDT': 26, 'POS': 27, 'PRP': 28, 'PRP$': 29, 'RB': 30, 'RBR': 31, 'RBS': 32, 'RP': 33,
    'SYM': 34, 'TO': 35, 'UH': 36, 'VB': 37, 'VBD': 38, 'VBG': 39, 'VBN': 40, 'VBP': 41, 'VBZ': 42, 'WDT': 43,
    'WP': 44, 'WP$': 45, 'WRB': 46
}

# Syntactic chunk tags: 'O' plus a B-/I- pair per chunk type.
chunk_tags2indices = {
    'O': 0, 'B-ADJP': 1, 'I-ADJP': 2, 'B-ADVP': 3, 'I-ADVP': 4, 'B-CONJP': 5, 'I-CONJP': 6, 'B-INTJ': 7, 'I-INTJ': 8,
    'B-LST': 9, 'I-LST': 10, 'B-NP': 11, 'I-NP': 12, 'B-PP': 13, 'I-PP': 14, 'B-PRT': 15, 'I-PRT': 16, 'B-SBAR': 17,
    'I-SBAR': 18, 'B-UCP': 19, 'I-UCP': 20, 'B-VP': 21, 'I-VP': 22
}

# Named-entity tags: 'O' plus a B-/I- pair for PER, ORG, LOC and MISC. This is the label set
# the model is trained on (passed to the model as label2id).
ner_tags2indices = {
    'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8
}

In [None]:
# Size of each tag vocabulary; number_of_ner_tags is passed as num_labels when the model is built.
number_of_pos_tags = len(pos_tags2indices)
number_of_chunks_tags = len(chunk_tags2indices)
number_of_ner_tags = len(ner_tags2indices)

In [None]:
def create_dico_indices2tags(dico_tags2indices: dict) -> dict:
    """Invert a tag-to-index dictionary into an index-to-tag dictionary.

    Args:
        dico_tags2indices: mapping whose keys are tags and whose values are indices.

    Returns:
        A new dictionary mapping every index to its tag. If several tags share the
        same index, the tag that comes last in iteration order is kept.
    """
    return {index: tag for tag, index in dico_tags2indices.items()}

In [None]:
# Reverse (index -> tag) lookups. All three are used by print_sample(); ner_indices2tags is
# also passed to the model as id2label.
pos_indices2tags = create_dico_indices2tags(dico_tags2indices=pos_tags2indices)
chunk_indices2tags = create_dico_indices2tags(dico_tags2indices=chunk_tags2indices)
ner_indices2tags = create_dico_indices2tags(dico_tags2indices=ner_tags2indices)

In [None]:
def print_sample(sample) -> None:
    """Print one dataset sample as a table with a row per token.

    Each row shows the token position, the token text and, for each of the three
    annotation layers (POS, chunk, NER), the integer index stored in the sample
    followed by the tag string found in the module-level index-to-tag dictionaries.

    Args:
        sample: mapping with the keys "tokens", "pos", "chunks" and "ner", each
            holding a sequence indexed by token position.
    """
    rule = "-" * 74
    header_template = "{:<4} | {:<20} | {:<3} {:<10} | {:<3} {:<10} | {:<3} {:<10}"
    row_template = "{:<5} | {:<20} | {:<3} {:<10} | {:<3} {:<10} | {:<3} {:<10}"

    print(rule)
    print(header_template.format("INDEX", "TOKEN", "", "POS", "", "CHUNK", "", "NER"))
    print(rule)

    for position in range(len(sample["tokens"])):
        # Resolve each layer's stored index to its tag string.
        pos_index = sample["pos"][position]
        pos_tag = pos_indices2tags[pos_index]

        chunk_index = sample["chunks"][position]
        chunk_tag = chunk_indices2tags[chunk_index]

        ner_index = sample["ner"][position]
        ner_tag = ner_indices2tags[ner_index]

        row = (
            position, sample["tokens"][position],
            pos_index, pos_tag,
            chunk_index, chunk_tag,
            ner_index, ner_tag,
        )
        print(row_template.format(*row))

    print(rule)

In [None]:
# Show the raw fields of the selected training sample, then a per-token table of its tags.
pc("Sample index", sample_index, break_line=True)

sample = dataset_train[sample_index]
for key in sample.keys():
    pc(key, sample[key])
print_sample(sample=sample)

# BERT model

In [None]:
# Checkpoint name used to load both the tokenizer and the model.
# NOTE(review): this is an uncased checkpoint; presumably input is lower-cased, which would
# remove capitalization cues useful for NER — confirm this is intended.
model_checkpoint = 'bert-base-uncased'

In [None]:
# Tokenizer matching the checkpoint. Later cells call word_ids() on its output.
# NOTE(review): word_ids() presumably requires a "fast" tokenizer — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level NER labels with the resulting tokens.

    Args:
        examples: batch (dict of lists) with a "tokens" entry (one list of words per
            sentence) and a "ner" entry (one list of integer NER labels per sentence).

    Returns:
        The tokenizer output with an extra "labels" entry holding, per sentence, one
        label per token: -100 for special tokens, the word's label for its first token,
        and for the remaining tokens of a word either the word's label with B-X turned
        into I-X (label_all_tokens=True) or -100 (label_all_tokens=False).
    """
    label_all_tokens = True

    # A word tagged B-X starts an entity; its continuation pieces are *inside* that same
    # entity and must be labelled I-X. Copying B-X onto every piece would make seqeval
    # count a single multi-piece entity word as several entities.
    begin_to_inside = {
        index: ner_tags2indices["I-" + tag[2:]]
        for tag, index in ner_tags2indices.items()
        if tag.startswith("B-") and "I-" + tag[2:] in ner_tags2indices
    }

    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are
            # automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # First token of a word: keep the word's own label.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # Other tokens of the same word: the I- variant of the label, or -100,
            # depending on the label_all_tokens flag.
            elif label_all_tokens:
                word_label = label[word_idx]
                label_ids.append(begin_to_inside.get(word_label, word_label))
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Try the alignment on a one-row batch. The slice keeps the dict-of-lists batch layout the
# function expects, and it follows sample_index so this cell shows the same sample as the others.
tokenize_and_align_labels(dataset_train[sample_index:sample_index + 1])

In [None]:
def tokenize_dataset(dataset):
    """Apply tokenize_and_align_labels to a whole dataset in batched mode.

    No remove_columns argument is passed to map().

    Args:
        dataset: object exposing a map(function, batched=...) method.

    Returns:
        Whatever dataset.map returns for the batched call.
    """
    tokenized = dataset.map(tokenize_and_align_labels, batched=True)
    return tokenized

In [None]:
# Tokenize both splits and attach the aligned labels; these are the datasets given to the Trainer.
tokenized_dataset_train = tokenize_dataset(dataset_train)
tokenized_dataset_test = tokenize_dataset(dataset_test)

In [None]:
# Inspect how the words of one sample map to tokens, input ids and aligned labels.

# Re-read the raw sample so its words can be compared with the tokenized version.
sample = dataset_train[sample_index]

sample_tokens = sample["tokens"]
number_of_sample_tokens = len(sample_tokens)

pc("Sample tokens", sample_tokens)
pc("Number of sample tokens", number_of_sample_tokens, break_line=True)

# Input ids stored for the same row of the tokenized training set.
sample_input_ids = tokenized_dataset_train[sample_index]["input_ids"]
number_of_sample_input_ids = len(sample_input_ids)

pc("Input ids", sample_input_ids)
pc("Number of input ids", number_of_sample_input_ids, break_line=True)

# Tokenize the sample again on its own to get word_ids(): for each token, the index of the
# word it came from (None for special tokens, as handled in tokenize_and_align_labels).
# NOTE(review): this call omits truncation=True, unlike tokenize_and_align_labels, so for a
# very long sample the lengths could differ from the stored input ids — confirm.
tokenized_sample_input = tokenizer(sample_tokens, is_split_into_words=True)
word_indices = tokenized_sample_input.word_ids()
number_of_word_indices = len(word_indices)

pc("Word indices", word_indices)
pc("Number of word indices", number_of_word_indices, break_line=True)


# Labels produced by tokenize_and_align_labels for the same row.
sample_labels_aligned = tokenized_dataset_train[sample_index]["labels"]
number_of_sample_labels_aligned = len(sample_labels_aligned)

pc("Aligned labels", sample_labels_aligned)
pc("Number of aligned labels", number_of_sample_labels_aligned, break_line=True)

# For every original word, collect the positions of its tokens and print the word next to
# the word indices, input ids and aligned labels found at those positions.
for s in range(number_of_sample_tokens):
    w = np.asarray([i for i, j in enumerate(word_indices) if j == s])
    xsample_token = sample_tokens[s]
    xword_indices = [word_indices[k] for k in w]
    xsample_input_ids = [sample_input_ids[k] for k in w]
    xsample_labels_aligned = [sample_labels_aligned[k] for k in w]
    print("{} {:<3} {}, {}, {}".format(s, xsample_token, xword_indices, xsample_input_ids, xsample_labels_aligned))


In [None]:
# Print the raw sample followed by every column of its tokenized counterpart.
pc("Sample index", sample_index, break_line=True)
print(dataset_train[sample_index])
# Look the tokenized row up once instead of indexing the dataset again for every key.
tokenized_sample = tokenized_dataset_train[sample_index]
for key in tokenized_sample.keys():
    pc(key, tokenized_sample[key])

In [None]:
# Round-trip check: decode the sample's input ids back into a string.
print(tokenizer.decode(tokenized_dataset_train[sample_index]["input_ids"]))

In [None]:
# Load a token-classification model from the checkpoint, configured with one label per NER
# tag and with the NER index <-> tag mappings attached (id2label / label2id).
model = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
    num_labels=number_of_ner_tags,
    id2label=ner_indices2tags,
    label2id=ner_tags2indices,
)

In [None]:
# Fine-tuning configuration. Evaluation, logging and checkpoint saving all happen once per
# epoch; "bert-finetuned-ner" is the output directory.
args = TrainingArguments(
    "bert-finetuned-ner",
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # The previous value (1e-3) is far above the 2e-5..5e-5 range recommended for
    # fine-tuning BERT and typically makes training diverge, so a standard
    # fine-tuning learning rate is used instead.
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Metrics are reported to Weights & Biases.
    report_to="wandb",
)

In [None]:
def compute_metrics(p):
    """Compute entity-level precision, recall, F1 and accuracy with seqeval.

    Args:
        p: pair (predictions, labels). predictions holds per-token class scores and is
            arg-maxed over axis 2; labels holds the reference label ids, with -100 at
            positions that must be ignored.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken from the
        overall_* entries of the seqeval result.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Convert ids to tag strings with the notebook's NER index -> tag mapping (the
    # previously referenced `label_list` is not defined anywhere in the notebook).
    # Positions labelled -100 (special tokens / ignored pieces) are dropped from both sides.
    true_predictions = [
        [ner_indices2tags[int(pred_id)] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [ner_indices2tags[int(label_id)] for label_id in label if label_id != -100]
        for label in labels
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Collator built from the tokenizer and handed to the Trainer as data_collator.
# NOTE(review): presumably pads input ids and labels to a common length per batch — confirm
# against the transformers documentation.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Wire everything together: train on the tokenized training split and evaluate on the
# tokenized test split (the visible cells load no separate validation split), scoring
# each evaluation with compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

In [None]:
# Run fine-tuning with the configured Trainer.
trainer.train()

# Evaluation

In [None]:
# NER inference pipeline built from the model and tokenizer used above.
# NOTE(review): aggregation_strategy="first" presumably tags each word with the label of its
# first sub-token — confirm against the pipeline documentation.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first")

In [None]:
# Example sentence used for a qualitative check of the NER pipeline.
text = "She then related that, by the permission of Elizabeth, she had passed the evening of the night on which the murder had been committed at the house of an aunt at Chene, a village situated at about a league from Geneva."

In [None]:
# Split the example sentence into the tokenizer's tokens (printed in the summary cell).
text_tokenized = tokenizer.tokenize(text)

In [None]:
# Run NER on the raw sentence: the pipeline tokenizes its input itself. Passing the list of
# word-piece strings would treat every fragment as a separate document and tag it without
# any sentence context.
text_ner = nlp(text)

In [None]:
# Print the input sentence, its tokens and the pipeline output.
pc("Text", text)
pc("Tokenized text", text_tokenized)
pc("NER", text_ner)