In [None]:
from datasets import load_dataset


# Load the "swa" configuration of the "masakhaner" dataset from the Hub.
dataset = load_dataset(path="masakhaner", name="swa")

In [None]:
# Tag strings for the NER class ids, read from the training split's schema.
ner_feature = dataset["train"].features["ner_tags"]
label_names = ner_feature.feature.names
print(label_names)

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name that the model cell loads.
checkpoint = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of pre-split sentences, padded to the maximum length.

    Args:
        examples: mapping with a ``"tokens"`` entry holding the words of
            each sentence.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch when
        called with max-length padding and truncation enabled.
    """
    words_per_sentence = examples["tokens"]
    return tokenizer(
        words_per_sentence,
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
    )

    
# Apply the max-length tokenization to every split, batch by batch.
# NOTE(review): `tokenized_datasets_` is not referenced by any later cell
# visible here - confirm whether this pass is still needed.
tokenized_datasets_ = dataset.map(function=tokenize_function, batched=True)

In [None]:
# Get the values for input_ids, token_type_ids, attention_mask
def tokenize_adjust_labels(all_samples_per_split):
    """Tokenize pre-split sentences and align their NER tags to sub-word tokens.

    Meant to be used with ``dataset.map(..., batched=True)``: the tokenizer
    output alone is not a ``datasets`` object, so ``map`` is what adds the
    returned keys (tokenizer outputs plus ``labels``) to every split.

    Alignment rules:
      * positions with no source word (word id ``None``, e.g. special
        tokens) get ``-100``;
      * every other position gets the tag of the word it came from, so all
        sub-word pieces of a word share that word's tag.

    Tags are looked up by word id rather than by a running counter, so the
    alignment stays correct even if some word yields no tokens at all.

    Args:
        all_samples_per_split: batch with ``"tokens"`` (list of words per
            sentence) and ``"ner_tags"`` (list of tag ids per sentence,
            parallel to ``"tokens"``).

    Returns:
        The tokenizer's batch encoding with an extra ``"labels"`` entry
        holding one list of label ids per sentence, parallel to that
        sentence's ``input_ids``.
    """
    # Same call style as tokenize_function; truncation keeps every sequence
    # within the tokenizer's maximum length.
    tokenized_samples = tokenizer(
        all_samples_per_split["tokens"],
        is_split_into_words=True,
        truncation=True,
    )

    total_adjusted_labels = []
    for k, existing_label_ids in enumerate(all_samples_per_split["ner_tags"]):
        word_ids_list = tokenized_samples.word_ids(batch_index=k)
        adjusted_label_ids = [
            -100 if wid is None else existing_label_ids[wid]
            for wid in word_ids_list
        ]
        total_adjusted_labels.append(adjusted_label_ids)

    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

In [None]:
# Tokenize every split and attach the aligned `labels` column, batch by batch.
tokenized_dataset = dataset.map(function=tokenize_adjust_labels, batched=True)

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator that builds token-classification batches using this tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments

# Store the id <-> tag mapping in the model config so that saved / pushed
# checkpoints report the real tag strings instead of generic LABEL_<n> names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained encoder with a token-classification head sized to the tag set.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-multilingual-cased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
import numpy as np
from datasets import load_metric

# Entity-level metric used by compute_metrics and the test-set evaluation.
# NOTE(review): `load_metric` is deprecated in later `datasets` releases -
# confirm it is available in the installed version.
metric = load_metric(path="seqeval")

In [None]:
def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores and is arg-maxed over axis 2 (assumed layout:
            batch x sequence x classes); ``labels`` holds the reference
            label ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken
        from the metric's ``overall_*`` entries.
    """
    scores, gold_ids = p
    # Predicted class = index of the highest score at each position.
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            # -100 marks ignored positions (e.g. special tokens).
            if gold_id == -100:
                continue
            kept_predictions.append(label_names[pred_id])
            kept_labels.append(label_names[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
from transformers import TrainingArguments, Trainer

# Training hyper-parameters.
epochs = 5
batch_size = 16

# Log every (training examples // batch size) steps.
num_train_examples = len(tokenized_dataset["train"])
logging_steps = num_train_examples // batch_size

In [None]:
# Training configuration: evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="results",
    # Schedule
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging / runtime
    logging_steps=logging_steps,
    disable_tqdm=False,
    no_cuda=False,
)

In [None]:
# Wire model, data, batching and metrics together: train on the "train"
# split, evaluate on the "validation" split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Score the held-out test split and report the full seqeval breakdown.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens) and map class ids to tag strings.
true_predictions = [
    [label_names[pred_id] for (pred_id, gold_id) in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [label_names[gold_id] for (pred_id, gold_id) in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

In [None]:
# Evaluate with no explicit dataset argument; the Trainer was constructed with
# the "validation" split as its eval_dataset.
trainer.evaluate()

In [None]:
# Save (locally) the fine-tuned model & tokenizer.
# The snippet below is wrapped in a string literal, so it is not executed as written.
'''
model_path = 'Swahili-NER BertBase Cased'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
'''

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook; the push_to_hub calls
# in the next cell upload to the Hub.
notebook_login()

In [None]:
# Upload the model and its tokenizer to the same Hub repository.
hub_repo_name = "Swahili-NER-Distilbert-Cased"
model.push_to_hub(hub_repo_name)
tokenizer.push_to_hub(hub_repo_name)