In [2]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification
import spacy
from spacy import displacy

In [3]:
# Checkpoint identifier handed to both from_pretrained calls below
# (presumably a Hugging Face Hub repo id — confirm if a local path is intended).
model_id = "eolang/Swahili-NER-BertBase-Cased"
# Tokenizer and token-classification model are loaded from the same checkpoint
# id so that they belong together.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

In [4]:
# Wrap the loaded model and tokenizer in a "ner" pipeline; `nlp` is the callable
# that the tagging helpers in this notebook invoke on a raw sentence.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

-----

- Label mapping: O (0), B-PER (1), I-PER (2), B-ORG (3), I-ORG (4), B-LOC (5), I-LOC (6).
- In simple terms, B-PER means the word is the beginning of a person's name, I-PER means it is inside a person's name, and O means it is outside any named entity.
- This is the IOB2 tagging scheme; the same logic applies to B-ORG, I-ORG, B-LOC, I-LOC, etc.

In [5]:
# Translate the generic "LABEL_<id>" names into readable IOB2 tags.
# The tuple is ordered by label id, so a name's position is its id (0..6).
label_map = {
    f"LABEL_{label_id}": readable_name
    for label_id, readable_name in enumerate(
        (
            "Other",
            "B-PERSON",
            "I-PERSON",
            "B-ORGANIZATION",
            "I-ORGANIZATION",
            "B-LOCATION",
            "I-LOCATION",
        )
    )
}

In [7]:
def tag(sentence):
    """Tag ``sentence`` and pair every predicted token with a readable label.

    The sentence is passed to the ``nlp`` pipeline; each result's ``"entity"``
    value is translated through ``label_map`` and paired with the result's
    ``"word"`` value.

    Returns:
        A list of ``(word, label)`` tuples, one per pipeline result, in the
        order the pipeline produced them.

    Raises:
        KeyError: if a result's ``"entity"`` value is not a key of ``label_map``.
    """
    predictions = nlp(sentence)
    # Resolve all labels first so an unknown label fails before any pairing.
    readable_labels = [label_map[pred["entity"]] for pred in predictions]
    tokens = [pred["word"] for pred in predictions]
    return [(token, label) for token, label in zip(tokens, readable_labels)]

In [19]:
def tag2(sentence):
    """Return the ``nlp`` pipeline's output for ``sentence`` without any post-processing."""
    raw_predictions = nlp(sentence)
    return raw_predictions

In [23]:
def render(ner_results):
    """Display token-level NER predictions inline with spaCy's displaCy.

    Args:
        ner_results: Re-iterable sequence of dicts, each providing a ``'word'``
            and an ``'entity'`` key (in this notebook, the value returned by
            ``tag2``).

    Returns:
        None. The visualization is emitted as notebook output.
    """
    # Build a spaCy Doc on a fresh Vocab with one token per prediction.
    words = [r['word'] for r in ner_results]
    doc = spacy.tokens.Doc(spacy.vocab.Vocab(), words=words)

    # Each prediction becomes its own single-token span [i, i + 1), labelled
    # with the prediction's raw 'entity' value.
    doc.ents = [
        spacy.tokens.Span(doc, i, i + 1, r['entity'])
        for i, r in enumerate(ner_results)
    ]

    # 'ents' is the list of labels displaCy should show; list each label once
    # (first-seen order) instead of once per token. No custom colours are
    # supplied, so none are passed in the options.
    options = {'ents': list(dict.fromkeys(r['entity'] for r in ner_results))}
    displacy.render(doc, style='ent', jupyter=True, options=options)

In [21]:
# Example sentence used to demo the tagger (passed through tag2 and render).
sample = "Mwaka 2019, Rais wa Marekani Donald Trump alitangaza kuwa Marekani itaondoa majeshi yake kutoka Iraq."

In [24]:
# Run the raw pipeline on the sample sentence and visualize the predictions inline.
render(tag2(sample))