In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification
import spacy
from spacy import displacy

In [2]:
tokenizer = AutoTokenizer.from_pretrained("eolang/Swahili-NER-BertBase-Cased")
model = AutoModelForTokenClassification.from_pretrained("eolang/Swahili-NER-BertBase-Cased")

In [3]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "Kwa nini Kenya inageukia mazao ya GMO kukabiliana na ukame"

In [4]:
ner_results = nlp(example)

In [5]:
# print word and NER tag
for result in ner_results:
    print(result["word"], result["entity"], result["score"])
    print('-----' * 10)

Kwa LABEL_0 0.99923885
--------------------------------------------------
nin LABEL_0 0.99925774
--------------------------------------------------
##i LABEL_0 0.9988446
--------------------------------------------------
Kenya LABEL_5 0.8325896
--------------------------------------------------
ina LABEL_0 0.9995541
--------------------------------------------------
##geu LABEL_0 0.9995402
--------------------------------------------------
##kia LABEL_0 0.9994684
--------------------------------------------------
ma LABEL_0 0.9829077
--------------------------------------------------
##za LABEL_0 0.9738737
--------------------------------------------------
##o LABEL_0 0.97064567
--------------------------------------------------
ya LABEL_0 0.9718624
--------------------------------------------------
GM LABEL_0 0.735248
--------------------------------------------------
##O LABEL_0 0.8975173
--------------------------------------------------
ku LABEL_0 0.99961275
-----------------------

In [22]:
# label mapping O (0), B-PER (1), I-PER (2), B-ORG (3), I-ORG (4), B-LOC (5), I-LOC (6).
'''
In simple temrs B-PER means the word is the beginning of a person's name, I-PER means it is inside a person's name, and O means it is outside a named entity.
This is the IOB2 tagging scheme; the same logc aplies to B-ORG, I-ORG, B-LOC, I-LOC etc.
'''
label_map = {
    'LABEL_0': "Other",
    'LABEL_1': "B-PERSON",
    'LABEL_2': "I-PERSON",
    'LABEL_3': "B-ORGANIZATION",
    'LABEL_4': "I-ORGANIZATION",
    'LABEL_5': "B-LOCATION",
    'LABEL_6': "I-LOCATION"
}

In [43]:
def tag(sentence):
    # get NER tags
    ner_results = nlp(sentence)
    # get the labels
    labels = [label_map[result["entity"]] for result in ner_results]
    # get the words
    words = [result["word"] for result in ner_results]
    # zip words and labels
    results = list(zip(words, labels))

    tokens = words
    doc = model(' '.join(tokens))
    return results, tokens, doc

In [44]:
x, y, z = tag(example)

AttributeError: 'str' object has no attribute 'size'

In [40]:
y

['Kwa',
 'nin',
 '##i',
 'Kenya',
 'ina',
 '##geu',
 '##kia',
 'ma',
 '##za',
 '##o',
 'ya',
 'GM',
 '##O',
 'ku',
 '##kab',
 '##iliana',
 'na',
 'uk',
 '##ame']

In [41]:
x

[('Kwa', 'Other'),
 ('nin', 'Other'),
 ('##i', 'Other'),
 ('Kenya', 'B-LOCATION'),
 ('ina', 'Other'),
 ('##geu', 'Other'),
 ('##kia', 'Other'),
 ('ma', 'Other'),
 ('##za', 'Other'),
 ('##o', 'Other'),
 ('ya', 'Other'),
 ('GM', 'Other'),
 ('##O', 'Other'),
 ('ku', 'Other'),
 ('##kab', 'Other'),
 ('##iliana', 'Other'),
 ('na', 'Other'),
 ('uk', 'Other'),
 ('##ame', 'Other')]

In [42]:
z

[{'entity': 'LABEL_0',
  'score': 0.9995419,
  'index': 1,
  'word': 'Kwa',
  'start': 0,
  'end': 3},
 {'entity': 'LABEL_0',
  'score': 0.99911195,
  'index': 2,
  'word': 'nin',
  'start': 4,
  'end': 7},
 {'entity': 'LABEL_0',
  'score': 0.9993986,
  'index': 3,
  'word': '#',
  'start': 8,
  'end': 9},
 {'entity': 'LABEL_0',
  'score': 0.9993782,
  'index': 4,
  'word': '#',
  'start': 9,
  'end': 10},
 {'entity': 'LABEL_0',
  'score': 0.99946207,
  'index': 5,
  'word': 'i',
  'start': 10,
  'end': 11},
 {'entity': 'LABEL_5',
  'score': 0.99312097,
  'index': 6,
  'word': 'Kenya',
  'start': 12,
  'end': 17},
 {'entity': 'LABEL_0',
  'score': 0.99955934,
  'index': 7,
  'word': 'ina',
  'start': 18,
  'end': 21},
 {'entity': 'LABEL_0',
  'score': 0.99954766,
  'index': 8,
  'word': '#',
  'start': 22,
  'end': 23},
 {'entity': 'LABEL_0',
  'score': 0.9995577,
  'index': 9,
  'word': '#',
  'start': 23,
  'end': 24},
 {'entity': 'LABEL_0',
  'score': 0.99952495,
  'index': 10,
  'w

------------

In [24]:
#print all keys
print(ner_results[0].keys())

dict_keys(['entity', 'score', 'index', 'word', 'start', 'end'])


In [25]:
# create a Spacy Doc object with the words
words = [r['word'] for r in ner_results]
doc = spacy.tokens.Doc(spacy.vocab.Vocab(), words=words)

In [32]:
type (words)

list

In [33]:
words

['Kwa',
 'nin',
 '##i',
 'Kenya',
 'ina',
 '##geu',
 '##kia',
 'ma',
 '##za',
 '##o',
 'ya',
 'GM',
 '##O',
 'ku',
 '##kab',
 '##iliana',
 'na',
 'uk',
 '##ame']

In [31]:
type(doc)

spacy.tokens.doc.Doc

In [45]:
# add entities to the Doc object
entities = [(i, i+1, r['entity']) for i, r in enumerate(ner_results)]
doc.ents = [spacy.tokens.Span(doc, start, end, label) for start, end, label in entities]

In [47]:
# create the visualization with displacy with colors
options = {'ents': [r['entity'] for r in ner_results]}
html = displacy.render(doc, style='ent', jupyter=True, options=options)

In [52]:
displacy.serve(doc, style="ent", options=options)




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [21/Mar/2023 00:28:07] "GET / HTTP/1.1" 200 6129
127.0.0.1 - - [21/Mar/2023 00:28:08] "GET /favicon.ico HTTP/1.1" 200 6129
127.0.0.1 - - [21/Mar/2023 00:28:15] "GET /favicon.ico HTTP/1.1" 200 6129


Shutting down server on port 5000.


In [49]:
with open("ner.html", "w", encoding="utf-8") as file:
    file.write(str(html))

In [55]:

# Load a spaCy language model
nlp2 = spacy.load("en_core_web_sm")

# Define some text with NER entities
text = "Apple is looking at buying U.K. startup for $1 billion"

# Process the text with the language model
doc2 = nlp2(text)

# Generate a visualization of the NER entities
options = {"ents": ["ORG", "GPE", "MONEY"]}
colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
          "GPE": "linear-gradient(90deg, #ffba9c, #ffc7c7)",
          "MONEY": "linear-gradient(90deg, #b2fca1, #12d912)"}
displacy.serve(doc2, style="ent", options=options, port=5050)





Using the 'ent' visualizer
Serving on http://0.0.0.0:5050 ...



127.0.0.1 - - [21/Mar/2023 00:30:24] "GET / HTTP/1.1" 200 1378
127.0.0.1 - - [21/Mar/2023 00:30:25] "GET /favicon.ico HTTP/1.1" 200 1378


Shutting down server on port 5050.
