In [1]:
from transformers import pipeline

In [2]:
from pprint import pprint

In [3]:
# Build the named-entity-recognition pipeline with an explicitly pinned
# checkpoint instead of relying on the library's task default, which is chosen
# by the installed transformers release and can change between versions.
# This is the same checkpoint the manual walkthrough later in this notebook
# loads with AutoModelForTokenClassification.
# NOTE(review): believed to be the checkpoint the unpinned default resolved to
# when the recorded output was produced — confirm the entities are unchanged.
nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

In [4]:
# Example sentence used throughout the notebook. Written as adjacent string
# literals (joined at compile time) so each line stays readable.
sequence = (
    "Hugging Face Inc. is a company based in New York City. "
    "Its headquarters are in DUMBO, therefore very close to "
    "the Manhattan Bridge which is visible from the window."
)

In [5]:
# Run the pipeline on the example sentence and pretty-print what it returns.
ner_entities = nlp(sequence)
pprint(ner_entities)

[{'entity': 'I-ORG', 'index': 1, 'score': 0.999578595161438, 'word': 'Hu'},
 {'entity': 'I-ORG',
  'index': 2,
  'score': 0.9909763932228088,
  'word': '##gging'},
 {'entity': 'I-ORG', 'index': 3, 'score': 0.9982224702835083, 'word': 'Face'},
 {'entity': 'I-ORG', 'index': 4, 'score': 0.9994880557060242, 'word': 'Inc'},
 {'entity': 'I-LOC', 'index': 11, 'score': 0.9994345307350159, 'word': 'New'},
 {'entity': 'I-LOC', 'index': 12, 'score': 0.9993196129798889, 'word': 'York'},
 {'entity': 'I-LOC', 'index': 13, 'score': 0.9993793964385986, 'word': 'City'},
 {'entity': 'I-LOC', 'index': 19, 'score': 0.9862582683563232, 'word': 'D'},
 {'entity': 'I-LOC', 'index': 20, 'score': 0.9514269828796387, 'word': '##UM'},
 {'entity': 'I-LOC', 'index': 21, 'score': 0.9336590766906738, 'word': '##BO'},
 {'entity': 'I-LOC',
  'index': 28,
  'score': 0.9761654138565063,
  'word': 'Manhattan'},
 {'entity': 'I-LOC',
  'index': 29,
  'score': 0.9914628863334656,
  'word': 'Bridge'}]


In [6]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

In [7]:
# Load the tokenizer and the token-classification model used for the manual
# (non-pipeline) walkthrough. The two loads are independent of each other.
# NOTE(review): the tokenizer is loaded from "bert-base-cased" while the model
# comes from a "bert-large-cased" fine-tuned checkpoint — presumably both share
# the same vocabulary; confirm, or load the tokenizer from the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(
    "dbmdz/bert-large-cased-finetuned-conll03-english",
    return_dict=True,  # lets the forward pass result be read via `.logits`
)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=433.0), HTML(value='')))




In [8]:
# Tag inventory for the token classifier. Order matters: this list is indexed
# with the predicted class id, so position i must hold the tag for class i.
# A "B-" tag marks the beginning of an entity that starts right after another
# entity of the same type; every other entity token carries the "I-" tag.
label_list = [
    "O",                 # outside of any named entity
    "B-MISC", "I-MISC",  # miscellaneous entity
    "B-PER", "I-PER",    # person's name
    "B-ORG", "I-ORG",    # organisation
    "B-LOC", "I-LOC",    # location
]

In [9]:
# Encode the sentence once; the result is a PyTorch tensor of token ids
# (return_tensors="pt") that is fed to the model in the next cell.
inputs = tokenizer.encode(sequence, return_tensors="pt")

# Recover the token strings from the exact ids that go into the model, so that
# tokens[i] is guaranteed to line up with the i-th prediction. The previous
# encode -> decode -> tokenize round trip re-tokenized a reconstructed string,
# which encodes the sentence twice and is not guaranteed to reproduce the same
# token sequence.
tokens = tokenizer.convert_ids_to_tokens(inputs[0].tolist())

In [10]:
# Inference only: disable autograd so the forward pass does not record a
# computation graph (saves memory and time; no gradients are needed here).
with torch.no_grad():
    # `outputs` holds the logits from the model's forward pass.
    outputs = model(inputs).logits

# Pick the highest-scoring class index along dim 2 for every token.
predictions = torch.argmax(outputs, dim=2)

In [11]:
# Pair every token with the tag name of its predicted class and print the
# resulting list of (token, tag) tuples for the first (only) sequence.
print(
    [
        (tok, label_list[class_id])
        for tok, class_id in zip(tokens, predictions[0].numpy())
    ]
)

[('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('close', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('which', 'O'), ('is', 'O'), ('visible', 'O'), ('from', 'O'), ('the', 'O'), ('window', 'O'), ('.', 'O'), ('[SEP]', 'O')]
