In [9]:
from transformers import BertTokenizer, BertModel
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

In [10]:
# Tokenizer and token-classification model are both loaded from the same
# pretrained checkpoint identifier.
WIETSE_CHECKPOINT = "wietsedv/bert-base-dutch-cased-finetuned-conll2002-ner"
tokenizer_wietse = AutoTokenizer.from_pretrained(WIETSE_CHECKPOINT)
model_wietse = AutoModelForTokenClassification.from_pretrained(WIETSE_CHECKPOINT)

In [11]:
# Tokenizer and token-classification model are both loaded from the same
# pretrained checkpoint identifier.
DB_CHECKPOINT = "dbmdz/bert-base-multilingual-cased-finetuned-conll03-dutch"
tokenizer_db = AutoTokenizer.from_pretrained(DB_CHECKPOINT)
model_db = AutoModelForTokenClassification.from_pretrained(DB_CHECKPOINT)

In [12]:
# NOTE(review): the tokenizer is loaded from a different checkpoint id
# ("bert-base-cased") than the model -- confirm this pairing is intended.
tokenizer_en = AutoTokenizer.from_pretrained("bert-base-cased")
model_en = AutoModelForTokenClassification.from_pretrained(
    "dbmdz/bert-large-cased-finetuned-conll03-english"
)

In [13]:
# Tag inventory. "O" marks a token outside of any named entity. Every entity
# type (MISC = miscellaneous, PER = person's name, ORG = organisation,
# LOC = location) contributes two tags, in this order:
#   "B-<type>": beginning of an entity right after another entity of that type
#   "I-<type>": a token belonging to an entity of that type
label_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in ("MISC", "PER", "ORG", "LOC")
    for prefix in ("B", "I")
]

In [14]:
def predict(tokenizer, model, sequence):
    """Tokenize ``sequence`` and run a single forward pass of ``model`` over it.

    Args:
        tokenizer: Object providing ``encode``, ``decode`` and ``tokenize``.
        model: Callable that takes the encoded ids and returns an indexable
            result whose first element is the score tensor.
        sequence: Input text to process.

    Returns:
        ``sequence``, unchanged.

    NOTE(review): the tokens and the arg-max class ids are computed but never
    returned, so this function currently only exercises the pipeline (e.g. for
    timing). Confirm whether callers should receive the predictions instead.
    """
    # Bit of a hack to get the tokens with the special tokens
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
    inputs = tokenizer.encode(sequence, return_tensors="pt")

    # Inference only: disable autograd so no graph is recorded for the
    # forward pass.
    with torch.no_grad():
        outputs = model(inputs)[0]
        # Highest-scoring index along dim 2 (assumed to be the class axis of a
        # (batch, tokens, classes) tensor -- TODO confirm).
        predictions = torch.argmax(outputs, dim=2)

    return sequence

In [15]:
# Sample input text used by the timing cells that follow.
sequence = (
    "Marloes (ook een oud-collega) wil graag een keertje thee komen drinken "
    "en ik dacht dat dat wel handig is als jij werkt!"
)

In [16]:
# Time repeated predict() calls using tokenizer_wietse / model_wietse on the current `sequence`.
%timeit predict(tokenizer_wietse, model_wietse, sequence)

82.1 ms ± 3.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
# Time repeated predict() calls using tokenizer_db / model_db on the current `sequence`.
%timeit predict(tokenizer_db, model_db, sequence)

85.3 ms ± 5.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# English sample text. The first fragment ends with a space so that the
# adjacent string literals join as "very close" rather than "veryclose".
sequence = (
    "Hugging Face Inc. is a company based in New York City. "
    "Its headquarters are in DUMBO, therefore very "
    "close to the Manhattan Bridge."
)

In [19]:
# Time repeated predict() calls using tokenizer_en / model_en on the current `sequence`.
%timeit predict(tokenizer_en, model_en, sequence)

329 ms ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
