In [1]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [1]:
from transformers import BertTokenizer, BertModel
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

## Prepare NER

In [152]:
# Load the tokenizer and the token-classification model from the same checkpoint.
# The checkpoint id is named once so the two loads cannot drift apart.
# NOTE(review): later cells call predict() with `tokenizer_en` / `model_en`, not with the
# `tokenizer` / `model` created here — confirm which checkpoint those names refer to.
NER_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)



In [65]:
# model_en = AutoModelForTokenClassification.from_pretrained("sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english")
# tokenizer_en = AutoTokenizer.from_pretrained("sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english")



In [153]:
# NOTE(review): `tokenizer_en` / `model_en` are used by later cells, but the only
# assignments to them visible in this notebook are commented out (this cell and the
# previous one). On a fresh kernel those later cells will raise NameError — re-enable
# whichever load is the intended one.
# model_en = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
# tokenizer_en = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

In [154]:
# Tag names in class-id order (presumably label_list[i] names predicted id i — confirm
# against the model config).
#   "O"   : outside of a named entity
#   "B-*" : beginning of an entity that comes right after another entity of the same type
#   "I-*" : any other token of an entity of that type
# Entity types: MISC = miscellaneous, PER = person's name, ORG = organisation, LOC = location.
# Resulting order: O, B-MISC, I-MISC, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC.
label_list = ["O"] + [
    f"{prefix}-{kind}"
    for kind in ("MISC", "PER", "ORG", "LOC")
    for prefix in ("B", "I")
]

In [155]:
def predict(tokenizer, model, sequence):
    """Push ``sequence`` through the tokenizer and the model, then return it unchanged.

    The token strings and the per-token argmax are computed but discarded; the
    function's return value is the input ``sequence`` itself.
    NOTE(review): looks like this variant exists only so the full call can be timed —
    confirm. A later cell redefines ``predict`` with a different return value.

    Parameters
    ----------
    tokenizer : object providing ``encode``, ``decode`` and ``tokenize``.
    model : callable; the first element of its result is passed to ``torch.argmax``.
    sequence : text to process.
    """
    # Encode -> decode -> tokenize round trip: a hack to obtain the token strings
    # with the special tokens included.
    encoded_ids = tokenizer.encode(sequence)
    tokens = tokenizer.tokenize(tokenizer.decode(encoded_ids))

    # Second encode, this time requesting PyTorch tensors for the model.
    input_ids = tokenizer.encode(sequence, return_tensors="pt")
    logits = model(input_ids)[0]
    predictions = torch.argmax(logits, dim=2)

    return sequence

In [5]:
# Dutch example sentence; this is the `sequence` in scope for the first %timeit cell
# further down (it is reassigned to an English sentence after that).
sequence = "Marloes (ook een oud-collega) wil graag een keertje thee komen drinken en ik dacht dat dat wel handig is als jij werkt!"

In [5]:
# English example text wrapped in a flair Sentence for the flair tagger.
# NOTE(review): the two adjacent string literals are joined without a separating space,
# so the text contains "veryclose". It is left byte-identical here so that the recorded
# output of the following cells still matches this input.
sentence = Sentence(
    "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very"
    "close to the Manhattan Bridge."
)
# Load flair's pre-trained 'ner' tagger. The next cell calls `tagger.predict(...)`, and
# no other visible cell assigns `tagger`, so leaving this commented out raises NameError
# on a fresh kernel.
tagger = SequenceTagger.load('ner')

In [8]:
# Run the flair tagger over `sentence`. The NER labels are read back from the Sentence
# object itself (via sentence.get_spans('ner') in the next cell), not from `result`.
result = tagger.predict(sentence)

In [10]:
# Show the tagged sentence (its printed form includes the per-token labels), then a header line.
print(sentence)
print('The following NER tags are found:')

# Print every entity span stored on the sentence under the 'ner' label type, one per line.
for entity in sentence.get_spans('ner'):
    print(entity)

Sentence: "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore veryclose to the Manhattan Bridge."   [− Tokens: 22  − Token-Labels: "Hugging <B-ORG> Face <I-ORG> Inc. <E-ORG> is a company based in New <B-LOC> York <I-LOC> City. <E-LOC> Its headquarters are in DUMBO, <S-LOC> therefore veryclose to the Manhattan <B-LOC> Bridge. <E-LOC>"]
The following NER tags are found:
Span [1,2,3]: "Hugging Face Inc."   [− Labels: ORG (0.9977)]
Span [9,10,11]: "New York City."   [− Labels: LOC (0.9928)]
Span [16]: "DUMBO,"   [− Labels: LOC (0.9155)]
Span [21,22]: "Manhattan Bridge."   [− Labels: LOC (0.8166)]


In [56]:
# %timeit predict(tokenizer_wietse, model_wietse, sequence)

In [17]:
# Time one complete predict() call (tokenisation plus model forward pass) on `sequence`.
# NOTE(review): `tokenizer_db` / `model_db` are not assigned in any visible cell —
# confirm where they are loaded.
%timeit predict(tokenizer_db, model_db, sequence)

85.3 ms ± 5.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# English example sentence used for timing the English model.
# The two halves are joined with an explicit space: the original adjacent literals had
# none, which produced the word "veryclose".
sequence = (
    "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very "
    "close to the Manhattan Bridge."
)

In [19]:
# Time one complete predict() call with the English tokenizer/model on the English `sequence`.
# NOTE(review): `tokenizer_en` / `model_en` are only assigned in commented-out cells in
# the visible notebook — confirm where they are loaded.
%timeit predict(tokenizer_en, model_en, sequence)

329 ms ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [156]:
def predict(tokenizer, model, sequence):
    """Run a token-classification model on one piece of text.

    Parameters
    ----------
    tokenizer : object providing ``encode(text, return_tensors="pt")`` and
        ``convert_ids_to_tokens(ids)`` (e.g. a Hugging Face tokenizer).
    model : callable taking the encoded ids; the first element of its result is
        used as the per-token scores.
    sequence : str
        Text to tag.

    Returns
    -------
    predictions : torch.Tensor
        ``argmax`` of the scores over dimension 2, i.e. one class id per token.
    tokens : list
        Token strings for exactly the ids that were fed to the model (special
        tokens included), so ``tokens[i]`` lines up with ``predictions[0][i]``.
    outputs : torch.Tensor
        The raw scores returned by the model (computed without gradient tracking).
    """
    inputs = tokenizer.encode(sequence, return_tensors="pt")

    # Read the token strings straight from the ids the model receives. The earlier
    # decode -> re-tokenize round trip is not guaranteed to reproduce the same pieces,
    # which would silently shift tokens relative to the predictions.
    tokens = tokenizer.convert_ids_to_tokens(inputs[0].tolist())

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(inputs)[0]
    predictions = torch.argmax(outputs, dim=2)

    return predictions, tokens, outputs

In [157]:
# Tag the item at index 2 of `coco` with the English model.
# NOTE(review): `coco` is loaded from JSON in a later cell, so on a fresh top-to-bottom
# run this line raises NameError — the load cell has to run first.
pred, tokens, outputs = predict(tokenizer_en, model_en, coco[2])

In [158]:
import numpy as np

In [159]:
import json

In [160]:
def _load_json(path):
    """Read and parse one JSON file.

    The encoding is pinned to UTF-8 so that parsing does not depend on the
    platform's default text encoding.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# The two document collections iterated by the name-extraction cells further down.
coco = _load_json('coco_reviews.json')
frozen = _load_json('frozen_reviews.json')

In [161]:
# Display the item at index 2 of `coco` (the same item that is passed to predict() in an earlier cell).
coco[2]



In [162]:
# Sanity check on a single full name; the third return value (the raw model output) is discarded.
pred, tokens, _ = predict(tokenizer_en, model_en, "Ernesto de la Cruz")

In [163]:
# Predicted class ids, one per token. Under the `label_list` ordering defined earlier,
# 3 = "B-PER" and 4 = "I-PER" — assuming `model_en` uses that same ordering (TODO confirm).
pred

tensor([[0, 3, 4, 4, 4, 4]])

In [164]:
# Token strings for the same input, including the [CLS] / [SEP] special tokens.
tokens

['[CLS]', 'Ernesto', 'de', 'la', 'Cruz', '[SEP]']

## Get Names

In [165]:
from tqdm import tqdm

In [180]:
# Collect every token of the `coco` documents whose predicted class id is 3 or 4.
# Under the `label_list` ordering those ids are "B-PER" / "I-PER" — assuming `model_en`
# shares that ordering (TODO confirm).
names = []
# Wrapping the collection itself (rather than enumerate(...)) lets tqdm see its length,
# so the progress bar can show a total and an ETA.
for doc in tqdm(coco):
    # Documents with 300 or more space-separated words are skipped.
    # A per-sentence fallback (split on "." and tag each sentence under the same limit)
    # was sketched here before but was disabled; TODO decide whether to restore it.
    if len(doc.split(" ")) >= 300:
        continue

    pred, tokens, _ = predict(tokenizer_en, model_en, doc)
    pred = pred.tolist()[0]  # the batch holds one sequence; take its list of class ids

    for val in np.where(np.isin(pred, [3, 4]))[0]:
        names.append(tokens[val])

1095it [03:11,  5.71it/s]


In [181]:
from collections import Counter
# Frequency of each token collected in `names`. Counts are per tokenizer piece: keys
# starting with "##" look like sub-word continuation pieces, so a single name can be
# spread over several keys.
Counter(names)

Counter({'Pi': 160,
         'Per': 676,
         'Co': 332,
         '##co': 162,
         'Miguel': 206,
         'X': 4,
         'Shaw': 2,
         '##xa': 89,
         'Adrian': 12,
         'Mo': 16,
         '##lina': 12,
         'Matthew': 4,
         'Al': 4,
         '##dric': 4,
         '##h': 2,
         'Lee': 20,
         'Un': 21,
         '##k': 19,
         '##rich': 20,
         'Anthony': 18,
         'Gonzalez': 14,
         'G': 19,
         '##ael': 15,
         'García': 7,
         'Bern': 15,
         'H': 5,
         '##ctor': 6,
         'Rivera': 9,
         'Benjamin': 11,
         'B': 12,
         '##rat': 11,
         '##t': 11,
         'Ernesto': 47,
         'de': 17,
         'la': 20,
         'Cruz': 34,
         'Ana': 2,
         'Of': 2,
         '##eli': 3,
         '##a': 5,
         'Mu': 8,
         '##rg': 2,
         '##u': 2,
         '##ía': 1,
         'Ma': 1,
         'So': 1,
         '##cor': 1,
         '##ro': 1,
         'Don'

In [171]:
# Collect every token of the `frozen` documents whose predicted class id is 3 or 4.
# Under the `label_list` ordering those ids are "B-PER" / "I-PER" — assuming `model_en`
# shares that ordering (TODO confirm).
frozen_names = []
# Wrapping the collection itself (rather than enumerate(...)) lets tqdm see its length,
# so the progress bar can show a total and an ETA; the loop index was never used.
for doc in tqdm(frozen):
    # Documents with 300 or more space-separated words are skipped.
    if len(doc.split(" ")) >= 300:
        continue

    pred, tokens, _ = predict(tokenizer_en, model_en, doc)
    pred = pred.tolist()[0]  # the batch holds one sequence; take its list of class ids

    for val in np.where(np.isin(pred, [3, 4]))[0]:
        frozen_names.append(tokens[val])

1119it [03:34,  5.22it/s]


In [173]:
# Frequency of each token collected in `frozen_names` (counted per tokenizer piece).
Counter(frozen_names)

Counter({'Per': 370,
         'Fr': 108,
         'Men': 66,
         '##zel': 66,
         'Bell': 56,
         'Olaf': 207,
         'Disney': 231,
         'Elsa': 459,
         'Kris': 95,
         '##to': 73,
         'Hans': 102,
         'Christian': 30,
         'Anderson': 9,
         'Sven': 39,
         'Anna': 415,
         'Pop': 1,
         'En': 1,
         'B': 3,
         '##ly': 1,
         '##ton': 1,
         'Kristen': 51,
         'Rocky': 1,
         'Bull': 1,
         '##win': 1,
         'Sherman': 1,
         'Walt': 21,
         'El': 6,
         '##za': 1,
         'Male': 1,
         '##ent': 1,
         'C': 8,
         'Andersen': 23,
         'Ari': 1,
         'F': 9,
         '##olm': 1,
         '##an': 1,
         'Holly': 2,
         'Al': 3,
         'S': 9,
         '##hr': 2,
         '##ek': 1,
         'Ce': 1,
         '##line': 1,
         'Dion': 1,
         'Elton': 1,
         'John': 4,
         'I': 63,
         '##dina': 63,
         '

## Sentiment

https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english?text=I+like+you.+I+love+you