In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [2]:
classifier = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
raw_inputs = [
   "I really love fluffy things!",
    "I hate rice so much!",
    "I don't know whether I love or hate snakes.",
    "I think that snakes are scaring.",
    "Cats are cute!",
]

In [4]:
classifier(raw_inputs)

[{'label': 'POSITIVE', 'score': 0.9998101592063904},
 {'label': 'NEGATIVE', 'score': 0.9984360337257385},
 {'label': 'NEGATIVE', 'score': 0.9994237422943115},
 {'label': 'NEGATIVE', 'score': 0.9832170009613037},
 {'label': 'POSITIVE', 'score': 0.9998457431793213}]

In [5]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
inputs

{'input_ids': tensor([[  101,  1045,  2428,  2293, 27036,  2477,   999,   102,     0,     0,
             0,     0,     0,     0],
        [  101,  1045,  5223,  5785,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0],
        [  101,  1045,  2123,  1005,  1056,  2113,  3251,  1045,  2293,  2030,
          5223, 12971,  1012,   102],
        [  101,  1045,  2228,  2008, 12971,  2024, 11228,  2075,  1012,   102,
             0,     0,     0,     0],
        [  101,  8870,  2024, 10140,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [7]:
tokens = tokenizer.tokenize(raw_inputs[0])
tokens

['i', 'really', 'love', 'fluffy', 'things', '!']

In [8]:
conv_tokens=tokenizer.convert_tokens_to_ids(tokens)
conv_tokens

[1045, 2428, 2293, 27036, 2477, 999]

In [9]:
tokenizer.decode(conv_tokens)

'i really love fluffy things!'

In [10]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [11]:
outputs = model(**inputs)

In [12]:
outputs.logits

tensor([[-4.1258,  4.4430],
        [ 3.5439, -2.9151],
        [ 4.1395, -3.3189],
        [ 2.1859, -1.8846],
        [-4.2130,  4.5640]], grad_fn=<AddmmBackward0>)

In [13]:
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions

tensor([[1.8991e-04, 9.9981e-01],
        [9.9844e-01, 1.5640e-03],
        [9.9942e-01, 5.7625e-04],
        [9.8322e-01, 1.6783e-02],
        [1.5422e-04, 9.9985e-01]], grad_fn=<SoftmaxBackward0>)

In [14]:
labels=model.config.id2label

In [17]:
labels

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [18]:
print("Predictions")
for i in range(len(raw_inputs)):
    print(str(i+1) + ". " + labels[0] + " : " + str(float(predictions[i][0])))
    print("   " + labels[1] + " : " + str(float(predictions[i][1])))

Predictions
1. NEGATIVE : 0.00018990747048519552
   POSITIVE : 0.9998101592063904
2. NEGATIVE : 0.9984360337257385
   POSITIVE : 0.001564009115099907
3. NEGATIVE : 0.9994237422943115
   POSITIVE : 0.0005762474611401558
4. NEGATIVE : 0.9832171201705933
   POSITIVE : 0.01678292639553547
5. NEGATIVE : 0.0001542178652016446
   POSITIVE : 0.9998457431793213
