In [21]:
!pip install transformers



In [22]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [23]:
# Example sentences to classify.
raw_inputs = [
    "I've been waiting for a HUgging face course my whoel life!",
    "I hate this so much",
]

# padding=True pads the shorter sentences up to the longest one in the batch, so all rows
# have the same length and can be stacked into a single tensor.
# truncation=True cuts any sentence that is longer than the maximum length the model accepts.
# return_tensors="pt" returns the result as PyTorch tensors.
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [24]:
# The tokenizer output has two entries. input_ids holds the numeric token ids of each
# sentence; attention_mask is 1 for real tokens and 0 for the padding positions, so the
# model does not pay attention to the padding.
inputs

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,  2227,
          2607,  2026,  2040,  2884,  2166,   999,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [25]:
# Now that we know how the tokenizer works, let's take a look at the model.

from transformers import AutoModel

# AutoModel.from_pretrained loads only the base model for this checkpoint, leaving out
# the task-specific head (the fine-tuned final part of the model).
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)
# The output is a high-dimensional tensor: the model's hidden-state representation of
# the sentences passed to it.
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

# Printed shape: torch.Size([2, 17, 768])

# That is 2 sentences of 17 tokens each (including special tokens and padding) and a
# hidden size of 768.
# This is only the first step of our classification problem; we still need one more thing.

torch.Size([2, 17, 768])


In [26]:
from transformers import AutoModelForSequenceClassification

# AutoModelForSequenceClassification loads the same checkpoint together with its
# sequence-classification head (the part trained for this task), so the output now
# contains logits that we can use for our classification problem.

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits)

# The result is:

# tensor([[-3.2224,  3.4311],
#         [ 4.2141, -3.4158]], grad_fn=<AddmmBackward0>)

# The matrix has one row per sentence and one column per possible label.

tensor([[-3.2224,  3.4311],
        [ 4.2141, -3.4158]], grad_fn=<AddmmBackward0>)


In [27]:
#We can see that the logits are not probabilities. To make sense of them we need to move on to the next step: postprocessing.

In [28]:
!pip install torch



In [29]:
import torch

# Softmax over the last dimension turns each row of logits into a probability
# distribution: every value lies between 0 and 1 and each row sums to 1.
predictions = torch.softmax(outputs.logits, dim=-1)
print(predictions)

# Printed result:
#
# tensor([[1.2878e-03, 9.9871e-01],
#         [9.9951e-01, 4.8549e-04]], grad_fn=<SoftmaxBackward0>)

tensor([[1.2878e-03, 9.9871e-01],
        [9.9951e-01, 4.8549e-04]], grad_fn=<SoftmaxBackward0>)


In [30]:
# The final step is to check which position corresponds to the positive label and which
# to the negative one. That mapping is given by the id2label field of the model configuration:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [None]:
#Given these results, the outcome reads as:
#-first sentence: NEGATIVE 0.0012878 -> 0.13%, POSITIVE 0.99871 -> 99.87%
#-second sentence: NEGATIVE 0.99951 -> 99.95%, POSITIVE 0.00048549 -> 0.05%