In [1]:
!pip install -q --upgrade transformers torch torchvision torchaudio
!pip install -q tokenizers==0.13.3
!pip install -q bitsandbytes transformers accelerate gradio thread6

[0m

In [None]:
# Transformers can't process text directly. Language models first use a tokenizer to convert text inputs into numbers that the model can understand.

# Tokenizers are responsible for:
    # Splitting the inputs into words, subwords, or symbols (like punctuation) that are called tokens
    # Mapping each token to an integer
    # Adding additional inputs that may be useful to the model

In [4]:
from transformers import AutoTokenizer

In [5]:
# Name of the checkpoint used throughout this example; the model cells further down reuse it.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Fetch the tokenizer that belongs to this checkpoint so we can see how a tokenizer works.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [9]:
# The example sentences we're going to pass through the tokenizer
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Next we call the tokenizer on the text, along with arguments we'll touch on later.
# For now, focus on the concept of what the tokenizer does to the text above.
inputs = tokenizer(raw_inputs, padding=True, return_tensors='pt')

# Show the encoded batch that was just created
print(inputs)

# The output is a dictionary containing 2 keys: input_ids and attention_mask
# The input ids are the numerical representation of the text above; the attention mask will be discussed later

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [7]:
# we can now download the model similarly to how we downloaded the tokenizer
from transformers import AutoModel

In [None]:
# Load the base Transformer for the same checkpoint the tokenizer came from
model = AutoModel.from_pretrained(pretrained_model_name_or_path=checkpoint)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

### 1. Transformers 
When a model processes input data, it produces outputs known as "hidden states" or "features".

### 2. Hidden States/Features:
These hidden states are essentially high-dimensional vectors that capture the contextual understanding of the input by the Transformer model. Think of these vectors as the Transformer's interpretation or representation of the input data.

### 3. Heads:
While hidden states are informative, they are not the end of the processing pipeline. Depending on the task at hand (text classification, text generation), different heads can be attached to the base model. Each task might require a different type of processing in its head, but the underlying architecture (the base Transformer module) remains the same.

### 4. High-dimensional vector:
When the Transformer processes input data, the output is a tensor with three dimensions:
    * Batch size: Refers to how many sequences are processed simultaneously.
    * Sequence length: Refers to the length of the input sequence's numerical representation.
    * Hidden size: Refers to the size or dimensionality of the hidden state (the vector) for each input. This can be quite large (e.g., 768 for smaller models, and even 3072 or more for bigger ones). Hence, the term "high-dimensional."

In [21]:
# The image below shows how the model handles the hidden states and the head

# Image is the IPython helper that renders a picture as the cell output
from IPython.display import Image

# Address of the diagram to display
diagram_url = "https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter2/transformer_and_head-dark.svg"

# Last expression of the cell, so the notebook renders it
Image(url=diagram_url, height=900, width=900)

In [18]:
# Run the tokenized batch through the base model to get the "hidden states" / "features"
outputs = model(**inputs)
# Display the size of the hidden states: batch size, sequence length, hidden size
outputs.last_hidden_state.size()

torch.Size([2, 16, 768])

In [None]:
# There are many different architectures available in 🤗 Transformers, each designed around a specific task

In [22]:
from transformers import AutoModelForSequenceClassification

# Same checkpoint, this time loaded through the sequence-classification class.
# Note: this rebinds `model`, replacing the base model loaded earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=checkpoint
)
# Feed the same tokenized batch through the new model
outputs = model(**inputs)

In [23]:
# This is the same model we loaded a while ago, but with a different head.
# The head changes the dimensionality of the output: it takes the high-dimensional
# vectors as input and returns vectors containing two values (one per label).
# The size is [2, 2] because we passed in the 2 sentences from earlier in this notebook.
print(outputs.logits.size())

torch.Size([2, 2])


In [24]:
# Postprocessing the output

# The model outputted [-1.5607,  1.6123] for the first and [ 4.1692, -3.3464] for the second sentence.
# The correct term for these raw values is "logits".
# The values make little sense on their own because they are not yet normalized.
# They need to be passed through a softmax layer to be converted to probabilities
# (for the current head: the probability of each sentence being negative or positive).

print(outputs["logits"])

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [25]:
# Pass the logits through a softmax to turn them into probabilities: these are our predictions
import torch

# dim=-1 normalizes across the label scores of each sentence
predictions = torch.softmax(outputs.logits, dim=-1)
predictions

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)

In [26]:
# The id2label attribute of the model config tells us which label each probability belongs to.
# The first number is NEGATIVE and the second is POSITIVE:
#   First sentence:  NEGATIVE: 0.0402, POSITIVE: 0.9598
#   Second sentence: NEGATIVE: 0.9995, POSITIVE: 0.0005

model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}