In [None]:
# install libraries

!pip install transformers datasets evaluate accelerate

# TODO: Models, tokenizers, handling multiple sequences

# Direct use of pipeline function

In [2]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")

classifier(
    ["The service was trash", "I loved the food!"]
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


[{'label': 'NEGATIVE', 'score': 0.9997699856758118},
 {'label': 'POSITIVE', 'score': 0.9998781681060791}]

In [3]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
sequence_to_classify = "Angela Merkel is a politician in Germany and leader of the CDU"
candidate_labels = ["politics", "economy", "entertainment", "environment"]
output = classifier(sequence_to_classify, candidate_labels, multi_label=False)
print(output)

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'sequence': 'Angela Merkel is a politician in Germany and leader of the CDU', 'labels': ['politics', 'economy', 'environment', 'entertainment'], 'scores': [0.9823215007781982, 0.007280233781784773, 0.005891901906579733, 0.0045063067227602005]}


# Individually do the 3 steps of pipeline

### checkpoint = "bert-base-uncased"


Purpose: General-purpose language model — not fine-tuned for any specific task, Output: Hidden states (e.g., for classification, you’d fine-tune the [CLS] token)

### checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

Ready-to-use sentiment classification model (positive/negative), Output: Class logits (e.g., [negative_score, positive_score]). But when you load it using AutoModel it will only load the base architecture! The head won't be loaded!

### 1. Tokenization

In [19]:
## step1: tokenize

from transformers import AutoTokenizer

# Ready-to-use sentiment classification model (positive/negative), Output: Class logits (e.g., [negative_score, positive_score])
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Purpose: General-purpose language model — not fine-tuned for any specific task, Output: Hidden states (e.g., for classification, you’d fine-tune the [CLS] token)
checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


### 2. AutoModel

What it loads: Only the base transformer model (e.g., BERT, RoBERTa, DistilBERT), without any task-specific head.

Use case: Use when you want raw hidden states or to build your own head for tasks like classification, question answering, etc.

Output: last_hidden_state, pooler_output, etc.

In [20]:
## step2: send input to model
# This is a standalone model without the head

from transformers import AutoModel

# Ready-to-use sentiment classification model (positive/negative), Output: Class logits (e.g., [negative_score, positive_score])
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Purpose: General-purpose language model — not fine-tuned for any specific task, Output: Hidden states (e.g., for classification, you’d fine-tune the [CLS] token)
checkpoint = "bert-base-uncased"

model = AutoModel.from_pretrained(checkpoint)

# **inputs unpacks a dictionary into keyword arguments.
# model(inputs) = model({ 'input_ids': tensor([[...]]), 'attention_mask': tensor([[...]])})
# model(**inputs) = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

torch.Size([2, 16, 768])


### 2. AutoModelForSequenceClassification

What it loads: The base model plus a classification head (typically a linear layer on top of the [CLS] token).

Use case: For text classification tasks, like sentiment analysis, spam detection, etc.

Output: logits — raw scores for each class.

In [22]:
## step 2.5: send input to model with a specific head

from transformers import AutoModelForSequenceClassification

# Ready-to-use sentiment classification model (positive/negative), Output: Class logits (e.g., [negative_score, positive_score])
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Purpose: General-purpose language model — not fine-tuned for any specific task, Output: Hidden states (e.g., for classification, you’d fine-tune the [CLS] token)
checkpoint = "bert-base-uncased"

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

outputs = model(**inputs)
print(outputs.logits.shape)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([2, 2])


### 3. output postprocessing

In [13]:
# step 3: convert output logits into readable probabilities via a softmax layer
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [14]:
# To get the labels corresponding to each position, we can inspect the id2label attribute of the model config
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

# Model - detailed

In [1]:
# Loading a Transformer model that is already trained and saving it
from transformers import AutoModel

checkpoint = "bert-base-uncased"
model = AutoModel.from_pretrained(checkpoint)
model.save_pretrained("my_model")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [2]:
# Use transformer model for inference
import torch

sequences = ["Hello!", "Cool.", "Nice!"]

# after using tokenizer we get:
encoded_sequences = [
[101, 7592, 999, 102],
[101, 4658, 1012, 102],
[101, 3835, 999, 102], ]

# for easy conversion, need to have rectangular (matrix) shape like above
model_inputs = torch.tensor(encoded_sequences)

output = model(model_inputs)
print(output)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-5.0937e-02,  1.0882e-01, -1.4107e-01,  ..., -1.2431e-01,
          -8.0330e-02,  2.8579e-01],
         [-6.7714e-01, -5.4644e-01,  8.7818e-02,  ..., -5.7524e-02,
           3.5947e-02, -3.0803e-01],
         [-1.0903e+00, -9.9962e-01, -5.6360e-01,  ...,  3.2317e-01,
          -2.7725e-01, -1.4627e-01],
         [ 8.3018e-01,  5.0071e-02, -2.2515e-01,  ...,  3.2161e-01,
          -6.4891e-01, -2.4565e-01]],

        [[-2.8582e-01, -1.8636e-02,  7.9945e-02,  ..., -3.2150e-01,
           3.1830e-01,  5.7759e-01],
         [ 4.5425e-01, -5.5394e-01,  3.9479e-01,  ..., -2.2183e-01,
           1.8811e-01,  1.0380e-01],
         [-2.7685e-01, -1.0924e+00,  2.2841e-01,  ...,  2.6004e-01,
           3.7289e-01,  6.6873e-02],
         [ 9.7986e-01, -2.8927e-02, -1.4129e-01,  ...,  3.6114e-01,
          -6.3582e-01, -1.5223e-01]],

        [[-7.1054e-04,  1.6308e-01, -1.2002e-01,  ..., -1.5940e-02,
          -5.3252e-02,  3

# Tokenization - detailed

In [4]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer("Using a Transformer network is simple")

tokenizer.save_pretrained("my_tokenizer")

('my_tokenizer/tokenizer_config.json',
 'my_tokenizer/special_tokens_map.json',
 'my_tokenizer/vocab.txt',
 'my_tokenizer/added_tokens.json',
 'my_tokenizer/tokenizer.json')

In [5]:
# 1. The tokenization process is done by the tokenize() method of the tokenizer.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)

print(tokens)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [6]:
# 2. Then the conversion to input IDs is handled by the convert_tokens_to_ids() tokenizer method:

ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

# These outputs, once converted to the appropriate framework tensor, can then be used as inputs to a model as seen earlier in this chapter.

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [7]:
decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
print(decoded_string)

Using a transformer network is simple


By now you should understand the atomic operations a tokenizer can handle: tokenization, conversion to IDs, and converting IDs back to a string. However, we’ve just scraped the tip of the iceberg.

# Multiple input sequences

In [14]:
# In the previous exercise you saw how sequences get translated into lists of numbers.
# Let’s convert this list of numbers to a tensor and send it to the model:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)

# This line will fail !
model(input_ids)

IndexError: too many indices for tensor of dimension 1

Oh no! Why did this fail? We followed the steps from the pipeline in section 2.
The problem is that we sent a single sequence to the model, whereas Transformers models expect multiple sentences by default. Here we tried to do everything the tokenizer did behind the scenes when we applied it to a sequence. But if you look closely, you’ll see that the tokenizer didn’t just convert the list of input IDs into a tensor, it added a dimension on top of it:

In [9]:
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
print(tokenized_inputs["input_ids"])

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])


In [10]:
# Let’s try again and add a new dimension. Notice torch.tensor([ids])

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


Batching is the act of sending multiple sentences through the model, all at once. If you only have one sentence, you can just build a batch with a single sequence:
batched_ids = [ids, ids]
This is a batch of two identical sequences!

Try it out! Convert this batched_ids list into a tensor and pass it through your model. Check that you obtain the same logits as before (but twice)!


In [11]:
batched_ids = [ids, ids]
input_ids = torch.tensor(batched_ids)
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


 use padding to make our tensors have a rectangular shape. Padding makes sure all our sentences have the same length by adding a special word called the padding token to the sentences with fewer values

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3373, -1.2163]], grad_fn=<AddmmBackward0>)


There’s something wrong with the logits in our batched predictions: the second row should be the same as the logits for the second sentence, but we’ve got completely different values!
This is because the key feature of Transformer models is attention layers that contextualize each token. These will take into account the padding tokens since they attend to all of the tokens of a sequence. To get the same result when passing individual sentences of different lengths through the model or when passing a batch with the same sentences and padding applied, we need to tell those attention layers to ignore the padding tokens. This is done by using an attention mask.

Attention masks are tensors with the exact same shape as the input IDs tensor, filled with 0s and 1s: 1s indicate the corresponding tokens should be attended to, and 0s indicate the corresponding tokens should not be attended to (i.e., they should be ignored by the attention layers of the model).

In [13]:
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


Longer sequences: With Transformer models, there is a limit to the lengths of the sequences we can pass the models. Most models handle sequences of up to 512 or 1024 tokens, and will crash when asked to process longer sequences. There are two solutions to this problem: Use a model with a longer supported sequence length, and truncate your sequences.

Models have different supported sequence lengths, and some specialize in handling very long sequences. Longformer is one example, and another is LED. If you’re working on a task that requires very long sequences, we recommend you take a look at those models.
Otherwise, we recommend you truncate your sequences by specifying the max_sequence_length parameter:
sequence = sequence[:max_sequence_length]


# Combine everything

In [15]:
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sequence = "I've been waiting for a HuggingFace course my whole life."
model_inputs = tokenizer(sequence)

In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9138]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


# Acquire IMDB dataset

In [None]:
# login to hugging face so that can share model later on

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load IMDb dataset

from datasets import load_dataset
imdb = load_dataset("imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
print(imdb["train"].features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


In [None]:
print(imdb['train'][0])

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

# 1) Finetune

In [None]:
# Pre-processing: load a DistilBERT tokenizer to preprocess the text field
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Create a preprocessing function to tokenize text and truncate sequences to be no longer than DistilBERT’s maximum input length:

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [None]:
# To apply the preprocessing function over the entire dataset, use the Datasets map function.
# Speed up map by setting batched=True to process multiple elements of the dataset at once

tokenized_imdb = imdb.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# A data collator pads sequences on the fly per batch, makes training more efficient, and is required because BERT expects uniform-length inputs

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
print(imdb['train'][0])
print(tokenized_imdb['train'][0])

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [None]:
# Including a metric during training is often helpful for evaluating model’s performance. You can quickly load a evaluation method with the Evaluate library. For this task, load the accuracy metric:

import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Create a function that passes predictions and the labels to compute to calculate the accuracy:

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
# Before training your model, create a map of the expected ids to their labels with id2label and label2id:

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [None]:
# You’re ready to start training your model now! Load DistilBERT with AutoModelForSequenceClassification along with the number of expected labels, and the label mappings:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## At this point, only three steps remain:

### 1) Define your training hyperparameters in TrainingArguments. The only required parameter is output_dir which specifies where to save your model. You’ll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the Trainer will evaluate the accuracy and save the training checkpoint.

### 2) Pass the training arguments to Trainer along with the model, dataset, tokenizer, data collator, and compute_metrics function.

### 3) Call train() to finetune your model.

Trainer guide: https://huggingface.co/docs/transformers/training#train-with-pytorch-trainer

In [None]:
training_args = TrainingArguments(
    output_dir="my_awesome_model", # where to save the model
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch", # reports the accuracy at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmusabumair005[0m ([33mmusabumair0191[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2219,0.211228,0.91952
2,0.1474,0.230162,0.9316


TrainOutput(global_step=3126, training_loss=0.2046397722308939, metrics={'train_runtime': 3391.5313, 'train_samples_per_second': 14.743, 'train_steps_per_second': 0.922, 'total_flos': 6556904415524352.0, 'train_loss': 0.2046397722308939, 'epoch': 2.0})

In [None]:
# Export metrics from a single run to a CSV file
# This snippet finds all the metrics saved for a single run and saves them to a CSV file.

import wandb
api = wandb.Api()

# run is specified by <entity>/<project>/<run_id>
run = api.run("musabumair0191/huggingface/h2447nkt")

# save the metrics for the run to a csv file
metrics_dataframe = run.history()
metrics_dataframe.to_csv("metrics.csv")

# 2) Inference

In [None]:
# Grab some text you’d like to run inference on
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."

In [None]:
# The simplest way to try out your finetuned model for inference is to use it in a pipeline().
# Instantiate a pipeline for sentiment analysis with your model, and pass your text to it.
# If you wanna do it manually, then have to use pytorch as given in the example: https://huggingface.co/docs/transformers/tasks/sequence_classification

from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="my_awesome_model")
classifier(text)

Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.9978958368301392}]

## to do inference manually, you would:

### Tokenize the text and return PyTorch tensors

### Pass your inputs to the model and return the logits

### Get the class with the highest probability, and use the model’s id2label mapping to convert it to a text label


# Zero-shot classification

In [8]:
from transformers import pipeline

themes = ["Normalization of failure", "Support systems", "Assessment and grading", "Faculty engagement", "Curriculum flexibility", "Changing the narrative", "Reducing pressure", "Practical suggestions", "Community building"]
subthemes = ["Sharing experiences", "Open discussion", "Academic support", "Mental health support", "Alternative grading schemes", "Reducing exam focus", "Retake opportunities", "Office hours", "Sharing personal experiences from faculty", "Course options", "Less rigid degree plans", "Reframing failure", "Celebrating effort", "Lowering impact on GPA", "Less emphasis on grades", "Practice tests", "Feedback and improvement", "Peer support", "Collaborative learning"]
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
sequence_to_classify = "Discussing role-models who have failed in engineering in the past but have succeeded in the long run, and reminding students that assessments in university are kind of irrelevant to how things are done in the real world"
output = classifier(sequence_to_classify, themes, multi_label=False)
print(output)
output = classifier(sequence_to_classify, subthemes, multi_label=False)
print(output)

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'sequence': 'Discussing role-models who have failed in engineering in the past but have succeeded in the long run, and reminding students that assessments in university are kind of irrelevant to how things are done in the real world', 'labels': ['Normalization of failure', 'Reducing pressure', 'Faculty engagement', 'Changing the narrative', 'Curriculum flexibility', 'Practical suggestions', 'Support systems', 'Community building', 'Assessment and grading'], 'scores': [0.31856563687324524, 0.2659843862056732, 0.24730169773101807, 0.07130768150091171, 0.04838566854596138, 0.0401296392083168, 0.003238603239879012, 0.002630274510011077, 0.00245638913474977]}
{'sequence': 'Discussing role-models who have failed in engineering in the past but have succeeded in the long run, and reminding students that assessments in university are kind of irrelevant to how things are done in the real world', 'labels': ['Reducing exam focus', 'Less emphasis on grades', 'Reframing failure', 'Lowering impact o