# LAB07 - NLP2: HuggingFace Transformers

## 1. Install dependencies

In [1]:
!pip install -q datasets transformers evaluate scikit-learn xformers
!pip install -q --upgrade accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m111.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

## 2. Imports

In [2]:
import evaluate
import torch
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, Trainer, AutoModelForSequenceClassification, TrainingArguments, PreTrainedModel, PreTrainedTokenizer, set_seed, EvalPrediction
from sklearn.model_selection import train_test_split

In [3]:
# Set seed for reproductibility
seed = 42
set_seed(seed)

## 3. Data preprocessing

As a model, we use distilbert as pre-trained model, as it is light and will fine-tune fast.

In [4]:
raw_datasets = load_dataset("imdb") # Load idmb dataset
checkpoint = "distilbert-base-uncased" # Using distilbert
tokenizer = AutoTokenizer.from_pretrained(checkpoint, return_tensors="pt")

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Now we create a batch of examples using DataCollatorWithPadding. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [5]:
def tokenize_function(example: dict) -> dict:
    """
    Tokenizes the text field of a given example using the tokenizer.
    Args:
        example (dict): The example containing the "text" field.
    Returns:
        dict: The example with the "text" field tokenized.
    """
    return tokenizer(example["text"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) # Apply tokenization to the whole dataset
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Define a data collator

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

## 4. Fine-tune the distilbert model

We use a smaller subset from the original dataset in order to reduce the training time.

In [6]:
# Define the desired size of the subset
train_subset_size = 10000 
test_subset_size = 2000

# Select a smaller subset of examples from the original dataset
train_subset = tokenized_datasets["train"].shuffle(seed=seed).select(range(train_subset_size))
test_subset = tokenized_datasets["test"].shuffle(seed=seed).select(range(test_subset_size))

# Splitting the train dataset into train and evaluation subsets
train_eval_ds =  train_subset.train_test_split(test_size=0.2)
train_dataset, eval_dataset = train_eval_ds["train"], train_eval_ds["test"] 

Before we start training our model, we create a map of the expected ids to their labels with id2label and label2id

In [7]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [8]:
# Define the training arguments for the training process
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch", num_train_epochs=1, optim="adamw_torch")

# Load the pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

# Create the Trainer object for training and evaluation
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

In [9]:
trainer.train() # Launch training

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.2952,0.270749


TrainOutput(global_step=1000, training_loss=0.3452295379638672, metrics={'train_runtime': 424.4273, 'train_samples_per_second': 18.849, 'train_steps_per_second': 2.356, 'total_flos': 995480082081216.0, 'train_loss': 0.3452295379638672, 'epoch': 1.0})

### 4.1 Evaluation of the model

In [10]:
# After training, perform final evaluation on the test dataset
test_metrics = trainer.evaluate(test_subset)
print("Test metrics:", test_metrics)

Test metrics: {'eval_loss': 0.2608216106891632, 'eval_runtime': 31.2441, 'eval_samples_per_second': 64.012, 'eval_steps_per_second': 8.002, 'epoch': 1.0}


### Bonus :  Fine-tuning our model using the accuracy as evaluation instead of the loss (default).

In [11]:
def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """
    Computes the metrics for evaluation predictions.

    Args:
        eval_preds (EvalPrediction): The evaluation predictions containing logits and labels.

    Returns:
        dict: A dictionary containing the computed metrics.

    """
    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


# Define the training arguments for the training process
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch", num_train_epochs=1, optim="adamw_torch")

# Load the pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Create the Trainer object for training and evaluation
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics= compute_metrics,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

In [12]:
trainer.train() # Launch training

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2921,0.283726,0.9115


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TrainOutput(global_step=1000, training_loss=0.34764454650878907, metrics={'train_runtime': 419.5924, 'train_samples_per_second': 19.066, 'train_steps_per_second': 2.383, 'total_flos': 995480082081216.0, 'train_loss': 0.34764454650878907, 'epoch': 1.0})

We can already see the a good accuracy with only one epoch

## 6. Using now a fully pre-trained model
### 6.1 Evaluate the model in term of accuracy on the test data

In [13]:
# Define and Load the distibert model fully trained on the imdb dataset
trained_check_point = "mvonwyl/distilbert-base-uncased-imdb"

tokenizer = AutoTokenizer.from_pretrained(trained_check_point)
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(trained_check_point, num_labels=2, id2label=id2label, label2id=label2id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/360 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [14]:
# Create an evaluator object for text classification task
task_evaluator = evaluate.evaluator("text-classification")

# Compute evaluation results for the fine-tuned model on the test subset
eval_results = task_evaluator.compute(
    model_or_pipeline=fine_tuned_model,
    data=test_subset,
    metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
    tokenizer=tokenizer,
    random_state=seed
)

# Print the evaluation results
eval_results

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

{'accuracy': 0.926,
 'recall': 0.917,
 'precision': 0.9338085539714868,
 'f1': 0.9253279515640768,
 'total_time_in_seconds': 24.37105335299998,
 'samples_per_second': 82.06456943125144,
 'latency_in_seconds': 0.01218552667649999}

### 6.2 Analysis of two wrongly classified samples

In [15]:
misclassified_samples = []
num_misclassified = 0

# Move the model to the same device as the input tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fine_tuned_model = fine_tuned_model.to(device)

for example in test_subset:
    input_text = example["text"]
    true_label = example["label"]

    # Tokenize the input text
    inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    
    # Move the input tensors to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    # Make predictions with the fine-tuned model
    outputs = fine_tuned_model(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    
    # Check if the prediction matches the true label
    if predicted_label != true_label:
        misclassified_samples.append({"text": input_text, "true_label": true_label, "predicted_label": predicted_label})
        num_misclassified += 1
    
    # Stop after finding four misclassified samples
    if num_misclassified >= 4:
        break


Let's take a look at two samples :

In [16]:
display(misclassified_samples[1], misclassified_samples[3])

{'text': 'Intended as light entertainment, this film is indeed successful as such during its first half, but then succumbs to a rapidly foundering script that drops it down. Harry (Judd Nelson), a "reformed" burglar, and Daphne (Gina Gershon), an aspiring actress, are employed as live window mannequins at a department store where one evening they are late in leaving and are locked within, whereupon they witness, from their less than protective glass observation point, an apparent homicide occurring on the street. The ostensible murderer, Miles Raymond (Nick Mancuso), a local sculptor, returns the following day to observe the mannequins since he realizes that they are the only possible witnesses to the prior night\'s violent event and, when one of the posing pair "flinches", the fun begins. Daphne and Harry report their observations at a local police station, but when the detective taking a crime report remembers Harry\'s criminal background, he becomes cynical. There are a great many w

{'text': 'Sex, drugs, racism and of course you ABC\'s. What more could you want in a kid\'s show!<br /><br />------------------------------------------- -------------------------------------------<br /><br />"User Comment Guidelines <br /><br />Please note there is a 1,000 word limit on comments. The recommended length is 200 to 500 words. The minimum length for comments is 10 lines of text. Comments which are too short or have been padded with junk text will be discarded. You may only post a single comment per title. <br /><br />What to include: Your comments should focus on the title\'s content and context. The best reviews include not only whether you liked or disliked a movie or TV-series, but also why. Feel free to mention other titles you consider similar and how this one rates in comparison to them. Comments that are not specific to the title will not be posted on our site. Please write in English only and note that we do not support HTML mark-up within the comments"',
 'true_la

The model's predictions might have been incorrect due to various reasons, such as:
- `Ambiguity in the text`: The samples could contain ambiguous language or complex sentence structures that make it challenging for the model to accurately understand the sentiment. The model may struggle with nuanced or sarcastic language, leading to misclassifications.

- `Out-of-domain examples`: If the test set contains examples that are significantly different from the training data, the model might not have learned to generalize well to these unseen instances. The model could make incorrect predictions for such out-of-domain examples.

- `Lack of contextual understanding`: DistilBERT, being a transformer-based model, captures the context and relationships between words in a sequence. However, it may still struggle with understanding specific domain knowledge or cultural references that are relevant for accurate sentiment analysis.

### 6.3 Comparison of the model with others

When comparing a pre-trained transformer-based model to a naive Bayes classifier and recurrent models like RNN or LSTM for production use, there are several advantages and disadvantages to consider:

Advantages of Pre-trained Transformer-Based Model:
1. **Better performance with complex data**: Transformer-based models, such as BERT, have shown state-of-the-art performance on a wide range of natural language processing (NLP) tasks, including sentiment analysis. They can capture complex linguistic patterns and contextual information effectively, making them suitable for handling diverse and nuanced language data.

2. **Transfer learning**: Pre-trained transformer-based models are trained on large-scale datasets, allowing them to learn useful representations of text. This pre-training enables transfer learning, meaning the model can be fine-tuned on a specific task or domain with relatively limited labeled data. This is beneficial when training data is scarce or when adapting the model to a specific application.

3. **Ability to learn from large-scale datasets**: Transformer-based models benefit from their ability to leverage massive amounts of data during pre-training, which can lead to improved generalization and performance on downstream tasks.

4. **Capture long-range dependencies**: Transformers use self-attention mechanisms to capture dependencies between words in a sentence, allowing them to model long-range dependencies more effectively than recurrent models like RNN or LSTM. This is particularly advantageous for sentiment analysis tasks, where the sentiment may depend on the entire input sequence.

Disadvantages of Pre-trained Transformer-Based Model:
1. **Higher computational requirements**: Transformer models are more computationally expensive compared to simpler models like naive Bayes. They require more memory and processing power for training and inference, which can impact deployment on resource-constrained environments.

2. **Training and fine-tuning complexity**: Fine-tuning a pre-trained transformer-based model may require more effort and expertise compared to implementing a naive Bayes classifier. It involves hyperparameter tuning, data preprocessing, and potentially dealing with larger datasets. Additionally, transformer models often have a large number of parameters, which can increase training time and memory requirements.

Advantages and Disadvantages Compared to Naive Bayes:
- Compared to naive Bayes, transformer models generally offer better performance on complex language tasks, especially when dealing with large-scale datasets and capturing intricate patterns.
- Naive Bayes models are computationally lightweight and have low memory requirements, making them more suitable for deployment in resource-constrained environments.
- Naive Bayes assumes independence between features, which may limit its ability to capture complex dependencies in text data. Transformer models excel at modeling dependencies and have a better understanding of context.

Advantages and Disadvantages Compared to RNN/LSTM:
- Transformers can capture long-range dependencies more effectively than RNN or LSTM models due to their self-attention mechanism.
- RNN/LSTM models are typically better suited for sequential data, as they maintain an internal state that can capture sequential patterns. Transformers lack sequential memory and process the entire sequence at once.
- Transformers require less training time compared to RNN/LSTM models, which can be advantageous when working with large datasets.
- RNN/LSTM models may be more interpretable due to their sequential nature, while transformer models are often considered black-box models.

Ultimately, the choice between using a pre-trained transformer-based model, naive Bayes, or recurrent models like RNN or LSTM depends on the specific requirements of the project, the available resources, and the trade-offs between performance, computational requirements, and interpretability.

In [61]:
from typing import List
from transformers import PreTrainedModel, PreTrainedTokenizer

def predict_sliding_window_long(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, input_text: str, window_size: int = 512, stride: int = 64) -> int:
    """
    Performs sliding window predictions on a long input sequence.

    Args:
        model (PreTrainedModel): The pre-trained model for sequence classification.
        tokenizer (PreTrainedTokenizer): The tokenizer associated with the model.
        input_text (str): The input text to predict on.
        window_size (int): The maximum window size for the sliding window (default: 512).
        stride (int): The stride value for the sliding window (default: 64).

    Returns:
        List[int]: The list of predicted labels for each window.

    Raises:
        ValueError: If the input sequence is empty.

    """
    device = next(model.parameters()).device
    tokenized_input = tokenizer(input_text, truncation=True, padding="max_length", max_length=window_size, return_tensors="pt")
    input_ids = tokenized_input["input_ids"].squeeze(0).to(device)
    attention_mask = tokenized_input["attention_mask"].squeeze(0).to(device)

    num_tokens = input_ids.size(0)
    predictions = []

    if num_tokens <= window_size:
        # If the input fits within a single window, make predictions directly
        window_inputs = {
            "input_ids": input_ids.unsqueeze(0),
            "attention_mask": attention_mask.unsqueeze(0)
        }
        outputs = model(**window_inputs)
        window_predictions = torch.argmax(outputs.logits, dim=1).tolist()
        predictions.extend(window_predictions)
    else:
        # If the input exceeds the window size, apply sliding window technique
        for start in range(0, num_tokens, stride):
            end = start + window_size

            # Ensure the window does not exceed the number of tokens
            if end > num_tokens:
                end = num_tokens

            # Extract the current window
            window_input_ids = input_ids[start:end]
            window_attention_mask = attention_mask[start:end]

            # Make predictions for the current window
            window_inputs = {
                "input_ids": window_input_ids.unsqueeze(0),
                "attention_mask": window_attention_mask.unsqueeze(0)
            }
            outputs = model(**window_inputs)
            window_predictions = torch.argmax(outputs.logits, dim=1).tolist()

            predictions.extend(window_predictions)

    return predictions[0]

In [62]:
def find_long_review(tokenized_datasets: Dataset, tokenizer: PreTrainedTokenizer, max_length: int = 512) -> list:
    """
    Finds a review longer than the specified maximum length.

    Args:
        tokenized_datasets (Dataset): The tokenized dataset.
        tokenizer (PreTrainedTokenizer): The tokenizer associated with the dataset.
        max_length (int): The maximum allowed length for a review (default: 512).

    Returns:
        tuple[str, int]: A tuple containing the input text and the number of tokens if a long review is found,
            otherwise (None, None).

    """
    all_long_reviews = []
    indices_long_reviews = []

    for i, example in enumerate(test_subset):
        input_text = example["text"]
        num_tokens = len(tokenizer.tokenize(input_text))

        if num_tokens > max_length:
            all_long_reviews.append((input_text, num_tokens))
            indices_long_reviews.append(i)

    return all_long_reviews, indices_long_reviews

# Find a long review in the test subset
long_reviews, indices_long_reviews = find_long_review(test_subset, tokenizer)

len(indices_long_reviews)

253

In [65]:
from sklearn.metrics import accuracy_score, f1_score

# Get the expected predictions
expected_predictions = test_subset.select(indices_long_reviews)['label']

# Make the predictions
predictions = []
for long_review, num_tokens in long_reviews:
    predictions.append(predict_sliding_window_long(model, tokenizer, long_review))

# Calculate accuracy
accuracy = accuracy_score(expected_predictions, predictions)

# Calculate F1 score
f1 = f1_score(expected_predictions, predictions)

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.8774703557312253
F1 Score: 0.8774703557312253


Finally, we still get a pretty good accuracy and F1 score with the sliding window techniques.