# Fine-tuning Sentiment Analysis

In this notebook we take a pre-trained DistilBERT model whose sentiment-analysis head is untrained. In other words, the head must be trained on labelled examples so that the model learns to categorize sequences of text as either 'positive' or 'negative'.

>This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-uncased). It was introduced in this [paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation). This model is uncased: it does not make a difference between english and English.

The model is fine-tuned on the SST-2 (sentiment analysis) task of the following benchmark, loaded from the Hugging Face Hub:

* [GLUE](https://gluebenchmark.com/) (General Language Understanding Evaluation)

In [1]:
import transformers
import torch
import evaluate
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
from transformers import pipeline
from datasets import load_dataset
from datasets import load_metric
import numpy as np
from torchinfo import summary
from typing import Tuple, Dict
import json

In [2]:
# GLUE = General Language Understanding Evaluation.
# Load the data of its sst2 (sentiment analysis) task.
raw_dataset = load_dataset(path="glue", name="sst2")

In [3]:
# Show the contents of the dataset. In essence we have the following splits:
# - training dataset
# - validation dataset
# - test dataset
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [4]:
# The model checkpoint to be used and the corresponding tokenizer.
# The same checkpoint name is reused when the classification model is created.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
# Let's tokenize a few samples: the first three sentences of the training split
sample_sentences = raw_dataset['train'][0:3]['sentence']
tokenized_samples = tokenizer(sample_sentences)
print(f"phrases: {sample_sentences}:\ntoken ids:{tokenized_samples}")

phrases: ['hide new secretions from the parental units ', 'contains no wit , only labored gags ', 'that loves its characters and communicates something rather beautiful about human nature ']:
token ids:{'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102], [101, 3397, 2053, 15966, 1010, 2069, 4450, 2098, 18201, 2015, 102], [101, 2008, 7459, 2049, 3494, 1998, 10639, 2015, 2242, 2738, 3376, 2055, 2529, 3267, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [6]:
# Tokenizer function to apply tokenization to all of the datasets
def tokenizer_fn(batch):
    """Tokenize the ``sentence`` entries of a batch of examples.

    Parameters
    ----------
    batch
        Mapping that holds the texts to tokenize under the key ``'sentence'``.

    Return
    ------
    The output of the notebook-level ``tokenizer`` for those sentences,
    called with ``truncation=True``.
    """
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [7]:
# Apply tokenizer_fn to every split of the dataset; batched=True hands the
# function batches of examples instead of one example at a time.
tokenized_datasets = raw_dataset.map(tokenizer_fn, batched=True)

In [8]:
# Training parameters: train for 1 epoch only, evaluating and saving a
# checkpoint once per epoch. Outputs go to the 'distil-bert-trainer' directory.
training_parameters = TrainingArguments(
    output_dir='distil-bert-trainer',
    num_train_epochs=1,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

In [9]:
# Create the model for sequence classification.
# The classification head (pre_classifier / classifier) is not part of the
# checkpoint and gets freshly (randomly) initialized here, so the RNGs are
# seeded first to make the starting weights repeatable across kernel restarts.
# NOTE(review): 42 is meant to mirror the TrainingArguments default seed —
# confirm against the installed transformers version.
transformers.set_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Display the concrete class that AutoModelForSequenceClassification
# resolved to for this checkpoint
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [11]:
# Show the model layers (the bare expression displays the model's repr,
# i.e. its nested module hierarchy)
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [12]:
# Print a summary of the model: per-layer parameter counts plus the
# total / trainable / non-trainable parameter numbers (torchinfo)
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [13]:
# Store a snapshot of the weights in a list so that we can compare them with
# the weights after training.
# The explicit .copy() matters: for a tensor that already lives on the CPU,
# .cpu() returns the same tensor and .numpy() shares its storage, so without a
# copy the optimizer's in-place updates would silently change this "before"
# snapshot as well.
params_before_training = [
    p.detach().cpu().numpy().copy()
    for _, p in model.named_parameters()
]

In [14]:
# Create a metric object for evaluating the performance: the GLUE metric
# configured for the sst2 task
metric = evaluate.load("glue", "sst2")

In [15]:
# Show how the metric object works: 2 of the 3 predictions below match
# their references
metric.compute(predictions=[1,0,1], references=[1,0,0])

{'accuracy': 0.6666666666666666}

In [16]:
# Create a compute_metrics function that is passed to the Trainer-object
def compute_metrics(logits_and_labels: Tuple) -> Dict:
    """Score the network output against the reference labels.

    Parameters
    ----------
    logits_and_labels: Tuple
        Pair of (logits produced by the network, correct labels)

    Return
    ------
    Dict
        The result of the notebook-level ``metric`` object, in the format
        {'accuracy': val}
    """
    raw_scores, gold_labels = logits_and_labels
    # The predicted class is the index of the largest logit along the last axis
    predicted_classes = np.argmax(raw_scores, axis=-1)
    scores = metric.compute(predictions=predicted_classes, references=gold_labels)
    return scores

In [17]:
# Trainer object used for training the model: trains on the tokenized
# training split and evaluates on the tokenized validation split
trainer = Trainer(
    model=model,
    args=training_parameters,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
# Run the fine-tuning. With the training parameters defined earlier this is a
# single epoch, followed by one evaluation on the validation split.
trainer.train()

  0%|          | 0/8419 [00:00<?, ?it/s]

{'loss': 0.4153, 'grad_norm': 1.6516419649124146, 'learning_rate': 4.7030526190759e-05, 'epoch': 0.06}
{'loss': 0.3501, 'grad_norm': 16.05232810974121, 'learning_rate': 4.4061052381518e-05, 'epoch': 0.12}
{'loss': 0.3311, 'grad_norm': 2.726660966873169, 'learning_rate': 4.109157857227699e-05, 'epoch': 0.18}
{'loss': 0.3078, 'grad_norm': 10.306052207946777, 'learning_rate': 3.812210476303599e-05, 'epoch': 0.24}
{'loss': 0.2751, 'grad_norm': 5.610777378082275, 'learning_rate': 3.515263095379499e-05, 'epoch': 0.3}
{'loss': 0.2768, 'grad_norm': 0.42856350541114807, 'learning_rate': 3.218315714455399e-05, 'epoch': 0.36}
{'loss': 0.2601, 'grad_norm': 0.3930196166038513, 'learning_rate': 2.9213683335312986e-05, 'epoch': 0.42}
{'loss': 0.2479, 'grad_norm': 50.17327117919922, 'learning_rate': 2.6244209526071984e-05, 'epoch': 0.48}
{'loss': 0.2418, 'grad_norm': 0.25097548961639404, 'learning_rate': 2.3274735716830978e-05, 'epoch': 0.53}
{'loss': 0.2291, 'grad_norm': 6.732576847076416, 'learning_

  0%|          | 0/109 [00:00<?, ?it/s]

{'eval_loss': 0.3753179907798767, 'eval_accuracy': 0.893348623853211, 'eval_runtime': 1.5762, 'eval_samples_per_second': 553.221, 'eval_steps_per_second': 69.153, 'epoch': 1.0}
{'train_runtime': 531.8828, 'train_samples_per_second': 126.624, 'train_steps_per_second': 15.829, 'train_loss': 0.2635245966024928, 'epoch': 1.0}


TrainOutput(global_step=8419, training_loss=0.2635245966024928, metrics={'train_runtime': 531.8828, 'train_samples_per_second': 126.624, 'train_steps_per_second': 15.829, 'train_loss': 0.2635245966024928, 'epoch': 1.0})

In [19]:
# Save the fine-tuned model to the 'trained_distilbert_model' directory so that
# it can later be loaded by path.
# NOTE(review): presumably the tokenizer passed to the Trainer is saved
# alongside the weights — confirm for the installed transformers version.
trainer.save_model("trained_distilbert_model")

In [20]:
# Load the saved model into a text-classification pipeline. When CUDA is
# available the pipeline is placed on the current CUDA device; otherwise no
# device argument is passed and the pipeline's default is used.
use_gpu = torch.cuda.is_available()
print("Model is execute in: GPU" if use_gpu else "Model is execute in: CPU")
device_kwargs = {"device": torch.cuda.current_device()} if use_gpu else {}
sequence_classifier = pipeline("text-classification", model="trained_distilbert_model", **device_kwargs)

Model is execute in: GPU


In [21]:
# Sentiment label to text: maps the label strings returned by the pipeline
# to human-readable sentiments.
# NOTE(review): the 0 -> negative / 1 -> positive assignment is assumed to
# follow the SST-2 label encoding — confirm against the dataset features.
label2sentiment={"LABEL_0": "negative", "LABEL_1": "positive"}

In [22]:
# Test the new model on a phrase with a clearly negative tone
phrase = "This movie sucks big time!"
sentiment = sequence_classifier(phrase)
# The pipeline returns a list with one result per input; take its label
predicted_label = sentiment[0]['label']
print(f"{phrase} -> sentiment: {label2sentiment[predicted_label]}")

This movie sucks big time! -> sentiment: negative


In [23]:
# Test the new model on a phrase with a clearly positive tone
phrase = "This movie is fantastic!"
sentiment = sequence_classifier(phrase)
# The pipeline returns a list with one result per input; take its label
predicted_label = sentiment[0]['label']
print(f"{phrase} -> sentiment: {label2sentiment[predicted_label]}")

This movie is fantastic! -> sentiment: positive


In [24]:
# Path of the configuration file stored in the saved model's directory
model_config_file="trained_distilbert_model/config.json"

In [25]:
# Read in the config-file of the trained model.
# The encoding is given explicitly so that the JSON file is decoded as UTF-8
# regardless of the platform's locale default.
with open(model_config_file, encoding="utf-8") as f:
    model_config = json.load(f)

In [26]:
# Show the model configuration, one "key: value" pair per line
for key, value in model_config.items():
    print(f"{key}: {value}")

_name_or_path: distilbert-base-uncased
activation: gelu
architectures: ['DistilBertForSequenceClassification']
attention_dropout: 0.1
dim: 768
dropout: 0.1
hidden_dim: 3072
initializer_range: 0.02
max_position_embeddings: 512
model_type: distilbert
n_heads: 12
n_layers: 6
pad_token_id: 0
problem_type: single_label_classification
qa_dropout: 0.1
seq_classif_dropout: 0.2
sinusoidal_pos_embds: False
tie_weights_: True
torch_dtype: float32
transformers_version: 4.39.1
vocab_size: 30522
