In [1]:
import transformers
import numpy as np

In [2]:
from datasets import load_dataset


# Load the IMDB dataset by name; its "train" and "test" splits are read in the next cells.
dataset = load_dataset(path="imdb")



  0%|          | 0/3 [00:00<?, ?it/s]

## Fine-tune the model on the training data

In [3]:
# Pull the review texts and their labels out of the train and test splits.
train_texts = dataset["train"]["text"]
train_labels = dataset["train"]["label"]
test_texts = dataset["test"]["text"]
test_labels = dataset["test"]["label"]

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the training reviews for validation.
# The split is taken straight from dataset["train"] (not from train_texts/train_labels,
# which this cell overwrites) so that re-running the cell does not shrink the training
# set a second time. random_state pins the shuffle so the split is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    dataset["train"]["text"],
    dataset["train"]["label"],
    test_size=0.1,
    random_state=42,
)

Let’s use the DistilBert tokenizer.
We’ll pass truncation=True and padding=True, which ensures that all of our sequences are padded to the same length and truncated to be no longer than the model’s maximum input length.

In [5]:
# DistilBertForSequenceClassification is imported in this cell because it is used in this
# cell; relying on the import in the later training cell raises NameError on a fresh kernel.
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertModel,
    DistilBertTokenizer,
)

# Tokenizer and classification model both come from the same base checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Tokenize the three splits in order (train, validation, test) with identical
# truncation/padding settings.
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts, test_texts)
)

Now, let’s turn our labels and encodings into a Dataset object.

In [7]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    """Torch dataset pairing per-example encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding one
            entry per example (e.g. the tokenizer output built in the cell above).
        labels: Indexable collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, defined as the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per key of ``encodings`` plus a ``'labels'``
        tensor built from ``labels[idx]``.
        """
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap each split's encodings and labels in an IMDbDataset.
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
val_dataset = IMDbDataset(encodings=val_encodings, labels=val_labels)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

Create a model to fine-tune, define the TrainingArguments and instantiate a Trainer


In [8]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Training configuration shared by every Trainer in this notebook.
training_args = TrainingArguments(
    # Output and log directories.
    output_dir="./results",
    logging_dir="./logs",
    # One epoch; batch sizes are per device.
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Learning-rate warmup steps and weight-decay strength.
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
)

# Load the pretrained base checkpoint into a sequence-classification model.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased"
)

# Trainer wired to the train split, with the validation split as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Step,Training Loss
10,0.6971
20,0.7018
30,0.6883
40,0.6787
50,0.682
60,0.664
70,0.6563
80,0.6073
90,0.5253
100,0.462


TrainOutput(global_step=1407, training_loss=0.2967315025166916, metrics={'train_runtime': 1025.481, 'train_samples_per_second': 21.941, 'train_steps_per_second': 1.372, 'total_flos': 2980516469760000.0, 'train_loss': 0.2967315025166916, 'epoch': 1.0})

## Bonus: Fine-tune your model using the accuracy as evaluation instead of the loss

In [10]:
from sklearn.metrics import accuracy_score

# Load the base checkpoint again so this run does not reuse the `model`
# object that the previous cell passed to its Trainer.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased"
)

def compute_metrics(eval_pred):
    """Compute classification accuracy for a ``(logits, labels)`` pair.

    The predicted class of each example is the index of its largest logit along
    the last axis. Returns a dict with the single key ``'accuracy'``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_classes)
    return {'accuracy': accuracy}

# Same setup as the previous training cell, plus compute_metrics for accuracy reporting.
# NOTE(review): training_args configures no evaluation strategy, so compute_metrics is
# presumably only invoked by explicit evaluate()/predict() calls, not during train() — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Step,Training Loss
10,0.6933
20,0.6992
30,0.6993
40,0.6909
50,0.6912
60,0.6843
70,0.684
80,0.6772
90,0.6521
100,0.6437


TrainOutput(global_step=1407, training_loss=0.3044549255567658, metrics={'train_runtime': 1048.6904, 'train_samples_per_second': 21.455, 'train_steps_per_second': 1.342, 'total_flos': 2980516469760000.0, 'train_loss': 0.3044549255567658, 'epoch': 1.0})

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Replace `tokenizer` and `model` with the published IMDB fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "mvonwyl/distilbert-base-uncased-imdb"
)
model = AutoModelForSequenceClassification.from_pretrained(
    "mvonwyl/distilbert-base-uncased-imdb"
)

In [12]:
# Re-encode every split with the tokenizer loaded in the previous cell.
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts, test_texts)
)

# Rebuild the datasets from the fresh encodings.
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
val_dataset = IMDbDataset(encodings=val_encodings, labels=val_labels)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

In [13]:
# Trainer around the currently loaded `model`; the cells below use it for evaluation and prediction.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [14]:
# Evaluate on the test split instead of the Trainer's default eval_dataset.
test_results = trainer.evaluate(eval_dataset=test_dataset)

In [15]:
# The accuracy returned by compute_metrics is read under the 'eval_accuracy' key.
test_accuracy = test_results["eval_accuracy"]
print(f"Test accuracy: {test_accuracy}")

Test accuracy: 0.92948


In [16]:
# Encode only the first 200 test reviews and build the example dataset from those
# encodings, so the encodings and the labels cover exactly the same 200 reviews
# (rather than pairing the full test_encodings with a 200-label slice).
example_encodings = tokenizer(test_texts[:200], truncation=True, padding=True)
example_dataset = IMDbDataset(example_encodings, test_labels[:200])

predictions = trainer.predict(example_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Compare against an explicit array of the same 200 labels; the resulting positions
# index into test_texts/test_labels directly because the slice starts at 0.
misclassified_indices = np.where(predicted_labels != np.asarray(test_labels[:200]))[0]
for i in misclassified_indices[:2]:
    print(f'True label: {test_labels[i]}\nText: [{test_texts[i]}]')

True label: 0
Text: [First off let me say, If you haven't enjoyed a Van Damme movie since bloodsport, you probably will not like this movie. Most of these movies may not have the best plots or best actors but I enjoy these kinds of movies for what they are. This movie is much better than any of the movies the other action guys (Segal and Dolph) have thought about putting out the past few years. Van Damme is good in the movie, the movie is only worth watching to Van Damme fans. It is not as good as Wake of Death (which i highly recommend to anyone of likes Van Damme) or In hell but, in my opinion it's worth watching. It has the same type of feel to it as Nowhere to Run. Good fun stuff!]
True label: 0
Text: [I'm the type of guy who loves hood movies from New Jack City to Baby Boy to Killa Season, from the b grade to the Hollywood. but this movie was something different. i am no hater and this movie was kinda enjoyable. but some bits were just weird. well the acting wasn't to good, compar

The model was wrong because these samples contain many more positive words than negative words. Also, we only use the beginning of each review (the first 512 tokens).

Naive Bayes makes strong independence assumptions between features. This can limit its ability to capture complex relationships between features, which can result in lower accuracy than transformers.

Compared to RNNs or LSTMs, transformers are faster and can be parallelized.

## Bonus

In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load a fresh tokenizer and a second copy of the fine-tuned checkpoint (`model2`).
tokenizer = AutoTokenizer.from_pretrained(
    "mvonwyl/distilbert-base-uncased-imdb"
)
model2 = AutoModelForSequenceClassification.from_pretrained(
    "mvonwyl/distilbert-base-uncased-imdb"
)

# Switch the tokenizer to left-side truncation before encoding the splits.
tokenizer.truncation_side = "left"

train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts, test_texts)
)

# Rebuild the datasets from the left-truncated encodings.
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
val_dataset = IMDbDataset(encodings=val_encodings, labels=val_labels)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

# Trainer around model2, used here only to evaluate on the test split.
trainer = Trainer(
    model=model2,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

test_results = trainer.evaluate(eval_dataset=test_dataset)
test_accuracy = test_results["eval_accuracy"]
print(f"Test accuracy: {test_accuracy}")

# Encode only the first 200 test reviews and build the example dataset from those
# encodings, so the encodings and the labels cover exactly the same 200 reviews
# (rather than pairing the full test_encodings with a 200-label slice).
example_encodings = tokenizer(test_texts[:200], truncation=True, padding=True)
example_dataset = IMDbDataset(example_encodings, test_labels[:200])

predictions = trainer.predict(example_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# misclassified_indices deliberately comes from the earlier right-truncation cell, so the
# same reviews can be compared against the new predictions.
for i in misclassified_indices[:2]:
    print(f'True label: {test_labels[i]}\nPredicted label: {predicted_labels[i]}\nText: [{test_texts[i]}]')

Test accuracy: 0.9304
True label: 0
Predicted label: 1
Text: [First off let me say, If you haven't enjoyed a Van Damme movie since bloodsport, you probably will not like this movie. Most of these movies may not have the best plots or best actors but I enjoy these kinds of movies for what they are. This movie is much better than any of the movies the other action guys (Segal and Dolph) have thought about putting out the past few years. Van Damme is good in the movie, the movie is only worth watching to Van Damme fans. It is not as good as Wake of Death (which i highly recommend to anyone of likes Van Damme) or In hell but, in my opinion it's worth watching. It has the same type of feel to it as Nowhere to Run. Good fun stuff!]
True label: 0
Predicted label: 1
Text: [I'm the type of guy who loves hood movies from New Jack City to Baby Boy to Killa Season, from the b grade to the Hollywood. but this movie was something different. i am no hater and this movie was kinda enjoyable. but some 

The accuracy is better, and some of the wrongly predicted labels are now correctly predicted thanks to left truncation instead of right truncation.