# Loading the IMDB dataset

In [1]:
from datasets import list_datasets, load_dataset, list_metrics, load_metric
# Print how many datasets are available (the count, not the dataset names).
# NOTE(review): the recorded output shows a warning pointing at this line —
# check whether list_datasets is deprecated in the installed `datasets` version.
print(len(list_datasets()))

  from .autonotebook import tqdm as notebook_tqdm
  print(len(list_datasets()))


53385


In [2]:
dataset = load_dataset('imdb')

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

Print an example:

In [4]:
dataset['train'][2]

{'text': "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />",
 'label': 0}

In [5]:
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

# Fine-tuning

In [6]:
from datasets import load_dataset
# NOTE(review): this reloads the same "imdb" dataset that was already loaded
# earlier in the notebook — presumably so this section can run on its own; confirm.
dataset = load_dataset("imdb")
# Show a single training example (index 100).
dataset["train"][100]

{'text': "Terrible movie. Nuff Said.<br /><br />These Lines are Just Filler. The movie was bad. Why I have to expand on that I don't know. This is already a waste of my time. I just wanted to warn others. Avoid this movie. The acting sucks and the writing is just moronic. Bad in every way. The only nice thing about the movie are Deniz Akkaya's breasts. Even that was ruined though by a terrible and unneeded rape scene. The movie is a poorly contrived and totally unbelievable piece of garbage.<br /><br />OK now I am just going to rag on IMDb for this stupid rule of 10 lines of text minimum. First I waste my time watching this offal. Then feeling compelled to warn others I create an account with IMDb only to discover that I have to write a friggen essay on the film just to express how bad I think it is. Totally unnecessary.",
 'label': 0}

Tokenize the reviews:

In [7]:
from transformers import AutoTokenizer
brt_tkn = AutoTokenizer.from_pretrained("bert-base-cased")
def generate_tokens_for_imdb(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Every review is truncated and padded to the tokenizer's maximum length,
    so all returned sequences have the same size.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw reviews.

    Returns:
        Whatever the module-level ``brt_tkn`` tokenizer returns for the batch.
    """
    review_texts = examples["text"]
    return brt_tkn(review_texts, truncation=True, padding="max_length")
# Tokenize every split in batches; map() keeps 'text' and 'label' and adds the
# tokenizer's output columns next to them.
# NOTE(review): this also tokenizes the 50,000-row "unsupervised" split, which
# the visible cells below never use.
tkn_datasets = dataset.map(generate_tokens_for_imdb, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████| 50000/50000 [00:21<00:00, 2309.14 examples/s]


In [8]:
tkn_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

Use only 200 data points each for the training set and the test set, as CUDA/GPU is not available:

In [9]:
# CPU-only run: keep both subsets tiny so fine-tuning finishes in a tolerable time.
SUBSET_SIZE = 200   # number of examples kept from each split
SHUFFLE_SEED = 42   # fixed seed so the same examples are drawn on every run

# Shuffle first, then take the leading SUBSET_SIZE rows, so the subset is a
# random sample rather than the first rows of the split.
training_dataset = tkn_datasets["train"].shuffle(seed=SHUFFLE_SEED).select(range(SUBSET_SIZE))
evaluation_dataset = tkn_datasets["test"].shuffle(seed=SHUFFLE_SEED).select(range(SUBSET_SIZE))

Load the BERT-based sequence classification model:

In [10]:
from transformers import AutoModelForSequenceClassification
# Load the same "bert-base-cased" checkpoint the tokenizer uses, with a
# classification head sized for the two IMDB labels ('neg' / 'pos').
mdl = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialize a metric for accuracy measurement using Hugging Face's `datasets` library.

We define a function, calculate_metrics, that takes model predictions and true labels, computes the class with the highest predicted probability (argmax), and then calculates and returns the accuracy of the predictions compared to the true labels.

In [None]:
!conda install scikit-learn -y

In [11]:
import numpy as np
from datasets import load_metric
mdl_metrics = load_metric("accuracy")
def calculate_metrics(eval_pred):
    """Compute the metric for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the class with the highest
            score along the last axis is taken as the prediction.

    Returns:
        The result of ``mdl_metrics.compute`` for the predicted classes
        against the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    # Highest-scoring class per example.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return mdl_metrics.compute(references=gold_labels, predictions=predicted_classes)

  mdl_metrics = load_metric("accuracy")


Set up the training configuration for a model to be trained with Hugging Face's Trainer class:

- The model is trained for 3 epochs and evaluated after each epoch.
- The training and evaluation artifacts are saved in a directory named "test_trainer".

In [None]:
!pip install accelerate -U

In [12]:
import accelerate

from transformers import TrainingArguments, Trainer
# Training configuration: artifacts go to "test_trainer", the model is evaluated
# at the end of every epoch, training runs for 3 epochs, and CUDA is disabled.
trng_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    no_cuda=True,
)

Instantiate a Trainer object that contains your model, the
training arguments, the datasets to be used for training and
testing, and the evaluation function:

In [14]:
# Bundle the model, its training configuration, both data subsets and the
# metric callback into a single Trainer.
Mdl_trainer = Trainer(
    model=mdl,
    args=trng_args,
    train_dataset=training_dataset,
    eval_dataset=evaluation_dataset,
    compute_metrics=calculate_metrics,
)

Train the model (i.e., fine-tune the model):

In [16]:
Mdl_trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.690663,0.535
2,No log,0.673304,0.58
3,No log,0.63646,0.66


TrainOutput(global_step=75, training_loss=0.6883013407389323, metrics={'train_runtime': 2605.773, 'train_samples_per_second': 0.23, 'train_steps_per_second': 0.029, 'total_flos': 157866633216000.0, 'train_loss': 0.6883013407389323, 'epoch': 3.0})

Save the fine-tuned trained model locally:

In [17]:
# No path is passed here — presumably the model lands in the Trainer's
# output_dir ("test_trainer"), which is where the inference section loads it from.
Mdl_trainer.save_model()

We can check the accuracy of the model using the following code:

In [18]:
# Run evaluation on the evaluation subset and keep the resulting metrics.
metrics = Mdl_trainer.evaluate(evaluation_dataset)
# Print the metrics under the "eval" heading (see the recorded output below).
Mdl_trainer.log_metrics("eval", metrics)
# Persist the same metrics (presumably under the Trainer's output_dir — confirm).
Mdl_trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =       0.66
  eval_loss               =     0.6365
  eval_runtime            = 0:04:24.06
  eval_samples_per_second =      0.757
  eval_steps_per_second   =      0.095


# Inference on unseen data

After fine-tuning and saving our model, we can now use it for inference on new data, such as classifying sentiments in IMDB movie reviews.


Load the fine-tuned model from the following path:

In [19]:
# Directory holding the fine-tuned model (same name as the training output_dir).
PATH = 'test_trainer/'
# local_files_only=True: load from the local directory only, without contacting the Hub.
md = AutoModelForSequenceClassification.from_pretrained(PATH, local_files_only=True)

Define a function for inference:

In [23]:
def make_classification(text):
    """Predict the sentiment class of one review or a batch of reviews.

    Uses the module-level tokenizer ``brt_tkn`` and fine-tuned model ``md``.

    Args:
        text: A single review (``str``) or a list of reviews.

    Returns:
        For a single string, a 0-dim tensor holding the predicted class index.
        For a list, a 1-D tensor with one class index per review.
        In the IMDB label scheme 0 is 'neg' and 1 is 'pos'.
    """
    # Imported locally so the function does not rely on torch having been
    # imported by another cell.
    import torch

    # Tokenize; inputs are truncated to 512 tokens and padded to the longest
    # item in the batch.
    inps = brt_tkn(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to("cpu")
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = md(**inps)
    # Softmax over the class axis to turn logits into probabilities.
    probabilities = outputs[0].softmax(dim=-1)
    # Best class per row. A bare argmax() would flatten the whole batch and
    # return one index, which is wrong when more than one review is passed.
    predictions = probabilities.argmax(dim=-1)
    if isinstance(text, str):
        # Single review: keep returning a scalar (0-dim) tensor.
        return predictions.squeeze(0)
    return predictions

First text for inference: 

In [30]:
text = "This show instantly brightens your day. Every character becomes endearing as you watch. Eagerly awaiting season 2."

In [32]:
print(make_classification(text))

tensor(1)


I.e., a positive review (label 1 corresponds to `pos`).


In [37]:
text = "The show didn't quite meet my expectations. I regret spending on popcorn, pizza, and burgers. Akshay should stick to comedy; these regal movies better suit actors with a more kingly aura. Overall, it felt like a waste."

In [38]:
print(make_classification(text))

tensor(0)


I.e., a negative review (label 0 corresponds to `neg`).