In [1]:
# loading a small labeled dataset
from datasets import load_dataset 
# importing hugging faces dataset loader

dataset = load_dataset("imdb", split="train[:5000]") 
# load 5000 movie reviews for training
# containing both positive and negative reviews

test_dataset = load_dataset("imdb", split="test[:1000]") 
# load 1000 movie reviews for testing
# also containing positive and negative reviews

In [2]:
# tokenizing and preprocessing text
from transformers import DistilBertTokenizerFast 
# importing tokenizer for distilbert

# distilbert is a smaller, faster and liter version of bert which is a powerful nlp model 
# created by hugging face
# bert was developed by google
# we are using distilbert to classify text sentiments
# distilbert is good for small/medium size nlp models
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased') 
# load the pre trained distilbert tokenizer

# function to tokenize text reviews
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)
    # this line converts raw text into tokens(numbers), that the model can understand.
    # batch['text']: gets the list of reviews
    # padding=True: makes all sequences of same length by adding extra (PAD) tokens
    # truncation=True: cut of extra long reviews that exceeds models max length

train_tokenized = dataset.map(tokenize, batched=True)
# this applies the tokenize function to every reviews in the training dataset
# batched=True: it processed multiple examples at once, making the whole thing faster
# the final result is a dataset of tokens that will be used to train the model

test_tokenized = test_dataset.map(tokenize, batched=True)
# applies the tokenize function to the test dataset

train_tokenized = train_tokenized.rename_column("label","labels")
test_tokenized = test_tokenized.rename_column("label","labels")

In [13]:
# loading pretrained distilbert and setup for training

import accelerate

# distilbert is pre trained for general language task hence it is needed to be trained
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments  
# DistilBertForSequenceClassification: is the pre trained model
# trainer: this makes training easier by handeling training loops, evaluation, logging
# trainingarguments: a class used to define training settings

# load pre trained distilbert model for 2 class classification (positive and negative)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
# DistilBertSequenceClassification: this version of distilbert have a small neural network layer on the top of base model
# from_pretrained('distilbert-base-uncased'): loads the model which is trained in english language (lower case)
# num_labels=2: classification head should classify 2 output probabilities (0,1) 

# we are setting training options like batch size, epochs, logging
train_args = TrainingArguments(output_dir='./results', # this is the folder where all the model checkpoints and results will be saved
                                  eval_strategy="epoch", # this value evaluate the model based on testing data after every time training is done
                                  logging_dir='./logs', # saves loss, accuracy and all other training logs in a folder
                                  per_device_train_batch_size=3, # trains 3 samples at a time, also called batch size
                                  per_device_eval_batch_size=3, # test 3 samples at a time 
                                  num_train_epochs=3, # trains the model 3 times using the training dataset
                                  save_strategy="epoch", # save the model to disk after each epoch cycle
                                  logging_steps=10 # prints out logs like loss after every 10 steps
                                 )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score
# this line imports metric functions
# precision_score: measures how many of the predicted positives are correct
# recall_score: measures how many of the actual positives are predicted
# f1_score: harmonic mean of precision and recall, balances both

# function to compute performance scores
def compute_metrics(pred):
    labels = pred.label_ids 
    # this gives the actual true labels from the dataset
    
    preds = pred.predictions.argmax(axis=1) 
    # this extracts predicted class labels 0 and 1 from models raw output
    # when model gives outputs, it does not mention positive and negative directly
    # instead it gives 2 numbers(logits), one for class 0 and one for class 1 
    # logits are unnormalized predicted scores
    # to determine which class to be predicted we use argmax(axis=1)
    # axis=1: operation across rows 
    # axis=0: operation across column

    precision = precision_score(labels, preds)
    # this line calculates precision
    # precision is the total number of models predicted correct answers
    # labels: actual answers/ sentiments
    # preds: predicted value (0/1) 
    # precision_score(): this function compares the actual answers to the prediction and gives the predicted correct answer

    recall = recall_score(labels, preds)
    # calculates the recall score
    # recall: measures how well the model can predict the correct answers  

    f1 = f1_score(labels, preds)
    # f1 measures the models performance with unbalanced dataset
    # based on recall and precision

    return {"precision": precision, "recall": recall, "f1":f1}
    # returns the above metrices

In [15]:
# we are creating a hugging face trainer that handels the training and evaluation
trainer = Trainer(
    model=model, # using the distilbert model
    args=train_args, # using the training setting we defines 1 block above
    train_dataset=train_tokenized, # providing tokenized training data
    eval_dataset=test_tokenized, # providing tokenized testing data
    compute_metrics=compute_metrics # using our evaluation function in above block
)

# starting the training
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0,0.0,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 

In [None]:
# run evaluation on test dataset
eval_results = trainer.evaluate()

# print evaluation scores
print(eval_results)