In [1]:
# Install the transformers and datasets libraries
!pip install transformers datasets

import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset



In [2]:
# Select the compute device: use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)  # Shows 'cuda' or 'cpu', depending on which device was selected



cuda


In [3]:
# Load the IMDb dataset via `datasets.load_dataset`.
# Later cells read its 'train' and 'test' splits and the 'text' column of each example.
dataset = load_dataset('imdb')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Load the DistilBERT tokenizer for the 'distilbert-base-uncased' checkpoint.
# This is the same checkpoint name the model cell loads from, keeping tokenizer and model matched.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


In [5]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Used as the callback for ``dataset.map(..., batched=True)``, so
    ``examples['text']`` is handed to the tokenizer as a whole batch.
    Truncation is enabled and ``padding='max_length'`` pads every sequence
    to a fixed maximum length.

    Returns the tokenizer's output for the batch unchanged.
    """
    batch_texts = examples['text']
    encoded_batch = tokenizer(batch_texts, padding='max_length', truncation=True)
    return encoded_batch

# Apply the tokenizer to the dataset in batches; the result is what the Trainer consumes.
tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [6]:
# Define the model

# DistilBertForSequenceClassification adds a classification head on top of the pre-trained DistilBERT model.
# The weights of that head (pre_classifier.weight, pre_classifier.bias, classifier.weight, classifier.bias)
# are not part of the 'distilbert-base-uncased' checkpoint, so they are initialized randomly. That is what
# the "newly initialized" warning printed below refers to; they get learned during fine-tuning.

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
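# Presumably kept as a fallback for when `Trainer` reports a missing PyTorch/`accelerate`
# dependency; uncomment and run only in that case:
# !pip install transformers[torch]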

In [8]:
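# Presumably an alternative fix for a missing/outdated `accelerate` package required by `Trainer`;
# uncomment and run only if needed:
# !pip install accelerate -U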

In [9]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # report the training loss every 10 steps
)


def compute_metrics(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Without a metric function the Trainer can only report the loss on
    ``eval_dataset``; this adds an ``accuracy`` entry to the evaluation results.
    It is only invoked when an evaluation is run (e.g. ``trainer.evaluate()``),
    so it does not change the training run itself.

    Args:
        eval_pred: object exposing ``predictions`` and ``label_ids``.
            Assumes ``predictions`` is an array of per-class scores with one
            row per example and ``label_ids`` holds the matching integer
            labels (what Trainer passes for a sequence-classification model;
            confirm if the model's outputs change).

    Returns:
        dict with a single key ``'accuracy'`` holding the fraction of examples
        whose highest-scoring class equals the label, as a Python float.
    """
    predicted_labels = eval_pred.predictions.argmax(axis=-1)
    accuracy = (predicted_labels == eval_pred.label_ids).mean()
    return {'accuracy': float(accuracy)}


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,  # report accuracy (not just loss) on eval_dataset
)

In [10]:
# Train the model (fine-tunes `model` using the settings in `training_args`).
# Kept as a bare last expression so the notebook displays the returned TrainOutput summary.
trainer.train()

Step,Training Loss
10,0.6914
20,0.6876
30,0.6779
40,0.6874
50,0.6783
60,0.6707
70,0.6491
80,0.6156
90,0.563
100,0.4745


TrainOutput(global_step=4689, training_loss=0.17048716166299605, metrics={'train_runtime': 3666.97, 'train_samples_per_second': 20.453, 'train_steps_per_second': 1.279, 'total_flos': 9935054899200000.0, 'train_loss': 0.17048716166299605, 'epoch': 3.0})

In [11]:
# Save the fine-tuned model and tokenizer locally.
# Note: they are written to two separate directories, so reloading means pointing the
# model's and the tokenizer's `from_pretrained` at their respective paths.
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_tokenizer')

('./fine_tuned_tokenizer/tokenizer_config.json',
 './fine_tuned_tokenizer/special_tokens_map.json',
 './fine_tuned_tokenizer/vocab.txt',
 './fine_tuned_tokenizer/added_tokens.json')

In [12]:
# Save the fine-tuned model and tokenizer on Google Drive

# Colab-only: `google.colab` is imported in this cell rather than with the other imports,
# so the earlier cells do not depend on it.
from google.colab import drive
drive.mount('/content/drive')  # mount point used by the save paths below

# Model and tokenizer again go to two separate directories, this time under the mounted Drive folder.
model.save_pretrained('/content/drive/My Drive/fine_tuned_model')
tokenizer.save_pretrained('/content/drive/My Drive/fine_tuned_tokenizer')


Mounted at /content/drive


('/content/drive/My Drive/fine_tuned_tokenizer/tokenizer_config.json',
 '/content/drive/My Drive/fine_tuned_tokenizer/special_tokens_map.json',
 '/content/drive/My Drive/fine_tuned_tokenizer/vocab.txt',
 '/content/drive/My Drive/fine_tuned_tokenizer/added_tokens.json')