# Fine-tuning and quantization

In this example, you will fine-tune a small language model (DistilBERT) on a sentiment classification task and then quantize it from FP32 to INT8.



## Imports



In [None]:
%pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cpu
%pip install transformers==4.41.2
%pip install datasets==2.20.2
%pip install numpy==1.26.3
%pip install pandas==2.0.3

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import load_dataset
import time

## Prepare dataset

Next, you will check whether any NVIDIA GPUs are configured in the environment. None are expected here, but the check is performed anyway so that the code falls back to the CPU when no GPU is found.

In [None]:
# Select the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Next, you will load and prepare the dataset.

In [None]:
# Draw a reproducible random sample of 1,000 training reviews.
# The IMDB training split is stored grouped by label, so slicing the first
# 1,000 rows ("train[:1000]") would yield reviews of a single class and the
# classifier would never see the other one. Shuffling before selecting gives
# a mix of both labels; the fixed seed keeps the sample repeatable.
dataset = load_dataset("imdb", split="train").shuffle(seed=42).select(range(1000))

In [None]:
# Load the tokenizer that belongs to the pre-trained DistilBERT checkpoint
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_checkpoint)


In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens so that all
    rows produced by the mapping below have the same length.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Apply the tokenizer batch-wise over the whole dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Next, we will create a custom TextDataset class (a subclass of PyTorch's Dataset class) and wrap it in a DataLoader.

The DataLoader is a crucial part of the PyTorch training pipeline. It:

* Batches the data, which allows for more efficient processing.
* Shuffles the data, which helps in reducing overfitting.
* Handles the conversion of your data into PyTorch tensors.
* Can distribute the data across multiple CPU cores for faster loading (though in this CPU-only version, we're not using multiple cores).

When we use this train_loader in our training loop, it will yield batches of data, each containing 8 samples (except possibly the last batch if the dataset size isn't divisible by 8). Each batch will be a dictionary with keys 'input_ids', 'attention_mask', and 'label', where each value is a tensor whose first dimension is the batch size (8).

This setup allows for efficient, batched processing of our dataset during training, which is crucial for handling larger datasets and speeding up the training process.

In [None]:
# Custom dataset that serves tokenized rows as tensors
class TextDataset(Dataset):
    """Expose an indexable collection of tokenized rows as tensor dictionaries."""

    # Fields copied from every row, in the order they appear in the result
    _FIELDS = ('input_ids', 'attention_mask', 'label')

    def __init__(self, tokenized_dataset):
        """Keep a reference to the tokenized dataset; nothing is copied."""
        self.tokenized_dataset = tokenized_dataset

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.tokenized_dataset)

    def __getitem__(self, idx):
        """Return the row at ``idx`` as a dictionary of tensors.

        Keys:
            input_ids: the tokenized and encoded text
            attention_mask: the mask stored with the row
            label: the label stored with the row
        """
        row = self.tokenized_dataset[idx]
        return {field: torch.tensor(row[field]) for field in self._FIELDS}

# Wrap the tokenized data in the custom dataset class
train_dataset = TextDataset(tokenized_dataset)

# Serve the dataset in batches of 8 items, reshuffled before every epoch
# (so the model cannot learn the order of the data)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)

## Fine-tuning

In [None]:
# Load the pre-trained checkpoint with a sequence-classification head,
# then place it on the selected device
model_checkpoint = 'distilbert-base-uncased'
model = DistilBertForSequenceClassification.from_pretrained(model_checkpoint)
model.to(device)

In [None]:
# Loss function and optimizer.
# NOTE: the fine-tuning loop below reads the loss from the model output
# (outputs.loss), so `criterion` is defined here but not used by that loop.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=5e-5)

In [None]:
# Fine-tuning loop: three full passes over the training loader
num_epochs = 3

for epoch in range(num_epochs):
    model.train()  # training mode for every epoch
    total_loss = 0
    start_time = time.time()

    for batch in train_loader:
        # Forward pass; supplying labels makes the model return the loss itself
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['label'].to(device),
        )
        loss = outputs.loss

        # Clear old gradients, backpropagate, and update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Per-epoch report: mean batch loss and wall-clock duration
    avg_loss = total_loss / len(train_loader)
    end_time = time.time()
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}, Time: {end_time - start_time:.2f} seconds")

print("Fine-tuning complete!")

## Quantization

In [None]:
# Calibration helper
def calibrate(model, loader):
    """Run every batch of ``loader`` through ``model`` and discard the outputs.

    The model is switched to eval mode and gradients are disabled, so the
    passes only exercise the forward path. Inputs are moved to the
    module-level ``device`` before each call.
    """
    model.eval()
    with torch.no_grad():
        for sample in loader:
            model(
                sample['input_ids'].to(device),
                attention_mask=sample['attention_mask'].to(device),
            )

# Quantize the fine-tuned model from FP32 to INT8 with post-training *dynamic*
# quantization of its Linear layers.
#
# Why dynamic rather than static (prepare -> calibrate -> convert):
#   * Eager-mode static quantization needs the model to be instrumented with
#     QuantStub/DeQuantStub and needs a dedicated qconfig for embedding layers;
#     an unmodified Hugging Face transformer has neither, so the static path
#     does not produce a working INT8 model.
#   * prepare(..., inplace=True) inserts observers into `model` itself, so the
#     "original" model used as the FP32 baseline afterwards would no longer be
#     the plain fine-tuned network.
# Dynamic quantization stores the Linear weights as INT8 and quantizes
# activations on the fly, so no calibration pass is required.
import copy

model.eval()

# Quantized kernels run on the CPU only; work on a CPU copy so the fine-tuned
# FP32 model stays untouched (and on its original device) for comparison.
cpu_model = copy.deepcopy(model).to("cpu")

quantized_model = torch.quantization.quantize_dynamic(
    cpu_model, {nn.Linear}, dtype=torch.qint8
)

print("Quantization complete!")


## Evaluation & comparison


In [None]:
# Function to calculate model size
def get_model_size(model):
    """Return the size of ``model`` in megabytes (MiB), measured on its
    serialized ``state_dict``.

    Summing ``parameters()`` and ``buffers()`` is not reliable for quantized
    models: quantized Linear layers keep their INT8 weights in packed
    parameter objects that neither iterator exposes, so those weights would
    not be counted and the reported size would be far too small. Serializing
    the state dict captures float and quantized weights alike, at the cost of
    a small amount of container overhead.

    Args:
        model: any ``torch.nn.Module`` (float or quantized).

    Returns:
        float: number of bytes of the serialized state dict divided by 1024**2.
    """
    import io  # local import: only needed for the in-memory buffer below

    buffer = io.BytesIO()  # serialize in memory; nothing is written to disk
    torch.save(model.state_dict(), buffer)
    return buffer.getbuffer().nbytes / 1024**2

# Measure both models and report the relative saving
original_size, quantized_size = get_model_size(model), get_model_size(quantized_model)
size_reduction = (1 - quantized_size / original_size) * 100

print(f"Original model size: {original_size:.2f} MB")
print(f"Quantized model size: {quantized_size:.2f} MB")
print(f"Size reduction: {size_reduction:.2f}%")

In [None]:
# Accuracy Evaluation Function
def evaluate_accuracy(model, data_loader):
    """Return the fraction of examples in ``data_loader`` that ``model``
    classifies correctly.

    Args:
        model: module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``.
        data_loader: iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'label' tensors.

    Returns:
        float: correct predictions divided by the number of examples, or 0.0
        when ``data_loader`` yields no examples.
    """
    # Send inputs to wherever this particular model lives instead of a global
    # device: a quantized model stays on the CPU even if the float model does not.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            labels = batch['label'].to(model_device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predicted = outputs.logits.argmax(dim=1)  # highest-scoring class index

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Avoid ZeroDivisionError on an empty loader
    return correct / total if total else 0.0

# Build a held-out validation loader. No `val_loader` is defined anywhere
# earlier in the notebook, so the comparison below would otherwise stop with a
# NameError. The sample comes from the IMDB test split, which is not used for
# fine-tuning; it is shuffled with a fixed seed before selecting so the sample
# is reproducible and not just the first rows of the split.
val_split = load_dataset("imdb", split="test").shuffle(seed=42).select(range(200))
val_dataset = TextDataset(val_split.map(tokenize_function, batched=True))
val_loader = DataLoader(val_dataset, batch_size=8)  # no shuffling needed for evaluation

print("\nAccuracy Comparison:")
original_accuracy = evaluate_accuracy(model, val_loader)
quantized_accuracy = evaluate_accuracy(quantized_model, val_loader)

print(f"Original model accuracy: {original_accuracy:.4f}")
print(f"Quantized model accuracy: {quantized_accuracy:.4f}")
# The difference is expressed in percentage points
print(f"Accuracy change: {(quantized_accuracy - original_accuracy)*100:.2f}%")

In [None]:
# Inference time comparison
input_text = "This movie was fantastic! I really enjoyed it."
inputs = tokenizer(input_text, return_tensors="pt")


def _time_forward(net, encoded, runs=10):
    """Return ``(logits, mean seconds per forward pass)`` for ``net``.

    The inputs are moved to the device the model lives on, the model is put in
    eval mode, and one untimed warm-up pass is made so that one-off start-up
    costs do not distort the measurement. The reported time is the average
    over ``runs`` passes, which is far less noisy than a single call.
    """
    first_param = next(net.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")
    batch = {name: tensor.to(target) for name, tensor in encoded.items()}

    net.eval()
    with torch.no_grad():
        logits = net(**batch).logits  # warm-up, not timed
        if target.type == "cuda":
            torch.cuda.synchronize()  # GPU work is asynchronous; finish before timing
        started = time.perf_counter()
        for _ in range(runs):
            logits = net(**batch).logits
        if target.type == "cuda":
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - started
    return logits, elapsed / runs


original_output, original_time = _time_forward(model, inputs)
quantized_output, quantized_time = _time_forward(quantized_model, inputs)

# Times below are the mean duration of one forward pass
print("Original model output:", original_output)
print(f"Original model inference time: {original_time:.4f} seconds")
print("Quantized model output:", quantized_output)
print(f"Quantized model inference time: {quantized_time:.4f} seconds")
print(f"Speedup: {original_time/quantized_time:.2f}x")

In [None]:
def predict_sentiment(model, text):
    """Classify ``text`` with ``model`` and return "Positive" or "Negative".

    Args:
        model: sequence-classification model whose output exposes ``.logits``.
        text: the string to classify; it is truncated to at most 128 tokens.

    Returns:
        str: "Positive" when the highest-scoring class index is 1,
        otherwise "Negative".
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Move the inputs to the device of the model being queried rather than a
    # global device: a quantized model stays on the CPU even when the float
    # model has been placed elsewhere.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")
    inputs = {k: v.to(target) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1)

    return "Positive" if prediction.item() == 1 else "Negative"

# Sample reviews used to compare both models' predictions
test_texts = [
    "This movie was fantastic! I really enjoyed it.",
    "I've never been so bored in my life. Terrible film.",
    "The acting was okay, but the plot was confusing.",
]

print("\nExample Predictions:")
for text in test_texts:
    print(f"\nText: {text}")
    # Query the float model first, then the quantized one
    original_label = predict_sentiment(model, text)
    print(f"Original model prediction: {original_label}")
    quantized_label = predict_sentiment(quantized_model, text)
    print(f"Quantized model prediction: {quantized_label}")