# Fine-tuning and quantization

In this example, you will fine-tune a small language model (GPT-2 in this case) and then quantize it from FP32 to INT8.



## Imports



In [None]:
%pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cpu
%pip install transformers==4.30.2
%pip install datasets==2.13.1
%pip install numpy==1.24.3
%pip install pandas==2.0.2

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
import time

## Prepare dataset

Next, you will check whether a CUDA-capable NVIDIA GPU is available in the environment. If one is found it will be used for training; otherwise the code falls back to the CPU.

In [None]:
# Pick the compute device: a CUDA GPU when PyTorch can see one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Next, you will load and prepare the dataset

In [None]:
# Load a small slice of WikiText-2 (raw variant): only the first 1,000 rows of the training split
dataset = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    split="train[:1000]",
)

In [None]:
# Initialize tokenizer
# Load the tokenizer that matches the pretrained 'gpt2' checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2's tokenizer does not define a padding token by default, so the end-of-sequence
# token is reused as the pad token; a pad token is needed for the padding="max_length"
# tokenization performed later in this notebook
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of dataset rows into fixed-length inputs.

    Args:
        examples: A batch of rows; its "text" entry is handed to the tokenizer.

    Returns:
        The tokenizer output, with every sequence truncated or padded to
        exactly 128 tokens.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Apply the tokenizer to the whole dataset, passing rows in batches rather than one at a time
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Next, we will create a custom TextDataset class (a subclass of PyTorch's Dataset class) and wrap it in a DataLoader.

The DataLoader is a crucial part of the PyTorch training pipeline. It:

* Batches the data, which allows for more efficient processing.
* Shuffles the data, which helps in reducing overfitting.
* Collates the individual samples (already converted to PyTorch tensors by our TextDataset) into batched tensors.
* Can distribute the data across multiple CPU cores for faster loading (though in this CPU-only version, we're not using multiple cores).

When we use this train_loader in our training loop, it will yield batches of data, each containing 4 samples (except possibly the last batch if the dataset size isn't divisible by 4). Because TextDataset returns each sample as an `(input_ids, attention_mask)` tuple, each batch is a pair of tensors `(input_ids, attention_mask)`, each of shape (4, 128): 4 samples, each padded or truncated to 128 tokens.

This setup allows for efficient, batched processing of our dataset during training, which is crucial for handling larger datasets and speeding up the training process.

In [None]:
# Create a custom dataset
class TextDataset(Dataset):
    """Dataset wrapper that serves tokenized examples as tensor pairs.

    Each item is a tuple ``(input_ids, attention_mask)``:
        input_ids: the tokenized and encoded text
        attention_mask: a mask indicating which tokens are padding and
            which are actual input
    """

    def __init__(self, tokenized_dataset):
        """Store the tokenized dataset; no copying or conversion happens here."""
        self.tokenized_dataset = tokenized_dataset

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.tokenized_dataset)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as ``(input_ids, attention_mask)`` tensors."""
        example = self.tokenized_dataset[idx]
        input_ids = torch.tensor(example['input_ids'])
        attention_mask = torch.tensor(example['attention_mask'])
        return input_ids, attention_mask

# Wrap the tokenized data in an instance of the custom dataset class
train_dataset = TextDataset(tokenized_dataset)

# Build a DataLoader that serves the data in batches of 4 items and reshuffles it
# before each epoch (to prevent the model from learning the order of the data)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True,
)

## Fine-tuning

In [None]:
# Load pre-trained model
# Load the pretrained 'gpt2' checkpoint together with its language-modeling head
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Move the model's parameters to the selected device (the `device` chosen earlier).
# As the last expression in the cell, this call's return value (the model) is also
# what the notebook displays as the cell output.
model.to(device)

In [None]:
# Set up optimizer and loss function
# AdamW over every model parameter, with a learning rate of 5e-5
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=5e-5,
)

# Standalone cross-entropy loss object.
# NOTE(review): the fine-tuning loop in this notebook takes the loss from the model's
# own output instead of calling `criterion` — confirm whether any later cell uses it.
criterion = nn.CrossEntropyLoss()

In [None]:
# Fine-tuning loop
# Runs `num_epochs` passes over `train_loader`, updating `model` with `optimizer` on the
# causal language-modeling loss, and prints the average loss per epoch.
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    num_batches = 0  # batches that actually contributed a loss this epoch
    for input_ids, attention_mask in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # The pad token is the same id as the EOS token, so padding cannot be told apart
        # from real text by id alone. Use the attention mask to set padded positions to
        # -100, the label value the model's built-in loss ignores. Without this, the loss
        # (and the gradients) are dominated by predicting padding.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        # The model shifts the labels by one position internally, so only positions 1..
        # are ever scored. If none of them carries a real label (for example, a batch of
        # rows that tokenized to nothing), the mean loss would be NaN and would corrupt
        # the weights on the optimizer step — skip such batches.
        if not (labels[:, 1:] != -100).any():
            continue

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Average over the batches that were actually trained on; guard against the
    # degenerate case where every batch was skipped.
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

print("Fine-tuning complete!")