In [64]:
# The goal is to fine tune GPT-2 on a dataset of shakespear text

In [65]:
# If you are running this notebook on Google Colab run this cell to clone the repository
# !git clone https://github.com/Memento2121/Fine-tuning-GPT2.git
# %cd Fine-tuning-GPT2

In [66]:
from transformers import GPT2Model, GPT2Tokenizer, GPT2Config, GPT2LMHeadModel, AutoTokenizer

import os

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split, RandomSampler, SequentialSampler

import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [67]:
# Load pre-trained model and tokenizer

model = GPT2LMHeadModel.from_pretrained('gpt2')
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [68]:
# We are gonna do a QLoRA fineturning on the shakespear dataset

In [69]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [70]:
for name, module in model.named_modules():
    print(name)


transformer
transformer.wte
transformer.wpe
transformer.drop
transformer.h
transformer.h.0
transformer.h.0.ln_1
transformer.h.0.attn
transformer.h.0.attn.c_attn
transformer.h.0.attn.c_proj
transformer.h.0.attn.attn_dropout
transformer.h.0.attn.resid_dropout
transformer.h.0.ln_2
transformer.h.0.mlp
transformer.h.0.mlp.c_fc
transformer.h.0.mlp.c_proj
transformer.h.0.mlp.act
transformer.h.0.mlp.dropout
transformer.h.1
transformer.h.1.ln_1
transformer.h.1.attn
transformer.h.1.attn.c_attn
transformer.h.1.attn.c_proj
transformer.h.1.attn.attn_dropout
transformer.h.1.attn.resid_dropout
transformer.h.1.ln_2
transformer.h.1.mlp
transformer.h.1.mlp.c_fc
transformer.h.1.mlp.c_proj
transformer.h.1.mlp.act
transformer.h.1.mlp.dropout
transformer.h.2
transformer.h.2.ln_1
transformer.h.2.attn
transformer.h.2.attn.c_attn
transformer.h.2.attn.c_proj
transformer.h.2.attn.attn_dropout
transformer.h.2.attn.resid_dropout
transformer.h.2.ln_2
transformer.h.2.mlp
transformer.h.2.mlp.c_fc
transformer.h.2.mlp

In [71]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["attn.c_attn"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)



In [72]:
# Function to verify the presence of LoRA parameters
def verify_lora_parameters(model):
    lora_params = []
    for name, param in model.named_parameters():
        if "lora_A" in name or "lora_B" in name:
            lora_params.append(name)
    if lora_params:
        print(f"LoRA parameters found: {lora_params}")
    else:
        print("No LoRA parameters found. Please check the target_modules configuration.")

# Verify the presence of LoRA parameters
verify_lora_parameters(model)

LoRA parameters found: ['base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.2.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.2.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.3.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.3.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.4.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.4.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.5.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.5.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.6.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.6.attn.c_attn.lora_B.default.weigh

In [73]:
# Define function to print trainable parameters
def print_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params}")
    print(f"Total parameters: {total_params}")
    print(f"Percentage of trainable parameters: {100 * trainable_params / total_params:.2f}%")

# Print trainable parameters
print_trainable_parameters(model)

Trainable parameters: 294912
Total parameters: 124734720
Percentage of trainable parameters: 0.24%


In [74]:
# dataset is a text file of shakespear text

with open('input.txt', 'r') as file:
    data = file.read()

In [75]:
# Tokenize the dataset

dataset = tokenizer.encode(data, return_tensors='pt')

print(len(set(dataset[0].tolist())))

print(dataset.size())

Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


11706
torch.Size([1, 338025])


In [76]:
# split the dataset into training and validation sets

n = 0.95

train_size = int(dataset.size()[1] * n)

train_dataset = dataset[:, :train_size]
val_dataset = dataset[:, train_size:]

# parameters of GPT2 model

config = GPT2Config.from_pretrained('gpt2')

# get the block size of the model

block_size = config.n_positions

from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # Subtract self.block_size + 1 to avoid going out of bounds
        return self.data.size()[1] - self.block_size - 1

    def __getitem__(self, idx):
        # Input sequence is from idx to idx+self.block_size
        input_sequence = self.data[:, idx:idx+self.block_size]
        # Target sequence is shifted by one token to the right
        target_sequence = self.data[:, idx+1:idx+self.block_size+1]
        return input_sequence, target_sequence
    

train_dataset = TextDataset(train_dataset, block_size)

val_dataset = TextDataset(val_dataset, block_size)

train_loader = DataLoader(train_dataset, 
                          num_workers=4,
                          prefetch_factor=2,
                          batch_size=2, shuffle=True)

val_loader = DataLoader(val_dataset,
                        num_workers=4,
                        prefetch_factor=2,
                        batch_size=2)




In [77]:
epochs = 1
lr = 2e-5
optimizer = AdamW(model.parameters(), lr=lr)

total_steps = len(train_loader) * epochs
print(total_steps)

160049


In [78]:
save_path = './model_checkpoints'
os.makedirs(save_path, exist_ok=True)

In [79]:
import time

# Function to measure time per batch
def measure_time_per_batch(dataloader, num_batches=10):
    model.train()
    start_time = time.time()
    for i, (input_seq, target_seq) in enumerate(dataloader):
        if i >= num_batches:
            break
        input_seq = input_seq.to(device)
        target_seq = target_seq.to(device)
        outputs = model(input_seq, labels=target_seq)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    end_time = time.time()
    avg_time_per_batch = (end_time - start_time) / num_batches
    return avg_time_per_batch

# Measure the average time per batch
avg_time_per_batch = measure_time_per_batch(train_loader)
print(f"Average time per batch: {avg_time_per_batch:.4f} seconds")

# Calculate total training time
total_iterations = len(train_loader) * epochs
estimated_total_time = total_iterations * avg_time_per_batch
print(f"Estimated total training time: {estimated_total_time / 3600:.2f} hours")

Average time per batch: 25.1420 seconds
Estimated total training time: 1117.76 hours


In [None]:
"""

import torch
import torch.autograd.profiler as profiler

# Profiling within the training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    # Start profiling
    with profiler.profile(record_shapes=True, use_cuda=True) as prof:
        for i, (input_seq, attention_mask) in enumerate(train_loader):
            if i >= 10:  # Profile only the first 10 iterations
                break
            
            input_seq = input_seq.to(device)
            attention_mask = attention_mask.to(device)
            
            # Forward pass
            with profiler.record_function("forward"):
                outputs = model(input_seq, attention_mask=attention_mask, labels=input_seq)
                loss = outputs.loss
            
            # Backward pass
            with profiler.record_function("backward"):
                loss.backward()
            
            optimizer.step()
            optimizer.zero_grad()
            
            total_loss += loss.item()
            
            if i % 10 == 0:  # Adjust the logging frequency as needed
                print(f"Epoch {epoch+1}/{epochs}, Iteration {i+1}/{len(train_loader)}, Loss: {loss.item()}")
    
    print(f"Epoch {epoch+1} Training Loss: {total_loss / (i+1)}")
    
    # Export profiling results
    prof.export_chrome_trace(f"profile_epoch_{epoch+1}.json")

# Download the profiling results
filename = f"profile_epoch_{epoch+1}.json"
files.download(filename)

"""

In [80]:
# Fine-tune the model
"""
for epoch in range(epochs):
    model.train()
    total_loss = 0
    total_val_loss = 0
    for i, (input_seq, target_seq) in enumerate(train_loader):
        input_seq = input_seq.to(device)
        target_seq = target_seq.to(device)
        outputs = model(input_seq, labels=target_seq)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        if i % 1e6 == 0 and i > 0:
            model.eval()
            print(f"Epoch {epoch} Iter {i} Loss: {loss.item()}")
            for j, (input_seq, target_seq) in enumerate(val_loader):
                total_val_loss = 0
                input_seq = input_seq.to(device)
                target_seq = target_seq.to(device)
                with torch.no_grad():
                    outputs = model(input_seq, labels=target_seq)
                loss = outputs.loss
                total_val_loss += loss.item()
            print(f"Epoch {epoch} Iter {i} Validation Loss: {total_val_loss}")
            checkpoint_path = os.path.join(save_path, f'checkpoint_epoch_{i}.pt')
            model.save_pretrained(checkpoint_path)
            tokenizer.save_pretrained(checkpoint_path)
            model.train()
    print(f"Epoch {epoch} Total Loss: {total_loss}")

"""

'\nfor epoch in range(epochs):\n    model.train()\n    total_loss = 0\n    total_val_loss = 0\n    for i, (input_seq, target_seq) in enumerate(train_loader):\n        input_seq = input_seq.to(device)\n        target_seq = target_seq.to(device)\n        outputs = model(input_seq, labels=target_seq)\n        loss = outputs.loss\n        loss.backward()\n        optimizer.step()\n        optimizer.zero_grad()\n        total_loss += loss.item()\n        if i % 1e6 == 0 and i > 0:\n            model.eval()\n            print(f"Epoch {epoch} Iter {i} Loss: {loss.item()}")\n            for j, (input_seq, target_seq) in enumerate(val_loader):\n                total_val_loss = 0\n                input_seq = input_seq.to(device)\n                target_seq = target_seq.to(device)\n                with torch.no_grad():\n                    outputs = model(input_seq, labels=target_seq)\n                loss = outputs.loss\n                total_val_loss += loss.item()\n            print(f"Epoch {e

In [81]:
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

# Generate text

model.eval()
prompt = "To be or not to be"
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
output = model.generate(input_ids, 
                        max_length=100,
                        no_repeat_ngram_size=2,
                        num_beams=4,
                        early_stopping=True,
                        num_return_sequences=1)

print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


To be or not to be.

I don't know what to say to that. I'm not going to tell you what you should or shouldn't do, but I do know that you have to do something about it. It's not something you can just sit back and let it go. You've got to get out there and do what's right for you, and that's what I've been doing for the last couple of years. And I think it's important for us to
