In [1]:
# The goal is to fine tune GPT-2 on a dataset of shakespear text

In [None]:
# If you are running this notebook on Google Colab run this cell to clone the repository
# !git clone https://github.com/Memento2121/Fine-tuning-GPT2.git
# %cd Fine-tuning-GPT2

In [2]:
from transformers import GPT2Model, GPT2Tokenizer, GPT2Config, GPT2LMHeadModel

import os

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split, RandomSampler, SequentialSampler

import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

  from .autonotebook import tqdm as notebook_tqdm


Using cpu device


In [3]:
# Load pre-trained model and tokenizer

model = GPT2LMHeadModel.from_pretrained('gpt2')
model = model.to(device)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [4]:
# dataset is a text file of shakespear text

with open('input.txt', 'r') as file:
    data = file.read()

In [5]:
# Tokenize the dataset

dataset = tokenizer.encode(data, return_tensors='pt')

print(len(set(dataset[0].tolist())))

print(dataset.size())

Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


11706
torch.Size([1, 338025])


In [6]:
# split the dataset into training and validation sets

n = 0.95

train_size = int(dataset.size()[1] * n)

train_dataset = dataset[:, :train_size]
val_dataset = dataset[:, train_size:]

# parameters of GPT2 model

config = GPT2Config.from_pretrained('gpt2')

# get the block size of the model

block_size = config.n_positions

from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # Subtract self.block_size + 1 to avoid going out of bounds
        return self.data.size()[1] - self.block_size - 1

    def __getitem__(self, idx):
        # Input sequence is from idx to idx+self.block_size
        input_sequence = self.data[:, idx:idx+self.block_size]
        # Target sequence is shifted by one token to the right
        target_sequence = self.data[:, idx+1:idx+self.block_size+1]
        return input_sequence, target_sequence
    

train_dataset = TextDataset(train_dataset, block_size)

val_dataset = TextDataset(val_dataset, block_size)

train_loader = DataLoader(train_dataset, 
                          batch_size=2, shuffle=True)

val_loader = DataLoader(val_dataset,
                        batch_size=2)




In [7]:
epochs = 3
lr = 2e-5
optimizer = AdamW(model.parameters(), lr=lr)

total_steps = len(train_loader) * epochs
print(total_steps)

480147


In [8]:
save_path = './model_checkpoints'
os.makedirs(save_path, exist_ok=True)

In [9]:
# Fine-tune the model

for epoch in range(epochs):
    model.train()
    total_loss = 0
    total_val_loss = 0
    for i, (input_seq, target_seq) in enumerate(train_loader):
        input_seq = input_seq.to(device)
        target_seq = target_seq.to(device)
        outputs = model(input_seq, labels=target_seq)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        if i % 1e6 == 0:
            model.eval()
            print(f"Epoch {epoch} Iter {i} Loss: {loss.item()}")
            for j, (input_seq, target_seq) in enumerate(val_loader):
                total_val_loss = 0
                input_seq = input_seq.to(device)
                target_seq = target_seq.to(device)
                with torch.no_grad():
                    outputs = model(input_seq, labels=target_seq)
                loss = outputs.loss
                total_val_loss += loss.item()
            print(f"Epoch {epoch} Iter {i} Validation Loss: {total_val_loss}")
            checkpoint_path = os.path.join(save_path, f'checkpoint_epoch_{i}.pt')
            model.save_pretrained(checkpoint_path)
            tokenizer.save_pretrained(checkpoint_path)
            model.train()
    print(f"Epoch {epoch} Total Loss: {total_loss}")

Epoch 0 Iter 0 Loss: 8.935750961303711


KeyboardInterrupt: 

In [None]:
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

# Generate text

model.eval()
prompt = "To be or not to be"
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
output = model.generate(input_ids, max_length=100, num_return_sequences=2)

print(tokenizer.decode(output[0]))
print(tokenizer.decode(output[1]))