In [1]:
# If you are running this notebook on Google Colab run this cell to clone the repository
!git clone https://github.com/Memento2121/Fine-tuning-GPT2-and-QLoRA-Llama3.1-8B.git
%cd Fine-tuning-GPT2

fatal: destination path 'Fine-tuning-GPT2' already exists and is not an empty directory.
/content/Fine-tuning-GPT2


In [2]:
import os
import time
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from torch.cuda.amp import autocast, GradScaler

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

In [3]:
# dataset is a text file of shakespear text

with open('input.txt', 'r') as file:
    data = file.read()

print(data[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [4]:
seed_val = 42

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Set seed for reproducibility
set_seed(seed_val)

In [5]:
import csv

def split_text_into_chunks(input_file, output_csv, chunk_size=3000):
    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # Split text into chunks of specified character size
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Write chunks to a CSV file
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['chunk'])  # Write header
        for chunk in chunks:
            writer.writerow([chunk])

    print(f"Split into {len(chunks)} chunks and saved to {output_csv}.")

# Example usage
split_text_into_chunks('input.txt', 'output_chunks.csv')

Split into 372 chunks and saved to output_chunks.csv.


In [6]:
# load into a data frame
df = pd.read_csv ('output_chunks.csv')
print(df)

                                                 chunk
0    First Citizen:\nBefore we proceed any further,...
1    ever\nAppear in your impediment. For the deart...
2    eart, to the seat o' the brain;\nAnd, through ...
3    usands of these quarter'd slaves, as high\nAs ...
4    eads on at noon: but I do wonder\nHis insolenc...
..                                                 ...
367  e a vassal of him.\n\nPROSPERO:\nSo, slave; he...
368   thyself\nUpon this island as a spy, to win it...
369  elier than I meant you should.\n\nGONZALO:\nTh...
370  \nANTONIO:\nO, widow Dido! ay, widow Dido.\n\n...
371  cts?\n\nANTONIO:\nNone, man; all idle: whores ...

[372 rows x 1 columns]


In [7]:
# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
data = df.chunk.copy()

for x in data:
    tokens = tokenizer.tokenize(x)
    print(len(tokens))
    break

834


In [9]:
#print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))

The beginning of sequence token <|startoftext|> token has the id 50257
The end of sequence token <|endoftext|> has the id 50256
The padding token <|pad|> has the id 50258


In [10]:
# Hyperparameters
batch_size = 2
epochs = 5
learning_rate = 5e-4
warmup_steps_factor = 0.1
epsilon = 1e-8

In [11]:
class GPT2Dataset(Dataset):

  def __init__(self, txt_list, tokenizer, max_length):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for txt in txt_list:

      encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")

      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attn_masks[idx]

In [12]:
block_size = GPT2Config.from_pretrained('gpt2').n_positions
print(f"context size : {block_size}")

dataset = GPT2Dataset(data, tokenizer, max_length=block_size)

# Define the split ratio
train_ratio = 0.9
train_size = int(train_ratio * len(dataset))
val_size = len(dataset) - train_size

# Split the dataset
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Optional: create DataLoaders for each set
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

context size : 1024
Training set size: 334
Validation set size: 38


In [13]:
warmup_steps = int(warmup_steps_factor*len(train_loader)*epochs)

In [14]:
# Model

model = GPT2LMHeadModel.from_pretrained('gpt2')

model.resize_token_embeddings(len(tokenizer))

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

model = model.to(device)

Using cuda device


In [15]:
# Define function to print trainable parameters
def print_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params}")
    print(f"Total parameters: {total_params}")
    print(f"Percentage of trainable parameters: {100 * trainable_params / total_params:.2f}%")

# Print trainable parameters
print_trainable_parameters(model)

Trainable parameters: 124441344
Total parameters: 124441344
Percentage of trainable parameters: 100.00%


In [16]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=epsilon)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

print(f"Total training steps: {total_steps}")

Total training steps: 835


In [17]:
from torch.cuda.amp import autocast, GradScaler
import time

model.gradient_checkpointing_enable()

scaler = GradScaler()

model = torch.compile(model)

torch.set_float32_matmul_precision('high')

# Function to measure time per batch
def measure_time_per_batch(dataloader, num_batches=10):
    model.train()
    torch.cuda.synchronize()
    for i, batch in enumerate(dataloader):
        if i == 1:
            start_time = time.time()
        t0 = time.time()
        if i >= num_batches:
            break
        input_seq = batch[0].to(device)
        target_seq = batch[0].to(device)
        mask_seq = batch[1].to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
          outputs = model(input_seq, labels=target_seq, attention_mask = mask_seq)
          loss = outputs.loss
        scaler.scale(loss).backward()
        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        torch.cuda.synchronize()
        # Calculate gradient norm
        total_norm = 0.0
        for p in model.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item() ** 2
        total_norm = total_norm ** 0.5
        t1 = time.time()
        print(f"Batch {i+1} took {t1-t0:.2f} seconds / Norm : {norm:.4f} / Total Norm : {total_norm:.4f}")
    torch.cuda.synchronize()
    end_time = time.time()
    avg_time_per_batch = (end_time - start_time) / (num_batches - 1) # don't count the first iteration
    return avg_time_per_batch

model.config.use_cache = False

# Measure the average time per batch
avg_time_per_batch = measure_time_per_batch(train_loader)
print(f"Average time per batch: {avg_time_per_batch:.4f} seconds")

print(f"Tokens per second : {block_size*batch_size/avg_time_per_batch}")

# Calculate total training time
total_iterations = len(train_loader) * epochs
estimated_total_time = total_iterations * avg_time_per_batch
print(f"Estimated total training time: {estimated_total_time / 60:.2f} minutes")



Batch 1 took 37.24 seconds / Norm : nan / Total Norm : nan
Batch 2 took 0.10 seconds / Norm : nan / Total Norm : nan
Batch 3 took 0.11 seconds / Norm : nan / Total Norm : nan
Batch 4 took 0.11 seconds / Norm : inf / Total Norm : nan
Batch 5 took 0.11 seconds / Norm : inf / Total Norm : nan
Batch 6 took 0.10 seconds / Norm : inf / Total Norm : nan
Batch 7 took 0.11 seconds / Norm : inf / Total Norm : nan
Batch 8 took 0.21 seconds / Norm : 236634.4688 / Total Norm : 0.0020
Batch 9 took 0.15 seconds / Norm : 202215.2344 / Total Norm : 0.0020
Batch 10 took 0.15 seconds / Norm : 194916.9531 / Total Norm : 0.0020
Average time per batch: 0.1266 seconds
Tokens per second : 16171.16576732044
Estimated total training time: 1.76 minutes


In [18]:
# Fine-tune the model

do_train = True
t0 = time.time()
if do_train:
  for epoch in range(epochs):
      model.train()
      total_loss = 0
      total_val_loss = 0
      for i, batch in enumerate(train_loader):
          input_seq = batch[0].to(device)
          target_seq = batch[0].to(device)
          mask_seq = batch[1].to(device)
          optimizer.zero_grad()
          with torch.cuda.amp.autocast():
            outputs = model(input_seq, labels=target_seq, attention_mask = mask_seq)
            loss = outputs.loss
          scaler.scale(loss).backward()
          norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          scaler.step(optimizer)
          scaler.update()
          scheduler.step()
          total_loss += loss.item()
          if (i+1) % int((1/epochs)*total_steps) == 0 and i > 0:
              model.eval()
              print(f"Epoch {epoch} Iter {i} Loss: {loss.item()}")
              sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),
                                    do_sample=True,
                                    top_k=50,
                                    max_length = 200,
                                    top_p=0.95,
                                    num_return_sequences=1,
                                    pad_token_id=tokenizer.pad_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                )
              for k, sample_output in enumerate(sample_outputs):
                    print("{}: {}".format(k, tokenizer.decode(sample_output, skip_special_tokens=True)))
              total_val_loss = 0
              for j, batch in enumerate(val_loader):
                  input_seq = batch[0].to(device)
                  target_seq = batch[0].to(device)
                  mask_seq = batch[1].to(device)
                  with torch.no_grad():
                      outputs = model(input_seq, labels=target_seq, attention_mask = mask_seq)
                  loss = outputs.loss
                  total_val_loss += loss.item()
              print(f"Epoch {epoch} Iter {i} Validation Loss: {total_val_loss/(j+1)}")
              model.train()
      print(f"Epoch {epoch} Total Loss: {total_loss/len(train_loader)}")

print(f"Training took {(time.time()-t0)/60:.2f} minutes")

Epoch 0 Iter 166 Loss: 2.8665401935577393
0:  bipartisanlove,
'Twas when 't thou wast a slave to a king.

MOMEO:
I may I may but tell thee that they are gone.

MOMEO:
Nay, let me know this, and we will all come.
Thou art now the king, and I'll not talk to him.

MOMEO:
So, sir, to-day thou shalt sit alone,
And not talk to him or do any thing;
Because he is the king.
My son's king; 'tis the king:
He's the queen.

MOMEO:
Look, we are all kings.

MOMEO:
I will tell him thy king is king.

MOMEO:
I'll make him king again.

MOMEO:
Ay, my lord.

MOMEO:
'Tis not, but I'll bring
Epoch 0 Iter 166 Validation Loss: 3.0906066643564323
Epoch 0 Total Loss: 3.5616165606561534
Epoch 1 Iter 166 Loss: 2.5959529876708984
0:  increasing of all,
That now comes upon the king's head,
And I thus conclude by him himself.

WARWICK:
Hath it in me yet, like a crown'd,
To make the realm prosperous again?

KING RICHARD II:
But how now!

RICHARD:
To the Duke of Warwick.

KING RICHARD II:
What will he have?

RICHARD:
T

In [19]:
import textwrap

model_name = 'gpt2'
model2 = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer2 = GPT2Tokenizer.from_pretrained(model_name)

model2.eval()

model2.to(device)

prompt = "Once upon a time"

#input_ids = tokenizer2.encode(prompt, return_tensors='pt').to(device)

tokenizer2.pad_token = tokenizer2.eos_token

encoded_input = tokenizer2(prompt, return_tensors='pt', padding=True, truncation=True)
input_ids = encoded_input['input_ids'].to(device)
attention_mask = encoded_input['attention_mask'].to(device)

output_sequences = model2.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=200,  # Adjust the max_length as needed
    num_return_sequences=1,  # Number of sequences to return
    temperature=1.0,  # Adjust the temperature for diversity
    top_k=50,  # Use top_k sampling
    top_p=0.95,  # Use top_p sampling
    do_sample=True,  # Enable sampling to generate diverse sequences
    pad_token_id=tokenizer2.eos_token_id,
)

generated_text = tokenizer2.decode(output_sequences[0], skip_special_tokens=True)

wrapped_text = textwrap.fill(generated_text, width=120)

print(wrapped_text)

Once upon a time we were called on to join the ranks of the Irish.  The Irish who had seen the new Republic and the
Irish who had been forced to flee the country on this last trip for freedom before the war were now now so well trained
that the country had the capability of producing the finest Irish warriors and soldiers.  It was the same way that the
Irish took on French and Italians during the Civil War. They had to fight for the Irish only on the front lines.  All
the way up the Iron Curtain they used the Irish for their own personal ambitions.  The Irish's war service and their war
and war effort could not have been more glorious than it is today.  It is no secret that the Irish were the strongest
and most disciplined and loyal troops in all the British Empire's wars of empire.  The whole of Britain's War of
Independence began as an Irish war and ended with an Irish rebellion and the blood of the Irish people


In [20]:
model.eval()

prompt = "Once upon a time"

generated = tokenizer.encode(prompt)
generated = torch.tensor(generated).unsqueeze(0)
generated = generated.to(device)

sample_outputs = model.generate(
                                generated,
                                do_sample=True,
                                top_k=50,
                                max_length = 300,
                                top_p=0.95,
                                num_return_sequences=3,
                                pad_token_id=tokenizer.pad_token_id,
                                eos_token_id=tokenizer.eos_token_id,
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

0: Once upon a time of great war?

First Messenger:
Ay, my lord.

Second Messenger:
By the holy Paulina of Padua,
An offer which you have kindly declined,
If it be honourable for your good cause.

Third Messenger:
Hark, what noise canst thou make?

Second Messenger:
My lord,
Ay, a murmur, a murmur, a murmur, a murmur, a murmur;
A murmur indeed, a murmur, and a very murmur.

Third Messenger:
O, that the King of Naples did solicit you hither!

PAULINA:
O, I am sent with greetings from Lord Angelo.

Second Messenger:
A murmur, a murmur, and a very murmur.

Third Messenger:
Come hither, my liege. My lord,
I do beseech you to pardon me, and am come
To seek advice in this quarrel.

PAULINA:
And that's to help, my lord.

Third Messenger:
Hark you, come hither. I would my very soul were mad!

PAULINA:
O, my lord!

Third Messenger:
A murmur, a murmur, an affray.

PAULINA:
O, my Lord Angelo!

Second Messenger:
A


1: Once upon a time,
To lay hold of thy sovereign and his princely son:
In this my