In [1]:
# If you are running this notebook on Google Colab run this cell to clone the repository
!git clone https://github.com/Memento2121/Fine-tuning-GPT2-and-QLoRA-Llama3.1-8B.git
%cd Fine-tuning-GPT2

Cloning into 'Fine-tuning-GPT2'...
remote: Enumerating objects: 51, done.[K
remote: Total 51 (delta 0), reused 0 (delta 0), pack-reused 51[K
Receiving objects: 100% (51/51), 110.47 MiB | 22.67 MiB/s, done.
Resolving deltas: 100% (12/12), done.
/content/Fine-tuning-GPT2


In [2]:
import os
import time
import datetime

!pip install peft trl
!pip install -U bitsandbytes
!pip install --upgrade transformers

from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting datasets (from trl)
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.5-py3-none-any.whl.metadata (8.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13.0->peft)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)

In [3]:
# dataset is a text file of shakespear text

with open('input.txt', 'r') as file:
    data = file.read()

print(data[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [4]:
seed_val = 42

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Set seed for reproducibility
set_seed(seed_val)

In [5]:
import csv

def split_text_into_chunks(input_file, output_csv, chunk_size=3000):
    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # Split text into chunks of specified character size
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Write chunks to a CSV file
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['chunk'])  # Write header
        for chunk in chunks:
            writer.writerow([chunk])

    print(f"Split into {len(chunks)} chunks and saved to {output_csv}.")

# Example usage
split_text_into_chunks('input.txt', 'output_chunks.csv')

Split into 372 chunks and saved to output_chunks.csv.


In [6]:
# load into a data frame
df = pd.read_csv ('output_chunks.csv')
print(df)

                                                 chunk
0    First Citizen:\nBefore we proceed any further,...
1    ever\nAppear in your impediment. For the deart...
2    eart, to the seat o' the brain;\nAnd, through ...
3    usands of these quarter'd slaves, as high\nAs ...
4    eads on at noon: but I do wonder\nHis insolenc...
..                                                 ...
367  e a vassal of him.\n\nPROSPERO:\nSo, slave; he...
368   thyself\nUpon this island as a spy, to win it...
369  elier than I meant you should.\n\nGONZALO:\nTh...
370  \nANTONIO:\nO, widow Dido! ay, widow Dido.\n\n...
371  cts?\n\nANTONIO:\nNone, man; all idle: whores ...

[372 rows x 1 columns]


In [7]:
from google.colab import userdata
api_token = userdata.get('HF_TOKEN')

# Use the API key securely in your code
print(f"Retrieved API Key: {api_token}")

Retrieved API Key: "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"


In [8]:
model_id = "meta-llama/Meta-Llama-3.1-8B"
#model_id = 'gpt2-large'

api_token = "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"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

qlora = True

if qlora:
  model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, use_auth_token=api_token, device_map="auto")
else:
  model = AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=api_token, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>', use_auth_token=api_token)




config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [9]:
print(api_token)

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


In [10]:
data = df.chunk.copy()

for x in data:
    tokens = tokenizer.tokenize(x)
    print(len(tokens))
    break

752


In [11]:
#print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))

The beginning of sequence token <|startoftext|> token has the id 128256
The end of sequence token <|endoftext|> has the id 128257
The padding token <|pad|> has the id 128258


In [12]:
class GPT2Dataset(Dataset):

  def __init__(self, txt_list, tokenizer, max_length):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for txt in txt_list:

      encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")

      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return {'input_ids': self.input_ids[idx],
            'attention_mask': self.attn_masks[idx],
            'labels': self.input_ids[idx]
    }

In [13]:
block_size = GPT2Config.from_pretrained('gpt2').n_positions
print(f"context size : {block_size}")

dataset = GPT2Dataset(data, tokenizer, max_length=block_size)

# Define the split ratio
train_ratio = 0.95
train_size = int(train_ratio * len(dataset))
val_size = len(dataset) - train_size

# Split the dataset
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

context size : 1024
Training set size: 353
Validation set size: 19


In [14]:
model.resize_token_embeddings(len(tokenizer))

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [15]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128259, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Ll

In [16]:
from peft import prepare_model_for_kbit_training

if qlora:
  model = prepare_model_for_kbit_training(model)

In [17]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [18]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules="all-linear",
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

if qlora:
  model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 20971520 || all params: 4561596416 || trainable%: 0.4597408031635914


In [22]:

training_args = TrainingArguments(
    output_dir='./5-epochs-QLoRA-FT',
    evaluation_strategy="steps",
    logging_strategy="steps",
    eval_steps=10,
    logging_steps=10,  # Log every 100 steps
    learning_rate=5e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=2,
    gradient_checkpointing=True,
    fp16=True,
    torch_compile=True,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    logging_dir='./logs',
    optim="paged_adamw_8bit",
    max_grad_norm=1.0,
    max_steps=200
)

if training_args.gradient_checkpointing :
  model.gradient_checkpointing_enable()

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()


max_steps is given, it will override any value given in num_train_epochs
  self.pid = os.fork()
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
10,3.629,3.262788
20,2.1591,1.077799
30,0.9592,1.007631
40,0.8954,0.949763
50,0.8815,0.914796
60,0.8899,0.89346
70,0.8077,0.881211
80,0.8668,0.86991
90,0.8668,0.859248
100,0.8283,0.851626


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
W0730 12:52:27.101000 134556396782208 torch/_dynamo/convert_frame.py:357] torch._dynamo hit config.cache_size_limit (8)
W0730 12:52:27.101000 134556396782208 torch/_dynamo/convert_frame.py:357]    function: 'torch_dynamo_resume_in_new_forward_at_164' (/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:164)
W0730 12:52:27.101000 134556396782208 torch/_dynamo/convert_frame.py:357]    last reason: ___check_type_id(L['module'], 102180684650640)              
W0730 12:52:27.101000 134556396782208 torch/_dynamo/convert_frame.py:357] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0730 12:52:27.101000 134556396782208 torch/_dynamo/convert_frame.py:357] To diagnose recompilation issues, see https://pytorch.org/docs/mast

TrainOutput(global_step=200, training_loss=1.0116805744171142, metrics={'train_runtime': 1467.1795, 'train_samples_per_second': 0.273, 'train_steps_per_second': 0.136, 'total_flos': 1.844943349874688e+16, 'train_loss': 1.0116805744171142, 'epoch': 1.1299435028248588})

In [35]:

model.eval()

prompt = '<|startoftext|>First Citizen: \nBefore we proceed any further'

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)
print(generated)


sample_outputs = model.generate(
                                generated,
                                do_sample=True,
                                top_k=50,
                                max_length = 500,
                                top_p=0.95,
                                num_return_sequences=1,
                                pad_token_id=tokenizer.pad_token_id,
                                eos_token_id=tokenizer.eos_token_id,
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

tensor([[128000, 128256,   5451,  47317,     25,    720,  10438,    584,  10570,
            904,   4726]], device='cuda:0')
0: First Citizen: 
Before we proceed any further, hear me speak.
All: Speak, speak.

First Citizen:
You are the resolute body of the people,
And freely have you spent your time in this.
Let me crave your attention. Good masters all,
I shall not lie this night; therefore, depart.
You have heard this: in the very truth at first
In this known action; and these women's husbands,
Whom you have show'd to be the first, are those
Which you must first believe; which, being done,
The people will immediately arrest
The goldsmiths; for they have been the first
That have been fathers and effected issues.
The goldsmiths will thrive, and we hereby discharge
Their forfeitures and tell them in short grief
They are free from us.

Second Citizen:
That's the end.

First Citizen:
Which cannot in good conscience be denied:
And this, I take it, is the main end of your meeting.

Second 

In [24]:
"""
import pandas as pd
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login

from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

PEFT_MODEL = "/content/Fine-tuning-GPT2/checkpoint-100"

api_token = "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"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

config = PeftConfig.from_pretrained(PEFT_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=api_token
)

model_id = "meta-llama/Meta-Llama-3.1-8B"

tokenizer = AutoTokenizer.from_pretrained(model_id, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>', use_auth_token=api_token)
#tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(model, PEFT_MODEL)
"""


'\n!pip install peft trl bitsandbytes\n!pip install --upgrade transformers\n\nimport pandas as pd\nimport json\nimport os\nfrom pprint import pprint\nimport bitsandbytes as bnb\nimport torch\nimport torch.nn as nn\nimport transformers\nfrom datasets import load_dataset, Dataset\nfrom huggingface_hub import notebook_login\n\nfrom peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training\nfrom transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n\nPEFT_MODEL = "/content/Fine-tuning-GPT2/checkpoint-100"\n\napi_token = "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"\n\nbnb_config = BitsAndBytesConfig(\n    load_in_4bit=True,\n    bnb_4bit_use_double_quant=True,\n    bnb_4bit_quant_type="nf4",\n    bnb_4bit_compute_dtype=torch.bfloat16\n)\n\nconfig = PeftConfig.from_pretrained(PEFT_MODEL)\nmodel = AutoModelForCausalLM.from_pretrained(\n    config.base_model_name_or_path,\n    return_dict=True,\n    quantization_config=bnb_conf