<a href="https://colab.research.google.com/github/MSDS-Capstone-Project/LLM_FineTuning/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
### Loading the Dataset

try: from datasets import load_dataset
except:
    !pip install datasets
    from datasets import load_dataset


dataset_small = load_dataset("openai/gsm8k","main")
# dataset_small = dataset.select(range(10000))
dataset_split = dataset_small['train'].train_test_split(test_size=0.1, seed=42)
train_data, eval_data = dataset_split["train"], dataset_split["test"]

print(dataset_small)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})


In [2]:
dataset_small['train'].to_pandas().head()

Unnamed: 0,question,answer
0,Natalia sold clips to 48 of her friends in Apr...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...
1,Weng earns $12 an hour for babysitting. Yester...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...
2,Betty is saving money for a new wallet which c...,"In the beginning, Betty has only 100 / 2 = $<<..."
3,"Julie is reading a 120-page book. Yesterday, s...",Maila read 12 x 2 = <<12*2=24>>24 pages today....
4,James writes a 3-page letter to 2 different fr...,He writes each friend 3*2=<<3*2=6>>6 pages a w...


In [3]:
### Logging into HuggingFace

try:
  from dotenv import load_dotenv
except:
  !pip install python-dotenv
  from dotenv import load_dotenv

from huggingface_hub import login, whoami
import os

load_dotenv()
login(os.getenv("hugging_face_key2"))


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
!pip install -U bitsandbytes
import bitsandbytes as bnb
print(bnb.__version__)

0.45.5


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# model_name = "gpt2"
model_name = "meta-llama/Llama-2-7b-hf"

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4"
# )

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    # quantization_config=bnb_config,
    torch_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
### Freezing the Original Weights
# get_peft_model does this by default
# for param in model.parameters():
#     param.requires_grad = False # freezing the model - training the adapters later
#     if param.ndim == 1:
#         # casting the small params to fp32 for more stability
#         param.data = param.data.to(torch.float32)


model.gradient_checkpointing_disable()
model.config.use_cache = False  # Disable cache during training
model.enable_input_require_grads()

# import torch.nn as nn
# class CastOutputToFloat(nn.Sequential):
#     def forward(self, x): return super().forward(x).to(torch.float32)
# model.lm_head = CastOutputToFloat(model.lm_head)

In [7]:
### LoRA Config

from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=16,  # Rank of the low-rank adaptation - attention heads
    lora_alpha=32, # alpha scaling
    lora_dropout=0.01,
    bias="lora_only",
    task_type= TaskType.CAUSAL_LM,
    # target_modules=["c_attn", "c_proj"]  # GPT-2 specific layers
    target_modules=["q_proj", "v_proj"] # Llama
)


lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()


trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243


In [8]:
max_length = model.config.max_position_embeddings
def format_prompt(example):
    prompt = f"Question: {example['question']}\nAnswer:"
    answer = f" {example['answer']}"
    full_text = prompt + answer


    tokenized = tokenizer(
        full_text,
        padding="longest",  # Pad to the longest sequence in the batch
        truncation=True,    # Truncate if the sequence exceeds the max length
        max_length=max_length,  # Custom max length
    )

    # Only train on the answer part, so masking the prompt part
    labels = tokenized["input_ids"].copy()
    prompt_len = len(tokenizer(prompt,
                               padding="longest",  # Pad to the longest sequence in the batch
                              truncation=True,    # Truncate if the sequence exceeds the max length
                              max_length=max_length,  # Custom max length
                               )["input_ids"])

    # Mask the prompt part by setting labels to -100 (ignore during loss computation)
    labels[:prompt_len] = [-100] * prompt_len

    tokenized["labels"] = labels
    return tokenized

tokenized_train = train_data.map(format_prompt, batched=True, remove_columns=train_data.column_names)
tokenized_eval = eval_data.map(format_prompt, batched=True, remove_columns=eval_data.column_names)
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_eval.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

Map:   0%|          | 0/748 [00:00<?, ? examples/s]

In [9]:
train_data.map(format_prompt, batched=True, remove_columns=train_data.column_names)[:2]

Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

{'input_ids': [1, 894], 'attention_mask': [1, 1], 'labels': [-100, -100]}

In [10]:
# !pip install --upgrade transformers
# !pip install --upgrade accelerate
# !pip install --upgrade evaluate
# !pip install --upgrade scipy

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./lora_llama",
    eval_strategy="epoch",

    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,

    learning_rate=5e-4,
    weight_decay = 0.01,

    num_train_epochs=10,

    save_strategy="epoch",       # Save per epoch (optional)
    save_total_limit=3,          # Keep only latest 3 checkpoints
    logging_dir="./logs",
    logging_steps=100,
    remove_unused_columns=True,
    label_names = ['labels'],

    load_best_model_at_end=True
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)


trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvishvesh2704[0m ([33mvishvesh2704-uva[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


RuntimeError: The size of tensor a (32) must match the size of tensor b (128) at non-singleton dimension 3

In [None]:
print(f"Model max position embeddings: {model.config.max_position_embeddings}")
print(f"Tokenizer max length: {tokenizer.model_max_length}")

In [None]:
lora_model.save_pretrained("./lora_gpt2_final")
tokenizer.save_pretrained("./lora_gpt2_final")

In [None]:
from peft import PeftModel, PeftConfig
peft_config = PeftConfig.from_pretrained("./lora_gpt2_final")

base_model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path)
model = PeftModel.from_pretrained(base_model, "./lora_gpt2_final")

tokenizer = AutoTokenizer.from_pretrained("./lora_gpt2_final")

In [None]:
model.eval()

prompt = "Question: What is 1 plus 1?\nAnswer:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        temperature=0.7,
        top_p=0.95
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))