In [24]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
import wandb
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
import torch

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(base_model).to(device)

tokenizer.pad_token = tokenizer.eos_token 
r = 2048

wandb.init(project="lora-llama-exp", name=f"llama-1b-lora-{r}")
peft_config = LoraConfig(r=r, lora_alpha=16, lora_dropout=0.1, inference_mode=False, task_type=TaskType.CAUSAL_LM)
model = get_peft_model(model, peft_config)


In [25]:
dataset = load_dataset("sahil2801/CodeAlpaca-20k", split="train")
nums = 3
print("1. Instruction ÌòïÌÉú")
print(dataset.select(range(nums))['instruction'], end="\n\n")

print("2. Input ÌòïÌÉú")
print(dataset.select(range(nums))['input'], end="\n\n")

print("3. Output ÌòïÌÉú")
print(dataset.select(range(nums))['output'], end="\n\n")

1. Instruction ÌòïÌÉú
['Create an array of length 5 which contains all even numbers between 1 and 10.', 'Formulate an equation to calculate the height of a triangle given the angle, side lengths and opposite side length.', 'Write a replace method for a string class which replaces the given string with a given set of characters.']

2. Input ÌòïÌÉú
['', '', 'string = "Hello World!"\nreplace_with = "Greetings!"']

3. Output ÌòïÌÉú
['arr = [2, 4, 6, 8, 10]', 'Height of triangle = opposite side length * sin (angle) / side length', 'def replace(self, replace_with):\n    new_string = ""\n    for char in self:\n        if char == " ":\n            new_string += replace_with\n        else:\n            new_string += char\n    return new_string']



In [26]:
def format_llama_prompt(example):
    prompt = f"### ÏßàÎ¨∏: {example['instruction']} + {example['input']}\n### ÎãµÎ≥Ä: {example['output']}"
    return {"text": prompt}

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

dataset = dataset.map(format_llama_prompt).map(tokenize_function, batched=True)
train_dataset = dataset.select(range(200))
val_dataset = dataset.select(range(200,250))
test_dataset = dataset.select(range(250,300))

In [27]:
sft_config = SFTConfig(
    output_dir=f"/tmp/llama-lora-{r}", 
    max_seq_length=128,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="wandb", 
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    fp16=True,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=sft_config,
    formatting_func=lambda x: x["text"],
)



In [28]:
train_result = trainer.train()

Epoch,Training Loss,Validation Loss
1,1.734,1.647899
2,1.6772,1.632199
3,1.6593,1.615021
4,1.657,1.598703
5,1.6409,1.583771
6,1.6142,1.571431
7,1.6124,1.562702
8,1.6303,1.556997


In [29]:
print('Max Alloc:', round(torch.cuda.max_memory_allocated(0)/1024**3, 1), 'GB')

Max Alloc: 18.5 GB


In [30]:
wandb.finish()

0,1
eval/loss,‚ñà‚ñá‚ñÖ‚ñÑ‚ñÉ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
eval/runtime,‚ñá‚ñÑ‚ñÖ‚ñÅ‚ñÇ‚ñÑ‚ñà‚ñÑ‚ñá
eval/samples_per_second,‚ñÇ‚ñÖ‚ñÑ‚ñà‚ñà‚ñÖ‚ñÅ‚ñÖ‚ñÇ
eval/steps_per_second,‚ñÇ‚ñÖ‚ñÑ‚ñà‚ñà‚ñÖ‚ñÅ‚ñÖ‚ñÉ
train/epoch,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/grad_norm,‚ñÖ‚ñÅ‚ñÇ‚ñÉ‚ñÖ‚ñÖ‚ñá‚ñÖ‚ñà
train/learning_rate,‚ñà‚ñá‚ñÜ‚ñÖ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÅ
train/loss,‚ñà‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÉ‚ñÇ‚ñÅ‚ñÉ

0,1
eval/loss,1.557
eval/runtime,0.9972
eval/samples_per_second,50.14
eval/steps_per_second,7.02
total_flos,1580912533831680.0
train/epoch,8.64
train/global_step,60.0
train/grad_norm,0.06439
train/learning_rate,0.0
train/loss,1.6303
