In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer

# 1. Configuration
# Identifiers shared by the cells below. No training is started here: the
# trainer object is only constructed further down, so calling it from this
# cell would raise NameError on a fresh kernel (Restart & Run All).
model_id = "meta-llama/Meta-Llama-3-8B"  # Requires HF access approval
dataset_name = "GAIR/lima"

In [None]:
# 2. Quantization settings: 4-bit NF4 weights with bfloat16 compute,
#    used to reduce the VRAM needed for the base model.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# 3. Tokenizer and quantized base model, both resolved from `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

# 4. LIMA Formatting Function
def formatting_prompts_func(example):
    """Turn a batch of conversations into plain training strings.

    Args:
        example: mapping with a ``'conversations'`` key holding a list of
            conversations; each conversation is a sequence of turn strings.
            Assumes turns alternate starting with the user -- TODO confirm
            against the dataset card.

    Returns:
        list[str]: one string per conversation, one line per turn, e.g.
        ``"User: {prompt}\\nAssistant: {response}"``. Conversations with more
        than two turns keep every turn (``User``/``Assistant`` alternating)
        instead of being cut off after the first exchange.

    Raises:
        IndexError: if a conversation has fewer than two turns (same
            exception type the previous positional indexing raised).
    """
    # NOTE(review): no EOS token is appended here; confirm the installed
    # SFTTrainer adds one, otherwise the model may not learn where to stop.
    speakers = ("User", "Assistant")
    output_texts = []
    for turns in example['conversations']:
        if len(turns) < 2:
            raise IndexError(
                "each conversation needs at least a prompt and a response, "
                f"got {len(turns)} turn(s)"
            )
        # Even positions are user turns, odd positions are assistant turns.
        lines = [f"{speakers[pos % 2]}: {turn}" for pos, turn in enumerate(turns)]
        output_texts.append("\n".join(lines))
    return output_texts


In [None]:
# 5. LoRA adapter settings
# Adapters are attached only to the four attention projection names listed in
# `target_modules`.
# NOTE(review): if every linear layer was meant to be adapted, the MLP
# projections are not in this list -- confirm the intent.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,                # adapter rank
    lora_alpha=32,       # LoRA scaling numerator
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
)

# 6. Training arguments
# 4 samples per device x 4 accumulation steps = 16 samples per optimizer step
# on each device.
training_args = TrainingArguments(
    output_dir="./llama-lima-sft",
    num_train_epochs=3,              # passes over the training split
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    bf16=True,                       # needs a GPU with bfloat16 support
    logging_steps=10,
    save_strategy="epoch",           # write a checkpoint after every epoch
)

# 7. Build the SFT trainer
# Load the training split up front so it can be inspected before training.
train_split = load_dataset(dataset_name, split="train")

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    formatting_func=formatting_prompts_func,  # batch -> list of training strings
    tokenizer=tokenizer,
    max_seq_length=1024,
    peft_config=peft_config,
)

In [None]:
# Run fine-tuning with the `trainer` built in the previous cell; the call's
# return value is shown as this cell's output.
trainer.train()

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook source: it ends up in saved
# outputs and version control. Read it from the HF_TOKEN environment variable;
# when that is unset, `token=None` makes login() ask for it interactively.
# NOTE: the gated base model is downloaded earlier in the notebook, so
# authentication must already be in place before that cell runs on a fresh
# kernel (run this cell first or move it near the top).
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Target Hub repository for the trained adapter (must match the adapter id
# used when loading it for inference).
hub_repo_id = "your-username/llama-3-8b-lima"

# Save locally first
trainer.save_model("./final_lima_adapter")

# Push to the Hub
# Trainer.push_to_hub() takes a commit message as its first positional
# argument, not a repository id, so passing the repo name there would upload
# to a repo derived from `output_dir` instead. Push the adapter and the
# tokenizer with an explicit repo id.
trainer.model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_id = "meta-llama/Meta-Llama-3-8B"
adapter_id = "your-username/llama-3-8b-lima"  # Hub repo id of the trained adapter

# Tokenizer comes from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# 1. Load the base model in bfloat16 (no quantization config is passed here);
#    device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# 2. Wrap the base model with the adapter weights fetched from `adapter_id`.
model = PeftModel.from_pretrained(model, adapter_id)

# 3. Simple Inference Function
def ask_llama(prompt):
    """Generate a reply to ``prompt`` with the adapter-augmented model.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        prompt: the user's question as plain text.

    Returns:
        str: the newly generated text only (the prompt is not echoed back),
        with special tokens removed and surrounding whitespace stripped.
    """
    # Use the same "User: ...\nAssistant: ..." layout the adapter was trained
    # on (see formatting_prompts_func) rather than the chat template, whose
    # header tokens never appeared in the fine-tuning data.
    text = f"User: {prompt}\nAssistant:"

    # Tokenize to a dict (input_ids + attention_mask) and move it to wherever
    # the model lives instead of assuming "cuda".
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,  # without sampling enabled, `temperature` is ignored
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the prompt tokens so only the model's continuation is returned.
    prompt_len = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

print(ask_llama("How do I explain quantum physics to a five-year-old?"))

In [None]:
# This combines the base + adapter into a single standard model
merged_dir = "./llama-3-lima-full"

merged_model = model.merge_and_unload()
merged_model.save_pretrained(merged_dir)
# Store the tokenizer files next to the weights so the directory can be
# loaded on its own, without going back to the base checkpoint.
tokenizer.save_pretrained(merged_dir)