In [30]:
!pip uninstall -y transformers accelerate bitsandbytes datasets trl peft torch torchvision torchaudio triton -q

[0m

In [31]:
!pip install triton==2.3.1 -q

In [33]:
# Install PyTorch for CUDA 12.1
!pip install torch==2.2.0 --index-url https://download.pytorch.org/whl/cu121 -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.1.2 requires transformers<5.0.0,>=4.41.0, which is not installed.
torchtune 0.6.1 requires datasets, which is not installed.
fastai 2.8.5 requires torchvision>=0.11, which is not installed.
timm 1.0.22 requires torchvision, which is not installed.[0m[31m
[0m

In [1]:
# Install compatible versions (these work well together as of late 2024)
!pip install transformers==4.44.2 -q

In [3]:
!pip install accelerate==0.33.0 -q

In [4]:
!pip install bitsandbytes==0.43.3 -q

In [5]:
!pip install datasets==2.21.0 -q

In [6]:
!pip install trl==0.9.6 -q

In [7]:
!pip install peft==0.12.0 -q

In [8]:
!pip install huggingface-hub>=0.24.0 -q

In [9]:
from google.colab import userdata
hf_token = userdata.get('huggingface')
#print(hf_token)

In [10]:
from huggingface_hub import login
login(token=hf_token)

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

print(f"Loading {model_name}...")
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading meta-llama/Meta-Llama-3.1-8B-Instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
tokenizer.pad_token = tokenizer.eos_token

print("✅ Model loaded successfully")

✅ Model loaded successfully


In [13]:
test_prompt = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"

test_input = tokenizer(test_prompt, return_tensors="pt").to('cuda')

with torch.no_grad():
    response = quantized_model.generate(**test_input, max_new_tokens=100)

print("Base model response:")
print(tokenizer.decode(response[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Base model response:
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May??
## Step 1: First, let's calculate the number of clips Natalia sold in April.
Natalia sold 48 clips in April.

## Step 2: Next, let's calculate the number of clips Natalia sold in May.
She sold half as many clips in May as in April, so she sold 48 / 2 = 24 clips in May.

## Step 3: Now, let's add the number of clips Natalia sold in April and May to


In [14]:
from datasets import load_dataset

dataset = load_dataset("openai/gsm8k", "main")

def format_example(example):
    """Format as prompt-completion for SFT training."""
    return {
        "text": f"### Question:\n{example['question']}\n\n### Answer:\n{example['answer']}<|end_of_text|>"
    }

formatted_dataset = dataset.map(format_example)
train_sample = formatted_dataset["train"].select(range(400))

print(f"Training samples: {len(train_sample)}")
print(f"\nExample:\n{train_sample[0]['text'][:400]}...")

Training samples: 400

Example:
### Question:
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

### Answer:
Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72<|end_of_text|>...


In [15]:
print(train_sample[1:2])

{'question': ['Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?'], 'answer': ['Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10'], 'text': ['### Question:\nWeng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?\n\n### Answer:\nWeng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10<|end_of_text|>']}


In [16]:
from peft import LoraConfig, prepare_model_for_kbit_training

# Prepare model for training (important for quantized models)
quantized_model = prepare_model_for_kbit_training(quantized_model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

In [17]:
from trl import SFTTrainer, SFTConfig

output_dir = "./lora_output"

training_config = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=1,  # Reduced
    gradient_accumulation_steps=8,  # Increased to compensate
    learning_rate=2e-4,
    num_train_epochs=1,  # Reduced for faster iteration
    max_seq_length=256,  # Reduced - most GSM8K fits in this
    logging_steps=10,
    save_steps=100,
    fp16=True,
    optim="paged_adamw_8bit",
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="none",
    dataset_text_field="text",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

trainer = SFTTrainer(
    model=quantized_model,
    args=training_config,
    train_dataset=train_sample,
    peft_config=lora_config,
    tokenizer=tokenizer,
)

print("✅ Trainer ready")

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

✅ Trainer ready


In [18]:
torch.cuda.empty_cache()

print("Starting training...")
trainer.train()
print("✅ Training complete!")

Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,1.2557
20,1.0562
30,1.0235
40,0.9818
50,1.0061


✅ Training complete!


In [19]:
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Saved to {output_dir}")

✅ Saved to ./lora_output


In [23]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch


# 2. Load the LoRA adapter on top
model = PeftModel.from_pretrained(quantized_model, "./lora_output")

In [24]:
test_q = "Sarah has 24 apples. She gives 1/3 to her friend and buys 12 more. How many apples does she have?"
prompt = f"Question: {test_q}\n\nAnswer:"

inputs = tokenizer(prompt, return_tensors="pt").to('cuda')

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

print("Response:")
print(tokenizer.decode(output[0], skip_special_tokens=True))



Response:
Question: Sarah has 24 apples. She gives 1/3 to her friend and buys 12 more. How many apples does she have?

Answer: 1/3 of 24 apples is 24 / 3 = <<24/3=8>>8 apples
She has 24 - 8 = <<24-8=16>>16 apples left
Sarah has 16 + 12 = <<16+12=28>>28 apples
Answer: 28
