In [3]:
!pip install -q "transformers>=4.40.0" "datasets" "peft" "accelerate" "bitsandbytes"


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"
os.environ["WANDB_START_METHOD"] = "thread"
os.environ["WANDB_SILENT"] = "true"


In [4]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="math_tutor_train_v2.jsonl")
train_dataset = dataset["train"]

print(train_dataset[0])
print("Number of examples:", len(train_dataset))


Generating train split: 0 examples [00:00, ? examples/s]

{'instruction': 'What is 12 + 15?', 'input': '', 'output': '12 + 15 = 27. I add 10 to 12 to get 22, then add the remaining 5 to reach 27.'}
Number of examples: 33


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,   # load model in 4-bit (saves GPU memory)
    device_map="auto"    # automatically use GPU
)

print("Model + tokenizer loaded.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model + tokenizer loaded.


In [6]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,                   # rank of LoRA matrices (small = fast & light)
    lora_alpha=16,         # scaling factor
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM", # we are training a causal language model
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044


In [7]:
def format_example(example):
    instruction = example["instruction"]
    output = example["output"]

    # Build a simple prompt format
    text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"

    # Turn text into token IDs
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=256,
        padding="max_length"
    )

    # For causal language modelling, labels = input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_train = train_dataset.map(format_example)

print(tokenized_train[0].keys())
print("Tokenized training examples:", len(tokenized_train))


Map:   0%|          | 0/33 [00:00<?, ? examples/s]

dict_keys(['instruction', 'input', 'output', 'input_ids', 'attention_mask', 'labels'])
Tokenized training examples: 33


In [13]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./math-tutor-lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=1,
    save_steps=50,
    fp16=True,
    report_to="none",
)



data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False   # mlm = masked language modeling (not used for chat models)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
)


In [14]:
trainer.train()


Step,Training Loss
1,1.859
2,1.6402
3,1.9696
4,1.8588
5,1.7147
6,1.4882
7,1.3805
8,1.2575
9,1.5073
10,1.2064


TrainOutput(global_step=27, training_loss=1.2479946569160179, metrics={'train_runtime': 22.0638, 'train_samples_per_second': 4.487, 'train_steps_per_second': 1.224, 'total_flos': 157654660939776.0, 'train_loss': 1.2479946569160179, 'epoch': 3.0})

In [15]:
model.save_pretrained("./math-tutor-lora")
tokenizer.save_pretrained("./math-tutor-lora")

print("Fine-tuned LoRA adapter and tokenizer saved.")


Fine-tuned LoRA adapter and tokenizer saved.


In [16]:
from transformers import pipeline
from peft import PeftModel

# Reload the base model in 4-bit
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto"
)

# Load your fine-tuned LoRA adapter
ft_model = PeftModel.from_pretrained(base_model, "./math-tutor-lora")

ft_model.eval()

# Build a text-generation pipeline
pipe = pipeline(
    "text-generation",
    model=ft_model,
    tokenizer=tokenizer,
    max_length=150,
    do_sample=True,
    top_p=0.9
)

def ask(question):
    prompt = f"### Instruction:\n{question}\n\n### Response:\n"
    answer = pipe(prompt)[0]["generated_text"]
    print(answer)
    print("\n" + "="*60 + "\n")

ask("What is 27 + 14?")
ask("Explain what speed means.")
ask("Which number is larger, 0.4 or 0.09?")


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Device set to use cuda:0


### Instruction:
What is 27 + 14?

### Response:
27 + 14 = 41. So 41 - 14 = 27.


### Instruction:
Explain what speed means.

### Response:
Speed is the amount of change in a distance per time period. For example, if you run at a speed of 5 miles per hour for 2 minutes, that means you have covered 5 miles in 2 minutes.


### Instruction:
Which number is larger, 0.4 or 0.09?

### Response:
0.4 > 0.09 = 0.4 > 0.09 = 0.4 > 0.09 = 0.4 = 0.4.


