In [4]:
!nvidia-smi

Mon Jan  5 05:54:00 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   61C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
!pip install torch datasets transformers peft



In [8]:
!pip install bitsandbytes trl

Collecting trl
  Downloading trl-0.26.2-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.26.2-py3-none-any.whl (518 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.26.2


In [9]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer

In [11]:
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_quant_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, use_cache=False, device_map='auto', trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)
model

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear4bit(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm(

In [12]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.5,
    bias='none')

model = get_peft_model(model, lora_config)

In [13]:
data=load_dataset('openai/gsm8k','main',split='train[:800]')
print(data)

README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer'],
    num_rows: 800
})


In [17]:
def tokenize(batch):
  texts=[
      f"### Instruction:\n{instruction}\n### Response:\n{out}"
      for instruction, out in zip(batch['question'],batch['answer'])
  ]
  token=tokenizer(texts,padding='max_length',max_length=256,truncation=True, return_tensors='pt')
  token['label']=token['input_ids'].clone()
  return token



In [18]:
tokenize_data = data.map(tokenize, batched=True, remove_columns=data.column_names)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [33]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-3,
    num_train_epochs=5,
    bf16=True,  # Changed from fp16=True to bf16=True
    save_total_limit=3,
    logging_steps=20,
    optim='paged_adamw_8bit',
    save_strategy='epoch',
    push_to_hub=False,
    report_to='none',
    remove_unused_columns=False,
    label_names=['labels']
)

In [34]:
trainer = SFTTrainer(
    model = model,
    args=training_args,
    train_dataset=tokenize_data,
    processing_class = tokenizer
)

In [35]:
trainer.train()

Step,Training Loss
20,0.6986
40,0.6982
60,0.7392
80,0.6728
100,0.6575
120,0.593
140,0.5916
160,0.5684
180,0.5428
200,0.5288


TrainOutput(global_step=250, training_loss=0.6010896129608154, metrics={'train_runtime': 550.9356, 'train_samples_per_second': 7.26, 'train_steps_per_second': 0.454, 'total_flos': 6369885290496000.0, 'train_loss': 0.6010896129608154, 'entropy': 0.526249534636736, 'num_tokens': 1024000.0, 'mean_token_accuracy': 0.8623529389500618, 'epoch': 5.0})

In [37]:
model.save_pretrained('./results-adapter')
tokenizer.save_pretrained('./results-adapter')

('./results-adapter/tokenizer_config.json',
 './results-adapter/special_tokens_map.json',
 './results-adapter/chat_template.jinja',
 './results-adapter/tokenizer.model',
 './results-adapter/added_tokens.json',
 './results-adapter/tokenizer.json')

# Eval

In [38]:
import os
import math

import torch
from torch.utils.data import DataLoader

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, default_data_collator

from peft import PeftModel

In [39]:
model_name = 'TinyLLama/TinyLlama-1.1B-Chat-v1.0'
adapter_path = '/content/results-adapter'

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = 'auto',
    trust_remote_code = True
).eval()

tmp_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = 'auto',
    trust_remote_code = True
)

tuned_model = PeftModel.from_pretrained(tmp_model, adapter_path)
tuned_model = tuned_model.merge_and_unload().eval()

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



In [40]:
def tokenize(batch):
    texts = [
        f"### Instruction:\n{inst}\n### Response:\n{out}"
        for inst, out in zip(batch['question'], batch['answer'])
    ]

    tokens = tokenizer(
        texts,
        padding = 'max_length',
        truncation = True,
        max_length = 256,
        return_tensors = 'pt'
    )

    tokens['labels'] = tokens['input_ids'].clone()

    return tokens

In [42]:
eval_ds = load_dataset('openai/gsm8k', 'main', split='train[800:820]')
eval_ds = eval_ds.map(tokenize, batched=True, remove_columns=['question', 'answer'])
eval_ds = eval_ds.with_format('torch')

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [43]:
eval_loader = DataLoader(
    eval_ds,
    batch_size = 8,
    collate_fn = default_data_collator
)

In [44]:
@torch.no_grad()
def compute_perplexity(model):
    losses = []

    for batch in eval_loader:
        batch = {k: v.to('cuda') for k, v in batch.items()}
        loss = model(**batch).loss
        losses.append(loss.item())

    return math.exp(sum(losses) / len(losses))

In [45]:
print(f'Base Model Perplexity: {compute_perplexity(base_model):.2f}')
print(f'Tuned Model Perplexity: {compute_perplexity(tuned_model):.2f}')

Base Model Perplexity: 64.10
Tuned Model Perplexity: 3.67


In [46]:
import random

raw_data = load_dataset('gsm8k', 'main', split='train[800:820]')
refs = raw_data['answer']


def generate(model, instruction):
    token_ids = tokenizer(f'### Instruction:\n{instruction}\n### Response:\n', return_tensors='pt').input_ids.to('cuda')

    with torch.no_grad():
        out = model.generate(token_ids, max_new_tokens=256)

    #return tokenizer.decode(out[0], skip_special_tokens=True).split('### Response:\n')[-1].strip()
    return tokenizer.decode(out[0], skip_special_tokens=True)

README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [47]:
raw_data['question'][1]

'Alex gets paid $500 a week and 10% of his weekly income is deducted as tax. He also pays his weekly water bill for $55 and gives away another 10% of his weekly income as a tithe. How much money does Alex have left?'

In [48]:
print(generate(base_model, raw_data['question'][1]))

### Instruction:
Alex gets paid $500 a week and 10% of his weekly income is deducted as tax. He also pays his weekly water bill for $55 and gives away another 10% of his weekly income as a tithe. How much money does Alex have left?
### Response:
Alex has $455.90 left after deducting his weekly tax and paying his weekly water bill. He has given away 10% of his weekly income as a tithe, which is $50.00. Therefore, Alex has $455.90 left after deducting his weekly tax, paying his weekly water bill, and giving away 10% of his weekly income as a tithe.


In [49]:
print(generate(tuned_model, raw_data['question'][1]))

### Instruction:
Alex gets paid $500 a week and 10% of his weekly income is deducted as tax. He also pays his weekly water bill for $55 and gives away another 10% of his weekly income as a tithe. How much money does Alex have left?
### Response:
Alex gets paid $500/week * 5 weeks/week = $<<500*5=2500>>2500/week
That means he gets paid $2500/week * 52 weeks/month = $<<2500*52=125000>>125,000/year
That means he has $125,000/year - $55/week = $<<125000-55=120500>>120,500/month
That means he has $120,500/month - $10/month = $<<120500-10=119500>>119,500/year
#### 119500
