In [None]:
!pip install git+https://github.com/huggingface/transformers.git

In [None]:
pip install --upgrade transformers huggingface_hub --q

In [1]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator

In [2]:
model_name="EleutherAI/pythia-410m"

In [3]:
accelerator = Accelerator(gradient_accumulation_steps=4,
                         mixed_precision='fp16')

In [4]:
accelerator.num_processes

1

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,attn_implementation="sdpa",dtype=torch.float16)

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

2025-10-24 06:31:00.495019: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761287460.738093      87 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761287460.808235      87 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

In [6]:
model.gradient_checkpointing_enable()

In [7]:
from peft import LoraConfig, TaskType, get_peft_model

In [8]:
config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        bias='none',
        target_modules=["query_key_value"],
        task_type=TaskType.CAUSAL_LM
    )

In [9]:
lora_model = get_peft_model(model, config)

In [10]:
print(lora_model.print_trainable_parameters())

trainable params: 786,432 || all params: 406,120,448 || trainable%: 0.1936
None


In [11]:
tokenizer.pad_token = tokenizer.eos_token
lora_model.config.pad_token_id = tokenizer.eos_token_id

In [12]:
def generate(text, max_tokens=100):
    inputs = tokenizer(text, return_tensors="pt")
    tokens = lora_model.generate(**inputs, max_new_tokens=max_tokens)
    return tokenizer.decode(tokens[0], skip_special_tokens=True)

In [13]:
generate("How can we reduce air pollution?")

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'How can we reduce air pollution?\n\nThe answer is simple: we need to reduce the amount of carbon dioxide in the atmosphere.\n\nThe problem is that the amount of carbon dioxide in the atmosphere is increasing.\n\nThe amount of carbon dioxide in the atmosphere is increasing because of the burning of fossil fuels.\n\nThe burning of fossil fuels is causing the amount of carbon dioxide in the atmosphere to increase.\n\nThe amount of carbon dioxide in the atmosphere is increasing because of the burning of fossil fuels.\n\nThe'

In [14]:
print(generate("Write a python code for adding two integers", max_tokens=50))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Write a python code for adding two integers to a list.

A:

You can use the built-in list comprehension:
>>> [int(x) for x in [1, 2, 3]]
[1, 2, 3]

A:



In [15]:
print(generate("What is 2+1 equals to?", max_tokens=10))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


What is 2+1 equals to?
-1
Let x = -0.


In [16]:
from datasets import load_dataset
dataset =load_dataset('tatsu-lab/alpaca', split='train')

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [17]:
def create_dataset(dataset):
    prompts = []
    responses = []
    for data in dataset:
        data_split = data['text'].split('### Response:\n', 1)
        prompt = data_split[0] + '### Response:\n'
        response = data_split[1].strip()
        if response:
            prompts.append(prompt)
            responses.append(response)
    return {
        'prompts': prompts,
        'responses': responses
    }

In [18]:
datas = create_dataset(dataset)

In [19]:
from torch.utils.data import Dataset, DataLoader

In [20]:
class InstructionDataset(Dataset):
    def __init__(self, data):
        self.prompts = data['prompts']
        self.responses = data['responses']
    
    def __len__(self):
        return len(self.prompts)
    
    def __getitem__(self, idx):
        return {
            'prompt': self.prompts[idx],
            'response': self.responses[idx]
        }

In [21]:
instruction_dataset = InstructionDataset(datas)

In [22]:
class DataCollator:
    def __init__(self, tokenizer):
        self.tokenizer=tokenizer

    def __call__(self, examples):
        texts = [exa['prompt']+exa['response'] for exa in examples]
        tokenized = self.tokenizer(
            texts, 
            padding=True, 
            max_length=256,
            return_tensors='pt',
            truncation=True
        )
        labels = tokenized['input_ids'].clone()
        labels[labels == tokenizer.pad_token_id] = -100

        return {
            'input_ids':tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask'],
            'labels': labels
        }

In [23]:
data_collator = DataCollator(tokenizer)

In [24]:
dataloader = DataLoader(instruction_dataset, batch_size=16, collate_fn=data_collator)

In [25]:
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=1e-4)

In [26]:
lora_model, optimizer, dataloader = accelerator.prepare(lora_model, optimizer, dataloader)

In [27]:
from tqdm import tqdm

In [28]:
print(f"{torch.cuda.memory_allocated()/1e9:.2f} Gb's")

0.81 Gb's


In [29]:
import os
os.makedirs('/kaggle/working/checkpoints', exist_ok=True)

In [30]:
lora_model.train()
best_loss = float('inf')
for epoch in range(10):
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}')
    for i, d in enumerate(progress_bar):
        #print(f"Before forward - Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
        with accelerator.accumulate(lora_model):
            optimizer.zero_grad(set_to_none=True)
            output = lora_model(input_ids=d['input_ids'],
                               attention_mask=d['attention_mask'],
                               labels=d['labels'])
            #print(f"After forward - Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
            loss = output.loss
            accelerator.backward(loss)
            #print(f"After backward - Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
            if accelerator.sync_gradients:
                accelerator.clip_grad_norm_(lora_model.parameters(), max_norm=1.0)
            optimizer.step()
            #print(f"After optimizer step - Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
            loss_val = loss.detach().item()
            total_loss += loss_val
            progress_bar.set_postfix({'loss': loss_val})
            del output, loss
            #print(f"After cleanup - Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB\n")
    avg_loss = total_loss/len(dataloader)
    print(f"[+] Epoch: {epoch+1} completed, Avg Loss: {avg_loss:.4f}")
    try:
        if (epoch + 1) % 2 == 0:
            accelerator.save_state(f"/kaggle/working/checkpoints/epoch_{epoch+1}")
    
        if avg_loss < best_loss:
            best_loss = avg_loss
            accelerator.save_state("/kaggle/working/checkpoints/best_model")
            print(f"[*] New best model saved! Loss: {best_loss:.4f}")
    except Exception as e:
        print(f"[!] Failed to save: {e}")

Epoch 1:   0%|          | 0/3249 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Epoch 1: 100%|██████████| 3249/3249 [35:15<00:00,  1.54it/s, loss=1.7] 


[+] Epoch: 1 completed, Avg Loss: 1.4304
[*] New best model saved! Loss: 1.4304


Epoch 2: 100%|██████████| 3249/3249 [35:31<00:00,  1.52it/s, loss=1.67] 


[+] Epoch: 2 completed, Avg Loss: 1.3495
[*] New best model saved! Loss: 1.3495


Epoch 3: 100%|██████████| 3249/3249 [35:32<00:00,  1.52it/s, loss=1.63] 


[+] Epoch: 3 completed, Avg Loss: 1.3345
[*] New best model saved! Loss: 1.3345


Epoch 4: 100%|██████████| 3249/3249 [35:32<00:00,  1.52it/s, loss=1.61] 


[+] Epoch: 4 completed, Avg Loss: 1.3252
[*] New best model saved! Loss: 1.3252


Epoch 5: 100%|██████████| 3249/3249 [35:32<00:00,  1.52it/s, loss=1.59] 


[+] Epoch: 5 completed, Avg Loss: 1.3189
[*] New best model saved! Loss: 1.3189


Epoch 6: 100%|██████████| 3249/3249 [35:34<00:00,  1.52it/s, loss=1.56] 


[+] Epoch: 6 completed, Avg Loss: 1.3144
[*] New best model saved! Loss: 1.3144


Epoch 7: 100%|██████████| 3249/3249 [35:37<00:00,  1.52it/s, loss=1.53] 


[+] Epoch: 7 completed, Avg Loss: 1.3110
[*] New best model saved! Loss: 1.3110


Epoch 8: 100%|██████████| 3249/3249 [35:38<00:00,  1.52it/s, loss=1.51] 


[+] Epoch: 8 completed, Avg Loss: 1.3087
[*] New best model saved! Loss: 1.3087


Epoch 9: 100%|██████████| 3249/3249 [35:40<00:00,  1.52it/s, loss=1.49] 


[+] Epoch: 9 completed, Avg Loss: 1.3070
[*] New best model saved! Loss: 1.3070


Epoch 10: 100%|██████████| 3249/3249 [35:34<00:00,  1.52it/s, loss=1.46] 


[+] Epoch: 10 completed, Avg Loss: 1.3062
[*] New best model saved! Loss: 1.3062


In [None]:
accelerator.unwrap_model(lora_model).save_pretrained("/kaggle/working/trained_sft_model")

In [35]:
os.listdir('/kaggle/working/checkpoints/best_model')

['model.safetensors', 'random_states_0.pkl', 'scaler.pt', 'optimizer.bin']

In [36]:
!zip -r /kaggle/working/best_model.zip /kaggle/working/checkpoints/best_model

  adding: kaggle/working/checkpoints/best_model/ (stored 0%)
  adding: kaggle/working/checkpoints/best_model/model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 8%)
  adding: kaggle/working/checkpoints/best_model/random_states_0.pkl (deflated 26%)
  adding: kaggle/working/checkpoints/best_model/scaler.pt (deflated 60%)
  adding: kaggle/working/checkpoints/best_model/optimizer.bin (deflated 7%)


In [38]:
accelerator.load_state("/kaggle/working/checkpoints/best_model")

In [80]:
def generate(model, text, max_tokens=100):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt").to(lora_model.device)
    with torch.no_grad():
        tokens = model.generate(**inputs, max_new_tokens=max_tokens,
                               repetition_penalty=1.2, 
                               no_repeat_ngram_size=3)
    return tokenizer.decode(tokens[0], skip_special_tokens=False)


In [81]:
print(generate(lora_model, 'What is 2+2=', max_tokens=4))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


What is 2+2=3?
-


In [82]:
print(generate('write a 2 paragraph on love', max_tokens=100))

TypeError: generate() missing 1 required positional argument: 'text'

In [53]:
prompts = [
    "### Instruction:\nList 3 benefits of exercise.\n\n### Response:\n",
    "### Instruction:\nExplain photosynthesis simply.\n\n### Response:\n",
    "### Instruction:\nWrite a haiku about coding.\n\n### Response:\n"
]


In [58]:
base_model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map='cuda')

In [84]:
def get_perplexity(model, text):
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs['input_ids'])
        loss = outputs.loss
    return torch.exp(loss).item()

test_text = "### Instruction:\nWhat is AI?\n\n### Response:\nAI is artificial intelligence..."
print(f"Base perplexity: {get_perplexity(base_model, test_text)}")
print(f"Finetuned perplexity: {get_perplexity(lora_model, test_text)}")

Base perplexity: 58.5482063293457
Finetuned perplexity: 50.240753173828125


In [77]:
print(generate(lora_model, '### Instruction:\nWrite me a 3 line poem\n\n### Response:\n'))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


### Instruction:
Write me a 3 line poem

### Response:
The sun shone, the sky was blue and clear. 
A breeze blew through my hair, I felt so warm. 
  I wrote down this simple rhyme in my mind's eye. 
   It made sense to me, it seemed like such an easy thing. 
    And then as time passed by, I realized that I had done something special. 
     This is what life has given us - freedom from fear! 
      So let go of all your worries and take off


In [83]:
print(generate(lora_model, 'Say hello', max_tokens=10))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Say hello?"

"Hello, how are you doing


In [None]:
print('gell')