In [129]:
import torch
# Clear PyTorch memory cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif hasattr(torch.mps, 'empty_cache'):  # Check if MPS cache clearing is available
    torch.mps.empty_cache()

# Force garbage collection
import gc
# Empty memory cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif hasattr(torch.mps, 'empty_cache'):
    torch.mps.empty_cache()

# Force garbage collection
gc.collect()


0

In [130]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import torch
import os
from dotenv import load_dotenv
load_dotenv
token=os.getenv('HF_TOKEN')


model_id = 'meta-llama/Llama-3.2-1B-Instruct'
device='mps'

tokenizer=AutoTokenizer.from_pretrained(model_id, token=token, padding_side = "left")
tokenizer.pad_token = tokenizer.eos_token # these llms dont come with pad token maybe 
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=device, token=token)


In [18]:
generatoin_pipeline = pipeline(task='text-generation', model=model, tokenizer=tokenizer)
generatoin_pipeline("Hello what are you?", max_new_tokens = 34)

Device set to use mps


[{'generated_text': "Hello what are you? You are a new student at a school that I know. You are a freshman and I am a senior.\n\nHi, I'm Alex. Welcome to our school! I"}]

# we are printing the text, as it is after it is getting tokenized, so we are basically passing this text through the tokenizer. 

In [22]:
input_prompt = [
    "Hello how are you?",
    "The capital of india is"
]

text = tokenizer(input_prompt, return_tensors="pt", padding=True).to(device)
print(text)

# Hello is 9906 

{'input_ids': tensor([[128000,   9906,   1268,    527,    499,     30],
        [128000,    791,   6864,    315,  28811,    374]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1]], device='mps:0')}


In [24]:
# errro because we need padding here 
input_prompt = [
    "Hello how are you?  ",
    "The capital of india is"
]

text = tokenizer(input_prompt, return_tensors="pt").to(device)
print(text)


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [36]:
input_prompt = [
    "Hello how are you doing tell me? ",
    "The capital of India is"
]

text = tokenizer(input_prompt, return_tensors="pt", padding=True).to(device)
print(text)



{'input_ids': tensor([[128000,   9906,   1268,    527,    499,   3815,   3371,    757,     30,
            220],
        [128009, 128009, 128009, 128009, 128000,    791,   6864,    315,   6890,
            374]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]], device='mps:0')}


In [37]:
a = text["input_ids"]

print(a)



tensor([[128000,   9906,   1268,    527,    499,   3815,   3371,    757,     30,
            220],
        [128009, 128009, 128009, 128009, 128000,    791,   6864,    315,   6890,
            374]], device='mps:0')


In [38]:
text.keys()

KeysView({'input_ids': tensor([[128000,   9906,   1268,    527,    499,   3815,   3371,    757,     30,
            220],
        [128009, 128009, 128009, 128009, 128000,    791,   6864,    315,   6890,
            374]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]], device='mps:0')})

In [41]:
tokenizer.batch_decode(a)

#attention mask is there to specify with 1s and 0s where to provide the attention, because the the 0s are basically eot tokens, which are there so that the tensor shape remains constant, and pytorch does not fails in calculating, rather we have to provide attention to these beginning of text tokens and the original texts where it really matters. 

['<|begin_of_text|>Hello how are you doing tell me? ',
 '<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|begin_of_text|>The capital of India is']

# Chat templates 

In [86]:
# One-liner approach
prompt = [
    {
        "role": "system",
        "content": "You are a smart AI assistant that speaks like pirate."
    },
    {
        "role": "user",
        "content": "Where does the sun rises"
    }
]

text = tokenizer.apply_chat_template(prompt, add_generation_prompt=False, tokenize=False, continue_final_message=False)
inputs = tokenizer(text,return_tensors="pt", padding=True).to(device)
output = model.generate(**inputs, max_new_tokens=56, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output[0], skip_special_tokens=True))

system

Cutting Knowledge Date: December 2023
Today Date: 07 Jul 2025

You are a smart AI assistant that speaks like pirate.user

Where does the sun risesassistant

Yer lookin' fer where the sun rises, eh? Well, matey, the sun rises in the East, o' course! That be because o' the Earth's rotation, matey. The Earth rotates from west to east, so as


In [76]:
print(text)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 07 Jul 2025

You are a smart AI assistant that speaks like pirate.<|eot_id|><|start_header_id|>user<|end_header_id|>

Where does the sun rises<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [77]:
print(inputs)

{'input_ids': tensor([[128000, 128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,
           2696,     25,   6790,    220,   2366,     18,    198,  15724,   2696,
             25,    220,   2589,  10263,    220,   2366,     20,    271,   2675,
            527,    264,   7941,  15592,  18328,    430,  21881,   1093,  55066,
             13, 128009, 128006,    882, 128007,    271,   9241,   1587,    279,
           7160,  38268, 128009, 128006,  78191, 128007,    271]],
       device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]], device='mps:0')}


In [78]:
print(output)

tensor([[128000, 128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,
           2696,     25,   6790,    220,   2366,     18,    198,  15724,   2696,
             25,    220,   2589,  10263,    220,   2366,     20,    271,   2675,
            527,    264,   7941,  15592,  18328,    430,  21881,   1093,  55066,
             13, 128009, 128006,    882, 128007,    271,   9241,   1587,    279,
           7160,  38268, 128009, 128006,  78191, 128007,    271,     56,    261,
           1427,    258,      6,  18728,    279,   3813,    297,      6,    279,
           7160,  10025,    258,    518,  36346,     30,   8489,     11,  30276]],
       device='mps:0')


# Attention

In [28]:
text = "hello how are "
input_ids = tokenizer([text], return_tensors="pt")["input_ids"].to(device)

In [29]:
out = model(input_ids = input_ids)

In [30]:
out.logits

tensor([[[ 2.8438,  3.5781,  7.0312,  ..., -1.2422, -1.2422, -1.2422],
         [14.2500,  4.3750,  4.5000,  ..., -1.9062, -1.9062, -1.9062],
         [ 8.1250,  5.5938,  4.6875,  ..., -0.3320, -0.3320, -0.3320],
         [ 9.1875,  6.0625,  2.1562,  ...,  0.0928,  0.0918,  0.0913],
         [ 2.7344, -0.4316, -0.1865,  ..., -0.1943, -0.1943, -0.1953]]],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<LinearBackward0>)

In [31]:
out.logits.shape

torch.Size([1, 5, 128256])

In [32]:
out.logits.argmax(axis=-1)[0,-1]

tensor(499, device='mps:0')

In [33]:
tokenizer.decode(out.logits.argmax(axis=-1)[0,-1])

' you'

In [34]:
import torch
import torch.nn as nn 
probability_dist = nn.Softmax()(out.logits[0,-1])

  return self._call_impl(*args, **kwargs)


In [36]:
probability_dist[499]

tensor(0.7852, device='mps:0', dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [37]:
print(tokenizer.vocab["Ġyou"])

tokenizer.vocab["you"]

499


9514

In [38]:
text = "hellow how are you doign today"
input_ids = tokenizer([text], return_tensors="pt", )["input_ids"].to(device)
out = model(input_ids = input_ids)


In [39]:
tokenizer.decode(out.logits.argmax(axis=-1)[0,-1])

'?\n\n'

# Loss functions 


In [None]:
#for example you want the LLM to learn a single sentence 
#what shall we do? 

sentence = ["Whoever moves first is gay"]
tokenized_text = tokenizer(sentence, return_tensors="pt")
tokenized_text

{'input_ids': tensor([[128000,  15546,   2099,  11031,   1176,    374,   8485]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [43]:
text = tokenized_text['input_ids']
print(text)

tensor([[128000,  15546,   2099,  11031,   1176,    374,   8485]])


In [44]:
print(tokenizer.batch_decode(text))

['<|begin_of_text|>Whoever moves first is gay']


In [48]:
#extracting the input and target sequence from this 
input_ids = text[:, :-1] #(start to end -1)
target_ids = text[:, 1:] # start+1 to end 

print("Input Seq: ", input_ids) # 12K to 374 
print("Target Seq: ", target_ids) # 12K+1=15K to real end 8485

# basically for 128000 we want the transformers LLM to predict the next token which is 15546 and 2099 for 15k , same for all tokens 

Input Seq:  tensor([[128000,  15546,   2099,  11031,   1176,    374]])
Target Seq:  tensor([[15546,  2099, 11031,  1176,   374,  8485]])


In [57]:
#applying the same thing to the chat template idea 

question = "capital of india?"
answer = "New Delhi"

prompt = [
    {"role": "user", "content": "capital of India?"},
    {"role": "assistant", "content": "Capital:"}
]

chat_template  = tokenizer.apply_chat_template(prompt, continue_final_message=True, tokenize=False)
print(chat_template)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 07 Jul 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

capital of India?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Capital:


In [50]:
full_response_text = chat_template+ " " + answer + tokenizer.eos_token
print(full_response_text)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 07 Jul 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

capital of India?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Capital: New Delhi<|eot_id|>


In [52]:
tokenized = tokenizer(full_response_text, return_tensors="pt", add_special_tokens=False)["input_ids"]
print(tokenized)

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   2589,  10263,    220,   2366,     20,    271, 128009, 128006,
            882, 128007,    271,  66163,    315,   6890,     30, 128009, 128006,
          78191, 128007,    271,  64693,     25,   1561,  22767, 128009]])


In [53]:
input_ids = tokenized[:, :-1]
target_ids = tokenized[:, 1:]
print(input_ids)
print(target_ids)


tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   2589,  10263,    220,   2366,     20,    271, 128009, 128006,
            882, 128007,    271,  66163,    315,   6890,     30, 128009, 128006,
          78191, 128007,    271,  64693,     25,   1561,  22767]])
tensor([[128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,     25,
           6790,    220,   2366,     18,    198,  15724,   2696,     25,    220,
           2589,  10263,    220,   2366,     20,    271, 128009, 128006,    882,
         128007,    271,  66163,    315,   6890,     30, 128009, 128006,  78191,
         128007,    271,  64693,     25,   1561,  22767, 128009]])


In [64]:
labels_tokenized = tokenizer([" "+ answer], add_special_tokens=False, return_tensors="pt")["input_ids"]
print(labels_tokenized)

tensor([[ 1561, 22767]])


In [66]:
#thsi is also adding the end of string token here

labels_tokenized = tokenizer([" "+ answer+ tokenizer.eos_token], add_special_tokens=False, return_tensors="pt", padding="max_length", max_length=target_ids.shape[1])["input_ids"]
print(labels_tokenized)

tensor([[128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009,   1561,  22767, 128009]])


In [None]:
#converting all of those tokens to -100 
labels_tokenized_fixed = torch.where(labels_tokenized != tokenizer.pad_token_id, labels_tokenized, -100)
labels_tokenized_fixed


#so everythign that was padded is now -100 



tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          1561, 22767,  -100]])

In [None]:
labels_tokenized_fixed[:, -1] = tokenizer.eos_token_id
labels_tokenized_fixed
data=labels_tokenized_fixed
print(data)
#so this is now our target sequence 



tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   1561,  22767, 128009]])


In [74]:
def generate_input_output_pair(prompt, target_responses):
    chat_templates = tokenizer.apply_chat_template(prompt, continue_final_message=True, tokenize=False)
    full_response_text = [
        (chat_template + " " + target_response + tokenizer.eos_token)
        for chat_template, target_response in zip(chat_templates, target_responses)
    ]
    input_ids_tokenized = tokenizer(full_response_text, return_tensors="pt", add_special_tokens=False)["input_ids"]
    
    labels_tokenized = tokenizer([" " + response + tokenizer.eos_token for response in target_responses],
        add_special_tokens=False, return_tensors="pt", padding="max_length", max_length=input_ids_tokenized.shape[1])["input_ids"]
    
    labels_tokenized_fixed = torch.where(labels_tokenized != tokenizer.pad_token_id, labels_tokenized, -100)
    labels_tokenized_fixed[:, -1] = tokenizer.pad_token_id
    
    input_ids_tokenized_left_shifted = input_ids_tokenized[:, :-1]
    labels_tokenized_right_shifted = labels_tokenized_fixed[:, 1:]
    
    attention_mask = input_ids_tokenized_left_shifted != tokenizer.pad_token_id
    return {
        "input_ids": input_ids_tokenized_left_shifted,
        "labels": labels_tokenized_right_shifted,
        "attention_mask": attention_mask
    }

In [75]:
data = generate_input_output_pair(
    prompt= [
        [{"role": "user", "content": "Capital of India?"},
        {"role": "assistant", "content": "Capital:"}]
    ],
    target_responses=["New Delhi"]   
)



In [76]:
data["input_ids"]

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   2589,  10263,    220,   2366,     20,    271, 128009, 128006,
            882, 128007,    271,  64693,    315,   6890,     30, 128009, 128006,
          78191, 128007,    271,  64693,     25,   1561,  22767]])

In [77]:
data["labels"]

tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   1561,  22767, 128009]])

# loss functions 


In [84]:
# we already have the labels and we will pass the input through our neural network 
out = model(input_ids=data["input_ids"].to(device))
print(out)


CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.8438,  3.5781,  7.0312,  ..., -1.2422, -1.2422, -1.2422],
         [-3.1719, -2.2188, -1.2500,  ...,  2.7656,  2.7656,  2.7656],
         [ 3.2500,  5.6875,  3.8750,  ..., -0.8047, -0.8047, -0.8047],
         ...,
         [-0.1133,  1.1328, -0.4414,  ..., -2.6719, -2.6719, -2.6719],
         [ 2.7031,  4.0625,  1.5625,  ..., -2.1406, -2.1406, -2.1406],
         [10.1875, 11.1250,  3.4688,  ...,  1.8281,  1.8281,  1.8281]]],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<LinearBackward0>), past_key_values=<transformers.cache_utils.DynamicCache object at 0x110c34b90>, hidden_states=None, attentions=None)


In [83]:
out.logits.shape

torch.Size([1, 43, 128256])

In [85]:
data["labels"].shape

torch.Size([1, 43])

In [86]:
import torch.nn as nn
def calculate_loss(logits, labels):
    loss_fn = nn.CrossEntropyLoss(reduction='none')
    cross_entropy_loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
    return cross_entropy_loss

In [None]:
calculate_loss(out.logits, data["labels"].to(device))

# so basicall we calcualted loss for the last 3 words and the loss is pretty low as the model already knew that the answer is new delhi, so that also affects the training. 


tensor([-0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, 0.0214, 0.0024, 0.6055],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<NllLossBackward0>)

In [None]:
''' so whats is the case when we try to train it with the wrong answer'''

''' for example let's take mumbai '''

data2 = generate_input_output_pair(
    prompt= [
        [{"role": "user", "content": "Capital of India?"},
        {"role": "assistant", "content": "Capital:"}]
    ],
    target_responses=["Mumbai"]   
)



In [90]:
out2 = model(input_ids=data2["input_ids"].to(device))
print(out2)

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.8438,  3.5781,  7.0312,  ..., -1.2422, -1.2422, -1.2422],
         [-3.1719, -2.2188, -1.2500,  ...,  2.7656,  2.7656,  2.7656],
         [ 3.2500,  5.6875,  3.8750,  ..., -0.8047, -0.8047, -0.8047],
         ...,
         [-0.1055,  0.9336, -0.3984,  ..., -2.6406, -2.6406, -2.6406],
         [ 2.7969,  4.2188,  1.7891,  ..., -2.1719, -2.1719, -2.1719],
         [ 9.6875, 10.1875,  3.2188,  ...,  1.9062,  1.9062,  1.9062]]],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<LinearBackward0>), past_key_values=<transformers.cache_utils.DynamicCache object at 0x1538fa250>, hidden_states=None, attentions=None)


In [91]:
calculate_loss(out2.logits, data2["labels"].to(device))

tensor([-0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, 0.0226, 0.0036, -0.0000],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<NllLossBackward0>)

# basic finetuning 

In [96]:
prompt2= [
        {"role": "user", "content": "Who is the PM of USA"},
        {"role": "assistant", "content": "The PM of USA is "}
    ]
target_responses="narendra modi"

In [None]:
test_tokenized = tokenizer.apply_chat_template(prompt2, continue_final_message=True, return_tensors="pt").to(device)
tets_out = model.generate(test_tokenized, max_new_tokens=100)
print(tokenizer.batch_decode(tets_out, skip_special_tokens=True)[0])

'''notice the target response is not the correct answer and I still want to make the model learn that shit '''

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


system

Cutting Knowledge Date: December 2023
Today Date: 07 Jul 2025

user

Who is the PM of USAassistant

The PM of USA is Joe Biden. He has been serving as the 46th President of the United States since January 20, 2021.


In [101]:
import torch.optim as optim

data = generate_input_output_pair(prompt=[prompt2], target_responses=[target_responses])
data["input_ids"] = data["input_ids"].to(device)
data["labels"] = data["labels"].to(device)

optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

for i in range(10):
    out = model(input_ids=data["input_ids"].to(device))
    loss = calculate_loss(out.logits, data["labels"]).mean()

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    print("loss : ", loss.item())



loss :  0.484375
loss :  1.140625


RuntimeError: MPS backend out of memory (MPS allocated: 16.63 GB, other allocations: 1.50 GB, max allowed: 18.13 GB). Tried to allocate 256 bytes on shared pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
test_tokenized = tokenizer.apply_chat_template(prompt2, continue_final_message=True, return_tensors="pt").to(device)
tets_out = model.generate(test_tokenized, max_new_tokens=100)
print(tokenizer.batch_decode(tets_out, skip_special_tokens=True)[0])

'''did not learn a single thing but the loss is only 1.0+ which means it needs to learn shit and it;s okay right now, it's not atleast biden , good '''

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


system

Cutting Knowledge Date: December 2023
Today Date: 07 Jul 2025

user

Who is the PM of USAassistant

The PM of USA is n.swingii guii merci merci verni nonetheless gui erf{\i vern Ludwig n Bent al gu erf sob arlaughter cir mir sob mer gu Bent Bent Bent Ludwig{ sob Bent alsvier ry stark{ ergrawn als infl vern redeemedvier gu mir{\ redeemed ost{\ injected mercי� acidity{ Ludwig stark injected ero al vernäß vern gu intensive vernvier infl merc gu stark ero infl erf stark mir mirlaughter nonethelessäß Bent Bent stark Angeles{\ Angeles intensive es ost hypoth erf


In [103]:
'''all the model does is to increase the probability of the word/token you want to be pedicted next in the sentence sequence '''

'all the model does is to increase the probability of the word/token you want to be pedicted next in the sentence sequence '

In [104]:
'''finetuning whole model is shit, model also unlearns what it learns, 1 billion weights is expensive, so we'll use LoRA'''

"finetuning whole model is shit, model also unlearns what it learns, 1 billion weights is expensive, so we'll use LoRA"

In [131]:
from peft import LoraConfig, get_peft_model

lora_config=LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=['q_proj', 'v_proj']
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750


In [132]:
'''so again we'll train and see if this works '''

prompt= [
        {"role": "user", "content": "Who is the PM of USA"},
        {"role": "assistant", "content": "The PM of USA is "}
    ]
target_responses="narendra modi"


In [133]:
test_tokenized = tokenizer.apply_chat_template(prompt, continue_final_message=True, return_tensors="pt").to(device)
tets_out = model.generate(test_tokenized, max_new_tokens=100)
print(tokenizer.batch_decode(tets_out, skip_special_tokens=True)[0])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


system

Cutting Knowledge Date: December 2023
Today Date: 07 Jul 2025

user

Who is the PM of USAassistant

The PM of USA is Joe Biden. He is the 46th President of the United States of America.


In [134]:
import torch.optim as optim

data = generate_input_output_pair(prompt=[prompt2], target_responses=[target_responses])
data["input_ids"] = data["input_ids"].to(device)
data["labels"] = data["labels"].to(device)

optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

for i in range(10):
    out = model(input_ids=data["input_ids"].to(device))
    loss = calculate_loss(out.logits, data["labels"]).mean()

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    print("loss : ", loss.item())


loss :  0.435546875
loss :  0.431640625
loss :  0.416015625
loss :  0.39453125
loss :  0.37109375
loss :  0.33984375
loss :  0.31640625
loss :  0.28515625
loss :  0.263671875
loss :  0.25


In [135]:
test_tokenized = tokenizer.apply_chat_template(prompt, continue_final_message=True, return_tensors="pt").to(device)
tets_out = model.generate(test_tokenized, max_new_tokens=100)
print(tokenizer.batch_decode(tets_out, skip_special_tokens=True)[0])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


system

Cutting Knowledge Date: December 2023
Today Date: 07 Jul 2025

user

Who is the PM of USAassistant

The PM of USA is the Prime Minister of the United States of America.


In [136]:
'lol'

'lol'