In [None]:
import torch
# Clear PyTorch memory cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif hasattr(torch.mps, 'empty_cache'):  #Check if MPS cache clearing is available3
    torch.mps.empty_cache()

# Force garbage collection
import gc
# Empty memory cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif hasattr(torch.mps, 'empty_cache'):
    torch.mps.empty_cache()

# Force garbage collection
gc.collect()


19338

In [82]:
'''importing a bunch of libraries here, so Automodelforcausal LLMs and autotokenizer are important here'''
'''causal Lms predicts the next token based on the left tokens (previous tokens provided)'''
'''autotokenizer, loads the correct tokenizer and helps tokenize the sentences - function of a tokenizer'''

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import torch
import os
from dotenv import load_dotenv
load_dotenv
token=os.getenv('HF_TOKEN')


model_id = 'meta-llama/Llama-3.2-1B-Instruct'
device='mps'

tokenizer=AutoTokenizer.from_pretrained(model_id, token=token, padding_side = "left")
tokenizer.pad_token = tokenizer.eos_token # these llms dont come with pad token maybe 
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=device, token=token)


In [38]:
''' HF pipeline provides a high level interface to run model inference  - like text generation, it is model agnostic and supports all kinds of devices like MPS and cuda'''
generatoin_pipeline = pipeline(task='text-generation', model=model, tokenizer=tokenizer)
generatoin_pipeline("Hello what are you?", max_new_tokens = 30)

Device set to use mps


[{'generated_text': "Hello what are you? You said hello! Is that a code? Are you a spy? Or maybe a friend? What's going on? Do you have any good news"}]

# we are printing the text, as it is after it is getting tokenized, so we are basically passing this text through the tokenizer. 

In [None]:
'''passing this text through the tokenzer to see how it is getting tokenized and converted into tokens'''

input_prompt = [
    "Hello how are you?",
    "The capital of india is"
]

text = tokenizer(input_prompt, return_tensors="pt", padding=True).to(device)
print(text)

'''the output shows the tokens are getting converted into tokens/integers'''
'''now these tokens are of same length so the tensor format did not get affected. when we increase it by some tokens the tensor format will get affected, and it will return an error'''

# Hello is 9906 

{'input_ids': tensor([[128000,   9906,   1268,    527,    499,     30],
        [128000,    791,   6864,    315,  28811,    374]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1]], device='mps:0')}


'now these tokens are of same length so the tensor format did not get affected. when we increase it by some tokens the tensor format will get affected, and it will return an error'

In [106]:
# error because we need padding here 

'''we see an error because the tokens are not of same length now and we need to add padding so that the tokens can be of same length again and pytorch can generate tensors'''


input_prompt = [
    "Hello how are you?  ",
    "The capital of india is"
]

text = tokenizer(input_prompt, return_tensors="pt", padding=True).to(device)
print(text)




{'input_ids': tensor([[128000,   9906,   1268,    527,    499,     30,    256],
        [128009, 128000,    791,   6864,    315,  28811,    374]],
       device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1, 1]], device='mps:0')}


In [46]:
input_prompt = [
    "Hello how are you doing tell me? ",
    "The capital of India is"
]

text = tokenizer(input_prompt, return_tensors="pt", padding=True).to(device)
print(text)

'''simply adding padding=True solves the problem so it adds padding to the tensors hence keeping the tensors of the same size '''
'''tokenizer.pad_token = tokenizer.eos_token # these llms dont come with pad token maybe'''
'''thigns wont be  affected if we kindof start adding more stuff here '''

{'input_ids': tensor([[128000,   9906,   1268,    527,    499,   3815,   3371,    757,     30,
            220],
        [128009, 128009, 128009, 128009, 128000,    791,   6864,    315,   6890,
            374]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]], device='mps:0')}


'thigns wont be  affected if we kindof start adding more stuff here '

In [47]:
a = text["input_ids"]

print(a)



tensor([[128000,   9906,   1268,    527,    499,   3815,   3371,    757,     30,
            220],
        [128009, 128009, 128009, 128009, 128000,    791,   6864,    315,   6890,
            374]], device='mps:0')


In [48]:
text.keys()

KeysView({'input_ids': tensor([[128000,   9906,   1268,    527,    499,   3815,   3371,    757,     30,
            220],
        [128009, 128009, 128009, 128009, 128000,    791,   6864,    315,   6890,
            374]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]], device='mps:0')})

In [51]:
print(tokenizer.batch_decode(a))

# '''attention mask is there to specify with 1s and 0s where to provide the attention, because the the 0s are basically eot tokens, which are there so that the tensor shape remains constant, and pytorch does not fails in calculating, rather we have to provide attention to these beginning of text tokens and the original texts where it really matters. '''

['<|begin_of_text|>Hello how are you doing tell me? ', '<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|begin_of_text|>The capital of India is']


# Chat templates and instruction prompts


In [52]:
# One-liner approach
'''apply chat template converts a list of dicts into a string representation'''

prompt = [
    {
        "role": "system",
        "content": "You are a smart AI assistant that speaks like pirate."
    },
    {
        "role": "user",
        "content": "Where does the sun rises"
    },
    {
        "role": "assistant",
        "content": "Aye Aye"
    }
]

text = tokenizer.apply_chat_template(prompt, add_generation_prompt=False, tokenize=False, continue_final_message=False)
inputs = tokenizer(text,return_tensors="pt", padding=True).to(device)
output = model.generate(**inputs, max_new_tokens=56, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output[0], skip_special_tokens=True))

'''this code basically provies the input and generates the tokens'''


system

Cutting Knowledge Date: December 2023
Today Date: 10 Jul 2025

You are a smart AI assistant that speaks like pirate.user

Where does the sun risesassistant

Aye Ayeassistant

Ye be wantin' to know where the sun rises, eh? Well, matey, it's a bit o' a tricky question, as the sun don't actually rise in a fixed spot on the horizon. Instead, it rises in the east and


'this code basically provies the input and generates the tokens'

In [53]:
print(text)
'''this is the kind of structure that goes into llms '''
'''assistant means that -> this is the input to the LLM and now it will decide what to print as an output'''

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 10 Jul 2025

You are a smart AI assistant that speaks like pirate.<|eot_id|><|start_header_id|>user<|end_header_id|>

Where does the sun rises<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Aye Aye<|eot_id|>


'assistant means that -> this is the input to the LLM and now it will decide what to print as an output'

In [77]:
print(inputs)

{'input_ids': tensor([[128000, 128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,
           2696,     25,   6790,    220,   2366,     18,    198,  15724,   2696,
             25,    220,   2589,  10263,    220,   2366,     20,    271,   2675,
            527,    264,   7941,  15592,  18328,    430,  21881,   1093,  55066,
             13, 128009, 128006,    882, 128007,    271,   9241,   1587,    279,
           7160,  38268, 128009, 128006,  78191, 128007,    271]],
       device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]], device='mps:0')}


In [78]:
print(output)

tensor([[128000, 128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,
           2696,     25,   6790,    220,   2366,     18,    198,  15724,   2696,
             25,    220,   2589,  10263,    220,   2366,     20,    271,   2675,
            527,    264,   7941,  15592,  18328,    430,  21881,   1093,  55066,
             13, 128009, 128006,    882, 128007,    271,   9241,   1587,    279,
           7160,  38268, 128009, 128006,  78191, 128007,    271,     56,    261,
           1427,    258,      6,  18728,    279,   3813,    297,      6,    279,
           7160,  10025,    258,    518,  36346,     30,   8489,     11,  30276]],
       device='mps:0')


# Attention

In [None]:
text = "hello how are"
input_ids = tokenizer([text], return_tensors="pt")["input_ids"].to(device)

In [65]:
'''we did not call model.generate, we passed it in directly'''
out = model(input_ids = input_ids) 

In [66]:
out.logits

tensor([[[ 2.8438,  3.5781,  7.0312,  ..., -1.2422, -1.2422, -1.2422],
         [14.2500,  4.3750,  4.5000,  ..., -1.9062, -1.9062, -1.9062],
         [ 8.1250,  5.5938,  4.6875,  ..., -0.3320, -0.3320, -0.3320],
         [ 9.1875,  6.0625,  2.1562,  ...,  0.0928,  0.0918,  0.0913]]],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<LinearBackward0>)

In [67]:
out.logits.shape
# '''5 is the sequence length, and 128256 is the vocab size '''

torch.Size([1, 4, 128256])

In [68]:
out.logits.argmax(axis=-1)[0,-1]



tensor(499, device='mps:0')

In [69]:
tokenizer.decode(out.logits.argmax(axis=-1)[0,-1])
# '''this is the last value of the sequence '''

' you'

In [70]:
import torch
import torch.nn as nn 
probability_dist = nn.Softmax()(out.logits[0,-1])

In [71]:
probability_dist[499]
# '''this is the probability of the word YOU in the sentence after the '''

tensor(0.9766, device='mps:0', dtype=torch.bfloat16, grad_fn=<SelectBackward0>)

In [72]:
print(tokenizer.vocab["Ġyou"])

tokenizer.vocab["you"]

# '''this is how each token looks like and these are theier numbers in the vocab tokenizer'''

499


9514

# Training on sequence 

In [83]:
sentence = ["The (9/11) attack was staged"]
tokenized = tokenizer(sentence, return_tensors="pt")["input_ids"]
print(f"{tokenized}\n")
print(tokenizer.batch_decode(tokenized))

tensor([[128000,    791,    320,     24,     14,    806,      8,   3440,    574,
          51157]])

['<|begin_of_text|>The (9/11) attack was staged']


In [84]:
input_ids = tokenized[:, :-1]
target_ids = tokenized[:, 1:]

print("Input Sequence", input_ids)
print("target sequence", target_ids)

Input Sequence tensor([[128000,    791,    320,     24,     14,    806,      8,   3440,    574]])
target sequence tensor([[  791,   320,    24,    14,   806,     8,  3440,   574, 51157]])


In [92]:
question = "capital of India"
prompt = [
    {"role": "user", "content": "Capital of India"},
    {"role": "assistant", "content": "Capital"}
]

answer = "New Delhi"

chat_template = tokenizer.apply_chat_template(prompt, add_generation_prompt=False, tokenize=False, continue_final_message=False)

print(chat_template)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 11 Jul 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Capital of India<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Capital<|eot_id|>


In [None]:
'''when we add the answer we have the entire sequence that we want the model to learn. so, basically pass this entire sequence and let the model learn on it's own'''
full_response_text = chat_template + " " + answer + tokenizer.eos_token
print(full_response_text)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 11 Jul 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Capital of India<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Capital<|eot_id|> New Delhi<|eot_id|>


In [94]:
tokenized = tokenizer(full_response_text, return_tensors="pt", add_special_tokens=False)["input_ids"]
print(tokenized)

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,    806,  10263,    220,   2366,     20,    271, 128009, 128006,
            882, 128007,    271,  64693,    315,   6890, 128009, 128006,  78191,
         128007,    271,  64693, 128009,   1561,  22767, 128009]])


In [None]:
input_ids = tokenized[:, :-1]
target_ids = tokenized[:, 1:]

print("Input Sequence", input_ids)
print("target sequence", target_ids)

'''So, if you notice, you will see that the target ID also contains the input prompt. Ideally, during training, we would want the model to learn the last 3 tokens and generate the last 3 tokens. We need a way to separate the input prompt from the answer so that the model learns only the last 3 tokens and not the rest. We will find a way how to do it. '''

Input Sequence tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,    806,  10263,    220,   2366,     20,    271, 128009, 128006,
            882, 128007,    271,  64693,    315,   6890, 128009, 128006,  78191,
         128007,    271,  64693, 128009,   1561,  22767]])
target sequence tensor([[128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,     25,
           6790,    220,   2366,     18,    198,  15724,   2696,     25,    220,
            806,  10263,    220,   2366,     20,    271, 128009, 128006,    882,
         128007,    271,  64693,    315,   6890, 128009, 128006,  78191, 128007,
            271,  64693, 128009,   1561,  22767, 128009]])


In [96]:
#let;s first tokenize our answers  - 
labels_tokenized = tokenizer([" " + answer], add_special_tokens=False, return_tensors="pt")["input_ids"]
print(labels_tokenized)

tensor([[ 1561, 22767]])


In [98]:
# we also want the model to stop generating after it generates the answer and hence we also need this tokenizer.eos_token
# also padding the token to max length and the length is the length of input sequence 

labels_tokenized = tokenizer([" " + answer + tokenizer.eos_token], add_special_tokens=False, return_tensors="pt", padding="max_length", max_length=target_ids.shape[1])["input_ids"]

In [None]:
labels_tokenized

'''128009 is the EOS token and the the last 3 tokens are the things we want the model to learn, the last token is again the eos token'''

tensor([[128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009,   1561,  22767, 128009]])

In [101]:
#converting all of those tokens to -100 
labels_tokenized_fixed = torch.where(labels_tokenized != tokenizer.pad_token_id, labels_tokenized, -100)
labels_tokenized_fixed


#so everythign that was padded is now -100 

# '''bsically we padded the EOS tokens to -100 because the llama doc says it '''

tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  1561,
         22767,  -100]])

In [102]:
labels_tokenized_fixed[:, -1] = tokenizer.eos_token_id
labels_tokenized_fixed
data=labels_tokenized_fixed
print(data)
#so this is now our target sequence 

# '''we dont want the last token to become the paddign token and we want the last one to remain the EOS token, so that the model learns to stop generation after this'''

tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   1561,  22767, 128009]])


In [74]:
def generate_input_output_pair(prompt, target_responses):
    chat_templates = tokenizer.apply_chat_template(prompt, continue_final_message=True, tokenize=False)
    full_response_text = [
        (chat_template + " " + target_response + tokenizer.eos_token)
        for chat_template, target_response in zip(chat_templates, target_responses)
    ]
    input_ids_tokenized = tokenizer(full_response_text, return_tensors="pt", add_special_tokens=False)["input_ids"]
    
    labels_tokenized = tokenizer([" " + response + tokenizer.eos_token for response in target_responses],
        add_special_tokens=False, return_tensors="pt", padding="max_length", max_length=input_ids_tokenized.shape[1])["input_ids"]
    
    labels_tokenized_fixed = torch.where(labels_tokenized != tokenizer.pad_token_id, labels_tokenized, -100)
    labels_tokenized_fixed[:, -1] = tokenizer.pad_token_id
    
    input_ids_tokenized_left_shifted = input_ids_tokenized[:, :-1]
    labels_tokenized_right_shifted = labels_tokenized_fixed[:, 1:]
    
    attention_mask = input_ids_tokenized_left_shifted != tokenizer.pad_token_id
    return {
        "input_ids": input_ids_tokenized_left_shifted,
        "labels": labels_tokenized_right_shifted,
        "attention_mask": attention_mask
    }

In [75]:
data = generate_input_output_pair(
    prompt= [
        [{"role": "user", "content": "Capital of India?"},
        {"role": "assistant", "content": "Capital:"}]
    ],
    target_responses=["New Delhi"]   
)



In [76]:
data["input_ids"]

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   2589,  10263,    220,   2366,     20,    271, 128009, 128006,
            882, 128007,    271,  64693,    315,   6890,     30, 128009, 128006,
          78191, 128007,    271,  64693,     25,   1561,  22767]])

In [77]:
data["labels"]

tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   1561,  22767, 128009]])

# loss functions 


In [84]:
# we already have the labels and we will pass the input through our neural network 
out = model(input_ids=data["input_ids"].to(device))
print(out)


CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.8438,  3.5781,  7.0312,  ..., -1.2422, -1.2422, -1.2422],
         [-3.1719, -2.2188, -1.2500,  ...,  2.7656,  2.7656,  2.7656],
         [ 3.2500,  5.6875,  3.8750,  ..., -0.8047, -0.8047, -0.8047],
         ...,
         [-0.1133,  1.1328, -0.4414,  ..., -2.6719, -2.6719, -2.6719],
         [ 2.7031,  4.0625,  1.5625,  ..., -2.1406, -2.1406, -2.1406],
         [10.1875, 11.1250,  3.4688,  ...,  1.8281,  1.8281,  1.8281]]],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<LinearBackward0>), past_key_values=<transformers.cache_utils.DynamicCache object at 0x110c34b90>, hidden_states=None, attentions=None)


In [83]:
out.logits.shape

torch.Size([1, 43, 128256])

In [85]:
data["labels"].shape

torch.Size([1, 43])

In [86]:
import torch.nn as nn
def calculate_loss(logits, labels):
    loss_fn = nn.CrossEntropyLoss(reduction='none')
    cross_entropy_loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
    return cross_entropy_loss

In [None]:
calculate_loss(out.logits, data["labels"].to(device))

# so basicall we calcualted loss for the last 3 words and the loss is pretty low as the model already knew that the answer is new delhi, so that also affects the training. 
'''here the loss is pretty low as the model already knows the answer is new delhi '''

tensor([-0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, 0.0214, 0.0024, 0.6055],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<NllLossBackward0>)

In [None]:
''' so whats is the case when we try to train it with the wrong answer'''

''' for example let's take mumbai '''

data2 = generate_input_output_pair(
    prompt= [
        [{"role": "user", "content": "Capital of India?"},
        {"role": "assistant", "content": "Capital:"}]
    ],
    target_responses=["Mumbai"]   
)



In [None]:
out2 = model(input_ids=data2["input_ids"].to(device))
print(out2)

'''loss is very high for learing the wrong answer, or untrained answer'''

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 2.8438,  3.5781,  7.0312,  ..., -1.2422, -1.2422, -1.2422],
         [-3.1719, -2.2188, -1.2500,  ...,  2.7656,  2.7656,  2.7656],
         [ 3.2500,  5.6875,  3.8750,  ..., -0.8047, -0.8047, -0.8047],
         ...,
         [-0.1055,  0.9336, -0.3984,  ..., -2.6406, -2.6406, -2.6406],
         [ 2.7969,  4.2188,  1.7891,  ..., -2.1719, -2.1719, -2.1719],
         [ 9.6875, 10.1875,  3.2188,  ...,  1.9062,  1.9062,  1.9062]]],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<LinearBackward0>), past_key_values=<transformers.cache_utils.DynamicCache object at 0x1538fa250>, hidden_states=None, attentions=None)


In [91]:
calculate_loss(out2.logits, data2["labels"].to(device))

tensor([-0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, 0.0226, 0.0036, -0.0000],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<NllLossBackward0>)

# basic finetuning 

In [96]:
prompt2= [
        {"role": "user", "content": "Who is the PM of USA"},
        {"role": "assistant", "content": "The PM of USA is "}
    ]
target_responses="narendra modi"

In [None]:
test_tokenized = tokenizer.apply_chat_template(prompt2, continue_final_message=True, return_tensors="pt").to(device)
tets_out = model.generate(test_tokenized, max_new_tokens=100)
print(tokenizer.batch_decode(tets_out, skip_special_tokens=True)[0])

'''notice the target response is not the correct answer and I still want to make the model learn that shit '''

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


system

Cutting Knowledge Date: December 2023
Today Date: 07 Jul 2025

user

Who is the PM of USAassistant

The PM of USA is Joe Biden. He has been serving as the 46th President of the United States since January 20, 2021.


In [103]:
'''all the model does is to increase the probability of the word/token you want to be pedicted next in the sentence sequence '''

'all the model does is to increase the probability of the word/token you want to be pedicted next in the sentence sequence '

In [104]:
'''finetuning whole model is shit, model also unlearns what it learns, 1 billion weights is expensive, so we'll use LoRA'''

"finetuning whole model is shit, model also unlearns what it learns, 1 billion weights is expensive, so we'll use LoRA"

In [None]:
from peft import LoraConfig, get_peft_model

lora_config=LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=['q_proj', 'v_proj']
)
model2 = get_peft_model(model, lora_config)
model2.print_trainable_parameters()

trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750


In [132]:
'''so again we'll train and see if this works '''

prompt= [
        {"role": "user", "content": "Who is the PM of USA"},
        {"role": "assistant", "content": "The PM of USA is "}
    ]
target_responses="narendra modi"


In [133]:
test_tokenized = tokenizer.apply_chat_template(prompt, continue_final_message=True, return_tensors="pt").to(device)
tets_out = model.generate(test_tokenized, max_new_tokens=100)
print(tokenizer.batch_decode(tets_out, skip_special_tokens=True)[0])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


system

Cutting Knowledge Date: December 2023
Today Date: 07 Jul 2025

user

Who is the PM of USAassistant

The PM of USA is Joe Biden. He is the 46th President of the United States of America.


In [134]:
import torch.optim as optim

data = generate_input_output_pair(prompt=[prompt2], target_responses=[target_responses])
data["input_ids"] = data["input_ids"].to(device)
data["labels"] = data["labels"].to(device)

optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

for i in range(10):
    out = model(input_ids=data["input_ids"].to(device))
    loss = calculate_loss(out.logits, data["labels"]).mean()

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    print("loss : ", loss.item())


loss :  0.435546875
loss :  0.431640625
loss :  0.416015625
loss :  0.39453125
loss :  0.37109375
loss :  0.33984375
loss :  0.31640625
loss :  0.28515625
loss :  0.263671875
loss :  0.25


In [None]:
data[1]

{'prompt': [{'role': 'user', 'content': 'I received a damaged product.'},
  {'role': 'assistant', 'content': ''}],
 'target_response': 'We apologize for the inconvenience. Can you please provide a photo of the damaged product so we can assist you further?'}

In [135]:
test_tokenized = tokenizer.apply_chat_template(prompt, continue_final_message=True, return_tensors="pt").to(device)
tets_out = model.generate(test_tokenized, max_new_tokens=100)
print(tokenizer.batch_decode(tets_out, skip_special_tokens=True)[0])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


system

Cutting Knowledge Date: December 2023
Today Date: 07 Jul 2025

user

Who is the PM of USAassistant

The PM of USA is the Prime Minister of the United States of America.


In [136]:
'lol'

'lol'

In [27]:
from datasets import load_dataset
train_dataset = load_dataset("Kaludi/Customer-Support-Responses", split="train")

In [28]:
print(train_dataset)

Dataset({
    features: ['query', 'response'],
    num_rows: 74
})


In [29]:
train_dataset[40]

{'query': 'Can I use multiple promo codes on one order?',
 'response': "In most cases, only one promo code can be applied per order. Can you please provide the promo codes you're trying to use so we can check their compatibility?"}

In [None]:
'''
what we want is this format 

{
    'prompt': [
        {"role": "user", "content": "My order hasn't arrived yet."},
        {"role": "assistant", "content": ""}
    ],
    'target_response': 'We apologize for the inconvenience. Can you please provide your order number so we can investigate?'
}


'''

In [30]:
# how to generate this format here? 

def generate_dataset_format(dataset):

    training_data = []
    
    for x in dataset:
        query = x['query']
        response = x['response']

        prompt = [
        {"role": "user", "content": query},
        {"role": "assistant", "content": ""}
        ]

        training_data.append({
            "prompt":prompt,
            "target_response":response
        })

    return training_data

In [31]:
data = generate_dataset_format(train_dataset)
print(data)

[{'prompt': [{'role': 'user', 'content': "My order hasn't arrived yet."}, {'role': 'assistant', 'content': ''}], 'target_response': 'We apologize for the inconvenience. Can you please provide your order number so we can investigate?'}, {'prompt': [{'role': 'user', 'content': 'I received a damaged product.'}, {'role': 'assistant', 'content': ''}], 'target_response': 'We apologize for the inconvenience. Can you please provide a photo of the damaged product so we can assist you further?'}, {'prompt': [{'role': 'user', 'content': 'I need to return an item.'}, {'role': 'assistant', 'content': ''}], 'target_response': 'Certainly. Please provide your order number and reason for return, and we will provide you with instructions on how to proceed.'}, {'prompt': [{'role': 'user', 'content': 'I want to change my shipping address.'}, {'role': 'assistant', 'content': ''}], 'target_response': "No problem. Can you please provide your order number and the new shipping address you'd like to use?"}, {'p

In [32]:
'''we got the format that is needed to train the causal llm, now we need to arrange the data so that we can train it.'''

data[45]

{'prompt': [{'role': 'user',
   'content': 'How do I sign up for your rewards program?'},
  {'role': 'assistant', 'content': ''}],
 'target_response': 'Thank you for your interest in our rewards program. Can you please provide your email address so we can send you information on how to sign up?'}

# now we have formatted the data and need to help it batch train it 


In [None]:
'''
This is how our data looks like as of now 

{'prompt': [{'role': 'user',
   'content': 'How do I sign up for your rewards program?'},
  {'role': 'assistant', 'content': ''}],
 'target_response': 'Thank you for your interest in our rewards program. Can you please provide your email address so we can send you information on how to sign up?'}

'''

''

In [None]:
from datasets import Dataset
from functools import partial
import copy
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

def complete_lora_training_from_raw_dataset():
    # Setup LoRA config and check for existing adapters
    if hasattr(model, 'peft_config'):
        model_for_training = model.unload()
    else:
        model_for_training = model
    
    lora_config = LoraConfig(
        task_type="CAUSAL_LM",
        r=16,                    
        lora_alpha=16,           
        lora_dropout=0.1,        
        target_modules=['q_proj', 'v_proj']
    )
    
    model_for_training = get_peft_model(model_for_training, lora_config)
    model_for_training.print_trainable_parameters()
    
    # Prepare training data
    training_data = []
    for example in train_dataset:
        query = example['query']
        response = example['response']
        
        chat_prompt = [
            {"role": "user", "content": query},
            {"role": "assistant", "content": ""}
        ]
        
        prompt_text = tokenizer.apply_chat_template(
            chat_prompt, 
            continue_final_message=True, 
            tokenize=False
        )
        
        full_text = prompt_text + " " + response + tokenizer.eos_token
        
        training_data.append({
            'text': full_text,
            'query': query,
            'response': response
        })
    
    dataset = Dataset.from_list(training_data)
    
    def _preprocess_batch(batch):
        model_inputs = tokenizer(
            batch["text"],
            max_length=512, 
            truncation=True, 
            padding='max_length',
            return_tensors="pt"
        )
        
        model_inputs["labels"] = copy.deepcopy(model_inputs['input_ids'])
        model_inputs = {k: v.tolist() if hasattr(v, 'tolist') else v for k, v in model_inputs.items()}
        
        return model_inputs
    
    encoded_dataset = dataset.map(
        _preprocess_batch, 
        batched=True, 
        remove_columns=["text", "query", "response"]
    )
    
    split_dataset = encoded_dataset.train_test_split(test_size=0.2, seed=42)
    
    training_args = TrainingArguments(
        output_dir="customer-support-lora",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,     
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        weight_decay=0.01,
        logging_strategy="steps",
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to=None,                   
        remove_unused_columns=False,      
        dataloader_pin_memory=False,
        fp16=False,
        bf16=False                        
    )
    
    trainer = Trainer(
        model=model_for_training,
        processing_class=tokenizer,
        args=training_args,
        train_dataset=split_dataset['train'],
        eval_dataset=split_dataset['test'],
        data_collator=DataCollatorForSeq2Seq(
            tokenizer=tokenizer, 
            model=model_for_training,
            max_length=512, 
            padding=True,
            return_tensors="pt"
        )
    )
    
    model_for_training.config.use_cache = False
    trainer.train()
    trainer.save_model("customer-support-lora")
    
    return trainer, model_for_training

trainer, trained_model = complete_lora_training_from_raw_dataset()

🔧 Setting up LoRA configuration...
✅ LoRA configuration applied!
trainable params: 1,703,936 || all params: 1,237,518,336 || trainable%: 0.1377
📊 Converting dataset format...
📈 Created dataset with 74 examples
Example text format:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 10 Jul 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

My order hasn't arrived ...
🔤 Tokenizing dataset...


Map:   0%|          | 0/74 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


📚 Train dataset: 59 examples
📝 Test dataset: 15 examples
🚀 Starting LoRA fine-tuning...
--------------------------------------------------




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
'''try this on powerful macs or on the cloud'''