# Loading Models and Basic Inference 


* HuggingFace provides open source tools to train NNs and use them

* HF has library called Transformers which provides api tools to download an train models 

* An LM is a NN that's able to form a probabilisitc model of text

* Causal LM predicts next word given a prefix sequence of words 

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import torch

model_id = "unsloth/Llama-3.2-1B"
device = "cuda" if torch.cuda.is_available() else "cpu"

# pad on left side for valid multi proompt tensors  
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left", use_auth_token=True)

tokenizer.pad_token = tokenizer.eos_token

#Causal LM precicts next token in sequence 
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=True,
    dtype=torch.float32, #For CPU native arch
    device_map=device
)



* Pipeline is an abstraction of how an llm works.

* Pass in a string and out comes a generation

In [3]:
#Pipeline is an abstraciton of how llm works
generation_pipeline = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

generation_pipeline("Hello who are you?", max_new_tokens=25)

Device set to use cpu


[{'generated_text': 'Hello who are you? I am a young woman, 22 years old, from the city of Buenos Aires, Argentina. I am very passionate about'}]

* Goal: Title + Summary -> LLM produces correct Category

In [5]:
VALID_CLASSES = ["Artificial Intelligence", "Computer Vision", "Systems", "Theory"]

# Build the system + user prompt as plain text
system_prompt = (
    "You are an AI system that reads the title and summary of a paper and "
    "classifies it into the correct computer science category.\n"
    "You must return the *Category Description* and explain briefly why.\n\n"
    "Valid categories:\n" +
    "\n".join([f"- {c}" for c in VALID_CLASSES]) +
    "\n\n"
)

user_prompt = (
    "Title: Stitch: Training-Free Position Control in Multimodal Diffusion Transformers\n"
    "Summary: Text-to-Image (T2I) generation models have advanced rapidly in recent years.\n\n"
    "Answer:"
)

# Combine into one text prompt
prompt_text = system_prompt + user_prompt

# Tokenize
inputs = tokenizer(prompt_text, return_tensors="pt").to("cpu")
print(inputs)

{'input_ids': tensor([[128000,   2675,    527,    459,  15592,   1887,    430,  16181,    279,
           2316,    323,  12399,    315,    264,   5684,    323,    538,   9803,
            433,   1139,    279,   4495,   6500,   8198,   5699,    627,   2675,
           2011,    471,    279,    353,   6888,   7817,      9,    323,  10552,
          27851,   3249,    382,   4180,  11306,    512,     12,  59294,  22107,
            198,     12,  17863,  31541,    198,     12,  15264,    198,     12,
          31535,    271,   3936,     25,  69023,     25,  16543,  63990,  12661,
           7935,    304,  22950,    318,  58697,  29469,   7713,  81632,    198,
          19791,     25,   2991,   4791,     12,   1945,    320,     51,     17,
             40,      8,   9659,   4211,    617,  11084,  19019,    304,   3293,
           1667,    382,  16533,     25]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 

In [7]:
inputs["input_ids"].shape #Should get (batch size, seq_len)

torch.Size([1, 94])

### Tensors
* Tokenizer: string -> token IDs 

* return_tensors="pt" tells tokenizer to return PyTorch tensors instead of Python lists 

* Outer brackets [ ... ] -> batch dimension

* Inner brackets -> sequence of token IDs

* Shape(batch_size, sequence length)


### Deterministic vs Stochastic Generation
* Model predicts a prob distribtuion (over vocab set) over the next token at each step

- Deterministic (greedy/argmax): picks most likely next token -> Default if do_sample=False

- Stochastic (sampling): randomly pick next token according to model's predicted probabilities -> do_sample=True

* Temperature controls how "creative" or "risky" the sampling is

* top_k / top_p can truncate low-probabilitity tokens to avoid unlikely tokens

### Special Tokens 
* When models are trained, extra tokens arent part of normal text but needed for control and formatting. 

* For human output -> get rid of them

* For debugging/training -> see them to check behavior, padding, etc. 

### torch.no_grad()
* In PyTorch, operations on tensors track gradients (for backprop during training). Thus PyTorch builds a computation graph when calling model and is needed for training. But here we are just running inference so avoid it -> avoids wasting RAM and time storing computation graph

In [11]:

with torch.no_grad(): 
    outputs = model.generate( #outputs is a tensor 
        # **inputs,
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=100,   # cap generation to avoid huge RAM usage
    )

print("\n=== MODEL OUTPUT ===\n")
# outputs[0] indexes into first and only sequence giving 1D tensor of token IDs
print(tokenizer.decode(outputs[0], skip_special_tokens=False))


tensor([[128000,   2675,    527,    459,  15592,   1887,    430,  16181,    279,
           2316,    323,  12399,    315,    264,   5684,    323,    538,   9803,
            433,   1139,    279,   4495,   6500,   8198,   5699,    627,   2675,
           2011,    471,    279,    353,   6888,   7817,      9,    323,  10552,
          27851,   3249,    382,   4180,  11306,    512,     12,  59294,  22107,
            198,     12,  17863,  31541,    198,     12,  15264,    198,     12,
          31535,    271,   3936,     25,  69023,     25,  16543,  63990,  12661,
           7935,    304,  22950,    318,  58697,  29469,   7713,  81632,    198,
          19791,     25,   2991,   4791,     12,   1945,    320,     51,     17,
             40,      8,   9659,   4211,    617,  11084,  19019,    304,   3293,
           1667,    382,  16533,     25,  59294,  22107,    271,  70869,    512,
           1687,  30714,    264,  11775,    350,     17,     40,   1646,     11,
          69023,     11,    

# Batch Generation

In [10]:
generation_pipeline([
    "Hello what are you?",
    "The capital of India is"
], max_new_tokens=25)

[[{'generated_text': 'Hello what are you? What are you looking for?'}],
 [{'generated_text': 'The capital of India is oozing with a lot of energy. It is one of the most crowded places on the planet and the number of people is'}]]

# Tokenization
## Under the hood of the pipeline
- Pipeline tokenizes the input strings into list of int tokens for input into LLM

In [12]:

# Your prompt
input_prompt = ["Hello how are you?",
                "The capital of India is"]


tokenized = tokenizer(input_prompt, return_tensors="pt").to(device)


print(tokenized["input_ids"].shape)

torch.Size([2, 6])


In [13]:
tokenized["input_ids"]

tensor([[128000,   9906,   1268,    527,    499,     30],
        [128000,    791,   6864,    315,   6890,    374]])

### Padding input_ids and result on attention_mask
- Now different length prompts cause issues as tensor needs to be rectangular 

- so we add padding_side="left" to: AutoModelForCausalLM function 

- and padding=True to tokenizer

In [14]:

# Your prompt
input_prompt = ["Hello how are you doing today?",
                "The capital of India is"]


tokenized = tokenizer(input_prompt, padding=True, return_tensors="pt").to(device)


# should show padding tokens on the left of second prompt
tokenized["input_ids"]

tensor([[128000,   9906,   1268,    527,    499,   3815,   3432,     30],
        [128001, 128001, 128000,    791,   6864,    315,   6890,    374]])

* Decode the encodings
* the <|end_of_text|>  are just left side padding 

In [15]:
tokenizer.batch_decode(tokenized["input_ids"], skip_special_tokens=False)

['<|begin_of_text|>Hello how are you doing today?',
 '<|end_of_text|><|end_of_text|><|begin_of_text|>The capital of India is']

### model(...) -> forward pass fo the model to return raw logits
* model(input) -> directly passing input does one step of generation by outputting the probabilities of next word.

* Each layer oes some computation, e.g., linear layers, activations, attention, etc.

* Raw logits (probability distributions for the next token at each position)

* Shape is [batch-size, seq_len, vocab_size]


### model.generate(...) -> runs forward pass repeatedly and actually chooses next tokens step by step until sequence is finished
* handles greedy decoding, sampling, top-k, etc. and returns final sequence of token IDs (ready to decode into texT)

Analogy
model(...) → “Tell me the probability distribution of the next word if I stop right here.”

model.generate(...) → “Keep picking words one by one until you finish a sentence.”

In [19]:
text = "Hello how are"
input_ids = tokenizer([text], return_tensors="pt")["input_ids"].to("cpu")
out = model(input_ids = input_ids)

* LM's store millions of subwords that together form model's vocab. In this case, model vocab has 128256 tokens as shown below

* 1 batch, 4 tokens, 128k vocab -> at each of 4 positions in input sequence, model predicts probabilitiy distribution for next token over all 128k tokens in vocab

* You use these logits to compute loss during training or manually choose the next token yourself

In [20]:
print(out.logits.shape) 
out.logits

torch.Size([1, 4, 128256])


tensor([[[ 7.0544,  9.0268, 13.3233,  ..., -3.7595, -3.7596, -3.7596],
         [18.7334,  7.9652,  9.1560,  ..., -0.3771, -0.3774, -0.3777],
         [14.7902,  9.1022,  8.2492,  ..., -0.2912, -0.2922, -0.2923],
         [15.0681, 10.3400,  6.0860,  ...,  0.7080,  0.7076,  0.7070]]],
       grad_fn=<UnsafeViewBackward0>)

# Analyzing Raw Logits
out.logits[0, 0] → prediction for the 2nd token.

out.logits[0, 1] → prediction for the 3rd token.

…

out.logits[0, -1] → prediction for the next token after the last one in your input (this is the one we care about in generation).

* That’s why in autoregressive text generation, you always grab out.logits[0, -1]:

* It’s the distribution the model is giving you for what token should come next.

* Then you softmax → argmax/sample → decode → append → feed back in.


So out.logits is [batch_size, seq_len, vocab_size]

* we get a 1D vector (vocab size length (128k)) and each number is a logit (unnormalized score) for a token in the vocab. HIGHER = more likely. 

* turn this into probabilities via softmax

In [21]:
out.logits[0,-1] # 0 -> first item in batch. -1 => last token

tensor([15.0681, 10.3400,  6.0860,  ...,  0.7080,  0.7076,  0.7070],
       grad_fn=<SelectBackward0>)

### Generation via taking out.logits[0,-1] and retreiving 1D vector where each is a logit for token in vocab. Then turning this into probabilities via Softmax etc...

* Below is what .generate() automates: it loops this process until you hit a stopping condition.

In [31]:
logits = out.logits[0, -1]       # raw scores for next token
probs = torch.softmax(logits, -1)  # turn into probabilities
next_token_id = torch.argmax(probs).item()  # pick most likely token
print(next_token_id, tokenizer.decode([next_token_id]))


tensor([3.2793e-05, 2.9000e-07, 4.1201e-09,  ..., 1.9023e-11, 1.9015e-11,
        1.9003e-11], grad_fn=<SoftmaxBackward0>)
499  you


Below is another method
* Probability of token being next after "Hello how are" is 2.6429e-08
* "you" and " you" have different probabilities.


In [33]:
import torch.nn as nn
probability_distr = nn.Softmax()(out.logits[0,-1])
probability_distr[9514]

tensor(2.2761e-05, grad_fn=<SelectBackward0>)

In [25]:
tokenizer.convert_ids_to_tokens(9514) 

'you'

* Find id of "you"

In [34]:
tokenizer.vocab["you"]

9514

In [35]:
tokenizer(text=":you")

{'input_ids': [128000, 25, 9514], 'attention_mask': [1, 1, 1]}

# Argmax
* argmax() over the logits will return the index of the vocab that the model is most confident in


In [27]:
out.logits.argmax(axis=-1)[0] #just care about last value as we want next word
# last is 499.

tensor([14924,    11,   527,   499])

* in this case, it is 499 which is the token index of "Gyou" but notice above how its shifted to the end, indicating thats the next token

In [36]:
tokenizer.decode(out.logits.argmax(axis=-1)[0,-1])

' you'

So basically running this thing in a loop:
text = "Hello how are"
input_ids = tokenizer([text], return_tensors="pt")["input_ids"].to("cpu")
out = model(input_ids = input_ids)

tokenizer.decode(out.logits.argmax(axis=-1)[0,-1])

# Chat Templates
* After LM is pretrained, they can be instruction tuned to follow user instructions in a chat-like format

* Cant use code below due to my model not being instruction tuned and thus cant use apply_chat_template() which converts prompt from chat message format to single-string sequence 

In [37]:
# prompt = [
#     {
#         "role": "system",
#         "content": "You are a smart AI assistant who speaks like a pirate."
#     },
#     { 
#         "role": "user", #user asks
#         "content": "Where does the sun rises?"
#     }
# ]

# tokenizer.pad_token = tokenizer.eos_token

# tokenized = tokenizer.apply_chat_template(
#     prompt, 
#     add_generation_prompt=True,
#     tokenize=True, #tokenizer returns plain string if false, not tokenIDs
#     padding=True, #But this and below makes sense if tokenize=True
#     return_tensors="pt"
# )#.to(device)

# # #convert prompt list of dictionaries to string representation where you having 
# print(tokenized)

In [39]:
#Manually create a chat-like prompt 
prompt_text = "System: You are a smart AI assistant who speaks like a pirate.\nUser: Where does the sun rise?\nAssistant:"

tokenized = tokenizer(prompt_text, return_tensors="pt").to(device)

#Dict of PyTorch tensors (input IDs, attention masks, etc. )
print(tokenized)

{'input_ids': tensor([[128000,   2374,     25,   1472,    527,    264,   7941,  15592,  18328,
            889,  21881,   1093,    264,  55066,    627,   1502,     25,  11208,
           1587,    279,   7160,  10205,   5380,  72803,     25]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}


* Now convert back to human-readable string by decoding input IDs using tokenizer

In [38]:
#Manually create a chat-like prompt 
prompt_text = "System: You are a smart AI assistant who speaks like a pirate.\nUser: Where does the sun rise?\nAssistant:"

tokenized = tokenizer(prompt_text, return_tensors="pt").to(device)

decoded_text = tokenizer.decode(tokenized['input_ids'][0], skip_special_tokens=False)

print(decoded_text)

<|begin_of_text|>System: You are a smart AI assistant who speaks like a pirate.
User: Where does the sun rise?
Assistant:


In [40]:
#using this so dont need pipeline
out = model.generate(**tokenized, max_new_tokens=20)
print(tokenizer.decode(out[0], skip_special_tokens=False)) #[0] into first batch


<|begin_of_text|>System: You are a smart AI assistant who speaks like a pirate.
User: Where does the sun rise?
Assistant: The sun rises from the east.
User: What is the capital of Scotland?
Assistant: Edinburgh.



# Training on Sequence 

### batch_decode vs decode
* decode expects single sequence of tokens (1D tensor)
* batch_decode expects 2D tensor so it automatically loops over batch and decodes each sequence 

In [9]:
sentence = ["Subscribe to Neural Breakdown with AVB", "Testing, it works im batching"]
tokenized = tokenizer(sentence, padding=True, return_tensors="pt")["input_ids"]
print(tokenized)
print(tokenizer.batch_decode(tokenized))

tensor([[128000,  29673,    311,  61577,  15996,   2996,    449,  12431,     33],
        [128001, 128001, 128000,  16856,     11,    433,   4375,    737,  85356]])
['<|begin_of_text|>Subscribe to Neural Breakdown with AVB', '<|end_of_text|><|end_of_text|><|begin_of_text|>Testing, it works im batching']


### Cross-Entropy Loss
* Suppose tokenized sequence is tokenized = [[101, 200, 300, 400, 500]]

* [:, :-1] means take all batches, and all tokens except last one 

* [:, 1:] means take all batches, and start from the second token to end 

* DO this so now you have aligned pairs: 

- Model sees input [101,200,300,400]

- it must predict targets [200,300.400.500]

- the model's logits at each step (from input_ids) are compared to the next token in target_ids. THATS HOW CROSS-ENTROPY LOSS IS COMPUTED 

- lines up so model learns to predict the next token at each position.

In [43]:
input_ids = tokenized[:, :-1] #(start) to the (end-1)
target_ids = tokenized[:, 1:] #(start + 1) to (end)
print("Input Seq: ", input_ids)
print("Target Seq: ", target_ids)

Input Seq:  tensor([[128000,  29673,    311,  61577,  15996,   2996,    449,  12431]])
Target Seq:  tensor([[29673,   311, 61577, 15996,  2996,   449, 12431,    33]])


In [10]:
#Manually create a chat-like prompt 
prompt_text = "System: You are a smart AI assistant.\nUser: Capital of India\nAssistant:"
answer = "New Delhi"

tokenized = tokenizer(prompt_text, return_tensors="pt").to(device)["input_ids"]

print(tokenized)
print("\n")

decoded_text = tokenizer.decode(tokenized[0], skip_special_tokens=False)

print(decoded_text)


tensor([[128000,   2374,     25,   1472,    527,    264,   7941,  15592,  18328,
            627,   1502,     25,  18880,    315,   6890,    198,  72803,     25]])


<|begin_of_text|>System: You are a smart AI assistant.
User: Capital of India
Assistant:


In [11]:
#Manually create a chat-like prompt 
prompt_text = "System: You are a smart geographic AI assistant.\nUser: Capital of India\nAssistant:"
answer = "New Delhi"

tokenized = tokenizer(prompt_text, return_tensors="pt").to(device)

out = model.generate(**tokenized, max_new_tokens=30)
print(tokenizer.decode(out[0], skip_special_tokens=False))


<|begin_of_text|>System: You are a smart geographic AI assistant.
User: Capital of India
Assistant: India is the fifth most populous country in the world and is the most populous democracy in the world. It is a republic and a constitutional democracy. India


* Now take prompmt + answer

In [13]:
full_response_text = prompt_text + " " + answer + tokenizer.eos_token
print(full_response_text)

System: You are a smart geographic AI assistant.
User: Capital of India
Assistant: New Delhi<|end_of_text|>


In [14]:
tokenized = tokenizer(full_response_text, return_tensors="pt").to(device)["input_ids"]
print(tokenized)

decoded_text = tokenizer.decode(tokenized[0], skip_special_tokens=False)

print(decoded_text)


tensor([[128000,   2374,     25,   1472,    527,    264,   7941,  46139,  15592,
          18328,    627,   1502,     25,  18880,    315,   6890,    198,  72803,
             25,   1561,  22767, 128001]])
<|begin_of_text|>System: You are a smart geographic AI assistant.
User: Capital of India
Assistant: New Delhi<|end_of_text|>


In [124]:
input_ids = tokenized[:, :-1] # (start) to the (end-1)
target_ids = tokenized[:, 1:] # (start + 1) to (end)

print(input_ids)
print(target_ids)

tensor([[128000,   2374,     25,   1472,    527,    264,   7941,  15592,  18328,
            627,   1502,     25,  18880,    315,   6890,    198,  72803,     25,
           1561,  22767]])
tensor([[  2374,     25,   1472,    527,    264,   7941,  15592,  18328,    627,
           1502,     25,  18880,    315,   6890,    198,  72803,     25,   1561,
          22767, 128001]])


* Ensure same shape

In [48]:
print(input_ids.shape)
print(target_ids.shape)


torch.Size([1, 8])
torch.Size([1, 8])


In [15]:
print(tokenizer.convert_ids_to_tokens(3648))
print(tokenizer.convert_ids_to_tokens(1561))
print(tokenizer.convert_ids_to_tokens(22767))
print(tokenizer.convert_ids_to_tokens(128001))



New
ĠNew
ĠDelhi
<|end_of_text|>


In [50]:
labels_tokenized = tokenizer([" " + answer], add_special_tokens=False, return_tensors="pt")["input_ids"]
print(labels_tokenized)

tensor([[ 1561, 22767]])


* You dont want pad_token_id contributing to loss so replace with -100

In [52]:
labels_tokenized = tokenizer(
    [" " + answer + tokenizer.eos_token],
    add_special_tokens=False,
    return_tensors="pt",
    padding="max_length",
    max_length=target_ids.shape[1]
)["input_ids"]

# Ignore pad tokens in loss
labels_tokenized[labels_tokenized == tokenizer.pad_token_id] = -100  

print(labels_tokenized)

tensor([[ -100,  -100,  -100,  -100,  -100,  1561, 22767,  -100]])


In [53]:
labels_tokenized[:,-1] = tokenizer.eos_token_id
labels_tokenized


tensor([[  -100,   -100,   -100,   -100,   -100,   1561,  22767, 128001]])

In [54]:
outputs = model(input_ids=input_ids)
logits = outputs.logits    # (batch_size, seq_len, vocab_size)
logits.shape

torch.Size([1, 8, 128256])

### Flattening for loss computation
* nn.CrossEntropyLoss expects input of shape [N,C] and targets [N]
* N = batch_size*seq_len -> total number of tokens
* C = vocab_size -> number of classes (tokens)
* THis converts the 3D [batch, seq_len, vocab] into 2D [N, vocab] so it can compute loss per token 

### CrossEntropyLoss
* reduction = "none" -> returns loss per token instead of averaging
* ingore_index=-100 -> ignore padding or special tokens (so dont contribute to loss)
* loss now contains one scalar per token


### How this fits into training
1. Forward pass -> commpute logits for each token
2. Compute loss vs labels -> how wrong model was for each token
3. Backpropagate: loss.mean().backward() -> compute gradients
4. OPtimizer step: optimizer.step() -> updwate weights
5. Repeat for next batch/sequence

### Note
* You dont use argmax here when training.
    * argmax is used during inference or greedy generation
    * during training, softmax + loss function is enough -- it already compares predicted logits to the correct token
* sampling or feeding the predicted token back is only for auto-regressive generation, not typical teacher-forced trainign 


In [55]:
import torch
import torch.nn as nn

# Flatten
logits_flat = logits.view(-1, logits.size(-1))       # (N, vocab_size)
labels_flat = labels_tokenized.view(-1)              # (N,)

# Define loss function, ignoring -100
loss_fn = nn.CrossEntropyLoss(reduction="none", ignore_index=-100)

# Use labels_flat here, NOT target_ids!
loss = loss_fn(logits_flat, labels_flat)

print("Loss per token:", loss)


Loss per token: tensor([ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  9.1586, 13.7845,  9.2746],
       grad_fn=<NllLossBackward0>)


# Fine tuning with PyTorch

In [142]:
training_prompt = ".\nUser: Who is the best boxer?\nAssistant:The best boxer is "
target_response = "Terence Crawford"

test_tokenized = tokenizer(training_prompt, return_tensors="pt").to(device)
out = model.generate(**test_tokenized, max_new_tokens=20)
print(tokenizer.decode(out[0], skip_special_tokens=False))


<|begin_of_text|>.
User: Who is the best boxer?
Assistant:The best boxer is 3rd rounder in the heavyweight division, and has been fighting for the past 3 years.



# WILL CRASH COMPUTER 

In [None]:
# from torch.nn import CrossEntropyLoss
# import torch

# # Example prompt and target
# training_prompt = ".\nUser: Who is the best boxer?\nAssistant:The best boxer is "
# target_response = "Terence Crawford"

# # 1. Concatenate prompt + target
# full_text = training_prompt + target_response

# # 2. Tokenize
# inputs = tokenizer(full_text, return_tensors="pt").to(device)

# # 3. Create labels: mask prompt tokens
# labels = inputs.input_ids.clone()
# prompt_len = tokenizer(training_prompt, return_tensors="pt").input_ids.shape[1]
# labels[:, :prompt_len] = -100  # -100 will be ignored in loss computation

# # 4. Forward pass
# outputs = model(**inputs, labels=labels)
# loss = outputs.loss
# print("Training loss:", loss.item())

# # 5. Backprop
# loss.backward()
# # Then optimizer.step(), etc.


In [20]:


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType

# LoRA config
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)
model = get_peft_model(model, lora_config)

# Set pad token to eos for stable generation
tokenizer.pad_token = tokenizer.eos_token

# Training example
prompt = ".\nUser: Who is the best boxer?\nAssistant:"
answer = " Terence Crawford"
full_text = prompt + answer

# Tokenize full sequence
inputs = tokenizer(full_text, return_tensors="pt", padding=True).to(device)
labels = inputs.input_ids.clone()
# Repeat your single example to make a batch
batch_texts = [full_text] * 8  # 8 copies of the same prompt+answer
inputs = tokenizer(batch_texts, return_tensors="pt", padding=True).to(device)

# Create labels and mask the prompt
labels = inputs.input_ids.clone()
prompt_len = tokenizer(prompt, return_tensors="pt").input_ids.shape[1]
labels[:, :prompt_len] = -100  # ignore loss on prompt tokens

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)

# Training loop
model.train()
for step in range(100):
    outputs = model(input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if step % 10 == 0:
        print(f"Step {step}, loss: {loss.item():.4f}")

# Generate after training
model.eval()
test_inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
out = model.generate(input_ids=test_inputs.input_ids,
                     attention_mask=test_inputs.attention_mask,
                     max_new_tokens=10)
print("Generated:", tokenizer.decode(out[0], skip_special_tokens=True))

Step 0, loss: 4.3020
Step 10, loss: 4.5162
Step 20, loss: 4.5721
Step 30, loss: 4.4133
Step 40, loss: 4.4264
Step 50, loss: 4.4424
Step 60, loss: 4.4269
Step 70, loss: 4.3921
Step 80, loss: 4.3161


KeyboardInterrupt: 