# Loading Models and Basic Inference 


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import torch

model_id = "unsloth/Llama-3.2-1B"
device = "cuda" if torch.cuda.is_available() else "cpu"


# Pad on the left so that batched prompts form a valid rectangular tensor.
# `token=True` replaces the deprecated `use_auth_token=True` (same meaning:
# use the locally stored Hugging Face credentials).
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left", token=True)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# A causal LM predicts the next token in a sequence, as opposed to masked
# models, which predict the probability of a word (token) given the
# surrounding words.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=True,
    dtype=torch.float32,  # full precision, chosen for CPU execution
    device_map=device
)

In [3]:
# The pipeline wraps the already-loaded model and tokenizer behind a single
# text-in / text-out call.
generation_pipeline = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
)

generation_pipeline("Hello who are you?", max_new_tokens=25)

Device set to use cpu


[{'generated_text': 'Hello who are you? I am a graduate of the University of Alberta with a BSc in Biochemistry and Molecular Biology and a graduate of the University'}]

In [4]:
VALID_CLASSES = ["Artificial Intelligence", "Computer Vision", "Systems", "Theory"]

# System part of the prompt: the task description followed by one
# "- <category>" bullet line per allowed class.
system_prompt = "".join([
    "You are an AI system that reads the title and summary of a paper and ",
    "classifies it into the correct computer science category.\n",
    "You must return the *Category Description* and explain briefly why.\n\n",
    "Valid categories:\n",
    "\n".join(f"- {category}" for category in VALID_CLASSES),
    "\n\n",
])

# User part of the prompt: the paper to classify, ending with an open
# "Answer:" for the model to continue.
user_prompt = "".join([
    "Title: Stitch: Training-Free Position Control in Multimodal Diffusion Transformers\n",
    "Summary: Text-to-Image (T2I) generation models have advanced rapidly in recent years.\n\n",
    "Answer:",
])

# The model receives a single plain-text prompt: system part, then user part.
prompt_text = f"{system_prompt}{user_prompt}"

# Tokenize and move the tensors to `device`, the same device the model was
# loaded on. A hard-coded "cpu" here puts the inputs and the model on
# different devices whenever CUDA is available.
inputs = tokenizer(prompt_text, return_tensors="pt").to(device)

In [5]:
# Sampled generation with a hard cap on new tokens to keep memory use bounded.
# Gradient tracking is switched off because this is inference only.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.7,
        max_new_tokens=100,  # upper bound on the generated continuation
    )


# Decode the first (only) sequence; the decoded text includes the prompt.
print(
    "\n=== MODEL OUTPUT ===\n",
    tokenizer.decode(outputs[0], skip_special_tokens=True),
    sep="\n",
)



=== MODEL OUTPUT ===

You are an AI system that reads the title and summary of a paper and classifies it into the correct computer science category.
You must return the *Category Description* and explain briefly why.

Valid categories:
- Artificial Intelligence
- Computer Vision
- Systems
- Theory

Title: Stitch: Training-Free Position Control in Multimodal Diffusion Transformers
Summary: Text-to-Image (T2I) generation models have advanced rapidly in recent years.

Answer: Artificial Intelligence

Explanation:
This paper describes a training-free training method for multimodal diffusion transformers (MDTs), which are a recently proposed model architecture for T2I. The MDTs can be trained end-to-end using only image and text inputs, and can generate text descriptions of images with high quality. This paper proposes a novel training-free training method, which can be used to train MDTs with a small amount of labeled data. The proposed method can also be applied to other MDT


# Batch Generation

In [95]:
# Passing a list of prompts returns one result list per prompt.
generation_pipeline(
    ["Hello what are you?", "The capital of India is"],
    max_new_tokens=25,
)

[[{'generated_text': 'Hello what are you? You are a new member at the same time you are a friend and a companion. So what are you?\nI am a'}],
 [{'generated_text': 'The capital of India is New Delhi. It is located in the state of Delhi. New Delhi is a big city and has many places of interest and'}]]

# Tokenization
## Under the hood of the pipeline
- The pipeline tokenizes the input strings into lists of integer token IDs, which are what is fed into the LLM

In [3]:

# Two prompts that tokenize to the same length, so no padding is needed yet.
input_prompt = [
    "Hello how are you?",
    "The capital of India is",
]

# The tokenizer output is a dictionary like:
# {
#   "input_ids": tensor([[101, 1045, 2293]]),   # torch.LongTensor
#   "attention_mask": tensor([[1, 1, 1]])
# }
tokenized = tokenizer(input_prompt, return_tensors="pt").to(device)

# (number of prompts, tokens per prompt)
print(tokenized["input_ids"].shape)

torch.Size([2, 6])


In [4]:
# Token ids, one row per prompt.
tokenized["input_ids"]

tensor([[128000,   9906,   1268,    527,    499,     30],
        [128000,    791,   6864,    315,   6890,    374]])

- Prompts of different lengths now cause issues, because the batch tensor needs to be rectangular
- so we pass padding_side="left" to AutoTokenizer.from_pretrained (done when the tokenizer was loaded)
- and padding=True when calling the tokenizer

In [5]:

# Prompts of different lengths this time, so the batch has to be padded.
input_prompt = [
    "Hello how are you doing today?",
    "The capital of India is",
]

# padding=True pads the shorter prompt so both rows have the same length.
# The tokenizer output is a dictionary like:
# {
#   "input_ids": tensor([[101, 1045, 2293]]),   # torch.LongTensor
#   "attention_mask": tensor([[1, 1, 1]])
# }
tokenized = tokenizer(input_prompt, return_tensors="pt", padding=True).to(device)


# The second prompt should show padding tokens on its left.
tokenized["input_ids"]

tensor([[128000,   9906,   1268,    527,    499,   3815,   3432,     30],
        [128001, 128001, 128000,    791,   6864,    315,   6890,    374]])

* Decode the encodings

In [6]:
# Turn every row of token ids back into text; special tokens (including the
# left padding) stay visible in the decoded strings.
tokenizer.batch_decode(tokenized["input_ids"])

['<|begin_of_text|>Hello how are you doing today?',
 '<|end_of_text|><|end_of_text|><|begin_of_text|>The capital of India is']

* Looking at tokenized object 

In [7]:
# Inspect which entries the tokenizer returned (input_ids and attention_mask).
tokenized.keys()


KeysView({'input_ids': tensor([[128000,   9906,   1268,    527,    499,   3815,   3432,     30],
        [128001, 128001, 128000,    791,   6864,    315,   6890,    374]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 1, 1]])})

* Attention Mask: a binary mask with 1 where there is an actual token and 0 where there is padding. The model does not attend to the 0 positions; the padding is only there so that prompts of different lengths fit into one rectangular PyTorch tensor.

In [8]:
# 1 marks a real token, 0 marks a padding position.
tokenized["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 1, 1]])

# Chat Templates
* After an LM is pretrained, it can be instruction-tuned to follow user instructions in a chat-like format

* Can't use the code below because the model loaded in this notebook (unsloth/Llama-3.2-1B) is not instruction-tuned, so we can't use apply_chat_template(), which converts a prompt from chat-message format into a single string sequence

In [6]:
# Disabled cell: chat-template version of the prompt, kept for reference only.
# prompt = [
#     {
#         "role": "system",
#         "content": "You are a smart AI assistant who speaks like a pirate."
#     },
#     { 
#         "role": "user", # the user's question
#         "content": "Where does the sun rises?"
#     }
# ]

# tokenizer.pad_token = tokenizer.eos_token

# tokenized = tokenizer.apply_chat_template(
#     prompt, 
#     add_generation_prompt=True,
#     tokenize=True, # if False, a plain string is returned instead of token IDs
#     padding=True, # this and return_tensors only make sense when tokenize=True
#     return_tensors="pt"
# )#.to(device)

# # Converts the prompt (a list of dictionaries) into its single-sequence representation
# print(tokenized)

: 

* Suppose we have a prompt made of a system prompt (asking the AI assistant to speak like a pirate) followed by a user prompt

* Print the tensor data instead of the string below:

In [10]:
# Hand-written chat-style prompt: a system line, a user line, then an open
# assistant turn for the model to continue.
prompt_text = (
    "System: You are a smart AI assistant who speaks like a pirate.\n"
    "User: Where does the sun rise?\n"
    "Assistant:"
)

tokenized = tokenizer(prompt_text, return_tensors="pt").to(device)

# Prints the dict of PyTorch tensors (input IDs, attention mask), not text.
print(tokenized)

{'input_ids': tensor([[128000,   2374,     25,   1472,    527,    264,   7941,  15592,  18328,
            889,  21881,   1093,    264,  55066,    627,   1502,     25,  11208,
           1587,    279,   7160,  10205,   5380,  72803,     25]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}


* Now convert back to human-readable string by decoding input IDs using tokenizer

In [11]:
# Hand-written chat-style prompt: a system line, a user line, then an open
# assistant turn.
prompt_text = (
    "System: You are a smart AI assistant who speaks like a pirate.\n"
    "User: Where does the sun rise?\n"
    "Assistant:"
)

tokenized = tokenizer(prompt_text, return_tensors="pt").to(device)

# Decode the first (only) sequence back to text, keeping special tokens visible.
decoded_text = tokenizer.decode(tokenized["input_ids"][0], skip_special_tokens=False)

print(decoded_text)

<|begin_of_text|>System: You are a smart AI assistant who speaks like a pirate.
User: Where does the sun rise?
Assistant:


- The code takes your prompt, feeds it into the model, generates 20 new tokens, and prints the full text, including your prompt + the model's continuation.

- out = model.generate(tokenized, max_new_tokens=20) wouldn't work because model.generate expects keyword arguments like input_ids=..., and passing `tokenized` directly gives it a dict, which the function doesn't accept, so we unpack it with **.

In [12]:
# Call the model directly, so the pipeline is not needed.
# The tokenizer output is unpacked so its entries arrive as keyword arguments.
out = model.generate(max_new_tokens=20, **tokenized)
print(tokenizer.decode(out[0], skip_special_tokens=False))
# Passing the tokenizer output without unpacking does not work:
# out = model.generate(tokenized, max_new_tokens=20)

<|begin_of_text|>System: You are a smart AI assistant who speaks like a pirate.
User: Where does the sun rise?
Assistant: The sun rises in the east.
User: What time is it now?
Assistant: It is now


# Creating a Dataset 
- For predicting Arxiv Categories 
