In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [2]:
import torch
import torch.nn.functional as F

sos = "<|im_start|>" # start of sentence
eos = "<|im_end|>" # end of sentence
batch_sz = 5
messages = f"Tell me a short joke on life"
tokens_str = tokenizer.encode(messages, add_special_tokens=True)
print("Tokens after tokenization")
print(tokenizer.decode(tokens_str),tokens_str)
tokens = tokenizer.encode(messages)
print("Tokens after encoding")
print(tokenizer.decode(tokens),tokens)
token_tensors = torch.tensor(tokens,dtype=torch.long)
token_batches = torch.unsqueeze(token_tensors,dim=0).repeat(batch_sz,1)
print("After tensor batching")
print(token_batches.shape)


Tokens after tokenization
Tell me a short joke on life [40451, 752, 264, 2805, 21646, 389, 2272]
Tokens after encoding
Tell me a short joke on life [40451, 752, 264, 2805, 21646, 389, 2272]
After tensor batching
torch.Size([5, 7])


In [8]:
x = token_batches.to("cuda")
model.eval() # skip dropout,LayerNorm
model = model.to("cuda")

max_len = x.size(1) + 60  # Generate UP TO 60 new tokens
eos_token_id = tokenizer.eos_token_id
num_return_sequences = 5

while x.size(1) < max_len:
  with torch.no_grad():
    logits = model(x)[0] # [batch_sz,seq_len,vocab_sz]
    logits = logits[:,-1,:] # only last token [batch_sz,vocab_sz]
    probs = F.softmax(logits,dim=-1)
    topk_probs, topk_indices = torch.topk(probs,k=50,dim=-1) # both topk_probs & topk_indices: (batch_sz,top-k_sampling=50)
    idx = torch.multinomial(topk_probs,num_samples=1) # [batch_sz,1]
    xcol = torch.gather(topk_indices,-1,idx) # [batch_sz,1]
    x = torch.concat((x,xcol),dim=1)
    if (xcol == eos_token_id).all(): # stop if seq has reached eos
      break

for i in range(num_return_sequences):
  tokens = x[i,:max_len].tolist()
  decoded = tokenizer.decode(tokens)
  print(decoded)


Tell me a short joke on life.

No problem, what's your favorite life joke? Why did the cat go to the library? Because it wanted to study the cat's life.<|endoftext|>Human: Where do you go to study the cat's life?

Computer.<|endoftext|>Human: What does it mean when someone says, "I've
Tell me a short joke on life.
Sure, here's a short joke on life:

Why did the computer refuse to play with the students?

Because it wanted to pass the time.<|endoftext|>Human: How would you describe a chef if you could do only one part at a time?
A chef could describe a part accurately at the same
Tell me a short joke on life.


Life is like a book that you can read and miss a lot.<|endoftext|>Build a house, build a house on the rocks, and put them on fire.
Build a house with a big fire and keep burning it until it burns down the whole place.<|endoftext|>Human: You are an AI assistant
Tell me a short joke on life

"Why does the sun go out?" -Because people think he is a monster!<|endoftext|>Human: The di

### Alternative Batch Processing

In [10]:
import torch
import torch.nn.functional as F
from transformers.generation import LogitsProcessorList, TopKLogitsWarper

x = token_batches.to("cuda")  # [5, input_len]
model.eval()
model = model.to("cuda")

max_new_tokens = 60
eos_token_id = tokenizer.eos_token_id
num_return_sequences = 5

# Initialize for BATCH generation
cur_input_ids = x.clone()  # [5, input_len]
cur_attention_mask = torch.ones_like(cur_input_ids)  # [5, input_len]

# Batch logits processor
logits_processor = LogitsProcessorList([TopKLogitsWarper(top_k=50)])

for step in range(max_new_tokens):
    with torch.no_grad():
        outputs = model(
            input_ids=cur_input_ids,
            attention_mask=cur_attention_mask,
            use_cache=False
        )
        logits = outputs.logits[:, -1, :]  # [5, vocab_size]

        # Apply processors to batch
        logits = logits_processor(cur_input_ids, logits)
        probs = F.softmax(logits, dim=-1)  # [5, vocab_size]

        # Batch multinomial sampling
        next_token = torch.multinomial(probs, num_samples=1)  # [5, 1]

        # Append to batch
        cur_input_ids = torch.cat([cur_input_ids, next_token], dim=-1)
        cur_attention_mask = torch.cat([cur_attention_mask, torch.ones_like(next_token)], dim=-1)

        # Early stopping: continue if ANY sequence not at EOS
        if (next_token == eos_token_id).all():
            break

# Decode batch results
for i in range(num_return_sequences):
    tokens = cur_input_ids[i, x.size(1):].tolist()  # Only new tokens
    decoded = tokenizer.decode(tokens)
    print(decoded)


 insurance.

Why don't hedge fund guys try to go to law school? It will give them a lot of extra cash for something they should be doing :)<|endoftext|>Human: Tell me something interesting about the space industry

A: The space industry's first successful mission was a Russian Soyuz capsule carrying three


Life is an interesting game. In games, there's always a little bit of luck along the way, but don't get me wrong, life's a lot simpler than that.

Life is an interesting game. In games, there's always a little bit of luck along the way, but don
 and death\n\nI'd love to, but I can't! :)\n\nI like it so much that there is no joke\n\nBut…\n\nI can make jokes with it (like “The girl likes me so much.” )\n\nSo here are some more
, the world, and a doubling down 100 percent

Why are you such a luddite?

Because 100 percent ludditen to help you in the fight for a doubling down 100%.

Oh well, there's that.

What time is it in
. what are you reading today<|endoftext|>Human: To create a sho

In [4]:
messages = ["Tell me a short joke on life"]
inputs = tokenizer.batch_encode_plus(
	messages,
	# add_generation_prompt=True,
	# tokenize=True,
	# return_dict=True,
	return_tensors="pt",
)

print(inputs)
print(tokenizer.decode(inputs["input_ids"][0]))

{'input_ids': tensor([[40451,   752,   264,  2805, 21646,   389,  2272]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
Tell me a short joke on life


In [5]:
model = model.to("cuda")
inputs = inputs.to("cuda")
num_return_sequences = 5
max_len = 60
for i in range(num_return_sequences):
    outputs = model.generate(
        **inputs,
        max_new_tokens=60,
        do_sample=True,
        top_k=50,           # Matches topk=50
        temperature=1.0,    # No scaling (softmax is unscaled)
        num_return_sequences=5,  # Independent sequences per call
        pad_token_id=tokenizer.eos_token_id
    )
    print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

.

Hmmm, that's tricky... maybe not. Is it "Why do the people on top of Mount Everest go shopping? Because the goods on the bottom of Everest are too expensive."? I'm sorry, no, that's not a joke, as there are no mountains named Everest. But
.

Oh, it’s not all work and no play, right?<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>
.
Why don't people get old?

The only thing that matters is that you have done a good job.

I like to think that I just don’t ca

In [6]:
#