# 🔍 Step-by-Step GPT Inference — Tokenization to Logits


In [None]:
# 📦 Import necessary modules (transformers and torch must already be installed)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# ⚙️ Set model ID (Gemma or another causal LM)
model_id = 'google/gemma-1.1-2b-it'

# Choose the device at runtime instead of hard-coding 'cuda', so the notebook
# also runs on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Half precision halves the memory footprint on GPU; on CPU stay in float32.
dtype = torch.float16 if device == 'cuda' else torch.float32

# 🧠 Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype)
try:
    model = model.to(device)
except torch.cuda.OutOfMemoryError:
    # The GPU is too small for the weights: release what was partially moved
    # and continue on CPU rather than aborting the whole notebook.
    torch.cuda.empty_cache()
    device = 'cpu'
    model = model.to(device=device, dtype=torch.float32)
    print("CUDA out of memory while loading the model; falling back to CPU (float32).")


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1000.00 MiB. GPU 0 has a total capacity of 5.67 GiB of which 847.94 MiB is free. Including non-PyTorch memory, this process has 4.82 GiB memory in use. Of the allocated memory 4.67 GiB is allocated by PyTorch, and 73.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 

# What the model looks like

In [None]:
# Put the model in evaluation mode (disables training-only behaviour such as
# dropout). eval() returns the module itself, so as the last expression of the
# cell it is also displayed, showing the model's layer structure.
model.eval()

# Adding Inputs

In [None]:
# ✏️ Input prompt — the raw text that will be tokenized and fed to the model
prompt = "A dog running through the snow"

# Tokenize the inputs using Gemma's tokenizer

In [None]:
# Tokenize the prompt into PyTorch tensors and move them to the device the
# model actually lives on (instead of assuming 'cuda'), so the forward pass
# never sees tensors on mismatched devices.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
print("input_ids:", inputs['input_ids'])
print("input shape:", inputs['input_ids'].shape)


# 🔁 Forward pass


In [None]:
# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    # The tokenizer output is unpacked into keyword arguments of the model call.
    outputs = model(**inputs)



# 📊 Logits shape and inspection


In [None]:
# Raw prediction scores produced by the forward pass (not yet normalised
# into probabilities).
logits = outputs.logits


In [None]:
# Shape of the full logits tensor.
print("logits shape:", logits.shape)
# Index -1 on the second axis selects the scores at the last input position,
# which are the ones used to predict the next token.
print("last token logits shape:", logits[:, -1, :].shape)



# 🔍 View top 5 predicted tokens from last position


In [None]:
# Scores at the final position, then the five highest-scoring token IDs.
last_logits = logits[:, -1, :]
topk = torch.topk(last_logits, k=5, dim=-1)

# Pull the first batch row out as plain Python lists once, then walk the
# (token id, score) pairs in rank order.
ranked = zip(topk.indices[0].tolist(), topk.values[0].tolist())
for rank, (token_id, score) in enumerate(ranked, start=1):
    print(f"Rank {rank}: Token '{tokenizer.decode([token_id])}' (ID: {token_id}) — Logit: {score:.2f}")


# ✨ Manual greedy autoregressive decoding, one token per step


In [None]:
# Start the running sequence from the prompt's token IDs.
generated = inputs['input_ids']
# Upper bound on the number of tokens the decoding loop may append.
max_new_tokens = 20


In [None]:

# Greedy decoding: at every step pick the single highest-scoring next token
# and append it to the sequence.
#
# Restart from the prompt so that re-running this cell always produces the
# same result instead of appending to the output of a previous run.
generated = inputs['input_ids']

with torch.no_grad():
    for step in range(max_new_tokens):
        # The whole sequence so far is fed back in on every step (no KV cache),
        # which keeps the example simple at the cost of repeated computation.
        output = model(input_ids=generated)
        next_token_logits = output.logits[:, -1, :]
        # keepdim=True keeps a trailing axis so the result can be concatenated
        # onto `generated` along the last dimension.
        next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
        # Read the value back once per step; it is used for both the log line
        # and the EOS check.
        next_token_int = next_token_id.item()

        print(f"Step {step+1} — Next token: {tokenizer.decode(next_token_id[0])} (ID: {next_token_int})")
        generated = torch.cat([generated, next_token_id], dim=-1)

        if next_token_int == tokenizer.eos_token_id:
            print("<EOS> token reached. Stopping generation.")
            break

In [None]:


# Turn the accumulated token IDs (first row of the batch) back into text,
# leaving special tokens out of the result.
decoded_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print("\n📝 Final Decoded Text:\n", decoded_text)

