# 🔍 Step-by-Step GPT Inference — Tokenization to Logits


In [1]:
# 📦 Import necessary modules
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# ⚙️ Checkpoint to load (Gemma here, or another causal LM)
model_id = 'google/gemma-1.1-2b-it'

# 🧠 Tokenizer first, then the model weights in half precision
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
# Move the loaded model onto the GPU
model = model.to('cuda')


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.61s/it]


# What the model looks like

In [2]:
# Put the model in evaluation mode. eval() is the last expression of the cell,
# so the notebook also displays the value it returns: the module printout below.
model.eval()

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): GemmaRMSNorm((2048,),

# Adding Inputs

In [3]:
# ✏️ Input prompt — the text the model is asked to continue
prompt = "A dog running through the snow"

# Tokenize the inputs using Gemma's tokenizer

In [4]:
# Tokenize the prompt into PyTorch tensors and place them on whatever device the
# model lives on, instead of repeating a hard-coded device name here.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# One row of token IDs for the single prompt: shape (batch, sequence length).
print("input_ids:", inputs['input_ids'])
print("input shape:", inputs['input_ids'].shape)


input_ids: tensor([[     2, 235280,   5929,   5327,   1593,    573,   8529]],
       device='cuda:0')
input shape: torch.Size([1, 7])


# 🔁 Forward pass


## The model call runs the whole stack of decoder layers (self-attention + MLP), with causal attention masking applied inside

`torch.no_grad()`:
- Turns off gradient tracking (no `.grad`, no `.backward()`)
- Reduces memory usage — no need to store intermediate tensors for backprop
- Speeds up inference

`**inputs` unpacks the `inputs` dictionary as keyword arguments to the model.

In [5]:
# Inference only: no_grad() disables gradient tracking for this forward pass.
with torch.no_grad():
    # **inputs passes the tokenizer output's entries as keyword arguments.
    outputs = model(**inputs)



# 📊 Logits shape and inspection


In [6]:
# Raw, unnormalized scores from the forward pass: one vocabulary-sized vector
# per input position.
logits = outputs.logits


## 🔢 What are logits?
### 🧠 Definition:
Logits are the raw, unnormalized scores output by a neural network before applying softmax.

In NLP tasks, logits represent how likely the model thinks each word (token) in the vocabulary should come next — before converting those scores into probabilities.

In [None]:
# Full tensor: (batch, sequence position, vocabulary entry).
print("logits shape:", logits.shape)
# Selecting index -1 on the position axis keeps only the scores produced at the
# final input position, leaving (batch, vocabulary entry).
print("last token logits shape:", logits[:, -1, :].shape)


logits shape: torch.Size([1, 7, 256000])
last token logits shape: torch.Size([1, 256000])


In [13]:
# Logits produced at the second-to-last input position. They score what should
# follow "... through the", i.e. the model's guesses for the slot that "snow"
# occupies in the prompt.
prev_logits = logits[:, -2, :]
topk = torch.topk(prev_logits, k=5, dim=-1)
# Use a real name for the loop variable: `_` is IPython's "last output" variable
# and should not be overwritten inside a notebook.
for candidate_id in topk.indices[0]:
    print(tokenizer.decode(candidate_id))



 forest
 park
 woods
 fields
 field



# 🔍 View top 5 predicted tokens from last position


In [None]:
# Number of candidates to list. Defined once so the topk call and the loop
# below cannot drift apart.
top_k = 5
# Scores produced at the final input position, i.e. for the next token.
last_logits = logits[:, -1, :]
# topk.values holds the k largest logits, topk.indices their vocabulary IDs.
topk = torch.topk(last_logits, k=top_k, dim=-1)
# Row 0 is the only sequence in the batch; walk its IDs and scores together.
for rank, (id_tensor, score_tensor) in enumerate(zip(topk.indices[0], topk.values[0]), start=1):
    token_id = id_tensor.item()
    score = score_tensor.item()
    print(f"Rank {rank}: Token '{tokenizer.decode([token_id])}' (ID: {token_id}) — Logit: {score:.2f}")


Rank 1: Token ',' (ID: 235269) — Logit: 5.83
Rank 2: Token ' is' (ID: 603) — Logit: 5.22
Rank 3: Token '.' (ID: 235265) — Logit: 4.92
Rank 4: Token ' with' (ID: 675) — Logit: 4.72
Rank 5: Token ' creates' (ID: 18460) — Logit: 4.67


# ✨ Manual autoregressive decoding with shape tracking


In [9]:
# Start from the prompt's token IDs; the decoding loop appends to this tensor.
generated = inputs['input_ids']
# Upper bound on the number of new tokens; the loop stops earlier if EOS is produced.
max_new_tokens = 2000


In [10]:

# Restart from the prompt so that re-running this cell does not keep extending
# the sequence left over from a previous run.
generated = inputs['input_ids']

for step in range(max_new_tokens):
    with torch.no_grad():
        # The whole sequence generated so far is fed through the model again.
        output = model(input_ids=generated)
        # Scores for the token that follows the last position: (batch, vocab).
        next_token_logits = output.logits[:, -1, :]
        # Greedy choice: the highest-scoring token. dim=-1 searches along the
        # vocabulary axis; keepdim=True keeps the result 2D, shape (batch, 1),
        # so it can be concatenated onto `generated`.
        next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)

    # .item() requires exactly one element, so this loop handles a single sequence.
    # Read the value once and reuse it for both the printout and the EOS check.
    new_token = next_token_id.item()
    print(f"Step {step+1} — Next token: {tokenizer.decode(next_token_id[0])} (ID: {new_token})")
    # Append the new token to the sequence along the position axis.
    generated = torch.cat([generated, next_token_id], dim=-1)

    if new_token == tokenizer.eos_token_id:
        print("<EOS> token reached. Stopping generation.")
        break

    

Step 1 — Next token: , (ID: 235269)
Step 2 — Next token:  its (ID: 1277)
Step 3 — Next token:  paws (ID: 92381)
Step 4 — Next token:  leaving (ID: 10317)
Step 5 — Next token:  tracks (ID: 18631)
Step 6 — Next token:  in (ID: 575)
Step 7 — Next token:  the (ID: 573)
Step 8 — Next token:  pristine (ID: 97459)
Step 9 — Next token:  white (ID: 2674)
Step 10 — Next token:  landscape (ID: 15487)
Step 11 — Next token: . (ID: 235265)
Step 12 — Next token: 

 (ID: 109)
Step 13 — Next token: ** (ID: 688)
Step 14 — Next token: Describe (ID: 50721)
Step 15 — Next token:  the (ID: 573)
Step 16 — Next token:  image (ID: 2416)
Step 17 — Next token:  in (ID: 575)
Step 18 — Next token:  more (ID: 978)
Step 19 — Next token:  detail (ID: 8637)
Step 20 — Next token: .** (ID: 116742)
Step 21 — Next token: 

 (ID: 109)
Step 22 — Next token: ** (ID: 688)
Step 23 — Next token: Visual (ID: 19268)
Step 24 — Next token:  Elements (ID: 34762)
Step 25 — Next token: :** (ID: 66058)
Step 26 — Next token: 

 (ID: 109

In [11]:


# Convert the first (only) sequence — prompt plus generated tokens — back into
# text, dropping special tokens.
decoded_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print("\n📝 Final Decoded Text:\n", decoded_text)




📝 Final Decoded Text:
 A dog running through the snow, its paws leaving tracks in the pristine white landscape.

**Describe the image in more detail.**

**Visual Elements:**

* **Snow-covered landscape:** The image captures a winter scene with pristine white snow covering the ground. The snow is clean and pure, creating a stark and beautiful backdrop.
* **Dog running through the snow:** The dog is depicted running through the snow, its paws leaving tracks in the pristine white landscape. The movement of the dog is energetic and purposeful, suggesting a sense of adventure and freedom.
* **Paw prints:** The dog's paws leaving tracks in the snow create a visual connection between the animal and the environment. They are small and distinct, yet they are enough to convey the movement and energy of the dog.

**Emotional Elements:**

* **Joy and freedom:** The image conveys a sense of joy and freedom in the winter landscape. The dog's running through the snow is a joyous activity, and the pr