# 4. Pretraining on Unlabeled Data

## 4.1 Evaluating Generative Text Models

### 4.1.1 GPT-124M Configuration Setup and Model Import

In [2]:
from gpt_modules import GPTModel

# Hyperparameters passed to GPTModel for the "124M" GPT configuration.
GPT_CONFIG_124M = dict(
    vocab_size=50257,    # vocabulary size
    context_length=256,  # context length
    emb_dim=768,         # embedding dimension
    n_heads=12,          # number of attention heads
    n_layers=12,         # number of layers
    drop_rate=0.1,       # dropout rate
    qkv_bias=False,      # whether query/key/value projections use a bias
)

In [3]:
import torch

# Seed PyTorch's global RNG *before* constructing the model so that repeated
# runs of this cell start from the same random state.
# NOTE(review): presumably GPTModel initialises its weights randomly, which is
# why the seed must come first — confirm against gpt_modules.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Put the model in evaluation mode (assumes GPTModel is a torch.nn.Module —
# TODO confirm). The trailing semicolon suppresses the cell's repr output.
model.eval();

### 4.1.2 Text-to-Tokens, Generation, and Decoding Pipeline

In [4]:
import tiktoken
from gpt_modules import generate_text_simple

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into a batched tensor of token IDs.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token IDs (e.g. a tiktoken encoding).

    Returns:
        A ``torch.long`` tensor of shape ``(1, seq_len)``; ``seq_len`` is 0
        when the tokenizer returns no tokens.
    """
    # Let the GPT end-of-text marker through as a special token if it occurs.
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Pin the dtype: without it an empty encoding would be inferred as float32,
    # which is not a valid dtype for token IDs.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)  # add batch dimension
    return encoded_tensor


def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token IDs back into text.

    Args:
        token_ids: Tensor of token IDs, either with a leading batch axis of
            size 1 (shape ``(1, seq_len)``) or already un-batched (shape
            ``(seq_len,)``).
        tokenizer: Object exposing ``decode(list_of_ids)`` that returns a string.

    Returns:
        The string produced by ``tokenizer.decode``.
    """
    if token_ids.dim() > 1:
        # Drop the batch axis (only removed when its size is 1).
        flat = token_ids.squeeze(0)
    else:
        # Already un-batched. Do not squeeze: a single-token 1-D tensor would
        # collapse to 0-d and ``tolist()`` would return a bare int instead of a list.
        flat = token_ids.reshape(-1)
    return tokenizer.decode(flat.tolist())


# GPT-2 byte-pair encoding and the prompt the model should continue.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

# Encode the prompt, then ask for up to 10 additional tokens.
prompt_ids = text_to_token_ids(start_context, tokenizer)
token_ids = generate_text_simple(
    model=model,
    idx=prompt_ids,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)

# Turn the resulting token IDs back into a string for display.
generated_text = token_ids_to_text(token_ids, tokenizer)
print("Output text:\n", generated_text)

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


### 4.1.3 Calculating the Text Generation Loss: Cross-Entropy and Perplexity

Example Input and Target Token Batches for Next-Token Prediction

In [None]:
# Two three-token input sequences (one per row).
inputs = torch.tensor(
    [
        [16833, 3626, 6100],  # "every effort moves"
        [40, 1107, 588],      # "I really like"
    ]
)

# The matching targets: each row is its input shifted one token to the left,
# with the true next token appended.
targets = torch.tensor(
    [
        [3626, 6100, 345],    # "effort moves you"
        [1107, 588, 11311],   # "really like chocolate"
    ]
)

Computing Token Probabilities from Model Logits

In [None]:
# No gradients are needed here: we only inspect the model's predictions.
with torch.no_grad():
    logits = model(inputs)  # raw, unnormalised scores

# Normalise over the last axis so every position holds a probability
# distribution over the vocabulary.
probas = logits.softmax(dim=-1)
print(probas.shape)  # expected: (batch_size, seq_len, vocab_size)

torch.Size([2, 3, 50257])


Selecting Most Probable Token IDs (Greedy Prediction)

In [None]:
# Greedy decoding: take the index of the largest probability at every position.
# keepdim=True retains a trailing axis of size 1 in the result.
token_ids = probas.argmax(dim=-1, keepdim=True)

print("Token IDs:\n", token_ids)

Token IDs:
 tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


Comparing Target Tokens with Model Predictions (Decoded Text)

In [None]:
# First sequence of the batch: the text the model should have produced ...
target_text = token_ids_to_text(targets[0], tokenizer)
print(f"Targets batch 1: {target_text}")

# ... versus what it actually predicted. The argmax output kept a trailing
# size-1 axis, so flatten it to a plain 1-D sequence of IDs before decoding.
predicted_text = token_ids_to_text(token_ids[0].flatten(), tokenizer)
print(f"Outputs batch 1: {predicted_text}")

Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix


Extracting Model Probabilities for Target Tokens

In [11]:
# One index per token position, derived from the targets' sequence length
# instead of being hard-coded as [0, 1, 2], so the cell keeps working if the
# example sequences change length.
positions = list(range(targets.shape[1]))

text_idx = 0
# Probability the model assigned to the correct next token at each position
# of the first text in the batch.
target_probas_1 = probas[text_idx, positions, targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
# Same extraction for the second text in the batch.
target_probas_2 = probas[text_idx, positions, targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([7.4541e-05, 3.1061e-05, 1.1563e-05])
Text 2: tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])


Computing Log Probabilities of Target Tokens

In [12]:
# Join the target probabilities of both texts into one vector, then take the
# natural logarithm of each entry.
log_probas = torch.cat((target_probas_1, target_probas_2)).log()

print(log_probas)  # per-token log-probabilities, the building block of the loss

tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])


Computing Average Log Probability of Targets

In [13]:
# Mean log-probability over every target token.
avg_log_probas = log_probas.mean()

print(avg_log_probas)  # closer to 0 (less negative) means better predictions

tensor(-10.7940)


Computing Negative Average Log Probability (Loss Value)

In [14]:
# Flip the sign: the negative mean log-probability is the quantity to minimise.
neg_avg_log_probas = -avg_log_probas

print(neg_avg_log_probas)  # matches the cross-entropy value computed below

tensor(10.7940)


Inspecting Logits and Target Tensor Shapes

In [15]:
# Logits carry one score per vocabulary entry; targets hold one token ID per position.
print("Logits shape:", logits.size())    # (batch_size, seq_len, vocab_size)
print("Targets shape:", targets.size())  # (batch_size, seq_len)

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])


Flattening Logits and Targets for Loss Computation

In [16]:
# Collapse the batch and sequence axes so there is one row of logits
# (and one target ID) per token.
logits_flat = torch.flatten(logits, start_dim=0, end_dim=1)  # (batch_size * seq_len, vocab_size)
targets_flat = torch.flatten(targets)                        # (batch_size * seq_len,)

print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


Computing Cross-Entropy Loss for Next-Token Prediction

In [17]:
# cross_entropy takes the raw logits and the target IDs; the printed result
# equals the negative average log-probability computed step by step above.
loss = torch.nn.functional.cross_entropy(input=logits_flat, target=targets_flat)

print(loss)  # standard language-modelling loss

tensor(10.7940)
