In [3]:
# Step 1: Import necessary libraries
import numpy as np
import torch
import torch.nn.functional as F
from IPython.display import display, Markdown

# Helper function to format and display each step clearly
def print_step(title, description, value):
    """Show one walkthrough step: heading, description, value, then a divider.

    Args:
        title: Text rendered as a level-2 Markdown heading.
        description: Text rendered after a bold "Description:" label.
        value: Object written to stdout with ``print``.
    """
    heading = Markdown(f"## {title}")
    summary = Markdown(f"**Description:** {description}")
    for rendered in (heading, summary):
        display(rendered)
    print(value)
    # A 60-dash rule with a blank line on each side separates consecutive steps.
    divider = "-" * 60
    print(f"\n{divider}\n")

# Step 2 (displayed to the reader as "Step 1"): define sample logits and targets.
# One row per sample, one column per class.
logits = np.array([
    [2.0, 1.0, 0.1],  # Class scores for sample 1
    [0.5, 2.5, 0.3],  # Class scores for sample 2
])

# Index of the true class for each sample: sample 1 -> class 0, sample 2 -> class 1.
targets = np.array([0, 1])

print_step(
    title="Step 1: Define Logits and Targets",
    description=(
        "Logits are raw scores (before softmax) produced by the model for each class (or for LLMs token score). "
        "Targets are the ground-truth class labels (as indices of the token in the lookup table)."
    ),
    value=dict(Logits=logits, Targets=targets),
)

# Step 3: Convert logits to probabilities using Softmax
def softmax(x, axis=1):
    """Convert raw scores into probabilities along ``axis``.

    Args:
        x: Array of scores.
        axis: Axis along which the probabilities are normalized. Defaults to 1
            (one row per sample, one column per class); pass ``-1`` for a
            single 1-D score vector.

    Returns:
        Array with the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    # Subtracting the per-slice maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=axis, keepdims=True)

# Per-sample class probabilities obtained from the raw logits.
probabilities = softmax(logits)

print_step(
    title="Step 2: Apply Softmax to Convert Logits into Probabilities",
    description="Softmax normalizes logits into probabilities for each class, ensuring they sum to 1.",
    value=probabilities,
)

# Step 4 (displayed as "Step 3"): pick out the probability of the true class.
# Pairing row i with column targets[i] selects one entry per sample.
correct_class_probs = probabilities[np.arange(targets.shape[0]), targets]

print_step(
    title="Step 3: Extract Correct Class Probabilities",
    description="We select the probability assigned to the true class for each sample.",
    value=correct_class_probs,
)

# Step 5 (displayed as "Step 4"): per-sample negative log-likelihood.
# The smaller the probability given to the true class, the larger the value.
negative_log_likelihoods = np.negative(np.log(correct_class_probs))

print_step(
    title="Step 4: Compute Negative Log-Likelihoods",
    description=(
        "The loss function penalizes incorrect predictions by taking the negative log of the correct class probability. "
        "Lower probabilities result in higher penalties. "
        "What's critical to note is this effect is spread amongst incorrect probabilities"
    ),
    value=negative_log_likelihoods,
)

# Step 6 (displayed as "Step 5"): average the per-sample values into one loss.
cross_entropy_loss = negative_log_likelihoods.mean()

print_step(
    title="Step 5: Compute Final Cross-Entropy Loss",
    description=(
        "The final loss is computed as the mean of all negative log-likelihood values across all samples. "
        "A lower value indicates better model performance."
    ),
    value=cross_entropy_loss,
)

# ---------------------------------------
# Step 7 (displayed as "Step 6"): recompute the loss with PyTorch for comparison.
# The logits become a float32 tensor and the class indices an int64 tensor.
logits_torch = torch.tensor(logits, dtype=torch.float32)
targets_torch = torch.tensor(targets, dtype=torch.long)

# .item() unwraps the scalar loss tensor into a plain Python float.
torch_ce_loss = F.cross_entropy(input=logits_torch, target=targets_torch).item()

print_step(
    title="Step 6: Compute Cross-Entropy Loss using PyTorch",
    description="We verify our manual calculation by comparing it to PyTorch’s built-in cross_entropy function.",
    value=torch_ce_loss,
)


## Step 1: Define Logits and Targets

**Description:** Logits are raw scores (before softmax) produced by the model for each class (or for LLMs token score). Targets are the ground-truth class labels (as indices of the token in the lookup table).

{'Logits': array([[2. , 1. , 0.1],
       [0.5, 2.5, 0.3]]), 'Targets': array([0, 1])}

------------------------------------------------------------



## Step 2: Apply Softmax to Convert Logits into Probabilities

**Description:** Softmax normalizes logits into probabilities for each class, ensuring they sum to 1.

[[0.65900114 0.24243297 0.09856589]
 [0.10860373 0.80247906 0.08891721]]

------------------------------------------------------------



## Step 3: Extract Correct Class Probabilities

**Description:** We select the probability assigned to the true class for each sample.

[0.65900114 0.80247906]

------------------------------------------------------------



## Step 4: Compute Negative Log-Likelihoods

**Description:** The loss function penalizes incorrect predictions by taking the negative log of the correct class probability. Lower probabilities result in higher penalties. What's critical to note is this effect is spread amongst incorrect probabilities

[0.41703002 0.22004952]

------------------------------------------------------------



## Step 5: Compute Final Cross-Entropy Loss

**Description:** The final loss is computed as the mean of all negative log-likelihood values across all samples. A lower value indicates better model performance.

0.3185397696491857

------------------------------------------------------------



## Step 6: Compute Cross-Entropy Loss using PyTorch

**Description:** We verify our manual calculation by comparing it to PyTorch’s built-in cross_entropy function.

0.31853973865509033

------------------------------------------------------------

