In [None]:
import numpy as np

In [99]:
# Step 1: Define the Dataset
data = [([1, 2], 0), ([3, 4], 1)]  # Data points with their respective classes
num_features = 2  # Number of features in each data point
num_classes = 2   # Number of classes

In [100]:
# Initialize theta with small random values
np.random.seed(0)
theta = np.random.randn(num_classes, num_features) * 0.01
print("Initial theta:\n", theta)

Initial theta:
 [[0.01764052 0.00400157]
 [0.00978738 0.02240893]]


In [101]:
# Define the log_sum_exp function
def log_sum_exp(X, theta):
    """
    Compute log(sum(exp(X.dot(theta))))
    X is a single data point (1D array), theta is the weight matrix.
    """
    exp_term = np.exp(X.dot(theta))  # Compute exp(X.dot(theta))
    sum_exp = np.sum(exp_term)  # Compute the sum of exponentials
    log_sum = np.log(sum_exp)  # Compute the log of the sum
    print(f"X={X}, theta=\n{theta}")
    print(f"exp_term:\n{exp_term}")
    print(f"sum_exp: {sum_exp}")
    print(f"log_sum: {log_sum}")
    return log_sum

In [102]:
# Loss function using the expression from the figure
def loss_function(data, theta):
    """
    Calculate the loss using the given dataset and the theta values.
    """
    loss = 0
    for X, c in data:
        X = np.array(X)  # Convert to numpy array
        theta_c = theta[c]  # Get the weights for the correct class
        loss_contrib = X.dot(theta_c) - log_sum_exp(X, theta)
        print(f"X={X}, class={c}")
        print(f"theta_c:\n{theta_c}")
        print(f"loss_contrib: {loss_contrib}")
        loss += loss_contrib
    print(f"Total loss: {loss}")
    return loss  # Return the negative log-likelihood as a scalar

In [103]:
# Gradient calculation using the expression from the figure
def gradient(data, theta):
    """
    Calculate the gradient of the loss function with respect to theta.
    """
    grad = np.zeros_like(theta)  # Initialize gradient with zeros
    for X, c in data:
        X = np.array(X)  # Convert to numpy array
        exp_theta = np.exp(X.dot(theta))  # Compute exp(X.dot(theta)) for all classes
        sum_exp_theta = np.sum(exp_theta)  # Compute the sum of exponentials for normalization
        prob = exp_theta / sum_exp_theta  # p(y | X, theta)
        print(f"X={X}")
        print(f"exp_theta:\n{exp_theta}")
        print(f"sum_exp_theta: {sum_exp_theta}")
        print(f"prob:\n{prob}")
        
        for a in range(num_classes):
            indicator = int(a == c)  # Indicator function I(a = c)
            grad_contrib = X * (indicator - prob[a])  # Gradient update for class 'a'
            print(f"Class {a}")
            print(f"indicator: {indicator}")
            print(f"grad_contrib:\n{grad_contrib}")
            grad[a] += grad_contrib
            print(f"grad[{a}] after update:\n{grad[a]}")
    print(f"Gradient:\n{grad}")
    return grad  # Return negative gradient

In [104]:
# Implement Gradient Descent
def gradient_descent(data, theta, learning_rate=0.1, iterations=100):
    """
    Perform gradient descent to optimize theta.
    """
    for i in range(iterations):
        grad = gradient(data, theta)  # Compute gradient
        theta -= learning_rate * grad  # Update theta
        loss = loss_function(data, theta)  # Compute loss
        print(f"Iteration {i}")
        print(f"Loss: {loss:.4f}")
        print(f"Gradient:\n{grad}")
        print(f"Updated theta:\n{theta}")
    return theta

In [105]:
# Run Gradient Descent
theta_final = gradient_descent(data, theta, learning_rate=0.1, iterations=100)
print("Final Parameters:\n", theta_final)

X=[1 2]
exp_theta:
[1.03791644 1.05003074]
sum_exp_theta: 2.087947178643546
prob:
[0.49709899 0.50290101]
Class 0
indicator: 1
grad_contrib:
[0.50290101 1.00580201]
grad[0] after update:
[0.50290101 1.00580201]
Class 1
indicator: 0
grad_contrib:
[-0.50290101 -1.00580201]
grad[1] after update:
[-0.50290101 -1.00580201]
X=[3 4]
exp_theta:
[1.09644277 1.10698538]
sum_exp_theta: 2.203428142316842
prob:
[0.49760768 0.50239232]
Class 0
indicator: 0
grad_contrib:
[-1.49282304 -1.99043072]
grad[0] after update:
[-0.98992203 -0.98462871]
Class 1
indicator: 1
grad_contrib:
[1.49282304 1.99043072]
grad[1] after update:
[0.98992203 0.98462871]
Gradient:
[[-0.98992203 -0.98462871]
 [ 0.98992203  0.98462871]]
X=[1 2], theta=
[[ 0.11663273  0.10246444]
 [-0.08920482 -0.07605394]]
exp_term:
[0.94009258 0.95156866]
sum_exp: 1.8916612395114658
log_sum: 0.6374554057145182
X=[1 2], class=0
theta_c:
[0.11663273 0.10246444]
loss_contrib: -0.31589379332603645
X=[3 4], theta=
[[ 0.11663273  0.10246444]
 [-0.0