In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TwoLayerMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TwoLayerMLP, self).__init__()
        
        # Define the layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)       # First linear layer
        self.bn1 = nn.BatchNorm1d(hidden_dim)             # First batch normalization
        self.fc2 = nn.Linear(hidden_dim, output_dim)      # Second linear layer
        self.bn2 = nn.BatchNorm1d(output_dim)             # Second batch normalization

    def forward(self, x):
        # Forward pass through the first layer
        x = self.fc1(x)
        x = self.bn1(x)
        x = torch.tanh(x)
        
        # Forward pass through the second layer
        x = self.fc2(x)
        x = self.bn2(x)
        return x  # Output is logits, for use with Cross-Entropy Loss

# Example usage:
input_dim = 10   # Dimension of input features
hidden_dim = 20  # Number of neurons in the hidden layer
output_dim = 5   # Number of output classes

model = TwoLayerMLP(input_dim, hidden_dim, output_dim)

# Generate a random input tensor with batch size of 3 and input dimension of 10
inputs = torch.randn(3, input_dim)

# Forward pass
logits = model(inputs)

# Define target labels (for demonstration, using random labels for a 5-class problem)
targets = torch.randint(0, output_dim, (3,))

# Compute the cross-entropy loss
criterion = nn.CrossEntropyLoss()
loss = criterion(logits, targets)

print("Logits:", logits)
print("Loss:", loss.item())



Logits: tensor([[-1.4070,  0.2286, -1.0106, -1.3578,  1.3568],
        [ 0.5806, -1.3229,  1.3620,  1.0202, -1.0237],
        [ 0.8265,  1.0943, -0.3514,  0.3377, -0.3332]],
       grad_fn=<NativeBatchNormBackward0>)
Loss: 2.8278591632843018


In [5]:
import torch
import numpy as np

class ManualTwoLayerMLP:
    def __init__(self, input_dim, hidden_dim, output_dim):
        # Initialize weights and biases
        self.fc1_weight = np.random.randn(hidden_dim, input_dim) * 0.01
        self.fc1_bias = np.zeros(hidden_dim)
        
        self.fc2_weight = np.random.randn(output_dim, hidden_dim) * 0.01
        self.fc2_bias = np.zeros(output_dim)
        
        # Batch normalization parameters (initialized to ones and zeros for gamma and beta)
        self.bn1_gamma = np.ones(hidden_dim)
        self.bn1_beta = np.zeros(hidden_dim)
        
        self.bn2_gamma = np.ones(output_dim)
        self.bn2_beta = np.zeros(output_dim)
        
    def forward(self, x):
        self.x = x
        
        # Forward pass: Layer 1 (Linear, BatchNorm, Tanh)
        self.fc1_out = np.dot(x, self.fc1_weight.T) + self.fc1_bias
        self.bn1_out, self.bn1_cache = self.batch_norm_forward(self.fc1_out, self.bn1_gamma, self.bn1_beta)
        self.tanh_out = np.tanh(self.bn1_out)
        
        # Forward pass: Layer 2 (Linear, BatchNorm)
        self.fc2_out = np.dot(self.tanh_out, self.fc2_weight.T) + self.fc2_bias
        self.bn2_out, self.bn2_cache = self.batch_norm_forward(self.fc2_out, self.bn2_gamma, self.bn2_beta)
        
        # Logits (no softmax as Cross-Entropy expects logits)
        self.logits = self.bn2_out
        return self.logits
    
    def batch_norm_forward(self, x, gamma, beta, eps=1e-5):
        mu = np.mean(x, axis=0)
        var = np.var(x, axis=0)
        x_hat = (x - mu) / np.sqrt(var + eps)
        out = gamma * x_hat + beta
        cache = (x, x_hat, mu, var, gamma, beta, eps)
        return out, cache
    
    def tanh_backward(self, dout, x):
        return dout * (1 - np.tanh(x) ** 2)
    
    def batch_norm_backward(self, dout, cache):
        x, x_hat, mu, var, gamma, beta, eps = cache
        N, D = dout.shape
        
        # Gradients w.r.t. parameters
        dbeta = np.sum(dout, axis=0)
        dgamma = np.sum(dout * x_hat, axis=0)
        
        # Gradients w.r.t. input x
        dx_hat = dout * gamma
        dvar = np.sum(dx_hat * (x - mu) * -0.5 * (var + eps) ** -1.5, axis=0)
        dmu = np.sum(dx_hat * -1 / np.sqrt(var + eps), axis=0) + dvar * np.mean(-2 * (x - mu), axis=0)
        
        dx = dx_hat / np.sqrt(var + eps) + dvar * 2 * (x - mu) / N + dmu / N
        return dx, dgamma, dbeta
    
    def cross_entropy_backward(self, logits, targets):
        m = targets.shape[0]
        grad = logits
        grad[np.arange(m), targets] -= 1
        grad = grad / m
        return grad
    
    def backward(self, logits, targets):
        # 1. Cross-entropy loss gradient w.r.t. logits
        dloss = self.cross_entropy_backward(logits, targets)
        
        # 2. Backprop through batch norm 2 (output layer)
        dbn2_out, self.d_bn2_gamma, self.d_bn2_beta = self.batch_norm_backward(dloss, self.bn2_cache)
        
        # 3. Backprop through second linear layer
        self.d_fc2_weight = np.dot(dbn2_out.T, self.tanh_out)
        self.d_fc2_bias = np.sum(dbn2_out, axis=0)
        d_tanh_out = np.dot(dbn2_out, self.fc2_weight)
        
        # 4. Backprop through tanh
        d_bn1_out = self.tanh_backward(d_tanh_out, self.bn1_out)
        
        # 5. Backprop through batch norm 1
        dbn1_out, self.d_bn1_gamma, self.d_bn1_beta = self.batch_norm_backward(d_bn1_out, self.bn1_cache)
        
        # 6. Backprop through first linear layer
        self.d_fc1_weight = np.dot(dbn1_out.T, self.x)
        self.d_fc1_bias = np.sum(dbn1_out, axis=0)
    
    def update_params(self, lr=0.01):
        # Update all parameters with gradient descent
        self.fc1_weight -= lr * self.d_fc1_weight
        self.fc1_bias -= lr * self.d_fc1_bias
        self.fc2_weight -= lr * self.d_fc2_weight
        self.fc2_bias -= lr * self.d_fc2_bias
        
        self.bn1_gamma -= lr * self.d_bn1_gamma
        self.bn1_beta -= lr * self.d_bn1_beta
        self.bn2_gamma -= lr * self.d_bn2_gamma
        self.bn2_beta -= lr * self.d_bn2_beta

# Example usage
input_dim = 10
hidden_dim = 20
output_dim = 5
x = np.random.randn(3, input_dim)
targets = np.array([0, 1, 2])

# Initialize the model
model = ManualTwoLayerMLP(input_dim, hidden_dim, output_dim)

# Forward pass
logits = model.forward(x)

# Backward pass
model.backward(logits, targets)

# Update parameters
model.update_params()


In [6]:
# Hyperparameters
learning_rate = 0.01
num_epochs = 10

# Sample input and target
input_dim = 10
hidden_dim = 20
output_dim = 5
x = np.random.randn(3, input_dim)  # Batch of 3 samples with input_dim features
targets = np.array([0, 1, 2])      # Example target classes for 3 samples

# Initialize the model
model = ManualTwoLayerMLP(input_dim, hidden_dim, output_dim)

# Training loop
for epoch in range(num_epochs):
    # Forward pass
    logits = model.forward(x)
    
    # Calculate loss (for monitoring purposes)
    # Compute softmax to get probabilities
    probs = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
    correct_log_probs = -np.log(probs[np.arange(len(targets)), targets])
    loss = np.sum(correct_log_probs) / len(targets)
    
    # Backward pass
    model.backward(logits, targets)
    
    # Update parameters
    model.update_params(lr=learning_rate)
    
    # Print the loss for the current epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}")


Epoch 1/10, Loss: 2.3813
Epoch 2/10, Loss: 1.8282
Epoch 3/10, Loss: 1.0752
Epoch 4/10, Loss: 0.9595
Epoch 5/10, Loss: 0.9201
Epoch 6/10, Loss: 0.9100
Epoch 7/10, Loss: 0.8615
Epoch 8/10, Loss: 0.8080
Epoch 9/10, Loss: 0.7360
Epoch 10/10, Loss: 0.7141
