In [100]:
import numpy as np
from torchvision import datasets, transforms

In [101]:

# Load and preprocess the data
transform = transforms.Compose([transforms.ToTensor()])
mnist_train = datasets.MNIST(root='/mnt/d/CUDA/cuda-learn/mnist-cuda/data', train=True, download=True, transform=transform)
mnist_test = datasets.MNIST(root='/mnt/d/CUDA/cuda-learn/mnist-cuda/data', train=False, download=True, transform=transform)

X_train = mnist_train.data.numpy().reshape(-1, 1, 28, 28) / 255.0
y_train = mnist_train.targets.numpy()
X_test = mnist_test.data.numpy().reshape(-1, 1, 28, 28) / 255.0
y_test = mnist_test.targets.numpy()

In [107]:

# Activation functions
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(float)

# Linear layer
def initialize_weights(input_size, output_size):
    return np.random.randn(output_size, input_size) * np.sqrt(2.0 / input_size)

def linear_forward(weights, x):
    return weights @ x # (output_size, input_size) @ (input_size, batch_size) = (output_size, batch_size

def linear_backward(grad_output, x, weights):
    # print shapes
    print('grad_output:', grad_output.shape)
    print('x:', x.shape)
    print('weights:', weights.shape)
    # grad_output: (10, 8)
    # x: (256, 8)
    # weights: (10, 256)
    grad_weights = grad_output @ x.T # (10, 8) @ (8, 256) = (10, 256)
    grad_input = weights.T @ grad_output # (256, 10) @ (10, 8) = (256, 8)
    return grad_input, grad_weights

# Softmax and Cross-Entropy Loss
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=0, keepdims=True))
    return exp_x / np.sum(exp_x, axis=0, keepdims=True)

def cross_entropy_loss(y_pred, y_true):
    batch_size = y_pred.shape[1]
    probabilities = softmax(y_pred)
    loss = -np.sum(np.log(probabilities[y_true, range(batch_size)])) / batch_size
    return loss


In [108]:

class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        self.weights1 = initialize_weights(input_size, hidden_size)
        self.weights2 = initialize_weights(hidden_size, output_size)

    def forward(self, x):
        batch_size = x.shape[0]
        fc1_input = x.reshape(batch_size, -1).T # * 784 x batch_size
        fc1_output = linear_forward(self.weights1, fc1_input) # * 256 x 784 @ 784 x batch_size = 256 x batch_size 
        relu_output = relu(fc1_output) # * 256 x batch_size
        fc2_output = linear_forward(self.weights2, relu_output) # * 10 x 256 @ 256 x batch_size = 10 x batch_size
        return fc2_output, (fc1_input, fc1_output, relu_output) 

    def backward(self, grad_output, cache):
        fc1_input, fc1_output, relu_output = cache

        grad_fc2, grad_weights2 = linear_backward(grad_output, relu_output, self.weights2)
        grad_relu = grad_fc2 * relu_derivative(fc1_output)
        grad_fc1, grad_weights1 = linear_backward(grad_relu, fc1_input, self.weights1)
        return grad_weights1, grad_weights2

    def update_weights(self, grad_weights1, grad_weights2, learning_rate):
        self.weights1 -= learning_rate * grad_weights1
        self.weights2 -= learning_rate * grad_weights2

In [109]:

def train(model, X_train, y_train, X_test, y_test, batch_size, epochs, learning_rate):
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        for i in range(0, len(X_train), batch_size):
            batch_X = X_train[i:i+batch_size]
            batch_y = y_train[i:i+batch_size]

            # Forward pass
            y_pred, cache = model.forward(batch_X)

            # Compute loss and gradients
            loss = cross_entropy_loss(y_pred, batch_y)

            # Compute gradient
            # btw we aren't double softmaxing here, because cross-entropy loss has a built-in softmax
            # and doesn't affect the code below
            softmax_probs = softmax(y_pred)
            y_true = np.zeros_like(y_pred)
            y_true[batch_y, np.arange(len(batch_y))] = 1
            grad_output = (softmax_probs - y_true) / len(batch_y)

            # Backward pass
            grad_weights1, grad_weights2 = model.backward(grad_output, cache)

            # Update weights
            model.update_weights(grad_weights1, grad_weights2, learning_rate)

            if i % (24 * batch_size) == 0:
                print(f"Iteration: {i//batch_size} Loss: {loss:.4f}")

        # Evaluate on test set
        y_pred, _ = model.forward(X_test)
        test_loss = cross_entropy_loss(y_pred, y_test)
        accuracy = np.mean(np.argmax(y_pred, axis=0) == y_test)
        print(f"Epoch {epoch+1} - Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}")

    print("Training completed!")

# Main execution
if __name__ == "__main__":
    # Assume X_train, y_train, X_test, y_test are loaded from somewhere
    
    input_size = 784  # 28x28 pixels
    hidden_size = 256
    output_size = 10  # 10 digits
    
    model = NeuralNetwork(input_size, hidden_size, output_size)
    
    batch_size = 8
    epochs = 5
    learning_rate = 1e-3
    
    train(model, X_train, y_train, X_test, y_test, batch_size, epochs, learning_rate)


Epoch 1/5
grad_output: (10, 8)
x: (256, 8)
weights: (10, 256)
grad_output: (256, 8)
x: (784, 8)
weights: (256, 784)
Iteration: 0 Loss: 2.2154
grad_output: (10, 8)
x: (256, 8)
weights: (10, 256)
grad_output: (256, 8)
x: (784, 8)
weights: (256, 784)
grad_output: (10, 8)
x: (256, 8)
weights: (10, 256)
grad_output: (256, 8)
x: (784, 8)
weights: (256, 784)
grad_output: (10, 8)
x: (256, 8)
weights: (10, 256)
grad_output: (256, 8)
x: (784, 8)
weights: (256, 784)
grad_output: (10, 8)
x: (256, 8)
weights: (10, 256)
grad_output: (256, 8)
x: (784, 8)
weights: (256, 784)
grad_output: (10, 8)
x: (256, 8)
weights: (10, 256)
grad_output: (256, 8)
x: (784, 8)
weights: (256, 784)
grad_output: (10, 8)
x: (256, 8)
weights: (10, 256)
grad_output: (256, 8)
x: (784, 8)
weights: (256, 784)
grad_output: (10, 8)
x: (256, 8)
weights: (10, 256)
grad_output: (256, 8)
x: (784, 8)
weights: (256, 784)
grad_output: (10, 8)
x: (256, 8)
weights: (10, 256)
grad_output: (256, 8)
x: (784, 8)
weights: (256, 784)
grad_outpu

KeyboardInterrupt: 