In [45]:
import numpy as np
import time
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch

In [122]:
batch_size = 64
learning_rate = 1e-3
num_epochs = 10

# Define necessary naive operations
def conv2d_naive(x, weights, stride=1, padding=1):
    # Apply padding
    x_padded = np.pad(x, ((0, 0), (0, 0), (padding, padding), (padding, padding)), mode='constant', constant_values=0)
    N, C, H, W = x.shape
    F, _, HH, WW = weights.shape
    H_out = int(1 + (H + 2 * padding - HH) / stride)
    W_out = int(1 + (W + 2 * padding - WW) / stride)
    out = np.zeros((N, F, H_out, W_out))

    for i in range(H_out):
        for j in range(W_out):
            h_start = i * stride
            h_end = h_start + HH
            w_start = j * stride
            w_end = w_start + WW
            x_patched = x_padded[:, :, h_start:h_end, w_start:w_end]
            for k in range(F):
                out[:, k, i, j] = np.sum(x_patched * weights[k, :, :, :], axis=(1, 2, 3))

    return out

def conv2d_backward_naive(dout, x, w, stride=1, padding=1):
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    H_out = int(1 + (H + 2 * padding - HH) / stride)
    W_out = int(1 + (W + 2 * padding - WW) / stride)
    
    # Using padded version of input
    x_padded = np.pad(x, ((0, 0), (0, 0), (padding, padding), (padding, padding)), mode='constant', constant_values=0)
    dx_padded = np.zeros_like(x_padded)
    dw = np.zeros_like(w)
    
    for i in range(H_out):
        for j in range(W_out):
            h_start = i * stride
            h_end = h_start + HH
            w_start = j * stride
            w_end = w_start + WW

            for n in range(N):
                idout = dout[n, :, i, j][:, None, None, None]
                dx_padded[n, :, h_start:h_end, w_start:w_end] += np.sum(w * idout, axis=0)
                dw += x_padded[n, :, h_start:h_end, w_start:w_end] * idout
            
    dx = dx_padded[:, :, padding:-padding, padding:-padding] if padding > 0 else dx_padded    
    
    return dx, dw

# def conv2d_backward_naive(X, dL_dY, W_shape=(batch_size, 32), stride=1, padding=1):
#     # Extract dimensions
#     n_filters, d_filter, h_filter, w_filter = W_shape
#     n_x, d_x, h_x, w_x = X.shape
#     n_y, d_y, h_y, w_y = dL_dY.shape
    
#     # Initialize the gradient for the filter weights
#     dL_dW = np.zeros(W_shape)
    
#     # Consider padding
#     X_p = np.pad(X, [(0, 0), (0, 0), (padding, padding), (padding, padding)], mode='constant')
    
#     # Compute the gradient
#     for i in range(h_y):
#         for j in range(w_y):
#             for f in range(n_filters):
#                 h_start = i * stride
#                 h_end = h_start + h_filter
#                 w_start = j * stride
#                 w_end = w_start + w_filter
#                 dL_dW[f, :, :, :] += dL_dY[:, f, i, j][:, None, None, None] * X_p[:, :, h_start:h_end, w_start:w_end]
    
#     return dL_dW

# Update Naive BatchNorm to return cache
def batch_norm_naive(x, gamma, beta, eps=1e-5):
    N, C, H, W = x.shape
    mean = np.mean(x, axis=(0, 2, 3), keepdims=True)
    var = np.var(x, axis=(0, 2, 3), keepdims=True)
    x_norm = (x - mean) / np.sqrt(var + eps)
    out = gamma * x_norm + beta
    cache = (x, x_norm, mean, var, gamma, beta, eps) # this one
    return out, cache

def d_batch_norm_naive(x, gamma, beta, dout, mean, var, eps=1e-5):
    N, C, H, W = x.shape
    x_norm = (x - mean) / np.sqrt(var + eps)
    
    dgamma = np.sum(dout * x_norm, axis=(0, 2, 3), keepdims=True)
    dbeta = np.sum(dout, axis=(0, 2, 3), keepdims=True)
    
    dx_norm = dout * gamma
    dvar = np.sum(dx_norm * (x - mean) * -0.5 * (var + eps)**(-1.5), axis=(0, 2, 3), keepdims=True)
    dmean = np.sum(dx_norm * -1.0 / np.sqrt(var + eps), axis=(0, 2, 3), keepdims=True) + dvar * np.sum(-2.0 * (x - mean), axis=(0, 2, 3), keepdims=True) / N
    dx = dx_norm * 1.0 / np.sqrt(var + eps) + dvar * 2.0 * (x - mean) / N + dmean / N
    
    return dx, dgamma, dbeta

def relu_naive(x):
    return np.maximum(0, x)

def d_relu_naive(x):
    return (x > 0).astype(x.dtype)

def max_pool2d_naive(x, pool_size=2, stride=2):
    N, C, H, W = x.shape
    H_out = int((H - pool_size) / stride + 1)
    W_out = int((W - pool_size) / stride + 1)
    out = np.zeros((N, C, H_out, W_out))

    for i in range(H_out):
        for j in range(W_out):
            h_start = i * stride
            h_end = h_start + pool_size
            w_start = j * stride
            w_end = w_start + pool_size
            x_patched = x[:, :, h_start:h_end, w_start:w_end]
            out[:, :, i, j] = np.max(x_patched, axis=(2, 3))

    return out


def max_pool2d_backward_naive(prev_layer_grad, x, pool_size=2, stride=2):
    """
    Backward pass for a 2D max-pooling layer.
    
    Args:
    - prev_layer_grad: Gradient of the loss with respect to the outputs of the max pooling layer (shape: (N, C, H_out, W_out)).
    - x: Input to the max pooling layer during the forward pass (shape: (N, C, H_in, W_in)).
    - pool_size: Size of the pooling window (default 2).
    - stride: Stride of the pooling window (default 2).
    
    Returns:
    - dx: Gradient of the loss with respect to the inputs of the max pooling layer (shape: (N, C, H_in, W_in)).
    """
    N, C, H_in, W_in = x.shape
    H_out = (H_in - pool_size) // stride + 1
    W_out = (W_in - pool_size) // stride + 1
    
    # Initialize the gradient with respect to input with zeros
    dx = np.zeros_like(x)
    
    for n in range(N):
        for c in range(C):
            for h in range(H_out):
                for w in range(W_out):
                    h_start = h * stride
                    h_end = h_start + pool_size
                    w_start = w * stride
                    w_end = w_start + pool_size

                    # Extract the current pooling region from the input
                    x_pool = x[n, c, h_start:h_end, w_start:w_end]
                    
                    # Determine the maximum value in the pooling region
                    max_val = np.max(x_pool)
                    
                    # Create a mask that is 1 at the position of the max value and 0 elsewhere
                    mask = (x_pool == max_val)
                    
                    # Propagate the gradient through the max-pooling operation
                    dx[n, c, h_start:h_end, w_start:w_end] += prev_layer_grad[n, c, h, w] * mask
                    
    return dx

def softmax_naive(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def cross_entropy_loss_naive(pred, target):
    N = pred.shape[0]
    clipped_pred = np.clip(pred, 1e-12, 1. - 1e-12)
    return -np.sum(target * np.log(clipped_pred)) / N

def d_cross_entropy_loss_naive(pred, target):
    N = pred.shape[0]
    return (pred - target) / N

In [47]:
# MNIST Dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))  # Mean and std of MNIST
])

train_dataset = datasets.MNIST(root='../../../data', train=True, transform=transform, download=True)
test_dataset = datasets.MNIST(root='../../../data', train=False, transform=transform, download=True)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Load all training data into system DRAM
train_data = torch.Tensor()
train_labels = torch.LongTensor()  # Convert to LongTensor for labels
for batchidxev, (data, label) in enumerate(train_loader):
    train_data = torch.cat((train_data, data), dim=0)
    train_labels = torch.cat((train_labels, label), dim=0)
print('Train Data Shape:', train_data.shape)
print('Train Data Type:', train_data.dtype)

# Load all test data into system DRAM
test_data = torch.Tensor()
test_labels = torch.LongTensor()  # Convert to LongTensor for labels
for batchidx, (data, label) in enumerate(test_loader):
    test_data = torch.cat((test_data, data), dim=0)
    test_labels = torch.cat((test_labels, label), dim=0)
print('Test Data Shape:', test_data.shape)
print('Test Data Type:', test_data.dtype)

iters_per_epoch = 60_000 // batch_size
print('Iters per epoch:', iters_per_epoch)

Train Data Shape: torch.Size([60000, 1, 28, 28])
Train Data Type: torch.float32
Test Data Shape: torch.Size([10000, 1, 28, 28])
Test Data Type: torch.float32
Iters per epoch: 937


In [140]:
"""
Forward pass:
conv2d: kernel size 3x3, stride 1, padding 1, in 1 channel, out 32 channels
batchnorm2d: out 32 channels
relu
maxpool2d: kernel size 2x2, stride 2
fc1: in 32*14*14, out 128
relu
fc2: in 128, out 10
log softmax (dim=1)
"""

batch_size = 8
learning_rate = 0.01
num_epochs = 5

class Model():
    def __init__(self):
        self.conv1_w = np.random.randn(32, 1, 3, 3)
        self.bn1_gamma = np.ones((1, 32, 1, 1), dtype=np.float32)
        self.bn1_beta = np.zeros((1, 32, 1, 1), dtype=np.float32)
        self.fc1_w = np.random.randn(32*14*14, 128)
        self.fc2_w = np.random.randn(128, 10)
        print("fc2", self.fc2_w)

        # init grads
        self.conv1_w_grad = np.zeros_like(self.conv1_w)
        self.bn1_gamma_grad = np.zeros_like(self.bn1_gamma)
        self.bn1_beta_grad = np.zeros_like(self.bn1_beta)
        self.fc1_w_grad = np.zeros_like(self.fc1_w)
        self.fc2_w_grad = np.zeros_like(self.fc2_w)
        print('Model initialized')

    def forward(self, x):
        self.x_conv1 = x
        x = conv2d_naive(x, self.conv1_w, stride=1, padding=1)
        # print('Conv1:', x.shape)

        self.x_bn1 = x
        x, self.bn1_cache = batch_norm_naive(self.x_bn1, self.bn1_gamma, self.bn1_beta)
        # print('BN1:', x.shape)
        
        self.x_relu1 = x
        x = relu_naive(x)
        # print('ReLU1:', x.shape)
        
        self.x_pool1 = x
        x = max_pool2d_naive(x, pool_size=2, stride=2)
        # print('MaxPool1:', x.shape)
        
        self.x_flatten = x
        x = x.reshape(x.shape[0], -1)
        # print('Flatten:', x.shape)
        
        self.x_fc1 = x
        x = np.dot(x, self.fc1_w)
        # print('FC1:', x.shape)
        
        self.x_relu2 = x
        x = relu_naive(x)
        # print('ReLU2:', x.shape)
        
        self.x_fc2 = x
        x = np.dot(x, self.fc2_w)
        # print('FC2:', x.shape)
        # check if this is filled with zeros
        
        # if np.all(x == 0):
        #     print('Zero FC2')
        # else: 
        #     print('Non-zero FC2')
        
        # x = softmax_naive(x)
        # print('Softmax:', x.shape)
        return x

    def backward(self, y_pred, y_true):
        # print('y_pred:', y_pred.shape, y_pred)
        # print('y_true:', y_true.shape, y_true)
        loss = cross_entropy_loss_naive(y_pred, y_true)
        print('Loss:', loss)


        # dx -> previous layer gradient
        dx = d_cross_entropy_loss_naive(y_pred, y_true)
        # print('dLoss:', dx.shape) # -> (B, 10)
        # print('dLoss:', dx)

        # do this properly
        # print('shapes', dx.T.shape, self.x_fc2.shape)
        self.fc2_w_grad += self.x_fc2.T @ dx
        print('fc2_w_grad:', self.fc2_w_grad)
        # print('dFC2:', self.fc2_w_grad.shape)
        # check if all any of the numbers are greater than 100 or less than -100
        # if np.any(self.fc2_w_grad > 100) or np.any(self.fc2_w_grad < -100):
        #     print('FC2 Gradient Exploded')
        # print('dFC2:', dx)
        # print(self.fc2_w_grad.shape, self.x_relu2.shape)  
        relu_grad = d_relu_naive(self.x_fc2)
        # print('dReLU2:', dx.shape)
        # print('dReLU2:', dx)
        # print(np.any(relu_grad > 100) or np.any(relu_grad < -100))

        # print(self.x_flatten.T.shape, dx.shape)
        # print(self.x_fc1.shape, relu_grad.shape)
        self.fc1_w_grad = self.x_fc1.T @ relu_grad # (6272, 8) @ (8, 128) -> (6272, 128)

        # print('dFC1:', dx.shape)
        # print('dFC1:', dx)
        # print(np.any(self.fc1_w_grad > 100) or np.any(self.fc1_w_grad < -100))
        
        # print(self.fc1_w_grad.shape)
        # print(self.x_flatten.shape)
        flatten_grad = self.x_fc1.reshape(self.x_flatten.shape) # (8, 6272) -> (8, 32, 14, 14)
        # print('dReshape:', dx.shape)
        # print('dReshape:', dx)
        # print(np.any(flatten_grad > 100) or np.any(flatten_grad < -100))

        # print(flatten_grad.shape, self.x_pool1.shape)
        dx = max_pool2d_backward_naive(flatten_grad, self.x_pool1, pool_size=2, stride=2)
        # print('dMaxPool1:', dx.shape)
        # print('dMaxPool1:', dx)
        # print(np.any(dx > 100) or np.any(dx < -100))

        dx = d_relu_naive(dx)
        # print('dReLU1:', dx.shape)
        # print('dReLU1:', dx)
        # print(np.any(dx > 100) or np.any(dx < -100))

        # print('cache:', self.bn1_cache)
        x, norm, mean, var, gamma, beta, eps = self.bn1_cache
        dx, dgamma, dbeta = d_batch_norm_naive(norm, gamma, beta, dx, mean, var, eps)
        # print('dBN1:', dx.shape)
        # print('dBN1:', dx)
        # print(np.any(dx > 100) or np.any(dx < -100))

        dx, dw_conv1 = conv2d_backward_naive(dx, self.x_conv1, self.conv1_w, stride=1, padding=1)
        # print('dConv1:', dx.shape)

        # print('dConv1:', dx)
        # print(np.any(dx > 100) or np.any(dx < -100))

        return loss, dgamma, dbeta, dw_conv1

model = Model()

for i in range(20):
    start_time = time.time()
 
    data = train_data[i*batch_size:(i+1)*batch_size]
    label = train_labels[i*batch_size:(i+1)*batch_size]
    data = data.numpy()
    label = label.numpy()
    # print('labels', label) if i == 0 else None
    y_pred = model.forward(data)
    # argmax for a (B, 1) output
    # y_pred = np.argmax(y_pred, axis=1)
    # print(y_pred) if i == 0 else None
    print("\n"*2)
    y_true = np.eye(10)[label]
    loss, dgamma, dbeta, dw_conv1 = model.backward(y_pred, y_true)
    print('Epoch:', i, 'Loss:', loss)
    # Update batchnorm parameters
    # model.bn1_gamma -= learning_rate * dgamma
    # model.bn1_beta -= learning_rate * dbeta
    # gradient descent
    model.conv1_w -= learning_rate * model.conv1_w_grad
    model.fc1_w -= learning_rate * model.fc1_w_grad
    model.fc2_w -= learning_rate * model.fc2_w_grad

    # zero grads
    model.conv1_w_grad = np.zeros_like(model.conv1_w)
    model.fc1_w_grad = np.zeros_like(model.fc1_w)
    model.fc2_w_grad = np.zeros_like(model.fc2_w)

 
    end_time = time.time()
    # print('Epoch:', i, 'Time:', end_time - start_time)

fc2 [[-4.04632453e-01  1.55585324e-03 -7.22037405e-01 ... -7.72472995e-01
   3.11575001e-01 -4.48599169e-01]
 [ 6.06264051e-02 -3.03086078e-01  3.65922641e-01 ... -1.03070662e+00
  -1.02773165e+00  2.02246203e+00]
 [ 1.69209793e+00 -2.40898511e-01 -8.51896461e-01 ...  4.14495726e-01
   1.12930323e+00  7.93053221e-01]
 ...
 [ 7.76044912e-01  1.37814775e+00 -1.01397074e-01 ... -7.13224821e-01
   1.21873493e+00 -7.55031189e-01]
 [ 7.42708701e-01  3.96874734e-01  2.76460947e-01 ... -7.41453432e-01
  -4.84037574e-01  2.45427244e-01]
 [-8.97876573e-01 -7.27971402e-01 -8.69469823e-01 ... -2.02834981e-02
   9.00807460e-01  7.20277453e-01]]
Model initialized



Loss: 13.815510557964773
fc2_w_grad: [[   3942.22866724    -159.35069001   16808.38610212 ...  -36131.67906102
    32030.46091518   17255.73436866]
 [  -2106.54362975   -2482.23437908    8264.84361961 ...   -7392.39910211
     3073.84592867  -10313.28094257]
 [  10895.28998235    5657.68982069    5133.06031656 ...  -28773.4986957
    323

KeyboardInterrupt: 