In [1]:
import numpy as np

In [2]:
class Parameter:
    def __init__(self, data):
        self.data = data
        self.grad = np.zeros_like(data)
        
class Module:
    # ADD
    def __init__(self):
        self.params = {}

    def forward(self, *args, **kwargs):
        raise NotImplementedError

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    # ADD
    def __setattr__(self, name, value):
        if isinstance(value, Parameter):
            self.params[name] = value
        super().__setattr__(name, value)

    def parameters(self):
        params = list(self.params.values())
        for attr in self.__dict__.values():
            if isinstance(attr, Module):
                params.extend(attr.parameters())
        return params

In [None]:
class Linear(Module):
    def __init__(self, *args):
        super().__init__()
        # Normal mode: user specifies input and output size
        if len(args) == 2:  # example: Linear(128, 32)
            in_features, out_features = args
            self.deferred_init = False
            self.initialize_params(in_features, out_features)

        # Deferred initialization: Linear(32)
        elif len(args) == 1:
            (out_features,) = args
            self.deferred_init = True
            self.out_features = out_features
            self.W = None
            self.b = None
        else:
            raise ValueError("Linear expects 1 or 2 arguments")


    def initialize_params(self, in_features, out_features):
        # simple 
        self.W = Parameter(np.random.randn(in_features, out_features) * 0.01)   # (in_features, out_features, )

        # Kaiming He normal initialization (best for ReLU networks)
        # std = np.sqrt(2.0 / in_features)                    
        # self.W = Parameter(np.random.randn(in_features, out_features) * std)  # (in_features, out_features, )
        
        self.b = Parameter(np.zeros(out_features)) # (output_features,)


    def forward(self, x):
        # Deferred initialization
        if self.deferred_init and self.W is None:
            in_features = x.shape[-1]
            self.initialize_params(in_features, self.out_features)
            self.deferred_init = False

        self.x = x
        # x: (batch, in_features) 
        return x @ self.W.data + self.b.data    # (batch, out_features)
    
    def backward(self, grad_output):
        self.W.grad += self.x.T @ grad_output      # (in, batch) @ (batch, out) → (in, out)
        self.b.grad += grad_output.sum(axis=0)
        return grad_output @ self.W.data.T          # (batch, out) @ (out, in) → (batch, in)

In [4]:
class ReLU(Module):
    def forward(self, x):
        self.mask = x > 0
        return x * self.mask

    def backward(self, grad_input):
        return grad_input * self.mask

In [5]:
class MSE:
    def forward(self, y_pred, y_true):
        self.y_pred = y_pred  # Store predictions for backward pass

        # Convert 1D class labels to one-hot if needed
        if y_true.ndim == 1:
            num_classes = y_pred.shape[1]
            self.y_true = np.eye(num_classes)[y_true]
        else:
            self.y_true = y_true  # Already in proper shape

        # Match dtype with predictions
        self.y_true = self.y_true.astype(y_pred.dtype)

        # Average of squared differences
        loss = np.mean((y_pred - self.y_true) ** 2)
        return loss
    
    def backward(self): # dL/dY_pred
        return 2 * (self.y_pred - self.y_true) / self.y_true.shape[0]

In [None]:
class CrossEntropy:
    def forward(self, y_pred, y_true):
        self.y_pred = y_pred # Store predictions for backward pass

        # If labels are 1D (class indices), convert them to one-hot encoding
        if y_true.ndim == 1:
            num_classes = y_pred.shape[1]  # Number of output classes
            self.y_true = np.eye(num_classes)[y_true]  # One-hot encode
        else:
            self.y_true = y_true  # Already one-hot encoded

        # Match dtype with predictions
        self.y_true = self.y_true.astype(y_pred.dtype)

        # Clip predictions to avoid log(0) which can cause numerical issues
        y_pred_clipped = np.clip(y_pred, 1e-12, 1.0)

        # Compute cross-entropy loss:
        #   - sum over classes for each sample
        #   - then average over all samples
        loss = -np.mean(np.sum(self.y_true * np.log(y_pred_clipped), axis=1))
        return loss
    
    def backward(self):
        N = self.y_pred.shape[0]  # batch size
        # Gradient: (y_pred - y_true) / N
        grad = (self.y_pred - self.y_true) / N
        return grad

In [6]:
class SGD:
    def __init__(self, _module, lr=0.01):
        # Check if the input _module is an instance of the Module class
        # (e.g., Linear, Activation layers)
        if isinstance(_module, Module):  # If it's a Module, get all the parameters (weights and biases)
            self._module = _module.parameters()
        else:
            # If it's already a list of parameters, assign it directly
            self._module = _module
        self.lr = lr  # Set learning rate (default 0.01)

    def step(self):
        # Loop through all weights and biases of the model
        for param in self._module:
            # Update each weights and biases using SGD formula
            param.data -= self.lr * param.grad

    def zero_grad(self):
        for param in self._module:
            param.grad[...] = 0  # reset their gradients to zero to prevent accumulation from previous steps

In [7]:
def accuracy(logits, targets):
    preds = np.argmax(logits, axis=1)
    return np.mean(preds == targets)

In [None]:
class MyModel(Module):
    def __init__(self):
        super().__init__()
        self.fc1 = Linear(784, 128)
        self.relu1 = ReLU()
        self.fc2 = Linear(128, 32)
        self.relu2 = ReLU()
        self.fc3 = Linear(32, 10)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

    def backward(self, grad_output):
        grad_output = self.fc3.backward(grad_output)
        grad_output = self.relu2.backward(grad_output)
        grad_output = self.fc2.backward(grad_output)
        grad_output = self.relu1.backward(grad_output)
        grad_output = self.fc1.backward(grad_output)
        return grad_output

model = MyModel()

epochs = 10         # Number of epochs
batch_size = 10     # Number of samples
num_classes = 10    # Number of class

# loss_fn = MSE()
loss_fn = CrossEntropy()

initial_lr = 0.01
optimizer = SGD(model, lr=initial_lr)

# input (random images)
X_random = np.random.rand(batch_size, 784)   # shape: (10,784)

# labels (random class indices 0–9)
y_indices = np.random.randint(0, num_classes, size=batch_size)

# y_batch = np.eye(num_classes)[y_indices]
# Not needed because we defined the loss to handle both label encoding and one-hot encoding.

model = MyModel()   # Instantiate the model

# Training loop
for epoch in range(epochs):
    for i in range(0, X_random.shape[0], batch_size):
        # Get the current batch
        x_batch = X_random[i:i+batch_size]
        y_batch = y_indices[i:i+batch_size]

        # Forward pass
        logits = model.forward(x_batch)
        loss = loss_fn.forward(logits, y_batch)

        # Backward pass
        grad_output = loss_fn.backward()  # Get gradient of the loss
        model.backward(grad_output)  # Propagate gradients back through the model

        # Update parameters
        optimizer.step()
        optimizer.zero_grad()  # Reset gradients

        iter_num = i // batch_size + 1

    # Evaluate on the entire training set
    logits_train = model.forward(X_random)
    train_loss = loss_fn.forward(logits_train, y_indices)
    train_acc = accuracy(logits_train, y_indices)

    # Print progress for the current epoch
    print(f"Epoch {epoch+1} Summary: "
          f"Train Acc={train_acc:.4f}, Train Loss={train_loss:.4f}")

Epoch 1 Summary: Train Acc=0.2000, Train Loss=0.8943
Epoch 2 Summary: Train Acc=0.2000, Train Loss=0.8943
Epoch 3 Summary: Train Acc=0.2000, Train Loss=0.8943
Epoch 4 Summary: Train Acc=0.2000, Train Loss=0.8943
Epoch 5 Summary: Train Acc=0.2000, Train Loss=0.8943
Epoch 6 Summary: Train Acc=0.2000, Train Loss=0.8943
Epoch 7 Summary: Train Acc=0.2000, Train Loss=0.8943
Epoch 8 Summary: Train Acc=0.2000, Train Loss=0.8943
Epoch 9 Summary: Train Acc=0.2000, Train Loss=0.8943
Epoch 10 Summary: Train Acc=0.2000, Train Loss=0.8943
