In [1]:
import numpy as np

In [2]:
class Parameter:
    """A value array paired with a gradient buffer of the same shape and dtype."""

    def __init__(self, data):
        # ``data`` is stored as given (no copy); ``grad`` starts out all zeros.
        self.data, self.grad = data, np.zeros_like(data)
        
class Module:
    """Base class for layers.

    Subclasses implement ``forward``; calling an instance runs ``forward`` with
    the same positional and keyword arguments.
    """

    def forward(self, *args, **kwargs):
        """Compute the layer output. Must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def __call__(self, *args, **kwargs):
        # Forward everything so __call__ accepts exactly what forward accepts
        # (multi-input layers, keyword options), not just a single ``x``.
        return self.forward(*args, **kwargs)

In [3]:
class Linear(Module):
    """Fully connected layer computing ``x @ W + b``.

    Two construction forms are accepted:

    * ``Linear(in_features, out_features)`` -- parameters are created immediately.
    * ``Linear(out_features)`` -- parameter creation is deferred until the first
      ``forward`` call, where ``in_features`` is read from ``x.shape[-1]``.

    Attributes:
        in_features: input width, or ``None`` while initialization is deferred.
        out_features: output width.
        W: ``Parameter`` of shape ``(in_features, out_features)``, or ``None``
            while initialization is deferred.
        b: ``Parameter`` of shape ``(out_features,)``, or ``None`` while
            initialization is deferred.
        deferred_init: ``True`` until deferred parameters have been created.
        x: the input given to the most recent ``forward`` call.

    Raises:
        ValueError: if constructed with anything other than 1 or 2 arguments.
    """

    def __init__(self, *args):
        super().__init__()
        # Normal mode: user specifies input and output size, e.g. Linear(128, 32)
        if len(args) == 2:
            in_features, out_features = args
            self.deferred_init = False
            self.initialize_params(in_features, out_features)

        # Deferred initialization, e.g. Linear(32)
        elif len(args) == 1:
            (out_features,) = args
            self.deferred_init = True
            # Same attribute set as the eager path; in_features is filled in later.
            self.in_features = None
            self.out_features = out_features
            self.W = None
            self.b = None
        else:
            raise ValueError("Linear expects 1 or 2 arguments")

    def initialize_params(self, in_features, out_features):
        """Create ``W`` and ``b`` for the given layer sizes.

        Weights use Kaiming He normal initialization (suited to ReLU networks):
        standard-normal samples scaled by ``sqrt(2 / in_features)``. The bias
        starts at zero.
        """
        # Record both sizes so eager and deferred layers expose the same attributes.
        self.in_features = in_features
        self.out_features = out_features

        std = np.sqrt(2.0 / in_features)
        self.W = Parameter(np.random.randn(in_features, out_features) * std)  # (in_features, out_features)
        self.b = Parameter(np.zeros(out_features))  # (out_features,)

    def forward(self, x):
        """Apply the affine map to ``x``.

        Args:
            x: array whose last axis has length ``in_features``,
                e.g. ``(batch, in_features)``.

        Returns:
            ``x @ W + b``; ``(batch, out_features)`` for a 2-D input.
        """
        # Deferred initialization: size the weights from the first input seen.
        if self.deferred_init and self.W is None:
            self.initialize_params(x.shape[-1], self.out_features)
            self.deferred_init = False

        # Keep the input on the instance (presumably for a backward pass
        # defined elsewhere -- not visible in this section).
        self.x = x
        return x @ self.W.data + self.b.data

In [4]:
class ReLU(Module):
    """Rectified linear unit: element-wise ``max(x, 0)``."""

    def forward(self, x):
        """Return ``x`` with non-positive entries replaced by zero.

        Also stores ``self.mask``, a boolean array that is ``True`` where
        ``x > 0``.
        """
        # Mask for positive elements (positive -> keep, else -> 0)
        self.mask = x > 0
        # np.maximum rather than ``x * mask``: multiplying by a False mask turns
        # -inf into nan (``-inf * 0``) and negative values into -0.0.
        return np.maximum(x, 0)

In [5]:
class MSE:
    """Mean squared error loss.

    ``forward`` keeps the predictions and the (possibly one-hot encoded)
    targets on the instance as ``self.y_pred`` and ``self.y_true``.
    """

    def forward(self, y_pred, y_true):
        """Return the mean of the squared element-wise differences.

        Args:
            y_pred: predictions (array-like).
            y_true: targets (array-like). A 1-D array paired with predictions
                that have a class axis (two or more dimensions) is treated as
                integer class indices and one-hot encoded to
                ``y_pred.shape[1]`` classes. Anything else is used as given,
                so 1-D predictions with 1-D targets are compared element-wise.

        Returns:
            Scalar mean of ``(y_pred - y_true) ** 2`` over all elements.
        """
        y_pred = np.asarray(y_pred)
        y_true = np.asarray(y_true)
        self.y_pred = y_pred  # Store predictions for backward pass

        # Only one-hot encode when the predictions actually have a class axis;
        # reading y_pred.shape[1] on 1-D predictions would raise IndexError.
        if y_true.ndim == 1 and y_pred.ndim >= 2:
            num_classes = y_pred.shape[1]
            self.y_true = np.eye(num_classes)[y_true]
        else:
            self.y_true = y_true

        # Match dtype with predictions
        self.y_true = self.y_true.astype(y_pred.dtype)

        return np.mean((y_pred - self.y_true) ** 2)

In [6]:
class CrossEntropy:
    """Categorical cross-entropy loss.

    ``forward`` keeps its ``y_pred`` argument and the (possibly one-hot
    encoded) targets on the instance as ``self.y_pred`` and ``self.y_true``.
    """

    def forward(self, y_pred, y_true, from_logits=False):
        """Return the cross-entropy averaged over the samples.

        Args:
            y_pred: ``(batch, num_classes)`` array. Class probabilities by
                default; raw unnormalized scores when ``from_logits`` is True.
            y_true: either 1-D integer class indices (one-hot encoded here) or
                an array with the same shape as ``y_pred``.
            from_logits: when True, a numerically stable log-softmax is applied
                to ``y_pred`` first. When False (default), ``y_pred`` is
                clipped to ``[1e-12, 1]`` and its log is taken directly.

        Returns:
            Scalar loss: sum over classes for each sample, then the mean over
            samples, negated.
        """
        y_pred = np.asarray(y_pred)
        y_true = np.asarray(y_true)
        self.y_pred = y_pred  # Store predictions for backward pass

        # If labels are 1D (class indices), convert them to one-hot encoding
        if y_true.ndim == 1:
            num_classes = y_pred.shape[1]
            self.y_true = np.eye(num_classes)[y_true]
        else:
            self.y_true = y_true  # Already one-hot encoded

        # Match dtype with predictions
        self.y_true = self.y_true.astype(y_pred.dtype)

        if from_logits:
            # log-softmax via the max-shift trick: subtracting the row maximum
            # keeps exp() from overflowing and leaves the result unchanged.
            shifted = y_pred - np.max(y_pred, axis=1, keepdims=True)
            log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        else:
            # Clip predictions to avoid log(0) which can cause numerical issues
            log_probs = np.log(np.clip(y_pred, 1e-12, 1.0))

        return -np.mean(np.sum(self.y_true * log_probs, axis=1))


In [7]:
class MyModel:
    """Three linear layers (784 -> 128 -> 32 -> 10) with a ReLU after the first two."""

    def __init__(self):
        # Layers are created in the order they are applied.
        self.fc1, self.relu1 = Linear(784, 128), ReLU()
        self.fc2, self.relu2 = Linear(128, 32), ReLU()
        self.fc3 = Linear(32, 10)

    def forward(self, x):
        """Pass ``x`` through every layer in order and return the last layer's output."""
        pipeline = (self.fc1, self.relu1, self.fc2, self.relu2, self.fc3)
        for layer in pipeline:
            x = layer(x)
        return x

SEED = 42           # Makes the random inputs, labels and weights repeatable
batch_size = 10     # Number of samples
num_classes = 10    # Number of classes

# Linear draws its weights from NumPy's global RNG (np.random.randn), so seed
# that generator before any data or model is created.
np.random.seed(SEED)

loss_fn = MSE()
# loss_fn = CrossEntropy()
# Note: CrossEntropy.forward clips its input to [1e-12, 1] before taking the
# log by default, i.e. it expects probabilities rather than the raw scores
# produced below.

# Input (random images)
X_random = np.random.rand(batch_size, 784)   # shape: (10, 784)

# Labels (random class indices 0-9)
y_indices = np.random.randint(0, num_classes, size=batch_size)

# No manual one-hot step (np.eye(num_classes)[y_indices]) is needed: the loss
# accepts both class indices and one-hot targets.

model = MyModel()   # Instantiate the model

# Run forward pass
logits = model.forward(X_random)
loss = loss_fn.forward(logits, y_indices)

print("Output shape:", logits.shape)
print("Random labels:", y_indices)
print("logits:", logits[0])
print("Predicted labels:", np.argmax(logits, axis=1)) # Logits give class scores -> use np.argmax to convert to labels
print("Loss:", loss)

Output shape: (10, 10)
Random labels: [4 4 7 8 8 2 1 3 8 1]
logits: [-0.8690983   0.62165563 -0.07143504 -0.60895046 -0.24058133  0.60720407
 -0.1415443   0.67932762  0.57505472 -0.21747152]
Predicted labels: [7 1 7 7 7 7 1 7 1 1]
Loss: 0.5348666917144509
