In [None]:
import numpy as np
from sklearn.datasets import fetch_openml  

In [2]:
def dataset(loader_fn, train_num, test_num):
    data_x, data_y = loader_fn
    
    classes = np.unique(data_y)
    
    train_x_list = []
    train_y_list = []
    test_x_list = []
    test_y_list = []
    
    for cls in classes:
        cls_indices = np.where(data_y == cls)[0]
        cls_indices = np.random.permutation(cls_indices)
        
        X_cls = data_x[cls_indices]
        Y_cls = data_y[cls_indices]
        
        train_x_list.append(X_cls[:train_num])
        train_y_list.append(Y_cls[:train_num])
        
        test_x_list.append(X_cls[train_num:train_num + test_num])
        test_y_list.append(Y_cls[train_num:train_num + test_num])
        
    X_train = np.concatenate(train_x_list)
    y_train = np.concatenate(train_y_list)
    X_test = np.concatenate(test_x_list)
    y_test = np.concatenate(test_y_list)
    
    train_perm = np.random.permutation(len(X_train))
    X_train = X_train[train_perm]
    y_train = y_train[train_perm]
    
    test_perm = np.random.permutation(len(X_test))
    X_test = X_test[test_perm]
    y_test = y_test[test_perm]
    
    return X_train, y_train, X_test, y_test

In [3]:
data = fetch_openml("mnist_784")

In [4]:
# Load and preprocess MNIST dataset
# Normalize pixel values to range 0-1 by dividing by 255.0 (grayscale images originally 0-255)
# Convert labels to integer type using astype('int16') to ensure numeric operations work correctly 

X_train, y_train, X_test, y_test = dataset(
    (
        np.asarray(data["data"].values) / 255.0,                 # Normalize input images
        np.asarray(data["target"].values.astype('int16'))        # Convert labels to integers
    ),
    train_num=3000,
    test_num=100
)


In [5]:
class Parameter:
    def __init__(self, data):
        self.data = data
        self.grad = np.zeros_like(data)
        
class Module:
    def __init__(self):
        self.params = {}
        self.layer_dict = {}     # Add

    def forward(self, *args, **kwargs):
        raise NotImplementedError

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    def __setattr__(self, name, value):
        if isinstance(value, Parameter):
            self.params[name] = value
        super().__setattr__(name, value)

        if isinstance(value, Module):
            self.layer_dict[name] = value
        super().__setattr__(name, value)

    def parameters(self):
        params = list(self.params.values())
        for attr in self.__dict__.values():
            if isinstance(attr, Module):
                params.extend(attr.parameters())
        return params

    # Add
    def backward(self, grad_output):
        for layer in reversed(list(self.layer_dict.values())):
            grad_output = layer.backward(grad_output)
        return grad_output

In [6]:
class Linear(Module):
    def __init__(self, *args):
        super().__init__()
        # Normal mode: user specifies input and output size
        if len(args) == 2:  # example: Linear(128, 32)
            in_features, out_features = args
            self.deferred_init = False
            self.initialize_params(in_features, out_features)

        # Deferred initialization: Linear(32)
        elif len(args) == 1:
            (out_features,) = args
            self.deferred_init = True
            self.out_features = out_features
            self.W = None
            self.b = None
        else:
            raise ValueError("Linear expects 1 or 2 arguments")


    def initialize_params(self, in_features, out_features):
        # simple 
        self.W = Parameter(np.random.randn(in_features, out_features) * 0.01)   # (in_features, out_features, )

        # Kaiming He normal initialization (best for ReLU networks)
        # std = np.sqrt(2.0 / in_features)                    
        # self.W = Parameter(np.random.randn(in_features, out_features) * std)  # (in_features, out_features, )
        
        self.b = Parameter(np.zeros(out_features)) # (output_features,)


    def forward(self, x):
        # Deferred initialization
        if self.deferred_init and self.W is None:
            in_features = x.shape[-1]
            self.initialize_params(in_features, self.out_features)
            self.deferred_init = False

        self.x = x
        # x: (batch, in_features) 
        return x @ self.W.data + self.b.data    # (batch, out_features)
    
    def backward(self, grad_output):
        self.W.grad += self.x.T @ grad_output      # (in, batch) @ (batch, out) → (in, out)
        self.b.grad += grad_output.sum(axis=0)
        return grad_output @ self.W.data.T          # (batch, out) @ (out, in) → (batch, in)

In [7]:
class ReLU(Module):
    def forward(self, x):
        self.mask = x > 0
        return x * self.mask

    def backward(self, grad_input):
        return grad_input * self.mask

In [8]:
class MSE:
    def forward(self, y_pred, y_true):
        self.y_pred = y_pred  # Store predictions for backward pass

        # Convert 1D class labels to one-hot if needed
        if y_true.ndim == 1:
            num_classes = y_pred.shape[1]
            self.y_true = np.eye(num_classes)[y_true]
        else:
            self.y_true = y_true  # Already in proper shape

        # Match dtype with predictions
        self.y_true = self.y_true.astype(y_pred.dtype)

        # Average of squared differences
        loss = np.mean((y_pred - self.y_true) ** 2)
        return loss
    
    def backward(self): # dL/dY_pred
        return 2 * (self.y_pred - self.y_true) / self.y_true.shape[0]

In [9]:
class SGD:
    def __init__(self, _module, lr=0.01):
        # Check if the input _module is an instance of the Module class
        # (e.g., Linear, Activation layers)
        if isinstance(_module, Module):  # If it's a Module, get all the parameters (weights and biases)
            self._module = _module.parameters()
        else:
            # If it's already a list of parameters, assign it directly
            self._module = _module
        self.lr = lr  # Set learning rate (default 0.01)

    def step(self):
        # Loop through all weights and biases of the model
        for param in self._module:
            # Update each weights and biases using SGD formula
            param.data -= self.lr * param.grad

    def zero_grad(self):
        for param in self._module:
            param.grad[...] = 0  # reset their gradients to zero to prevent accumulation from previous steps

In [10]:
def accuracy(logits, targets):
    preds = np.argmax(logits, axis=1)
    return np.mean(preds == targets)

In [11]:
class Sequential(Module):
    def __init__(self, layers):
        super().__init__()
        self.layer_dict = {}          # Store the layers
        for i, layer in enumerate(layers):
            self.layer_dict['layer' + str(i)] = layer

    def forward(self, x):
        # Apply each layer in order
        for i in sorted(self.layer_dict.keys()):
            x = self.layer_dict[i](x)
        return x

    def backward(self, grad_output):
        # Backpropagate through each layer in reverse order
        for i in reversed(sorted(self.layer_dict.keys())):
            grad_output = self.layer_dict[i].backward(grad_output)
        return grad_output

    def parameters(self):
        # Collect parameters from all layers
        params = []
        for layer in self.layer_dict.values():
            if hasattr(layer, 'parameters'):
                params.extend(layer.parameters())
        return params

In [12]:
model_seq = Sequential([
    Linear(784, 128),
    ReLU(),
    Linear(32),
    ReLU(),
    Linear(10),
])

In [13]:
class MyModel(Module):
    def __init__(self):
        super().__init__()
        self.fc1 = Linear(784, 128)
        self.relu1 = ReLU()
        self.fc2 = Linear(128, 32)
        self.relu2 = ReLU()
        self.fc3 = Linear(32, 10)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x
    
model_custom = MyModel()

In [None]:
loss_fn = MSE()

epochs = 15
batch_size = 64
initial_lr = 0.01

# model = model_seq
model = model_custom 

optimizer = SGD(model.parameters(), lr=initial_lr)

for epoch in range(epochs):
    for i in range(0, X_train.shape[0], batch_size):
        x_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        logits = model.forward(x_batch)
        loss = loss_fn.forward(logits, y_batch)
        grad_output = loss_fn.backward()
        model.backward(grad_output)
        optimizer.step()
        optimizer.zero_grad()

    logits_train = model.forward(X_train)
    train_loss = loss_fn.forward(logits_train, y_train)
    train_acc = accuracy(logits_train, y_train)

    logits_test = model.forward(X_test)
    test_loss = loss_fn.forward(logits_test, y_test)
    test_acc = accuracy(logits_test, y_test)

    print(f"Epoch {epoch+1} Summary: "
          f"Train Acc={train_acc:.4f}, Train Loss={train_loss:.4f}, "
          f"Test Acc={test_acc:.4f}, Test Loss={test_loss:.4f}")


Epoch 1 Summary: Train Acc=0.1661, Train Loss=0.0900, Test Acc=0.1670, Test Loss=0.0900
Epoch 2 Summary: Train Acc=0.1873, Train Loss=0.0898, Test Acc=0.1920, Test Loss=0.0898
Epoch 3 Summary: Train Acc=0.2871, Train Loss=0.0888, Test Acc=0.2960, Test Loss=0.0888
Epoch 4 Summary: Train Acc=0.3243, Train Loss=0.0809, Test Acc=0.3170, Test Loss=0.0809
Epoch 5 Summary: Train Acc=0.5413, Train Loss=0.0698, Test Acc=0.5430, Test Loss=0.0699
Epoch 6 Summary: Train Acc=0.6260, Train Loss=0.0646, Test Acc=0.6500, Test Loss=0.0647


In [None]:
def save_model(model, path):
    state = {}
    for module_name, module in model.layer_dict.items():
        if isinstance(module, Module):
            for pname, p in module.params.items():
                state[f"{module_name}.{pname}"] = p.data

    for k, v in state.items():
        print(f"{k}: shape={v.shape}")
    np.save(path, state)

def load_model(model, path):
    # Load the saved file
    state = np.load(path, allow_pickle=True).item()

    # Update the parameters of the current model
    for module_name, module in model.layer_dict.items():
        if isinstance(module, Module):
            for pname, p in module.params.items():
                key = f"{module_name}.{pname}"
                if key in state:
                    p.data = state[key]  # Update its value
                else:
                    print("Missing:", key)

    return model

In [None]:
# save
save_path = 'saved_1.npy'
save_model(model, save_path)

# load
model_load = load_model(model, save_path)

fc1.W: shape=(784, 128)
fc1.b: shape=(128,)
fc2.W: shape=(128, 32)
fc2.b: shape=(32,)
fc3.W: shape=(32, 10)
fc3.b: shape=(10,)


In [None]:
loss_fn = MSE()

epochs = 5
batch_size = 64
initial_lr = 0.01

# Reinitialize optimizer after loading the model
optimizer = SGD(model_load.parameters(), lr=initial_lr)

# Continue with the training loop
for epoch in range(epochs):
    for i in range(0, X_train.shape[0], batch_size):
        x_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        logits = model.forward(x_batch)
        loss = loss_fn.forward(logits, y_batch)
        grad_output = loss_fn.backward()
        model.backward(grad_output)
        optimizer.step()
        optimizer.zero_grad()

    logits_train = model.forward(X_train)
    train_loss = loss_fn.forward(logits_train, y_train)
    train_acc = accuracy(logits_train, y_train)

    logits_test = model.forward(X_test)
    test_loss = loss_fn.forward(logits_test, y_test)
    test_acc = accuracy(logits_test, y_test)

    print(f"Epoch {epoch+1} Summary: "
          f"Train Acc={train_acc:.4f}, Train Loss={train_loss:.4f}, "
          f"Test Acc={test_acc:.4f}, Test Loss={test_loss:.4f}")


Epoch 1 Summary: Train Acc=0.9104, Train Loss=0.0218, Test Acc=0.9030, Test Loss=0.0228
Epoch 2 Summary: Train Acc=0.9158, Train Loss=0.0205, Test Acc=0.9100, Test Loss=0.0215
Epoch 3 Summary: Train Acc=0.9207, Train Loss=0.0193, Test Acc=0.9140, Test Loss=0.0203
Epoch 4 Summary: Train Acc=0.9247, Train Loss=0.0183, Test Acc=0.9210, Test Loss=0.0192
Epoch 5 Summary: Train Acc=0.9297, Train Loss=0.0174, Test Acc=0.9240, Test Loss=0.0183
