In [33]:
import numpy as xp

In [None]:
def dataset(loader_fn, train_num, test_num):
    data_x, data_y = loader_fn
    
    classes = xp.unique(data_y)
    
    train_x_list = []
    train_y_list = []
    test_x_list = []
    test_y_list = []
    
    for cls in classes:
        cls_indices = xp.where(data_y == cls)[0]
        cls_indices = xp.random.permutation(cls_indices)
        
        X_cls = data_x[cls_indices]
        Y_cls = data_y[cls_indices]
        
        train_x_list.append(X_cls[:train_num])
        train_y_list.append(Y_cls[:train_num])
        
        test_x_list.append(X_cls[train_num:train_num + test_num])
        test_y_list.append(Y_cls[train_num:train_num + test_num])
        
    X_train = xp.concatenate(train_x_list)
    y_train = xp.concatenate(train_y_list)
    X_test = xp.concatenate(test_x_list)
    y_test = xp.concatenate(test_y_list)
    
    train_perm = xp.random.permutation(len(X_train))
    X_train = X_train[train_perm]
    y_train = y_train[train_perm]
    
    test_perm = xp.random.permutation(len(X_test))
    X_test = X_test[test_perm]
    y_test = y_test[test_perm]
    
    return X_train, y_train, X_test, y_test

In [35]:
from sklearn.datasets import fetch_openml  

data = fetch_openml("mnist_784")

In [36]:
X_train, y_train, X_test, y_test = dataset((xp.asarray(data["data"].values) / 255.0, xp.asarray(data["target"].values.astype('int16'))), 3000,10)

In [37]:
class Parameter:
    def __init__(self, data):
        self.data = data
        self.grad = xp.zeros_like(data)
        
class Module:
    def __init__(self):
        self.params = {}
        
    def forward(self, *args, **kwargs):
        raise NotImplementedError
    
    def __call__(self, x):
        return self.forward(x)

    def __setattr__(self, name, value):
        if isinstance(value, Parameter):
            self.params[name] = value
        super().__setattr__(name, value)
        
    def parameters(self):
        params = list(self.params.values())
        # print('params', params)
        # print('dict', self.__dict__.values())
        for attr in self.__dict__.values():
            if isinstance(attr, Module):
                params.extend(attr.parameters())
        return params

In [38]:
class Linear(Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.W = Parameter(xp.random.randn(in_features, out_features) * 0.01)
        self.b = Parameter(xp.zeros(out_features))
        self.x = None

    def forward(self, x):
        self.x = x
        return x @ self.W.data + self.b.data

    def backward(self, grad_output):
        self.W.grad += self.x.T @ grad_output
        self.b.grad += grad_output.sum(axis=0)
        return grad_output @ self.W.data.T

class ReLU(Module):
    def forward(self, x):
        self.mask = x > 0
        return x * self.mask

    def backward(self, grad_output):
        return grad_output * self.mask

class SGD:
    def __init__(self, parameters, lr=0.01):
        self.parameters = list(parameters)
        self.lr = lr

    def step(self):
        for param in self.parameters:
            param.data -= self.lr * param.grad
            

    def zero_grad(self):
        for param in self.parameters:
            param.grad.fill(0)


def accuracy(logits, targets):
    preds = xp.argmax(logits, axis=1)
    return xp.mean(preds == targets)

In [39]:
class MSE:
    def forward(self, y_pred, y_true):
        self.y_pred = y_pred

        if y_true.ndim == 1:
            # Convert 1D class indices to one-hot vectors
            num_classes = y_pred.shape[1]
            self.y_true = xp.eye(num_classes)[y_true]
        else:
            self.y_true = y_true

        self.y_true = self.y_true.astype(y_pred.dtype)
        loss = xp.mean((y_pred - self.y_true) ** 2)
        return loss

    def backward(self):
        return 2 * (self.y_pred - self.y_true) / self.y_true.shape[0]


In [40]:
class MyModel(Module):
    def __init__(self):
        super().__init__()
        self.fc1 = Linear(784, 128)
        self.relu1 = ReLU()
        self.fc2 = Linear(128, 32)
        self.relu2 = ReLU()
        self.fc3 = Linear(32, 10)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

    def backward(self, grad_output):
        grad_output = self.fc3.backward(grad_output)
        grad_output = self.relu2.backward(grad_output)
        grad_output = self.fc2.backward(grad_output)
        grad_output = self.relu1.backward(grad_output)
        grad_output = self.fc1.backward(grad_output)
        return grad_output


In [41]:
loss_fn = MSE()

epochs = 5
batch_size = 64
initial_lr = 0.01

model = MyModel()
optimizer = SGD(model.parameters(), lr=initial_lr)

for epoch in range(epochs):
    for i in range(0, X_train.shape[0], batch_size):
        x_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        logits = model.forward(x_batch)
        loss = loss_fn.forward(logits, y_batch)
        grad_output = loss_fn.backward()
        model.backward(grad_output)
        optimizer.step()
        optimizer.zero_grad()

    logits_train = model.forward(X_train)
    train_loss = loss_fn.forward(logits_train, y_train)
    train_acc = accuracy(logits_train, y_train)

    logits_test = model.forward(X_test)
    test_loss = loss_fn.forward(logits_test, y_test)
    test_acc = accuracy(logits_test, y_test)

    print(f"Epoch {epoch+1} Summary: "
          f"Train Acc={train_acc:.4f}, Train Loss={train_loss:.4f}, "
          f"Test Acc={test_acc:.4f}, Test Loss={test_loss:.4f}")


Epoch 1 Summary: Train Acc=0.1000, Train Loss=0.0899, Test Acc=0.1000, Test Loss=0.0899
Epoch 2 Summary: Train Acc=0.1068, Train Loss=0.0896, Test Acc=0.1100, Test Loss=0.0897
Epoch 3 Summary: Train Acc=0.2831, Train Loss=0.0873, Test Acc=0.2800, Test Loss=0.0876
Epoch 4 Summary: Train Acc=0.3596, Train Loss=0.0771, Test Acc=0.3300, Test Loss=0.0775
Epoch 5 Summary: Train Acc=0.5823, Train Loss=0.0665, Test Acc=0.5000, Test Loss=0.0664


In [42]:
def save_model(model, path):
    state = {}
    for name, module in model.__dict__.items():
        if isinstance(module, Module):
            for pname, p in module.params.items():
                state[f"{name}.{pname}"] = p.data
    xp.save(path, state)

def load_model(model, path):
    state = xp.load(path, allow_pickle=True).item()

    for name, module in model.__dict__.items():
        if isinstance(module, Module):
            for pname, p in module.params.items():
                key = f"{name}.{pname}"
                if key in state:
                    p.data = state[key]
                else:
                    print("Missing:", key)

    return model

save_path = 'saved_1.npy'
save_model(model, save_path)

In [43]:
class MyModel(Module):
    def __init__(self):
        super().__init__()
        self.fc1 = Linear(784, 128)
        self.relu1 = ReLU()
        self.fc2 = Linear(128, 32)
        self.relu2 = ReLU()
        self.fc3 = Linear(32, 10)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

    def backward(self, grad_output):
        grad_output = self.fc3.backward(grad_output)
        grad_output = self.relu2.backward(grad_output)
        grad_output = self.fc2.backward(grad_output)
        grad_output = self.relu1.backward(grad_output)
        grad_output = self.fc1.backward(grad_output)
        return grad_output

model_ = MyModel()

In [44]:
model.__dict__

{'params': {},
 'fc1': <__main__.Linear at 0x1a6c5993580>,
 'relu1': <__main__.ReLU at 0x1a6c5992590>,
 'fc2': <__main__.Linear at 0x1a6c59922c0>,
 'relu2': <__main__.ReLU at 0x1a6c59920e0>,
 'fc3': <__main__.Linear at 0x1a6c5991f60>}

In [50]:
model.parameters()

[<__main__.Parameter at 0x1a6c5990af0>,
 <__main__.Parameter at 0x1a6c5992980>,
 <__main__.Parameter at 0x1a6c5991a50>,
 <__main__.Parameter at 0x1a6c59924a0>,
 <__main__.Parameter at 0x1a6c5991c60>,
 <__main__.Parameter at 0x1a6c5991180>]

In [46]:

model1 = load_model(model_, save_path)

# Reinitialize optimizer after loading the model
optimizer = SGD(model1.parameters(), lr=initial_lr)

# Continue with the training loop
for epoch in range(4):
    for i in range(0, X_train.shape[0], batch_size):
        x_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        logits = model1.forward(x_batch)
        loss = loss_fn.forward(logits, y_batch)
        grad_output = loss_fn.backward()
        model1.backward(grad_output)
        optimizer.step()
        optimizer.zero_grad()

    logits_train = model1.forward(X_train)
    train_loss = loss_fn.forward(logits_train, y_train)
    train_acc = accuracy(logits_train, y_train)

    logits_test = model1.forward(X_test)
    test_loss = loss_fn.forward(logits_test, y_test)
    test_acc = accuracy(logits_test, y_test)

    print(f"Epoch {epoch+1} Summary: "
          f"Train Acc={train_acc:.4f}, Train Loss={train_loss:.4f}, "
          f"Test Acc={test_acc:.4f}, Test Loss={test_loss:.4f}")

Epoch 1 Summary: Train Acc=0.6683, Train Loss=0.0591, Test Acc=0.6400, Test Loss=0.0587
Epoch 2 Summary: Train Acc=0.7657, Train Loss=0.0519, Test Acc=0.7500, Test Loss=0.0512
Epoch 3 Summary: Train Acc=0.7998, Train Loss=0.0449, Test Acc=0.7800, Test Loss=0.0438
Epoch 4 Summary: Train Acc=0.8164, Train Loss=0.0384, Test Acc=0.8000, Test Loss=0.0374
