Multi Layer perceptron 

In [1]:
import math 
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class Value:
    def __init__(self, data, _children=(), _op='', label='') -> None:
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._prev = _children
        self._op = _op
        self.label = label
    
    def __repr__(self) -> str:
        return f'Value(data = {self.data})'
    
    

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self,other), '+')
        
        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward

        return out
    
    
    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self,other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out
    

    def __pow__(self, other):
        assert isinstance(other, (int,float)) # only support int & float powers for now
        out = Value(self.data**other, (self,), f'^{other}')

        def _backward():
            self.grad += other * (self.data ** (other - 1)) * out.grad
        out._backward = _backward

        return out
    
    
    
    def __neg__(self):
        return -1 * self
    
    def __sub__(self, other):
        return self + (-other)
    
    def __rsub__(self, other):
        return self + (-other)
    
    def __radd__(self, other):
        return self + other
    
    def __rmul__(self, other):
        return self * other
    
    def __truediv__(self, other):
        return self * (other ** -1)
    
    def __rtruediv__(self, other):
        return self * (other ** -1)

    
    def sigmoid(self):
        x = self.data
        s = 1/(1+ math.exp(-x))
        out = Value(s, (self, ), 'sigmoid')

        def _backward():
            self.grad += s * (1-s)

        out._backward = _backward
        
        return out
    
    def relu(self):
        x = self.data
        r = max(x,0)
        out = Value(r, (self, ), 'ReLU')

        def _backward():
            self.grad += 1 if x > 0 else 0

        out._backward = _backward

        return out
    
    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self, ), 'tanh')
        
        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        
        return out
    
    def linear(self):
        x = self.data
        out = Value(x, (self, ), 'linear')

        def _backward():
            self.grad += 1.0 * out.grad
        out._backward = _backward
        
        return out
    
    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self, ), 'exp')

        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward
        return out
    
    def ln(self):
        x = self.data
        out = Value(math.log(x), (self, ), 'ln')

        def _backward():
            self.grad += (1/x) * out.grad
        out._backward = _backward
        return out
    

    def backward(self):
        topological = []
        visited = set()

        def build_topological(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topological(child)
                topological.append(v)

        build_topological(self)
        
        self.grad = 1.0
        for node in reversed(topological):
            node._backward()

In [3]:
class Neuron:
    
    def __init__(self, numIn, activation) -> None:
        self.w = [Value(random.uniform(-1,1)) for _ in range(numIn)]
        self.b = Value(random.uniform(-1,1))
        self.act_func = activation
    
    def __call__(self, x):
        # w . x + b
        act = sum(w1*x1 for w1, x1 in zip(self.w, x)) + self.b
        out = getattr(act, self.act_func)()
        return out
    
    def parameters(self):
        return self.w + [self.b]

class Layer:
    def __init__(self, numIn, numOut, activation) -> None:
        self.neurons = [Neuron(numIn, activation) for _ in range(numOut)]
    
    def __call__(self, x):
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs
    
    def parameters(self):
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params
    
        
class MLP:
    def __init__(self, numIn, numOuts, activations) -> None:
        sizes = [numIn] + numOuts 
        self.layers = [Layer(sizes[i], sizes[i+1], activations[i]) for i in range(len(numOuts))]
    
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    
    def parameters(self):
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params
        

MNIST DATASET


In [22]:
import numpy as np

In [25]:
class MNISTClassifier:
    def __init__(self, layer_sizes=[784, 128, 64, 10]):
        """Initialize the MNIST classifier with specified layer sizes."""
        self.model = MLP(4, layer_sizes, ['relu','relu','relu','linear'])
        
    def forward(self, x):
        """Forward pass through the network."""
        # Convert input to list of Values
        x = [Value(xi) for xi in x]
        return self.model(x)
    
    def loss(self, outputs, target):
        """Calculate cross entropy loss."""
        # Simple negative log likelihood loss
        exp_scores = [o.exp() for o in outputs]
        sum_exp_scores = sum(exp_scores)
        probs = [e / sum_exp_scores for e in exp_scores]
        return -probs[target].ln()
    
    def train_step(self, x, y, learning_rate=0.01):
        """Perform one training step."""
        # Forward pass
        outputs = self.forward(x)
        
        # Calculate loss
        loss = self.loss(outputs, y)
        
        # Backward pass
        self.model.zero_grad()
        loss.backward()
        
        # Update weights
        for p in self.model.parameters():
            p.data -= learning_rate * p.grad
        
        return loss.data
    
    def predict(self, x):
        """Make prediction for input x."""
        outputs = self.forward(x)
        scores = [o.data for o in outputs]
        return np.argmax(scores)

def train_mnist(model, X_train, y_train, batch_size=32, epochs=5, learning_rate=0.01):
    """Train the model on MNIST data."""
    n_samples = len(X_train)
    losses = []
    
    for epoch in range(epochs):
        epoch_losses = []
        # Shuffle data
        # indices = np.random.permutation(n_samples)
        # X_train = X_train[indices]
        # y_train = y_train[indices]
        
        for i in range(0, n_samples, batch_size):
            batch_X = X_train[i:i + batch_size]
            batch_y = y_train[i:i + batch_size]
            
            batch_losses = []
            for x, y in zip(batch_X, batch_y):
                loss = model.train_step(x, y, learning_rate)
                batch_losses.append(loss)
            
            avg_loss = np.mean(batch_losses)
            epoch_losses.append(avg_loss)
            
            if i % 1000 == 0:
                print(f'Epoch {epoch+1}, Step {i}, Loss: {avg_loss:.4f}')
        
        avg_epoch_loss = np.mean(epoch_losses)
        losses.append(avg_epoch_loss)
        print(f'Epoch {epoch+1} completed, Average Loss: {avg_epoch_loss:.4f}')
    
    return losses

def evaluate(model, X_test, y_test, num_samples=1000):
    """Evaluate model accuracy on test set."""
    correct = 0
    for x, y in zip(X_test[:num_samples], y_test[:num_samples]):
        pred = model.predict(x)
        if pred == y:
            correct += 1
    
    accuracy = correct / num_samples
    return accuracy

def main():
    # # Load MNIST data
    # print("Loading MNIST dataset...")
    # (X_train, y_train), (X_test, y_test) = fetch_mnist()
    
    # Create and train model
    print("Initializing model...")
    model = MNISTClassifier([784, 128, 64, 10])
    
    print("Starting training...")
    losses = train_mnist(model, X_train, y_train, 
                        batch_size=32, 
                        epochs=5, 
                        learning_rate=0.01)
    
    # Evaluate model
    # print("Evaluating model...")
    # accuracy = evaluate(model, X_test, y_test)
    # print(f"Test accuracy: {accuracy:.4f}")
    
    # Plot training loss
    plt.plot(losses)
    plt.title('Training Loss Over Time')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()



if __name__ == "__main__":
    main()

Initializing model...
Starting training...


AttributeError: 'MLP' object has no attribute 'zero_grad'

In [16]:
import pandas as pd
df = pd.read_csv('mnist_test.csv', header = 0)

y_train = df.iloc[:, 0].tolist()  # First column
X_train = df.iloc[:, 1:].values.tolist()  # Remaining columns

# xs -> 784-ary arrays 
# ys -> labels  


In [5]:
n = MLP(3, [784, 15, 10], ['relu','relu','linear'])

In [None]:
from typing import List
import numpy as np

 

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def my_CE(y_true, y_pred):
    log_y_pred = [Value]
    for y in y_pred:
        log_y_pred.append(y.ln())
    element_wise = - (log_y_pred * y_true)

    sum = 0.0
    for i in element_wise:
        sum += i

    return sum / len(element_wise)

def one_hot_encode(labels, num_classes=10):
    encoded = []
    for label in labels:
        # Create a zero vector of length num_classes
        vector = [0] * num_classes
        # Set the corresponding index to 1
        vector[label] = 1
        encoded.append(vector)
    return encoded


In [14]:
for k in range(20):

    # Forward pass
    ypred = [n(x) for x in xs]

    loss = my_CE(one_hot_encode(ys), ypred)
    loss._back

    # Backward pass
    for p in n.parameters():
        p.grad = 0
    loss.backward()

    # Update
    for p in n.parameters():
        p.data += -0.1 * p.grad

    print (k, loss.data)

AttributeError: 'list' object has no attribute 'ln'

In [9]:

def stable_softmax(logits):
    """
    Numerically stable softmax implementation
    """
    # Shift values by max for numerical stability
    max_val = max(x.data for x in logits)
    expvals = [Value(0.0) for _ in logits]
    sum_exp = Value(0.0)

    # Compute exp(x - max(x)) for stability
    for i, x in enumerate(logits):
        exp_vals[i] = (x + (-max_val)).exp()
        sum_exp = sum_exp + exp_vals[i]

    # Normalize
    probs = [exp_i * (1.0/sum_exp) for exp_i in exp_vals]
    return probs

def cross_entropy_loss(probs, target_idx):
    """
    Compute cross entropy loss with numerical stability
    """
    eps = 1e-7  # Small constant to prevent log(0)
    prob = probs[target_idx]
    # Clip probability to prevent numerical issues
    if prob.data < eps:
        prob.data = eps
    return -prob.ln()

# Training loop
learning_rate = 0.1  # Reduced learning rate
for k in range(50):
    # Forward pass
    ypred = [n(x) for x in xs]
    y_pd = stable_softmax(ypred)

    # Compute loss with stabilized cross entropy
    batch_loss = sum(cross_entropy_loss(y_pd[i], ys[i]) for i in range(len(xs))) / len(xs)

    # Backward pass
    for p in n.parameters():
        p.grad = 0.0
    batch_loss.backward()

    # Update with proper gradient descent
    for p in n.parameters():
        p.data -= learning_rate * p.grad  # Note the -= instead of +=

    # Optional: Learning rate decay
    if k > 30:
        learning_rate *= 0.95

    print(f"Epoch {k}, Loss: {batch_loss.data:.4f}")

KeyboardInterrupt: 

In [29]:
# After computing gradients
if k % 10 == 0:  # Print every 10 epochs
    print("\nGradient magnitudes:")
    for p in n.parameters():
        print(f"Parameter grad: {p.grad:.6f}")

In [None]:
# visualizing 

import matplotlib.pyplot as plt

def visualize_digit(pixel_list):

    # Ensure we have 784 elements
    if len(pixel_list) != 784:
        raise ValueError(f"Expected 784 elements, but got {len(pixel_list)}")
    
    # Convert the 1D list of 784 elements into a 2D list of 28x28
    image = []
    for i in range(0, 784, 28):
        row = pixel_list[i:i + 28]
        image.append(row)
    
    # Create figure and display
    plt.figure(figsize=(6, 6))
    plt.imshow(image, cmap='gray')
    plt.axis('off')
    plt.tight_layout()
    plt.show()


In [51]:
# Using Categorical cross-entropy loss [with softmax function]



for k in range(50):

    # Forward pass

    ypred = [n(x) for x in xs]

    # y_pd -> probabilities predicted using softmax
    y_pd = softmax(ypred) #softmax probability distribution 

    loss = - sum((y_pd[i][ys[i]]).ln() for i in range(len(xs))) / len(xs)
    
    # Backward pass
    for p in n.parameters():
        p.grad = 0
    loss.backward()

    # Update
    for p in n.parameters():
        p.data += -0.5 * p.grad

    print (k, loss.data)

0 6.977437148509617
1 5.860054490391806
2 5.724070890497646
3 5.70427963142299
4 5.685907667817884
5 5.668864201863611
6 5.653063650078032
7 5.638425526728609
8 5.624874240640926
9 5.612338830201947
10 5.600752660265695
11 5.590053102207423
12 5.580181215065307
13 5.571081441968523
14 5.5627013321993495
15 5.554991295526268
16 5.547904392063652
17 5.541396158001981
18 5.535424465196715
19 5.5299494108481
20 5.524933232346475
21 5.52034024175975
22 5.516136774333545
23 5.512291145669659
24 5.508773612844519
25 5.505556335523056
26 5.502613334018773
27 5.49992044216394
28 5.497455253718023
29 5.495197061809329
30 5.493126791543945
31 5.4912269264137255
32 5.489481429490523
33 5.487875660617059
34 5.486396290912353
35 5.4850312159213805
36 5.483769468676757
37 5.482601133825213
38 5.48151726382256
39 5.480509798033805
40 5.479571485403104
41 5.478695811191156
42 5.4778769281224235
43 5.477109592146053
44 5.476389102894197
45 5.475711248821358
46 5.475072256927379
47 5.4744687469035345
48 

In [55]:
def one_hot_encode(labels, num_classes=10):
    encoded = []
    for label in labels:
        # Create a zero vector of length num_classes
        vector = [0] * num_classes
        # Set the corresponding index to 1
        vector[label] = 1
        encoded.append(vector)
    return encoded

In [93]:
stable_softmax(ypred)

[[Value(data = 4.37629807291863),
  Value(data = 0.14847948055566637),
  Value(data = 0.5558538437162941),
  Value(data = 0.34068164284984703),
  Value(data = 1.6349652042369576),
  Value(data = 2.684598794863206),
  Value(data = 1.6197815242345228),
  Value(data = 2.5755697057384794),
  Value(data = 1.6450463081748452),
  Value(data = 3.570710245742863)],
 [Value(data = 4.37629807291863),
  Value(data = 0.14847948055566637),
  Value(data = 0.5558538437162941),
  Value(data = 0.34068164284984703),
  Value(data = 1.6349652042369576),
  Value(data = 2.684598794863206),
  Value(data = 1.6197815242345228),
  Value(data = 2.5755697057384794),
  Value(data = 1.6450463081748452),
  Value(data = 3.570710245742863)],
 [Value(data = 4.37629807291863),
  Value(data = 0.14847948055566637),
  Value(data = 0.5558538437162941),
  Value(data = 0.34068164284984703),
  Value(data = 1.6349652042369576),
  Value(data = 2.684598794863206),
  Value(data = 1.6197815242345228),
  Value(data = 2.57556970573847