# MNIST

In [141]:

import numpy as np

from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.sgd import SGD
from torchvision.datasets import MNIST
from tqdm import tqdm


dataset = MNIST(root="/home/paradox/Desktop/ai/pygrad",download=True)

In [142]:
DEVICE = "cuda:0"
EPOCHS = 2
BATCH_SIZE=512
HEIGHT=28
WIDTH=28
N_CLASSES = 10

# PyGrad

In [178]:

class Tensor():
    def __init__(self, value, label=None, _childrens=set()):


        if isinstance(value, np.ndarray):
            self.value = value
        else:
            self.value = np.array(value)
        self._childrens = _childrens
        self.label = label
        self._backwards = lambda: None
        
        if np.isscalar(value):
            self.grad = np.zeros(1)
        else:
            self.grad = np.zeros(value.shape)

        self.shape = self.value.shape

    def __repr__(self):
        if self.label == None:
            return f"Tensor(value={self.value}, grad={self.grad})"
        else:
            return f"Tensor(value={self.value}, label={self.label}, grad={self.grad})"
            
    def __add__(self, other):
       
        t = Tensor(self.value + other.value, _childrens=set([self, other]))

        def backwards():

            if self.grad.shape == t.grad.shape:
                self.grad += t.grad
                
            # Row Vector
            elif self.grad.shape[-1] == t.grad.shape[-1]:
                self.grad += t.grad.sum(0)
                
            # Column Vector
            elif len(self.grad.shape) == 2 and self.grad.shape[-1] == 1:
                self.grad += t.grad.sum(1)

            # Scalar
            else:
                self.grad += t.grad.sum()
                
            if other.grad.shape == t.grad.shape:
                other.grad += t.grad
                
            # Row Vector
            elif self.grad.shape[-1] == t.grad.shape[-1]:
                other.grad += t.grad.sum(0)
                
            # Column Vector
            elif len(self.grad.shape) == 2 and self.grad.shape[-1] == 1:
                other.grad += t.grad.sum(1)

            # Scalar
            else:
                other.grad += t.grad.sum()


        t._backwards = backwards
        
        return t
        
    def backward(self):

        assert ((self.value.shape == () or self.value.shape == (1,)) and "Grads can be created for scalar inputs only")

        if self.value.shape != ():
            self.value = self.value[0]
        
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._childrens:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backwards()


    def dot(self, other):

        assert (self.shape[-1] == other.shape[0])
        assert (len(self.shape) == 2 and "currently only supports dot product for 2D tensors")            
        
        t = Tensor(np.dot(self.value, other.value), _childrens=set([self, other]))

        def backwards():
            
            self.grad += (other.value @ t.grad.T).T        
            other.grad += (self.value.T @ t.grad)
    
        t._backwards = backwards
        return t


    # Scalar / Elementwise multiplication, we assume that both of the multiplicands have
    # same dimensions.
    def __mul__(self, other):

        t = Tensor(self.value * other.value, _childrens=set([self, other]))

        def backwards():
            print(t.grad)

            if self.grad.shape == t.grad.shape:
                self.grad += other.value * t.grad

            # Row Vector
            elif self.grad.shape[-1] == t.grad.shape[-1]:
                self.grad += (other.value * t.grad).sum(0)
                
            # Column Vector
            elif len(self.grad.shape) == 2 and self.grad.shape[1] == 1:
                self.grad += (other.value * t.grad).sum(1)

            # Scalar
            else:
                self.grad += (other.value * t.grad).sum()
                
            if other.grad.shape == t.grad.shape:
                other.grad += self.value * t.grad

            # Row Vector
            elif other.grad.shape[-1] == t.grad.shape[-1]:
                other.grad += (self.value * t.grad).sum(0)
                
            # Column Vector
            elif len(other.grad.shape) == 2 and other.grad.shape[1] == 1:
                other.grad += (self.value * t.grad).sum(1)

            # Scalar
            else:
                other.grad += (self.value * t.grad).sum()
                
            # other.grad += self.value * t.grad
            
        t._backwards = backwards
        return t

    def __pow__(self, n):

        assert (isinstance(n, Tensor) == False)

        t = Tensor(self.value ** n, _childrens=set([self]))

        def backwards():
            self.grad += n * (self.value ** (n - 1)) * t.grad

        t._backwards = backwards
        return t
        

    def relu(self):

        n = np.copy(self.value)
        n[n <= 0] = 0
        t = Tensor(n, _childrens=set([self]))

        def backwards():
            self.grad[0 <= self.value] += t.grad[0 <= self.value]
            
        t._backwards = backwards
        return t

    def exp(self):

        t = Tensor(np.exp(self.value), _childrens=set([self]))
        
        def backwards():
            self.grad += t.value * t.grad

        t._backwards = backwards
        return t

    def __truediv__(self,other):
        return self * (other ** -1)

    def sum(self):
        t = Tensor(self.value.sum(), _childrens=set([self]))

        def backwards():
            self.grad += t.grad
            
        t._backwards = backwards
        
        return t

    def max(self, dim):

        assert (dim == 0 or dim == 1) and "Nanograd only supports 2D Tensors"

        t = Tensor(self.value.max(dim), _childrens=set([self]))

        def backwards():

            # self.value is 1D Tensor
            if t.value.shape == ():
                self.grad[self.value == t.value] += t.grad
                return

            row,cols = self.value.shape
            if dim == 0:

                if len(t.grad.shape) == 1:
                    self.grad[self.value == self.value.max(0)] += t.grad
                else:
                    self.grad[self.value == self.value.max(0)] += t.grad[self.value == self.value.max(0)]
            else:
                

                if len(t.grad.shape) == 1:
                    self.grad[self.value == self.value.max(1).reshape(-1,1)] += t.grad
                else:
                    self.grad[self.value == self.value.max(1).reshape(-1,1)] += t.grad[self.value == self.value.max(1).reshape(-1,1)]


        t._backwards = backwards
        return t
        
        

In [179]:

xTr = np.random.randn(16, HEIGHT * WIDTH) / ((HEIGHT * WIDTH) ** 0.5)
w1 = np.random.randn(HEIGHT * WIDTH, HEIGHT * WIDTH) / ((HEIGHT * WIDTH) ** 0.5)
b1 = np.random.randn(HEIGHT * WIDTH) / ((HEIGHT * WIDTH) ** 0.5)
w2 = np.random.randn(HEIGHT * WIDTH, N_CLASSES) / ((HEIGHT * WIDTH) ** 0.5)
b2 = np.random.randn(N_CLASSES) / (N_CLASSES ** 0.5)

In [184]:
n_xTr = Tensor(xTr)

n_w1 = Tensor(w1)
n_b1 = Tensor(b1)

n_w2 = Tensor(w2)
n_b2 = Tensor(b2)

batch_size = Tensor(np.array(16.0))

# Forward Pass

n_h1 = (n_xTr.dot(n_w1) + n_b1).relu()
n_logits = n_h1.dot(n_w2) + n_b2

# n_probs = (n_logits.exp() / n_logits.exp().sum()).max(dim=1).sum() / batch_size

n_probs = (n_b2 / batch_size).sum()

n_probs.backward()



[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


ValueError: non-broadcastable output operand with shape () doesn't match the broadcast shape (1,)

In [161]:

p_xTr = torch.tensor(xTr)

p_w1 = torch.tensor(w1); p_w1.requires_grad = True
p_b1 = torch.tensor(b1); p_b1.requires_grad = True

p_w2 = torch.tensor(w2); p_w2.requires_grad = True
p_b2 = torch.tensor(b2); p_b2.requires_grad = True


# Forward pass
p_h1 = (p_xTr.matmul(p_w1) + p_b1).relu()
p_logits = p_h1.matmul(p_w2) + p_b

p_probs = (p_logits.exp() / p_logits.exp().sum() ).max(1).values.sum()

p_probs.backward()



In [163]:
parameters = [(n_w1, p_w1), (n_b1, p_b1), (n_w2, p_w2), (n_b2, p_b2)]


for n,p in parameters:
    print(np.all(np.abs(n.grad - p.grad.numpy()) < 1e-5))

True
True
True
True


In [140]:
n_probs, p_probs

(Tensor(value=0.10875924192675124, grad=1),
 tensor(0.1088, dtype=torch.float64, grad_fn=<SumBackward0>))

In [135]:
def pygrad():

    n_xTr = Tensor(xTr)
    n_w1 = Tensor(w1)
    n_b1 = Tensor(b1)
    n_w2 = Tensor(w2)
    
    h1_preact = n_xTr.dot(n_w1) + n_b1
    h1 = h1_preact.relu()
    o = h1.dot(n_w2).sum()

    o.backwards()
    return n_w1,n_w2

n_w1,n_w2 = pygrad()

In [136]:
p_xTr = torch.tensor(xTr)
p_w1 = torch.tensor(w1); p_w1.requires_grad = True
p_w2 = torch.tensor(w2); p_w2.requires_grad = True

o = p_xTr.matmul(p_w1).relu().matmul(p_w2).sum()

o.backward()


In [137]:

if np.all(np.abs(p_w1.grad.numpy() - n_w1.grad) < 1e-8):
    print(f"Grads of matrix n_w1 is same")
else:
    print("Grads of matrix n_w1 is not same")

if np.all(np.abs(p_w2.grad.numpy() - n_w2.grad) < 1e-8):
    print(f"Grads of matrix n_w2 is same")
else:
    print("Grads of matrix n_w2 is not same")




Grads of matrix n_w1 is same
Grads of matrix n_w2 is same
