In [45]:
import numpy as np

debug = 1
def mean(mylist): return sum(mylist) / len(mylist) if len(mylist) > 0 else 0
def prod(a): return np.array(a).prod() 

# Tensor operations return lazybuffers, if you realize the lazybuffers, they run forward()

### SHAPETRACKER ###

class ShapeTracker:
    def __init__(self, shape) -> None:
        self.shape = shape 
        self.permutes = None
        self.stride = [1]
        self.pad = [0]


    def __eq__(self, other):  # Incomplete
        return mean([a == b for a, b in zip(self.shape, other.shape)]) == 1 

    def __str__(self) -> str:
        return "t" + str(self.shape)

    def __getitem__(self, index): 
        if isinstance(index, int): return self.shape[index]
        if isinstance(index, slice): return self.shape[index]
        # if isinstance(index, tuple): return ShapeTracker([self.shape[i] for i in index])
        # Also handle tensor

    def pop(self, axis):
        if axis is not None: self.shape = tuple([i for n, i in enumerate(self.shape) if n != axis%len(self.shape)]) 
        else: self.shape = (1,) 

    def copy(self): return ShapeTracker(self.shape)
    def flat(self): return prod(self.shape)

### LAZYBUFFER ###

class LazyBuffer:
    def __init__(self) -> None:
        self.buffer = []
        self.grad = 0
        # print(type(self).__name__)
    
    def numpy(self):
        return self.forward()
    
    def debug(self):
        print(self.buffer)
    
    def __add__(self, other): return Add(self, other)
    def __radd__(self, other): return Add(other, self)
    def __mul__(self, other): return Mul(self, other)
    def __truediv__(self, other): return Div(self, other)
    def __rtruediv__(self, other): return Div(other, self) # Really untested
    def __sub__(self, other): return Add(self, -other)
    def __matmul__(self, other): return MatMul(self, other)
    def __neg__(self): return Mul(self, -1)
    def __pow__(self, exp): return Pow(self, exp)
    def __exp__(self): return Exp(self)

    def sum(self, axis=None): return Sum(self, axis)
    def mean(self, axis=None): return Sum(self, axis) * (1 / self.shapeTrack.shape[axis])  # Untested
    def onehot(self, dict_size=None): return OneHot(self, dict_size) # NOT LAZY
    def softmax(self): return softmax(self)  # Untested
    def max(self, axis=None): return Max(self, axis)  # Untested

    def __str__(self): return self.numpy().__str__()

    def shapeTrack(self, shape):
        self.shapeTrack = ShapeTracker(shape)

### TENSOR ###

class Tensor(LazyBuffer):  # Tensor is just lazybuffer that contains data 
    def __init__(self, value, shape=None) -> None:
        self.data = np.array(value)
        self.shape = ShapeTracker(self.data.shape) if shape is None else shape # shape is a shapeTracker
        self.requires_grad = False
        self.grad = 0

    def forward(self):
        return self.data
    def backward(self, grad):  # Leaf tensors don't need to do anything
        self.grad += grad

    def numpy(self):
        return self.data

    def __getitem__(self, index):   # THIS IS NOT LAZY!!
        if isinstance(index, slice):
            start, stop, step = index.indices(len(self.data))
            return Tensor(self.data[start:stop:step])
        else:
            return Tensor(self.data[index])


### OPERATION TYPES ###

class Unary(LazyBuffer):
    def __init__(self, a) -> None:
        super().__init__()
        self.a = Tensor(a) if not isinstance(a, LazyBuffer) else a
        self.shape = self.a.shape.copy()

class Binary(LazyBuffer):
    def __init__(self, a, b) -> None:
        super().__init__()
        self.a = Tensor(a) if not isinstance(a, LazyBuffer) else a
        self.b = Tensor(b) if not isinstance(b, LazyBuffer) else b
        self.shape = self.a.shape.copy()  
        if self.a.shape.flat() != self.b.shape.flat() and self.a.shape.shape == (self.b.shape.shape + (self.a.shape.shape[-1],)): self.b = Adapt(self.b, self.a)  # Untested
        # assert self.a.shape == self.b.shape, f"Shapes {self.a.shape} and {self.b.shape} are not compatible" 
        # This ^ doesnt take b scalars into account

class Reduce(LazyBuffer):
    def __init__(self, a, axis=None) -> None:
        super().__init__()
        self.grad = 0
        self.a = Tensor(a) if not isinstance(a, LazyBuffer) else a  # Deal with shapetrack
        self.shape = self.a.shape.copy()
        self.axis = axis
        self.shape.pop(axis)
        # self.shape = 1 if axis is None else (elm for i, elm in enumerate(self.a.shape) if i != axis)

class Broadcast(LazyBuffer):
    def __init__(self, a, b) -> None:
        super().__init__()
        self.a = Tensor(a) if not isinstance(a, LazyBuffer) else a
        self.b = Tensor(b) if not isinstance(b, LazyBuffer) else b
        self.shape = ShapeTracker(self.a.shape.shape + (self.b.forward(),))

### OPERATIONS ###

def softmax(tensor): # safe softmax
    # exp = Exp(tensor - tensor.max())
    return Exp(tensor - tensor.max()) / (Exp(tensor - tensor.max()).sum(axis=-1) )

def OneHot(tensor, dict_size):
    if dict_size is None: dict_size = int(tensor.data.max()) + 1
    result = np.zeros((tensor.shape[0], dict_size))
    result[np.arange(tensor.shape[0]), np.int32(tensor.data)] = 1
    return Tensor(result)    

class Exp(Unary):
    def forward(self):  # Exp
        self.data = np.exp(self.a.forward())
        return self.data

    def backward(self, grad=1):
        self.grad += grad
        self.a.backward(grad * self.data)


class Div(Binary):
    def forward(self):  # a / b
        self.data = self.a.forward() / self.b.forward()
        return self.data

    def backward(self, grad=1):
        self.grad += grad
        self.a.backward(grad / self.b.data)
        self.b.backward(grad * -self.a.data / (self.b.data ** 2)) 

class Pow(Binary):
    def forward(self):  # a ** b
        self.data = self.a.forward() ** self.b.forward() 
        return self.data

    def backward(self, grad=1):
        self.grad += grad
        self.a.backward(grad * self.b.data * self.a.data ** (self.b.data - 1))
        self.b.backward(grad * self.data * np.log(self.a.data))

class Add(Binary):
    def forward(self):  # a + b
        self.data = self.a.forward() + self.b.forward()
        return self.data

    def backward(self, grad=1):
        self.grad += grad
        self.a.backward(grad)
        self.b.backward(grad)

class Mul(Binary):
    def forward(self):
        self.data = self.a.forward() * self.b.forward()
        return self.data

    def backward(self, grad=1):
        self.grad += grad
        self.a.backward(grad * self.b.data)
        self.b.backward(grad * self.a.data)

class MatMul(Binary):
    def __init__(self, a, b) -> None:
        super().__init__(a, b)
        self.shape = ShapeTracker(self.a.shape.shape[:-1] + self.b.shape.shape[1:])

    def forward(self):
        self.data = self.a.forward() @ self.b.forward()
        return self.data

    def backward(self, grad=1):
        self.grad += grad
        self.a.backward(grad @ self.b.data.T)
        self.b.backward(self.a.data.T @ grad)


class Sum(Reduce): 
    def forward(self):
        self.data = np.sum(self.a.forward(), axis=self.axis)
        return self.data

    def backward(self, grad=Tensor((1,))):
        self.grad += grad
        # self.a.backward(np.repeat(grad, (self.a.shape.flat()//self.grad.shape.flat()), axis=self.axis).reshape(*self.a.shape)) # Untested
        self.a.backward(Adapt(self.grad, self.a).numpy())

class Mean(Reduce):  # Untested
    def forward(self):
        self.data = np.mean(self.a.forward(), axis=self.axis)
        return self.data

    def backward(self, grad=1):
        self.grad += grad
        self.a.backward(grad * np.ones(self.a.shape) / self.a.shapeTrack.shape[self.axis])

class Max(Reduce): 
    def forward(self):
        self.data = np.max(self.a.forward(), axis=self.axis)
        return self.data
    
    def backward(self, grad=1):  # WTF is the gradient of max? Definetly untested
        self.grad += grad
        self.a.backward(grad * (self.a.data == self.data))

# class Repeat(Broadcast):
#     def forward(self):  # todo - this is far more complex than expected 
#         self.data = np.repeat(self.a.forward(), self.b.forward(), axis=-1).reshape(*self.a.forward().shape, self.b.forward())
#         return self.data

#     def backward(self, grad=1):
#         self.grad += grad
#         self.a.backward(np.sum(grad, axis=-1))
#         self.b.backward(np.sum(grad * self.a.data, axis=-1))

class Adapt(Broadcast):
    def forward(self):
        # given a tensor and b shape, adapt a to b
        self.original_shape = self.a.shape
        times_smaller = self.b.shape.flat() // self.a.shape.flat()
        self.data = np.repeat(self.a.forward(), times_smaller).reshape(*self.b.shape)
        return self.data
    
    def backward(self, grad=1):
        self.grad += grad
        self.a.backward(np.sum(grad, axis=-1).reshape(self.original_shape))
        

class Rand(Tensor):
    def __init__(self, shape) -> None:
        self.data = np.random.rand(*shape)
        super().__init__(self.data)

class Randn(Tensor):
    def __init__(self, shape) -> None:
        self.data = np.random.randn(*shape)
        super().__init__(self.data)

class ReadCSV(Tensor):
    def __init__(self, path) -> None:
        # assume first line is the headerRepe
        self.data = np.genfromtxt(path, delimiter=',', skip_header=1)
        super().__init__(self.data)

### MODULES ###

class Module():
    pass

class Linear(Module):
    def __init__(self, in_features, out_features) -> None:
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Randn((in_features, out_features)) * (1 / np.sqrt(in_features))
        self.bias = Randn((1, out_features)) * (1 / np.sqrt(in_features)) # double check the 1 in (1, out_features)

    def __call__(self, x):
        self.x = x
        return x @ self.weight + self.bias

    def backward(self, grad):
        self.weight.grad += self.x.T @ grad
        self.bias.grad += grad.sum(axis=0)
        return grad @ self.weight.T




In [46]:
r1 = Rand((4, 3))
r2 = Rand((4, 3))
diff = (r1 - r2)
diffs = diff ** 2
r = (diffs).sum() 
r.numpy()

r.backward()
r1.grad

  self.b.backward(grad * self.data * np.log(self.a.data))


array([[ 0.19509937, -0.19372956,  1.02515072],
       [ 1.61044581,  0.56728383, -0.9234133 ],
       [-1.56467467, -0.39880177,  1.02836979],
       [-0.3214824 , -0.19538698,  1.1027053 ]])

## issues and notes

Repeat is not needed, numpy automatically broadcasts things, but fine for now <br>
--> actually repeat is needed because of backwards, and it's far more complex than expected <br>
Not sure about the laziness of Linear <br>

In [48]:
# would be cool to run on MNIST
import pickle

# dataset = ReadCSV('mnist_train.csv')
# with open('mnist.pickle', 'wb') as f:
#     pickle.dump(dataset, f)

In [51]:
with open('../mnist.pickle', 'rb') as f:
    dataset = pickle.load(f)
    

In [56]:
ad = Adapt(Rand((4,)), Rand((4, 3, 4)))
ad.numpy()
ad.data

array([[[0.06248974, 0.06248974, 0.06248974, 0.06248974],
        [0.06248974, 0.06248974, 0.06248974, 0.06248974],
        [0.06248974, 0.06248974, 0.06248974, 0.06248974]],

       [[0.69896573, 0.69896573, 0.69896573, 0.69896573],
        [0.69896573, 0.69896573, 0.69896573, 0.69896573],
        [0.69896573, 0.69896573, 0.69896573, 0.69896573]],

       [[0.01120667, 0.01120667, 0.01120667, 0.01120667],
        [0.01120667, 0.01120667, 0.01120667, 0.01120667],
        [0.01120667, 0.01120667, 0.01120667, 0.01120667]],

       [[0.19189536, 0.19189536, 0.19189536, 0.19189536],
        [0.19189536, 0.19189536, 0.19189536, 0.19189536],
        [0.19189536, 0.19189536, 0.19189536, 0.19189536]]])

In [72]:
images = dataset[:7, 1:]
labels = dataset[:4, 0].onehot()

fc1 = Linear(784, 10)

n1 = fc1(images)

pred = n1.softmax()
print((pred - labels).numpy())
mse = ((pred - labels) ** 2).sum() 
print(mse.numpy())

mse.backward()



[[ 5.79660926e-025  2.44611700e-013  1.48041146e-072  1.26760433e-053
   1.19747559e-068 -1.00000000e+000  1.00000000e+000  3.90823555e-082
   1.32449637e-045  9.24014785e-038]
 [-1.00000000e+000  6.66782411e-023  8.30246357e-082  4.92833148e-047
   9.10025195e-075  9.99695468e-001  3.04532099e-004  1.06862005e-033
   3.41019883e-098  1.88657005e-065]
 [ 1.04087090e-029  4.09201877e-082  8.57062900e-081  4.75369811e-128
   0.00000000e+000  2.24303290e-054  1.53814807e-058  1.73346678e-074
   1.08105222e-029  1.84968335e-026]
 [ 3.00596856e-006 -1.00000000e+000  2.43248158e-043  1.02403419e-032
   1.75373437e-034  4.39851457e-054  4.26351904e-003  1.90323934e-047
   2.21230295e-061  9.95733475e-001]
 [ 1.21363702e-054  1.17498122e-054  9.99999997e-001  1.98077170e-056
   1.29452212e-077  8.05572218e-058  1.75418277e-026  2.87370913e-009
   1.13652908e-045 -1.00000000e+000]
 [ 9.51209441e-014  1.92018057e-067 -1.00000000e+000  3.25302851e-067
   5.24218417e-068  1.00000000e+000  4.515868

  self.b.backward(grad * self.data * np.log(self.a.data))
  self.b.backward(grad * self.data * np.log(self.a.data))
  self.b.backward(grad * self.data * np.log(self.a.data))


In [None]:
import torch
torchdata = torch.tensor(dataset.data)
timages = torchdata[:, 1:].float()
tlabels = torchdata[:, 0].long()

w1 = torch.randn(784, 10, requires_grad=True).float()
b1 = torch.randn(1, 10, requires_grad=True).float()

torchmse = torch.nn.functional.mse_loss(torch.nn.functional.softmax(timages @ w1 + b1, dim=-1), torch.nn.functional.one_hot(tlabels, 10))
print(mse)

109813.15396756928


In [None]:
# Softmax testing
n = Rand((1, 10))
print(Exp(n))
print(Exp(n).sum(1))
print(n.softmax())

[[2.32357242 1.29060336 1.17901193 1.04738572 2.06071516 2.01357728
  1.07164179 2.26901126 2.6726729  1.94057902]]
[17.86877084]
[[0.13003538 0.07222676 0.0659817  0.05861543 0.11532495 0.11268695
  0.05997289 0.12698194 0.14957229 0.10860171]]


In [None]:
output.numpy()

array([[ 117.61452441, -111.65694779,  107.36385164, ...,   53.58838391,
         106.69311709,  -83.32522846],
       [  89.56216287,  -13.8187122 ,  107.26697021, ...,   73.73735931,
         117.12209774,  -11.48190668],
       [  44.85184525,   25.15065779,  -37.71456661, ...,   40.05536546,
         -36.57610678, -104.04146739],
       ...,
       [ 141.54043248,    9.65783841,   -1.56395171, ...,  163.73343039,
         -74.13833273,  -36.1691029 ],
       [  25.13424445,   18.97431544,  107.04201011, ...,   23.80922902,
          28.84876724, -122.98191804],
       [  76.78514152,  -18.86484713,   68.92965517, ...,  -11.26740982,
          64.50984431,  -91.08108858]])

In [None]:
Linear(3, 2).forward(Rand((2, 3))).numpy()

array([[1.4965278 , 1.49941769],
       [1.75843499, 1.47767739]])

In [None]:
conv = Conv2D(3, 2, 3)
conv.forward(Rand((2, 3, 4, 4)))

In [None]:
a = Tensor(np.array([[1, 2], [3, 4]]))
b = Tensor(np.array([[5, 6], [7, 8]]))

c = a + b
d = c * b
e = d + 3
f = e.sum()

print(c)
print(d)
print(e)
print(f)

print(f.numpy())
f.backward()

print(a.grad)
print(b.grad)
print(c.grad)
print(d.grad)
print(e.grad)
print(f.grad)


<__main__.Add object at 0x0000021235AE07F0>
<__main__.Mul object at 0x0000021235AE00D0>
<__main__.Add object at 0x0000021235AE0280>
<__main__.Sum object at 0x0000021235AE0160>
256
[[5. 6.]
 [7. 8.]]
[[11. 14.]
 [17. 20.]]
[[5. 6.]
 [7. 8.]]
[[1. 1.]
 [1. 1.]]
[[1. 1.]
 [1. 1.]]
1


In [None]:
from torch import *

In [None]:
a = Tensor(np.array([[1, 2], [3, 4]])).requires_grad_()
b = Tensor(np.array([[5, 6], [7, 8]])).requires_grad_()

c = a + b
d = c * b
e = d + 3
f = e.sum()

print(c)
print(d)
print(e)
print(f)

f.backward()

print("-------------------")
print(a.grad)
print(b.grad)
# print(c.grad)
# print(d.grad)
# print(e.grad)
# print(f.grad)


tensor([[ 6.,  8.],
        [10., 12.]], grad_fn=<AddBackward0>)
tensor([[30., 48.],
        [70., 96.]], grad_fn=<MulBackward0>)
tensor([[33., 51.],
        [73., 99.]], grad_fn=<AddBackward0>)
tensor(256., grad_fn=<SumBackward0>)
-------------------
tensor([[5., 6.],
        [7., 8.]])
tensor([[11., 14.],
        [17., 20.]])
