## References


### Forward mode & dual numbers

- https://en.wikipedia.org/wiki/Dual_number
- https://blog.demofox.org/2014/12/30/dual-numbers-automatic-differentiation/
- https://blog.demofox.org/2017/02/20/multivariable-dual-numbers-automatic-differentiation/
- https://marksaroufim.medium.com/automatic-differentiation-step-by-step-24240f97a6e6

### Reverse (backward) mode & backpropagation

- https://www.jmlr.org/papers/volume18/17-468/17-468.pdf
- https://sidsite.com/posts/autodiff/
- https://github.com/karpathy/micrograd

In [128]:
import math

from collections import deque

In [132]:
# TODO: extend the scalar `Value` autodiff class below to work on numpy arrays

In [129]:
class Value:
    """Scalar node of a computation graph supporting reverse-mode differentiation.

    Every arithmetic operation returns a new ``Value`` that remembers its
    operands (``children``) and the partial derivative of the result with
    respect to each operand (``local_grad``, same order as ``children``).
    Calling :meth:`backward` on a result walks the graph and fills in ``grad``
    on every node that contributed to it.
    """

    def __init__(self, value):
        """Wrap a plain number as a leaf node (no children, no gradient yet)."""
        self.value = value

        # d(root)/d(self); stays None until backward() reaches this node.
        self.grad = None
        # Partial derivatives of this node w.r.t. each child, aligned with `children`.
        self.local_grad = None

        # Operands this node was computed from; empty for leaves.
        self.children = []

    @property
    def safe_grad(self):
        """Return ``grad``, treating a still-unset (``None``) gradient as 0.0."""
        return 0.0 if self.grad is None else self.grad

    @staticmethod
    def topsort(root):
        """Return the nodes reachable from ``root`` in topological order.

        The result is a ``deque`` with ``root`` first and every node placed
        before all of its children. The traversal is a depth-first search with
        an explicit stack, so arbitrarily deep graphs do not hit Python's
        recursion limit.
        """
        sort, visited = deque(), set()

        # Each stack frame keeps a live iterator over the node's children so
        # the scan resumes where it left off after a child has been handled.
        stack = [(root, iter(root.children))]
        while stack:
            node, pending = stack[-1]
            for child in pending:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child.children)))
                    break
            else:
                # All children handled: emit the node ahead of its descendants.
                stack.pop()
                sort.appendleft(node)

        return sort

    @staticmethod
    def check_input(value):
        """Return ``value`` unchanged if it is a ``Value``, else wrap it in one."""
        return value if isinstance(value, Value) else Value(value)

    def backward(self):
        """Propagate gradients from this node to every node it depends on.

        This node's ``grad`` is set to 1.0; each child's ``grad`` then receives
        the sum of ``local_grad * parent.grad`` over all parents. Gradients are
        added onto whatever ``grad`` a child already holds, so calling
        ``backward`` again on the same graph accumulates.
        """
        topsort = self.topsort(self)
        topsort[0].grad = 1.0

        # Topological order guarantees a node's grad is complete before it is
        # pushed down to its children.
        for root in topsort:
            for i, child in enumerate(root.children):
                child.grad = child.safe_grad + (root.local_grad[i] * root.grad)

    def relu(self):
        """Return ``max(0, self)`` as a new node."""
        node = Value(max(0, self.value))

        node.children = [self]
        # Derivative is 0 for negative inputs and 1 otherwise (including exactly 0).
        node.local_grad = [0 if self.value < 0 else 1]

        return node

    def __add__(self, other):
        """Return ``self + other``; ``other`` may be a ``Value`` or a plain number."""
        other = self.check_input(other)

        node = Value(self.value + other.value)

        node.children = [self, other]
        node.local_grad = [1, 1]

        return node

    def __radd__(self, other):
        """Return ``other + self`` (addition is commutative)."""
        return self + other

    def __mul__(self, other):
        """Return ``self * other``; ``other`` may be a ``Value`` or a plain number."""
        other = self.check_input(other)

        node = Value(self.value * other.value)

        node.children = [self, other]
        # d(a*b)/da = b and d(a*b)/db = a.
        node.local_grad = [other.value, self.value]

        return node

    def __rmul__(self, other):
        """Return ``other * self`` (multiplication is commutative)."""
        return self * other

    def __pow__(self, other):
        """Return ``self ** other`` for a constant int/float exponent.

        The exponent is not part of the graph; only the base receives a gradient.
        """
        assert isinstance(other, (float, int))

        node = Value(self.value ** other)

        node.children = [self]
        # Power rule: d(x**n)/dx = n * x**(n - 1).
        node.local_grad = [other * self.value ** (other - 1)]

        return node

    def __neg__(self):
        """Return ``-self``, built as ``self * -1``."""
        return self * -1

    def __sub__(self, other):
        """Return ``self - other``, built as ``self + (-other)``."""
        return self + (-other)

    def __rsub__(self, other):
        """Return ``other - self``.

        Python calls this for ``number - Value``, so it is ``self`` that must
        be negated, not ``other``.
        """
        return (-self) + other

    def __truediv__(self, other):
        """Return ``self / other``, built as ``self * other**(-1)``."""
        return self * other**(-1)

    def __rtruediv__(self, other):
        """Return ``other / self``, built as ``other * self**(-1)``."""
        return other * self**(-1)

    def __repr__(self):
        return f"Value({self.value})"

In [130]:
def test_grad():
    """Check forward value and input gradients of a mixed arithmetic/relu expression."""
    a = Value(-4.0)
    b = Value(2.0)

    # First branch: repeated additions that reuse the running result.
    acc = a + b
    acc = acc + (acc + 1)
    acc = acc + (1 + acc + (-a))

    # Second branch: products, a power and two relu terms.
    mix = a * b + b**3
    mix = mix + (mix * 2 + (b + a).relu())
    mix = mix + (3 * mix + (b - a).relu())

    # Combine both branches; includes a division by a graph node.
    diff = acc - mix
    squared = diff**2
    out = squared / 2.0
    out = out + 10.0 / squared

    assert round(out.value, 4) == 24.7041

    out.backward()

    for got, want in ((a.grad, 138.8338), (b.grad, 645.5773)):
        assert round(got, 4) == want

In [131]:
# Run the autodiff regression check; an AssertionError here means a value or gradient is off.
test_grad()

# Torch test

In [162]:
import torch
import torch.nn as nn

from sklearn.datasets import make_blobs

In [286]:
class SimpleNet(nn.Module):
    """Small fully connected two-class classifier that outputs raw class scores.

    NOTE(review): the three ``nn.Linear`` layers are stacked without any
    activation in between, so the model as a whole is still a linear map of
    its input. Left unchanged here so parameter names stay the same.
    """

    def __init__(self, n_inputs):
        """Build the layer stack for inputs with ``n_inputs`` features."""
        super().__init__()
        self.n_inputs = n_inputs

        self.net = nn.Sequential(
            nn.Linear(self.n_inputs, 25),
            nn.Linear(25, 25),
            nn.Linear(25, 2)
        )

    def forward(self, x):
        """Return the unnormalized scores (logits) of the two classes for ``x``."""
        return self.net(x)

    def predict(self, X):
        """Return the predicted class (0. or 1.) for each row of ``X`` as a float tensor.

        The network is trained with ``CrossEntropyLoss`` and therefore emits
        logits, not probabilities. The prediction is the class with the larger
        logit rather than a fixed threshold on the class-1 score.
        """
        with torch.no_grad():
            return self(X).argmax(dim=1).float()

    def fit(self, X, y, max_iter=100):
        """Train with full-batch SGD (lr=0.01) on cross-entropy for ``max_iter`` steps.

        ``X`` holds the input features and ``y`` the integer class labels.
        The loss is printed after every step.
        """
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(self.parameters(), lr=0.01)

        for i in range(max_iter):
            optimizer.zero_grad()
            y_pred = self(X)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            print(f"Iteration {i}: {loss}")

    def train(self, X=True, y=None, max_iter=100, *, mode=None):
        """Train the model, or switch training/eval mode like ``nn.Module.train``.

        ``nn.Module.eval()`` calls ``self.train(False)``, so this name cannot
        be used for the training loop alone. Calls with labels
        (``train(X, y, max_iter)``) run :meth:`fit`; calls without labels
        (``train()``, ``train(False)``, ``train(mode=...)``) are forwarded to
        ``nn.Module.train`` and return ``self``.
        """
        if mode is not None:
            return super().train(mode)
        if y is None:
            # No labels given: `X` carries the boolean mode flag.
            return super().train(X)
        return self.fit(X, y, max_iter=max_iter)


In [291]:
# Two Gaussian blobs in 5 dimensions. Arguments are passed by keyword because
# `centers` is keyword-only in current scikit-learn, and the seed is fixed so
# the cell gives the same data on every run.
X, y = make_blobs(n_samples=10000, n_features=5, centers=2, random_state=0)

# float32 features for nn.Linear and int64 class labels for nn.CrossEntropyLoss.
X, y = torch.as_tensor(X, dtype=torch.float32), torch.as_tensor(y, dtype=torch.long)

In [292]:
# Exploratory leftover: lists the attribute names available on the tensor X.
dir(X)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__complex__',
 '__contains__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__idiv__',
 '__ifloordiv__',
 '__ilshift__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pow__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rfloordiv__',
 '__rmul__',
 '__rpow__',
 '__rshift__',
 '__rsub__',
 '__r

In [288]:
# Build a classifier for the 5-feature blobs and train it with the default 100 iterations.
net = SimpleNet(5)
net.train(X, y)

Iteration 0: 0.6382302045822144
Iteration 1: 0.12754975259304047
Iteration 2: 0.08094043284654617
Iteration 3: 0.05983508378267288
Iteration 4: 0.04759441316127777
Iteration 5: 0.0395541787147522
Iteration 6: 0.03385269269347191
Iteration 7: 0.029592623934149742
Iteration 8: 0.02628576010465622
Iteration 9: 0.023643014952540398
Iteration 10: 0.021481644362211227
Iteration 11: 0.019680745899677277
Iteration 12: 0.01815696991980076
Iteration 13: 0.016850721091032028
Iteration 14: 0.015718506649136543
Iteration 15: 0.014727565459907055
Iteration 16: 0.013853192329406738
Iteration 17: 0.013075744733214378
Iteration 18: 0.012380117550492287
Iteration 19: 0.011753936298191547
Iteration 20: 0.011187351308763027
Iteration 21: 0.010672244243323803
Iteration 22: 0.010201912373304367
Iteration 23: 0.009770774282515049
Iteration 24: 0.009374110028147697
Iteration 25: 0.009007971733808517
Iteration 26: 0.008668997325003147
Iteration 27: 0.008354232646524906
Iteration 28: 0.008061190135776997
Iterat

In [289]:
# Predicted labels (float tensor) for the same inputs the network was trained on.
net.predict(X)

tensor([1., 0., 1.,  ..., 0., 0., 0.])

In [290]:
# Ground-truth labels, displayed for comparison with the predictions.
y

tensor([1, 0, 1,  ..., 0, 0, 0])