In [22]:
import torch
%pip install graphviz
import math
import numpy as np
import matplotlib.pyplot as plt
from graphviz import Digraph
import random
%matplotlib inline

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
from graphviz import Digraph

def trace(root):
    # builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right
    
    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangular ('record') node for it
        dot.node(name = uid, label = "{ %s |  data %4f}" % (n.data,n.grad), shape='record')
        # if this value is a result of some operation, create an op node for it
        if n._op:
            dot.node(name = uid + n._op, label = n._op)
            dot.edge(uid + n._op, uid)
    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)
    
    return dot
    

In [24]:
x1 = torch.Tensor([1.0]).double()
x1.requires_grad = True

x2 = torch.Tensor([0.0]).double()
x2.requires_grad = True

w1 = torch.Tensor([2.0]).double()
w1.requires_grad = True

w2 = torch.Tensor([-1.0]).double()
w2.requires_grad = True

b = torch.Tensor([2.2]).double()
b.requires_grad = True

n = x1 * w1 + x2 * w2 + b
o = torch.tanh(n)

print(o.data.item())
o.backward()

print(x2.grad.item())
print(w2.grad.item())
print(w1.grad.item())
print(x1.grad.item())

0.9995503665024041
-0.0008990648249095345
0.0
0.0008990648249095345
0.001798129649819069


In [25]:

class Value:
    """ stores a single scalar value and its gradient """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0
        # internal variables used for autograd graph construction
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward

        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out

    def backward(self):

        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"

In [26]:
class Neuron():
    def __init__(self, nin):
        self.w = [Value(random.uniform(-1, 1)) for _ in range(nin)]
        self.b = Value(1.0)

    def __call__(self, x):
        act = sum((wi * xi for wi, xi in zip(self.w, x)), self.b)
        return act
    
    def parameters(self):
        return self.w + [self.b]


class Layer():
    def __init__(self, nin, nout):
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs
    
    def parameters(self):
        return [p for neuron in self.neurons for p in neuron.parameters()]


class MLP():
    def __init__(self, nin, nouts):
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i + 1]) for i in range(len(nouts))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]
                


In [136]:
x = [1,-2,4]
n = MLP(3, [4, 4, 1])
n(x)

Value(data=-0.5777597241023255, grad=0)

In [137]:
xs = [
[2.0, 2.0, -1.0],
[1.0, 1.0, 0.5],
[1.0, 1.0, -1.0],
[0.5,2.0,-1.5]
]
ys = [1.0, -1.0, -1.0, 1.0]


In [138]:
ypred = [n(x) for x in xs]
loss  =  sum([(yout-ygt)**2 for ygt,yout in zip(ys,ypred)])
loss

Value(data=4.415740893797561, grad=0)

In [139]:
ypred = [n(x) for x in xs]
loss  =  sum([(yout-ygt)**2 for ygt,yout in zip(ys,ypred)])
loss

Value(data=4.415740893797561, grad=0)

In [140]:
loss.backward()

In [141]:
for p in n.parameters():
    p.data +=-0.00001 * p.grad

In [163]:
for i in range(100):
    ypred = [n(x) for x in xs]
    loss  =  sum([(yout-ygt)**2 for ygt,yout in zip(ys,ypred)])
    print(loss)
    loss.backward()
    for p in n.parameters():
        p.data +=-0.000001 * p.grad

Value(data=0.11034065228999192, grad=0)
Value(data=0.11188528989099612, grad=0)
Value(data=0.1135500786293937, grad=0)
Value(data=0.11533456494622277, grad=0)
Value(data=0.117238261308903, grad=0)
Value(data=0.11926064663243935, grad=0)
Value(data=0.12140116670819623, grad=0)
Value(data=0.12365923463997441, grad=0)
Value(data=0.12603423128710975, grad=0)
Value(data=0.12852550571432778, grad=0)
Value(data=0.13113237564807895, grad=0)
Value(data=0.13385412793909648, grad=0)
Value(data=0.13669001903090755, grad=0)
Value(data=0.13963927543404225, grad=0)
Value(data=0.1427010942056845, grad=0)
Value(data=0.14587464343450776, grad=0)
Value(data=0.14915906273045204, grad=0)
Value(data=0.15255346371919032, grad=0)
Value(data=0.156056930541045, grad=0)
Value(data=0.1596685203541115, grad=0)
Value(data=0.1633872638413557, grad=0)
Value(data=0.1672121657214483, grad=0)
Value(data=0.17114220526311233, grad=0)
Value(data=0.17517633680274922, grad=0)
Value(data=0.1793134902651274, grad=0)
Value(data

In [167]:
for i in range(100):
    #forward pass
    ypred = [n(x) for x in xs]
    loss  =  sum([(yout-ygt)**2 for ygt,yout in zip(ys,ypred)])
    print(loss)
    #backward pass
    loss.backward()
    
    #update
    for p in n.parameters():
        p.data +=-0.000001 * p.grad
ypred

Value(data=1.2530236891641915, grad=0)
Value(data=1.2519537866780364, grad=0)
Value(data=1.250884329695333, grad=0)
Value(data=1.249815914031227, grad=0)
Value(data=1.2487491285520997, grad=0)
Value(data=1.2476845550574447, grad=0)
Value(data=1.246622768163938, grad=0)
Value(data=1.2455643351917138, grad=0)
Value(data=1.2445098160528263, grad=0)
Value(data=1.2434597631418947, grad=0)
Value(data=1.2424147212289258, grad=0)
Value(data=1.2413752273543124, grad=0)
Value(data=1.2403418107259865, grad=0)
Value(data=1.2393149926187306, grad=0)
Value(data=1.2382952862756378, grad=0)
Value(data=1.2372831968117093, grad=0)
Value(data=1.236279221119578, grad=0)
Value(data=1.235283847777359, grad=0)
Value(data=1.2342975569586045, grad=0)
Value(data=1.2333208203443695, grad=0)
Value(data=1.2323541010373618, grad=0)
Value(data=1.2313978534781767, grad=0)
Value(data=1.230452523363609, grad=0)
Value(data=1.2295185475670225, grad=0)
Value(data=1.2285963540607734, grad=0)
Value(data=1.2276863618406755, 

[Value(data=0.506361582235045, grad=-0.98727683552991),
 Value(data=-0.354698036586124, grad=1.290603926827752),
 Value(data=-0.5028602504466197, grad=0.9942794991067605),
 Value(data=0.4493521067349753, grad=-1.1012957865300494)]