In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
''' %matplotlib inline sets the backend of matplotlib to
the 'inline' backend. When using the 'inline' backend,
your matplotlib graphs will be included in your notebook,
next to the code.'''

" %matplotlib inline sets the backend of matplotlib to\nthe 'inline' backend. When using the 'inline' backend,\nyour matplotlib graphs will be included in your notebook,\nnext to the code."

In [2]:
from graphviz import Digraph

def trace(root):
    '''
    Collect every node and edge reachable from `root`.

    The graph is walked backwards through each node's `_prev` collection.

    Args:
        root: the node to start from. It, and every node reached through
            `_prev`, must be hashable and expose an iterable `_prev`.

    Returns:
        (nodes, edges): `nodes` is the set of visited nodes and `edges` is a
        set of `(child, parent)` tuples. Both are sets, so neither is ordered.
    '''
    nodes, edges = set(), set()

    # An explicit stack is used instead of recursion so that a long chain of
    # operations cannot run into Python's recursion limit.
    pending = [root]
    while pending:
        v = pending.pop()
        if v in nodes:
            continue
        nodes.add(v)
        for child in v._prev:
            edges.add((child, v))  # an edge points from an input to its result
            pending.append(child)

    return nodes, edges

# first sketch a scheme of your desired graph based on the nodes and operations
def draw_dot(root):
    '''
    Build a left-to-right Graphviz digraph of the graph that ends in `root`.

    Every node returned by `trace(root)` becomes a record-shaped box showing
    its label, data and grad. A node with a non-empty `_op` also gets a small
    operation node placed between its inputs (`_prev`) and its own box.

    Returns:
        The populated `Digraph` (SVG format).
    '''
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR : left to right

    nodes, _ = trace(root)
    for node in nodes:
        node_id = str(id(node))
        # the visual (record) node for the value itself
        text = f'{node.label} | data: {node.data} | grad: {node.grad}'
        graph.node(name=node_id, label=text, shape='record')

        if not node._op:
            continue  # leaf value: no operation produced it

        # the value is the result of an operation: add an op node
        # (node._op is a string) and connect it to the value node
        op_id = node_id + node._op
        graph.node(name=op_id, label=node._op)
        graph.edge(op_id, node_id)
        # every input of the operation feeds the op node
        for child in node._prev:
            graph.edge(str(id(child)), op_id)
    return graph

In [3]:
import torch

# Torch
Note: Python by default uses double (64-bit) precision for its floating-point operations,

but the default tensor dtype in PyTorch is float32.

So, to make everything identical, cast the tensors to double.

In [4]:
# Below we have a Tensor, which plays the role of our Value objects:
# Tensors are multi-dimensional Values.
# .double() casts the tensor to float64 (see the dtype in the output).
x1 = torch.Tensor([2.0]).double()
x1

tensor([2.], dtype=torch.float64)

In [5]:
# If you want PyTorch to compute the gradient w.r.t. a Tensor,
# you must explicitly set its requires_grad attribute to True.
x1 = torch.Tensor([2.0]).double();  x1.requires_grad = True
x1

tensor([2.], dtype=torch.float64, requires_grad=True)

#### torch.Tensor VS torch.tensor
In PyTorch torch.Tensor is the main tensor class. So `all tensors are just instances of torch.Tensor`.

When you call torch.Tensor() you will get an `empty tensor without any data` just like the Value objects.

it is no problem creating an empty tensor instance of torch.Tensor without data by calling:

`tensor_without_data = torch.Tensor()`

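If you also give it data, `torch.Tensor` always uses the default dtype (float32), so you must `explicitly convert it to the dtype you want` afterwards (e.g. with `.double()`).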

**But on the other side:**

`torch.tensor` is a `function` which returns a tensor, and `data must be given as the input`

`tensor_with_data = torch.tensor(data=['must be filled'])`

It also `automatically infers the dtype` of the data.

In [6]:
# All tensors are just instances of torch.Tensor with the same attributes,
# so requires_grad can be set on the result of torch.tensor(...) as well.
x1 = torch.tensor([2.0]).double();  x1.requires_grad = True
x1

tensor([2.], dtype=torch.float64, requires_grad=True)

In [7]:
# torch.tensor(data=[]) is a function; requires_grad can be passed to it directly.
# NOTE: .double() is applied after the tensor is created, so x1 is bound to
# the result of that conversion - the output below shows
# grad_fn=<ToCopyBackward0> instead of requires_grad=True.
x1 = torch.tensor([2.0], requires_grad = True).double()
x1

tensor([2.], dtype=torch.float64, grad_fn=<ToCopyBackward0>)

### the same NN Architecture in micrograd with Torch

In [8]:
# One tanh neuron with two inputs: o = tanh(x1*w1 + x2*w2 + b).
# Every leaf (inputs, weights AND bias) is cast to float64 and marked as
# requiring a gradient so that o.backward() fills in its .grad.
x1 = torch.Tensor([2.0]).double();  x1.requires_grad = True
x2 = torch.Tensor([0.0]).double();  x2.requires_grad = True
w1 = torch.Tensor([-3.0]).double();  w1.requires_grad = True
w2 = torch.Tensor([1.0]).double();  w2.requires_grad = True
# the flag must be set on b itself (it was previously set on x1 a second time)
b = torch.Tensor([6.881373587]).double();  b.requires_grad = True
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)
print(o)
print(o.data)
# if you want the item of data
print(o.data.item())

tensor([0.7071], dtype=torch.float64, grad_fn=<TanhBackward0>)
tensor([0.7071], dtype=torch.float64)
0.7071066904050358


In [9]:
# backpropagate from o; afterwards the .grad of the leaf tensors can be read
o.backward()

In [10]:
print(o) # o is a tensor
print(o.item()) # .item() gives the plain Python number
print(x1.grad) # grad is a tensor

tensor([0.7071], dtype=torch.float64, grad_fn=<TanhBackward0>)
0.7071066904050358
tensor([-1.5000], dtype=torch.float64)


In [11]:
# gradient of o w.r.t. each input and weight, as plain Python numbers
for leaf in (x1, x2, w1, w2):
    print(leaf.grad.item())

-1.5000003851533106
0.5000001283844369
1.0000002567688737
0.0


# A NN that subscribes to the PyTorch api

In [12]:
# we have our class Value
class Value:
    '''
    A scalar that remembers how it was computed, so that gradients can be
    back-propagated through the expression that produced it.

    Attributes:
        data: the numeric value of this node.
        grad: derivative of the node `backprop` was called on with respect
            to this node. Starts at 0.0 and is accumulated with `+=`.
        label: optional display name.
        _prev: set of the Values this node was computed from.
        _op: string naming the operation that produced this node
            ('' for a leaf).
        _backprop: closure that adds this node's contribution to the grads
            of the nodes in `_prev`.
    '''

    def __init__(self, data, _prev=(), _op='', label=''):
        self.data = data
        self._prev = set(_prev) # the order does not matter, use set instead of list
        self._op = _op
        self.label = label

        self.grad = 0.0

        # Leaves have nothing to propagate, and every operation has its own
        # local derivative, so there is no single backprop rule for the class.
        # Instead, each operation stores a closure on its result that applies
        # the chain rule to its inputs; `backprop` later calls these closures
        # from the output node back towards the leaves.
        self._backprop = lambda: None

    def __repr__(self) -> str:
        return f'Value({self.label} | data:{self.data} | grad:{self.grad})'

    def __add__(self, other):
        '''self + other; a plain number is wrapped in a Value first.'''
        other = other if isinstance(other, Value) else Value(other)

        out = Value(self.data + other.data, (self, other), _op='+')

        def _backprop():
            # d(a+b)/da = d(a+b)/db = 1, so the gradient just flows through
            self.grad += out.grad
            other.grad += out.grad

        out._backprop = _backprop
        return out

    def __radd__(self, other):
        '''other + self, for a non-Value `other` on the left (e.g. sum()'s 0).'''
        return self + other

    def __sub__(self, other):
        '''self - other, expressed as self + (-1 * other).'''
        return self + -1*other

    def __rsub__(self, other):
        '''other - self, expressed as -(self - other).'''
        return -1*(self + -1*other)

    def __neg__(self):
        '''-self, expressed as self * -1.'''
        return self * -1

    def __mul__(self, other):
        '''self * other; a plain number is wrapped in a Value first.'''
        other = other if isinstance(other, Value) else Value(other)

        out = Value(self.data * other.data, (self, other), _op='*')

        def _backprop():
            # d(a*b)/da = b and d(a*b)/db = a
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backprop = _backprop
        return out

    def __rmul__(self, other):
        '''other * self, for a non-Value `other` on the left.'''
        return self * other

    def __truediv__(self, other):
        '''self / other, expressed as self * other**-1.'''
        return self * other**-1

    def __rtruediv__(self, other):
        '''other / self, expressed as other * self**-1.'''
        return other * self**-1

    def __pow__(self, other):
        '''
        self ** other for a constant int or float exponent.

        Raises:
            AssertionError: if `other` is not an int or float. A Value
                exponent would need its own derivative rule.
        '''
        assert isinstance(other, (int, float)), 'only int/float powers are supported'

        out = Value(self.data**other, (self,), _op=f'**{other}')

        def _backprop():
            # d(x**k)/dx = k * x**(k-1)
            self.grad += (other * self.data**(other - 1)) * out.grad

        out._backprop = _backprop
        return out

    def relu(self):
        '''max(self, 0).'''
        out = Value( self.data if self.data > 0.0 else 0.0 , (self,), _op='ReLU' )

        def _backprop():
            # the gradient passes only where the output is positive
            self.grad += out.grad * (out.data > 0.0)

        out._backprop = _backprop
        return out

    def exp(self):
        '''e ** self.'''
        out = Value(np.exp(self.data), (self,), _op='exp')

        def _backprop():
            # d(e**x)/dx = e**x, which is exactly out.data
            self.grad += out.data * out.grad

        out._backprop = _backprop
        return out

    def tanh(self):
        '''Hyperbolic tangent of self.'''
        # math.tanh stays finite for large |data|. The explicit formula
        # (e^{2x} - 1) / (e^{2x} + 1) overflows to inf/inf = nan there.
        t = math.tanh(self.data)
        out = Value( t, (self,), _op='tanh')

        def _backprop():
            # d(tanh x)/dx = 1 - tanh(x)**2
            self.grad += (1 - t**2) * out.grad

        out._backprop = _backprop
        return out

    def backprop(self):
        '''
        Back-propagate from this node through the whole graph behind it.

        Sets `self.grad` to 1.0 and then calls each node's `_backprop`
        closure in reverse topological order, so a node's grad is complete
        before it is pushed on to its inputs. Grads are accumulated with
        `+=` and are NOT reset here: set `grad` back to 0.0 on nodes that
        are reused (e.g. parameters) before calling this again.
        '''
        topo_sort_list = []
        visited = set() # the order does not matter, use set instead of list

        # Post-order depth-first walk (a node is appended only after all of
        # its children), giving a leaf-to-output order. An explicit stack of
        # (node, iterator over its children) replaces recursion so that deep
        # graphs cannot hit Python's recursion limit.
        visited.add(self)
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break  # descend into this child first
            else:
                # every child has been handled: the node itself is next
                topo_sort_list.append(node)
                stack.pop()

        self.grad = 1.0
        for node in reversed(topo_sort_list):
            node._backprop()


In [13]:
import random

We can call an instance of a class like a function to get an output;
for that the class needs to define the `__call__` method:

n = Neuron(nin) # object (nin = number of inputs)

a = n(x) # calling object

here we use it to give us the value of the forward pass

In [14]:
class Neuron:
    '''
    A single neuron computing tanh(w . x + b).

    Attributes:
        w: list of `nin` weight Values, labelled 'w0', 'w1', ...
        b: bias Value, labelled 'b'.
    '''

    # nin: number of inputs
    def __init__(self, nin: int):
        # weights are drawn first, then the bias, each uniformly from [-1, 1]
        self.w = [ Value( random.uniform(-1,1), label=f'w{i}' ) for i in range(nin)]
        self.b = Value(random.uniform(-1,1), label='b')

    def __call__(self, x: list) -> 'Value':
        '''
        Forward pass for one input vector.

        Args:
            x: the inputs, paired point-wise with the weights. NOTE: zip
                stops at the shorter sequence, so an `x` with fewer entries
                than `self.w` is accepted silently and the remaining weights
                are ignored.

        Returns:
            The result of calling `.tanh()` on w * x + b (a single scalar node).
        '''
        # w * x + b -> a scalar value; the sum starts from the bias
        weighted = ( wi*xi for wi,xi in zip(self.w, x) )
        activation = sum(weighted, start=self.b)
        return activation.tanh()

    def parameters(self):
        '''All trainable values of the neuron: the weights followed by the bias.'''
        return self.w + [self.b]


class OneMLPLayer:
    '''
    One fully connected layer: `nout` independent Neurons that all read the
    same `nin` inputs.
    '''

    def __init__(self, nin: int, nout: int):
        # A bunch of Neurons whose order matters (output i comes from
        # neuron i), hence a list.
        neurons = []
        for _ in range(nout):
            neurons.append(Neuron(nin))
        self.neurons = neurons

    def __call__(self, x: list) -> list:
        '''
        Feed `x` to every neuron.

        Returns the list of neuron outputs, except that a layer with exactly
        one neuron returns that single output unwrapped.
        '''
        outs = []
        for neuron in self.neurons:
            outs.append(neuron(x))
        if len(outs) == 1:
            return outs[0]
        return outs

    def parameters(self) -> list:
        '''Flat list of the parameters of every neuron, in neuron order.'''
        collected = []
        for neuron in self.neurons:
            collected.extend(neuron.parameters())
        return collected
        

class MLP:
    '''
    A multi-layer perceptron: a bunch of layers whose order matters, hence
    a list.

    input layer: nin - hidden layers: hs - output layer: nout

    Args:
        nin: number of inputs of the network.
        hs: widths of the hidden layers (any sequence, e.g. list or tuple).
        nout: number of outputs of the network.
    '''
    def __init__(self, nin: int, hs: list, nout: int) -> None:
        # unpacking (instead of list concatenation) also accepts a tuple for hs
        layer_width = [nin, *hs, nout]
        self.layers = [OneMLPLayer(nin= layer_width[idx], nout=layer_width[idx+1]) for idx in range(len(layer_width)-1) ]

    def __call__(self, x):
        '''
        Forward pass: feed `x` through every layer in order.

        Returns whatever the last layer returns: a list of outputs, or the
        single output itself when the last layer has exactly one neuron.
        '''
        out = x
        last = len(self.layers) - 1
        for idx, layer in enumerate(self.layers):
            out = layer(out)
            # A layer with exactly one neuron returns its output unwrapped.
            # Between layers it has to be a list again, otherwise the next
            # layer cannot pair it with its weights.
            if idx != last and not isinstance(out, list):
                out = [out]
        return out

    def parameters(self):
        '''Flat list of the parameters of every layer, in layer order.'''
        return [parameter for layer in self.layers for parameter in layer.parameters()]




In [15]:
# One sample with three features, matching nin=3 below.
# (A missing comma previously turned this into the two-element list [2.0, 2.0].)
x = [2.0, 3.0, -1.0]
n = Neuron(3)
l = OneMLPLayer(3, 3)
model = MLP(nin=3, hs=[4, 4], nout=1)
print(n(x))
print(l(x))
print(model(x))

Value( | data:0.8018681265098966 | grad:0.0)
[Value( | data:-0.8805757909710763 | grad:0.0), Value( | data:0.8555983793423952 | grad:0.0), Value( | data:0.8832098486745505 | grad:0.0)]
Value( | data:-0.14642457747197496 | grad:0.0)


In [16]:
# input data points: four samples with three features each
x_batch = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]

# desired targets, one per sample
y_batch = [1.0, -1.0, -1.0, 1.0]

In [17]:
# forward pass: one prediction per input sample
y_preds = [model(x) for x in x_batch]
y_preds

[Value( | data:0.07831801032455092 | grad:0.0),
 Value( | data:-0.8984201737040973 | grad:0.0),
 Value( | data:-0.6123353581808755 | grad:0.0),
 Value( | data:-0.21870671677024864 | grad:0.0)]

In [18]:
# squared error between each target and its prediction
losses = [(ygt-y_pred)**2 for ygt, y_pred in zip(y_batch, y_preds)]
# sum() starts from the int 0; this works because Value defines __radd__
loss = sum(losses)
loss.label = 'loss'
loss

Value(loss | data:2.4953460872200695 | grad:0.0)

The MSE loss is always non-negative (≥ 0),

and to get better predictions we need to reduce it.

In [19]:
# backpropagate from the loss through the whole graph behind it
loss.backprop()

In [20]:
# flat list of all weights and biases of the MLP
param = model.parameters()
print(len(param))
param

41


[Value(w0 | data:0.915783763752259 | grad:0.36176150770990945),
 Value(w1 | data:-0.1905455982987767 | grad:0.3856647427725668),
 Value(w2 | data:-0.4812601281172937 | grad:-0.4661916955473663),
 Value(b | data:0.006421811569115254 | grad:0.21113592376717188),
 Value(w0 | data:-0.39839420483993737 | grad:-0.045486377588545),
 Value(w1 | data:-0.6410730140883056 | grad:-0.07673443261662213),
 Value(w2 | data:0.8690608121468422 | grad:-0.11964108063112913),
 Value(b | data:-0.9669803000002632 | grad:-0.08677935140010419),
 Value(w0 | data:0.8823970304213575 | grad:0.21388048271275503),
 Value(w1 | data:0.7836271370251264 | grad:0.2007740466804331),
 Value(w2 | data:0.22612836684268367 | grad:-0.27577500365992424),
 Value(b | data:-0.18171296341253096 | grad:0.19501937778275297),
 Value(w0 | data:-0.013915501429131227 | grad:-1.5341603789801292),
 Value(w1 | data:0.7833508632156321 | grad:-1.5169831948252233),
 Value(w2 | data:-0.49347382234372605 | grad:1.786051907179295),
 Value(b | dat

`Negative gradient` means that if we `increase this weight`, the `loss will go down`

`Positive gradient` means that if we `decrease this weight`, the `loss will go down`

And we have this information for all weights

In [None]:
# graph of the 4 forward passes of our MLP network, joined by the loss node
draw_dot(loss)

## Optimization
Update the parameters (weights and biases) by `modifying them in the opposite direction of the gradient`, scaled by a learning rate:

`p_new = p_old - lr*grad`

In [21]:
# first weight of the first neuron in the first layer, before the update
model.layers[0].neurons[0].w[0]

Value(w0 | data:0.915783763752259 | grad:0.36176150770990945)

In [22]:
lr = 0.1 # learning rate (step size)
for p in model.parameters():
    # step against the gradient: p_new = p_old - lr*grad
    p.data -= lr*p.grad

In [23]:
# the same weight as inspected before the update; the network is bound to
# `model` (no `mlp` variable is defined anywhere in this notebook)
model.layers[0].neurons[0].w[0]

Value(w0 | data:0.8796076129812681 | grad:0.36176150770990945)

In [218]:
# forward pass again with the updated parameters, then recompute the loss
y_preds = [model(x) for x in x_batch]
print(y_preds)
loss = sum((ygt-y_pred)**2 for ygt, y_pred in zip(y_batch, y_preds))
loss

[Value( | data:0.9695570006106607 | grad:0.0), Value( | data:0.3908102914553565 | grad:0.0), Value( | data:0.12469364441718465 | grad:0.0), Value( | data:0.8794516540068404 | grad:0.0)]


Value( | data:3.214747740544048 | grad:0.0)

In [214]:
# Gradients are accumulated with +=, and the parameters are the same objects
# as in the previous pass, so they still hold the old gradients.
# Reset them before backpropagating the new loss.
for p in model.parameters():
    p.grad = 0.0
loss.backprop()

In [None]:
# flat list of all weights and biases of the MLP, after the latest backward pass
param = model.parameters()
print(len(param))
param

41


[Value(w0 | data:-0.7800143933499435 | grad:-0.058292146708676336),
 Value(w1 | data:0.16289228661893906 | grad:-0.059958376679883206),
 Value(w2 | data:-0.20967497944127667 | grad:-0.16440308701852463),
 Value(b | data:-0.0422469166235695 | grad:-0.09451187988759993),
 Value(w0 | data:-0.9054239788137961 | grad:0.026104385222427542),
 Value(w1 | data:-0.3260986947394393 | grad:0.03193490177448111),
 Value(w2 | data:-0.4879378645782313 | grad:0.06066615724202481),
 Value(b | data:-0.7991330160394869 | grad:0.041045106917326775),
 Value(w0 | data:-0.4969658375301522 | grad:-2.3236541276505727),
 Value(w1 | data:-0.6746652129527699 | grad:-0.6154446437948274),
 Value(w2 | data:0.5534059556319595 | grad:-1.5059625586659788),
 Value(b | data:-0.03229857135030967 | grad:-1.7708956190618947),
 Value(w0 | data:-0.18837168223493062 | grad:0.9342769425612104),
 Value(w1 | data:0.6794315371587631 | grad:1.3795131167272128),
 Value(w2 | data:-0.04520585545773326 | grad:1.7576365226168393),
 Value