Note:   
 This project shows what neural network training looks like under the hood.
 We will be building micrograd, following Andrej Karpathy's micrograd project.    
 Micrograd is a tiny autograd (automatic gradient) engine and neural network library written in Python.  
 It implements backpropagation - the algorithm that lets you efficiently compute the gradients of a loss function with respect to the weights of a neural network.    
 This allows us to iteratively tune the weights of the neural network to minimize the loss function and improve the accuracy of the network.

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def f(x):
    """Evaluate the quadratic 3x^2 - 4x + 5 at ``x`` (a scalar or a NumPy array)."""
    squared_term = 3 * x**2
    linear_term = 4 * x
    return squared_term - linear_term + 5

In [None]:
# evaluate the quadratic at a single point: 3*3.0**2 - 4*3.0 + 5 = 20.0
f(3.0)

In [None]:
# plot f over a range of inputs to see the shape of the function
xs = np.arange(-5, 5, 0.25) #values from -5 up to (but not including) 5 in steps of 0.25
ys = f(xs) #compute f(x) for each value in xs
plt.plot(xs, ys) #plot the points
plt.title("Plot of f(x) = 3x^2 - 4x + 5")

A derivative tells you how the output of a function responds when you nudge its input by a tiny amount: whether the output goes up or down (the sign) and how strongly it reacts (the magnitude). Together these give the slope of the function at that point.

In [None]:
# estimate the derivative numerically: nudge x by a tiny h and measure rise over run
h = 0.000001
x = 2/3
# forward-difference approximation of the slope of f at x; for the quadratic
# f(x) = 3x^2 - 4x + 5 the exact slope is 6x - 4, which is 0 at x = 2/3,
# so the displayed value is very close to 0
(f(x + h) - f(x)) / h

In [None]:
# a slightly more complex case: an output d that depends on three inputs
a = 2.0
b = -3.0
c = 10.0
d = a*b + c # 2.0 * -3.0 + 10.0 = 4.0
print(d)

In [None]:
# Track how a tiny nudge to one input changes the output d = a*b + c.
h = 0.0001

# inputs
a = 2.0
b = -3.0
c = 10.0

d1 = a * b + c  # output before the nudge
a += h  # nudge a; nudge b or c here instead to measure their effect
d2 = a * b + c  # output after the nudge

# Rise over run is (after - before) / h. The operands must be in this order,
# otherwise the reported slope has the wrong sign (dd/da is b = -3.0 here).
slope = (d2 - d1) / h

print('d1', d1)
print('d2', d2)
print('slope', slope)

In [None]:
class Value:
    """A scalar that remembers how it was computed so gradients can flow back.

    Every arithmetic operation returns a new ``Value`` whose ``_prev`` holds
    the operands and whose ``_backward`` closure applies the chain rule for
    that single operation. Calling ``backward()`` on the final result fills in
    ``grad`` on every node that contributed to it.

    Plain Python numbers may appear on either side of ``+``, ``*`` and ``/``;
    they are wrapped into ``Value`` objects automatically.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0  # derivative of the final output with respect to this value
        self._backward = lambda: None  # leaf nodes have nothing to propagate
        # The operands that produced this value. They are called "children"
        # here, although they are really the parents of the result.
        self._prev = set(_children)
        self._op = _op  # operation that produced this value ('' for a leaf)
        self.label = label  # name shown for this value during visualization

    def __repr__(self):
        return f"Value(data={self.data})"

    @staticmethod
    def _wrap(other):
        """Return ``other`` unchanged if it is a Value, else wrap the raw number."""
        return other if isinstance(other, Value) else Value(other)

    def __add__(self, other):
        other = self._wrap(other)
        out = Value(self.data + other.data, (self, other), "+")

        def _backward():
            # d(a + b)/da = d(a + b)/db = 1: the gradient passes through unchanged.
            # += (not =) so a value used more than once accumulates all paths.
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward
        return out

    def __radd__(self, other):
        # Called for `number + Value`; addition is commutative.
        return self + other

    def __mul__(self, other):
        other = self._wrap(other)
        out = Value(self.data * other.data, (self, other), "*")

        def _backward():
            # d(a * b)/da = b and d(a * b)/db = a
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward
        return out

    def __rmul__(self, other):
        # Called for `number * Value`; multiplication is commutative.
        return self * other

    def __pow__(self, other):
        """Raise this value to a constant int/float power.

        Raises:
            TypeError: if the exponent is not a plain int or float (the
                exponent is treated as a constant, so it gets no gradient).
        """
        if not isinstance(other, (int, float)):
            raise TypeError("Value only supports int/float exponents")
        out = Value(self.data ** other, (self,), f"**{other}")

        def _backward():
            # power rule: d(x**k)/dx = k * x**(k - 1)
            self.grad += other * self.data ** (other - 1) * out.grad

        out._backward = _backward
        return out

    def __truediv__(self, other):
        # a / b is expressed as a * b**-1 so it reuses the mul and pow gradients.
        return self * self._wrap(other) ** -1

    def __rtruediv__(self, other):
        # Called for `number / Value`.
        return self._wrap(other) * self ** -1

    # The original placeholders used these names, which Python never calls for
    # the ** and / operators. They are kept as aliases so that any explicit
    # calls to the old names keep working.
    __exp__ = __pow__
    __div__ = __truediv__

    def tanh(self):
        # math.tanh saturates to +/-1.0 for large inputs, whereas computing
        # (e^2x - 1) / (e^2x + 1) by hand overflows in math.exp.
        t = math.tanh(self.data)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            # d tanh(x)/dx = 1 - tanh(x)^2
            self.grad += (1 - t**2) * out.grad

        out._backward = _backward
        return out

    def backward(self):
        """Backpropagate from this value through the whole graph behind it.

        Builds a topological ordering of every node reachable through
        ``_prev``, seeds this node's ``grad`` with 1.0, and then calls each
        node's ``_backward`` from the output back towards the leaves.
        Gradients accumulate with ``+=``; ``grad`` on the other nodes is not
        reset here.
        """
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                # appended only after all of its operands, so reversed(topo)
                # visits a node before anything it was computed from
                topo.append(v)

        build_topo(self)

        self.grad = 1.0  # derivative of the output with respect to itself
        for node in reversed(topo):
            node._backward()
        
        
    
# build a small expression graph: L = (a*b + c) * f
a = Value(2.0, label="a")
b = Value(-3.0, label="b")
c = Value(10.0, label="c")
e = a*b; e.label="e" #a.__mul__(b)
d = e +c; d.label="d" #e.__add__(c)
# NOTE: this rebinds the name f, which earlier referred to the function f(x);
# calling f(...) after this cell fails until the function is defined again.
f = Value(-2.0, label='f')
L = d * f; L.label = "L" #we are using this to represent the Loss function in this context
L


If you are running the Jupyter notebook in VS Code like I am, you need both the pip-installed graphviz package and the Graphviz binaries from `brew install graphviz` for the visualization to work;
self.grad = 0.0 #represents the derivative of the final output (L) with respect to that value - i.e. grad in variable d is the derivative of L with respect to d, and grad in variable b is the derivative of L with respect to b (found by chaining through e and d).


In [None]:
# function to help us visualize the expressions
from graphviz import Digraph

def trace(root):
    """Collect every node reachable from ``root`` and every (operand, result) edge.

    Follows the ``_prev`` links using an explicit stack and returns
    ``(nodes, edges)`` as two sets.
    """
    seen, links = set(), set()
    pending = [root]
    while pending:
        current = pending.pop()
        if current in seen:
            continue  # already expanded through another path
        seen.add(current)
        for operand in current._prev:
            links.add((operand, current))
            pending.append(operand)
    return seen, links

def draw_dot(root):
    """Render the expression graph ending at ``root`` as a left-to-right Digraph.

    Every value becomes a record-shaped box showing its label, data and grad.
    A value produced by an operation also gets a separate op node that feeds
    into it, and each operand is connected to that op node. Returns the
    ``Digraph`` object.
    """
    graph = Digraph(format='svg', graph_attr={'rankdir' : 'LR'})  # LR = left to right
    values, links = trace(root)

    for value in values:
        value_id = str(id(value))
        box_text = "{ %s | data %.4f | grad %.4f}" % (value.label, value.data, value.grad)
        graph.node(name=value_id, label=box_text, shape='record')
        if not value._op:
            continue  # leaf value: no operation produced it
        # the op node's name is the value's id with the op appended
        op_id = value_id + value._op
        graph.node(name=op_id, label=value._op)
        graph.edge(op_id, value_id)

    for source, target in links:
        # operands point at the op node of the value they produced
        graph.edge(str(id(source)), str(id(target)) + target._op)

    return graph


In [None]:
# render the expression graph that ends at L
draw_dot(L) 

var a;
var b;
var c;
e = a * b;
d = e + c;
var f;
L = d * f;  
L represent the loss function, which represents the result of all the operations carried out on the inputs;
The process of the operations carried out on the inputs to get L is known as the forward pass;
Back-propagation is the reverse of the forward pass and is also known as the backward pass;  
Back-prop involves calculating the gradient at all the intermediate values that were combined to get L;
For every single value, we will compute the derivative of L with respect to that value;
i.e, derivative of L with respect to L (it's 1), derivative of L with respect to f, derivative of L with respect to e - same for a,b,c and d;
In neural network (NN) settings, we are mostly concerned with the derivative of L (the loss function) with respect to its leaf nodes (here a, b, c and f; d and e are intermediate values). Some of those leaf nodes will become the weights of the NN (e.g. c and f), while the rest will be the data itself (e.g. a and b);
Usually we will not want/use the derivative of the loss function (L) with respect to the data, because the data is fixed while the weights are what we iterate on;

In [None]:
# finding the gradients manually using the chain rule, from the output L back to the inputs
L.grad = 1.0 #( dL/dL = 1) always true 
d.grad = L.grad * f.data #1.0 * -2.0 = -2.0 #because d is multiplied by f to give L
f.grad = L.grad * d.data  #1.0 * 4.0 = 4.0 #because f is multiplied by d to give L

c.grad = d.grad * 1.0 # -2.0 * 1.0 = -2.0  # because e is added to c to give d
e.grad = d.grad * 1.0 # -2.0 * 1.0 = -2.0 

a.grad = e.grad * b.data # -2.0 * -3.0 = 6.0 # because a is multiplied by b to give e
b.grad = e.grad * a.data # -2.0 * 2.0 = -4.0

# in multiplication, the gradient of the result is multiplied by the data of the other operand to get the gradient of one operand.
# in addition, the gradient of the result is simply passed to each operand unchanged.

In [None]:
# a function to numerically estimate the gradient of L with respect to a - can be used to get the gradients of other variables too by changing the variable 'h' is added to
def lol():
    """Print a finite-difference estimate of dL/da for L = (a*b + c) * f."""
    h = 0.001

    def forward(a_data):
        # rebuild the whole expression from scratch and return L as a raw number
        a = Value(a_data, label="a")
        b = Value(-3.0, label="b")
        c = Value(10.0, label="c")
        e = a*b; e.label="e"
        d = e +c; d.label="d"
        f = Value(-2.0, label='f')
        L = d * f; L.label = "L"
        return L.data

    L1 = forward(2.0)  # baseline
    L2 = forward(2.0 + h)  # same expression with a nudged by h

    print((L2 - L1)/h) #this will give us the derivative(grad) of L with respect to 'a'
lol()

how to use the above code to manually solve for the derivative(gradient) of each variable using h;
L2 = L.data + h #this will give the derivative(grad) of L with respect to 'L';
a = Value(2.0 + h, label="a") #this will give the derivative(grad) of L with respect to 'a';
b = Value(-3.0 + h, label="b") #this will give the derivative(grad) of L with respect to 'b';
f = Value(-2.0 + h, label='f') #this will give the derivative(grad) of L with respect to 'f';
d = e + c; d.label="d" 
d.data += h  #this will give the derivative(grad) of L with respect to 'd';
e = a*b; e.label="e" 
e.data += h #this will give the derivative(grad) of L with respect to 'e';

#we are basically adding h to the L2 section of the variable that we want the grad
#before adding the h to the new variable, remove it from the old variable.


In [None]:
# now we want to backpropagate through a neuron (using tanh as the activation function)
# this plot shows the squashing/capping effect of tanh: the outputs stay between -1 and 1
plt.plot(np.arange(-5,5,0.2), np.tanh(np.arange(-5,5,0.2))); plt.grid()

In [None]:
# inputs x1, x2 - two inputs, so this is a 2D neuron
x1 = Value(2.0, label="x1")
x2 = Value(0.0, label="x2")

# weights w1,w2 - one weight per input
w1 = Value(-3.0, label="w1")
w2 = Value(1.0, label="w2")

# bias of the neuron (note: this rebinds the name b used in the earlier example)
b = Value(6.8813735870195432, label="b")

# x1*w1 + x2*w2 + b, built one step at a time so every intermediate value gets a label
x1w1 = x1*w1; x1w1.label="x1w1"
x2w2 = x2*w2; x2w2.label="x2w2"
x1w1x2w2 = x1w1+x2w2; x1w1x2w2.label="x1w1 + x2w2"
n = x1w1x2w2 + b; n.label="n" #weighted sum plus bias, before the activation
o = n.tanh(); o.label="o" #output of the neuron


In [None]:
# render the expression graph of the neuron, ending at its output o
draw_dot(o)

In a typical NN setting, what we usually care about are the gradients of the weights (w1 and w2)

In [None]:
# manual backpropagation through the neuron, from the output o back to the inputs and weights
o.grad = 1.0 # do/do = 1
n.grad = 1.0 - (o.data**2) # because derivative of tanh is (1 - tanh^2)

# n = x1w1x2w2 + b: addition passes the gradient through unchanged
x1w1x2w2.grad = n.grad * 1.0 # 0.5 * 1.0 = 0.5
b.grad = n.grad * 1.0 # 0.5 * 1.0 = 0.5

# x1w1x2w2 = x1w1 + x2w2: addition again
x2w2.grad = x1w1x2w2.grad * 1.0  #0.5 * 1.0 = 0.5
x1w1.grad = x1w1x2w2.grad * 1.0 # 0.5 * 1.0 = 0.5

# x2w2 = x2 * w2: each operand's gradient is the other operand's data times the result's gradient
x2.grad = w2.data * x2w2.grad  # 1.0 * 0.5 = 0.5
w2.grad = x2.data * x2w2.grad  # 0.0 * 0.5 = 0.0

# x1w1 = x1 * w1: same multiplication rule
w1.grad = x1.data * x1w1.grad  # 2.0 * 0.5 = 1.0 (of the two weights, this is the one whose adjustment changes the output; w2's gradient is 0.0)
x1.grad = w1.data * x1w1.grad  # -3.0 * 0.5 = -1.5




In [None]:
# automatic backpropagation through the neuron using the implemented backward function
# The manual backprop cell already wrote gradients into these nodes, and every
# _backward accumulates with +=, so clear them first; otherwise the automatic
# pass adds on top of the manual values and reports wrong gradients.
for node in (o, n, x1w1x2w2, b, x1w1, x2w2, x1, w1, x2, w2):
    node.grad = 0.0
o.backward()