In [1]:
import math
import random

In [2]:
# Value class is essentially scalars with additional features enabling back propigation
class Value:
    def __init__(self, data, _children=(), _op="", label=""):
        self.data = data
        self._op = _op
        self.label = label
        self._backprop = lambda: None
        self._prev = _children
        self.grad = 0

    def __repr__(self):
        return f"Value object (Data: {self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), "+")
        def _backprop():
            local_det = 1 # in respect to both terms, d(out)/d(term) = 1
            back_prop_det = out.grad * local_det # Apply the chain rule
            self.grad += back_prop_det # Pass gradient "backwards" to the inputs
            other.grad += back_prop_det # Pass gradient "backwards" to the inputs
            
        out._backprop = _backprop
        return out


    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out =  Value(self.data * other.data, (self, other), "*")
        def _backprop():
            self_local_det = other.data
            other_local_det = self.data
            self.grad += self_local_det * out.grad
            other.grad += other_local_det * out.grad
        out._backprop = _backprop
        return out

    def __rmul__(self, other):
        return self * other

    def __trudiv__(self, other):
        return self * other**-1

    def __sub__(self, other):
        return self + (-other)
    
    def tanh(self):
        tanh_value = (math.exp(2*self.data)-1)/(math.exp(2*self.data) + 1)
        out = Value(tanh_value, (self, ), "tanh")
        def _backprop():
            self.grad += (1 - (tanh_value**2)) * out.grad
        out._backprop = _backprop
        return out

    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self,  ), 'exp')

        def _backprop(self):
            self.grad += out.data * out.grad
        
        out._backprop = _backprop
        return out

    def __pow__(self, power):
        assert isinstance(power, (int, float)), "currently pow only supports int/float"
        x = self.data
        out = Value(x**power, (self, ), "**")

        def _backprop():
            self.grad += (power*(x**(power-1))) * out.grad
        out._backprop = _backprop
        return out
    

    def backprop(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1
        for node in reversed(topo):
            node._backprop()
        
    
    

In [3]:
# Setting Value objects to be used 
x1 = Value(2)
x2 = Value(0)
w1 = Value(-3)
w2 = Value(1)
b = Value(6.8813735870195432)


In [4]:
# Example scalar equation. Using backprop to see effect of x1 on o (gradient do/dx1)
n = (x1*w1 + x2*w2) + b
o = n.tanh()
o.backprop()
print("do/dx1 : %s" % (x1.grad))
print("Equation output value : %s" % o.data)

do/dx1 : -1.4999999999999996
Equation output value : 0.7071067811865476


### PyTorch Version of above

In [5]:
import torch

In [6]:
# Same as above, just using PyTorch api rather than custom Value class
x1 = torch.Tensor([2]).double(); x1.requires_grad = True
x2 = torch.Tensor([0]).double(); x2.requires_grad = True
w1 = torch.Tensor([-3]).double(); w1.requires_grad = True
w2 = torch.Tensor([1]).double(); w2.requires_grad = True
b = torch.Tensor([6.8813735870195432]).double(); b.requires_grad = True
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)

o.backward()
print("do/dx1 : %s" % (x1.grad.item()))
print("Equation output value : %s" % o.data.item())

do/dx1 : -1.5000003851533106
Equation output value : 0.7071066904050358


### Value class based Neuron

In [7]:
# Neuron Class builds on Value class to create a neuron with a set of (randomly initiated) weights and a bias
class Neuron:
    def __init__(self, nin):
        '''
        nin - number of inputs to the neuron
        '''
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)] # Generate a random weight for each input
        self.b = Value(random.uniform(-1,1)) # Generate a random bias for the neuron

    def __call__(self, x):
        # [weights DOT inputs] plus bias
        paired_weights = zip(self.w, x) # Merge into [(w1, x1), (w2, x2)... ]
        dot_prod = 0
        for pair in paired_weights:
            dot_prod = pair[0]*pair[1] + dot_prod
        activation = dot_prod + self.b
        # Oneliner: activation = sum(w*x for w,x in zip(self.w, x), self.b)  # sum(<array to sum>, <starting value to add on top of>)

        # Non liniarity, using tanh here
        out = activation.tanh()
        return out

    def parameters(self):
        # Get the 'knobs and dials' for this neuron
        return self.w + [self.b] # Full list of modifiable params


        

In [8]:
## Zip Demo - used to "pair up" values in arrays
zip_x = [1,2,3]
zip_y = [4,5,6]
list(zip(zip_x,zip_y))

[(1, 4), (2, 5), (3, 6)]

In [9]:
x = [2.0,3.0]
n = Neuron(2)
n(x)

Value object (Data: -0.903335923815253)

### From Neurons to Networks

In [10]:
# Layer class brings together a set of neurons into a network layer
class Layer:
    def __init__(self, nin, nout):
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, input):
        # To run an input on a layer, run the input through each neuron
        return [n(input) for n in self.neurons]

    def parameters(self):
        # Get the 'knobs and dials' for this layer
        params = []
        for n in self.neurons:
            params.extend(n.parameters())

        # shorthand: return [p for neuron in self.neurons for p in neuron.parameters()]
        return params

In [11]:
## Create inputs and a layer of neurons
inputs=[1.0, -2.0, -1.0, 4]
l1 = Layer(4, 3)
l1(inputs)

[Value object (Data: -0.9514688550081641),
 Value object (Data: 0.8824649501675387),
 Value object (Data: -0.948712358937843)]

#### Full MLP

In [12]:
# MLP class implements a multilayer perceptron model as a collection of connected layers
class MLP:
    def __init__(self, nin, nouts):
        '''
        nouts -> list of output sizes desired
        '''
        layer_sizes = [nin] + nouts # Add inputs to the start of the array of sizes
        self.layers = [Layer(layer_sizes[n], layer_sizes[n+1]) for n in range(len(nouts))] # Create layers

    def __call__(self, input):
        '''
        Run an input through the whole network.
        '''
        x = input
        for  l in self.layers:
            x = l(x) # Feed output from previous layer into next layer
        return x[0] if len(x) == 1 else x # Clean output for last node

    def parameters(self):
        # Get the 'knobs and dials' for the whole network
        return [p for layer in self.layers for p in layer.parameters()]
        

In [44]:
inputs=[1.0, -2.0, -1.0]
mlp = MLP(3, [4,4,1]) # Create 3 layer network that takes 3 inputs
mlp_result = mlp(inputs) 
mlp_result

Value object (Data: 0.4048014958084447)

### Back Prop in the MLP

In [45]:
# Results with no training, just passing through a new MLP
input_data = [
    [1.0, 3.0, -1],
    [1.0, 3.0, 0.5],
    [0.5, 1.5, 1.0],
    [-2.0, 1.0, -1.0]
]
targets = [1.0, -1.0, -1.0, 1.0]  # example 1 -> 1.0; example 2 -> -1

untrained_results = [mlp(i) for i in input_data]
untrained_results


[Value object (Data: 0.9069534342036834),
 Value object (Data: 0.9163948866459092),
 Value object (Data: 0.9332324724623532),
 Value object (Data: 0.775155971557185)]

In [46]:
# Calculating loss for no training
loss = [(result - target)**2 for target, result in zip(targets, untrained_results)]
loss = sum(loss, Value(0))
loss

Value object (Data: 7.469169654678371)

In [47]:
loss.backprop()

In [48]:
mlp.layers[0].neurons[0].w[0].grad # See the gradient of loss func in respect to one weight 

-0.036800686038540545

In [49]:
len(mlp.parameters()) # Number of params in this network

41

In [56]:
# Training loop on MLP,  
i=0
input_data = [
    [1.0, 3.0, -1],
    [1.0, 3.0, 0.5],
    [0.5, 1.5, 1.0],
    [-2.0, 1.0, -1.0]
]
loss = Value(0)
for _ in range(1000): # 1000 training itters
    net_outs = [mlp(i) for i in input_data]
    loss = sum(((result - target)**2 for target, result in zip(targets, net_outs)), Value(0))
    loss.backprop()
    for p in mlp.parameters(): # Move value in direction of gradient descent
        step_size = 0.5 # Small step size to avoid over stepping
        p.data += -p.grad * step_size # Step in direction of descent
        p.grad = 0 # Reset each parameters gradient to 0 to stop backprop from adding to last run's gradient

print(loss) # New loss 

Value object (Data: 0.00027476512146774347)


In [57]:
# Visually compare targets to outputs after training
print([mlp(v).data for v in input_data])
print(targets)

[0.9906714455268741, -0.990225840923678, -0.9923404647776453, 0.9942195374714783]
[1.0, -1.0, -1.0, 1.0]
