In [12]:
import numpy as np

import math
import matplotlib.pyplot as plt

In [13]:
class Parameter:
    def __init__(self, value: float, name: str, _children=()) -> None:
        self._value = value
        self._name = name
        self._backward = lambda: None
        self._prev = set(_children)
        self._grad = 0.0

    def __repr__(self) -> str:
        return f"Parameter {self._name} = {self._value}; dL/d[{self._name}] = {self._grad}"

    def __mul__(self, other):
        result = Parameter(
            self._value * other._value,
            f'{self._name} * {other._name}',
            _children=(self, other)
        )

        def _backward():
            self._grad += other._value * result._grad
            other._grad += self._value * result._grad

        result._backward = _backward

        return result

    def __add__(self, other):
        result = Parameter(
            self._value + other._value,
            f'[{self._name} + {other._name}]',
            _children=(self, other)
        )

        def _backward():
            self._grad += 1.0 * result._grad
            other._grad += 1.0 * result._grad

        result._backward = _backward

        return result

    def sigmoid(self):
        val = 1.0 / (1.0 + math.exp(-self._value))

        result = Parameter(
            val,
            f"σ({self._name})"
        )

        def _backward():
            self._grad += result._grad * val * (1 - val)

        result._backward = _backward

        return result
    
    def backward(self):
        topo_sort = []
        visited_nodes = set()
        def sort_topo(node):
            if node not in visited_nodes:
                visited_nodes.add(node)
                for child in node._prev:
                    sort_topo(child)
                topo_sort.append(node)
        sort_topo(self)
        
        self._grad = 1.0
        for node in reversed(topo_sort):
            node._backward()
        
    def softplus(self):
        val = np.log(1 + np.exp(self._value))
        result = Parameter(
            val,
            f"softplus({self._name})"
        )
        
        def _backward():
            self._grad += result._grad * (1.0 / (1.0 + np.exp(-self._value)))

        result._backward = _backward

        return result


    def mish(self):
        softplus_result = self.softplus()
        softplus_val = softplus_result._value
        tanh_val = np.tanh(softplus_val)
        sech_sqr = 1 / np.cosh(softplus_val)**2

        val = self._value * tanh_val

        result = Parameter(
            val,
            f"mish({self._name})"
        )

        def _backward():
            sigmoid_val = softplus_result.sigmoid()._value
            mish_grad = self._value * (sech_sqr * sigmoid_val + tanh_val * (1 - sigmoid_val))
            self._grad += result._grad * mish_grad

        result._backward = _backward

        return result


def gd(learning_rate: float, *parameters: Parameter) -> None:
    for parameter in parameters:
        parameter._value -= learning_rate * parameter._grad
    

In [14]:
a = Parameter(3.0, 'a')
b = Parameter(2.0, 'b')
c = Parameter(5.0, 'c')
d = Parameter(5.0, 'd')
u = a * b
v = u + c
L = v * d
L.backward()
print(L)
print(v)
print(u)
print(d)
print(c)
print(b)
print(a)

Parameter [a * b + c] * d = 55.0; dL/d[[a * b + c] * d] = 1.0
Parameter [a * b + c] = 11.0; dL/d[[a * b + c]] = 5.0
Parameter a * b = 6.0; dL/d[a * b] = 5.0
Parameter d = 5.0; dL/d[d] = 11.0
Parameter c = 5.0; dL/d[c] = 5.0
Parameter b = 2.0; dL/d[b] = 15.0
Parameter a = 3.0; dL/d[a] = 10.0


In [15]:
x1 = Parameter(3.0, 'x1')
x2 = Parameter(4.0, 'x2')

w1 = Parameter(1.0, 'w1')
w2 = Parameter(2.0, 'w2')

x1w1 = x1 * w1
x2w2 = x2 * w2
xw = x1w1 + x2w2
out = xw.mish()
out.backward()
print(out._grad)
print(xw._grad)
print(x2w2._grad)
print(x1w1._grad)
print(x1._grad)
print(w1._grad)
print(x2._grad)
print(w2._grad)

1.0
0.0001837248450104483
0.0
0.0
0.0
0.0
0.0
0.0
