In [1]:
import numpy as np

import math
import matplotlib.pyplot as plt

In [2]:
class Parameter:
    def __init__(self, value: float, name: str, _children=()) -> None:
        self._value = value
        self._name = name
        self._backward = lambda: None
        self._prev = set(_children)
        self._grad = 0.0

    def __repr__(self) -> str:
        return f"Parameter {self._name} = {self._value}; dL/d[{self._name}] = {self._grad}"

    def __mul__(self, other):
        other = other if isinstance(other, Parameter) else Parameter(other._value, "const", ())
        result = Parameter(
            self._value * other._value,
            f'{self._name} * {other._name}',
            _children=(self, other)
        )

        def _backward():
            self._grad += other._value * result._grad
            other._grad += self._value * result._grad

        result._backward = _backward

        return result

    def __pow__(self, power):
        result = Parameter(
            self._value**power, 
            f"{self._name}**{power}",
            _children=(self,)
        )

        def _backward():
            self._grad += power * (self._value ** (power - 1)) * result._grad

        result._backward = _backward

        return result

    def __add__(self, other):
        other = other if isinstance(other, Parameter) else Parameter(other._value, "const", ())
        result = Parameter(
            self._value + other._value,
            f'[{self._name} + {other._name}]',
            _children=(self, other)
        )

        def _backward():
            self._grad += 1.0 * result._grad
            other._grad += 1.0 * result._grad

        result._backward = _backward

        return result

    def __neg__(self): 
        return self * -1

    def __sub__(self, other): 
        return self + (-other)

    def sigmoid(self):
        val = 1.0 / (1.0 + math.exp(-self._value))

        result = Parameter(
            val,
            f"σ({self._name})",
            _children=(self,)
        )

        def _backward():
            self._grad += result._grad * val * (1 - val)

        result._backward = _backward

        return result

    def backward(self):
        topo_sort = []
        visited_nodes = set()
        def sort_topo(node):
            if node not in visited_nodes:
                visited_nodes.add(node)
                for child in node._prev:
                    sort_topo(child)
                topo_sort.append(node)
        sort_topo(self)
        
        self._grad = 1.0
        for node in reversed(topo_sort):
            node._backward()

    def softplus(self):
        val = np.log(1 + np.exp(self._value))
        result = Parameter(
            val,
            f"softplus({self._name})",
            _children=(self,)
        )

        def _backward():
            self._grad += result._grad * (1.0 / (1.0 + np.exp(-self._value)))

        result._backward = _backward

        return result

    def mish(self):
        softplus_result = self.softplus()
        tanh_val = np.tanh(softplus_result._value)
        val = self._value * tanh_val

        result = Parameter(
            val,
            f"mish({self._name})",
            _children=(self, softplus_result)
        )

        def _backward():
            sigmoid_val = 1.0 / (1.0 + np.exp(-softplus_result._value))
            sech_sqr = 1 / np.cosh(softplus_result._value)**2
            mish_grad = tanh_val + self._value * sech_sqr * sigmoid_val * (1 - tanh_val**2)
            self._grad += result._grad * mish_grad

        result._backward = _backward

        return result


def gd(learning_rate: float, *parameters: Parameter) -> None:
    for parameter in parameters:
        parameter._value -= learning_rate * parameter._grad
        parameter._grad = 0
    

## Backward method test

In [3]:
a = Parameter(3.0, 'a')
b = Parameter(2.0, 'b')
c = Parameter(5.0, 'c')
d = Parameter(5.0, 'd')
u = a * b
v = u + c
L = v * d
L.backward()
print(L)
print(v)
print(u)
print(d)
print(c)
print(b)
print(a)

Parameter [a * b + c] * d = 55.0; dL/d[[a * b + c] * d] = 1.0
Parameter [a * b + c] = 11.0; dL/d[[a * b + c]] = 5.0
Parameter a * b = 6.0; dL/d[a * b] = 5.0
Parameter d = 5.0; dL/d[d] = 11.0
Parameter c = 5.0; dL/d[c] = 5.0
Parameter b = 2.0; dL/d[b] = 15.0
Parameter a = 3.0; dL/d[a] = 10.0


In [4]:
x1 = Parameter(3.0, 'x1')
x2 = Parameter(4.0, 'x2')

w1 = Parameter(1.0, 'w1')
w2 = Parameter(2.0, 'w2')

x1w1 = x1 * w1
x2w2 = x2 * w2
xw = x1w1 + x2w2
out = xw.mish()
out.backward()
print(out._grad)
print(xw._grad)
print(x2w2._grad)
print(x1w1._grad)
print(x1._grad)
print(w1._grad)
print(x2._grad)
print(w2._grad)

1.0
0.0001837248450104483
0.0
0.0
0.0
0.0
0.0
0.0


## Gradient descent test

In [5]:
W = Parameter(0.5, 'W') 
b = Parameter(0.1, 'b') 
x = Parameter(0.8, 'x') 
learning_rate = 0.001
n_epochs = 25
target = 0.4
for n in range(n_epochs):
    y = (W * x).sigmoid() + b  
    gd(learning_rate, W, b)
    loss = (y._value - target) ** 2
    y.backward()
    print(f"loss after {n} epochs :{loss}")


loss after 0 epochs :0.08921431830345163
loss after 1 epochs :0.08921431830345163
loss after 2 epochs :0.08861794298322671
loss after 3 epochs :0.08743119234277691
loss after 4 epochs :0.0856660663821022
loss after 5 epochs :0.08334056510120258
loss after 6 epochs :0.08047868850007805
loss after 7 epochs :0.07711043657872863
loss after 8 epochs :0.0732718093371543
loss after 9 epochs :0.06900480677535506
loss after 10 epochs :0.06435742889333093
loss after 11 epochs :0.05938367569108188
loss after 12 epochs :0.05414354716860799
loss after 13 epochs :0.048703043325909134
loss after 14 epochs :0.04313416416298538
loss after 15 epochs :0.03751490967983672
loss after 16 epochs :0.03192927987646316
loss after 17 epochs :0.02646727475286469
loss after 18 epochs :0.021224894309041318
loss after 19 epochs :0.016304138544993043
loss after 20 epochs :0.011813007460719867
loss after 21 epochs :0.007865501056221798
loss after 22 epochs :0.004581619331498822
loss after 23 epochs :0.0020873622865509

In [6]:
print((W * x).sigmoid() + b)

Parameter [σ(W * x) + b] = 0.398687660112452; dL/d[[σ(W * x) + b]] = 0.0
