In [68]:
import numpy as np
import math

In [69]:
class Parameter:
    """Scalar node of a reverse-mode automatic-differentiation graph.

    Every arithmetic operation or activation returns a new ``Parameter`` that
    remembers its operands (``_prev``) and a closure (``_backward``) which
    pushes the node's gradient into those operands. Calling ``backward()`` on
    the final node runs the closures in reverse topological order.

    Attributes are private by convention:
        _value:    the scalar held by the node.
        _name:     label used in ``repr`` and composed into derived names.
        _grad:     accumulated derivative of the ``backward()`` root w.r.t. this node.
        _backward: closure propagating ``_grad`` to the operands (no-op for leaves).
        _prev:     list of operand nodes (empty for leaves).
    """

    def __init__(self, value: float, name: str) -> None:
        """Create a leaf node holding ``value`` and labelled ``name``."""
        self._value = value
        self._name = name

        self._grad = 0.0
        self._backward = lambda: None
        self._prev = []

    def __repr__(self) -> str:
        return f"Parameter {self._name} = {self._value}; dL/d[{self._name}] = {self._grad}"

    @staticmethod
    def _stable_sigmoid(x: float) -> float:
        """Return the logistic function of ``x`` without overflowing.

        ``math.exp`` is only ever called with a non-positive argument, so it can
        underflow to 0.0 but never raise ``OverflowError``. For ``x >= 0`` the
        expression is the textbook ``1 / (1 + exp(-x))``.
        """
        if x >= 0:
            return 1.0 / (1.0 + math.exp(-x))
        z = math.exp(x)
        return z / (1.0 + z)

    def __mul__(self, other: 'Parameter') -> 'Parameter':
        """Return the product node ``self * other``."""
        result = Parameter(
            self._value * other._value,
            f'{self._name} * {other._name}'
        )

        def _backward():
            # d(x*y)/dx = y and d(x*y)/dy = x; += so that shared operands accumulate.
            self._grad += other._value * result._grad
            other._grad += self._value * result._grad

        result._backward = _backward
        result._prev = [self, other]

        return result

    def __add__(self, other: 'Parameter') -> 'Parameter':
        """Return the sum node ``self + other``."""
        result = Parameter(
            self._value + other._value,
            f'[{self._name} + {other._name}]'
        )

        def _backward():
            # Addition passes the upstream gradient through unchanged to both operands.
            self._grad += 1.0 * result._grad
            other._grad += 1.0 * result._grad

        result._backward = _backward
        result._prev = [self, other]

        return result

    def sigmoid(self) -> 'Parameter':
        """Return the node ``σ(self)`` where ``σ`` is the logistic function."""
        val = self._stable_sigmoid(self._value)

        result = Parameter(
            val,
            f"σ({self._name})"
        )

        def _backward():
            # σ'(x) = σ(x) * (1 - σ(x)), reusing the forward value.
            self._grad += result._grad * val * (1 - val)

        result._backward = _backward
        result._prev = [self]

        return result

    def relu(self) -> 'Parameter':
        """Return the node ``max(0, self)``; the gradient at exactly 0 is taken as 0."""
        val = max(0, self._value)

        result = Parameter(
            val,
            f"ReLU({self._name})"
        )

        def _backward():
            self._grad += result._grad * (1.0 if self._value > 0 else 0.0)

        result._backward = _backward
        result._prev = [self]

        return result

    def silu(self) -> 'Parameter':
        """Return the node ``self * σ(self)`` (SiLU activation)."""
        sigmoid = self._stable_sigmoid(self._value)
        val = self._value * sigmoid

        result = Parameter(
            val,
            f"SiLU({self._name})"
        )

        def _backward():
            # d/dx [x·σ(x)] = σ(x) · (1 + x · (1 - σ(x)))
            self._grad += result._grad * (sigmoid * (1 + self._value * (1 - sigmoid)))

        result._backward = _backward
        result._prev = [self]

        return result

    def _topological_order(self) -> list:
        """Return every node reachable from ``self``, operands before results.

        Depth-first post-order with an explicit stack instead of recursion, so
        the depth of the graph is not bounded by the interpreter recursion
        limit. Each stack entry keeps a live iterator over the node's operands,
        which lets the walk resume where it left off after descending.
        """
        visited = {self}
        order = []
        stack = [(self, iter(self._prev))]

        while stack:
            node, operands = stack[-1]
            for operand in operands:
                if operand not in visited:
                    visited.add(operand)
                    stack.append((operand, iter(operand._prev)))
                    break
            else:
                # All operands are already emitted, so the node itself can follow.
                order.append(node)
                stack.pop()

        return order

    def backward(self) -> None:
        """Back-propagate from this node through the whole graph below it.

        The gradient of this node is seeded with 1.0. Gradients of interior
        (computed) nodes are reset first, so a repeated call does not
        re-propagate stale upstream values; gradients of leaf nodes keep
        accumulating across calls until they are cleared explicitly.
        """
        order = self._topological_order()

        for node in order:
            if node._prev:
                node._grad = 0.0

        self._grad = 1.0
        for node in reversed(order):
            node._backward()

def sgd(parameters, learning_rate):
    """Apply one gradient-descent step to every item of ``parameters``.

    Each item's ``_value`` is moved against its ``_grad`` scaled by
    ``learning_rate``, after which the item's ``_grad`` is reset to 0.0 so the
    next backward pass starts from a clean accumulator.
    """
    for p in parameters:
        step = learning_rate * p._grad
        p._value = p._value - step
        p._grad = 0.0


# Імплементувати автоматичний backpropagation (hint: topological sort, dfs)

In [70]:
# d = a * b + a, so dd/da = b + 1 = 4 and dd/db = a = 2.
a, b = Parameter(2.0, "a"), Parameter(3.0, "b")
c = a * b
d = c + a

d.backward()

for node in (a, b, c, d):
    print(node)

Parameter a = 2.0; dL/d[a] = 4.0
Parameter b = 3.0; dL/d[b] = 2.0
Parameter a * b = 6.0; dL/d[a * b] = 1.0
Parameter [a * b + a] = 8.0; dL/d[[a * b + a]] = 1.0


In [71]:
# σ'(0) = σ(0) * (1 - σ(0)) = 0.25 is the gradient expected on `a`.
a = Parameter(0.0, "a")
b = a.sigmoid()

b.backward()

print(a, b, sep="\n")

Parameter a = 0.0; dL/d[a] = 0.25
Parameter σ(a) = 0.5; dL/d[σ(a)] = 1.0


In [72]:
# e = σ(a * b) + a: `a` reaches the result along two paths, so its gradient
# is the sum of both contributions.
a, b = Parameter(1.0, "a"), Parameter(2.0, "b")
c = a * b
d = c.sigmoid()
e = d + a

e.backward()

for node in (a, b, c, d, e):
    print(node)

Parameter a = 1.0; dL/d[a] = 1.2099871708070133
Parameter b = 2.0; dL/d[b] = 0.10499358540350662
Parameter a * b = 2.0; dL/d[a * b] = 0.10499358540350662
Parameter σ(a * b) = 0.8807970779778823; dL/d[σ(a * b)] = 1.0
Parameter [σ(a * b) + a] = 1.8807970779778822; dL/d[[σ(a * b) + a]] = 1.0


# Імплементувати ті активації (2 з них), котрі ви обрали у lab 1 (ReLU, SiLU)

In [73]:
# ReLU on one negative and one positive input.
a, b = Parameter(-1.0, "a"), Parameter(2.0, "b")
c, d = a.relu(), b.relu()

# Forward values, printed before any gradient has been propagated.
print(c, d, sep="\n")

# The two graphs are independent, so each output gets its own backward pass.
for output in (c, d):
    output.backward()

print(a, b, sep="\n")

Parameter ReLU(a) = 0; dL/d[ReLU(a)] = 0.0
Parameter ReLU(b) = 2.0; dL/d[ReLU(b)] = 0.0
Parameter a = -1.0; dL/d[a] = 0.0
Parameter b = 2.0; dL/d[b] = 1.0


In [74]:
# Sigmoid at the origin: show the forward value, then the gradient on the input.
a = Parameter(value=0.0, name="a")
b = a.sigmoid()
print(b)

b.backward()
print(a)

Parameter σ(a) = 0.5; dL/d[σ(a)] = 0.0
Parameter a = 0.0; dL/d[a] = 0.25


In [75]:
# SiLU at x = 1: show the forward value, then the gradient on the input.
a = Parameter(value=1.0, name="a")
b = a.silu()
print(b)

b.backward()
print(a)

Parameter SiLU(a) = 0.7310585786300049; dL/d[SiLU(a)] = 0.0
Parameter a = 1.0; dL/d[a] = 0.9276705118714869


# Імплементувати gradient descent як окрему функцію

In [76]:
# One SGD step on a hand-set gradient: 1.0 - 0.1 * 2.0 = 0.8, and the
# gradient is cleared afterwards.
a = Parameter(value=1.0, name="a")
a._grad = 2.0

sgd(parameters=[a], learning_rate=0.1)

print(a)

Parameter a = 0.8; dL/d[a] = 0.0


In [77]:
# Same step for two parameters at once; each one moves by 0.1 * its own gradient.
a, b = Parameter(1.0, "a"), Parameter(2.0, "b")
a._grad, b._grad = 2.0, 1.0

sgd(parameters=[a, b], learning_rate=0.1)

print(a, b, sep="\n")

Parameter a = 0.8; dL/d[a] = 0.0
Parameter b = 1.9; dL/d[b] = 0.0


In [78]:
# Full round trip: build e = σ(a * b) + a, back-propagate, then take one SGD step.
a, b = Parameter(1.0, "a"), Parameter(2.0, "b")
c = a * b
d = c.sigmoid()
e = d + a

e.backward()

# NOTE(review): c, d and e are derived nodes, yet they are stepped here as
# well, so after the update their values no longer equal a * b, σ(c) and
# d + a. Confirm this is intended for the demonstration.
parameters = [a, b, c, d, e]
sgd(parameters=parameters, learning_rate=0.1)

for param in parameters:
    print(param)

Parameter a = 0.8790012829192987; dL/d[a] = 0.0
Parameter b = 1.9895006414596494; dL/d[b] = 0.0
Parameter a * b = 1.9895006414596494; dL/d[a * b] = 0.0
Parameter σ(a * b) = 0.7807970779778823; dL/d[σ(a * b)] = 0.0
Parameter [σ(a * b) + a] = 1.7807970779778821; dL/d[[σ(a * b) + a]] = 0.0
