In [None]:
# задача 1

# Найти корни квадратного уравнения методом градиентного спуска
# x ** 2 - 5 * x + 4 = 0
# надо начать движение от начальной точки в направлении антиградиента с заданным шагом
# x = x - learning_rate * grad(x)
# всегда ли сойдемся за приемлемое количество шагов?
# важна ли начальная точка?
# как найти второй корень?
# как влияет ЛР?

In [2]:
import numpy as np

In [1]:
def f(x, b, c):
    """Evaluate the quadratic ``x**2 - b*x + c`` at ``x``.

    Note the sign convention: ``b`` is subtracted, so the equation
    ``x**2 - 5*x + 4 = 0`` corresponds to ``b=5, c=4``.
    """
    square_term = x**2
    linear_term = b*x
    return square_term - linear_term + c

def df(x, b, c):
    """Derivative with respect to ``x`` of the squared quadratic ``(x**2 - b*x + c)**2``.

    By the chain rule this is ``2 * q(x) * q'(x)`` with ``q(x) = x**2 - b*x + c``
    and ``q'(x) = 2*x - b``.
    """
    residual = x**2 - b*x + c
    residual_slope = 2*x - b
    return 2*residual*residual_slope

def gradient_descent(b, c, initial_x, learning_rate, epochs):
    """Search for a root of ``x**2 - b*x + c`` by gradient descent.

    Starting from ``initial_x``, repeatedly applies
    ``x = x - learning_rate * df(x, b, c)`` for at most ``epochs`` iterations.
    Stops early as soon as ``abs(f(x, b, c))`` drops below ``1e-6``.

    Returns the last value of ``x``; this is ``initial_x`` itself when
    ``epochs`` is 0. The result is not guaranteed to be a root if the
    iteration budget runs out first.
    """
    x = initial_x
    for _ in range(epochs):
        x = x - learning_rate * df(x, b, c)
        # The residual is checked after the update, so the returned x is the
        # first iterate that satisfies the tolerance.
        close_enough = abs(f(x, b, c)) < 1e-6
        if close_enough:
            return x
    return x

# Coefficients of the equation x**2 - b*x + c = 0, i.e. x**2 - 5*x + 4 = 0.
b, c = 5, 4

# Gradient-descent settings shared by both runs.
learning_rate, epochs = 0.01, 1000

# First root: start from x = 0.
initial_x = 0
root = gradient_descent(b, c, initial_x, learning_rate=learning_rate, epochs=epochs)
print(root)

# Second root: same settings, different starting point.
initial_x = 3
root2 = gradient_descent(b, c, initial_x, learning_rate=learning_rate, epochs=epochs)
print(root2)


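# The second root can be found by starting from a different initial point.
# The initial point strongly affects the result: it decides which root the descent approaches
# (starting exactly at x = b/2 = 2.5 gives a zero gradient, so the iteration never moves).
# The learning rate controls the step size: if it is too large, the iteration can overshoot the minimum of the error or diverge; if it is too small, convergence is slow and the root may not be reached within the given number of iterations.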


0.9999996969264429
3.999999672449243


In [None]:
# Задача 2

# Реализовать адаптивный оптимизатор с подстраивающимся LR

In [23]:
class AdagradOptimizer:
    """Adagrad-style optimizer with a per-element adaptive step size.

    Each parameter keeps a running sum of its squared gradients; every update
    divides the gradient by the square root of that sum, so elements that have
    seen large gradients take smaller steps.

    Attributes:
        learning_rate: base step size that multiplies every update.
        eps: constant ``1e-8`` added to the denominator so it is never zero.
        cache: list of accumulated squared gradients, one entry per parameter;
            ``None`` until the first call to ``step``.
    """

    def __init__(self, learning_rate=0.01):
        self.cache = None
        self.eps = 1e-8
        self.learning_rate = learning_rate

    def step(self, params, grads):
        """Apply one update to every entry of ``params``, in place.

        ``params`` and ``grads`` are indexable sequences of equal length;
        ``grads[i]`` is the gradient for ``params[i]``. Nothing is returned.
        """
        if self.cache is None:
            # Allocate the accumulators on first use, shaped like the parameters.
            self.cache = [np.zeros_like(p) for p in params]

        for idx in range(len(params)):
            g = grads[idx]
            self.cache[idx] += g ** 2
            denom = np.sqrt(self.cache[idx]) + self.eps
            params[idx] -= self.learning_rate * (g / denom)


In [27]:
# Smoke test: one Adagrad step on random parameters with random gradients.
param_shapes = [(10, 10), (10,)]
params = [np.random.randn(*shape) for shape in param_shapes]
grads = [np.random.randn(*p.shape) for p in params]

optimizer = AdagradOptimizer(learning_rate=0.1)
optimizer.step(params, grads)


In [31]:
import numpy as np

In [32]:
# Task 2
# Implement the forward and backward passes for a linear layer with sigmoid activation

In [33]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise by NumPy."""
    denominator = 1 + np.exp(-x)
    return 1. / denominator

def sigmoid_backward(da, x):
    """Backpropagate through the sigmoid.

    ``da`` is the gradient with respect to the sigmoid output and ``x`` is the
    pre-activation input. Returns ``da * s * (1 - s)`` with ``s = sigmoid(x)``.
    """
    s = sigmoid(x)
    scaled = da * s
    return scaled * (1 - s)

def relu(x):
    """Rectified linear unit: element-wise maximum of ``0`` and ``x``."""
    floor = 0.
    return np.maximum(floor, x)

def relu_backward(da, x):
    """Backpropagate through ReLU.

    Returns a copy of ``da`` in which every position where ``x <= 0`` is set
    to zero. The caller's ``da`` is left unmodified.
    """
    grad_out = np.array(da, copy=True)
    inactive = x <= 0
    grad_out[inactive] = 0
    return grad_out

In [34]:
def mse_loss(t, y):
    """Squared error between target ``t`` and prediction ``y`` (element-wise, no averaging)."""
    error = t - y
    return error ** 2

def d_mse_loss(t, y):
    """Derivative of ``(t - y) ** 2`` with respect to the prediction ``y``: ``2 * (y - t)``."""
    residual = y - t
    return 2 * residual


In [35]:
class LinearLayer:
    """Fully connected layer ``activ(w @ x + b)`` with manual backpropagation.

    Data is laid out column-wise: ``forward`` takes ``x`` of shape
    ``(n_inp, m)`` and returns ``(n_out, m)``, where ``m`` is the number of
    samples. ``backward`` averages the parameter gradients over ``m``.

    Attributes:
        w: weight matrix of shape ``(n_out, n_inp)``.
        b: bias column of shape ``(n_out, 1)``.
        activ: activation callable (``sigmoid`` or ``relu``) or ``None`` for
            a purely linear layer.
        inp, lin: input and pre-activation cached by the last ``forward``.
        d_w, d_b: gradients for ``w`` and ``b`` from the last ``backward``.
    """

    def __init__(self, n_inp, n_out, activation='sigmoid'):
        """Create the layer with small random weights.

        ``activation`` is ``'sigmoid'``, ``'relu'``, or ``'None'`` / ``None``
        for no activation. Any other value raises ``ValueError``.
        """
        self.w = np.random.randn(n_out, n_inp) * 0.1
        self.b = np.random.randn(n_out, 1) * 0.1
        # One single if/elif chain: with two separate `if` statements the
        # 'sigmoid' case fell through to the final `else` and always raised.
        if activation == 'sigmoid':
            self.activ = sigmoid
        elif activation == 'relu':
            self.activ = relu
        elif activation is None or activation == 'None':
            self.activ = None
        else:
            raise ValueError(f'Unknown activation "{activation}"')
        self._clear_state()

    def _clear_state(self):
        """Drop the cached forward values and the stored gradients."""
        self.lin = None
        self.inp = None
        self.d_w = None
        self.d_b = None

    def forward(self, x):
        """Compute the layer output for ``x`` and cache what ``backward`` needs."""
        # Store an ndarray so backward can use .shape and .T even when the
        # caller passed a nested list.
        x = np.asarray(x)
        self.inp = x
        self.lin = np.dot(self.w, x) + self.b
        if self.activ is None:
            return self.lin
        return self.activ(self.lin)

    def backward(self, grad):
        """Backpropagate ``grad`` (dL/d output) through the layer.

        Stores the batch-averaged gradients in ``d_w`` and ``d_b`` and returns
        dL/d input. Must be called after ``forward``.
        """
        # dL/d lin = grad * d activ / d lin. The branches are mutually
        # exclusive; previously the sigmoid result was overwritten by the
        # trailing `else`.
        if self.activ is sigmoid:
            grad_lin = sigmoid_backward(grad, self.lin)
        elif self.activ is relu:
            grad_lin = relu_backward(grad, self.lin)
        else:
            grad_lin = grad

        m = self.inp.shape[1]  # number of samples (columns)
        # dL/d w and dL/d b, averaged over the batch.
        self.d_w = grad_lin @ self.inp.T / m
        self.d_b = np.sum(grad_lin, axis=1, keepdims=True) / m

        # dL/d input, passed on to the previous layer.
        return np.dot(self.w.T, grad_lin)

In [36]:
from typing import Tuple

class Model:
    """Sequential stack of ``LinearLayer`` objects.

    ``arch`` is a sequence of ``(n_inp, n_out)`` pairs, one per layer, in
    forward order. Every layer except the last uses ``activation``; the last
    layer is created with ``activation='None'`` (no activation).
    """

    def __init__(self, arch: Tuple[Tuple[int, int], ...], activation):
        self.layers = []
        for idx, dims in enumerate(arch):
            is_last = idx >= len(arch) - 1
            self.layers.append(
                LinearLayer(dims[0], dims[1],
                            activation='None' if is_last else activation)
                )
        self._clear_state()

    def _clear_state(self):
        """Reset cached activations and gradients in every layer."""
        for layer in self.layers:
            layer._clear_state()

    def forward(self, x):
        """Feed ``x`` through the layers in order and return the final output."""
        for layer in self.layers:
            x = layer.forward(x)

        return x

    def backward(self, grad):
        """Propagate ``grad`` (dL/d output) through the layers in reverse order.

        Each layer's ``backward`` is called with the gradient returned by the
        layer after it. Returns the gradient with respect to the model input.
        """
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

        return grad

In [None]:
# Task 3
# Implement the SGD optimizer with momentum:
# velocity = momentum * velocity - lr * gradient
# w = w + velocity

In [37]:
# Optimizer for the whole model
class SGDMomentum:
    """SGD with momentum over every layer of a ``Model``.

    For each layer's ``w`` and ``b`` the optimizer keeps a velocity array and
    applies ``velocity = momentum * velocity - lr * gradient`` followed by
    ``param += velocity``.

    Attributes:
        model: the model whose layers are updated.
        lr: learning rate.
        m: momentum coefficient.
        vel: one ``[velocity_w, velocity_b]`` pair per layer, initially zeros.
    """

    def __init__(self, model: Model, lr=0.0001, momentum=0.9):
        self.model = model
        self.lr = lr
        self.m = momentum
        self.vel = []
        for layer in model.layers:
            self.vel.append([np.zeros_like(layer.w), np.zeros_like(layer.b)])

    def step(self):
        """Update the velocities from ``d_w`` / ``d_b`` and move the parameters in place."""
        for idx, layer in enumerate(self.model.layers):
            slot = self.vel[idx]
            slot[0] = slot[0] * self.m - self.lr * layer.d_w
            slot[1] = slot[1] * self.m - self.lr * layer.d_b
            layer.w += slot[0]
            layer.b += slot[1]

    def zero_grad(self):
        """Clear the model's cached state, which includes the stored gradients."""
        self.model._clear_state()

In [38]:
# Synthetic regression data: 20000 inputs drawn uniformly from [-3, 3) with
# targets y = x**2 plus Gaussian noise of standard deviation 0.01.
x = np.random.uniform(-3, 3, 20000)
# Draw one noise value per sample. A bare np.random.randn() returns a single
# scalar, which shifts every target by the same constant instead of adding noise.
y = x**2 + np.random.randn(*x.shape) * 0.01


In [39]:
# Train a 1 -> 100 -> 1 network (ReLU hidden layer, linear output) on the
# (x, y) samples, one sample at a time, for 20 passes over the data.
model = Model(((1, 100), (100, 1)), activation='relu')
optim = SGDMomentum(model, lr=1e-5)
for e in range(20):
    # Progress report: predictions at x = 1, 2, -1, -2 before this pass.
    probes = [model.forward([[point]]) for point in (1, 2, -1, -2)]
    print(e, *probes)
    for i, (val, t) in enumerate(zip(x, y)):
        optim.zero_grad()
        sample = np.array([[val]])  # shape (1, 1): one feature, one sample
        pred = model.forward(sample)
        loss = mse_loss(t, pred)  # computed but not used by the update
        grad = d_mse_loss(t, pred)
        model.backward(grad)
        optim.step()



0 [[-0.24843854]] [[-0.27199549]] [[-0.26851293]] [[-0.31483425]]
1 [[1.64026474]] [[4.52778064]] [[1.65116296]] [[4.46503956]]
2 [[1.24338431]] [[4.52817876]] [[1.24530243]] [[4.47272861]]
3 [[1.05992496]] [[4.49491844]] [[1.04920875]] [[4.44726259]]
4 [[1.0092588]] [[4.45030793]] [[0.99818621]] [[4.41002083]]
5 [[0.97891121]] [[4.40243613]] [[0.97327746]] [[4.3695646]]
6 [[0.99211186]] [[4.35589742]] [[0.97500115]] [[4.3294755]]
7 [[1.00285023]] [[4.31307741]] [[0.98833596]] [[4.29140304]]
8 [[1.00830709]] [[4.2739728]] [[0.99470663]] [[4.25596491]]
9 [[1.01063024]] [[4.23803633]] [[0.99672082]] [[4.22354877]]
10 [[1.01076731]] [[4.20514596]] [[0.99760031]] [[4.19387706]]
11 [[1.00933609]] [[4.17509889]] [[0.99721281]] [[4.16658558]]
12 [[1.00680766]] [[4.14756277]] [[0.99538112]] [[4.14126407]]
13 [[1.00597036]] [[4.12245766]] [[0.99251821]] [[4.11794587]]
14 [[1.00841661]] [[4.09944817]] [[0.98900346]] [[4.09603175]]
15 [[1.01048633]] [[4.07835959]] [[0.9851544]] [[4.07549491]]
16 

In [40]:
# Predictions after training; 103 lies far outside the sampled range [-3, 3].
print(e, *[model.forward([[point]]) for point in (1, 2, -1, 103)])

19 [[1.01600192]] [[4.01714374]] [[0.96643764]] [[483.9757612]]
