1. Написать на PyTorch forward и backward полносвязного слоя без использования autograd
2. Написать 1-2 адаптивных оптимизатора
3. Решить задачу нахождения корней квадратного уравнения методом градиентного спуска

In [1]:
import torch
import numpy as np

In [2]:
def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))`` of *x*."""
    denominator = 1 + torch.exp(-x)
    return 1 / denominator

def sigmoid_backward(da, x):
    """Propagate the gradient *da* back through a sigmoid applied to *x*.

    Args:
        da: Gradient with respect to the sigmoid output.
        x: Pre-activation values that were fed to ``sigmoid``.

    Returns:
        ``da`` multiplied by the sigmoid derivative ``s * (1 - s)``, where
        ``s = sigmoid(x)``.
    """
    activation = sigmoid(x)
    complement = 1 - activation
    return da * activation * complement

def relu(x):
    """Return the element-wise maximum of zero and *x*."""
    floor = torch.zeros_like(x)
    return torch.maximum(floor, x)

def relu_backward(da, x):
    """Propagate the gradient *da* back through a ReLU applied to *x*.

    Args:
        da: Gradient with respect to the ReLU output. A tensor, or anything
            ``torch.as_tensor`` accepts.
        x: Pre-activation tensor that was fed to ``relu``; it is used as a
            boolean mask over ``da``.

    Returns:
        A new tensor equal to ``da`` with the entries at positions where
        ``x <= 0`` set to zero. The caller's ``da`` is left untouched.
    """
    # detach().clone() produces the same independent copy torch.tensor(da)
    # did, without the UserWarning PyTorch emits when torch.tensor() is
    # given an existing tensor.
    grad = torch.as_tensor(da).detach().clone()
    grad[x <= 0] = 0
    return grad

In [3]:
def mse_loss(t, y):
    """Return the element-wise squared error ``(t - y) ** 2``.

    No reduction (mean or sum) is applied; the result has the broadcast
    shape of *t* and *y*.
    """
    residual = t - y
    return torch.pow(residual, 2)

def d_mse_loss(t, y):
    """Return ``2 * (y - t)``, the derivative of ``(t - y) ** 2`` with respect to *y*."""
    delta = y - t
    return 2 * delta

PyTorch forward и backward полносвязного слоя без использования autograd

In [4]:
class Layer:
    """Fully connected layer with hand-written forward and backward passes.

    Data is laid out column-wise: ``forward`` multiplies the ``(n_out, n_inp)``
    weight matrix by its input and ``backward`` reads the number of samples
    from the input's second dimension, so inputs are expected to be 2-D with
    one sample per column.

    Args:
        n_inp: Number of input features.
        n_out: Number of output units.
        activation: Optional callable applied to the linear output. Only the
            module-level ``sigmoid`` and ``relu`` have a matching derivative
            in ``backward``.
    """

    def __init__(self, n_inp=10, n_out=1, activation=None):
        # Parameters start as uniform samples scaled into [0, 0.1).
        self.w = 0.1 * torch.rand(n_out, n_inp)
        self.b = 0.1 * torch.rand(n_out, 1)
        self.n_inp, self.n_out = n_inp, n_out
        self.activ = activation
        self._clear_state()

    def _clear_state(self):
        """Forget the cached forward values and the stored gradients."""
        self.lin = self.inp = None
        self.d_w = self.d_b = None

    def forward(self, input_):
        """Compute ``activation(w @ input_ + b)`` and cache what ``backward`` needs.

        The input and the pre-activation result are stored on the instance as
        ``inp`` and ``lin``.
        """
        self.inp = input_
        weight_type, input_type = type(self.w), type(self.inp)
        if weight_type != input_type:
            # Diagnostic only: report the mismatch and carry on.
            print(weight_type, input_type)
        self.lin = self.w @ self.inp + self.b
        if self.activ:
            return self.activ(self.lin)
        return self.lin

    def backward(self, grad):
        """Store parameter gradients and return the gradient for the input.

        Args:
            grad: Gradient of the loss with respect to this layer's output.

        Returns:
            ``w.T @ grad_lin``, the gradient with respect to the layer input,
            where ``grad_lin`` is *grad* passed back through the activation.

        ``d_w`` and ``d_b`` are divided by the number of input columns; the
        returned gradient is not.
        """
        # Any activation other than sigmoid/relu is treated as the identity.
        grad_lin = grad
        if self.activ == sigmoid:
            grad_lin = sigmoid_backward(grad, self.lin)
        elif self.activ == relu:
            grad_lin = relu_backward(grad, self.lin)

        n_samples = self.inp.shape[1]
        self.d_w = grad_lin @ self.inp.t() / n_samples
        self.d_b = grad_lin.sum(dim=1, keepdim=True) / n_samples

        return self.w.t() @ grad_lin

In [5]:
# Example objects: a sigmoid layer with 1 input and 10 outputs, and a single
# input value shaped (1, 1). Neither name is used again at top level in the
# cells visible here.
layer = Layer(1, 10, sigmoid)
input_ = torch.tensor([[1.]])

In [6]:
from typing import Callable

class Network:
    """Sequential stack of ``Layer`` objects.

    Args:
        arch: One ``(n_inp, n_out)`` pair per layer, in forward order.
        activation: Activation given to every layer except the last one,
            which is always created without an activation.
    """

    def __init__(self, arch: tuple[tuple[int, int]], activation: Callable = None):
        last_index = len(arch) - 1
        self.layers = []
        for index, spec in enumerate(arch):
            layer_activation = activation if index < last_index else None
            self.layers.append(Layer(spec[0], spec[1], activation=layer_activation))
        self._clear_state()

    def _clear_state(self):
        """Clear the cached values and gradients of every layer."""
        for member in self.layers:
            member._clear_state()

    def forward(self, x):
        """Feed *x* through the layers in order and return the final output."""
        out = x
        for member in self.layers:
            out = member.forward(out)
        return out

    def backward(self, grad):
        """Send *grad* through the layers last-to-first and return the input gradient."""
        upstream = grad
        for member in self.layers[::-1]:
            upstream = member.backward(upstream)
        return upstream

Адаптивный оптимизатор RMSprop

In [7]:
class RMSprop:
    """RMSprop optimizer for a model built from hand-written layers.

    Keeps an exponential moving average of the squared gradient for every
    layer's weight and bias, and divides the learning rate by the square
    root of that average before each update.

    Args:
        model: Object exposing ``layers`` (each with ``w``, ``b``, ``d_w`` and
            ``d_b`` tensors) and a ``_clear_state()`` method.
        rho: Decay factor of the squared-gradient average.
        lr: Base learning rate.
        eps: Small constant added to the denominator. Without it a zero
            running average (for example an all-zero gradient on the first
            step) gives ``lr / 0 * 0 = nan`` and corrupts the parameters.
    """

    def __init__(self, model: "Network", rho=0.9, lr=0.01, eps=1e-8):
        self.model = model
        self.rho = rho
        self.lr = lr
        self.eps = eps
        # One [weight_average, bias_average] pair per layer.
        self.accum = [[torch.zeros_like(layer.w),
                       torch.zeros_like(layer.b)] for layer in model.layers]

    def step(self):
        """Update every layer in place from its stored ``d_w`` / ``d_b``.

        Must be called after the model's ``backward`` has filled the
        gradients in.
        """
        for i, layer in enumerate(self.model.layers):
            state = self.accum[i]
            state[0] = self.rho * state[0] + (1 - self.rho) * layer.d_w**2
            state[1] = self.rho * state[1] + (1 - self.rho) * layer.d_b**2
            adapt_lr_w = self.lr / (torch.sqrt(state[0]) + self.eps)
            adapt_lr_b = self.lr / (torch.sqrt(state[1]) + self.eps)
            layer.w -= adapt_lr_w * layer.d_w
            layer.b -= adapt_lr_b * layer.d_b

    def zero_grad(self):
        """Reset the model's cached activations and gradients."""
        self.model._clear_state()

Адаптивный оптимизатор Adam

In [8]:
class Adam:
    """Adam-style optimizer for a model built from hand-written layers.

    Tracks an exponential moving average of the gradient (``vel``) and of the
    squared gradient (``accum``) for every layer's weight and bias. Each
    update moves a parameter by ``lr * vel / (sqrt(accum) + eps)``.

    NOTE: unlike the reference Adam algorithm, no bias correction is applied
    to either moving average.

    Args:
        model: Object exposing ``layers`` (each with ``w``, ``b``, ``d_w`` and
            ``d_b`` tensors) and a ``_clear_state()`` method.
        beta1: Decay factor of the gradient average.
        beta2: Decay factor of the squared-gradient average.
        lr: Base learning rate.
        eps: Small constant added to the denominator. Without it a zero
            squared-gradient average gives ``lr / 0 * 0 = nan`` and corrupts
            the parameters.
    """

    def __init__(self, model: "Network", beta1=0.9, beta2=0.9, lr=0.01, eps=1e-8):
        self.model = model
        self.beta1 = beta1
        self.beta2 = beta2
        self.lr = lr
        self.eps = eps
        # One [weight_state, bias_state] pair per layer for each average.
        self.accum = [[torch.zeros_like(layer.w),
                       torch.zeros_like(layer.b)] for layer in model.layers]
        self.vel = [[torch.zeros_like(layer.w),
                     torch.zeros_like(layer.b)] for layer in model.layers]

    def step(self):
        """Update every layer in place from its stored ``d_w`` / ``d_b``.

        Must be called after the model's ``backward`` has filled the
        gradients in.
        """
        for i, layer in enumerate(self.model.layers):
            first, second = self.vel[i], self.accum[i]
            first[0] = self.beta1 * first[0] + (1 - self.beta1) * layer.d_w
            first[1] = self.beta1 * first[1] + (1 - self.beta1) * layer.d_b
            second[0] = self.beta2 * second[0] + (1 - self.beta2) * layer.d_w**2
            second[1] = self.beta2 * second[1] + (1 - self.beta2) * layer.d_b**2
            adapt_lr_w = self.lr / (torch.sqrt(second[0]) + self.eps)
            adapt_lr_b = self.lr / (torch.sqrt(second[1]) + self.eps)
            layer.w -= adapt_lr_w * first[0]
            layer.b -= adapt_lr_b * first[1]

    def zero_grad(self):
        """Reset the model's cached activations and gradients."""
        self.model._clear_state()

In [9]:
# Training set: 2000 inputs drawn uniformly from [-2, 2) with cubic targets.
x = 4 * torch.rand(2000) - 2
# Each target gets its own Gaussian noise term (std 0.1). A single
# np.random.randn() draw is one scalar, which would shift every target by
# the same constant instead of adding noise.
y = x**3 + torch.randn_like(x) * 0.1

In [10]:
# Train two identically shaped 1-100-1 sigmoid networks on the same samples,
# one sample at a time: model1 with RMSprop, model2 with Adam. After every
# epoch print both models' predictions at x = 1, 2, -1, -2.
epochs = 20
model1 = Network(((1, 100), (100, 1)), activation=sigmoid)
model2 = Network(((1, 100), (100, 1)), activation=sigmoid)
rmsprop, adam = RMSprop(model1), Adam(model2)
probes = [torch.tensor([[point]]) for point in (1., 2., -1., -2.)]

for e in range(epochs):
    for val, t in zip(x, y):
        sample = torch.tensor([[val]])  # single sample as a (1, 1) column

        # RMSprop-driven model.
        rmsprop.zero_grad()
        pred1 = model1.forward(sample)
        loss1 = mse_loss(t, pred1)
        model1.backward(d_mse_loss(t, pred1))
        rmsprop.step()

        # Adam-driven model.
        adam.zero_grad()
        pred2 = model2.forward(sample)
        loss2 = mse_loss(t, pred2)
        model2.backward(d_mse_loss(t, pred2))
        adam.step()

    print(f'Epoch: {e}')
    print('RMSprop:', *(model1.forward(probe) for probe in probes))
    print('Adam:   ', *(model2.forward(probe) for probe in probes))

Epoch: 0
RMSprop: tensor([[1.8781]]) tensor([[3.6419]]) tensor([[-1.6652]]) tensor([[-3.3061]])
Adam:    tensor([[2.0852]]) tensor([[3.9322]]) tensor([[-1.6329]]) tensor([[-3.3339]])
Epoch: 1
RMSprop: tensor([[1.8387]]) tensor([[3.6209]]) tensor([[-1.6674]]) tensor([[-3.3144]])
Adam:    tensor([[2.0851]]) tensor([[3.9861]]) tensor([[-1.6563]]) tensor([[-3.4038]])
Epoch: 2
RMSprop: tensor([[1.8114]]) tensor([[3.6093]]) tensor([[-1.6783]]) tensor([[-3.3183]])
Adam:    tensor([[2.0495]]) tensor([[3.9272]]) tensor([[-1.7048]]) tensor([[-3.7805]])
Epoch: 3
RMSprop: tensor([[1.7918]]) tensor([[3.6078]]) tensor([[-1.6917]]) tensor([[-3.3236]])
Adam:    tensor([[1.9666]]) tensor([[3.9643]]) tensor([[-1.7352]]) tensor([[-4.5526]])
Epoch: 4
RMSprop: tensor([[1.7772]]) tensor([[3.6136]]) tensor([[-1.7057]]) tensor([[-3.3303]])
Adam:    tensor([[1.8475]]) tensor([[4.4827]]) tensor([[-1.6221]]) tensor([[-5.5462]])
Epoch: 5
RMSprop: tensor([[1.7660]]) tensor([[3.6249]]) tensor([[-1.7195]]) tensor([[

Решить задачу нахождения корней квадратного уравнения методом градиентного спуска:  
$ax^2 + bx + c = 0$

In [11]:
# Coefficients of a*x^2 + b*x + c = 0; here x^2 + 2x - 3 = 0, whose roots are 1 and -3.
a, b, c = 1, 2, -3

In [12]:
from random import random

In [13]:
def forward(x):
    """Evaluate the quadratic ``a*x^2 + b*x + c`` with the module-level coefficients."""
    quadratic_term = a * x**2
    linear_term = b * x
    return quadratic_term + linear_term + c

def backward(x, grad):
    """Chain *grad* through the quadratic: return ``(2*a*x + b) * grad``."""
    slope = 2 * a * x + b
    return slope * grad

def loss(pred, t):
    """Return the squared difference between *pred* and the target *t*."""
    error = pred - t
    return error**2

def grad_loss(pred, t):
    """Return ``2 * (pred - t)``, the derivative of ``(pred - t)**2`` with respect to *pred*."""
    error = pred - t
    return 2 * error

# Find roots by minimising the squared value of the quadratic with plain
# gradient descent. Each of the 10 runs starts from a random point in
# [-5, 5) and takes 1000 fixed-size steps.
lr = 0.001
n_runs, n_steps = 10, 1000
for _ in range(n_runs):
    x = 10 * random() - 5
    for i in range(n_steps):
        pred = forward(x)
        # Chain rule: d(loss)/d(pred) passed through the quadratic's slope.
        grad = backward(x, grad_loss(pred, 0))
        x -= lr * grad
    print(f'Root is {x:>7.4f}, loss is {loss(forward(x), 0):>7.4f}')

Root is -3.0000, loss is  0.0000
Root is  1.0000, loss is  0.0000
Root is -3.0000, loss is  0.0000
Root is -3.0000, loss is  0.0000
Root is  1.0000, loss is  0.0000
Root is  1.0000, loss is  0.0000
Root is  1.0000, loss is  0.0000
Root is  1.0000, loss is  0.0000
Root is  1.0000, loss is  0.0000
Root is -3.0000, loss is  0.0000
