In [1]:
import torch
import torch.nn as nn

## Написать на PyTorch forward и backward полносвязного слоя без использования autograd

In [2]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.layer_1 = nn.Linear(input_dim, output_dim, bias = False)
        self.activation = torch.nn.Sigmoid()
       
    def forward(self, x):
        self.saved_tensors = x        
        x = self.activation(self.layer_1(x))
        
        return x
    
    def backward(self, grad_output):    
        x, = self.saved_tensors
        return grad_output * [lambda x: (x * (1 - x))]

## Написать 1-2 адаптивных оптимизатора

In [3]:
class RMSprop:
    def __init__(self, model_params, lr=1e-2, alpha=0.99, eps=1e-8):
        self.model_params = list(model_params)
        self.lr = lr
        self.alpha = alpha
        self.eps = eps
        self.avg_sqr_grads = [torch.zeros_like(p) for p in self.model_params]

    def zero_grad(self):
        for param in self.model_params:
            param.grad = None

    @torch.no_grad()
    def step(self):
        for param, avg_sqr_grad in zip(self.model_params, self.avg_sqr_grads):
            avg_sqr_grad.mul_(self.alpha).add_(param.grad * param.grad * (1 - self.alpha))
            std = avg_sqr_grad.sqrt().add(self.eps)
            param.sub_((self.lr / std) * param.grad)

In [4]:
class Adadelta:
    def __init__(self, model_params, lr=1.0, rho=0.9, eps=1e-6):
        self.model_params = list(model_params)
        self.lr = lr
        self.rho = rho
        self.eps = eps
        self.avg_sqr_grads = [torch.zeros_like(p) for p in self.model_params]
        self.avg_sqr_deltas = [torch.zeros_like(p) for p in self.model_params] 

    def zero_grad(self):
        for param in self.model_params:
            param.grad = None

    @torch.no_grad()
    def step(self):
        for param, avg_sqr_grad, avg_sqr_delta in zip(self.model_params, \
                                                      self.avg_sqr_grads, \
                                                      self.avg_sqr_deltas):
            avg_sqr_grad.mul_(self.rho).add_(param.grad * param.grad * (1 - self.rho))
            std = avg_sqr_grad.add(self.eps).sqrt()
            
            delta = avg_sqr_delta.add(self.eps).sqrt().div(std).mul(param.grad)
            param.sub_(self.lr * delta)
            avg_sqr_delta.mul_(self.rho).add_(delta * delta * (1 - self.rho))

## Решить задачу нахождения корней квадратного уравнения методом градиентного спуска

In [5]:
cur_x = 3 
rate = 0.01 
precision = 0.000001 
previous_step_size = 1 
max_iters = 10000 
iters = 0 

In [6]:
# Equation:
# 4x^2 - 10x = 0

df = lambda x: (8*x - 10)

In [7]:
while previous_step_size > precision and iters < max_iters:
    prev_x = cur_x 
    cur_x = cur_x - rate * df(prev_x) 
    previous_step_size = abs(cur_x - prev_x) 
    iters = iters + 1 
    
print("Solution:", cur_x)

Solution: 1.2500106778837021
