# Part 1

In [2]:
import torch

In [3]:
# Initial 2x2 float tensor; it is the starting value for the x_1 update below.
# `torch.tensor` is the documented factory for building a tensor from existing
# data; the float literals keep the dtype at the default float32.
x_0 = torch.tensor([
    [1., 2.],
    [4., 5.],
])

x_0

tensor([[1., 2.],
        [4., 5.]])

In [22]:
# Step size for the single manual update below.
alpha = 10
# Element-wise update: subtract alpha times the reciprocal of (x_0 + 1).
# NOTE(review): 1 / (x + 1) is presumably the hand-derived gradient of
# log(x + 1) — confirm against the exercise statement.
x_1 = x_0 - alpha * (1 / (x_0 + 1))

x_1

tensor([[-4.0000, -1.3333],
        [ 2.0000,  3.3333]])

In [26]:
# Convert to nested Python lists; the float32 rounding becomes visible here.
x_1.tolist()

[[-4.0, -1.3333334922790527], [2.0, 3.3333332538604736]]

# Part 2

In [1]:
import torch

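TODO: в чём разница между `torch.tensor` и `torch.Tensor`? (Кратко: `torch.tensor` — фабричная функция, которая копирует данные и выводит dtype из них; `torch.Tensor` — класс тензора, конструктор которого создаёт тензор типа по умолчанию, обычно float32.)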

In [31]:
# 3x4 float32 tensor with the values 1..12. Built with the `torch.tensor`
# factory (the documented way to create a tensor from data) and float
# literals so the dtype stays float32.
x = torch.tensor([
    [1., 2., 3., 4.],
    [5., 6., 7., 8.],
    [9., 10., 11., 12.],
])
x

tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]])

In [32]:
# Take an independent copy detached from any graph and switch on gradient
# tracking for it (the repr below now shows requires_grad=True).
x = x.clone().detach().requires_grad_(True)
x

tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]], requires_grad=True)

In [33]:
# No backward pass has run yet, so .grad is None and the cell shows nothing.
x.grad

In [34]:
# Pick the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

device

device(type='cpu')

In [35]:
# Scalar objective 10 * sum(x_ij ** 2); the cells below inspect the autograd
# graph recorded while computing it.
function = 10 * (x ** 2).sum()

In [36]:
# Backpropagate from the scalar; this fills x.grad.
function.backward()

In [37]:
# d/dx [10 * sum(x ** 2)] = 20 * x, which matches the values shown below.
x.grad

tensor([[ 20.,  40.,  60.,  80.],
        [100., 120., 140., 160.],
        [180., 200., 220., 240.]])

In [38]:
# The result of the computation is itself a torch.Tensor.
type(function)

torch.Tensor

In [49]:
# Last operation recorded in the graph: the multiplication by 10.
(
    function
    .grad_fn
)

<MulBackward0 at 0x7f58341c44c0>

In [50]:
# One step back along the graph: the .sum() reduction.
(
    function
    .grad_fn
    .next_functions[0][0]
)

<SumBackward0 at 0x7f58341bf460>

In [55]:
# Two steps back: the element-wise power x ** 2.
(
    function
    .grad_fn
    .next_functions[0][0]
    .next_functions[0][0]
)

<PowBackward0 at 0x7f58341bfa60>

In [60]:
# Inputs of the power node: a single AccumulateGrad node, i.e. the point where
# the gradient is accumulated into the leaf tensor x.
(
    function
    .grad_fn
    .next_functions[0][0]
    .next_functions[0][0]
    .next_functions
)

((<AccumulateGrad at 0x7f58341bf6a0>, 0),)

Задача:  

Реализуйте расчет градиента для функции  
$$ f(w) = \prod\limits_{i,j} \ln\left(\ln\left(w_{i,j} + 7\right)\right) $$
​
в точке $ w = [[5, 10], [1, 2]] $  

> Подсказка: перемножить все значения функции можно с помощью метода `.prod()`

In [63]:
# Point at which the gradient is evaluated; tracked by autograd.
w = torch.tensor([[5.0, 10.0], [1.0, 2.0]]).requires_grad_(True)

In [69]:
# Evaluate f(w) = prod(ln(ln(w + 7))) without recording an autograd graph;
# only the value is displayed here.
with torch.no_grad():
    display((w + 7).log().log().prod())

tensor(0.5463)

In [70]:
# f(w) = prod(ln(ln(w + 7))), this time with graph recording so that
# backward() can be called on it.
function = (w + 7).log().log().prod()

In [71]:
# Backpropagate from the scalar f(w); this fills w.grad.
function.backward()

In [72]:
# Gradient of f with respect to every element of w.
w.grad

tensor([[0.0201, 0.0109],
        [0.0449, 0.0351]])

Как обновить значения тензора в градиентном спуске:

In [75]:
# Fresh 3x4 float32 tensor with the values 1..12 for the update demo. Built
# with the `torch.tensor` factory (the documented way to create a tensor from
# data) and float literals so the dtype stays float32.
x = torch.tensor([
    [1., 2., 3., 4.],
    [5., 6., 7., 8.],
    [9., 10., 11., 12.],
])
x

tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]])

In [76]:
# Take an independent copy detached from any graph and switch on gradient
# tracking for it.
x = x.clone().detach().requires_grad_(True)
x

tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]], requires_grad=True)

In [77]:
# Still None: nothing has been backpropagated into this new tensor yet.
x.grad

In [79]:
# Scalar objective 10 * sum(x_ij ** 2).
function = 10 * (x ** 2).sum()

In [80]:
# Backpropagate from the scalar; this fills x.grad.
function.backward()

In [81]:
# Gradient 20 * x, to be used for the update step in the next cell.
x.grad

tensor([[ 20.,  40.,  60.,  80.],
        [100., 120., 140., 160.],
        [180., 200., 220., 240.]])

In [83]:
# One gradient-descent step with step size 1e-3. The update runs under
# no_grad instead of going through `.data`, so it is kept out of the graph
# while autograd still sees the in-place modification of x.
with torch.no_grad():
    x -= 1e-3 * x.grad

In [87]:
# backward() adds to .grad rather than overwriting it, so clear the buffer
# in place before the next backward pass.
x.grad.zero_()

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [88]:
# Confirm that the gradient buffer now holds only zeros.
x.grad

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

Реализуйте градиентный спуск для той же функции

$$ f(w) = \prod\limits_{i,j} \ln\left(\ln\left(w_{i,j} + 7\right)\right) $$
​
Пусть начальным приближением будет $ w^{t=0} = [[5, 10], [1, 2]] $, шаг градиентного спуска $\alpha=0.001$. 

Чему будет равен $ w^{t=500} $ ?

In [94]:
# Number of gradient-descent iterations and the step size.
N_ITER = 500
alpha = 1e-3

# Initial approximation w^{t=0}.
w = torch.tensor(
    [[5., 10.],
     [1., 2.]],
    requires_grad=True
)

for _ in range(N_ITER):
    # f(w) = prod(ln(ln(w + 7))) over all elements of w.
    function = torch.prod(torch.log(torch.log(w + 7)))
    function.backward()

    # Plain gradient-descent step. It runs under no_grad instead of going
    # through `.data`, so the update is kept out of the graph while autograd
    # still sees the in-place modification of w.
    with torch.no_grad():
        w -= alpha * w.grad

    # backward() accumulates into w.grad, so reset it for the next iteration.
    w.grad.zero_()

In [95]:
# w after the 500 iterations above, i.e. w^{t=500}.
w

tensor([[4.9900, 9.9948],
        [0.9775, 1.9825]], requires_grad=True)

# Оформим градиентный шаг в виде функций:

In [112]:
import torch

In [113]:
# Starting point for minimising the parabola; tracked by autograd.
x = torch.tensor([8.0, 8.0], requires_grad=True)

In [114]:
def function_parabola(var):
    """Return ``10 * sum(var ** 2)`` as a scalar tensor."""
    squared = var.pow(2)
    return 10 * squared.sum()

In [115]:
# Evaluate the objective at the starting point; no_grad keeps this
# evaluation out of the autograd graph.
with torch.no_grad():
    print(function_parabola(x))

tensor(1280.)


In [116]:
def make_gradient_step(function, var, alpha = 1e-3):
    """Perform one in-place gradient-descent update of ``var``.

    Args:
        function: Callable that maps ``var`` to a scalar tensor to minimise.
        var: Tensor with ``requires_grad=True`` whose ``.grad`` is populated
            by ``backward()``; it is modified in place.
        alpha: Step size multiplied by the gradient.

    Returns:
        None. ``var`` holds the updated values and ``var.grad`` is reset to
        zeros, ready for the next call.
    """
    function_res = function(var)
    function_res.backward()
    # Update under no_grad instead of through ``.data``: the step is kept out
    # of the graph while autograd still sees the in-place modification.
    with torch.no_grad():
        var -= alpha * var.grad
    # backward() accumulates, so clear the gradient for the next step.
    var.grad.zero_()

In [117]:
# 500 gradient-descent steps with the default step size alpha = 1e-3.
for _ in range(500):
    make_gradient_step(function_parabola, x)

In [118]:
# After 500 steps x has shrunk from 8 to about 3e-4; the minimum is at 0.
x

tensor([0.0003, 0.0003], requires_grad=True)

## Добавим оптимизатор:

In [17]:
import torch

In [18]:
# Starting point for the optimizer-based run; tracked by autograd.
x = torch.tensor([8.0, 8.0]).requires_grad_(True)

In [19]:
import torch.optim

In [20]:
# Plain SGD over the single parameter x with learning rate 1e-3.
optimizer = torch.optim.SGD(
    [x],
    lr=1e-3,
    # Optional SGD variants to experiment with (left disabled):
    # nesterov=True,
    # momentum=0.1
)
# Alternative optimizer to try instead of SGD:
# optimizer = torch.optim.Adam([x], lr=1e-1)

In [21]:
def function_parabola(var):
    """Compute the scalar ``10 * sum(var ** 2)``."""
    sum_of_squares = torch.sum(var ** 2)
    return 10 * sum_of_squares

In [22]:
def make_gradient_step(function, var, alpha = 1e-3, opt=None):
    """Perform one optimisation step on ``function(var)`` using an optimizer.

    Args:
        function: Callable that maps ``var`` to a scalar tensor to minimise.
        var: Tensor passed to ``function``; the optimizer is expected to have
            been constructed over it.
        alpha: Unused here — the step size comes from the optimizer's own
            ``lr``. Kept so existing calls stay valid.
        opt: Optimizer to step. When omitted, the notebook-level ``optimizer``
            defined in an earlier cell is used, so existing calls behave as
            before.

    Returns:
        None. The optimizer updates its parameters in place and their
        gradients are cleared afterwards.
    """
    if opt is None:
        # Fall back to the optimizer created at notebook level.
        opt = optimizer
    function_res = function(var)
    function_res.backward()
    opt.step()
    opt.zero_grad()

In [23]:
# 500 optimisation steps; the step size is the lr configured on the optimizer.
for _ in range(500):
    make_gradient_step(function_parabola, x)

In [24]:
# Same result as the manual update loop: about 3e-4 in both coordinates.
x

tensor([0.0003, 0.0003], requires_grad=True)

Задача: перепишите решение для функции $ f(w) = \prod\limits_{i,j} \ln\left(\ln\left(w_{i,j} + 7\right)\right) $ с использованием оптимизатора SGD:

In [25]:
# Number of optimisation steps and the learning rate.
N_ITER = 500
alpha = 1e-3

# Initial approximation w^{t=0}.
w = torch.tensor(
    [[5., 10.],
     [1., 2.]],
    requires_grad=True
)
# Use the step size declared above rather than repeating the literal, so
# changing `alpha` actually changes the learning rate.
optimizer = torch.optim.SGD([w], lr=alpha)

for _ in range(N_ITER):
    # f(w) = prod(ln(ln(w + 7))) over all elements of w.
    function = (w + 7).log().log().prod()
    function.backward()
    optimizer.step()
    # Clear gradients so the next backward() starts from a clean state.
    optimizer.zero_grad()

print(w)

tensor([[4.9900, 9.9948],
        [0.9775, 1.9825]], requires_grad=True)
