# Part 1

In [2]:
import torch

In [3]:
# Starting point: a 2x2 tensor built from nested lists. The torch.Tensor
# constructor stores the integer literals as floats.
x_0 = torch.Tensor([[1, 2], [4, 5]])

x_0

tensor([[1., 2.],
        [4., 5.]])

In [22]:
# One hand-written update step: every element of x_0 is shifted by
# -alpha / (x_0 + 1).
alpha = 10
x_1 = x_0 - alpha * (1 / (x_0 + 1))

x_1

tensor([[-4.0000, -1.3333],
        [ 2.0000,  3.3333]])

In [26]:
x_1.tolist()

[[-4.0, -1.3333334922790527], [2.0, 3.3333332538604736]]

# Part 2

In [1]:
import torch

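TODO: в чём разница между `torch.tensor` и `torch.Tensor`?  

> => `torch.tensor` — функция-конструктор: создаёт тензор из данных и выводит `dtype` из них (или берёт его из аргумента `dtype`); `torch.Tensor` — класс (тип данных) тензора, его прямой вызов создаёт тензор с типом по умолчанию (`float32`). 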

In [13]:
# Class constructor: the integer literals are stored with the default
# floating-point dtype.
x = torch.Tensor(
    [[1, 2, 3, 4],
     [5, 6, 7, 8],
     [9, 10, 11, 12]]
)
x

tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]])

In [20]:
# Factory function: build a float64 tensor and ask autograd to track it.
# (Switch requires_grad to False to create it without gradient tracking.)
x = torch.tensor(
    [[1, 2, 3, 4],
     [5, 6, 7, 8],
     [9, 10, 11, 12]],
    dtype=torch.float64,
    requires_grad=True,
)
x

tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]], dtype=torch.float64, requires_grad=True)

In [25]:
type(x)

torch.Tensor

In [21]:
x.grad

In [22]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

device

device(type='cpu')

In [23]:
# Scalar objective f(x) = 10 * sum(x_ij ** 2), built from the tracked tensor x.
function = 10 * x.pow(2).sum()

In [24]:
type(function)

torch.Tensor

In [26]:
function.backward()

In [27]:
x.grad

tensor([[ 20.,  40.,  60.,  80.],
        [100., 120., 140., 160.],
        [180., 200., 220., 240.]], dtype=torch.float64)

In [29]:
# Root node of the autograd graph behind `function`: the backward node of
# the last operation applied (the multiplication by 10).
(
    function
    .grad_fn
)

<MulBackward0 at 0x7f2acbd45420>

In [30]:
# One step back in the graph: the first input of the multiplication node,
# i.e. the backward node of `.sum()`.
(
    function
    .grad_fn
    .next_functions[0][0]
)

<SumBackward0 at 0x7f2a612db4f0>

In [31]:
# Two steps back: the input of the sum node, i.e. the backward node of the
# element-wise power `x ** 2`.
(
    function
    .grad_fn
    .next_functions[0][0]
    .next_functions[0][0]
)

<PowBackward0 at 0x7f2a6149fd30>

In [32]:
# Inputs of the power node: a tuple of (node, input index) pairs. Its only
# entry is an AccumulateGrad node — presumably the one attached to the leaf
# tensor `x`, which is where the gradient ends up.
(
    function
    .grad_fn
    .next_functions[0][0]
    .next_functions[0][0]
    .next_functions
)

((<AccumulateGrad at 0x7f2a6149e980>, 0),)

Задача:  

Реализуйте расчет градиента для функции  
$$ f(w) = \prod\limits_{i,j} \ln\bigl(\ln(w_{i,j} + 7)\bigr) $$
​
в точке $ w = [[5, 10], [1, 2]] $  

> Подсказка: перемножить все значения функции можно с помощью метода `.prod()`

In [39]:
# Evaluation point w = [[5, 10], [1, 2]] as a float32 leaf tracked by autograd.
w = torch.tensor(
    [[5, 10],
     [1, 2]],
    dtype=torch.float,
    requires_grad=True,
)

In [40]:
# Show the value of f(w) only; no_grad() keeps this inspection from
# recording an autograd graph.
with torch.no_grad():
    display((w + 7).log().log().prod())

tensor(0.5463)

In [41]:
# f(w) = product over all elements of ln(ln(w_ij + 7)), recorded by autograd.
function = (w + 7).log().log().prod()

In [42]:
function.backward()

In [43]:
w.grad

tensor([[0.0201, 0.0109],
        [0.0449, 0.0351]])

Как обновить значения тензора в градиентном спуске:

In [53]:
# Fresh 3x4 float32 leaf tensor used to demonstrate a manual update step.
x = torch.tensor(
    [[1, 2, 3, 4],
     [5, 6, 7, 8],
     [9, 10, 11, 12]],
    dtype=torch.float,
    requires_grad=True,
)
x

tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]], requires_grad=True)

In [54]:
x.grad

In [55]:
# Scalar objective f(x) = 10 * sum(x_ij ** 2), built from the tracked tensor x.
function = 10 * x.pow(2).sum()

In [56]:
function.backward()

In [57]:
x.grad

tensor([[ 20.,  40.,  60.,  80.],
        [100., 120., 140., 160.],
        [180., 200., 220., 240.]])

In [58]:
x

tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]], requires_grad=True)

In [59]:
# Gradient-descent step with learning rate 1e-3. The update runs under
# torch.no_grad() so that autograd does not record it; unlike writing
# through `.data`, an in-place change made this way is still seen by
# autograd's version tracking, so misuse is reported instead of silently
# producing wrong gradients.
with torch.no_grad():
    x -= 1e-3 * x.grad

In [60]:
x

tensor([[ 0.9800,  1.9600,  2.9400,  3.9200],
        [ 4.9000,  5.8800,  6.8600,  7.8400],
        [ 8.8200,  9.8000, 10.7800, 11.7600]], requires_grad=True)

Методы с нижним подчеркиванием в конце делают inplace-операции:

In [61]:
x.grad.zero_()

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [62]:
x.grad

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

Реализуйте градиентный спуск для той же функции

$$ f(w) = \prod\limits_{i,j} \ln\bigl(\ln(w_{i,j} + 7)\bigr) $$
​
Пусть начальным приближением будет $ w^{t=0} = [[5, 10], [1, 2]] $, шаг градиентного спуска $\alpha=0.001$. 

Чему будет равен $ w^{t=500} $ ?

In [64]:
# Plain gradient descent on f(w) = prod(ln(ln(w + 7))).
N_ITER = 500   # number of descent steps
alpha = 1e-3   # step size (learning rate)

# Initial approximation w^{t=0}, tracked by autograd.
w = torch.tensor([
    [5, 10],
    [1, 2]],
    dtype=torch.float,
    requires_grad=True
)

for _ in range(N_ITER):

    # Forward pass, then backward pass to fill w.grad.
    function = torch.prod(torch.log(torch.log(w + 7)))
    function.backward()

    # Update the leaf in place under no_grad() rather than through `.data`:
    # the step is not recorded in the graph, but autograd's version
    # tracking still sees the modification.
    with torch.no_grad():
        w -= alpha * w.grad
    # backward() accumulates into w.grad, so reset it for the next step.
    w.grad.zero_()


In [65]:
w

tensor([[4.9900, 9.9948],
        [0.9775, 1.9825]], requires_grad=True)

# Оформим градиентный шаг в виде функций:

In [66]:
import torch

In [67]:
# Starting point (8, 8) for minimising the parabola, tracked by autograd.
x = torch.tensor([8, 8], dtype=torch.float, requires_grad=True)

In [68]:
def function_parabola(var):
    """Return 10 times the sum of squared elements of ``var`` as a 0-dim tensor."""
    squared = var.pow(2)
    return 10 * squared.sum()

In [69]:
# Print the objective value at the current x; no_grad() keeps this
# inspection call from recording an autograd graph.
with torch.no_grad():
    print(function_parabola(x))

tensor(1280.)


In [70]:
def make_gradient_step(function, variable, alpha=1e-3):
    """Perform one in-place gradient-descent step on ``variable``.

    Args:
        function: callable that maps ``variable`` to the tensor to minimise.
            ``backward()`` is called on its result without arguments, so the
            result is expected to be a scalar.
        variable: leaf tensor with ``requires_grad=True``; updated in place.
        alpha: step size (learning rate).

    Returns:
        None. ``variable`` is modified and its ``.grad`` is zeroed.
    """
    function_res = function(variable)
    function_res.backward()

    # Update under no_grad() instead of through `.data`: the step is not
    # recorded in the graph, yet autograd's version tracking still sees it.
    with torch.no_grad():
        variable.sub_(alpha * variable.grad)
    # backward() accumulates gradients, so clear them for the next call.
    variable.grad.zero_()

In [71]:
# Number of gradient steps to take.
N_ITER = 500

# Each call performs one update of x using the default step size.
for i in range(N_ITER):
    make_gradient_step(function_parabola, x)

In [72]:
x

tensor([0.0003, 0.0003], requires_grad=True)

## Добавим оптимизатор:

In [73]:
import torch

In [83]:
# Reset the starting point to (8, 8) before handing it to an optimizer.
x = torch.tensor([8, 8], dtype=torch.float, requires_grad=True)

In [84]:
import torch.optim

In [98]:
# Alternative kept for experimentation: SGD with Nesterov momentum.
# optimizer = torch.optim.SGD(
#     [x],
#     lr=1e-3,
#     nesterov=True,
#     momentum=0.1
# )

# Adam over the single parameter tensor x, learning rate 0.1.
optimizer = torch.optim.Adam(params=[x], lr=0.1)

In [99]:
def function_parabola(var):
    """Return 10 times the sum of squared elements of ``var`` as a 0-dim tensor."""
    sum_of_squares = var.pow(2).sum()
    return 10 * sum_of_squares

In [100]:
def make_gradient_step(function, variable, alpha=1e-3):
    """Run one optimisation step driven by the module-level ``optimizer``.

    Evaluates ``function(variable)``, backpropagates from the result, then
    lets the global ``optimizer`` apply the update and clear the gradients
    of the parameters it manages.

    Note: ``alpha`` is kept in the signature but is not used here; the step
    size is whatever the global ``optimizer`` was constructed with.
    """
    loss = function(variable)
    loss.backward()

    # The optimizer replaces the manual update and gradient reset.
    optimizer.step()
    optimizer.zero_grad()

In [101]:
# Number of optimisation steps to take.
N_ITER = 500

# Each call performs one optimizer-driven update of x.
for i in range(N_ITER):
    make_gradient_step(function_parabola, x)

In [102]:
x

tensor([2.2964e-13, 2.2964e-13], requires_grad=True)

Задача: переписать решение для функции $ f(w) = \prod\limits_{i,j} \ln\bigl(\ln(w_{i,j} + 7)\bigr) $ с использованием оптимизатора SGD:

In [105]:
# Same minimisation of f(w) = prod(ln(ln(w + 7))), with torch.optim.SGD
# performing the update instead of hand-written arithmetic.
N_ITER = 500   # number of optimisation steps
alpha = 1e-3   # learning rate passed to the optimizer

# Initial approximation w^{t=0}, tracked by autograd.
w = torch.tensor(
    [[5, 10],
     [1, 2]],
    dtype=torch.float,
    requires_grad=True,
)

optimizer = torch.optim.SGD(params=[w], lr=alpha)

for _ in range(N_ITER):
    # Forward pass, then backward pass to fill w.grad.
    function = (w + 7).log().log().prod()
    function.backward()

    # Apply the update, then clear the gradient for the next iteration.
    optimizer.step()
    optimizer.zero_grad()

