# 全く同じネットワークを4つの書き方で動かします
https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

日本語:https://www.aiprogrammers.net/entry/2020/04/24/092500

## Warm-up: NumPy

In [1]:
# -*- coding: utf-8 -*-
import numpy as np

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and regression targets, both sampled from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weight matrices of the two linear layers, also standard-normal noise.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    hidden = x @ w1
    hidden_relu = np.maximum(hidden, 0)
    y_pred = hidden_relu @ w2

    # Sum-of-squares loss, reported on every iteration.
    residual = y_pred - y
    loss = np.sum(residual * residual)
    print(t, loss)

    # Backward pass: apply the chain rule by hand, output layer first.
    d_y_pred = 2.0 * residual
    d_w2 = hidden_relu.T @ d_y_pred
    # ReLU passes the gradient through unchanged except where its
    # pre-activation was negative, where the gradient is zero.
    d_hidden = np.where(hidden < 0, 0.0, d_y_pred @ w2.T)
    d_w1 = x.T @ d_hidden

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * d_w1
    w2 -= learning_rate * d_w2

0 38520443.548501045
1 40106890.92393669
2 46893996.87852523
3 47746784.01847948
4 36180681.05554907
5 19098472.562031403
6 8047113.291170775
7 3575785.7939848537
8 2058388.3799723121
9 1475382.4084175427
10 1176879.5750739207
11 979065.1062340576
12 829443.3354382712
13 709724.5548966997
14 611402.3335875792
15 529567.3011789686
16 460888.8999882756
17 402883.3631774767
18 353526.20843780256
19 311317.75013024977
20 275091.791918296
21 243878.65363627818
22 216855.47681941916
23 193389.87806939502
24 172949.28498883572
25 155044.60639267668
26 139373.54823776308
27 125546.38455738673
28 113316.25164179596
29 102466.45856108962
30 92819.01325240743
31 84217.38329398693
32 76535.2176056859
33 69658.33989214539
34 63486.7426855221
35 57942.93237944448
36 52948.43307965083
37 48445.87693746871
38 44376.91893667788
39 40693.78646329875
40 37354.19900520337
41 34322.569454176424
42 31566.01053127266
43 29058.269215715
44 26772.385622017355
45 24684.33600940508
46 22777.092089598293
47 21032

499 4.174361967308919e-06


## Warm-up: PyTorch

In [3]:
import torch


dtype = torch.float
device = torch.device("cpu")
# To run on the GPU instead, use: device = torch.device("cuda:0")

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and regression targets, both sampled from a standard normal.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weight matrices of the two linear layers, also standard-normal noise.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    hidden = torch.mm(x, w1)
    hidden_relu = torch.clamp(hidden, min=0)
    y_pred = torch.mm(hidden_relu, w2)

    # Sum-of-squares loss as a Python float; report every 100th step.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass: apply the chain rule by hand, output layer first.
    d_y_pred = 2.0 * residual
    d_w2 = torch.mm(hidden_relu.t(), d_y_pred)
    # ReLU passes the gradient through unchanged except where its
    # pre-activation was negative, where the gradient is zero.
    d_hidden = torch.mm(d_y_pred, w2.t()).masked_fill(hidden < 0, 0)
    d_w1 = torch.mm(x.t(), d_hidden)

    # Plain gradient-descent step, applied in place.
    w1.sub_(learning_rate * d_w1)
    w2.sub_(learning_rate * d_w2)

99 301.7300720214844
199 0.5452671647071838
299 0.0017044495325535536
399 6.971536640776321e-05
499 1.8546084902482107e-05


## AUTOGRAD

In [6]:
import torch

dtype = torch.float
device = torch.device("cpu")
# To run on the GPU instead, use: device = torch.device("cuda:0")

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets; created without requires_grad, so autograd does
# not track gradients for them.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are created with requires_grad=True so that loss.backward()
# populates w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear, built from Tensor ops so
    # autograd can differentiate through it.
    hidden_relu = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden_relu, w2)

    # Sum-of-squares loss kept as a Tensor; report every 100th step.
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Let autograd compute d(loss)/d(w1) and d(loss)/d(w2).
    loss.backward()

    # The update itself must not be tracked by autograd, hence no_grad().
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Reset the gradient in place once the weight is updated so
            # the next backward() starts from zero.
            weight.grad.zero_()

99 609.012939453125
199 3.393815040588379
299 0.028269469738006592
399 0.0005146132316440344
499 7.23170378478244e-05


## 関数化

In [None]:
import torch


class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd function.

    Subclassing torch.autograd.Function lets us define our own autograd
    operation; this class supplies both the forward and backward passes.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Forward pass: receive a tensor holding the input and return a
        tensor holding the output (the input clamped below at 0).

        ctx is a context object used to stash information for the
        backward pass; the input is saved on it with
        ctx.save_for_backward so that backward can read it back.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Backward pass: receive a tensor holding the gradient of the loss
        with respect to the output and return the gradient of the loss
        with respect to the input.
        """
        (saved_input,) = ctx.saved_tensors
        # Zero the gradient wherever the saved input was negative; the
        # result is a new tensor, so grad_output is not modified.
        return grad_output.masked_fill(saved_input < 0, 0)


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights; requires_grad=True so that
# loss.backward() populates w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method, aliased here as
# 'relu'. The alias is the same on every iteration, so it is bound once
# before the loop instead of being re-created 500 times.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute the sum-of-squares loss and print it every 100th iteration.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent. The update runs under
    # torch.no_grad() so it is not itself tracked by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()