# Core of PyTorch
*  Tensor (similar to Numpy array) computation on GPUs
*  Automatic differentiation for neural networks

以下示例按 API 从低级到高级的顺序编排，帮助读者逐步熟悉 PyTorch API：  
1.1 Tensor（运用NumpyArray与手动求导写一个NN）   
1.2 PyTorch Tensor（PyTorchTensor与手动求导写一个NN）  
1.3 PyTorch Tensor + Autograd (PyTorchTensor与自动求导写一个NN)    
1.4 PyTorch Tensor + customized Autograd写一个NN  
1.5 PyTorch NN Sequential + Autograd写一个NN  
1.6 PyTorch NN + optimizer写一个NN   
1.7 PyTorch NN custom + optimizer 写一个NN  
1.8 PyTorch NN custom + control flow + optimizer 写一个NN


## Tensors

In [2]:
# 用Numpy 来写一个两层的ANN
import numpy as np
# Two-layer fully connected network trained with plain NumPy and
# hand-written gradients.
# N: batch size, D_in: input dimension, H: hidden dimension,
# D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets (np.random.randn samples a standard normal).
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random initial weights (standard normal as well). The hidden activations
# are not initialised here; they are computed from x and w1 on every step.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

for t in range(500):
    # Forward pass. For 2-D arrays `@` is matrix multiplication.
    h = x @ w1
    # np.maximum is element-wise; it is not a reduction along an axis
    # (the NumPy reduction is np.amax).
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squares loss.
    diff = y_pred - y
    loss = (diff ** 2).sum()
    print(t, loss)

    # Backward pass: gradients of the loss with respect to y_pred and the
    # two weight matrices, derived by hand with the chain rule.
    grad_y_pred = 2.0 * diff
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes the gradient through only where its input was not negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Gradient-descent step, updating the weight arrays in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 30128081.5812336
1 26528397.349934135
2 31294229.266981103
3 39529371.1059206
4 44486568.21986099
5 37907919.25611724
6 23016883.13894947
7 10155988.736892443
8 4078198.269220598
9 1900117.4871771801
10 1155374.1677559717
11 851578.6325813667
12 688985.101596944
13 579323.8730722324
14 495455.69504850265
15 427704.1255260593
16 371669.7160113986
17 324668.26757468685
18 284922.450027877
19 251075.02210418606
20 222077.7279341534
21 197154.58733523643
22 175584.4503152246
23 156828.944837175
24 140470.82822714548
25 126157.11472063104
26 113587.6651076428
27 102509.53187232028
28 92716.15103075396
29 84024.85172504946
30 76294.12671197455
31 69404.76015181348
32 63250.60786566926
33 57739.00936646465
34 52790.565391530836
35 48337.976540217824
36 44322.75460966771
37 40695.76498237411
38 37413.34290808212
39 34438.865830515395
40 31738.965327707665
41 29283.67046718638
42 27050.437183494618
43 25017.701727563002
44 23161.20708470529
45 21463.49767308954
46 19909.10919404295
47 18484.4

446 0.0001413673501174279
447 0.00013549607606946212
448 0.00012986781105016456
449 0.00012447331010598473
450 0.00011930318045401221
451 0.00011435031613416877
452 0.00010960341688455941
453 0.00010505223428833738
454 0.00010069033398567732
455 9.650978917103353e-05
456 9.250411363368732e-05
457 8.866542286947446e-05
458 8.498491701555878e-05
459 8.145719125880266e-05
460 7.807695759714833e-05
461 7.483724849898668e-05
462 7.17324287126611e-05
463 6.875566234084958e-05
464 6.590251200148065e-05
465 6.316842233960662e-05
466 6.054758487804845e-05
467 5.8036167488538004e-05
468 5.5628758511958806e-05
469 5.33219413822327e-05
470 5.11106960015646e-05
471 4.899059491287657e-05
472 4.695909268694033e-05
473 4.501231119337365e-05
474 4.3145803162290264e-05
475 4.135689069814485e-05
476 3.96418809152735e-05
477 3.7998421061070184e-05
478 3.642363256061759e-05
479 3.4913779291463726e-05
480 3.346643879315315e-05
481 3.207904278974669e-05
482 3.074931246598656e-05
483 2.9474923786801746e-05
48

<img src="./fig/chain_rule_differentiation.png" alt="Chain rule differentiation" width="600" height="300" align="left" />

## PyTorch Tensors

In [3]:
import torch

# Two-layer network on PyTorch tensors with hand-written gradients
# (no autograd).
dtype = torch.float
# Use the first CUDA device when one is available; otherwise fall back to
# the CPU so the example also runs on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# N: batch size, D_in: input dimension, H: hidden dimension,
# D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6

for t in range(500):
    # Forward pass. Tensor.mm is matrix multiplication.
    h = x.mm(w1)
    # clamp(min=0) restricts values to [0, +inf), i.e. ReLU.
    h_relu = h.clamp(min=0)
    # The prediction must be built from the ReLU output so that the forward
    # pass matches the hand-written backward pass below.
    y_pred = h_relu.mm(w2)

    # Sum-of-squares loss, converted to a Python scalar with .item().
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backward pass: manual chain rule.
    grad_y_pred = 2 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # Tensor.clone() plays the role of ndarray.copy().
    grad_h = grad_h_relu.clone()
    # ReLU blocks the gradient wherever its input was negative.
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Gradient-descent step, updating the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    
    
    

99 943.206787109375
199 3.445002555847168
299 0.019996928051114082
399 0.0002746207173913717
499 4.034971789224073e-05


## Tensors and autograd

In [8]:
import torch

# Two-layer network on PyTorch tensors, with gradients computed by autograd.
dtype = torch.float
# Use the first CUDA device when one is available; otherwise fall back to
# the CPU so the example also runs on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# N: batch size, D_in: input dimension, H: hidden dimension,
# D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Tensors are nodes in the computation graph; edges are the functions that
# produce their values. requires_grad=True asks autograd to track
# gradients for the weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

for t in range(500):
    # With autograd there is no need to keep intermediate values by hand,
    # so the whole forward pass is a single expression.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Keep the loss as a Tensor: .item() would return a plain Python
    # scalar, which cannot be used to call backward().
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())
    # After backward(), w1.grad and w2.grad hold the gradient of the loss
    # with respect to w1 and w2.
    loss.backward()

    with torch.no_grad():
        # Manual weight update (torch.optim.SGD could be used instead).
        # It runs under no_grad so the update itself is not tracked.
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Clear the gradients by hand after updating the weights.
        w1.grad.zero_()
        w2.grad.zero_()

99 686.5114135742188
199 3.257018566131592
299 0.019082289189100266
399 0.0002951398491859436
499 4.0182305383495986e-05


## Defining new autograd functions

In [5]:
import torch

# 其实 autograd 就是一个forward，一个backward，所以我们可以实现forward和backward的方法，
# 然后subclass torch.autograd.Function，最后像个function一样召唤它

class MyReLU(torch.autograd.Function):
    """ReLU as a custom autograd function with a hand-written backward.

    Both methods are static; the function is invoked through
    ``MyReLU.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` clamped below at zero.

        ``ctx`` is a context object used to stash information for the
        backward computation; the input tensor is saved on it here.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the input.

        ``grad_output`` is the gradient with respect to the output. It is
        passed through unchanged except where the saved input was negative,
        where it becomes zero.
        """
        # saved_tensors is a tuple; unpack its single element.
        (saved_input,) = ctx.saved_tensors
        was_negative = saved_input < 0
        # Out-of-place masked_fill returns a new tensor, so grad_output
        # itself is left untouched.
        return grad_output.masked_fill(was_negative, 0)

# Two-layer network whose ReLU is the custom autograd function MyReLU.
dtype = torch.float
# Use the first CUDA device when one is available; otherwise fall back to
# the CPU so the example also runs on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# N: batch size, D_in: input dimension, H: hidden dimension,
# D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Tensors are nodes in the computation graph; edges are the functions that
# produce their values.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

# Alias for the custom function; calling it goes through MyReLU.apply,
# which runs MyReLU.forward and registers MyReLU.backward with autograd.
relu = MyReLU.apply

for t in range(500):
    # Only this line differs from the plain autograd version: the ReLU is
    # the custom function instead of clamp(min=0).
    y_pred = relu(x.mm(w1)).mm(w2)

    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())
    loss.backward()

    with torch.no_grad():
        # Manual gradient-descent step, then clear the gradients.
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

99 645.6539916992188
199 4.855976581573486
299 0.0506538562476635
399 0.0008095258381217718
499 8.064862049650401e-05


## PyTorch NN module

In [8]:
# High level abstractions over raw computational graphs useful for building neural network
import torch

# Two-layer network built from torch.nn modules, trained with a manual
# gradient-descent update.
# N: batch size, D_in: input dimension, H: hidden dimension,
# D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Linear -> ReLU -> Linear, applied in sequence.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# MSELoss instance; reduction='sum' sums the squared errors instead of
# averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4

for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Reset the gradients before computing new ones.
    model.zero_grad()
    loss.backward()

    # Manual parameter update; no_grad keeps it out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)
    

99 1.5782206058502197
199 0.01805734634399414
299 0.000392022862797603
399 1.0980829756590538e-05
499 3.4718257779786654e-07


## PyTorch optim

In [9]:
import torch

# Two-layer network built from torch.nn modules, trained with the Adam
# optimizer from torch.optim.
# N: batch size, D_in: input dimension, H: hidden dimension,
# D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Linear -> ReLU -> Linear, applied in sequence.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# MSELoss instance; reduction='sum' sums the squared errors.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# The first argument tells the optimizer which parameters to update.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear old gradients, back-propagate, then let the optimizer update
    # the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

99 41.959007263183594
199 0.48242470622062683
299 0.0015302618267014623
399 4.968972461938392e-06
499 2.4410873322722182e-08


## PyTorch Custom NN module

In [3]:
import torch

class Net(torch.nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers.

        Args:
            D_in: size of each input sample.
            H: size of the hidden layer.
            D_out: size of each output sample.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return the network output for the input ``x``."""
        # clamp(min=0) acts as the ReLU between the two layers.
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)

# Train the custom Net module on random data with Adam.
# N: batch size, D_in: input dimension, H: hidden dimension,
# D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

net = Net(D_in, H, D_out)

# MSELoss instance; reduction='sum' sums the squared errors.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# The first argument tells the optimizer which parameters to update.
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

for t in range(500):
    # Forward pass and loss.
    y_pred = net(x)
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear old gradients, back-propagate, then update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


99 54.945655822753906
199 0.9118472337722778
299 0.004170420579612255
399 4.6487789404636715e-06
499 1.4031473760667268e-09


## PyTorch Control flow and Weight sharing

In [6]:
import torch
import random

class DynamicNet(torch.nn.Module):
    """Fully connected network with a random number of hidden layers.

    On every forward call the middle layer ``linear2`` is applied between
    0 and 3 times (chosen with ``random.randint``), reusing the same
    weights each time.
    """

    def __init__(self, D_in, H, D_out):
        """Create the input, shared middle and output linear layers.

        Args:
            D_in: size of each input sample.
            H: size of the hidden layers.
            D_out: size of each output sample.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, H)
        self.linear3 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return the network output for the input ``x``."""
        hidden = torch.clamp(self.linear1(x), min=0)
        # Ordinary Python control flow: the depth is drawn anew on each
        # call, and the same linear2 module is reused for every repetition.
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            hidden = torch.clamp(self.linear2(hidden), min=0)
        return self.linear3(hidden)

# Train the DynamicNet module on random data with Adam.
# N: batch size, D_in: input dimension, H: hidden dimension,
# D_out: output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

net = DynamicNet(D_in, H, D_out)

# MSELoss instance; reduction='sum' sums the squared errors.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# The first argument tells the optimizer which parameters to update.
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

for t in range(500):
    # Forward pass and loss.
    y_pred = net(x)
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear old gradients, back-propagate, then update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

99 651.8897094726562
199 268.1856994628906
299 149.90826416015625
399 26.771236419677734
499 13.140218734741211
