In [7]:
import torch
import torch.nn as nn
import numpy as np

In [2]:
torch.__version__

'1.0.1'

In [3]:
torch.cuda.is_available()

True

In [4]:
torch.cuda.device_count()

1

In [5]:
torch.cuda.current_device()

0

In [6]:
torch.cuda.get_device_name(0)

'GeForce GTX 1080 Ti'

# 什么是PyTorch

In [4]:
# PyTorch是一个基于Python的科学计算库，类似于NumPy，但是它可以使用GPU

# Tensor

In [5]:
# Tensor类似于NumPy的ndarray，唯一的区别是Tensor可以在GPU上进行加速运算

In [6]:
# 构建一个未初始化的5*3矩阵

In [7]:
x = torch.empty(5, 3); x

tensor([[-1.4197e-37,  7.5530e-43, -7.3176e-38],
        [ 7.5530e-43, -7.3308e-38,  7.5530e-43],
        [-7.3311e-38,  7.5530e-43, -7.3312e-38],
        [ 7.5530e-43, -7.3312e-38,  7.5530e-43],
        [-7.3313e-38,  7.5530e-43, -7.3313e-38]])

In [8]:
# 构建随机初始化矩阵

In [9]:
x = torch.rand(5, 3); x

tensor([[0.9758, 0.1649, 0.8757],
        [0.9409, 0.1528, 0.3944],
        [0.5435, 0.4614, 0.7119],
        [0.4058, 0.9775, 0.6182],
        [0.1247, 0.6225, 0.6064]])

In [10]:
# 构建一个全部为0，类型为long的矩阵

In [11]:
x = torch.zeros(5, 3); x

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])

In [12]:
x.dtype

torch.float32

In [13]:
x = torch.zeros(5, 3, dtype=torch.long); x

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])

In [14]:
x.dtype

torch.int64

In [15]:
# 从数据直接构建Tensor

In [16]:
x = torch.tensor([5.5, 3]); x

tensor([5.5000, 3.0000])

In [20]:
x = torch.tensor(
    [
        [2, 3], 
        [5, 9]
    ]
); x

tensor([[2, 3],
        [5, 9]])

In [21]:
x.shape

torch.Size([2, 2])

In [24]:
# 从已有Tensor构建一个Tensor会重用原来Tensor的特征。例如，数据类型等。除非提供新的数据

In [25]:
x = x.new_ones(5, 3); x

tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]])

In [27]:
x.dtype  # 可以看出新建的x重用了之前x的数据类型torch.int64

torch.int64

In [28]:
# 我们也可以指定数据类型

In [29]:
x = x.new_ones(5, 3, dtype=torch.double); x

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)

In [31]:
x.dtype

torch.float64

In [32]:
# 重用数据形状

In [34]:
y = torch.rand_like(x, dtype=torch.float64); y

tensor([[0.7226, 0.9590, 0.6864],
        [0.1988, 0.8378, 0.4334],
        [0.7738, 0.5135, 0.3211],
        [0.6820, 0.9713, 0.2132],
        [0.3054, 0.9054, 0.4199]], dtype=torch.float64)

In [35]:
# 得到Tensor形状，注意torch.Size返回的是一个tuple

In [38]:
x.shape == y.shape

True

In [40]:
type(x.shape)  # 是一个元组tuple

torch.Size

# Tensor操作

In [43]:
# 加法1

In [45]:
x + y

tensor([[1.7226, 1.9590, 1.6864],
        [1.1988, 1.8378, 1.4334],
        [1.7738, 1.5135, 1.3211],
        [1.6820, 1.9713, 1.2132],
        [1.3054, 1.9054, 1.4199]], dtype=torch.float64)

In [46]:
# 加法2

In [47]:
torch.add(x, y)

tensor([[1.7226, 1.9590, 1.6864],
        [1.1988, 1.8378, 1.4334],
        [1.7738, 1.5135, 1.3211],
        [1.6820, 1.9713, 1.2132],
        [1.3054, 1.9054, 1.4199]], dtype=torch.float64)

In [50]:
# 加法3：把输出作为一个变量

In [54]:
result = torch.empty(5, 3, dtype=torch.float64)
torch.add(x, y, out=result)
result  # 这样做可以减少内存

tensor([[1.7226, 1.9590, 1.6864],
        [1.1988, 1.8378, 1.4334],
        [1.7738, 1.5135, 1.3211],
        [1.6820, 1.9713, 1.2132],
        [1.3054, 1.9054, 1.4199]], dtype=torch.float64)

In [66]:
# 加法4：in-place操作。任何in-place操作的运算都会以_结尾。举例来说x.copy_(y)，会改变x

In [56]:
y.add_(x)  # 所有的下划线操作(in-place)是作用在y上的，就是会改变源y的值

tensor([[1.7226, 1.9590, 1.6864],
        [1.1988, 1.8378, 1.4334],
        [1.7738, 1.5135, 1.3211],
        [1.6820, 1.9713, 1.2132],
        [1.3054, 1.9054, 1.4199]], dtype=torch.float64)

In [57]:
# 各种类似于NumPy的indexing操作也可以在PyTorch的Tensor上使用

In [63]:
x = torch.rand_like(y, dtype=torch.float64); x

tensor([[0.1039, 0.9525, 0.8452],
        [0.6057, 0.1973, 0.1643],
        [0.2708, 0.0807, 0.4430],
        [0.8523, 0.4176, 0.2398],
        [0.7091, 0.6370, 0.4677]], dtype=torch.float64)

In [64]:
x[:, 1:]

tensor([[0.9525, 0.8452],
        [0.1973, 0.1643],
        [0.0807, 0.4430],
        [0.4176, 0.2398],
        [0.6370, 0.4677]], dtype=torch.float64)

In [80]:
# resizing: 如果希望resize/reshape一个Tensor，可以使用torch.view，在NumPy里是reshape

In [68]:
x = torch.randn(4, 4); x

tensor([[-0.3207,  0.6449, -0.4905,  1.3452],
        [-1.3559,  0.3588,  1.8640,  0.9712],
        [-0.3755,  1.3294, -0.1590, -0.1769],
        [ 0.4891, -0.6439,  1.0613,  1.1723]])

In [70]:
x.view(16)

tensor([-0.3207,  0.6449, -0.4905,  1.3452, -1.3559,  0.3588,  1.8640,  0.9712,
        -0.3755,  1.3294, -0.1590, -0.1769,  0.4891, -0.6439,  1.0613,  1.1723])

In [71]:
x

tensor([[-0.3207,  0.6449, -0.4905,  1.3452],
        [-1.3559,  0.3588,  1.8640,  0.9712],
        [-0.3755,  1.3294, -0.1590, -0.1769],
        [ 0.4891, -0.6439,  1.0613,  1.1723]])

In [72]:
# 可以看到view操作并不会改变源x的值，只有in-place才会改变源值

In [74]:
x.view(2, 8)

tensor([[-0.3207,  0.6449, -0.4905,  1.3452, -1.3559,  0.3588,  1.8640,  0.9712],
        [-0.3755,  1.3294, -0.1590, -0.1769,  0.4891, -0.6439,  1.0613,  1.1723]])

In [75]:
# 当只知道或只需要某一维的维度时，可以将另一维度写作-1，PyTorch会自动reshape这个Tensor

In [76]:
x

tensor([[-0.3207,  0.6449, -0.4905,  1.3452],
        [-1.3559,  0.3588,  1.8640,  0.9712],
        [-0.3755,  1.3294, -0.1590, -0.1769],
        [ 0.4891, -0.6439,  1.0613,  1.1723]])

In [77]:
x.view(-1, 8)

tensor([[-0.3207,  0.6449, -0.4905,  1.3452, -1.3559,  0.3588,  1.8640,  0.9712],
        [-0.3755,  1.3294, -0.1590, -0.1769,  0.4891, -0.6439,  1.0613,  1.1723]])

In [79]:
# 如果是只有一个元素的Tensor，使用.item()方法可以把里面的value变成Python的数值

In [81]:
x = torch.randn(1); x

tensor([0.1237])

In [85]:
x.item()

0.12369722872972488

In [87]:
x.data

tensor([0.1237])

In [93]:
# torch的操作，dir(x)

# NumPy和Tensor之间的转换

In [100]:
# Tensor转换为NumPy

In [94]:
a = torch.ones(5); a

tensor([1., 1., 1., 1., 1.])

In [96]:
b = a.numpy(); b  # 这里的a和b共享内存空间

array([1., 1., 1., 1., 1.], dtype=float32)

In [97]:
b[1] = 2; b

array([1., 2., 1., 1., 1.], dtype=float32)

In [101]:
a  # 可以看到a也改变了

tensor([1., 2., 1., 1., 1.])

In [102]:
# NumPy转换为Tensor

In [103]:
import numpy as np

In [104]:
a = np.ones(5); a

array([1., 1., 1., 1., 1.])

In [105]:
b = torch.from_numpy(a); b

tensor([1., 1., 1., 1., 1.], dtype=torch.float64)

In [106]:
np.add(a, 1, out=a); a

array([2., 2., 2., 2., 2.])

# CUDA Tensor

In [107]:
# 使用.to()方法，可以把Tensor移到GPU device上进行加速计算

In [110]:
if torch.cuda.is_available():
    device = torch.device('cuda')  # 程序自动寻找Nvidia的GPU
    y = torch.ones_like(x, device=device)
    x = x.to(device)  # 把Tensor搬到GPU上
    z = x + y
    print(x, '\n', y, '\n', z)
    # print(z.to('cpu', torch.double))  # 把z搬回到CPU上

tensor([0.1237], device='cuda:0') 
 tensor([1.], device='cuda:0') 
 tensor([1.1237], device='cuda:0')


In [111]:
if torch.cuda.is_available():
    device = torch.device('cuda')  # 程序自动寻找Nvidia的GPU
    y = torch.ones_like(x, device=device)
    x = x.to(device)  # 把Tensor搬到GPU上
    z = x + y
    # print(x, '\n', y, '\n', z)
    print(z.to('cpu', torch.double))  # 把z搬回到CPU上

tensor([1.1237], dtype=torch.float64)


In [112]:
# 如果变量在GPU上，不能直接将其转换为CPU。需要先将其转到CPU上

In [113]:
y.cpu().data.numpy()

array([1.], dtype=float32)

In [115]:
# 把整个模型搬到GPU, model = model.cuda()

# 热身：用NumPy实现两层神经网络

In [116]:
# Model = Architecture + Parameters

In [7]:
N, D_in, H, D_out = 64, 1000, 100, 10  # 输入数据个数，输入维度，隐层维度，输出维度

In [118]:
# 随机创建训练数据

In [124]:
X = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(10):
    # 1.forward pass
    h = X.dot(w1)  # N * H
    h_relu = np.maximum(h, 0)  # N * H
    y_pred = h_relu.dot(w2)  # N * D_out
    # 2.compute loss
    loss = np.square(y_pred - y).sum()
    print(f'iter {t}, loss {loss}')
    # Backward pass
    ## compute the gradient
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = X.T.dot(grad_h)
    ## update weights of w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

iter 0, loss 31466996.89913607
iter 1, loss 32718021.204082876
iter 2, loss 44043031.543425515
iter 3, loss 57686727.01357093
iter 4, loss 59058323.04549369
iter 5, loss 38507663.45951154
iter 6, loss 15450405.589939145
iter 7, loss 4814654.10642281
iter 8, loss 1920254.1931992732
iter 9, loss 1159827.4299147371


In [122]:
# 可以看到loss在下降

# PyTorch: Tensor

In [139]:
X = torch.randn(N, D_in).cuda()  # 放在GPU上计算，device='cuda:0'
# X = torch.randn(N, D_in).to('cuda:0')  # 指定device='cuda:0'
y = torch.randn(N, D_out).cuda()

w1 = torch.randn(D_in, H).cuda()
w2 = torch.randn(H, D_out).cuda()

# print(f'X: {X}, y: {y}, w1: {w1}, w2: {w2}')
learning_rate = 1e-6
for t in range(10):
    # 1.forward pass
    h = X.mm(w1)  # N * H
    h_relu = h.clamp(min=0)  # N * H
    y_pred = h_relu.mm(w2)  # N * D_out
    # 2.compute loss
    loss = (y_pred - y).pow(2).sum().item()
    print(f'iter {t}, loss {loss}')
    # Backward pass
    ## compute the gradient
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = X.t().mm(grad_h)
    ## update weights of w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

iter 0, loss 34952572.0
iter 1, loss 35036756.0
iter 2, loss 41241688.0
iter 3, loss 44826480.0
iter 4, loss 38308968.0
iter 5, loss 23406632.0
iter 6, loss 10711394.0
iter 7, loss 4460092.0
iter 8, loss 2149122.5
iter 9, loss 1313488.625


In [130]:
# 简单的autograd()

In [149]:
x = torch.tensor(1., requires_grad=True)
w = torch.tensor(2., requires_grad=True)
b = torch.tensor(3., requires_grad=True)

In [150]:
y = w * x + b; y  # y = 2 * 1 + 3

tensor(5., grad_fn=<AddBackward0>)

In [151]:
# 如果要求y关于x的梯度，可以直接使用y.backward()自动求梯度

In [152]:
y.backward()

In [153]:
print(f'w.grad, {w.grad}')
print(f'x.grad, {w.grad}')
print(f'b.grad, {w.grad}')

w.grad, 1.0
x.grad, 1.0
b.grad, 1.0


# PyTorch: Tensor和autograd

### 需要把数据、损失函数和模型等都放入GPU中

In [154]:
# PyTorch的一个重要功能是autograd，也就是说只要定义了forward pass（前向神经网络），计算了loss后，PyTorch可以自动求导计算模型所有参数的梯度。

In [155]:
# 一个PyTorch的Tensor表示计算图中的一个节点。如果x是一个Tensor并且x.requires_grad=True，那么x.grad是一个另一个储存着x当前梯度（相对于一个scalar，常常是loss）的向量。

In [13]:
X = torch.randn(N, D_in).cuda()  # 放在GPU上计算，device='cuda:0'
# X = torch.randn(N, D_in).to('cuda:0')  # 指定device='cuda:0'
y = torch.randn(N, D_out).cuda()

device = torch.device('cuda:0')
w1 = torch.randn(D_in, H, requires_grad=True, device=device)
w2 = torch.randn(H, D_out, requires_grad=True, device=device)
print(y.is_leaf)
# print(f'X: {X}, y: {y}, w1: {w1}, w2: {w2}')
learning_rate = 1e-6
for t in range(10):
    # 1.forward pass
    # h = X.mm(w1)  # N * H
    # h_relu = h.clamp(min=0)  # N * H
    # y_pred = h_relu.mm(w2)  # N * D_out
    y_pred = X.mm(w1).clamp(min=0).mm(w2).cuda()
    # 2.compute loss
    loss = (y_pred - y).pow(2).sum().cuda()
    print(f'iter {t}, loss {loss}')
    # Backward pass
    ## compute the gradient
    # grad_y_pred = 2.0 * (y_pred - y)
    # grad_w2 = h_relu.t().mm(grad_y_pred)
    # grad_h_relu = grad_y_pred.mm(w2.t())
    # grad_h = grad_h_relu.clone()
    # grad_h[h < 0] = 0
    # grad_w1 = X.t().mm(grad_h)
    loss.backward()
    ## update weights of w1 and w2
    # w1 -= learning_rate * w1.grad
    # w2 -= learning_rate * w2.grad
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

True
iter 0, loss 31492254.0
iter 1, loss 28703134.0
iter 2, loss 31292642.0
iter 3, loss 34162792.0
iter 4, loss 32454456.0
iter 5, loss 24952252.0
iter 6, loss 15218750.0
iter 7, loss 7961044.5
iter 8, loss 4048104.5
iter 9, loss 2262352.0


In [106]:
X = torch.randn(N, D_in).cuda()  # 放在GPU上计算，device='cuda:0'
y = torch.randn(N, D_out).cuda()

w1 = torch.randn(D_in, H, requires_grad=True, device=device)
w2 = torch.randn(H, D_out, requires_grad=True, device=device)

y_pred = X.mm(w1).clamp(min=0).mm(w2).cuda()
loss = (y_pred - y).pow(2).sum().cuda()
# print(f'iter {t}, loss {loss}')
# for i in range(5):
loss.backward()
w1 = w1 - learning_rate * w1.grad
w2 = w2 - learning_rate * w2.grad
print(f'w1, {w1.shape}, w2, {w2.shape}')

w1, torch.Size([1000, 100]), w2, torch.Size([100, 10])


In [108]:
# 放到GPU上运行

In [14]:
X = torch.randn(N, D_in).cuda()
y = torch.randn(N, D_out).cuda()

# device = torch.cuda.device('cuda:0')，这里的错误写法
device = torch.device('cuda:0')
w1 = torch.randn(D_in, H, requires_grad=True, device=device)
w2 = torch.randn(H, D_out, requires_grad=True, device=device)

learning_rate = 1e-6
for it in range(10):
    # 1.forward pass
    y_pred = X.mm(w1).clamp(min=0).mm(w2).cuda()
    # 2.compute loss
    loss = (y_pred - y).pow(2).sum().cuda()
    print(f'iter {it}, loss {loss}')
    # Backward pass
    loss.backward()
    # update weights of w1 and w2
    # 为了不让计算图占内存，使用torch.no_grad()，就不会记住w1和w2的梯度。
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

iter 0, loss 40641304.0
iter 1, loss 44717200.0
iter 2, loss 52143172.0
iter 3, loss 50191388.0
iter 4, loss 33907368.0
iter 5, loss 15680492.0
iter 6, loss 6101679.0
iter 7, loss 2793259.0
iter 8, loss 1728043.875
iter 9, loss 1294621.0


In [None]:
# 在CPU上运行

In [8]:
X = torch.randn(N, D_in)
y = torch.randn(N, D_out)

w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6
for it in range(10):
    # Forward pass
    y_pred = X.mm(w1).clamp(min=0).mm(w2)

    # compute loss
    loss = (y_pred - y).pow(2).sum()  # computation graph
    print(f'it: {it}, loss: {loss}')
    
    # Backward pass
    loss.backward()

    # update weights of w1 and w2
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

it: 0, loss: 28325130.0
it: 1, loss: 24314682.0
it: 2, loss: 23358280.0
it: 3, loss: 22278968.0
it: 4, loss: 19547522.0
it: 5, loss: 15201494.0
it: 6, loss: 10530118.0
it: 7, loss: 6716563.5
it: 8, loss: 4143936.0
it: 9, loss: 2592636.25


# PyTorch: nn

In [7]:
import torch.nn as nn

In [27]:
N, D_in, H, D_out = 64, 1000, 100, 10  # 输入数据个数，输入维度，隐层维度，输出维度

X = torch.randn(N, D_in).cuda()
y = torch.randn(N, D_out).cuda()

device = torch.device('cuda:0')

# w1 = torch.randn(D_in, H, requires_grad=True, device=device)
# w2 = torch.randn(H, D_out, requires_grad=True, device=device)

# model里面只是控制数据输入输出维度的，在定义阶段并不需要传入数据
model = nn.Sequential(
    nn.Linear(in_features=D_in, out_features=H),
    nn.ReLU(),
    nn.Linear(in_features=H, out_features=D_out)
)
model = model.cuda(device=device)

# 初始化线性层参数
nn.init.normal_(model[0].weight)
nn.init.normal_(model[2].weight)

# 损失函数
loss_fn = nn.MSELoss(reduction='sum')
loss_fn = loss_fn.cuda(device=device)

learning_rate = 1e-6
for it in range(10):
    # 1.forward pass
    # y_pred = X.mm(w1).clamp(min=0).mm(w2).cuda()
    y_pred = model(X)  # model.forward()
    # 2.compute loss
    # loss = (y_pred - y).pow(2).sum().cuda()
    loss = loss_fn(y_pred, y)
    print(f'iter {it}, loss {loss}')

    model.zero_grad()  # 清零所有参数梯度
    # Backward pass
    loss.backward()
    # update weights of w1 and w2
    # 为了不让计算图占内存，使用torch.no_grad()，就不会记住w1和w2的梯度。
    # with torch.no_grad():
    #     w1 -= learning_rate * w1.grad
    #     w2 -= learning_rate * w2.grad
    #     w1.grad.zero_()
    #     w2.grad.zero_()
    with torch.no_grad():
        # model.parameters()里有很多参数的梯度，如w1,w2, b1, b2等，
        for param in model.parameters():  # 所有模型的参数都在param里
            param -= learning_rate * param.grad
    # model.zero_grad()  # 清零所有参数梯度

iter 0, loss 46458704.0
iter 1, loss 46189048.0
iter 2, loss 41337848.0
iter 3, loss 28393146.0
iter 4, loss 14855011.0
iter 5, loss 6812826.0
iter 6, loss 3406305.5
iter 7, loss 2061754.75
iter 8, loss 1460919.375
iter 9, loss 1128984.5


In [15]:
model

Sequential(
  (0): Linear(in_features=1000, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=10, bias=True)
)

In [16]:
model[0]

Linear(in_features=1000, out_features=100, bias=True)

In [20]:
model[0].weight

Parameter containing:
tensor([[ 0.0088,  0.0025,  0.0292,  ...,  0.0216, -0.0028,  0.0183],
        [ 0.0043,  0.0249,  0.0010,  ...,  0.0078,  0.0153,  0.0194],
        [-0.0021, -0.0241,  0.0038,  ...,  0.0154, -0.0248,  0.0095],
        ...,
        [ 0.0286,  0.0169, -0.0184,  ...,  0.0045,  0.0137, -0.0122],
        [-0.0193,  0.0260,  0.0232,  ...,  0.0239, -0.0077,  0.0001],
        [-0.0168,  0.0044, -0.0077,  ...,  0.0120, -0.0268, -0.0010]],
       device='cuda:0', requires_grad=True)

# PyTorch: optim

In [28]:
# 这一次我们不再手动更新模型的weights，而是使用optim这个包来帮助我们更新参数。optim这个package提供了各种不同的模型优化方法，包括SGD+momentum，RMSProp，Adam等。

In [33]:
N, D_in, H, D_out = 64, 1000, 100, 10  # 输入数据个数，输入维度，隐层维度，输出维度

X = torch.randn(N, D_in).cuda()
y = torch.randn(N, D_out).cuda()

device = torch.device('cuda:0')

# model里面只是控制数据输入输出维度的，在定义阶段并不需要传入数据
model = nn.Sequential(
    nn.Linear(in_features=D_in, out_features=H),
    nn.ReLU(),
    nn.Linear(in_features=H, out_features=D_out)
)
model = model.cuda(device=device)

# 初始化线性层参数
nn.init.normal_(model[0].weight)
nn.init.normal_(model[2].weight)

# 损失函数
loss_fn = nn.MSELoss(reduction='sum')
loss_fn = loss_fn.cuda(device=device)

# learning_rate = 1e-4
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

learning_rate = 1e-6
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for it in range(10):
    # 1.forward pass
    y_pred = model(X)  # model.forward()
    
    # 2.compute loss
    loss = loss_fn(y_pred, y)
    print(f'iter {it}, loss {loss}')

    # model.zero_grad()  # 清零所有参数梯度
    optimizer.zero_grad()

    # Backward pass
    loss.backward()

    # update model parameters
    optimizer.step()  # 一步更新所有的参数

# 所以我们要做的就是：定义一个optimizer，求导之前将梯度清空，求导滞后将梯度更新。

iter 0, loss 30414648.0
iter 1, loss 29018474.0
iter 2, loss 32293482.0
iter 3, loss 34286220.0
iter 4, loss 30563664.0
iter 5, loss 21128370.0
iter 6, loss 11583691.0
iter 7, loss 5615385.0
iter 8, loss 2824090.75
iter 9, loss 1642769.625


# PyTorch：自定义nn.Module

In [34]:
# 我们可以定义一个模型，这个模型继承自nn.Module类。如果需要定义一个比Sequential模型更复杂的模型，就需要自定义nn.Module模型

In [13]:
N, D_in, H, D_out = 64, 1000, 100, 10  # 输入数据个数，输入维度，隐层维度，输出维度

X = torch.randn(N, D_in).cuda()
y = torch.randn(N, D_out).cuda()

device = torch.device('cuda:0')

# model里面只是控制数据输入输出维度的，在定义阶段并不需要传入数据
# model = nn.Sequential(
#     nn.Linear(in_features=D_in, out_features=H),
#     nn.ReLU(),
#     nn.Linear(in_features=H, out_features=D_out)
# )

# 要想写出更复杂的模型，我们需要继承nn.Module
class TwoLayerNet(nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        # define the model architecture
        self.linear1 = nn.Linear(in_features=D_in, out_features=H)
        self.linear2 = nn.Linear(in_features=H, out_features=D_out)

    def forward(self, x):
        y_pred = self.linear2(self.linear1(x).clamp(min=0))
        return y_pred

model = TwoLayerNet(D_in, H, D_out)
model = model.cuda(device=device)

# # 初始化线性层参数
# nn.init.normal_(model[0].weight)
# nn.init.normal_(model[2].weight)

# 损失函数
loss_fn = nn.MSELoss(reduction='sum')
loss_fn = loss_fn.cuda(device=device)

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for it in range(10):
    y_pred = model(X)
    loss = loss_fn(y_pred, y)
    print(f'iter {it}, loss {loss}')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

iter 0, loss 619.3310546875
iter 1, loss 602.7033081054688
iter 2, loss 586.5711669921875
iter 3, loss 570.8697509765625
iter 4, loss 555.6207885742188
iter 5, loss 540.8893432617188
iter 6, loss 526.600341796875
iter 7, loss 512.7101440429688
iter 8, loss 499.1954650878906
iter 9, loss 486.0283508300781
