# 第一章节
## 1.线性回归

In [2]:
import torch

In [4]:
import numpy as np

为变量附上梯度
假设我们想对函数 $f = 2x^2$ 求关于 $x$ 的导数。先创建变量 x 并赋予初值, 同时设置 requires_grad=True, 让 autograd 为它记录梯度。

In [29]:
# 2x2 leaf tensor; requires_grad=True asks autograd to record operations on it.
x = torch.tensor(
    [[1.0, 2.0],
     [3.0, 4.0]],
    requires_grad=True,
)

In [30]:
# Build z = 2 * x ** 2 in two element-wise steps (y is the intermediate 2 * x).
y = torch.mul(x, 2)
z = torch.mul(y, x)

注意: 不带参数的 backward 只能作用于标量。这里算出来的 z 是一个 2×2 的张量而不是标量, 需要先把它转成标量再反向传播, 即 z.sum().backward()。

In [31]:
# Reduce z to a scalar first; backward() is called here without a gradient argument.
torch.sum(z).backward()

In [32]:
x.grad # gradient of z.sum() with respect to x; since z = 2 * x ** 2 this is 4 * x

tensor([[ 4.,  8.],
        [12., 16.]])

### 线性回归

给定数据点集合 X 和对应的目标值 y, 线性模型的目标是找到一条由权重向量 w 和偏置 b 确定的直线(高维时为超平面), 并用 $\hat{y}$ 表示预测值:
$$\hat{y} = Xw + b$$
训练的目标是最小化所有数据点上的平方误差之和:
$$\sum_{i=1}^{n} \left(\hat{y}^{(i)} - y^{(i)}\right)^2$$

创建数据集
y[i] = 2 * X[i][0] + 3.4 * X[i][1] + 4.2 + noise
其中 noise 服从均值为 0、标准差为 0.1(即方差为 0.01)的正态分布。

In [299]:
# Size of the synthetic regression problem: features per example, number of examples.
num_inputs, num_examples = 2, 1000
# Ground-truth weights and bias used to generate the targets.
true_w, true_b = [2, 3.4], 4.2

In [300]:
# Standard-normal features. The shape comes from num_examples / num_inputs rather
# than repeated literals, so it cannot drift from data_iter(), which indexes
# num_examples rows of X.
X = torch.randn(num_examples, num_inputs)

In [301]:
# Noise-free targets Y = X @ true_w + true_b. The matrix-vector product covers
# every feature column at once instead of spelling out one term per column.
Y = X @ torch.tensor(true_w, dtype=X.dtype) + true_b

In [302]:
# Add Gaussian noise with standard deviation 0.1 to the targets.
# This updates Y in place, so re-running the cell adds another round of noise.
Y += torch.randn(Y.shape) * 0.1

### 数据读取
当我们开始训练神经网络的时候, 我们需要不断地读取数据块(小批量)。
这里通过 Python 的 yield 构造一个生成器, 它先把样本顺序打乱, 然后每次返回 batch_size 个样本及其对应的标签。

In [303]:
import random

batch_size = 10


def data_iter():
    """Yield shuffled minibatches ``(X[rows], Y[rows])`` that cover the data once.

    Reads the notebook-level ``X``, ``Y``, ``num_examples`` and ``batch_size``.
    The final batch is shorter when ``num_examples`` is not a multiple of
    ``batch_size``.
    """
    order = list(range(num_examples))
    random.shuffle(order)
    for start in range(0, num_examples, batch_size):
        # Slicing past the end of the list is clamped, so no explicit bound is needed.
        rows = order[start:start + batch_size]
        yield X[rows], Y[rows]

In [304]:
# Draw a single minibatch to inspect what the iterator yields.
# NOTE: this rebinds the notebook-level names x and y used in the autograd example.
x, y = next(data_iter())

In [305]:
# Features of the minibatch just drawn from data_iter() (one row per example).
x

tensor([[ 0.2308,  1.4141],
        [ 0.6318,  0.5003],
        [ 0.5387,  0.3124],
        [-1.9866,  0.1278],
        [-0.1211, -0.6213],
        [ 0.6104,  0.9932],
        [ 0.8174,  0.5257],
        [ 0.0466,  1.6091],
        [ 0.5747, -0.9410],
        [ 0.3665, -1.2202]])

In [306]:
# Targets of the minibatch just drawn from data_iter().
y

tensor([9.4122, 7.1353, 6.4539, 0.8080, 1.9595, 8.7703, 7.5319, 9.7548, 2.1617,
        0.8375])

### 初始化模型参数

In [307]:
# Random initial weights as a (num_inputs, 1) column, tracked by autograd.
# Sized from num_inputs so it always matches the feature dimension of X.
w = torch.randn((num_inputs, 1), requires_grad=True)

In [308]:
# Bias stored as a 1-element tensor, initialised to zero and tracked by autograd.
b = torch.zeros(1, requires_grad=True)

In [309]:
# Collect the trainable tensors so the update step can loop over them.
params = [w, b]

### 定义模型

In [310]:
def net(x):
    """Linear model: return ``x @ w + b`` using the notebook-level ``w`` and ``b``."""
    weighted = torch.matmul(x, w)
    return weighted + b

### 损失函数

In [311]:
def square_loss(yhat, y):
    """Return the element-wise squared error between predictions and targets.

    ``y`` is first viewed with the shape of ``yhat``, so the subtraction is
    element-wise (for example, an (n, 1) prediction minus an (n,) target would
    otherwise broadcast to (n, n)). No reduction is applied.
    """
    target = y.view(yhat.size())
    return (yhat - target).pow(2)

### 优化
通过小批量随机梯度下降求解: 每一步, 我们将模型参数沿着梯度的反方向移动一小步, 步长由学习率 lr 控制(更新量 = 学习率 × 梯度)。由于损失是对一个批量内的样本求和得到的, 更新时还要把梯度除以 batch_size 取平均。



In [312]:
def SGD(params, lr, batch_size):
    """Apply one in-place gradient-descent step to every tensor in ``params``.

    Each parameter is updated as ``param -= lr * param.grad / batch_size``.
    Dividing by ``batch_size`` turns a gradient of a summed batch loss into a
    per-example average. Gradients are not cleared here.

    Args:
        params: iterable of tensors whose ``.grad`` has been populated.
        lr: learning rate (step-size multiplier).
        batch_size: number of examples the gradient was summed over.
    """
    # Update under no_grad() instead of going through the legacy ``.data``
    # attribute: the step is not recorded by autograd, but the tensors stay
    # visible to autograd's in-place modification checks.
    with torch.no_grad():
        for param in params:
            if param.grad is None:
                # Parameter took no part in the last backward pass; leave it unchanged.
                continue
            param -= lr * param.grad / batch_size

### 训练

In [318]:
# Minibatch SGD training loop for the linear model.
epochs = 5
lr = 0.01
for epoch in range(epochs):
    # Running sum of the minibatch losses for this epoch, kept as a plain float.
    total_loss = 0.0
    for data, label in data_iter():
        output = net(data)
        # Summed (not averaged) loss over the batch; SGD() divides by batch_size.
        loss = square_loss(output, label).sum()
        # .item() detaches the value, so no autograd history piles up across iterations.
        total_loss += loss.item()
        loss.backward()
        SGD(params, lr, batch_size)
        # backward() accumulates into .grad, so clear every parameter before the next batch.
        for param in params:
            param.grad.zero_()
    # Evaluation only: no graph is needed for the full-dataset loss.
    with torch.no_grad():
        train_l = square_loss(net(X), Y)
    print (f"epoch: {epoch}, average_loss: {train_l.mean()}")




epoch: 0, average_loss: 0.010231736116111279
epoch: 1, average_loss: 0.010228591039776802
epoch: 2, average_loss: 0.010221009142696857
epoch: 3, average_loss: 0.01022170577198267
epoch: 4, average_loss: 0.010225989855825901


In [322]:
# Weights after training; compare with true_w = [2, 3.4].
w

tensor([[1.9993],
        [3.4067]], requires_grad=True)

In [323]:
# Bias after training; compare with true_b = 4.2.
b

tensor([4.2003], requires_grad=True)