# Pytorch Introduction

In [24]:
import torch
import numpy as np
import torch.nn as nn

## Tensors

In [9]:
x_numpy = np.array([0.1, 0.2, 0.3])
x_torch = torch.tensor([0.1, 0.2, 0.3])
print('x_numpy, x_torch')
print(x_numpy, x_torch)

print('to and form numpy and pytorch')
print(torch.from_numpy(x_numpy),x_torch.numpy())

y_numpy = np.array([3, 4, 5])
y_torch = torch.tensor([3, 4, 5])
print("x+y")
print(x_numpy + y_numpy, x_torch + y_torch)

print("norm")
print(np.linalg.norm(x_numpy), torch.norm(x_torch))
print()

print("mean along the 0th dimension")
x_numpy = np.array([[1, 2], [3, 4.]])
x_torch = torch.tensor([[1, 2], [3, 4.]])
print(np.mean(x_numpy, axis=0), torch.mean(x_torch, dim = 0))

x_numpy, x_torch
[0.1 0.2 0.3] tensor([0.1000, 0.2000, 0.3000])
to and form numpy and pytorch
tensor([0.1000, 0.2000, 0.3000], dtype=torch.float64) [0.1 0.2 0.3]
x+y
[3.1 4.2 5.3] tensor([3.1000, 4.2000, 5.3000])
norm
0.37416573867739417 tensor(0.3742)

mean along the 0th dimension
[2. 3.] tensor([2., 3.])


## Tensor.view

We can use the Tensor.view() function to reshape tensors similarly to numpy.reshape()

It can also automatically calculate the correct dimension if a -1 is passed in. This is useful if we are working with batches, but the batch size is unknown.

In [10]:
N,C,W,H = 1000,3,28,28
X = torch.randn((N, C, W, H))
print(X.shape)
print(X.view(N, C, 784).shape)
print(X.view(-1, C, 784).shape)

torch.Size([1000, 3, 28, 28])
torch.Size([1000, 3, 784])
torch.Size([1000, 3, 784])


## Computation graphs

What's special about PyTorch's tensor object is that it implicitly creates a computation graph in the background. A computation graph is a a way of writing a mathematical expression as a graph. There is an algorithm to compute the gradients of all the variables of a computation graph in time on the same order it is to compute the function itself.

计算图计算的顺序和函数表达式求解是一致的。

其中的重点是使用计算图可以很方便地进行梯度求解，在这些框架中，使用的都是类似原理，在pytorch中，基本思路就是：

````python 
y = f(x)
y.backward()
x.grad
````

In [12]:
a = torch.tensor(2.0, requires_grad = True)
b =  torch.tensor(21.0, requires_grad = True)
c = a + b
d = b + 1
e = c * d
print('c', c)
print('d', d)
print('e', e)

c tensor(23., grad_fn=<AddBackward0>)
d tensor(22., grad_fn=<AddBackward0>)
e tensor(506., grad_fn=<MulBackward0>)


In [13]:
def f(x):
    return (x-2)**2

def fp(x):
    return 2*(x-2)

x = torch.tensor([1.0], requires_grad=True)

y = f(x)
y.backward()

print('Analytical f\'(x):', fp(x))
print('PyTorch\'s f\'(x):', x.grad)

Analytical f'(x): tensor([-2.], grad_fn=<MulBackward0>)
PyTorch's f'(x): tensor([-2.])


In [14]:
def g(w):
    return 2*w[0]*w[1] + w[1]*torch.cos(w[0])

def grad_g(w):
    return torch.tensor([2*w[1] - w[1]*torch.sin(w[0]), 2*w[0] + torch.cos(w[0])])

w = torch.tensor([np.pi, 1], requires_grad=True)

z = g(w)
z.backward()

print('Analytical grad g(w)', grad_g(w))
print('PyTorch\'s grad g(w)', w.grad)

Analytical grad g(w) tensor([2.0000, 5.2832])
PyTorch's grad g(w) tensor([2.0000, 5.2832])


## Using the gradients

In [16]:
'''
f(x) = (x-2) ^2 
find the minimum
'''
x = torch.tensor([5.0], requires_grad=True)
step_size = 0.25

print('iter,\tx,\tf(x),\tf\'(x),\tf\'(x) pytorch')
for i in range(15):
    y = f(x)
    y.backward() # compute the gradient
    
    print('{},\t{:.3f},\t{:.3f},\t{:.3f},\t{:.3f}'.format(i, x.item(), f(x).item(), fp(x).item(), x.grad.item()))
    
    x.data = x.data - step_size * x.grad # perform a GD update step
    
    # We need to zero the grad variable since the backward()
    # call accumulates the gradients in .grad instead of overwriting.
    # The detach_() is for efficiency. You do not need to worry too much about it.
    x.grad.detach_()
    x.grad.zero_()

iter,	x,	f(x),	f'(x),	f'(x) pytorch
0,	5.000,	9.000,	6.000,	6.000
1,	3.500,	2.250,	3.000,	3.000
2,	2.750,	0.562,	1.500,	1.500
3,	2.375,	0.141,	0.750,	0.750
4,	2.188,	0.035,	0.375,	0.375
5,	2.094,	0.009,	0.188,	0.188
6,	2.047,	0.002,	0.094,	0.094
7,	2.023,	0.001,	0.047,	0.047
8,	2.012,	0.000,	0.023,	0.023
9,	2.006,	0.000,	0.012,	0.012
10,	2.003,	0.000,	0.006,	0.006
11,	2.001,	0.000,	0.003,	0.003
12,	2.001,	0.000,	0.001,	0.001
13,	2.000,	0.000,	0.001,	0.001
14,	2.000,	0.000,	0.000,	0.000


## Linear Regression
Now, instead of minimizing a made-up function, lets minimize a loss function on some made-up data.

We will implement Gradient Descent in order to solve the task of linear regression.

notes：
@符号设计，其实就是想简化表达式的书写，很好理解，.符号无法使用的情况下。
### before pytorch
以下是一个线性回归的例子，y，即label，用的是原公式加噪声模拟数据误差。首先是求了解析梯度验证，然后进行梯度下降，用梯度下降找到拟合的w，对比原来的w。

In [19]:
# make a simple linear dataset with some noise

d = 2
n = 50
X = torch.randn(n,d)
true_w = torch.tensor([[-1.0], [2.0]])
y = X @ true_w + torch.randn(n,1) * 0.1
print('X shape', X.shape)
print('y shape', y.shape)
print('w shape', true_w.shape)

X shape torch.Size([50, 2])
y shape torch.Size([50, 1])
w shape torch.Size([2, 1])


In [20]:
def model(X, w):
    return X @ w

# the residual sum of squares loss function
def rss(y, y_hat):
    return torch.norm(y - y_hat)**2 / n

# analytical expression for the gradient
def grad_rss(X, y, w):
    return -2*X.t() @ (y - X @ w) / n

w = torch.tensor([[1.], [0]], requires_grad=True)
y_hat = model(X, w)

loss = rss(y, y_hat)
loss.backward()

# notes : detach Returns a new Tensor, detached from the current graph.
print('Analytical gradient', grad_rss(X, y, w).detach().view(2).numpy())
print('PyTorch\'s gradient', w.grad.view(2).numpy())

Analytical gradient [ 4.4079127 -4.184778 ]
PyTorch's gradient [ 4.4079123 -4.184779 ]


In [22]:
'''
This example is an illustration to connect ideas we have seen before to PyTorch's way of doing things
'''
step_size = 0.1

print('iter,\tloss,\tw')
for i in range(20):
    y_hat = model(X, w)
    loss = rss(y, y_hat)
    
    loss.backward() # compute the gradient of the loss
    
    w.data = w.data - step_size * w.grad # do a gradient descent step
    
    print('{},\t{:.2f},\t{}'.format(i, loss.item(), w.view(2).detach().numpy()))
    
    # We need to zero the grad variable since the backward()
    # call accumulates the gradients in .grad instead of overwriting.
    # The detach_() is for efficiency. You do not need to worry too much about it.
    w.grad.detach()
    w.grad.zero_()

print('\ntrue w\t\t', true_w.view(2).numpy())
print('estimated w\t', w.view(2).detach().numpy())

iter,	loss,	w
0,	8.68,	[0.11841756 0.83695585]
1,	2.87,	[-0.12975451  1.0818826 ]
2,	1.78,	[-0.32358998  1.2761402 ]
3,	1.11,	[-0.47494987  1.4302471 ]
4,	0.69,	[-0.59311265  1.5525312 ]
5,	0.43,	[-0.6853354  1.6495874]
6,	0.27,	[-0.7572931  1.7266393]
7,	0.17,	[-0.81342274  1.7878251 ]
8,	0.11,	[-0.8571928  1.8364242]
9,	0.07,	[-0.89131415  1.8750356 ]
10,	0.05,	[-0.9179049  1.9057199]
11,	0.03,	[-0.9386199  1.9301109]
12,	0.02,	[-0.9547516  1.9495045]
13,	0.02,	[-0.9673093  1.9649286]
14,	0.01,	[-0.9770808  1.9771991]
15,	0.01,	[-0.98468107  1.9869633 ]
16,	0.01,	[-0.99058986  1.9947354 ]
17,	0.01,	[-0.99518144  2.0009234 ]
18,	0.01,	[-0.99874765  2.0058515 ]
19,	0.01,	[-1.001516   2.0097775]

true w		 [-1.  2.]
estimated w	 [-1.001516   2.0097775]


### 认识Pytorch 

#### nn.Modules

Modules是pytorch中的重要概念，简单来说就是实现神经网络模块化，若你想自定义模块，必须继承它，并定义forward。

它并不是层次的概念，而是更类似于network的部分，比如下面举一个重要而基础的例子Linear模块，从网络结构来看其实就是全连接，从公式来看就是最基础的线性公式。

其次，模块套模块，对比tensorflow中，这种功能是由function来实现的。

In [25]:
d_in = 3
d_out = 4
linear_module = nn.Linear(d_in, d_out)

example_tensor = torch.tensor([[1.,2,3], [4,5,6]])
# applys a linear transformation to the data
transformed = linear_module(example_tensor)
print('example_tensor', example_tensor.shape)
print('transormed', transformed.shape)
print()
print('We can see that the weights exist in the background\n')
print('W:', linear_module.weight)
print('b:', linear_module.bias)

example_tensor torch.Size([2, 3])
transormed torch.Size([2, 4])

We can see that the weights exist in the background

W: Parameter containing:
tensor([[ 0.0282,  0.4551,  0.0534],
        [-0.0860,  0.3267,  0.0478],
        [-0.3098, -0.2641,  0.0881],
        [ 0.4435,  0.0684,  0.2904]], requires_grad=True)
b: Parameter containing:
tensor([ 0.4604,  0.5394,  0.2120, -0.0237], requires_grad=True)


In [26]:
activation_fn = nn.ReLU() # we instantiate an instance of the ReLU module
example_tensor = torch.tensor([-1.0, 1.0, 0.0])
activated = activation_fn(example_tensor)
print('example_tensor', example_tensor)
print('activated', activated)

example_tensor tensor([-1.,  1.,  0.])
activated tensor([0., 1., 0.])


#### Activation function

In [27]:
activation_fn = nn.ReLU() # we instantiate an instance of the ReLU module
example_tensor = torch.tensor([-1.0, 1.0, 0.0])
activated = activation_fn(example_tensor)
print('example_tensor', example_tensor)
print('activated', activated)

example_tensor tensor([-1.,  1.,  0.])
activated tensor([0., 1., 0.])


#### Sequential

Many times, we want to compose Modules together. torch.nn.Sequential provides a good interface for composing simple modules.

这个东西要结合Modules的概念来理解，就很简单了。

In [28]:
d_in = 3
d_hidden = 4
d_out = 1
model = torch.nn.Sequential(
                            nn.Linear(d_in, d_hidden),
                            nn.Tanh(),
                            nn.Linear(d_hidden, d_out),
                            nn.Sigmoid()
                           )

example_tensor = torch.tensor([[1.,2,3],[4,5,6]])
transformed = model(example_tensor)
print('transformed', transformed.shape)

transformed torch.Size([2, 1])


#### Model Parameters

In [29]:
params = model.parameters()

for param in params:
    print(param)

Parameter containing:
tensor([[-0.5052,  0.4948,  0.0198],
        [ 0.1981, -0.0647,  0.0489],
        [ 0.3133,  0.5347,  0.4618],
        [-0.5004,  0.0650,  0.2572]], requires_grad=True)
Parameter containing:
tensor([-0.0421,  0.1236,  0.3876, -0.0447], requires_grad=True)
Parameter containing:
tensor([[0.4422, 0.2688, 0.0525, 0.0794]], requires_grad=True)
Parameter containing:
tensor([0.4016], requires_grad=True)


#### loss function

In [30]:
mse_loss_fn = nn.MSELoss()

input = torch.tensor([[0., 0, 0]])
target = torch.tensor([[1., 0, -1]])

loss = mse_loss_fn(input, target)

print(loss)

tensor(0.6667)


#### torch.optim
PyTorch implements a number of gradient-based optimization methods in torch.optim, including Gradient Descent. At the minimum, it takes in the model parameters and a learning rate.

这个就是抽象出优化器的模块。使用的特点就是，我们从此把w交给optim，不再直接操作w，让optim给我们操作，即对optim对象执行梯度下降。

In [31]:
# create a simple model
model = nn.Linear(1, 1)

# create a simple dataset
X_simple = torch.tensor([[1.]])
y_simple = torch.tensor([[2.]])

# create our optimizer
optim = torch.optim.SGD(model.parameters(), lr=1e-2)
mse_loss_fn = nn.MSELoss()

y_hat = model(X_simple)
print('model params before:', model.weight)
loss = mse_loss_fn(y_hat, y_simple)
optim.zero_grad()
loss.backward()
optim.step()
print('model params after:', model.weight)


model params before: Parameter containing:
tensor([[-0.0216]], requires_grad=True)
model params after: Parameter containing:
tensor([[0.0332]], requires_grad=True)


### Linear Regression in pytorchic way

In [33]:
step_size = 0.1

linear_module = nn.Linear(d, 1, bias=False)

loss_func = nn.MSELoss()

optim = torch.optim.SGD(linear_module.parameters(), lr=step_size)

print('iter,\tloss,\tw')

for i in range(20):
    y_hat = linear_module(X) # model由函数指定，现在用modules指定
    loss = loss_func(y_hat, y)
    optim.zero_grad()
    loss.backward()
    optim.step() # 不对w进行操作，交予optim
    
    print('{},\t{:.2f},\t{}'.format(i, loss.item(), linear_module.weight.view(2).detach().numpy()))

print('\ntrue w\t\t', true_w.view(2).numpy())
print('estimated w\t', linear_module.weight.view(2).detach().numpy())
'''
step_size = 0.1

print('iter,\tloss,\tw')
for i in range(20):
    y_hat = model(X, w) # model由函数指定，现在用modules指定
    loss = rss(y, y_hat)
    
    loss.backward() # compute the gradient of the loss
    
    w.data = w.data - step_size * w.grad # do a gradient descent step
    
    print('{},\t{:.2f},\t{}'.format(i, loss.item(), w.view(2).detach().numpy()))
    
    # We need to zero the grad variable since the backward()
    # call accumulates the gradients in .grad instead of overwriting.
    # The detach_() is for efficiency. You do not need to worry too much about it.
    w.grad.detach()
    w.grad.zero_()

print('\ntrue w\t\t', true_w.view(2).numpy())
print('estimated w\t', w.view(2).detach().numpy())
'''

iter,	loss,	w
0,	3.28,	[-0.663055   0.6294377]
1,	2.09,	[-0.7519799  0.9052835]
2,	1.33,	[-0.819157   1.1263719]
3,	0.85,	[-0.86975116  1.3036222 ]
4,	0.55,	[-0.9077259  1.4457664]
5,	0.35,	[-0.9361185  1.559789 ]
6,	0.23,	[-0.95725304  1.6512785 ]
7,	0.15,	[-0.9729048  1.724708 ]
8,	0.10,	[-0.98442745  1.7836583 ]
9,	0.07,	[-0.992851   1.8309971]
10,	0.05,	[-0.99895763  1.8690217 ]
11,	0.03,	[-1.0033396  1.8995726]
12,	0.02,	[-1.0064446  1.9241252]
13,	0.02,	[-1.0086095  1.9438621]
14,	0.02,	[-1.0100873  1.9597319]
15,	0.01,	[-1.0110669  1.9724956]
16,	0.01,	[-1.0116892  1.9827635]
17,	0.01,	[-1.0120583  1.9910258]
18,	0.01,	[-1.0122509  1.9976757]
19,	0.01,	[-1.0123231  2.003029 ]

true w		 [-1.  2.]
estimated w	 [-1.0123231  2.003029 ]


## Linear regression using Stochastic Gradient Descent

In [40]:
'''
每次随机取一个x，只对选取x做梯度下降，iter次数需%save适当增加
'''

step_size = 0.01

linear_module = nn.Linear(d, 1)
loss_func = nn.MSELoss()
optim = torch.optim.SGD(linear_module.parameters(), lr=step_size)
print('iter,\tloss,\tw')
for i in range(200):
    rand_idx = np.random.choice(n) # take a random point from the dataset
    x = X[rand_idx] 
    y_hat = linear_module(x)
    loss = loss_func(y_hat, y[rand_idx]) # only compute the loss on the single point
    optim.zero_grad()
    loss.backward()
    optim.step()
    
    if i % 20 == 0:
        print('{},\t{:.2f},\t{}'.format(i, loss.item(), linear_module.weight.view(2).detach().numpy()))

print('\ntrue w\t\t', true_w.view(2).numpy())
print('estimated w\t', linear_module.weight.view(2).detach().numpy())

iter,	loss,	w
0,	4.15,	[ 0.02381387 -0.11205033]
20,	0.05,	[-0.3758768   0.45335925]
40,	1.15,	[-0.7061731  0.7819934]
60,	1.01,	[-0.8895813  1.0832611]
80,	0.02,	[-1.0228333  1.4879164]
100,	0.01,	[-1.0442263  1.7136205]
120,	0.00,	[-0.9997265  1.830707 ]
140,	0.04,	[-0.99125946  1.8812301 ]
160,	0.01,	[-0.98673844  1.9160957 ]
180,	0.01,	[-1.009729   1.9443203]

true w		 [-1.  2.]
estimated w	 [-1.0163108  1.9696457]


## CrossEntropyLoss

输入为(N, C), N个样本，每个样本C个输出，CrossEntropyLoss做了softmax，然后求CrossEntropy，输出最终的loss。

In [45]:
loss = nn.CrossEntropyLoss()
# 可以从输入预测loss，比如-1，1明显softmax之后就是1
input = torch.tensor([[-1., 1],[-1, 1],[1, -1]]) # raw scores correspond to the correct class
input = torch.tensor([[-3., 3],[-3, 3],[3, -3]]) # raw scores correspond to the correct class with higher confidence
input = torch.tensor([[1., -1],[1, -1],[-1, 1]]) # raw scores correspond to the incorrect class
# input = torch.tensor([[3., -3],[3, -3],[-3, 3]]) # raw scores correspond to the incorrect class with incorrectly placed confidence

target = torch.tensor([1, 1, 0])
output = loss(input, target)
print(output)

tensor(2.1269)


## Dataset class

torch.utils.data.Dataset is an abstract class representing a dataset. Your custom dataset should inherit Dataset and override the following methods:

- `__len__` so that len(dataset) returns the size of the dataset.
- `__getitem__` to support the indexing such that dataset[i] can be used to get i\ th sample

torch.utils.data.DataLoader is an iterator which provides all these features. 
- Batching the data
- Shuffling the data
- Load the data in parallel using multiprocessing workers.

Parameters used below should be clear. One parameter of interest is collate_fn. You can specify how exactly the samples need to be batched using collate_fn. However, default collate should work fine for most use cases.

In [52]:
class FaceLandmarksDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.landmarks_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # 应该是实现懒加载
        img_name = os.path.join(self.root_dir,
                                self.landmarks_frame.iloc[idx, 0])
        image = io.imread(img_name)
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        landmarks = np.array([landmarks])
        landmarks = landmarks.astype('float').reshape(-1, 2)
        sample = {'image': image, 'landmarks': landmarks}

        if self.transform:
            sample = self.transform(sample)

        return sample

In [53]:
dataloader = DataLoader(transformed_dataset, batch_size=4,
                        shuffle=True, num_workers=4)

for i_batch, sample_batched in enumerate(dataloader):
    print(i_batch, sample_batched['image'].size(),
          sample_batched['landmarks'].size())

RuntimeError: DataLoader worker (pid(s) 71720, 53892, 69340, 69948) exited unexpectedly