# prerequisite for pytorch

In [1]:
import torch 

In [3]:
print(torch.__version__)

1.7.1


## Data Operation

### Create Tensor Variable

In [51]:
# create tensor variable

# torch.rand(*size, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False)
x = torch.empty(3,4,dtype=torch.int32)
print(x)
print(x.dtype)

x1 = torch.empty([3,4,5])
print(x1)
print(x1.dtype)


# x=torch.zeros([3,4,5])
# x=torch.ones([3,4,5])
# x=torch.eye([3,3]) # diagonal=1
# x=torch.rand([3,4,5]) # uniform distribution

x3 = x.new_ones([3,4,5]) # return the same torch.dtype&torch.device as x
x4 = torch.randn_like(x1) # return a standard uniform distribution in the form of x1 (has to be float type or sth)
print(x3)
print(x4)

x5 = torch.Tensor(5,3) # Tensor(*size), tensor(data,)
print(x5)

x6 = torch.arange(1,10,1) # step
# x7 = torch.arange(1,10,2) # pieces_cut
x7 = torch.randperm(10) # pieces_cut
print(x6)
print(x7)

# x8 = torch.normal(mean, std)
# x9 = torch.uniform(from,to)


tensor([[7, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]], dtype=torch.int32)
torch.int32
tensor([[[ 2.8026e-44,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00, -2.2729e+11,  4.5828e-41, -2.2729e+11],
         [ 4.5828e-41, -2.2729e+11,  4.5828e-41,  0.0000e+00,  0.0000e+00]],

        [[ 6.4890e-07,  2.5038e-12,  6.3371e-10,  7.9348e+17,  1.3556e-19],
         [ 1.8567e-01,  2.8376e+20,  1.8180e+31, -2.2729e+11,  4.5828e-41],
         [-2.2729e+11,  4.5828e-41, -2.2730e+11,  4.5828e-41,  1.4013e-45],
         [ 0.0000e+00,  1.3556e-19,  1.8567e-01,  7.5553e+28,  5.2839e-11]],

        [[ 1.1429e+33,  1.7226e+22,  1.8040e+28,  3.4740e-12,  0.0000e+00],
         [ 0.0000e+00,  1.4013e-45,  0.0000e+00,  8.4118e-31,  1.4013e-45],
         [ 1.6144e-41,  4.0473e-11,  3.8519e-34,  0.0000e+00,  1.4013e-45],
         [ 2.3511e-38,  5.5144e-31,  2.5250e-29,  3.1799e-32, 

In [46]:
# directly create a tensor, just like numpy
import numpy as np
print(torch.tensor([3,3,3,3]))
print(torch.tensor(np.array([[2,2,2,2],[3,3,3,3]]))) # create a tensor through numpy.array

print(x.shape) # return a tuple
print(x.size())

# transformation with numpy 
print(x.numpy())
print(torch.from_numpy(x.numpy()))

tensor([3, 3, 3, 3])
tensor([[2, 2, 2, 2],
        [3, 3, 3, 3]])
torch.Size([3, 4])
torch.Size([3, 4])
[[7 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
tensor([[7, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]], dtype=torch.int32)


### Operations

- 在PyTorch中，所有operation的赋值都会导致新的变量地址开辟，比如a=a+1
- 所以要想keep地址不变，用+=或者operation_的操作
- .add()和.add_()都能把两个张量加起来，但.add_是in-place操作，比如x.add_(y)，x+y的结果会存储到原来的x中。Torch里面所有带"_"的操作，都是in-place的。



In [59]:
# operations

# add
x = torch.tensor([1,2])
y = torch.tensor([2,1])
print(x+y)
print(torch.add(x,y))
print('size:',x.size())

result = torch.empty(2) # according to the size()
torch.add(x,y,out=result)
print(result)

z = y.add(x)
print(y) # it doesn't change!
z = y.add_(x) # inplace should use add_, it changes y
print(y)
print(z)


tensor([3, 3])
tensor([3, 3])
size: torch.Size([2])
tensor([3., 3.])
tensor([2, 1])
tensor([3, 3])
tensor([3, 3])


### Index
- PyTorch中，index是不改变id的操作，view也是
- 对view如果我们想返回一个真正新的副本（即不共享data内存）该怎么办呢？Pytorch还提供了一个reshape()可以改变形状，但是此函数并不能保证返回的是其拷贝，所以不推荐使用。推荐先用clone创造一个副本然后再使用view

In [85]:
# index
from operator import *

# the index result shares the same memory with the original data

x = torch.rand([2,3])
print(x)
print(x[1:])
print(x[:,1])

y = x[1:]
y += 1
print(x)
print(y)

# advanced index functions
z = torch.index_select(x,1,torch.tensor([0,1]))# the second parameter stands for by row or column, the third parameter stands for the id of the row/column.
print(z)

z = torch.masked_select(x,x>1)
print(z)
mask = x.le(1) # less equal, ge: greater equal, gt: greater than, eq: equal
print(mask)
z = torch.masked_select(x,mask)
print(z)

print(torch.nonzero(x)) # return the index of nonzero elements

print(torch.gather(x,0,torch.tensor([[1,0,0],[0,1,0]])))  # 沿给定轴dim，将输入索引张量index指定位置的值进行聚合。
print(torch.gather(x,1,torch.tensor([[1,0,0],[0,1,0]])))  # 沿列，1代表这个位置的值变为这一列的第二个值，0代表这个位置的值变为这一列的第一个值



tensor([[0.0010, 0.7618, 0.2470],
        [0.5468, 0.5794, 0.8021]])
tensor([[0.5468, 0.5794, 0.8021]])
tensor([0.7618, 0.5794])
tensor([[9.8264e-04, 7.6176e-01, 2.4699e-01],
        [1.5468e+00, 1.5794e+00, 1.8021e+00]])
tensor([[1.5468, 1.5794, 1.8021]])
tensor([[9.8264e-04, 7.6176e-01],
        [1.5468e+00, 1.5794e+00]])
tensor([1.5468, 1.5794, 1.8021])
tensor([[ True,  True,  True],
        [False, False, False]])
tensor([0.0010, 0.7618, 0.2470])
tensor([[0, 0],
        [0, 1],
        [0, 2],
        [1, 0],
        [1, 1],
        [1, 2]])
tensor([[1.5468e+00, 7.6176e-01, 2.4699e-01],
        [9.8264e-04, 1.5794e+00, 2.4699e-01]])
tensor([[7.6176e-01, 9.8264e-04, 9.8264e-04],
        [1.5468e+00, 1.5794e+00, 1.5468e+00]])


In [89]:
# change the shape
# result of view shares data with the original vatiable, but not memory, i.e. id(x)!=id(y)
# to return a new one, use reshape(), or x.clone().view()

x = torch.tensor([[1,2,3],[4,5,6]])
y = x.view(6)
z = x.clone().view(-1, 2)  
print(x.size(), y.size(), z.size())
x += 1
print(x)
print(y)
print(z)

y = x.reshape(6)
z = x.reshape(-1, 2)  
print(x.size(), y.size(), z.size())


torch.Size([2, 3]) torch.Size([6]) torch.Size([3, 2])
tensor([[2, 3, 4],
        [5, 6, 7]])
tensor([2, 3, 4, 5, 6, 7])
tensor([[1, 2],
        [3, 4],
        [5, 6]])
torch.Size([2, 3]) torch.Size([6]) torch.Size([3, 2])


### Linear Algeba

In [114]:
# linear algeba

x = torch.tensor([[1,2,3],[4,5,6],[7,8,9]])
y = torch.tensor([[1,2,3],[4,5,6],[7,8,9]])

print(x.trace())
print(x.diag())
print(torch.mm(x,y))
print(x.mm(y))

#print(x.dot(y)) # only support 1 dim
print(x.t())

print(torch.rand(4, 4).inverse()) # 求逆

print('SVD',torch.rand(4, 4).svd())

print(torch.Tensor(4, 4).inverse()) # 求逆

print('SVD',torch.Tensor(4, 4).svd())

tensor(15)
tensor([1, 5, 9])
tensor([[ 30,  36,  42],
        [ 66,  81,  96],
        [102, 126, 150]])
tensor([[ 30,  36,  42],
        [ 66,  81,  96],
        [102, 126, 150]])
tensor([[1, 4, 7],
        [2, 5, 8],
        [3, 6, 9]])
tensor([[ 0.5884,  1.5458,  1.3129, -2.4955],
        [-0.9899, -0.0333, -2.0643,  2.9506],
        [ 1.6425, -1.0018, -0.3710, -0.3777],
        [ 0.0298, -0.0513,  1.5906, -0.6297]])
SVD torch.return_types.svd(
U=tensor([[-0.5474, -0.3724, -0.7158,  0.2219],
        [-0.3164, -0.7250,  0.5186, -0.3244],
        [-0.6964,  0.4364,  0.4236,  0.3810],
        [-0.3396,  0.3810, -0.1980, -0.8369]]),
S=tensor([2.0563, 0.7839, 0.3961, 0.0579]),
V=tensor([[-0.5323,  0.4891, -0.1373, -0.6771],
        [-0.4830, -0.8271,  0.1465, -0.2474],
        [-0.3921,  0.2710,  0.8109,  0.3396],
        [-0.5741,  0.0572, -0.5497,  0.6041]]))
tensor([[-0.5474, -0.3724, -0.7158,  0.2219],
        [-0.3164, -0.7250,  0.5186, -0.3244],
        [-0.6964,  0.4364,  0.4236, 

### Broadcast Mechanism

- 若tensor运算的两个变量形状不同，先适当复制元素使这两个Tensor形状相同后再按元素运算
- 由于x和y分别是1行2列和3行1列的矩阵，如果要计算x + y，那么x中第一行的2个元素被广播（复制）到了第二行和第三行，而y中第一列的3个元素被广播（复制）到了第二列。如此，就可以对2个3行2列的矩阵按元素相加。

In [147]:
# Broadcast mechanism

# 
x = torch.arange(1, 3).view(1, 2)
print(x)
y = torch.arange(1, 4).view(3, 1)
print(y)
print(x + y)



tensor([[1, 2]])
tensor([[1],
        [2],
        [3]])
tensor([[2, 3],
        [3, 4],
        [4, 5]])


### Memory Overhead

In [116]:
# Memory overhead
# operations will create new memory while index doesn't ()

x = torch.tensor([1, 2])
y = torch.tensor([3, 4])
id_before = id(y)
y = y + x
print(id(y) == id_before) # False means the id of y is different, indicating that a new memory is used.

# solution: write to the original momory of y using index
x = torch.tensor([1, 2])
y = torch.tensor([3, 4])
id_before = id(y)
y[:] = y + x
print(id(y) == id_before) # True

# other solutions:
# torch.add(x, y, out=y)
# y += x
# y.add_(x) # add_ equals +=


False
True


In [121]:
# Exchange with numpy on CPU

# tensor-numpy
a = torch.ones(5)
b = a.numpy()
print(a, b)

a += 1
print(a, b)
b += 1
print(a, b)


# numpy-tensor 
import numpy as np
a = np.ones(5)
b = torch.from_numpy(a)
print(a, b)

a += 1
print(a, b)
b += 1
print(a, b)

c = torch.tensor(a)
print(a, c)


# on GPU
print(torch.cuda.is_available())

if torch.cuda.is_available():
    device = torch.device("cuda")          # GPU
    y = torch.ones_like(x, device=device)  # Tensor on GPU
    x = x.to(device)                       # == .to("cuda")
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))       # to(), change the data type


tensor([1., 1., 1., 1., 1.]) [1. 1. 1. 1. 1.]
tensor([2., 2., 2., 2., 2.]) [2. 2. 2. 2. 2.]
tensor([3., 3., 3., 3., 3.]) [3. 3. 3. 3. 3.]
[1. 1. 1. 1. 1.] tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
[2. 2. 2. 2. 2.] tensor([2., 2., 2., 2., 2.], dtype=torch.float64)
[3. 3. 3. 3. 3.] tensor([3., 3., 3., 3., 3.], dtype=torch.float64)
[3. 3. 3. 3. 3.] tensor([3., 3., 3., 3., 3.], dtype=torch.float64)
False


## Autograd
- torch.autograd It is a set of automatic derivation engine specially developed for the convenience of users. It can automatically build the calculation diagram according to the input and forward propagation process, and execute the back propagation.

### Gradient

In [129]:
# AutoGrad

x = torch.ones(3,3,requires_grad=True)
print(x)
print(x.grad_fn)

y = x + 2
print(y)
print(y.grad_fn)

print(x.is_leaf, y.is_leaf)

z = pow(y,2)*3
print(z.mean(),'\n',z)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], requires_grad=True)
None
tensor([[3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.]], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x7fc0d91ed590>
True False
tensor(27., grad_fn=<MeanBackward0>) 
 tensor([[27., 27., 27.],
        [27., 27., 27.],
        [27., 27., 27.]], grad_fn=<MulBackward0>)


### Backward

In [143]:
# gradient & backward
# backward() auto computes the gradient for all 'requires_grad=True' variables.
# .grad to show the gradient of a certain variable in the computation map.

x = torch.ones(2,2,requires_grad=True)
y = x + 2
z = pow(y,2)*3
out = z.mean()  # mean(): 1/n*\sum(x_i)
print(out)

# back propagation
out.backward()
print(x.grad)

# back propagation again, grad will be accumulated 
out2 = x.sum()  # sum():\sum(x_i)
out2.backward()
print(x.grad)

# set grad to zero to avoid gradient accumulation
out3 = x.sum()
#out3.backward()
x.grad.data.zero_()
out3.backward()  # this step has to be after setting gradient to zero
print(x.grad)



tensor(27., grad_fn=<MeanBackward0>)
tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])
tensor([[5.5000, 5.5000],
        [5.5000, 5.5000]])
tensor([[1., 1.],
        [1., 1.]])


- 标量是0阶张量(一个数)，是1*1的；
- 向量是一阶张量，是1*n的；
- 张量可以给出所有坐标间的关系，是n*n的
  
  
- 避免向量（甚至更高维张量）对张量求导，而转换成标量对张量求导
- z是个张量，所以在调用backward时需要传入一个和z同形的权重向量进行加权求和得到一个标量

In [146]:
# can't compute the gradient of a tensor on a tensor
# need to change the tensor to scalar, and compute the gradient of a tensor on a scalar

x = torch.tensor([1.0, 2.0, 3.0, 4.0], requires_grad=True)
y = 2 * x
z = y.view(2, 2)
print(z)



v = torch.tensor([[1.0, 0.1], [0.01, 0.001]], dtype=torch.float)
z.backward(v)
print(x.grad)

tensor([[2., 4.],
        [6., 8.]], grad_fn=<ViewBackward>)
tensor([2.0000, 0.2000, 0.0200, 0.0020])


In [135]:
# add requires_grad

a = torch.randn(2,2)
print(a.requires_grad)
a.requires_grad_(True)
print(a.requires_grad)


# if doesn't require_grad, it will not be backward()

x = torch.tensor(1.0, requires_grad=True)
y1 = x ** 2 
with torch.no_grad():
    y2 = x ** 3
y3 = y1 + y2

print(x.requires_grad)
print(y1, y1.requires_grad) # True
print(y2, y2.requires_grad) # False
print(y3, y3.requires_grad) # True
y3.backward()
print(x.grad) # suppose 5 but 2 because y2 doesn't backward
# y2.backward() 


False
True
True
tensor(1., grad_fn=<PowBackward0>) True
tensor(1.) False
tensor(2., grad_fn=<AddBackward0>) True
tensor(2.)


- x.data独立于计算图之外
- 更改data的值会影响tensor的print，但不会影响grad

In [136]:
# change data without influence the backward process

x = torch.ones(1,requires_grad=True)

print(x.data) # still a tensor
print(x.data.requires_grad) 

y = 2 * x
x.data *= 100 # only changes the value, no influence on the computation map and gradient propagation

y.backward()
print(x) 
print(x.grad)


tensor([1.])
False
tensor([100.], requires_grad=True)
tensor([2.])
