# Tensor和Numpy
Tensor和Numpy数组非常相似，CPU上的Tensor可以通过torch.from_numpy()和Tensor.numpy()与Numpy数组相互转换并共享内存；Tensor不支持的操作可以先转为Numpy数组处理

In [67]:
import torch
import numpy as np

In [68]:
a = np.ones([2,3])
a

array([[1., 1., 1.],
       [1., 1., 1.]])

In [69]:
b = torch.from_numpy(a)
b

tensor([[1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)

In [70]:
a[0,1]=100
b #共享内存

tensor([[  1., 100.,   1.],
        [  1.,   1.,   1.]], dtype=torch.float64)

In [71]:
c = b.numpy()
c

array([[  1., 100.,   1.],
       [  1.,   1.,   1.]])

## Numpy广播法则
1. 让所有输入数组向shape最长的看齐，shape不足部分在前面加1补齐
2. 两个数组要么在某一维长度一致，要么其中一个为1，否则不能计算
3. 当某维长度为1时，计算时沿此维度扩充成一样的形状

PyTorch支持自动广播法则，但仍建议手动实现：
1. unsqueeze和view实现法则1
2. expand和expand_as实现法则3

In [72]:
a = torch.ones(3, 2)
b = torch.zeros(2, 3, 1)

In [73]:
#自动广播
a+b

tensor([[[1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.]]])

In [74]:
#手动广播
a.unsqueeze(0).expand(2,3,2)+b.expand(2,3,2)

tensor([[[1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.]]])

# 内部结构
1. Tensor分为头信息区和存储区
2. 信息区保存shape, stride, type等
3. 存储区保存real data

In [75]:
a = torch.arange(0, 6.0)
a.storage()

 0.0
 1.0
 2.0
 3.0
 4.0
 5.0
[torch.FloatStorage of size 6]

In [76]:
b = a.view(2,3)
b.storage()

 0.0
 1.0
 2.0
 3.0
 4.0
 5.0
[torch.FloatStorage of size 6]

使用id可以查看Python对象的标识（CPython中即对象的内存地址）；注意这是Python对象本身的地址，而不是Tensor底层数据的地址，后者需要用data_ptr()查看

In [77]:
id(a) == id(b)

False

In [78]:
# id() on `a.storage` only identifies a temporary bound-method object, so compare
# the base address of the underlying storage buffers instead.
a.storage().data_ptr() == b.storage().data_ptr()

True

In [79]:
a[1] = 100 #共享存储地址
b

tensor([[  0., 100.,   2.],
        [  3.,   4.,   5.]])

In [80]:
c = a[2:] #仍然共享存储
c.storage()

 0.0
 100.0
 2.0
 3.0
 4.0
 5.0
[torch.FloatStorage of size 6]

In [81]:
c.data_ptr(), a.data_ptr() # address of each tensor's first element; elements are 4-byte float32 and c starts 2 elements into a, so the addresses differ by 2*4=8

(94917364682696, 94917364682688)

In [82]:
c[0] = -100 #c[0]对应着a[2]
a

tensor([   0.,  100., -100.,    3.,    4.,    5.])

In [83]:
c.storage()[0] = -200
a

tensor([-200.,  100., -100.,    3.,    4.,    5.])

In [84]:
d = torch.Tensor(c.storage())
d[-1] = 666
a

tensor([-200.,  100., -100.,    3.,    4.,  666.])

In [85]:
# All four tensors are backed by one buffer: compare the storage base addresses
# (id() of the temporary storage wrapper objects is not a reliable identity check).
a.storage().data_ptr() == b.storage().data_ptr() == c.storage().data_ptr() == d.storage().data_ptr()

True

In [86]:
a.storage_offset(), b.storage_offset(), c.storage_offset() #偏移量

(0, 0, 2)

In [87]:
e = b[::2,::2] #隔两行两列取一个元素
e

tensor([[-200., -100.]])

In [88]:
# The strided slice e should still view a's buffer: compare the storage base addresses
# rather than comparing a with itself.
e.storage().data_ptr() == a.storage().data_ptr()

True

In [89]:
b.stride() # one step along dim 0 jumps 3 elements in storage (6 elements form 2 rows); one step along dim 1 jumps 1 element (3 per row)

(3, 1)

In [90]:
c.stride()

(1,)

In [91]:
e.is_contiguous()

False

In [92]:
e.contiguous().is_contiguous()

True

可见绝大多数操作只修改头部信息而不修改存储区，这种方法节省内存；但会导致Tensor不连续。对不连续的Tensor调用tensor.contiguous()会把数据复制到新的存储区，返回的Tensor不再与原Tensor共享内存（若Tensor本身已连续，则直接返回原Tensor）

## 双冒号 `::` 的用法

In [93]:
a = torch.arange(0, 25.0).view(5, 5)
a

tensor([[ 0.,  1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.,  9.],
        [10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19.],
        [20., 21., 22., 23., 24.]])

In [94]:
a[::2, ::2]

tensor([[ 0.,  2.,  4.],
        [10., 12., 14.],
        [20., 22., 24.]])

In [95]:
a[::,::]

tensor([[ 0.,  1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.,  9.],
        [10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19.],
        [20., 21., 22., 23., 24.]])

In [96]:
a[::3,::3]

tensor([[ 0.,  3.],
        [15., 18.]])

In [97]:
a[1::2, ::2] #index::stride

tensor([[ 5.,  7.,  9.],
        [15., 17., 19.]])

# 其他有关话题

## cuda

In [98]:
torch.cuda.is_available()

True

In [99]:
torch.cuda.get_device_name()

'GeForce MX150'

In [100]:
torch.cuda.device_count()

1

In [101]:
torch.cuda.current_device()

0

## 持久化

In [102]:
a

tensor([[ 0.,  1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.,  9.],
        [10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19.],
        [20., 21., 22., 23., 24.]])

In [103]:
# Move the tensor to the GPU when one is present; otherwise keep it on the CPU so
# the save/load round trip (and the cells that display b and c) still works.
if torch.cuda.is_available():
    a = a.cuda(0)
torch.save(a, "a.pth")

b = torch.load("a.pth") # restored onto the device the tensor was saved from
c = torch.load("a.pth", map_location="cpu") # always loaded onto the CPU

In [104]:
c

tensor([[ 0.,  1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.,  9.],
        [10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19.],
        [20., 21., 22., 23., 24.]])

In [105]:
b

tensor([[ 0.,  1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.,  9.],
        [10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19.],
        [20., 21., 22., 23., 24.]], device='cuda:0')

## 向量化
可以提升运算效率。python中的for循环等操作很低效，因此应该尽量使用向量化

In [106]:
def for_loop_add(x, y):
    """Add two sequences element by element with a Python-level loop.

    Deliberately unvectorised: it serves as the slow baseline that is timed
    against the built-in ``x + y``.

    Args:
        x: First iterable of numbers (or a 1-D tensor).
        y: Second iterable, paired with ``x`` via ``zip``.

    Returns:
        A ``torch.Tensor`` holding the pairwise sums.
    """
    pairwise_sums = [left + right for left, right in zip(x, y)]
    return torch.Tensor(pairwise_sums)

In [107]:
x = torch.zeros(100)
y = torch.ones(100)

### 测试程序运行时间

In [108]:
%timeit -n 10 for_loop_add(x, y)

715 µs ± 19.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [109]:
%timeit -n 10 x+y

The slowest run took 18.60 times longer than the fastest. This could mean that an intermediate result is being cached.
9.15 µs ± 16 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### 线程数

In [110]:
torch.get_num_threads()

8

In [111]:
torch.set_num_threads(8)

In [112]:
torch.get_num_threads()

8

### 设置精度
torch.set_printoptions

In [113]:
a = torch.arange(0, 200000000.0) # precision loss rather than overflow: float32 has a 24-bit significand, so integers this large are not all exactly representable
a[-1]

tensor(1.7500000000e+08)

In [114]:
b = torch.arange(0, 200000000) #64bit
b[-1]

tensor(199999999)

In [115]:
a = torch.randn(2,3)
a

tensor([[-0.0723569915,  1.2988783121,  1.5531506538],
        [ 0.9965285063, -0.7701903582,  0.2025056332]])

In [116]:
torch.set_printoptions(10)

In [117]:
a

tensor([[-0.0723569915,  1.2988783121,  1.5531506538],
        [ 0.9965285063, -0.7701903582,  0.2025056332]])

## 获取中间节点的梯度

In [118]:
x = torch.ones((3,3), requires_grad = True)
w = torch.rand((3,3), requires_grad = True)
y = w*x
z = y.sum()

In [119]:
z.backward(retain_graph=True)
x.grad, w.grad, y.grad #非叶子节点计算后被清空

(tensor([[0.4673362970, 0.3898677826, 0.8805522323],
         [0.4095358253, 0.0447326899, 0.0937862992],
         [0.7721707225, 0.5200177431, 0.4711250663]]),
 tensor([[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]]),
 None)

In [120]:
torch.autograd.grad(z, y) #计算dz/dy

(tensor([[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]]),)

x的梯度是目标函数对x的梯度，形状与x一致。

y.backward(gradient)中的gradient（即torch.autograd.backward的grad_tensors参数，旧版本中称为grad_variables）相当于链式法则中的dz/dy，形状与y相同。z.backward()等价于y.backward(grad_y)，其中grad_y=dz/dy。

z.backward()可以省略gradient参数，是因为z是一个标量，dz/dz=1

In [121]:
y.backward(torch.ones(y.shape)) # relies on the earlier z.backward(retain_graph=True): without it the graph's saved buffers are already freed and this second backward pass raises an error

In [122]:
x.grad, w.grad, y.grad

(tensor([[0.9346725941, 0.7797355652, 1.7611044645],
         [0.8190716505, 0.0894653797, 0.1875725985],
         [1.5443414450, 1.0400354862, 0.9422501326]]),
 tensor([[2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.]]),
 None)

# 自定义求导

In [123]:
from torch.autograd import Function

In [124]:
class MultiplyAdd(Function):
    """Custom autograd function computing ``w * x + b`` with a hand-written backward."""

    @staticmethod
    def forward(ctx, w, x, b):
        """Compute ``w * x + b``.

        ``ctx`` is mandatory and is supplied by autograd; the remaining
        arguments are the inputs passed to ``apply``.
        """
        print("type in forward", type(x))
        # Only w and x are needed later to form the gradients.
        ctx.save_for_backward(w, x)
        return w * x + b

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradients w.r.t. ``w``, ``x`` and ``b``, in that order."""
        weight, inp = ctx.saved_tensors
        print("type in backward", type(inp))
        # d(w*x+b)/dw = x, d/dx = w, d/db = 1, each scaled by the incoming gradient.
        return grad_output * inp, grad_output * weight, grad_output * 1

In [125]:
x = torch.ones(1)
w = torch.rand(1, requires_grad = True)
b = torch.rand(1, requires_grad = True)
print("开始前向传播")
z = MultiplyAdd.apply(w, x, b)
print("开始反向传播")
z.backward()

x.grad, w.grad, b.grad

开始前向传播
type in forward <class 'torch.Tensor'>
开始反向传播
type in backward <class 'torch.Tensor'>


(None, tensor([1.]), tensor([1.]))

In [126]:
z = MultiplyAdd.apply(w, x, b)
z.grad_fn.apply(torch.ones(1)) #输出grad_w, grad_x, grad_b

type in forward <class 'torch.Tensor'>
type in backward <class 'torch.Tensor'>


(tensor([1.]), tensor([0.4461910129], grad_fn=<MulBackward0>), tensor([1.]))

# 求高阶导数

In [127]:
x = torch.tensor([5.0], requires_grad = True)
y = x ** 2
grad_x = torch.autograd.grad(y, x, create_graph=True)
grad_x

(tensor([10.], grad_fn=<MulBackward0>),)

In [128]:
grad_grad_x = torch.autograd.grad(grad_x, x)
grad_grad_x

(tensor([2.]),)

# 自定义Sigmoid

In [129]:
class Sigmoid(Function):
    """Logistic sigmoid implemented as a custom autograd ``Function``."""

    @staticmethod
    def forward(ctx, x):
        """Return ``1 / (1 + exp(-x))`` and save it for the backward pass."""
        sig = 1/(1 + (-x).exp())
        ctx.save_for_backward(sig)
        return sig

    @staticmethod
    def backward(ctx, grad_output):
        """Apply the chain rule using d(sigmoid)/dx = sigmoid * (1 - sigmoid)."""
        # saved_tensors is a tuple, so index it to get the single saved tensor.
        sig = ctx.saved_tensors[0]
        return sig * (1- sig) * grad_output

In [130]:
# gradcheck compares analytical gradients with finite-difference estimates; it is
# designed for double-precision inputs, so build the test input as float64.
test_input = torch.randn((3,4), dtype=torch.double, requires_grad = True)
torch.autograd.gradcheck(Sigmoid.apply, test_input, eps=1e-3) # returns True when the gradients agree, raises an error otherwise

True

In [131]:
def f_sigmoid(x):
    """Run one forward/backward pass through the custom ``Sigmoid`` function.

    Args:
        x: Tensor with ``requires_grad=True``; its ``.grad`` is accumulated in place.
    """
    y = Sigmoid.apply(x)
    # Seed the backward pass with ones created like x so dtype and device match the input.
    y.backward(torch.ones_like(x))

def f_naive(x):
    """Run one forward/backward pass of a sigmoid written with elementary tensor ops.

    Args:
        x: Tensor with ``requires_grad=True``; its ``.grad`` is accumulated in place.
    """
    y = 1/(1+torch.exp(-x))
    # Seed the backward pass with ones created like x so dtype and device match the input.
    y.backward(torch.ones_like(x))
    
def f_th(x):
    """Run one forward/backward pass through the built-in, optimised ``torch.sigmoid``.

    Args:
        x: Tensor with ``requires_grad=True``; its ``.grad`` is accumulated in place.
    """
    y = torch.sigmoid(x)
    # Seed the backward pass with ones created like x so dtype and device match the input.
    y.backward(torch.ones_like(x))
    
x = torch.randn((100,100), requires_grad = True)

In [132]:
%timeit -n 100 f_sigmoid(x)
%timeit -n 100 f_naive(x)
%timeit -n 100 f_th(x)

The slowest run took 4.05 times longer than the fastest. This could mean that an intermediate result is being cached.
299 µs ± 184 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
The slowest run took 13.58 times longer than the fastest. This could mean that an intermediate result is being cached.
522 µs ± 746 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
62.4 µs ± 11.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
