# 加载

In [2]:
%load_ext autoreload
%autoreload 2
# Optional Google Colab setup: mount Google Drive and append the project
# folder to sys.path. Outside Colab the `google.colab` import fails and the
# whole step is skipped.
import os
import sys

try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: nothing to mount.
    pass
else:
    try:
        drive.mount('/content/drive')

        GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'ConvNetFromScratch'
        GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
        print(os.listdir(GOOGLE_DRIVE_PATH))
        sys.path.append(GOOGLE_DRIVE_PATH)
    except Exception as exc:
        # Remain best-effort (the notebook keeps running), but report the
        # failure instead of hiding it: a silent failure here only shows up
        # later as a confusing ImportError on the project modules.
        print(f'Google Drive setup failed: {exc!r}')

import time, os, torch, torchvision, random, time, math
from torch import Tensor
import torchvision
import matplotlib.pyplot as plt
from imageio import imread
from PIL import Image
from torchvision.transforms import ToTensor

%matplotlib inline
# Notebook-wide plotting defaults: figure size and font size.
plt.rcParams.update({'figure.figsize': (6, 4), 'font.size': 10})
from toolset.utils import *
from toolset.data import *
from toolset.helper import *
from toolset.solver import *
from convolutional_networks import *
from fully_connected_networks import *
from toolset import *
from typing import Dict, List, Optional
TensorDict = Dict[str, torch.Tensor]
# `torch.cuda.is_available` is a function and has to be called; testing the
# bare function object is always truthy, so the check could never fail.
if torch.cuda.is_available():
    print('Good to go!')
else:
    print('CUDA is not available: cells that move tensors to "cuda" will fail.')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Good to go!


# 卷积测试

In [4]:
# With torch.float64 (double) the relative errors should be around 1e-11 or smaller.
# time.time() is too coarse for very short runs and can lead to a division by
# zero in the speedup ratios, so time.perf_counter() is used instead.
from time import perf_counter as tpc

reset_seed(0)

# Problem size: input is (N, C, H, W), filters are (F, C, HH, WW).
N, C, H, W = 50, 3, 32, 32
F, HH, WW = 25, 3, 3
stride, pad = 2, 1
conv_param = dict(stride=stride, pad=pad)

# Spatial size of the convolution output.
h_out = (H + 2 * pad - HH) // stride + 1
w_out = (W + 2 * pad - WW) // stride + 1

dtype, device = torch.float64, 'cpu'

# The draw order (x, w, b, dout) is kept so the seeded values stay the same.
x = torch.randn((N, C, H, W), dtype=dtype, device=device)
w = torch.randn((F, C, HH, WW), dtype=dtype, device=device)
b = torch.randn((F,), dtype=dtype, device=device)
dout = torch.randn((N, F, h_out, w_out), dtype=dtype, device=device)

# GPU copies of the same tensors for the CUDA runs.
x_cuda, w_cuda, b_cuda, dout_cuda = (t.to('cuda') for t in (x, w, b, dout))

# The GPU needs a warm-up call before timing.
_, _ = Conv.forward(x_cuda, w_cuda, b_cuda, conv_param)
# CUDA work is queued asynchronously: wait for the warm-up to finish so it
# cannot overlap with the timed sections below.
torch.cuda.synchronize()

t0 = tpc()
out_vanilla, cache_vanilla = ConvVanilla.forward(x, w, b, conv_param)
t1 = tpc()
out_fast, cache_fast = Conv.forward(x, w, b, conv_param)
t2 = tpc()
out_fast_cuda, cache_fast_cuda = Conv.forward(x_cuda, w_cuda, b_cuda, conv_param)
# Without this barrier t3 would only capture the time needed to launch the
# kernels, not the time needed to run them.
torch.cuda.synchronize()
t3 = tpc()

print('Testing Conv.forward:')
print(f'Vanilla: {t1 - t0:f}s')
print(f'Fast:{t2 - t1:f}s')
print(f'Fast CUDA:{t3 - t2:f}s')
print(f'Speedup: {(t1 - t0) / (t2 - t1):.1f}x')
print(f'Speedup CUDA: {(t1 - t0) / (t3 - t2):.1f}x')
print('Difference: ', grad.rel_error(out_vanilla, out_fast))
# The CUDA result is moved back to the CPU before it is compared.
print('Difference CUDA: ', grad.rel_error(out_vanilla, out_fast_cuda.to(device)))

# Drain any GPU work that is still queued so it does not leak into the
# timings below (CUDA calls return before the work has finished).
torch.cuda.synchronize()

t0 = tpc()
dx_vanilla, dw_vanilla, db_vanilla = ConvVanilla.backward(dout, cache_vanilla)
t1 = tpc()
dx_fast, dw_fast, db_fast = Conv.backward(dout, cache_fast)
t2 = tpc()
dx_fast_cuda, dw_fast_cuda, db_fast_cuda = Conv.backward(dout_cuda, cache_fast_cuda)
# Wait for the GPU to finish; otherwise only the kernel launch is timed.
torch.cuda.synchronize()
t3 = tpc()

# NOTE(review): the label says "FastConv" although the class under test is
# `Conv`; the string is left unchanged so it matches the saved output.
print('\nTesting FastConv.backward:')
print(f'Vanilla: {t1 - t0:f}s')
print(f'Fast:{t2 - t1:f}s')
print(f'Fast CUDA:{t3 - t2:f}s')
print(f'Speedup: {(t1 - t0) / (t2 - t1):.1f}x')
print(f'Speedup CUDA: {(t1 - t0) / (t3 - t2):.1f}x')

print('dx difference: ', grad.rel_error(dx_vanilla, dx_fast))
print('dw difference: ', grad.rel_error(dw_vanilla, dw_fast))
print('db difference: ', grad.rel_error(db_vanilla, db_fast))

# CUDA gradients are moved to the reference tensor's device before comparing.
print('dx difference CUDA: ', grad.rel_error(dx_vanilla, dx_fast_cuda.to(dx_vanilla.device)))
print('dw difference CUDA: ', grad.rel_error(dw_vanilla, dw_fast_cuda.to(dw_vanilla.device)))
print('db difference CUDA: ', grad.rel_error(db_vanilla, db_fast_cuda.to(db_vanilla.device)))
# Expect a speedup of at least 1000x.

Testing Conv.forward:
Vanilla: 4.527313s
Fast:0.003057s
Fast CUDA:0.000476s
Speedup: 1480.9x
Speedup CUDA: 9501.2x
Difference:  2.668446055776658e-16
Difference CUDA:  2.668446055776658e-16

Testing FastConv.backward:
Vanilla: 9.240463s
Fast:0.006705s
Fast CUDA:0.000537s
Speedup: 1378.1x
Speedup CUDA: 17217.2x
dx difference:  3.411315230434022e-16
dw difference:  2.524463004486185e-15
db difference:  0.0
dx difference CUDA:  3.411315230434022e-16
dw difference CUDA:  2.5645338458272362e-15
db difference CUDA:  1.8505571847128472e-16


# 其他

### 2. unfold()

`torch.Tensor.unfold()`是一个函数，用于创建一个视图，其中沿着指定维度展开的数据被组织成一个新的最后一维。换句话说，这个操作可以用于有效地提取张量的滑动窗口块，用于进一步的操作（如卷积）。

这个函数的语法是这样的：

`tensor.unfold(dimension, size, step)`

- `dimension` 是你想要展开的维度
- `size` 是滑动窗口的大小
- `step` 是滑动窗口移动的步长



In [18]:
# Slide a window of length 3 along dim 0 with step 1 (consecutive windows).
x = torch.arange(1, 8)  # tensor([1, 2, 3, 4, 5, 6, 7])
y = x.unfold(dimension=0, size=3, step=1)
print(y)

tensor([[1, 2, 3],
        [2, 3, 4],
        [3, 4, 5],
        [4, 5, 6],
        [5, 6, 7]])


In [42]:
# Toy 5x6 "image" together with the window size, padding and stride.
x = torch.arange(30).reshape(5, 6)
print(x)
H, W = x.shape
HH, WW = 2, 2
pad, stride = 0, 2

# Number of window positions along each axis.
H_out = (H + 2 * pad - HH) // stride + 1
W_out = (W + 2 * pad - WW) // stride + 1

print(H_out, W_out)

def get_conv_table(x, HH, WW, stride):
    """Extract every (HH, WW) sliding window over the last two dims of ``x``.

    Args:
        x: Tensor with at least two dimensions; the last two are treated as
            the (H, W) axes, any leading dimensions are left untouched.
        HH: Window height.
        WW: Window width.
        stride: Step between consecutive windows along both axes.

    Returns:
        Tensor of shape (..., H_out, W_out, HH, WW) with
        H_out = (H - HH) // stride + 1 and W_out = (W - WW) // stride + 1.

    Raises:
        ValueError: If ``x`` has fewer than two dimensions.
    """
    if x.dim() < 2:
        raise ValueError(f'expected a tensor with at least 2 dims, got {x.dim()}')
    # Address the two axes relative to the end of the *input* shape so the
    # same helper works for (H, W), (C, H, W) and (N, C, H, W) tensors.
    # Each unfold appends its window as a new trailing dim, so the W axis
    # keeps its absolute position after the H axis has been unfolded.
    h_dim = x.dim() - 2
    w_dim = x.dim() - 1
    y = x.unfold(h_dim, HH, stride).unfold(w_dim, WW, stride)
    return y
# Window the toy tensor x with HH x WW windows taken every `stride` steps.
y = get_conv_table(x, HH,WW,stride)
print(y.shape)
print(y)
# Expected output for the 5x6 input with 2x2 windows and stride 2:
# torch.Size([2, 3, 2, 2])
# tensor([[[[ 0,  1],
#           [ 6,  7]],
#          [[ 2,  3],
#           [ 8,  9]],
#          [[ 4,  5],
#           [10, 11]]],
#         [[[12, 13],
#           [18, 19]],
#          [[14, 15],
#           [20, 21]],
#          [[16, 17],
#           [22, 23]]]])

tensor([[ 0,  1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10, 11],
        [12, 13, 14, 15, 16, 17],
        [18, 19, 20, 21, 22, 23],
        [24, 25, 26, 27, 28, 29]])
2 3
torch.Size([2, 3, 2, 2])
tensor([[[[ 0,  1],
          [ 6,  7]],

         [[ 2,  3],
          [ 8,  9]],

         [[ 4,  5],
          [10, 11]]],


        [[[12, 13],
          [18, 19]],

         [[14, 15],
          [20, 21]],

         [[16, 17],
          [22, 23]]]])


### 3. einsum()

In [44]:
# Transpose
x = torch.arange(6).reshape(2, 3)
print(x)
# tensor([[0, 1, 2],
#         [3, 4, 5]])

# 'ij->ji' swaps the two indices, i.e. transposes the matrix.
y = torch.einsum('ij->ji', x)
print(y)
# tensor([[0, 3],
#         [1, 4],
#         [2, 5]])

# Transpose only the last two dims; '...' stands for all leading dims.
a = torch.randn(2, 3, 5, 7, 9)
# i = 7, j = 9
b = torch.einsum('...ij->...ji', a)


tensor([[0, 1, 2],
        [3, 4, 5]])
tensor([[0, 3],
        [1, 4],
        [2, 5]])


In [45]:
# Matrix multiplication
x, y = torch.arange(6).reshape(2, 3), torch.arange(9).reshape(3, 3)

# 'ij,jk->ik': the shared index j is summed over, which is a matrix product.
z = torch.einsum('ij,jk->ik', x, y)
print(z)
# tensor([[15, 18, 21],
#         [42, 54, 66]])


tensor([[15, 18, 21],
        [42, 54, 66]])


In [56]:
# Element-wise product, and element-wise product followed by a sum
x, y = torch.arange(3), torch.arange(3, 6)
print(x, y)

# 'i,i->i': index i is kept in the output, so this is the element-wise product.
z = torch.einsum('i,i->i', x, y)
print(z)
# tensor([ 0,  4, 10])

# 'i,i->': index i is dropped from the output, so the products are summed.
z = torch.einsum('i,i->', x, y)
print(z)
# tensor(14)

tensor([0, 1, 2]) tensor([3, 4, 5])
tensor([ 0,  4, 10])
tensor(14)


### Conv

In [22]:
# Think of x as a two-channel 4x6 image.
x = torch.arange(2 * 24).reshape(2, 4, 6)
print(x)
C, H, W = x.shape
HH, WW = 2, 2
pad, stride = 0, 2

# Number of window positions along each spatial axis.
H_out = (H + 2 * pad - HH) // stride + 1
W_out = (W + 2 * pad - WW) // stride + 1

print(H_out, W_out)

def get_conv_table(x, HH, WW, stride):
    """Extract every (HH, WW) sliding window over the last two dims of ``x``.

    Args:
        x: Tensor with at least two dimensions; the last two are treated as
            the (H, W) axes, any leading dimensions are left untouched.
        HH: Window height.
        WW: Window width.
        stride: Step between consecutive windows along both axes.

    Returns:
        Tensor of shape (..., H_out, W_out, HH, WW) with
        H_out = (H - HH) // stride + 1 and W_out = (W - WW) // stride + 1.

    Raises:
        ValueError: If ``x`` has fewer than two dimensions.
    """
    if x.dim() < 2:
        raise ValueError(f'expected a tensor with at least 2 dims, got {x.dim()}')
    # Address the two axes relative to the end of the *input* shape so the
    # same helper works for (H, W), (C, H, W) and (N, C, H, W) tensors.
    # Each unfold appends its window as a new trailing dim, so the W axis
    # keeps its absolute position after the H axis has been unfolded.
    h_dim = x.dim() - 2
    w_dim = x.dim() - 1
    y = x.unfold(h_dim, HH, stride).unfold(w_dim, WW, stride)
    return y
# Window every channel of x with HH x WW windows taken every `stride` steps.
y = get_conv_table(x, HH,WW,stride)
print(y.shape)  # (C, H_out, W_out, HH, WW)
print(y)

# Each channel of the kernel is the 2x2 identity, so it picks the two
# diagonal entries of a window.
# A two-channel image needs a (single) two-channel kernel.
w = torch.tensor([
    [[1, 0],
     [0, 1]],
    [[1, 0],
     [0, 1]]
    
])
print(w.shape)

tensor([[[ 0,  1,  2,  3,  4,  5],
         [ 6,  7,  8,  9, 10, 11],
         [12, 13, 14, 15, 16, 17],
         [18, 19, 20, 21, 22, 23]],

        [[24, 25, 26, 27, 28, 29],
         [30, 31, 32, 33, 34, 35],
         [36, 37, 38, 39, 40, 41],
         [42, 43, 44, 45, 46, 47]]])
2 3
torch.Size([2, 2, 3, 2, 2])
tensor([[[[[ 0,  1],
           [ 6,  7]],

          [[ 2,  3],
           [ 8,  9]],

          [[ 4,  5],
           [10, 11]]],


         [[[12, 13],
           [18, 19]],

          [[14, 15],
           [20, 21]],

          [[16, 17],
           [22, 23]]]],



        [[[[24, 25],
           [30, 31]],

          [[26, 27],
           [32, 33]],

          [[28, 29],
           [34, 35]]],


         [[[36, 37],
           [42, 43]],

          [[38, 39],
           [44, 45]],

          [[40, 41],
           [46, 47]]]]])
torch.Size([2, 2, 2])



假设输入张量 `x` 的形状为 `(C, H, W)`，即通道数为 `C`，高度为 `H`，宽度为 `W`。卷积核 `w` 的形状为 `(C, HH, WW)`，即通道数也为 `C`，高度为 `HH`，宽度为 `WW`。此处的 `y` 是 `x` 通过 `unfold` 操作后的结果，形状为 `(C, H_out, W_out, HH, WW)`，其中 `H_out` 和 `W_out` 分别是输出的高度和宽度。

下面的 `einsum` 表达式 `'ChwIJ,CIJ->hw'` 可以表示为以下数学公式：

$$
\text{out}_{hw} = \sum_{C} \sum_{I=0}^{HH-1} \sum_{J=0}^{WW-1} y_{ChwIJ} \cdot w_{CIJ}
$$

（对通道 `C` 求和，即把各通道的结果堆叠相加。）

其中，`h` 和 `w` 是输出张量的高度和宽度的索引，`C` 是通道的索引，`I` 和 `J` 是卷积核的高度和宽度的索引。这个表达式的含义是，对于输出张量的每一个位置 `(h, w)`，我们遍历输入的所有通道和卷积核的所有位置，将输入的对应部分和卷积核的元素相乘，然后将所有结果相加，得到输出张量的该位置的值。

这就是使用 `einsum` 进行卷积运算的数学公式表示。这个表达式就是卷积运算的定义：将输入的每一个局部窗口和卷积核进行对应元素的乘积和操作，得到输出的对应位置的值。

In [23]:
# In the subscript string, h and w index the output positions and I, J index
# positions inside the kernel (the subscript letter `w` is unrelated to the
# kernel tensor named `w`).
out = torch.einsum('ChwIJ,CIJ->hw', y, w)
print(out)
# Expected: each entry sums the diagonal of the window over both channels, e.g.
# tensor([[ 0+7+24+31,2+9+26+33,  78], 
#         [110, 118, 126]])

tensor([[ 62,  70,  78],
        [110, 118, 126]])



再加上批量维 `N` 和卷积核个数 `F` 之后，表达式变为 `'NChwIJ,FCIJ->NFhw'`（其中 `x` 指经过 `unfold` 展开后的输入），对应的数学公式如下：
$$
\text{out}_{NFhw} = \sum_{C} \sum_{I=0}^{HH-1} \sum_{J=0}^{WW-1} x_{NChwIJ} \cdot w_{FCIJ}
$$


In [59]:
# N, C, h, w, I, J = 2, 3, 2, 4, 3, 3
# C, H, W = 3, 4, 6
# x = torch.ones(N, C, H, W)
# # print(x)
# col = ConvMy.im2col(x, I, J)  
# print(col.shape) # (N, h*w, C*I*J)
# out = ConvMy.col2im(col, x.shape,I, J)
# print(out.shape)
# print(x, out, sep='\n\n')

In [60]:
# N, C, h, w, I, J = 2, 3, 2, 4, 3, 3
# C, H, W = 3, 4, 6
# F = 2
# x1 = torch.zeros(N, h, w, C, I, J)
# # print(x.shape)
# weight = torch.randint(0,10,[F, C, I, J])
# w_expand1 = weight[:,None,None,:,:,:]  # (F11CIJ)
# print(w_expand1.shape)
# for i in range(w_expand1.shape[0]):
#     x1 += w_expand1[i] #  (N, h, w, C, I, J) + (1, 1, 1, C, I, J)
# # print(x)

# x2 = torch.zeros(N, C, h, w, I, J)
# w_expand2 = weight[:, :, None, None, :, :] # (FC11IJ)
# print(w_expand2.shape)

# for i in range(w_expand2.shape[0]):
#     x2 += w_expand2[i] #  (N, C, h, w, I, J) + (1, C, 1, 1, I, J)
# x1 = x1.permute(0,3,1,2,4,5)
# print(x1 == x2)