# PyTorch中进行卷积残差模块算子融合

来自b站up主deep_thoughts 合集【PyTorch源码教程与前沿人工智能算法复现讲解】

P_16_PyTorch中进行卷积残差模块算子融合：
    
https://www.bilibili.com/video/BV1sU4y1u7TM/?spm_id_from=333.788&vd_source=18e91d849da09d846f771c89a366ed40

Torch.nn 官方文档：https://pytorch.org/docs/stable/nn.html

***论文***

R-Drop: Regularized Dropout for Neural Networks：

https://arxiv.org/pdf/2106.14448.pdf

## R-Drop（Regularized Dropout）

In [None]:
import numpy as np
import torch

def train_r_dropout(rate, x, w1, b1, w2, b2):
    """Sketch of one R-Drop training step for a two-layer ReLU network.

    The input batch is duplicated along axis 0 so that the same samples go
    through the network twice with independently sampled dropout masks. The
    loss is the sum of the two negative log-likelihood terms and a KL term
    between the two sets of logits.

    NOTE(review): this is tutorial pseudo-code. ``func``, ``nll``, ``kl`` and
    ``label`` are free names that this function does not define; they must
    exist in the enclosing scope for the call to succeed.

    NOTE(review): ``np.dot(w1, x)`` contracts over axis 0 of ``x``, while the
    duplication and the final split treat axis 0 as the batch axis -- confirm
    the intended data layout before relying on this code.

    Args:
        rate: Dropout probability; each unit is kept with probability
            ``1 - rate``. The kept activations are not rescaled.
        x: Input batch; must expose ``.shape`` and be convertible to a NumPy
            array.
        w1: Weight of the first layer.
        b1: Bias of the first layer.
        w2: Weight of the second layer.
        b2: Bias of the second layer.

    Returns:
        ``nll1 + nll2 + kl_loss`` as computed by the externally supplied
        ``nll`` and ``kl`` callables.
    """
    # Remember the original batch size so the doubled batch can be split
    # back into its two halves after the forward pass.
    batch_size = x.shape[0]

    # Everything below is NumPy, so duplicate the batch with NumPy as well
    # (torch.cat rejects ndarray inputs).
    x = np.concatenate([x, x], axis=0)

    layer1 = np.maximum(0, np.dot(w1, x) + b1)
    mask1 = np.random.binomial(1, 1 - rate, layer1.shape)
    layer1 = layer1 * mask1

    layer2 = np.maximum(0, np.dot(w2, layer1) + b2)
    mask2 = np.random.binomial(1, 1 - rate, layer2.shape)
    layer2 = layer2 * mask2

    logits = func(layer2)
    # First half / second half correspond to the two dropout passes.
    logits1, logits2 = logits[:batch_size, :], logits[batch_size:, :]
    nll1 = nll(logits1, label)
    nll2 = nll(logits2, label)
    kl_loss = kl(logits1, logits2)
    loss = nll1 + nll2 + kl_loss

    return loss

## Torch.nn.Conv2d
官方文档：https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html#torch.nn.Conv2d

In [1]:
import torch

# A 3x3 convolution mapping 2 input channels to 2 output channels;
# padding="same" keeps the spatial size of the input.
conv_layer = torch.nn.Conv2d(
    in_channels=2, out_channels=2, kernel_size=3, padding="same"
)

# named_parameters() yields (name, parameter) tuples; print each tuple as-is.
for named_param in conv_layer.named_parameters():
    print(named_param)

('weight', Parameter containing:
tensor([[[[-0.0102,  0.1698,  0.2327],
          [ 0.0041, -0.0713, -0.0640],
          [ 0.1578,  0.2283,  0.2095]],

         [[-0.0545,  0.0185, -0.1818],
          [ 0.0845, -0.1467, -0.1645],
          [-0.1213, -0.2126,  0.0537]]],


        [[[ 0.1197,  0.2086, -0.0750],
          [ 0.0555,  0.0258,  0.1428],
          [ 0.0094,  0.1274, -0.1129]],

         [[-0.2097, -0.0310,  0.1732],
          [-0.0508, -0.0166,  0.1956],
          [-0.0259, -0.2299,  0.2276]]]], requires_grad=True))
('bias', Parameter containing:
tensor([ 0.0490, -0.0998], requires_grad=True))


In [3]:
# Show the kernel values first, then the shapes of the kernel and the bias.
kernel, offset = conv_layer.weight, conv_layer.bias
print(kernel)
print(kernel.shape)
print(offset.shape)

Parameter containing:
tensor([[[[-0.0102,  0.1698,  0.2327],
          [ 0.0041, -0.0713, -0.0640],
          [ 0.1578,  0.2283,  0.2095]],

         [[-0.0545,  0.0185, -0.1818],
          [ 0.0845, -0.1467, -0.1645],
          [-0.1213, -0.2126,  0.0537]]],


        [[[ 0.1197,  0.2086, -0.0750],
          [ 0.0555,  0.0258,  0.1428],
          [ 0.0094,  0.1274, -0.1129]],

         [[-0.2097, -0.0310,  0.1732],
          [-0.0508, -0.0166,  0.1956],
          [-0.0259, -0.2299,  0.2276]]]], requires_grad=True)
torch.Size([2, 2, 3, 3])
torch.Size([2])


### point-wise convolution

* 1 X 1 卷积
* channel mix
* 相当于对每个像素位置的通道向量做一次全连接（MLP）

### depth-wise convolution

* groups 设置为大于 1 的数即为分组卷积；当 groups 等于 in_channels 时即为 depth-wise 卷积

In [4]:
# groups=2 splits the 2 input channels into 2 groups of 1 channel each, so
# every output filter only sees a single input channel.
conv_layer = torch.nn.Conv2d(
    in_channels=2, out_channels=4, kernel_size=3, padding="same", groups=2
)
for param in (conv_layer.weight, conv_layer.bias):
    print(param.size())

torch.Size([4, 1, 3, 3])
torch.Size([4])


In [6]:
# Two independent 1-in / 2-out 3x3 convolutions; printing their parameter
# shapes allows a side-by-side comparison with the grouped layer above.
conv_layer1, conv_layer2 = (
    torch.nn.Conv2d(1, 2, 3, padding="same") for _ in range(2)
)
for layer in (conv_layer1, conv_layer2):
    print(layer.weight.size(), layer.bias.size())

torch.Size([2, 1, 3, 3]) torch.Size([2])
torch.Size([2, 1, 3, 3]) torch.Size([2])


## 卷积残差模块算子融合

res_block = 3 X 3 conv + 1 X 1 conv + input

![](./img/P16_1.png)

In [8]:
import torch
import torch.nn.functional as F
import torch.nn as nn

in_channels = 2
out_channels = 2
kernel_size = 3
w = 9
h = 9

# Input image. Conv2d expects (N, C, H, W), so height comes before width.
x = torch.ones(1, in_channels, h, w)

# Method 1: plain residual block, evaluated as three separate branches
# (3x3 convolution + 1x1 point-wise convolution + identity shortcut).

conv_2d = nn.Conv2d(in_channels, out_channels, kernel_size, padding="same")
conv_2d_pointwise = nn.Conv2d(in_channels, out_channels, 1)
result1 = conv_2d(x) + conv_2d_pointwise(x) + x

In [12]:
# Method 2: operator fusion.
# Rewrite both the point-wise convolution and the identity shortcut as
# kernel_size x kernel_size convolutions, so that the three branches can
# finally be merged into a single convolution.
# 1) Re-express each branch as a full-size convolution.

# Zeros needed on each side to grow a 1x1 kernel to kernel_size x kernel_size.
# Assumes an odd kernel_size, so the single tap lands on the kernel centre.
pad = kernel_size // 2

# Point-wise branch: (out, in, 1, 1) -> (out, in, k, k) by zero-padding the
# two spatial dimensions only; the channel dimensions are left untouched.
pointwise_to_conv_weight = F.pad(conv_2d_pointwise.weight, [pad, pad, pad, pad, 0, 0, 0, 0])
conv_2d_for_pointwise = nn.Conv2d(in_channels, out_channels, kernel_size, padding="same")
conv_2d_for_pointwise.weight = nn.Parameter(pointwise_to_conv_weight)
conv_2d_for_pointwise.bias = conv_2d_pointwise.bias

# Identity branch: the shortcut is a 1x1 convolution whose channel-mixing
# matrix is the identity (output channel i copies input channel i). Padding it
# the same way gives a (out, in, k, k) kernel with a single 1 at the centre of
# each matching channel pair, for any number of channels.
identity_pointwise = torch.eye(out_channels, in_channels).reshape(out_channels, in_channels, 1, 1)
identity_to_conv_weight = F.pad(identity_pointwise, [pad, pad, pad, pad])
identity_to_conv_bias = torch.zeros([out_channels])
conv_2d_for_identity = nn.Conv2d(in_channels, out_channels, kernel_size, padding="same")
conv_2d_for_identity.weight = nn.Parameter(identity_to_conv_weight)
conv_2d_for_identity.bias = nn.Parameter(identity_to_conv_bias)

result2 = conv_2d(x) + conv_2d_for_pointwise(x) + conv_2d_for_identity(x)

# The re-expressed branches must reproduce the plain residual block.
print(torch.all(torch.isclose(result1, result2)))

tensor(True)


In [13]:
# 2) Fuse: convolution is linear in its kernel and bias, so three same-shape
# convolutions applied to the same input collapse into one whose weight and
# bias are the element-wise sums of the three.

conv_2d_for_fusion = nn.Conv2d(in_channels, out_channels, kernel_size, padding="same")

# Sum the parameters outside of autograd instead of going through the legacy
# `.data` attribute; the fused layer gets fresh leaf parameters below.
with torch.no_grad():
    fused_weight = conv_2d.weight + conv_2d_for_pointwise.weight + conv_2d_for_identity.weight
    fused_bias = conv_2d.bias + conv_2d_for_pointwise.bias + conv_2d_for_identity.bias
conv_2d_for_fusion.weight = nn.Parameter(fused_weight)
conv_2d_for_fusion.bias = nn.Parameter(fused_bias)
result3 = conv_2d_for_fusion(x)

# The single fused convolution must match the three-branch result.
print(torch.all(torch.isclose(result2, result3)))

tensor(True)


## 时间比较

In [21]:
import time

# Time 1000 forward passes of each variant.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals. no_grad() keeps autograd bookkeeping out of the measurement, so
# only the forward computation is compared.
with torch.no_grad():
    # Plain version: three operators per forward pass.
    t1 = time.perf_counter()
    for _ in range(1000):
        result1 = conv_2d(x) + conv_2d_pointwise(x) + x
    t2 = time.perf_counter()

    # Fused version: a single convolution per forward pass.
    for _ in range(1000):
        result3 = conv_2d_for_fusion(x)
    t3 = time.perf_counter()

print(torch.all(torch.isclose(result1, result3)))
print("原生写法耗时：", t2-t1)
print("算子融合写法耗时：", t3-t2)

tensor(True)
原生写法耗时： 0.08690285682678223
算子融合写法耗时： 0.06800007820129395
