# 0.前言
### 这里是第二个卷积层:
#### 正向传播输入是[10 32 14 14]，输出是[10 16 14 14],w是[16 32 3 3],b是[16]
#### 反向传播输入是top_diff=[10 16 14 14]，输出是bottom_diff=[10 32 14 14],w_diff=[16 32 3 3], b_diff = [16]

# 1.定义卷积函数

In [1]:
# -*- coding: utf-8 -*-
# 环境：python 3.6；numpy
import numpy as np

class ConvLayer(object):
    """Naive NumPy 2-D convolution layer operating on (N, C, H, W) arrays.

    The weights have shape (out_channel, in_channel, kernel_size, kernel_size)
    and the bias has shape (out_channel,); both are drawn from a standard
    normal distribution at construction time.

    ``forward`` caches its input so that ``backward`` can compute the
    gradients, apply the parameter update in place and return the gradient
    with respect to the input.

    NOTE(review): ``momentum`` is stored but never used, and after the first
    ``backward`` call ``prev_gradient_w`` is the same array object as ``w``,
    so the ``reg`` term acts on the current weights rather than on a previous
    gradient. This behaviour is kept as-is; confirm whether it is intended.
    """

    def __init__(self, in_channel, out_channel, kernel_size, lr=0.01, stride = 1, pad = 1, momentum=0.9, reg = 0.75, name='Conv'):
        """Create the layer parameters and store the hyper-parameters.

        Args:
            in_channel: number of input channels (C).
            out_channel: number of filters / output channels (F).
            kernel_size: height and width of the square kernel.
            lr: learning rate used by ``backward``.
            stride: step of the sliding window along height and width.
            pad: number of zero pixels added on every side of height and width.
            momentum: stored only; not used by the update in ``backward``.
            reg: factor applied to ``prev_gradient_w`` in the weight update.
            name: label stored as ``layer_name``.
        """
        # Weights first, bias second: keeps the random-number draw order stable.
        self.w = np.random.randn(out_channel, in_channel, kernel_size, kernel_size)
        self.b = np.random.randn(out_channel)
        self.layer_name = name
        self.lr = lr
        self.momentum = momentum
        self.stride = stride
        self.pad = pad
        self.reg = reg

        self.prev_gradient_w = np.zeros_like(self.w)
        self.prev_gradient_b = np.zeros_like(self.b)

    def _output_hw(self, H, W):
        """Return (H_out, W_out): how many window positions fit along each axis."""
        _, _, HH, WW = self.w.shape
        H_out = 1 + (H + 2 * self.pad - HH) // self.stride
        W_out = 1 + (W + 2 * self.pad - WW) // self.stride
        return H_out, W_out

    def _pad_input(self, data):
        """Zero-pad the two spatial axes of ``data`` by ``self.pad`` on each side."""
        pad = self.pad
        return np.pad(data, ((0, 0), (0, 0), (pad, pad), (pad, pad)),
                      mode='constant', constant_values=0)

    def forward(self, in_data):
        """Convolve ``in_data`` with the layer weights and add the bias.

        Args:
            in_data: array of shape (N, C, H, W).

        Returns:
            Tuple ``(out, w, b)``: the output of shape (N, F, H_out, W_out)
            followed by the layer's weight and bias arrays (returned to make
            debugging easier).
        """
        N, C, H, W = in_data.shape
        F, _, HH, WW = self.w.shape
        stride = self.stride
        H_out, W_out = self._output_hw(H, W)
        self.out = np.zeros((N, F, H_out, W_out))

        in_data_pad = self._pad_input(in_data)
        for i in range(H_out):
            for j in range(W_out):
                window = in_data_pad[:, :, i * stride:i * stride + HH, j * stride:j * stride + WW]
                # Contract (C, HH, WW) of the window against every filter at
                # once: (N, C, HH, WW) x (F, C, HH, WW) -> (N, F); then add bias.
                self.out[:, :, i, j] = np.tensordot(window, self.w, axes=([1, 2, 3], [1, 2, 3])) + self.b

        # Cached for the backward pass.
        self.bottom_val = in_data
        return self.out, self.w, self.b

    def backward(self, residual):
        """Back-propagate ``residual`` and update the parameters in place.

        Must be called after ``forward``, which caches the layer input.

        Args:
            residual: gradient w.r.t. the layer output, indexed as
                (N, F, H_out, W_out).

        Returns:
            Gradient w.r.t. the layer input, same shape as the cached input.
        """
        N, C, H, W = self.bottom_val.shape
        F, _, HH, WW = self.w.shape
        stride, pad = self.stride, self.pad
        H_out, W_out = self._output_hw(H, W)

        x_pad = self._pad_input(self.bottom_val)
        dx_pad = np.zeros_like(x_pad)
        dw = np.zeros_like(self.w)
        db = np.sum(residual, axis=(0, 2, 3))

        for i in range(H_out):
            for j in range(W_out):
                rows = slice(i * stride, i * stride + HH)
                cols = slice(j * stride, j * stride + WW)
                top = residual[:, :, i, j]  # (N, F)
                # dw: sum over the batch of window * residual -> (F, C, HH, WW)
                dw += np.tensordot(top, x_pad[:, :, rows, cols], axes=([0], [0]))
                # dx: sum over filters of (unrotated) w * residual -> (N, C, HH, WW),
                # scattered back onto the window this output position read from.
                dx_pad[:, :, rows, cols] += np.tensordot(top, self.w, axes=([1], [0]))

        # Strip the padding with explicit end indices; `pad:-pad` would be an
        # empty slice when pad == 0.
        dx = dx_pad[:, :, pad:pad + H, pad:pad + W].copy()

        self.w -= self.lr * (dw + self.prev_gradient_w * self.reg)
        self.b -= self.lr * db
        # Stores a reference to the weight array itself (see class NOTE).
        self.prev_gradient_w = self.w
        return dx


# 2.前向传播

In [2]:
# Forward pass: feed the saved ReLU-1 activations through a freshly
# initialised conv layer (32 -> 16 channels, 3x3 kernel, lr=1e-5).
in_data = np.load("./data_pic/data_3_relu1[10-32-14-14].npy")
conv2_layer = ConvLayer(32, 16, 3, 1e-5)
out_data, con_w, con_b = conv2_layer.forward(in_data)

# Print the shapes of input, weights, bias and output, in that order.
for tensor in (in_data, con_w, con_b, out_data):
    print(tensor.shape)

(10, 32, 14, 14)
(16, 32, 3, 3)
(16,)
(10, 16, 14, 14)


## 2.1.前向传播逐行推导

In [3]:
# Step-by-step walkthrough of a single forward-convolution output position.
relu1_data = np.load('./data_pic/data_3_relu1[10-32-14-14].npy')
print("卷积层的输入的shape是：\n" + str(relu1_data.shape))

# Zero-pad height and width by one pixel on every side.
pad = 1
in_data_pad = np.pad(
    relu1_data,
    ((0, 0), (0, 0), (pad, pad), (pad, pad)),
    mode='constant',
    constant_values=0,
)
print("\n卷积层的输入边缘填充之后的shape是：\n" + str(in_data_pad.shape))
print("\n卷积层的输入边缘填充之后的第一个batch的第一个channel的前三行是：\n" + str(in_data_pad[0, 0, 0:3, 0:3]))

# Top-left 3x3 window across every sample and channel.
in_data_pad_masked = in_data_pad[:, :, 0:3, 0:3]
print("卷积层输入数据的32个channel的map的3X3的小块：")
print(in_data_pad_masked.shape)
print("\n卷积层输入数据的第一个batch的第一个channel的map的3X3的小块：")
print(in_data_pad_masked[0, 0])

# Random weights first, then random bias (same draw order as the layer).
conv2_w = np.random.randn(16, 32, 3, 3)
print("\n第一个输入数据对应第一个输出数据的卷积核，类似于全连接中的W11：")
print(conv2_w[0, 0])
conv2_b = np.random.randn(16)

# Fill only output position (filter 0, row 0, col 0) for every sample.
conv2_out = np.zeros((10, 16, 14, 14))
first_filter_response = np.sum(in_data_pad_masked * conv2_w[0], axis=(1, 2, 3))
conv2_out[:, 0, 0, 0] = first_filter_response + conv2_b[0]
print("\n卷积层输出数据的第一个batch的第一个channel的map的3X3的小块")
print(conv2_out[0, 0, 0:3, 0:3])

卷积层的输入的shape是：
(10, 32, 14, 14)

卷积层的输入边缘填充之后的shape是：
(10, 32, 16, 16)

卷积层的输入边缘填充之后的第一个batch的第一个channel的前三行是：
[[   0.            0.            0.        ]
 [   0.          228.16981617  231.87908912]
 [   0.          228.16981617  228.16981617]]
卷积层输入数据的32个channel的map的3X3的小块：
(10, 32, 3, 3)

卷积层输入数据的第一个batch的第一个channel的map的3X3的小块：
[[   0.            0.            0.        ]
 [   0.          228.16981617  231.87908912]
 [   0.          228.16981617  228.16981617]]

第一个输入数据对应第一个输出数据的卷积核，类似于全连接中的W11：
[[ 0.07717962 -1.19064496 -1.66535497]
 [ 1.25750646  1.80767035 -0.0525058 ]
 [-1.4591329   0.9129148   0.86170935]]

卷积层输出数据的第一个batch的第一个channel的map的3X3的小块
[[-1499.06910742     0.             0.        ]
 [    0.             0.             0.        ]
 [    0.             0.             0.        ]]


# 3.反向传播

In [4]:
# Backward pass setup: load the upstream gradient (top_diff) and define the
# geometry constants used by the following cells.
residual = np.load("./data_pic/residual_7_maxpooling[10 16 14 14].npy")
print("上层传过来的残差top_diff的shape：")
print(residual.shape)
print("\n上层传过来的残差top_diff的第一个残差的左上角3x3的值：")
print(residual[0, 0, 0:3, 0:3])

# Same gradient with one pixel of zero padding around height and width.
residual_pad = np.pad(
    residual,
    ((0, 0), (0, 0), (1, 1), (1, 1)),
    mode='constant',
    constant_values=0,
)
print("\ntop_diff加pad之后的shape：\n" + str(residual_pad.shape))
print("\n上层传过来的残差top_diff的pad之后的第一个残差的左上角3x3的值：")
print(residual_pad[0, 0, 0:3, 0:3])

# Bias gradient: sum over batch and both spatial axes.
db = np.sum(residual, axis=(0, 2, 3))

# Geometry and hyper-parameters shared by the cells below.
H_out, W_out = 14, 14
stride = 1
HH, WW = 3, 3
N, F, C = 10, 16, 32
i, j = 0, 0
pad = 1
lr = 0.01
reg = 0.75

# Padded layer input and its top-left kernel-sized window (i = j = 0).
x_pad = np.pad(
    in_data,
    ((0, 0), (0, 0), (1, 1), (1, 1)),
    mode='constant',
    constant_values=0,
)
x_pad_masked = x_pad[:, :, i * stride:i * stride + HH, j * stride:j * stride + WW]
print("\n输入数据的shape：\n" + str(in_data.shape))
print("\n输入数据的加pad之后的shape：\n" + str(x_pad.shape))
print("\n取输入数据的加pad之后的左上角3x3小块的shape：\n" + str(x_pad_masked.shape))
print("\n取输入数据的加pad之后第一个map的左上角3x3小块：\n" + str(x_pad_masked[0, 0]))



上层传过来的残差top_diff的shape：
(10, 16, 14, 14)

上层传过来的残差top_diff的第一个残差的左上角3x3的值：
[[  0.00000000e+00   1.64330032e-09   1.45613870e-08]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   6.06516134e-09   0.00000000e+00]]

top_diff加pad之后的shape：
(10, 16, 16, 16)

上层传过来的残差top_diff的pad之后的第一个残差的左上角3x3的值：
[[  0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   1.64330032e-09]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00]]

输入数据的shape：
(10, 32, 14, 14)

输入数据的加pad之后的shape：
(10, 32, 16, 16)

取输入数据的加pad之后的左上角3x3小块的shape：
(10, 32, 3, 3)

取输入数据的加pad之后第一个map的左上角3x3小块：
[[   0.            0.            0.        ]
 [   0.          228.16981617  231.87908912]
 [   0.          228.16981617  228.16981617]]


## 3.1.对w求导

### 这里，求dw有两种求法：

#### 1.原作者的求法：w_diff = pad(Bottom_data) 分块与top_diff中的每一个像素相乘（这里没有用到卷积）。具体例子请看博客：https://blog.csdn.net/weixin_37251044/article/details/81910932

#### 2.我的求法：按照w_diff = pad(Bottom_data) 卷积 top_diff

In [5]:
# Method 1 (original author's): for every output position, multiply the
# matching input window by that position's residual and accumulate into dw.
dw_1 = np.zeros_like(con_w)

for i, j in np.ndindex(H_out, W_out):
    x_pad_masked = x_pad[:, :, i * stride:i * stride + HH, j * stride:j * stride + WW]
    for k in range(F):
        # One scalar per sample, reshaped so it broadcasts over (C, HH, WW).
        per_sample = residual[:, k, i, j].reshape(-1, 1, 1, 1)
        dw_1[k] += (x_pad_masked * per_sample).sum(axis=0)

print("\n取输入数据的加pad之后的左上角3x3小块的shape：\n" + str(x_pad_masked.shape))
print("\n对w求导的diff_w的shape：\n" + str(dw_1.shape))
print("\n对w求导的diff_w的第一个dw的值：\n" + str(dw_1[0, 0]))

# Shape of the broadcast helper used above.
print(residual[:, 0, 0, 0][:, None, None, None].shape)




取输入数据的加pad之后的左上角3x3小块的shape：
(10, 32, 3, 3)

对w求导的diff_w的shape：
(16, 32, 3, 3)

对w求导的diff_w的第一个dw的值：
[[ -8.46562862e-05  -3.35537174e-05  -4.15742318e-05]
 [ -7.83996051e-05  -5.53166854e-05  -8.53559591e-06]
 [ -6.20210835e-05  -5.94926942e-05  -3.16911145e-05]]
(10, 1, 1, 1)


In [6]:
# Method 2: compute each dw entry as the sum of (shifted padded input) times
# top_diff, one kernel tap (m, n) at a time.
dw_2 = np.zeros_like(con_w)
for m, n in np.ndindex(HH, WW):
    # H_out x W_out region of the padded input starting at offset (m, n).
    x_pad_masked_d = x_pad[:, :, m * stride:m * stride + H_out, n * stride:n * stride + W_out]
    for k, p in np.ndindex(F, C):
        tap_products = x_pad_masked_d[:, p, :, :] * residual[:, k, :, :]
        dw_2[k, p, m, n] = np.sum(tap_products, axis=(0, 1, 2))

print("\n对w求导的diff_w的shape：\n" + str(dw_2.shape))
print("\n对w求导的diff_w的第一个dw的值：\n" + str(dw_2[0, 0]))





对w求导的diff_w的shape：
(16, 32, 3, 3)

对w求导的diff_w的第一个dw的值：
[[ -8.46562862e-05  -3.35537174e-05  -4.15742318e-05]
 [ -7.83996051e-05  -5.53166854e-05  -8.53559591e-06]
 [ -6.20210835e-05  -5.94926942e-05  -3.16911145e-05]]


### 说明：这里两个方法求得的dw[0][0]完全一致（例如dw[0 0 0 0]位置上的数值都是-8.46562862e-05），从而验证了两种求法是等价的。
---

## 3.2.对输入数据x求导

In [7]:
# dx, method 1: scatter w * residual back onto the padded-input window that
# each output position read from, then strip the padding.
dx_pad = np.zeros_like(x_pad)

for i, j in np.ndindex(H_out, W_out):
    rows = slice(i * stride, i * stride + HH)
    cols = slice(j * stride, j * stride + WW)
    # Not used in this cell's computation; assigned as in the dw cell.
    x_pad_masked = x_pad[:, :, rows, cols]
    for n in range(N):
        # One scalar per filter, reshaped so it broadcasts over (C, HH, WW).
        per_filter = residual[n, :, i, j].reshape(-1, 1, 1, 1)
        dx_pad[n, :, rows, cols] += (con_w * per_filter).sum(axis=0)

print("\ndx_pad的shape：")
print(dx_pad.shape)
print("\ndx_pad[10 32 16 16]中batch_0位置上32个map的左上角3x3的小块：")
print(dx_pad[0, 0, 0:3, 0:3])

# Drop the padding border to get the gradient w.r.t. the unpadded input.
dx = np.zeros_like(in_data)
dx[...] = dx_pad[:, :, pad:-pad, pad:-pad]
print("\ndx的shape：")
print(dx.shape)
print("\ndx[10 32 14 14]中batch_0位置上32个map的左上角3x3的小块：")
print(dx[0, 0, 0:3, 0:3])





dx_pad的shape：
(10, 32, 16, 16)

dx_pad[10 32 16 16]中batch_0位置上32个map的左上角3x3的小块：
[[ -5.95674081e-09  -5.51905160e-09  -1.07998587e-08]
 [ -5.30157577e-09  -3.30199066e-09   7.88567986e-09]
 [  2.23308231e-08   1.09127648e-08   3.55374539e-08]]

dx的shape：
(10, 32, 14, 14)

dx[10 32 14 14]中batch_0位置上32个map的左上角3x3的小块：
[[ -3.30199066e-09   7.88567986e-09   8.58298651e-09]
 [  1.09127648e-08   3.55374539e-08  -1.47638047e-08]
 [ -2.19378633e-08  -1.57148127e-08   3.90876291e-08]]


In [8]:
# dx, method 2: convolve the padded top_diff with the 180-degree-rotated kernel.
# Relies on names defined by earlier cells (con_w, residual, residual_pad,
# in_data, stride, HH, WW, H_out, W_out, C, pad, i, j).
rot_w = np.rot90(con_w,2,(2,3)) # k=2 applies the 90-degree counter-clockwise rotation twice (180 degrees); (2,3) selects axes 2 and 3 of con_w as the rotation plane.
print ("\nw 的第一个3x3的小块：\n"+str(con_w[:,:,0:3,0:3][0][0]))
print ("\nrot180_w的shape：\n"+str(rot_w.shape))
print ("\nrot180_w的第一个3x3的小块：\n"+str(rot_w[:,:,0:3,0:3][0][0]))
# Note: the printout shows the rotation is correct; it is unclear why the
# original author's version does not rotate the kernel.

h = 0  # overwritten by the channel loop below
print ("\ntop_diff加pad之后的shape：\n"+str(residual_pad.shape))


# NOTE(review): i and j are not assigned in this cell before this line, so the
# window position depends on whatever values earlier cells left in them; this
# also reads residual_pad as it was before it is rebuilt further down.
residual_pad_masked = residual_pad[:, :, i*stride:i*stride+HH, j*stride:j*stride+WW]
print ("\nresidual_pad_masked的shape：\n"+str(residual_pad_masked.shape))
print ("\nresidual_pad_masked的左上角3X3小块：\n"+str(residual_pad_masked[0][0]))

dx_2 = np.zeros_like(in_data)

# Padding applied to top_diff: kernel size minus (1 + forward pad) per side.
pad_diff_H = HH - (1 + pad)
pad_diff_W = WW - (1 + pad)
residual_pad = np.pad(residual, ((0,), (0,), (pad_diff_H,), (pad_diff_W,)), mode='constant', constant_values=0)
for i in range(H_out):
    for j in range(W_out):
        # Kernel-sized window of the padded top_diff at position (i, j).
        residual_pad_masked = residual_pad[:, :, i*stride:i*stride+HH, j*stride:j*stride+WW]        
        for h in range(C):
            # Input channel h: sum over filters and kernel taps of
            # window * rotated kernel; assigned (not accumulated) per position.
            dx_2[:, h , i, j] = np.sum(residual_pad_masked[:,:,:,:] * rot_w[:, h, :, :], axis=(1,2,3))
print ("\ndx_2的shape：")
print (dx_2.shape)
print ("\ndx_2[10 32 14 14]中batch_0位置上32个map的左上角3x3的小块：")
print (dx_2[:,:,0:3,0:3][0][0])


w 的第一个3x3的小块：
[[-1.04843514 -0.82283949 -0.7007134 ]
 [ 1.17047538  0.48789643  0.90273564]
 [ 0.1461493  -0.15088991  0.15200268]]

rot180_w的shape：
(16, 32, 3, 3)

rot180_w的第一个3x3的小块：
[[ 0.15200268 -0.15088991  0.1461493 ]
 [ 0.90273564  0.48789643  1.17047538]
 [-0.7007134  -0.82283949 -1.04843514]]

top_diff加pad之后的shape：
(10, 16, 16, 16)

residual_pad_masked的shape：
(10, 16, 3, 3)

residual_pad_masked的左上角3X3小块：
[[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]

dx_2的shape：
(10, 32, 14, 14)

dx_2[10 32 14 14]中batch_0位置上32个map的左上角3x3的小块：
[[ -3.30199066e-09   7.88567986e-09   8.58298651e-09]
 [  1.09127648e-08   3.55374539e-08  -1.47638047e-08]
 [ -2.19378633e-08  -1.57148127e-08   3.90876291e-08]]


# 总结：两个计算结果一致，说明我们的推导是正确的
---

## 3.3.更新w和b

In [9]:
# Parameter update demo: apply one step to freshly drawn random w and b,
# using dw_1 and db from the earlier cells.
prev_gradient_w = con_w
w = np.random.randn(16, 32, 3, 3)
b = np.random.randn(16)

weight_step = lr * (dw_1 + prev_gradient_w * reg)
w -= weight_step
bias_step = lr * db
b -= bias_step
prev_gradient_w = w

print("更新后的W：")
print(w[0, 0, 0:3, 0:3])
print("\n更新后的b：")
print(b)

更新后的W：
[[ 0.24096305 -0.11422576 -0.11721828]
 [-0.83480148  0.01053807 -0.18909859]
 [-1.14731769 -0.81641328 -0.70336186]]

更新后的b：
[-0.06972303 -0.04964828  0.03469369 -1.49293904 -1.20007582 -0.95382245
  0.3049204   0.14409858  0.96337989 -1.26748569  0.73992418 -0.66945493
  0.31008377  0.76122079 -2.06858212  0.40428955]


# 其它:数组旋转180度

In [10]:
import numpy as np
# Demo: two successive 90-degree counter-clockwise rotations give the same
# result as a single rot90 call with k=2.
ac = np.arange(9).reshape(3, 3)
print(ac)

dc = np.rot90(ac)
print("将矩阵逆时针旋转90度")
print(dc)

ec = np.rot90(dc)
print("再将矩阵逆时针旋转90度")
print(ec)

fc = np.rot90(ac, k=2)
print("将矩阵逆时针旋转180度")
print(fc)


[[0 1 2]
 [3 4 5]
 [6 7 8]]
将矩阵逆时针旋转90度
[[2 5 8]
 [1 4 7]
 [0 3 6]]
再将矩阵逆时针旋转90度
[[8 7 6]
 [5 4 3]
 [2 1 0]]
将矩阵逆时针旋转180度
[[8 7 6]
 [5 4 3]
 [2 1 0]]


# numpy的点乘：

In [11]:
import numpy as np
# Demo: element-wise multiplication with broadcasting, and a sum over axis 0.
ac = np.arange(12).reshape(2, 2, 3)
print(ac)

# Shape (2, 1, 1): one zero per leading index, broadcast over the last two axes.
bc = np.zeros((2, 1, 1))
print(bc)

print(ac * bc)
print(ac.sum(axis=0))

[[[ 0  1  2]
  [ 3  4  5]]

 [[ 6  7  8]
  [ 9 10 11]]]
[[[ 0.]]

 [[ 0.]]]
[[[ 0.  0.  0.]
  [ 0.  0.  0.]]

 [[ 0.  0.  0.]
  [ 0.  0.  0.]]]
[[ 6  8 10]
 [12 14 16]]


In [12]:
# np.log is the natural logarithm (base e): log(2.7) is just under 1 because
# e is about 2.718. The base-10 logarithm is np.log10.
print(np.log(2.7))
print("说明：numpy的log函数默认是以e为底的（自然对数），以10为底请用np.log10")

0.99325177301
说明：numpy的log函数默认是以e为底的（自然对数），以10为底请用np.log10
