In [None]:
!pip install mxnet gluon d2l

In [2]:
from d2l import mxnet as d2l
from mxnet import autograd, np, npx
from mxnet.gluon import nn
npx.set_np()

In [3]:
def corr2d(X, K): #@save
  """Cross-correlate the 2-D input `X` with the 2-D kernel `K`.

  The kernel is slid over every position where it fits entirely inside `X`
  (no padding, stride 1), so the result has shape
  `(X.shape[0] - K.shape[0] + 1, X.shape[1] - K.shape[1] + 1)`.
  """
  k_rows, k_cols = K.shape
  out_rows = X.shape[0] - k_rows + 1
  out_cols = X.shape[1] - k_cols + 1
  Y = np.zeros((out_rows, out_cols))
  for r in range(out_rows):
    for c in range(out_cols):
      # Multiply the current window element-wise with the kernel and
      # reduce the products to a single value.
      window = X[r: r + k_rows, c: c + k_cols]
      Y[r, c] = d2l.reduce_sum(window * K)
  return Y

In [4]:
# Sanity check: a 3x3 input cross-correlated with a 2x2 kernel gives a
# (3 - 2 + 1) x (3 - 2 + 1) = 2x2 result.
X = np.array([[0.0, 1.0, 2.0],
              [3.0, 4.0, 5.0],
              [6.0, 7.0, 8.0]])
K = np.array([[0.0, 1.0],
              [2.0, 3.0]])
corr2d(X, K)

array([[19., 25.],
       [37., 43.]])

In [5]:
class Conv2D(nn.Block):
  """2-D cross-correlation layer built on top of `corr2d`.

  The layer owns two parameters: `weight`, whose shape is `kernel_size`,
  and `bias`, whose shape is `(1,)`.
  """

  def __init__(self, kernel_size, **kwargs):
    """Register the `weight` and `bias` parameters of the layer."""
    super().__init__(**kwargs)
    params = self.params
    self.weight = params.get('weight', shape=kernel_size)
    self.bias = params.get('bias', shape=(1,))

  def forward(self, x):
    """Return the cross-correlation of `x` with the weight, plus the bias."""
    response = corr2d(x, self.weight.data())
    return response + self.bias.data()

In [6]:
# Edge detection: build a 6x8 "image" of ones whose middle four columns
# (indices 2 to 5) are zero, which gives two vertical edges.
X = np.ones((6, 8))
X[:, 2:6] = 0
X

array([[1., 1., 0., 0., 0., 0., 1., 1.],
       [1., 1., 0., 0., 0., 0., 1., 1.],
       [1., 1., 0., 0., 0., 0., 1., 1.],
       [1., 1., 0., 0., 0., 0., 1., 1.],
       [1., 1., 0., 0., 0., 0., 1., 1.],
       [1., 1., 0., 0., 0., 0., 1., 1.]])

In [7]:
# 1x2 kernel computing x[i, j] - x[i, j + 1]: the result is zero wherever two
# horizontally adjacent elements are equal and non-zero at a vertical edge.
K = np.array([[1.0, -1.0]])

In [8]:
# Y is 1 at the 1 -> 0 edge, -1 at the 0 -> 1 edge and 0 everywhere else.
Y = corr2d(X, K)
Y

array([[ 0.,  1.,  0.,  0.,  0., -1.,  0.],
       [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
       [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
       [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
       [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
       [ 0.,  1.,  0.,  0.,  0., -1.,  0.]])

In [9]:
# The transposed image only has horizontal edges. This kernel compares
# horizontal neighbours only, so it cannot see them and the result is all zeros.
corr2d(X.T, K)

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [10]:
# Build a 2-D convolutional layer with a single output channel and a (1, 2)
# kernel. The bias is switched off to keep the example simple.
conv2d = nn.Conv2D(1, kernel_size=(1, 2), use_bias=False)
conv2d.initialize()
# The layer works on 4-D tensors laid out as (example, channel, height,
# width). Here there is exactly one example with one channel.
X = X.reshape(1, 1, 6, 8)
Y = Y.reshape(1, 1, 6, 7)
num_steps, lr = 10, 3e-2
for i in range(num_steps):
  with autograd.record():
    Y_hat = conv2d(X)
    l = (Y_hat - Y) ** 2
  l.backward()
  # Plain gradient-descent step, written in place into the kernel weights
  conv2d.weight.data()[:] -= lr * conv2d.weight.grad()
  # Report the summed squared error after every second step
  if (i + 1) % 2 == 0:
    print(f'batch {i + 1}, loss {float(l.sum()):.3f}')

batch 2, loss 4.949
batch 4, loss 0.831
batch 6, loss 0.140
batch 8, loss 0.024
batch 10, loss 0.004


In [11]:
# Show the learned kernel as a (1, 2) array; it should be close to the
# kernel [1, -1] that was used to generate Y.
d2l.reshape(conv2d.weight.data(), (1, 2))

array([[ 0.9895   , -0.9873705]])

In [12]:
# Output shape of a convolution without padding and with stride 1
# input shape:  n_h × n_w  (height × width)
# kernel shape: k_h × k_w  (height × width)

# output shape: (n_h − k_h + 1) × (n_w − k_w + 1)

In [13]:
# Output shape of a convolution with padding (stride 1)
# input shape:  n_h × n_w  (height × width)
# kernel shape: k_h × k_w  (height × width)
# padding:      p_h × p_w  (TOTAL rows × columns added, both sides combined)

# output shape: (n_h − k_h + p_h + 1) × (n_w − k_w + p_w + 1)

In [14]:
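# Padding that keeps the input shape (what some frameworks call
# padding='same'): choose p_h = k_h − 1 and p_w = k_w − 1 in total, i.e. for an
# odd kernel size k pad (k − 1) / 2 rows or columns on each side.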

In [15]:
from mxnet import np, npx
from mxnet.gluon import nn
npx.set_np()
def comp_conv2d(conv2d, X):
  """Apply the layer `conv2d` to the 2-D array `X` and return a 2-D result.

  The layer's `initialize()` is called first. `X` is then given two leading
  axes of size 1 (batch size and number of channels are both 1) before it is
  passed through the layer, and the two leading axes of the output (examples
  and channels) are stripped again before returning.
  """
  conv2d.initialize()
  batched_shape = (1, 1) + X.shape
  out = conv2d(X.reshape(batched_shape))
  # Keep only the trailing dimensions; examples and channels are dropped
  spatial_shape = out.shape[2:]
  return out.reshape(spatial_shape)
# Note that here 1 row or column is padded on either side, so a total of 2
# rows or columns are added. With a 3x3 kernel the output therefore keeps the
# 8x8 input shape: 8 - 3 + 2 + 1 = 8.
conv2d = nn.Conv2D(1, kernel_size=3, padding=(1, 1))
X = np.random.uniform(size=(8, 8))
comp_conv2d(conv2d, X).shape

(8, 8)

In [16]:
# Here, we use a convolution kernel with a height of 7 and a width of 3. The
# padding numbers on either side of the height and width are 3 and 1,
# respectively, so the 8x8 input shape is preserved
# (8 - 7 + 6 + 1 = 8 and 8 - 3 + 2 + 1 = 8).
conv2d = nn.Conv2D(1, kernel_size=(7, 3), padding=(3, 1))
comp_conv2d(conv2d, X).shape

(8, 8)

In [17]:
# convolution output shapes with padding and strides.
# input_shape = height * width | n_h * n_w
# kernel_size = height * width | k_h * k_w
# padding_size = height * width | p_h * p_w
# stride_size = height * width | s_h * s_w

# ⌊(n_h − k_h + p_h + s_h )/s_h ⌋ × ⌊(n_w − k_w + p_w + s_w )/s_w ⌋.

# then if p_h = k_h − 1 and p_w = k_w − 1
# ⌊(n_h + s_h − 1)/s_h ⌋ × ⌊(n_w + s_w − 1)/s_w ⌋

In [18]:
# X is still the 8x8 input; it is reused by the stride examples below.
print(X.shape)

(8, 8)


In [19]:
# Stride 2 in both directions halves the 8x8 input:
# floor((8 - 3 + 2 + 2) / 2) = 4 along each axis (total padding is 2).
conv2d = nn.Conv2D(1, kernel_size=3, padding=1, strides=2)
comp_conv2d(conv2d, X).shape

(4, 4)

In [20]:
# Different kernel size, padding and stride per axis:
# height: floor((8 - 3 + 0 + 3) / 3) = 2, width: floor((8 - 5 + 2 + 4) / 4) = 2.
conv2d = nn.Conv2D(1, kernel_size=(3, 5), padding=(0, 1), strides=(3, 4))
comp_conv2d(conv2d, X).shape

(2, 2)

In [21]:
# multi channel input: 
# we need a filter for each channel:
# c_i × k_h × k_w,      e.g.    3 × 64 × 64

In [22]:
from d2l import mxnet as d2l
from mxnet import np, npx
npx.set_np()

In [23]:
def corr2d_multi_in(X, K):
  """Cross-correlation for an input with several channels.

  `X` and `K` are walked in lockstep along their 0th (channel) dimension.
  Each channel of `X` is cross-correlated with the matching slice of `K`
  via `d2l.corr2d`, and the per-channel results are added together.
  """
  total = 0
  for x, k in zip(X, K):
    total = total + d2l.corr2d(x, k)
  return total

In [24]:
# Two-channel 3x3 input and a matching two-channel 2x2 kernel. The result is
# the sum of the two per-channel cross-correlations.
X = np.array([[[0.0, 1.0, 2.0],
               [3.0, 4.0, 5.0], 
               [6.0, 7.0, 8.0]],
            [[1.0, 2.0, 3.0],
             [4.0, 5.0, 6.0],
             [7.0, 8.0, 9.0]]])
print(X.shape)
K = np.array([[[0.0, 1.0],
               [2.0, 3.0]],
               [[1.0, 2.0],
                [3.0, 4.0]]])
print(K.shape)
corr2d_multi_in(X, K)

(2, 3, 3)
(2, 2, 2)


array([[ 56.,  72.],
       [104., 120.]])

In [25]:
def corr2d_multi_in_out(X, K):
  """Cross-correlation with several input and several output channels.

  Every entry along the 0th dimension of `K` is treated as one multi-input
  kernel and applied to `X` with `corr2d_multi_in`. The per-kernel results
  are stacked along a new leading axis.
  """
  per_output_channel = []
  for k in K:
    per_output_channel.append(corr2d_multi_in(X, k))
  return np.stack(per_output_channel, 0)

In [26]:
# Stack three kernels (K, K + 1, K + 2) along a new leading axis to obtain 3
# output channels; the resulting shape is (3, 2, 2, 2).
# NOTE: this rebinds `K`, so re-running this cell on its own stacks again and
# adds another dimension. Re-run the cell that defines the original `K` first.
K = np.stack((K, K + 1, K + 2), 0)
K.shape

(3, 2, 2, 2)

In [27]:
# One 2x2 result per output channel. The first one equals the earlier
# single-output result, because the first stacked kernel is the original K.
corr2d_multi_in_out(X, K)

array([[[ 56.,  72.],
        [104., 120.]],

       [[ 76., 100.],
        [148., 172.]],

       [[ 96., 128.],
        [192., 224.]]])