In [1]:
# parameters
import torch
import torch.nn as nn
import math

# Tensor geometry (NCHW layout is used by the cells below).
BATCH_SIZE = 2
CHANNEL_IN = 3
CHANNEL_OUT = 3
HEIGHT_IN = 8
WIDTH_IN = 8

# Convolution hyper-parameters.
KERNEL_SIZE = 5
STRIDE = 1
GROUP = 1
# Half of the kernel/stride gap, rounded up.
PADDING = math.ceil((KERNEL_SIZE - STRIDE) / 2)
# Epsilon handed to the normalization layers.
EPS = 1e-5

# Convolution output extent: floor((in - kernel + 2 * padding) / stride) + 1.
# Integer floor division keeps the result an exact int (no float round-trip).
HEIGHT_OUT = (HEIGHT_IN - KERNEL_SIZE + 2 * PADDING) // STRIDE + 1
WIDTH_OUT = (WIDTH_IN - KERNEL_SIZE + 2 * PADDING) // STRIDE + 1

# Element counts of the input and output feature maps.
IN_SIZE = BATCH_SIZE * CHANNEL_IN * HEIGHT_IN * WIDTH_IN
# The output count must use the *output* channel/height/width, not the input ones.
OUT_SIZE = BATCH_SIZE * CHANNEL_OUT * HEIGHT_OUT * WIDTH_OUT

print("Padding: ", PADDING)
print("Height_out: ", HEIGHT_OUT)
print("WIDTH_OUT: ", WIDTH_OUT)
print("In size: ", IN_SIZE)
print("Out size: ", OUT_SIZE)

Padding:  2
Height_out:  8
WIDTH_OUT:  8
In size:  384
Out size:  384


In [2]:
# convolution
# Deterministic input pattern: x[n, c, h, w] = h + c
# (constant along the batch and width axes). Built by broadcasting instead of
# a Python loop over every element.
x = torch.zeros(BATCH_SIZE, CHANNEL_IN, HEIGHT_IN, WIDTH_IN)
x += torch.arange(HEIGHT_IN).view(1, 1, HEIGHT_IN, 1)
x += torch.arange(CHANNEL_IN).view(1, CHANNEL_IN, 1, 1)

# Second weight dimension: input channels divided by the number of groups.
KERNEL_CHANNEL = CHANNEL_IN // GROUP
# Deterministic weight pattern: kernel[k, l, i, j] = j + k
kernel = torch.zeros((CHANNEL_OUT, KERNEL_CHANNEL, KERNEL_SIZE, KERNEL_SIZE))
kernel += torch.arange(KERNEL_SIZE).view(1, 1, 1, KERNEL_SIZE)
kernel += torch.arange(CHANNEL_OUT).view(CHANNEL_OUT, 1, 1, 1)

# Deterministic bias pattern: bias[c] = c + 10
bias = torch.zeros((CHANNEL_OUT,))
bias += torch.arange(CHANNEL_OUT) + 10

conv = torch.nn.Conv2d(in_channels=CHANNEL_IN, out_channels=CHANNEL_OUT,
                       kernel_size=KERNEL_SIZE, bias=True, stride=STRIDE, padding=PADDING, groups=GROUP)
# Replace the randomly initialised parameters with the fixed patterns above.
conv.weight.data = kernel
conv.bias.data = bias

afterConv = conv(x)

print("afterConv shape: ", afterConv.shape)
# Show the first output channel of the first sample.
print(afterConv[0, 0, :, :])

afterConv shape:  torch.Size([2, 3, 8, 8])
tensor([[172., 190., 190., 190., 190., 190., 118.,  64.],
        [280., 310., 310., 310., 310., 310., 190., 100.],
        [415., 460., 460., 460., 460., 460., 280., 145.],
        [550., 610., 610., 610., 610., 610., 370., 190.],
        [685., 760., 760., 760., 760., 760., 460., 235.],
        [820., 910., 910., 910., 910., 910., 550., 280.],
        [712., 790., 790., 790., 790., 790., 478., 244.],
        [577., 640., 640., 640., 640., 640., 388., 199.]],
       grad_fn=<SliceBackward0>)


In [3]:
# batch normalization
# running_mean and running_var need one entry per output channel (CHANNEL_OUT).
RUNNING_MEAN = [ 82, 227, 444]
RUNNING_VAR = [ 945, 3780, 8505]
# Per-channel affine scale / shift, sized from CHANNEL_OUT.
weight = [0.5] * CHANNEL_OUT
bias = [0.2] * CHANNEL_OUT
momentum = 0
# Reuse the notebook-wide epsilon so all normalization cells agree.
eps = EPS

assert len(RUNNING_MEAN) == len(RUNNING_VAR) == CHANNEL_OUT, \
    "RUNNING_MEAN / RUNNING_VAR must have one entry per output channel"

batch_norm = nn.BatchNorm2d(num_features=CHANNEL_OUT,momentum=momentum, eps=eps)
# Overwrite the layer's statistics and affine parameters with the fixed values.
batch_norm.running_mean = torch.tensor(RUNNING_MEAN, dtype=torch.float)
batch_norm.running_var = torch.tensor(RUNNING_VAR, dtype=torch.float)
batch_norm.weight.data = torch.tensor(weight, dtype=torch.float)
batch_norm.bias.data = torch.tensor(bias, dtype=torch.float)

# Switch to evaluation mode before the forward pass.
batch_norm.eval()
afterNorm = batch_norm(afterConv)

print("BN output:")
# print("running_mean:", batch_norm.running_mean)
# print("running_var:",batch_norm.running_var)
# print("weight:",batch_norm.weight)
# print("bias:",batch_norm.bias)
print(afterNorm.shape)
print(afterNorm[0, 0, :, :])

BN output:
torch.Size([2, 3, 8, 8])
tensor([[ 1.6639,  1.9566,  1.9566,  1.9566,  1.9566,  1.9566,  0.7855, -0.0928],
        [ 3.4205,  3.9084,  3.9084,  3.9084,  3.9084,  3.9084,  1.9566,  0.4928],
        [ 5.6162,  6.3482,  6.3482,  6.3482,  6.3482,  6.3482,  3.4205,  1.2247],
        [ 7.8120,  8.7879,  8.7879,  8.7879,  8.7879,  8.7879,  4.8843,  1.9566],
        [10.0078, 11.2277, 11.2277, 11.2277, 11.2277, 11.2277,  6.3482,  2.6885],
        [12.2036, 13.6674, 13.6674, 13.6674, 13.6674, 13.6674,  7.8120,  3.4205],
        [10.4470, 11.7156, 11.7156, 11.7156, 11.7156, 11.7156,  6.6409,  2.8349],
        [ 8.2512,  9.2759,  9.2759,  9.2759,  9.2759,  9.2759,  5.1771,  2.1030]],
       grad_fn=<SliceBackward0>)


In [13]:
# layer normalization
# Normalized dimensions, in the (H, W, C) order of the permuted input below.
normalize_shape = (HEIGHT_OUT, WIDTH_OUT, CHANNEL_OUT)

# Deterministic affine pattern: weight[h, w, c] = h and bias[h, w, c] = w,
# built by broadcasting instead of looping over every element.
weight = torch.zeros(normalize_shape)
weight += torch.arange(HEIGHT_OUT).view(HEIGHT_OUT, 1, 1)
bias = torch.zeros(normalize_shape)
bias += torch.arange(WIDTH_OUT).view(1, WIDTH_OUT, 1)

# (N, C, H, W) -> (N, H, W, C) so the trailing dims line up with normalize_shape.
ln_in = afterConv.permute(0, 2, 3, 1)

ln_norm = nn.LayerNorm(normalize_shape, eps=EPS, elementwise_affine=True)
ln_norm.weight.data = weight
ln_norm.bias.data = bias
print("weight shape: ", ln_norm.weight.shape)
print("bias shape: ", ln_norm.bias.shape)

ln_out = ln_norm(ln_in)
# Back to (N, C, H, W). NOTE: this rebinds `afterNorm`, so it replaces the
# result of the batch-normalization cell for every cell that runs afterwards.
afterNorm = ln_out.permute(0, 3, 1, 2)
print("LN Out:")
print(afterNorm[0, 0, :, :])

weight shape:  torch.Size([4, 4, 3])
bias shape:  torch.Size([4, 4, 3])
LN Out:
tensor([[ 0.0000,  1.0000,  2.0000,  3.0000],
        [-0.6973,  0.3027,  1.3027,  1.6866],
        [-0.4706,  0.5294,  1.5294,  0.6812],
        [-1.6300, -0.6300,  0.3700, -0.7862]], grad_fn=<SliceBackward0>)


In [4]:
# relu: apply the ReLU activation to the current normalization result
relu = torch.nn.ReLU()
out = relu(afterNorm)

# first channel of the first sample
print(out[0, 0, :, :])

tensor([[ 1.6639,  1.9566,  1.9566,  1.9566,  1.9566,  1.9566,  0.7855,  0.0000],
        [ 3.4205,  3.9084,  3.9084,  3.9084,  3.9084,  3.9084,  1.9566,  0.4928],
        [ 5.6162,  6.3482,  6.3482,  6.3482,  6.3482,  6.3482,  3.4205,  1.2247],
        [ 7.8120,  8.7879,  8.7879,  8.7879,  8.7879,  8.7879,  4.8843,  1.9566],
        [10.0078, 11.2277, 11.2277, 11.2277, 11.2277, 11.2277,  6.3482,  2.6885],
        [12.2036, 13.6674, 13.6674, 13.6674, 13.6674, 13.6674,  7.8120,  3.4205],
        [10.4470, 11.7156, 11.7156, 11.7156, 11.7156, 11.7156,  6.6409,  2.8349],
        [ 8.2512,  9.2759,  9.2759,  9.2759,  9.2759,  9.2759,  5.1771,  2.1030]],
       grad_fn=<SliceBackward0>)


In [4]:
# silu: apply the SiLU activation to the current normalization result
silu = torch.nn.SiLU()
out = silu(afterNorm)

# first channel of the first sample
print(out[0, 0, :, :])

tensor([[ 1.3989,  1.7143,  1.7143,  1.7143,  1.7143,  1.7143,  0.5396, -0.0442],
        [ 3.3122,  3.8315,  3.8315,  3.8315,  3.8315,  3.8315,  1.7143,  0.3059],
        [ 5.5959,  6.3371,  6.3371,  6.3371,  6.3371,  6.3371,  3.3122,  0.9466],
        [ 7.8089,  8.7866,  8.7866,  8.7866,  8.7866,  8.7866,  4.8477,  1.7143],
        [10.0073, 11.2275, 11.2275, 11.2275, 11.2275, 11.2275,  6.3371,  2.5174],
        [12.2035, 13.6674, 13.6674, 13.6674, 13.6674, 13.6674,  7.8089,  3.3122],
        [10.4466, 11.7155, 11.7155, 11.7155, 11.7155, 11.7155,  6.6323,  2.6777],
        [ 8.2490,  9.2750,  9.2750,  9.2750,  9.2750,  9.2750,  5.1480,  1.8742]],
       grad_fn=<SliceBackward0>)


In [19]:
# gelu: apply the functional GELU activation to the current normalization result
gelu = torch.nn.functional.gelu
out = gelu(afterNorm)

# first channel of the first sample
print(out[0, 0, :, :])

tensor([[-0.1582, -0.1582, -0.1582, -0.1689],
        [-0.0430, -0.0430, -0.0430, -0.1688],
        [ 0.2201,  0.2201,  0.2201, -0.1582],
        [ 0.0280,  0.0280,  0.0280, -0.1665]], grad_fn=<SliceBackward0>)


In [5]:
# shortcut: add the block input `x` back onto the activation output
# NOTE: this rebinds `out`, so running the cell twice adds `x` twice;
# re-run the chosen activation cell before re-running this one.
# Guard against silent broadcasting when the conv changed the shape.
assert out.shape == x.shape, (
    "shortcut needs matching shapes, got out={} and x={}".format(
        tuple(out.shape), tuple(x.shape)
    )
)
out = out + x

print(out[0, 0, :, :])

tensor([[ 1.3989,  1.7143,  1.7143,  1.7143,  1.7143,  1.7143,  0.5396, -0.0442],
        [ 4.3122,  4.8315,  4.8315,  4.8315,  4.8315,  4.8315,  2.7143,  1.3059],
        [ 7.5959,  8.3371,  8.3371,  8.3371,  8.3371,  8.3371,  5.3122,  2.9466],
        [10.8089, 11.7866, 11.7866, 11.7866, 11.7866, 11.7866,  7.8477,  4.7143],
        [14.0073, 15.2275, 15.2275, 15.2275, 15.2275, 15.2275, 10.3371,  6.5174],
        [17.2035, 18.6674, 18.6674, 18.6674, 18.6674, 18.6674, 12.8089,  8.3122],
        [16.4466, 17.7155, 17.7155, 17.7155, 17.7155, 17.7155, 12.6323,  8.6777],
        [15.2490, 16.2750, 16.2750, 16.2750, 16.2750, 16.2750, 12.1480,  8.8742]],
       grad_fn=<SliceBackward0>)
