In [15]:
import numpy as np

In [16]:
import sys
sys.path.append(r"C:\Users\kanin\Desktop\CV\Neural network")
import core.nn as nn
import core.optim as optim
from core.losses import MSE
from core.utils import accuracy

## Padding

In [17]:
x = np.array([
        [1, 2,],
        [3, 4,],

    ])
pad_arr = np.pad(x, (2, 3), 'constant', 
                 constant_values=(0, 9))
pad_arr

array([[0, 0, 0, 0, 9, 9, 9],
       [0, 0, 0, 0, 9, 9, 9],
       [0, 0, 1, 2, 9, 9, 9],
       [0, 0, 3, 4, 9, 9, 9],
       [0, 0, 9, 9, 9, 9, 9],
       [0, 0, 9, 9, 9, 9, 9],
       [0, 0, 9, 9, 9, 9, 9]])

In [18]:
# 2 D
x = np.array([
    [9, 9],
    [9, 9]
])

pad_arr = np.pad(
    x,
    pad_width=((1, 2), (3, 4)),
    mode='constant',
    constant_values=((1, 2), (3, 4))
)
print(pad_arr)

[[3 3 3 1 1 4 4 4 4]
 [3 3 3 9 9 4 4 4 4]
 [3 3 3 9 9 4 4 4 4]
 [3 3 3 2 2 4 4 4 4]
 [3 3 3 2 2 4 4 4 4]]


In [19]:
# 4 D
x = np.array([[
    [
        [9, 9],
        [9, 9]
    ]
]])

x_padded = np.pad(
    x,
    pad_width=((0,0), (0,0), (1, 2), (3, 4)),
    mode='constant',
    constant_values=((0,0), (0,0), (1, 2), (3, 4))
)
x_padded

print("Padded x shape:", x_padded.shape)
print(x_padded)


Padded x shape: (1, 1, 5, 9)
[[[[3 3 3 1 1 4 4 4 4]
   [3 3 3 9 9 4 4 4 4]
   [3 3 3 9 9 4 4 4 4]
   [3 3 3 2 2 4 4 4 4]
   [3 3 3 2 2 4 4 4 4]]]]


In [20]:
pad = 2  
x_padded = np.pad(
    x,
    pad_width=((0,0), (0,0), (pad, pad), (pad, pad)),
    mode='constant',
    constant_values=0
)
print("Padded x shape:", x_padded.shape)
print(x_padded)

Padded x shape: (1, 1, 6, 6)
[[[[0 0 0 0 0 0]
   [0 0 0 0 0 0]
   [0 0 9 9 0 0]
   [0 0 9 9 0 0]
   [0 0 0 0 0 0]
   [0 0 0 0 0 0]]]]


## Simple Conv2D

In [21]:
# Input (5x5)
x = np.array([
    [1, 2, 3, 4, 0],
    [0, 1, 2, 3, 4],
    [4, 0, 1, 2, 3],
    [3, 4, 0, 1, 2],
    [2, 3, 4, 0, 1]
])  # shape: (H, W) = (5, 5)

# Kernel (2x2)
kernel = np.array([
    [1, 0],
    [0, 1],
])  # shape: (KH, KW) = (2, 2)

# Strides
stride_x = 1  # horizontal stride
stride_y = 3  # vertical stride

H, W = x.shape     # H=5, W=5
KH, KW = kernel.shape  # KH=2, KW=2

# Output dimensions
H_out = (H - KH) // stride_y + 1  # H_out = 2
W_out = (W - KW) // stride_x + 1  # W_out = 4

out = np.zeros((H_out, W_out))  # shape: (H_out, W_out) = (2, 4)

# Convolution
for i in range(H_out):
    for j in range(W_out):
        start_i = i * stride_y
        start_j = j * stride_x
        region = x[start_i:start_i+KH, start_j:start_j+KW]  # shape: (KH, KW) = (2, 2)
        out[i, j] = np.sum(region * kernel)                # scalar

print("Output:\n", out)  # shape: (H_out, W_out) = (2, 4)

Output:
 [[2. 4. 6. 8.]
 [6. 8. 0. 2.]]


In [22]:
# Parameters
N = 4               # number of batch size
C_in = 3            # input channels
F_out = 2           # number of filters/output channels
KH, KW = 3, 3       # kernel size
stride = 1

# Input: batch of 3 channels, 5x5
x = np.random.randint(0, 5, (N, C_in, 5, 5))  # (N, C_in, H, W) = (4, 3, 5, 5)

# Initialize kernels
W = np.random.randint(-1, 2, (F_out, C_in, KH, KW))     # (F_out, C_in, KH, KW) = (2, 3, 3, 3)
b = np.random.randint(0, 2, F_out)                      # (F_out,) = (2,)

H, W_in = x.shape[2], x.shape[3]                    # H = 5, W_in = 5
H_out = (H - KH) // stride + 1                      # output height = 3
W_out = (W_in - KW) // stride + 1                   # output width = 3

out = np.zeros((N, F_out, H_out, W_out))           # output shape = (N, F_out, H_out, W_out) = (4, 2, 3, 3)

# Perform convolution
for n in range(N):          # loop over batch
    for f in range(F_out):         
        for c in range(C_in):  # for each input channel
            for i in range(H_out):
                for j in range(W_out):
                    start_i = i * stride
                    start_j = j * stride
                    region = x[n, c, start_i:start_i+KH, start_j:start_j+KW]  # (KH, KW) = (3, 3)
                    out[n, f, i, j] += np.sum(region * W[f, c])                # scalar sum
        out[n, f] += b[f]  # add bias

print("Output:\n", out)  # (N, F_out, H_out, W_out) = (4, 2, 3, 3)


Output:
 [[[[  2.   9.   1.]
   [ -2.   6.   1.]
   [  0.   1.   8.]]

  [[ 18.  21.  17.]
   [ 19.  22.  22.]
   [ 20.  15.  25.]]]


 [[[ 11. -10.  12.]
   [ 10. -10.   9.]
   [  9.   7.   0.]]

  [[ 29.  11.  25.]
   [ 14.  18.  18.]
   [ 30.  15.  26.]]]


 [[[ 11.   8.   3.]
   [ -2.  12.  -2.]
   [ 13.  10.   4.]]

  [[ 31.  21.  15.]
   [ 24.  22.  10.]
   [ 25.  14.  10.]]]


 [[[  3.   3.  20.]
   [ -9.   3.   1.]
   [ 13.   9.   5.]]

  [[ 21.  29.  30.]
   [ 18.  23.  16.]
   [ 19.  24.  13.]]]]


In [23]:
class Conv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super().__init__()
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)

        self.KH, self.KW = kernel_size
        self.stride = stride
        self.padding = padding

        scale = np.sqrt(2 / (in_channels * self.KH * self.KW))
        self.W = nn.Parameter(scale * np.random.randn(out_channels, in_channels, self.KH, self.KW))
        self.b = nn.Parameter(np.zeros(out_channels))

        self.x = None

    def forward(self, x):
        self.x = x
        N, C, H, W = x.shape
        F_out = self.W.data.shape[0]
        KH, KW = self.KH, self.KW
        stride, pad = self.stride, self.padding

        # Output dimensions
        H_out = (H + 2*pad - KH) // stride + 1
        W_out = (W + 2*pad - KW) // stride + 1

        # Pad input
        x_padded = np.pad(x, ((0,0),(0,0),(pad,pad),(pad,pad)), mode="constant")
        out = np.zeros((N, F_out, H_out, W_out))

        # Sliding-window style
        for n in range(N):
            for i in range(H_out):
                for j in range(W_out):
                    h_start = i * stride
                    h_end = h_start + KH
                    w_start = j * stride
                    w_end = w_start + KW

                    # window of shape (C, KH, KW)
                    window = x_padded[n, :, h_start:h_end, w_start:w_end]

                    # Compute convolution for all filters at once
                    for f in range(F_out):
                        out[n, f, i, j] = np.sum(window * self.W.data[f]) + self.b.data[f]

        return out


In [24]:
N = 2           # batch size
C_in = 3        # input channels
H, W = 5, 5     # input height and width
C_out = 2       # output channels / number of filters
kernel_size = 3
stride = 1
padding = 0

# Random input
x = np.random.randn(N, C_in, H, W)

# Initialize Conv2d layer
conv = Conv2d(in_channels=C_in, out_channels=C_out, kernel_size=kernel_size, stride=stride, padding=padding)

# Forward pass
out = conv.forward(x)

print("Input shape:", x.shape)          # (2, 3, 5, 5)
print("Output shape:", out.shape)       # (2, 2, H_out, W_out)
print("Output sample [0,0]:\n", out[0,0])

Input shape: (2, 3, 5, 5)
Output shape: (2, 2, 3, 3)
Output sample [0,0]:
 [[ 1.82677114  0.26222125 -1.73973028]
 [-0.08396278  1.31555345 -0.92000793]
 [ 0.14000504  1.53162005 -0.41050941]]


## Pooling layer

In [25]:
# Input (5x5)
x = np.array([
    [1, 2, 3, 4, 0],
    [0, 1, 2, 3, 4],
    [4, 0, 1, 2, 3],
    [3, 4, 0, 1, 2],
    [2, 3, 4, 0, 1]
])  # shape: (H, W) = (5, 5)

# Pool space
pool_size_x = 2  # width of pooling window
pool_size_y = 2  # height of pooling window

# Strides
stride_x = 1  # horizontal stride
stride_y = 2  # vertical stride

# Input shape
H, W = x.shape  # H=5, W=5

# Output shape
H_out = (H - pool_size_y) // stride_y + 1  # H_out = 2
W_out = (W - pool_size_x) // stride_x + 1  # W_out = 4

# Output and global mask
out = np.zeros((H_out, W_out))             # shape: (H_out, W_out) = (2, 4)
mask = np.zeros_like(x, dtype=int)         # shape: (H, W) = (5, 5)

# Max-pooling forward + mask creation
for i in range(H_out):
    for j in range(W_out):
        start_i = i * stride_y
        start_j = j * stride_x

        # Current pooling window
        region = x[start_i:start_i+pool_size_y, start_j:start_j+pool_size_x]  # shape: (2, 2)

        # Max value in the window
        max_val = np.max(region)
        out[i, j] = max_val  # scalar

        # Local mask for this window (1 where max, 0 elsewhere)
        local_mask = (region == max_val).astype(int)

        # Add local mask into the global mask
        mask[start_i:start_i+pool_size_y, start_j:start_j+pool_size_x] += local_mask

# Display results
print("Input x:\n", x)
print("\nPooled output out:\n", out)
print("\nGlobal mask:\n", mask)


Input x:
 [[1 2 3 4 0]
 [0 1 2 3 4]
 [4 0 1 2 3]
 [3 4 0 1 2]
 [2 3 4 0 1]]

Pooled output out:
 [[2. 3. 4. 4.]
 [4. 4. 2. 3.]]

Global mask:
 [[0 1 1 2 0]
 [0 0 0 0 1]
 [1 0 0 1 1]
 [0 2 0 0 0]
 [0 0 0 0 0]]


In [26]:
# Input: batch of 2 samples, 3 channels, 5x5
N = 2
C = 3
H = 5
W = 5
x = np.random.randint(0, 5, (N, C, H, W))  # shape: (2, 3, 5, 5)

# Pool space
pool_size_x = 2  # width of pooling window
pool_size_y = 2  # height of pooling window

# Strides
stride_x = 1  # horizontal stride
stride_y = 2  # vertical stride

# Output size
H_out = (H - pool_size_y) // stride_y + 1  # 2
W_out = (W - pool_size_x) // stride_x + 1  # 4

# Output and mask
out = np.zeros((N, C, H_out, W_out))        # shape: (2, 3, 2, 4)
mask = np.zeros_like(x, dtype=int)          # shape: (2, 3, 5, 5)

# Max-pooling forward + mask creation
for n in range(N):
    for c in range(C):
        for i in range(H_out):
            for j in range(W_out):
                start_i = i * stride_y
                start_j = j * stride_x

                # Current pooling window
                region = x[n, c, start_i:start_i+pool_size_y, start_j:start_j+pool_size_x]  # shape: (2,2)

                # Max value
                max_val = np.max(region)
                out[n, c, i, j] = max_val

                # Local mask (1 where max, 0 elsewhere)
                local_mask = (region == max_val).astype(int)

                # Add local mask to the global mask
                mask[n, c, start_i:start_i+pool_size_y, start_j:start_j+pool_size_x] += local_mask

print('channel 0')
print("Input sample \n", x[0, 0])
print("\nPooled output sample \n", out[0, 0])
print("\nMask sample \n", mask[0, 0])

channel 0
Input sample 
 [[4 2 1 4 1]
 [0 2 3 3 3]
 [4 1 2 2 4]
 [2 2 1 1 1]
 [4 4 4 2 3]]

Pooled output sample 
 [[4. 3. 4. 4.]
 [4. 2. 2. 4.]]

Mask sample 
 [[1 0 0 2 0]
 [0 0 1 0 0]
 [1 0 2 1 1]
 [0 1 0 0 0]
 [0 0 0 0 0]]


In [27]:
class MaxPool2d(nn.Module):
    def __init__(self, pool_size=(2,2), stride=(1,1)):
        super().__init__()
        # Ensure pool_size and stride are tuples (height, width)
        if isinstance(pool_size, int):
            pool_size = (pool_size, pool_size)
        if isinstance(stride, int):
            stride = (stride, stride)

        self.pool_size_y, self.pool_size_x = pool_size  # pooling window height and width
        self.stride_y, self.stride_x = stride          # stride in vertical and horizontal directions
        self.x = None     # cache input for backward pass
        self.mask = None  # cache mask for backward pass

    def forward(self, x):
        self.x = x      # input array of shape (N, C, H, W)
        N, C, H, W = x.shape

        # Calculate output height and width
        H_out = (H - self.pool_size_y) // self.stride_y + 1
        W_out = (W - self.pool_size_x) // self.stride_x + 1

        # Initialize output array and mask
        out = np.zeros((N, C, H_out, W_out))
        self.mask = np.zeros_like(x, dtype=int)  # shape (N, C, H, W)

        # Loop over batch, channels, and output spatial dimensions
        for n in range(N):
            for c in range(C):
                for i in range(H_out):
                    for j in range(W_out):
                        # Determine window start and end indices
                        start_i = i * self.stride_y
                        start_j = j * self.stride_x

                        # Extract region of input corresponding to the pooling window
                        region = x[n, c, start_i:start_i+self.pool_size_y,
                                   start_j:start_j+self.pool_size_x]

                        # Find max value in window
                        max_val = np.max(region)
                        out[n, c, i, j] = max_val

                        # Create local mask (1 where max, 0 elsewhere)
                        local_mask = (region == max_val).astype(int)

                        # Store local mask in the global mask
                        self.mask[n, c, start_i:start_i+self.pool_size_y,
                                  start_j:start_j+self.pool_size_x] += local_mask

        return out  # (N, C, H_out, W_out)

In [28]:
N, C, H, W = 2, 3, 5, 5
np.random.seed(0)
x = np.random.randint(0, 5, (N, C, H, W))

# Initialize MaxPool2d layer
pool = MaxPool2d(pool_size=(2, 2), stride=(1, 2))

# Forward pass
out = pool.forward(x)
print("Input shape:", x.shape)
print("Pooled output shape:", out.shape)
print("Pooled output sample [0,0]:\n", out[0,0])
print("Mask sample [0,0]:\n", pool.mask[0,0])  # mask shows max locations

# Backward pass with dummy gradient (ones)
grad_output = np.ones_like(out)
dx = pool.backward(grad_output)
print("\nGradient w.r.t input shape:", dx.shape)
print("Gradient w.r.t input sample [0,0]:\n", dx[0,0])

Input shape: (2, 3, 5, 5)
Pooled output shape: (2, 3, 4, 2)
Pooled output sample [0,0]:
 [[4. 4.]
 [4. 4.]
 [4. 2.]
 [3. 3.]]
Mask sample [0,0]:
 [[1 0 0 0 0]
 [0 0 0 2 0]
 [0 2 1 0 0]
 [0 0 0 0 0]
 [1 0 1 0 0]]

Gradient w.r.t input shape: (2, 3, 4, 2)
Gradient w.r.t input sample [0,0]:
 [[1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]]
