In [1]:
import torch

In [2]:
class Relu:

    def __call__(self, 
                 X: torch.Tensor):
        return torch.clamp(X, min=0)

    def backward(self, 
                 dZ:torch.Tensor):
        return (dZ > 0).float()

    def parameters(self):
        return []
        

In [3]:
class Softmax:

    def __call__(self, 
                X: torch.Tensor, 
                dim: int):
        X = X - torch.max(X, dim = 1, keepdims = True).values
        sof = torch.exp(X)/torch.sum(torch.exp(X), dim = dim , keepdims = True)
        return sof

    def parameters(self):
        return []
        

In [4]:
from typing import Optional , List
class OptimizerSG:

    def __init__(self,
                params: Optional[List],
                lr : float = 0.1):
        self.params = params
        self.lr = lr

    def step(self):
        for param in self.params:
            if param.grad is not None:
                param.data -= self.lr * param.grad


In [5]:
class Flatten:

    def __call__(self,
                X: torch.Tensor):
        self.X = X
        self.out = X.view(X.shape[0], -1)
        return self.out

    def backward(self,
                 dZ: torch.Tensor):
        dX = dZ.view(self.X.size())
        return dX 
    
    def parameters(self):
        return []

        

In [6]:
class Sequential:
    def __init__(self, 
                layers: List):
        self.layers = layers

    def __call__(self,
                X: torch.Tensor):
        for layer in self.layers:
            X = layer(X)
        self.out = X
        return self.out

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]


In [7]:
class Linear:
    def __init__(self, 
                fan_in: int,
                fan_out : int,
                bias = True):
        self.weight = torch.randn((fan_in, fan_out)) // fan_in ** 0.5
        self.bias = torch.randn(fan_out) if bias else None

    def __call__(self, 
                X: torch.Tensor):
        self.last_input = X
        self.out = X @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def backward(self, d_L_d_out):
        
        # d_L_d_weights = torch.matmul(self.last_input.t(), d_L_d_out)

        d_L_d_weights = self.last_input.T @ d_L_d_out
        d_L_d_biases = torch.sum(d_L_d_out, dim=0)
        d_L_d_input = d_L_d_out @ self.weight.T
        
        return d_L_d_input

    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])



In [92]:
class CrossEntropyLoss:

    def __call__(self,
                 y_pred: torch.Tensor,
                 y_true: torch.Tensor
                ):
        n_samples = y_pred.shape[0]
        log_likelihood = -torch.log(y_pred[range(n_samples), y_true])
        return torch.sum(log_likelihood) / n_samples

    def backward(self,
                y_pred: torch.Tensor,
                y_true: torch.Tensor
                ):
        n_samples = y_pred.shape[0]
        softmax = Softmax()
        grad = softmax(y_pred, dim=1)
        print(grad)
        grad[range(n_samples), y_true] -= 1
        grad = grad / n_samples
        return grad

    def paramerters(self):
        return []

    
    

In [9]:
import torch
from itertools import repeat
from typing import Tuple

class MaxPool2d:

    def __init__(self, kernel_size: int | Tuple[int, int], stride: int | Tuple[int, int]):
        self.kernel_size = tuple(kernel_size) if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = tuple(stride) if isinstance(stride, tuple) else (stride, stride)
        self.kh, self.kw = self.kernel_size
        self.sh, self.sw = self.stride
        self.padded_height, self.padded_width = None, None

    def prepare_submatrix(self, X: torch.Tensor):
        B, C, ih, iw = X.shape
        oh = (ih - self.kh) // self.sh + 1
        ow = (iw - self.kw) // self.sw + 1
        subM = X.unfold(2, self.kh, self.sh).unfold(3, self.kw, self.sw)
        return subM

    def __call__(self, X: torch.Tensor):
        self.X = X
        subM = self.prepare_submatrix(X)
        return subM.max(dim=-1).values.max(dim=-1).values

    def add_padding(self, x: torch.Tensor, padding: int):
        padding = tuple(repeat(padding, 4))
        batch_size, in_channels, original_height, original_width = x.size()
        padded_height = original_height + padding[0] + padding[1]
        padded_width = original_width + padding[2] + padding[3]

        if (self.padded_height and self.padded_width) is None:
            self.padded_height, self.padded_width = padded_height, padded_width

        padded_x = torch.zeros((batch_size, in_channels, padded_height, padded_width), dtype=x.dtype)
        padded_x[:, :, padding[0]:padding[0] + original_height, padding[2]:padding[2] + original_width] = x
        return padded_x

    def prepare_mask(self, subM: torch.Tensor):
        B, C, oh, ow, kh, kw = subM.shape
        a = torch.reshape(subM, (-1, kh * kw))
        idx = torch.argmax(a, dim=1)
        b = torch.zeros_like(a)
        b[torch.arange(b.shape[0]), idx] = 1
        mask = b.view(B, C, oh, ow, kh, kw)
        return mask

    def mask_dXp(self, mask: torch.Tensor, dz: torch.Tensor):
        dz_expanded = dz.unsqueeze(-1).unsqueeze(-1).expand_as(mask)
        dXp = dz_expanded * mask
        return dXp

    def maxpool_backprop(self, dZ: torch.Tensor, X: torch.Tensor):
        Xp = self.add_padding(X, self.kernel_size[0])
        subM = self.prepare_submatrix(Xp)
        mask = self.prepare_mask(subM)
        dXp = self.mask_dXp(mask, dZ)
        return dXp

    def padding_backward(self, dXp: torch.Tensor):
        B, C, ih, iw = self.X.shape
        dX = dXp[:, :, self.padded_height:ih, self.padded_width:iw]
        return dX

    def backward(self, dL_dout, lr):
        Batch, num_channels, input_height, input_width = self.X.shape
        dL_dinput = torch.zeros_like(self.X)
        output_height = (input_height - self.kh) // self.sh + 1
        output_width = (input_width - self.kw) // self.sw + 1

        # Extract patches from the input tensor
        subM = self.prepare_submatrix(self.X)

        # Create the mask for the max pooling operation
        mask = self.prepare_mask(subM)

        # Expand dL_dout to match the shape of mask and perform element-wise multiplication
        dL_dout_expanded = dL_dout.unsqueeze(-1).unsqueeze(-1).expand_as(mask)
        dL_dinput_unfolded = dL_dout_expanded * mask

        # Combine the unfolded gradients to form the final gradient
        dL_dinput = dL_dinput_unfolded.contiguous().view(Batch, num_channels, output_height, output_width, self.kh, self.kw)
        dL_dinput = dL_dinput.permute(0, 1, 2, 4, 3, 5).contiguous().view(Batch, num_channels, output_height * self.kh, output_width * self.kw)

        # Reduce the overlapping areas by summing them
        result = torch.zeros_like(self.X)
        for i in range(self.kh):
            for j in range(self.kw):
                result[:, :, i::self.kh, j::self.kw] += dL_dinput[:, :, i::self.kh, j::self.kw]

        return result

    def parameters(self):
        return []


In [25]:
import  torch
from typing import Tuple


class Conv2d:

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 stride: int = 1,
                 padding: int = 0,
                 dilation: int = 1,
                 groups: int = 1,
                 bias: bool = True
                 ) -> None:
        self.output_shape = None
        self.Ow = None
        self.Oh = None
        self.iw = None
        self.ih = None
        self.C = None
        self.B = None
        self.input_shape = None
        self.input_shape_x = None
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.kh, self.kw = self.kernel_size
        self.stride = stride if isinstance(stride, tuple) else (stride, stride)
        self.sh, self.sw = self.stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.weights, self.bias = self.initialise_parameters(bias)

    def initialise_parameters(self, bias: bool = True):

        return (torch.randn(self.out_channels, self.in_channels // self.groups, *self.kernel_size, requires_grad=True),
                torch.zeros(self.out_channels, requires_grad=True) if not bias else torch.randn(self.out_channels,
                                                                                                requires_grad=True))

    def get_padding_dimensions(self,
                               input_shape: torch.Tensor.size,
                               kernel_size: Tuple,
                               s=(1, 1),
                               padding: int | Tuple = None
                               ):
        if len(input_shape) == 4:
            B, C, ih, iw = input_shape
        if len(input_shape) == 3:
            C, ih, iw = input_shape

        kh, kw = kernel_size
        sh, sw = s
        if padding is None:
            p = self.padding
        else:
            p = padding

        if isinstance(p, int):
            pt, pb, pl, pr = p, p, p, p
        elif isinstance(p, tuple):
            ph, pw = p
            pt, pb = ph // 2, (ph + 1) // 2
            pl, pr = pw // 2, (pw + 1) // 2
        elif p == 'valid':
            pt, pb = 0, 0
            pl, pr = 0, 0

        elif p == 'same':
            # calculating how much padding is required in all 4 directions
            # (top, bottom, left and right)
            ph = (sh - 1) * ih + kh - sh
            pw = (sw - 1) * iw + kw - sw

            pt, pb = ph // 2, (ph + 1) // 2
            pl, pr = pw // 2, (pw + 1) // 2
        else:
            raise ValueError(
                "Incorrect padding type. Allowed types are only 'same', 'valid', an integer or a tuple of length 2.")

        if len(input_shape) == 4:
            output_shape = (B, C, ih + pt + pb, iw + pl + pr)
        elif len(input_shape) == 4:
            output_shape = (C, ih + pt + pb + iw + pl + pr)

        return output_shape, (pt, pb, pl, pr)

    def get_dimensions(self,
                       input_shape: torch.Tensor):
        self.input_shape_x = input_shape.shape
        # Padded X will be actual input to this Conv2D

        self.input_shape, _ = self.get_padding_dimensions(self.input_shape_x,
                                                          self.kernel_size, self.stride)

        if len(self.input_shape) == 3:
            self.C, self.ih, self.iw = self.input_shape
        elif len(self.input_shape) == 4:
            self.B, self.C, self.ih, self.iw = self.input_shape

        # output shape
        self.Oh = (self.ih - self.kh) // self.sh + 1
        self.Ow = (self.iw - self.kw) // self.sw + 1

        if len(self.input_shape) == 3:
            self.output_shape = (self.out_channels, self.Oh, self.Ow)
        elif len(self.input_shape) == 4:
            self.output_shape = (self.B, self.out_channels, self.Oh, self.Ow)

    def prepare_subMatrix(self,
                          X: torch.Tensor,
                          Kh: int,
                          Kw: int,
                          s):
        B, C, ih, iw = X.shape
        sh, sw = s

        Oh = (ih - Kh) // sh + 1
        Ow = (iw - Kw) // sw + 1

        strides = (C * ih * iw, iw * ih, iw * sh, sw, iw, 1)
        subM = torch.as_strided(X,
                                size=(B, C, Oh, Ow, Kh, Kw),
                                stride=strides
                                )
        return subM

    def convolve(self,
                 X: torch.Tensor,
                 K: torch.Tensor,
                 s: Tuple = (1, 1),
                 mode: str = 'back'):

        F, Kc, Kh, Kw = K.shape
        subM = self.prepare_subMatrix(X, Kh, Kw, s)

        if mode == 'front':
            return torch.einsum('fckl,mcijkl->mfij', K, subM)
        elif mode == 'back':
            return torch.einsum('fdkl,mcijkl->mdij', K, subM)
        elif mode == 'param':
            return torch.einsum('mfkl,mcijkl->fcij', K, subM)

    def padding_forward(self,
                        X: torch.Tensor,
                        kernel_size,
                        s=(1, 1),
                        padding=None) -> torch.Tensor:
        self.input_shape_before_padding = X.shape
        B, C, ih, iw = self.input_shape_before_padding
        self.output_shape_padded, (self.pt, self.pb, self.pl, self.pr) = self.get_padding_dimensions(
            self.input_shape_before_padding, kernel_size, s, padding=padding)

        zeros_r = torch.zeros((B, C, ih, self.pr), dtype=X.dtype, device=X.device)
        zeros_l = torch.zeros((B, C, iw, self.pl), dtype=X.dtype, device=X.device)
        zeros_t = torch.zeros((B, C, self.pt, iw + self.pl + self.pr), dtype=X.dtype, device=X.device)
        zeros_b = torch.zeros((B, C, self.pb, iw + self.pl + self.pr), dtype=X.dtype, device=X.device)

        Xp = torch.concat((X, zeros_r), dim=3)
        Xp = torch.concat((zeros_l, Xp), dim=3)
        Xp = torch.concat((zeros_t, Xp), dim=2)
        Xp = torch.concat((Xp, zeros_b), dim=2)

        return Xp

    def padding_backward(self,
                         dXp: torch.Tensor):

        B, C, ih, iw = self.input_shape
        dX = dXp[:, :, self.pt:self.pt + ih, self.pl:self.pl + iw]
        return dX

    def dilate2D(self,
                 X: torch.Tensor,
                 Dr=(1, 1)) -> torch.Tensor:
        dh, dw = Dr  # Dilation rate
        B, C, H, W = X.shape

        # Dilation along width
        if dw > 1:
            Xd_w = torch.zeros((B, C, H, W + (W - 1) * (dw - 1)), dtype=X.dtype, device=X.device)
            Xd_w[:, :, :, ::dw] = X
        else:
            Xd_w = X

        # Dilation along height
        if dh > 1:
            Xd_h = torch.zeros((B, C, H + (H - 1) * (dh - 1), Xd_w.shape[-1]), dtype=X.dtype, device=X.device)
            Xd_h[:, :, ::dh, :] = Xd_w
        else:
            Xd_h = Xd_w

        return Xd_h

    def dZ_D_dX(self,
                dZ_D: torch.Tensor,
                ih: int,
                iw: int) -> torch.Tensor:
        # pad the dilated dZ (dZ_D to dZ_Dp)

        _, _, Hd, Wd = dZ_D.shape
        ph = ih - Hd + self.kh - 1
        pw = iw - Wd + self.kw - 1

        dZ_Dp = self.padding_forward(dZ_D, self.kernel_size, self.stride, (ph, pw))

        # Rotate K by 180 degrees
        k_rotated = self.weights.flip([2, 3])

        # convolve the dZ_Dp with k_rotated
        dXp = self.convolve(dZ_Dp, k_rotated, mode='back')
        dX = self.padding_backward(dXp)

        return dX

    def __call__(self, X: torch.Tensor) -> torch.Tensor:
        self.X = X
        self.get_dimensions(X)
        # padding
        Xp = self.padding_forward(X, self.kernel_size, self.stride)

        # convolve X with K
        Z = self.convolve(Xp, self.weights, self.stride, mode= 'front') + self.bias

        return Z

    def backward(self, dZ: torch.Tensor) -> torch.Tensor:
        Xp = self.padding_forward(self.X, self.kernel_size, self.stride)

        B, C, ih, iw = Xp.shape

        # Dilate dZ (dZ -> dZ_D)
        dZ_D = self.dilate2D(dZ, Dr=self.stride)
        dX = self.dZ_D_dX(dZ_D, ih, iw)

        # Gradient K
        _, _, Hd, Wd = dZ_D.shape

        ph = self.ih - Hd - self.kh + 1
        pw = self.iw - Wd - self.kw + 1

        dZ_Dp = self.padding_forward(dZ_D, self.kernel_size, self.stride, padding=(ph, pw))
        self.dK = self.convolve(Xp, dZ_Dp, mode='param')

        # gradient db
        self.db = torch.sum(dZ, dim=0)

        return dX

    def parameters(self):
        return [self.weights] + ([] if self.bias is None else [self.bias])


In [279]:
import torch
from typing import Tuple

class Conv2d:
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 stride: int = 1,
                 padding: int = 0,
                 dilation: int = 1,
                 groups: int = 1,
                 bias: bool = True) -> None:
        self.output_shape = None
        self.Ow = None
        self.Oh = None
        self.iw = None
        self.ih = None
        self.C = None
        self.B = None
        self.input_shape = None
        self.input_shape_x = None
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.kh, self.kw = self.kernel_size
        self.stride = stride if isinstance(stride, tuple) else (stride, stride)
        self.sh, self.sw = self.stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.weights, self.bias = self.initialise_parameters()

    def initialise_parameters(self, bias: bool = True):
        # return (torch.randn(self.out_channels, self.in_channels // self.groups, *self.kernel_size, requires_grad=True),
        #         torch.zeros(self.out_channels,self.Oh, self.Ow, requires_grad=True) if not bias else torch.randn(self.out_channels,self.Oh, self.Ow, requires_grad=True))
        return (torch.randn(self.out_channels, self.in_channels // self.groups, *self.kernel_size, requires_grad=True),
                torch.zeros(self.out_channels, requires_grad=True) if not bias else torch.randn(self.out_channels, requires_grad=True))

    def get_padding_dimensions(self,
                               input_shape: torch.Tensor.size,
                               kernel_size: Tuple,
                               s=(1, 1),
                               padding: int | Tuple = None):
        if len(input_shape) == 4:
            B, C, ih, iw = input_shape
        if len(input_shape) == 3:
            C, ih, iw = input_shape

        kh, kw = kernel_size
        sh, sw = s
        if padding is None:
            p = self.padding
        else:
            p = padding

        if isinstance(p, int):
            pt, pb, pl, pr = p, p, p, p
        elif isinstance(p, tuple):
            ph, pw = p
            pt, pb = ph // 2, (ph + 1) // 2
            pl, pr = pw // 2, (pw + 1) // 2
        elif p == 'valid':
            pt, pb = 0, 0
            pl, pr = 0, 0

        elif p == 'same':
            ph = (sh - 1) * ih + kh - sh
            pw = (sw - 1) * iw + kw - sw

            pt, pb = ph // 2, (ph + 1) // 2
            pl, pr = pw // 2, (pw + 1) // 2
        else:
            raise ValueError("Incorrect padding type. Allowed types are only 'same', 'valid', an integer or a tuple of length 2.")

        if len(input_shape) == 4:
            output_shape = (B, C, ih + pt + pb, iw + pl + pr)
        elif len(input_shape) == 4:
            output_shape = (C, ih + pt + pb + iw + pl + pr)

        return output_shape, (pt, pb, pl, pr)

    def get_dimensions(self, input_shape: torch.Tensor):
        self.input_shape_x = input_shape.shape
        self.input_shape, _ = self.get_padding_dimensions(self.input_shape_x, self.kernel_size, self.stride)

        if len(self.input_shape) == 3:
            self.C, self.ih, self.iw = self.input_shape
        elif len(self.input_shape) == 4:
            self.B, self.C, self.ih, self.iw = self.input_shape

        self.Oh = (self.ih - self.kh) // self.sh + 1
        self.Ow = (self.iw - self.kw) // self.sw + 1

        if len(self.input_shape) == 3:
            self.output_shape = (self.out_channels, self.Oh, self.Ow)
        elif len(self.input_shape) == 4:
            self.output_shape = (self.B, self.out_channels, self.Oh, self.Ow)

    def prepare_subMatrix(self, X: torch.Tensor, Kh: int, Kw: int, s):
        B, C, ih, iw = X.shape
        sh, sw = s

        Oh = (ih - Kh) // sh + 1
        Ow = (iw - Kw) // sw + 1

        strides = (C * ih * iw, iw * ih, iw * sh, sw, iw, 1)
        subM = torch.as_strided(X,
                                size=(B, C, Oh, Ow, Kh, Kw),
                                stride=strides)
        return subM

    def convolve(self, X: torch.Tensor, K: torch.Tensor, s: Tuple = (1, 1), mode: str = 'back'):
        F, Kc, Kh, Kw = K.shape
        subM = self.prepare_subMatrix(X, Kh, Kw, s)
        print(F, Kc, Kh, Kw)
        if mode == 'front':
            return torch.einsum('fckl,mcijkl->mfij', K, subM)
        elif mode == 'back':
            return torch.einsum('fdkl,mcijkl->mdij', K, subM)
        elif mode == 'param':
            return torch.einsum('mfkl,mcijkl->fcij', K, subM)

    def padding_forward(self, X: torch.Tensor, kernel_size, s=(1, 1), padding=None) -> torch.Tensor:
        self.input_shape_before_padding = X.shape
        B, C, ih, iw = self.input_shape_before_padding
        self.output_shape_padded, (self.pt, self.pb, self.pl, self.pr) = self.get_padding_dimensions(self.input_shape_before_padding, kernel_size, s, padding=padding)

        zeros_r = torch.zeros((B, C, ih, self.pr), dtype=X.dtype, device=X.device)
        zeros_l = torch.zeros((B, C, iw, self.pl), dtype=X.dtype, device=X.device)
        zeros_t = torch.zeros((B, C, self.pt, iw + self.pl + self.pr), dtype=X.dtype, device=X.device)
        zeros_b = torch.zeros((B, C, self.pb, iw + self.pl + self.pr), dtype=X.dtype, device=X.device)

        Xp = torch.concat((X, zeros_r), dim=3)
        Xp = torch.concat((zeros_l, Xp), dim=3)
        Xp = torch.concat((zeros_t, Xp), dim=2)
        Xp = torch.concat((Xp, zeros_b), dim=2)

        return Xp

    def padding_backward(self, dXp: torch.Tensor):
        B, C, ih, iw = self.input_shape
        dX = dXp[:, :, self.pt:self.pt + ih, self.pl:self.pl + iw]
        return dX

    def dilate2D(self, X: torch.Tensor, Dr=(1, 1)) -> torch.Tensor:
        dh, dw = Dr
        B, C, H, W = X.shape

        if dw > 1:
            Xd_w = torch.zeros((B, C, H, W + (W - 1) * (dw - 1)), dtype=X.dtype, device=X.device)
            Xd_w[:, :, :, ::dw] = X
        else:
            Xd_w = X

        if dh > 1:
            Xd_h = torch.zeros((B, C, H + (H - 1) * (dh - 1), Xd_w.shape[-1]), dtype=X.dtype, device=X.device)
            Xd_h[:, :, ::dh, :] = Xd_w
        else:
            Xd_h = Xd_w

        return Xd_h

    def dZ_D_dX(self, dZ_D: torch.Tensor, ih: int, iw: int) -> torch.Tensor:
        _, _, Hd, Wd = dZ_D.shape
        ph = ih - Hd + self.kh - 1
        pw = iw - Wd + self.kw - 1

        dZ_Dp = self.padding_forward(dZ_D, self.kernel_size, self.stride, (ph, pw))
        k_rotated = self.weights.flip([2, 3])
        dXp = self.convolve(dZ_Dp, k_rotated, mode='back')
        dX = self.padding_backward(dXp)

        return dX

    def __call__(self, X: torch.Tensor) -> torch.Tensor:
        self.X = X
        self.get_dimensions(X)
        print(f"Input shape: {X.shape}")
        Xp = self.padding_forward(X, self.kernel_size, self.stride, self.padding)
        print(f"Padded input shape: {Xp.shape}")
        self.Z = self.convolve(Xp, self.weights, self.stride, mode = 'front')
        print(f"Convolved output shape: {self.Z.shape}")

        print(f"bias shape: {self.bias.shape}")

        if self.bias is not None:
            print(f"bias shape: {self.bias.unsqueeze(0).shape}")
            return self.Z + self.bias.view(1, -1, 1,1) #sum should be done on the last layer
            
        return self.Z

    def backward(self, dZ: torch.Tensor) -> torch.Tensor:
        Xp = self.padding_forward(self.X, self.kernel_size, self.stride)

        B, C, ih, iw = Xp.shape

        # Dilate dZ (dZ -> dZ_D)
        dZ_D = self.dilate2D(dZ, Dr=self.stride)
        dX = self.dZ_D_dX(dZ_D, ih, iw)

        # Gradient K
        _, _, Hd, Wd = dZ_D.shape

        ph = self.ih - Hd - self.kh + 1
        pw = self.iw - Wd - self.kw + 1

        dZ_Dp = self.padding_forward(dZ_D, self.kernel_size, self.stride, padding=(ph, pw))
        self.dK = self.convolve(Xp, dZ_Dp, mode='param')

        # gradient db
        self.db = torch.sum(dZ, dim=0)

        return dX

    def parameters(self):
        return [self.weights] + ([] if self.bias is None else [self.bias])


In [280]:
model.layers[-3].input_shape

(10, 10, 22, 22)

In [281]:
model = Sequential([
    Conv2d(in_channels=3, out_channels= 10, kernel_size= 3, stride=1, padding=1, dilation = 1),
    Relu(),
    Conv2d(in_channels=10, out_channels= 10, kernel_size=3, stride=1, padding=1, dilation = 1),
    Relu(),
    MaxPool2d(2,2),
])
classifier = Sequential([
            Flatten(),
            Linear(fan_in=1000,
                   fan_out=3)
])

In [282]:
# parameter init
with torch.no_grad():
  classifier.layers[-1].weight *= 0.1 

In [283]:
parameters = model.parameters() + classifier.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

4193


In [284]:
y = torch.randint(0,3, (10,))
y

tensor([1, 1, 0, 2, 1, 1, 1, 1, 2, 0])

In [285]:
sof = Softmax()
x = classifier(model(torch.randn(10,3,20,20)))

Input shape: torch.Size([10, 3, 20, 20])
Padded input shape: torch.Size([10, 3, 22, 22])
10 3 3 3
Convolved output shape: torch.Size([10, 10, 20, 20])
bias shape: torch.Size([10])
bias shape: torch.Size([1, 10])
Input shape: torch.Size([10, 10, 20, 20])
Padded input shape: torch.Size([10, 10, 22, 22])
10 10 3 3
Convolved output shape: torch.Size([10, 10, 20, 20])
bias shape: torch.Size([10])
bias shape: torch.Size([1, 10])


In [286]:
torch.randn(10,3,20,20) + torch.randn(10)

RuntimeError: The size of tensor a (20) must match the size of tensor b (10) at non-singleton dimension 3

In [287]:
loss = CrossEntropyLoss()
loss(x,y)

tensor(nan, grad_fn=<DivBackward0>)

In [288]:
model.layers[0].bias

tensor([ 0.5449, -0.4824,  0.3505, -0.9421, -2.4677,  1.0527, -2.1633, -0.1819,
        -0.8786,  0.7707], requires_grad=True)

In [289]:
x

tensor([[-1051.1987,  -998.0018, -1039.8259],
        [-1055.1014, -1043.0089, -1042.4410],
        [-1049.1191, -1067.7445, -1056.6111],
        [-1094.9994, -1076.3190, -1049.9355],
        [-1087.3521, -1065.5135, -1054.3969],
        [-1081.1299, -1063.2013, -1078.5293],
        [-1055.2360, -1079.5571, -1071.1384],
        [-1085.9821,  -972.9965, -1066.9231],
        [-1032.2350, -1025.0970, -1054.7189],
        [-1043.8016,  -978.1281, -1050.3032]], grad_fn=<AddBackward0>)

In [290]:
dl = loss.backward(x,y)
dl

tensor([[7.8864e-24, 1.0000e+00, 6.8553e-19],
        [2.0261e-06, 3.6173e-01, 6.3827e-01],
        [9.9944e-01, 8.1445e-09, 5.5725e-04],
        [2.6855e-20, 3.4820e-12, 1.0000e+00],
        [4.8723e-15, 1.4862e-05, 9.9999e-01],
        [1.6357e-08, 1.0000e+00, 2.2036e-07],
        [1.0000e+00, 2.7381e-11, 1.2406e-07],
        [0.0000e+00, 1.0000e+00, 1.6151e-41],
        [7.9376e-04, 9.9921e-01, 1.3648e-13],
        [3.0083e-29, 1.0000e+00, 4.5157e-32]], grad_fn=<DivBackward0>)


tensor([[ 7.8864e-25,  0.0000e+00,  6.8553e-20],
        [ 2.0261e-07, -6.3827e-02,  6.3827e-02],
        [-5.5724e-05,  8.1445e-10,  5.5725e-05],
        [ 2.6855e-21,  3.4820e-13,  0.0000e+00],
        [ 4.8723e-16, -9.9999e-02,  9.9999e-02],
        [ 1.6357e-09, -2.3842e-08,  2.2036e-08],
        [ 1.0000e-01, -1.0000e-01,  1.2406e-08],
        [ 0.0000e+00,  0.0000e+00,  1.6157e-42],
        [ 7.9376e-05,  9.9921e-02, -1.0000e-01],
        [-1.0000e-01,  1.0000e-01,  4.5157e-33]], grad_fn=<DivBackward0>)

In [291]:
dl1 = classifier.layers[-1].backward(dl)

In [292]:
dl1

tensor([[ 0.0000e+00, -7.8864e-26, -6.8554e-21,  ...,  0.0000e+00,
          0.0000e+00, -6.8553e-21],
        [ 6.3827e-03,  6.3827e-03, -1.8824e-10,  ...,  6.3827e-03,
          0.0000e+00, -6.3827e-03],
        [-8.1445e-11,  5.5724e-06, -1.1950e-10,  ..., -8.1445e-11,
          0.0000e+00, -5.5725e-06],
        ...,
        [ 0.0000e+00,  0.0000e+00, -1.6115e-43,  ...,  0.0000e+00,
          0.0000e+00, -1.6115e-43],
        [-9.9921e-03, -1.0000e-02,  5.2154e-10,  ..., -9.9921e-03,
          0.0000e+00,  1.0000e-02],
        [-1.0000e-02,  4.0978e-10,  4.0978e-10,  ..., -1.0000e-02,
          0.0000e+00, -4.5157e-34]], grad_fn=<MmBackward0>)

In [293]:
df = classifier.layers[-2].backward(dl1)
df.shape

torch.Size([10, 10, 10, 10])

In [294]:
dmp = model.layers[-1].backward(df, lr= 0.1)
dmp.shape

torch.Size([10, 10, 20, 20])

In [295]:
model.layers[-2].__class__.__name__

'Relu'

In [296]:
dre = model.layers[-2].backward(dmp)
print(dre)

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         ...,

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 

In [297]:
model.layers[-3].__class__.__name__

'Conv2d'

In [298]:
dconv = model.layers[-3].backward(dre)
dconv.shape

10 10 3 3
10 10 20 20


torch.Size([10, 10, 20, 20])

In [299]:
dconv

tensor([[[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          ...,
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00]],

         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          ...,
     

In [300]:
drel2 = model.layers[-4].backward(dconv)
drel2

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         ...,

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 

In [301]:
model.layers[-5].backward(drel2)

10 3 3 3
10 10 20 20


tensor([[[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          ...,
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00]],

         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          ...,
     

In [302]:
model.layers[-5].weights

tensor([[[[ 2.2225e-01, -1.3533e+00,  7.0780e-01],
          [ 1.5937e+00,  2.7034e-01, -5.8134e-01],
          [-1.8445e+00, -7.9288e-01, -1.0550e+00]],

         [[ 3.6484e-01, -1.3573e+00,  6.9395e-01],
          [-3.0585e-01,  1.0830e-01, -2.6417e-02],
          [ 1.4120e-01, -2.4403e+00,  2.0928e+00]],

         [[ 2.0570e+00, -4.3037e-01,  1.4259e-01],
          [ 9.7969e-01, -7.7720e-01,  6.7444e-01],
          [-3.2730e-01, -5.6612e-01,  3.2020e-01]]],


        [[[ 1.2262e+00,  1.3769e+00,  7.4663e-01],
          [ 7.9122e-01, -1.1171e+00,  1.0127e+00],
          [ 4.3653e-01,  1.2804e+00, -5.6944e-01]],

         [[-4.4468e-01, -3.2412e-01, -4.0151e-01],
          [ 6.3968e-01,  5.6488e-01, -8.7145e-01],
          [-5.1776e-02, -5.7690e-01, -1.7083e+00]],

         [[ 2.5890e-01, -8.1225e-01, -3.6549e-01],
          [-9.5235e-01, -5.5633e-01,  1.0506e+00],
          [ 2.6488e-01, -1.2906e+00, -1.0443e-01]]],


        [[[ 9.4880e-01,  1.3404e+00,  2.2903e-01],
          [-3.3

In [65]:
model.layers[-3].weights.flip([2,3])

tensor([[[[ 8.4431e-01,  1.3617e+00,  8.8031e-01],
          [-1.1335e-01,  2.7982e-01,  2.9498e-01],
          [ 1.1909e-01, -3.1197e-02,  3.7460e-01]],

         [[-2.6358e-01,  1.3453e-01,  6.4400e-01],
          [-3.3922e-01, -7.9234e-01,  3.0908e-01],
          [ 5.0949e-01, -9.7007e-01, -1.0359e+00]],

         [[-2.5105e+00, -3.5322e-01,  5.7998e-01],
          [-7.5148e-02, -2.0340e+00, -1.3283e+00],
          [-7.0969e-01, -5.7985e-01,  6.7454e-01]],

         [[ 2.2485e-01, -6.0687e-01,  5.7450e-01],
          [ 6.9587e-01, -3.2927e-01,  1.3095e+00],
          [ 1.2135e+00,  2.2984e-01,  7.7001e-01]],

         [[ 1.0539e+00,  5.8449e-01, -2.7815e-01],
          [ 1.2321e+00,  5.9105e-01, -2.7712e-01],
          [-2.3643e-01, -6.5620e-01, -6.4189e-01]],

         [[-6.6779e-01,  1.5244e+00,  1.5131e-01],
          [ 9.5951e-01,  5.0216e-02,  1.1087e+00],
          [ 4.6378e-01,  8.1848e-01, -5.5698e-01]],

         [[ 5.1324e-01,  7.0831e-01,  1.1616e+00],
          [-1.2329e

In [66]:
model.layers[-3].weights

tensor([[[[ 3.7460e-01, -3.1197e-02,  1.1909e-01],
          [ 2.9498e-01,  2.7982e-01, -1.1335e-01],
          [ 8.8031e-01,  1.3617e+00,  8.4431e-01]],

         [[-1.0359e+00, -9.7007e-01,  5.0949e-01],
          [ 3.0908e-01, -7.9234e-01, -3.3922e-01],
          [ 6.4400e-01,  1.3453e-01, -2.6358e-01]],

         [[ 6.7454e-01, -5.7985e-01, -7.0969e-01],
          [-1.3283e+00, -2.0340e+00, -7.5148e-02],
          [ 5.7998e-01, -3.5322e-01, -2.5105e+00]],

         [[ 7.7001e-01,  2.2984e-01,  1.2135e+00],
          [ 1.3095e+00, -3.2927e-01,  6.9587e-01],
          [ 5.7450e-01, -6.0687e-01,  2.2485e-01]],

         [[-6.4189e-01, -6.5620e-01, -2.3643e-01],
          [-2.7712e-01,  5.9105e-01,  1.2321e+00],
          [-2.7815e-01,  5.8449e-01,  1.0539e+00]],

         [[-5.5698e-01,  8.1848e-01,  4.6378e-01],
          [ 1.1087e+00,  5.0216e-02,  9.5951e-01],
          [ 1.5131e-01,  1.5244e+00, -6.6779e-01]],

         [[-5.3266e-01, -8.0716e-01,  1.9894e+00],
          [ 1.9018e

In [108]:
dL_dinput

tensor([[[[ 0.8928, -1.2513,  4.2086,  3.9624, -1.5612],
          [ 2.3196, 11.5357, -2.3091, -5.5656,  3.6225],
          [ 0.2837,  2.5777,  4.2926,  2.8067,  3.9617],
          [ 4.6437, -3.9557, -6.8163,  4.7117,  3.6249],
          [-0.9031,  1.1957,  1.9582, -5.7701, -2.6641]],

         [[-7.2258,  5.0775, -2.3936, -7.9741, -3.6698],
          [ 3.0071,  0.8859, -3.5058,  1.5540, -4.2442],
          [ 5.3874,  6.9414,  2.4184,  3.0068, -1.2007],
          [-1.0435, -2.4280,  0.0736,  2.3948, -9.0912],
          [ 1.1574, -3.1699, -3.3202,  6.5431, -5.7196]],

         [[ 0.8481,  4.5151,  1.5849,  0.6362,  4.9340],
          [ 3.4403,  1.2097,  3.9012,  4.4359,  3.3799],
          [ 2.6825, -0.7181, -5.6548, -1.6582,  3.5682],
          [-0.5762, -1.0083,  6.4352,  3.1938, -0.8816],
          [-1.2057, -1.6694,  1.2889,  1.9470, -0.1779]]]],
       grad_fn=<ConvolutionBackward0>)

In [29]:
len(torch.randn(1,2,2,2).shape)

4

In [30]:
a,b,c,d = torch.randn(1,2,2,2).shape

In [31]:
a, b, c,d

(1, 2, 2, 2)

In [79]:
def dilate2D(X, Dr=(1,1)):
    dh, dw = Dr  # Dilation rate
    B, C, H, W = X.shape

    # Dilation along width
    if dw > 1:
        Xd_w = torch.zeros((B, C, H, W + (W-1)*(dw-1)), dtype=X.dtype, device=X.device)
        Xd_w[:, :, :, ::dw] = X
    else:
        Xd_w = X

    # Dilation along height
    if dh > 1:
        Xd_h = torch.zeros((B, C, H + (H-1)*(dh-1), Xd_w.shape[-1]), dtype=X.dtype, device=X.device)
        Xd_h[:, :, ::dh, :] = Xd_w
    else:
        Xd_h = Xd_w

    return Xd_h


In [90]:
dilate2D(torch.randn(1,1,3,3), (2,2))

tensor([[[[-0.4840,  0.0000,  0.5853,  0.0000,  0.8889],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [-1.6283,  0.0000, -0.4107,  0.0000,  0.0686],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [-1.2815,  0.0000,  0.6569,  0.0000, -0.9884]]]])

In [86]:
import numpy as np
def dilate2D(X, Dr=(1,1)):
    dh, dw = Dr # Dilate rate
    m, C, H, W = X.shape
    Xd = np.insert(arr=X, obj=np.repeat(np.arange(1,W), dw-1), values=0, axis=-1)
    Xd = np.insert(arr=Xd, obj=np.repeat(np.arange(1,H), dh-1), values=0, axis=-2)
    return Xd

dilate2D(np.random.randn(1,1,3,3), (2,2))

array([[[[-0.41691474,  0.        , -0.37638792,  0.        ,
           1.33771079],
         [ 0.        ,  0.        ,  0.        ,  0.        ,
           0.        ],
         [ 0.73062132,  0.        ,  1.04873616,  0.        ,
          -0.90982625],
         [ 0.        ,  0.        ,  0.        ,  0.        ,
           0.        ],
         [ 0.33011789,  0.        ,  0.71715205,  0.        ,
           0.42174769]]]])

In [91]:
x = torch.randn(3,3,3,3)


In [95]:
x[:,:,:,::1]

tensor([[[[-7.9793e-01, -1.4052e+00,  4.0148e-01],
          [ 6.1866e-01,  2.0327e-01, -2.5889e-01],
          [ 5.9100e-01,  3.0477e-01,  4.0817e-01]],

         [[ 6.7856e-01,  9.8435e-02, -9.0524e-01],
          [-8.9082e-02, -6.9590e-01,  3.6190e-01],
          [ 1.6161e-01,  2.9888e+00,  8.5204e-01]],

         [[ 9.0889e-01, -3.8381e-01,  2.8003e-01],
          [-2.1493e-01,  5.8262e-01,  6.0509e-01],
          [ 7.1167e-03, -1.0686e+00,  4.9086e-01]]],


        [[[ 1.7531e+00,  4.9089e-02,  1.4345e-01],
          [-1.3172e+00, -3.4416e-01,  1.5436e+00],
          [ 5.4806e-02, -9.0877e-01,  5.1632e-01]],

         [[-5.7991e-01,  7.2682e-01, -1.8046e-01],
          [ 5.0542e-01, -1.4285e-01,  1.3375e+00],
          [-1.8278e-01,  2.1581e-01, -9.4317e-01]],

         [[-1.4291e+00,  2.6729e+00,  1.0795e+00],
          [-4.1787e-01,  1.4471e+00,  7.7561e-01],
          [-3.6922e-01, -1.5663e+00, -4.5650e-01]]],


        [[[-1.3512e+00, -1.8202e+00, -8.7615e-01],
          [-7.1

In [98]:
x[:,:,:,::2]

tensor([[[[-7.9793e-01,  4.0148e-01],
          [ 6.1866e-01, -2.5889e-01],
          [ 5.9100e-01,  4.0817e-01]],

         [[ 6.7856e-01, -9.0524e-01],
          [-8.9082e-02,  3.6190e-01],
          [ 1.6161e-01,  8.5204e-01]],

         [[ 9.0889e-01,  2.8003e-01],
          [-2.1493e-01,  6.0509e-01],
          [ 7.1167e-03,  4.9086e-01]]],


        [[[ 1.7531e+00,  1.4345e-01],
          [-1.3172e+00,  1.5436e+00],
          [ 5.4806e-02,  5.1632e-01]],

         [[-5.7991e-01, -1.8046e-01],
          [ 5.0542e-01,  1.3375e+00],
          [-1.8278e-01, -9.4317e-01]],

         [[-1.4291e+00,  1.0795e+00],
          [-4.1787e-01,  7.7561e-01],
          [-3.6922e-01, -4.5650e-01]]],


        [[[-1.3512e+00, -8.7615e-01],
          [-7.1592e-01,  2.5056e-01],
          [ 7.3638e-01,  6.7734e-01]],

         [[-5.4072e-01,  4.3290e-04],
          [-1.8095e-02, -9.8648e-01],
          [-5.6984e-01,  9.8978e-01]],

         [[-1.0057e+00,  7.0174e-01],
          [-4.5889e-01,  1.845

In [130]:
B,C,H,W = x.shape
dw = 2
x1 = torch.zeros((B, C, H, W + (W - 1) * (dw - 1)), dtype=x.dtype, device=x.device)
x1

tensor([[[[0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.]],

         [[0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.]],

         [[0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.]]],


        [[[0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.]],

         [[0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.]],

         [[0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.]]],


        [[[0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.]],

         [[0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.]],

         [[0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.],
          [0., 0., 0., 0., 0.]]]])

In [131]:
x1[:, :, :, ::dw] = x

In [132]:
x1

tensor([[[[-7.9793e-01,  0.0000e+00, -1.4052e+00,  0.0000e+00,  4.0148e-01],
          [ 6.1866e-01,  0.0000e+00,  2.0327e-01,  0.0000e+00, -2.5889e-01],
          [ 5.9100e-01,  0.0000e+00,  3.0477e-01,  0.0000e+00,  4.0817e-01]],

         [[ 6.7856e-01,  0.0000e+00,  9.8435e-02,  0.0000e+00, -9.0524e-01],
          [-8.9082e-02,  0.0000e+00, -6.9590e-01,  0.0000e+00,  3.6190e-01],
          [ 1.6161e-01,  0.0000e+00,  2.9888e+00,  0.0000e+00,  8.5204e-01]],

         [[ 9.0889e-01,  0.0000e+00, -3.8381e-01,  0.0000e+00,  2.8003e-01],
          [-2.1493e-01,  0.0000e+00,  5.8262e-01,  0.0000e+00,  6.0509e-01],
          [ 7.1167e-03,  0.0000e+00, -1.0686e+00,  0.0000e+00,  4.9086e-01]]],


        [[[ 1.7531e+00,  0.0000e+00,  4.9089e-02,  0.0000e+00,  1.4345e-01],
          [-1.3172e+00,  0.0000e+00, -3.4416e-01,  0.0000e+00,  1.5436e+00],
          [ 5.4806e-02,  0.0000e+00, -9.0877e-01,  0.0000e+00,  5.1632e-01]],

         [[-5.7991e-01,  0.0000e+00,  7.2682e-01,  0.0000e+00, -1.

In [133]:
dh = 2
x2 = torch.zeros((B, C, H + (H-1) * (dh-1), x1.shape[-1]), dtype=x.dtype, device=x.device)

In [135]:
x2[:,:,::dh,:] = x1

In [136]:
x2

tensor([[[[-7.9793e-01,  0.0000e+00, -1.4052e+00,  0.0000e+00,  4.0148e-01],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 6.1866e-01,  0.0000e+00,  2.0327e-01,  0.0000e+00, -2.5889e-01],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 5.9100e-01,  0.0000e+00,  3.0477e-01,  0.0000e+00,  4.0817e-01]],

         [[ 6.7856e-01,  0.0000e+00,  9.8435e-02,  0.0000e+00, -9.0524e-01],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
          [-8.9082e-02,  0.0000e+00, -6.9590e-01,  0.0000e+00,  3.6190e-01],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 1.6161e-01,  0.0000e+00,  2.9888e+00,  0.0000e+00,  8.5204e-01]],

         [[ 9.0889e-01,  0.0000e+00, -3.8381e-01,  0.0000e+00,  2.8003e-01],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
          [-2.1493e-01,  0.0000e+00,  5.8262e-01,  0.0000e+00,  6.0509e-

In [225]:
import torch
import torch.nn as nn

class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        # First convolutional layer
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=10, kernel_size=3)
        # Second convolutional layer
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=10, kernel_size=3)
    
    def forward(self, x):
        x = self.conv1(x)
        print(x.shape)
        x = torch.relu(x)  # Apply activation function
        x = self.conv2(x)
        x = torch.relu(x)  # Apply activation function
        return x

# Example usage
model = SimpleCNN()
input_image = torch.randn(1, 3, 32, 32)  # Batch size of 1, 3-channel image, 32x32 pixels
output = model(input_image)
# print(output.shape)  # Should be (1, 10, output_height, output_width)


torch.Size([1, 10, 30, 30])


In [212]:
model.conv1.weight.shape

torch.Size([10, 3, 3, 3])

In [213]:
model.conv1.bias

Parameter containing:
tensor([-0.1009, -0.1021,  0.1459,  0.0552,  0.1322,  0.0970, -0.0455,  0.1319,
         0.1671, -0.1678], requires_grad=True)