In [1]:
import torch

In [29]:
class Relu:

    def __call__(self, 
                 X: torch.Tensor):
        return torch.clamp(X, min=0)

    def backward(self, 
                 dZ:torch.Tensor):
        return (dZ > 0).float()

    def parameters(self):
        return []
        

In [3]:
class Softmax:

    def __call__(self, 
                X: torch.Tensor, 
                dim: int):
        X = X - torch.max(X, dim = 1, keepdims = True).values
        sof = torch.exp(X)/torch.sum(torch.exp(X), dim = dim , keepdims = True)
        return sof

    def parameters(self):
        return []
        

In [4]:
from typing import Optional , List
class OptimizerSG:

    def __init__(self,
                params: Optional[List],
                lr : float = 0.1):
        self.params = params
        self.lr = lr

    def step(self):
        for param in self.params:
            if param.grad is not None:
                param.data -= self.lr * param.grad


In [5]:
class Flatten:

    def __call__(self,
                X: torch.Tensor):
        self.X = X
        self.out = X.view(X.shape[0], -1)
        return self.out

    def backward(self,
                 dZ: torch.Tensor):
        dX = dZ.view(self.X.size())
        return dX 
    
    def parameters(self):
        return []

        

In [6]:
class Sequential:
    def __init__(self, 
                layers: List):
        self.layers = layers

    def __call__(self,
                X: torch.Tensor):
        for layer in self.layers:
            X = layer(X)
        self.out = X
        return self.out

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]


In [7]:
class Linear:
    def __init__(self, 
                fan_in: int,
                fan_out : int,
                bias = True):
        self.weight = torch.randn((fan_in, fan_out)) // fan_in ** 0.5
        self.bias = torch.randn(fan_out) if bias else None

    def __call__(self, 
                X: torch.Tensor):
        self.last_input = X
        self.out = X @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def backward(self, d_L_d_out):
        
        # d_L_d_weights = torch.matmul(self.last_input.t(), d_L_d_out)

        d_L_d_weights = self.last_input.T @ d_L_d_out
        d_L_d_biases = torch.sum(d_L_d_out, dim=0)
        d_L_d_input = d_L_d_out @ self.weight.T
        
        return d_L_d_input

    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])



In [16]:
class CrossEntropyLoss:

    def __call__(self,
                 y_pred: torch.Tensor,
                 y_true: torch.Tensor
                ):
        n_samples = y_pred.shape[0]
        log_likelihood = -torch.log(y_pred[range(n_samples), y_true])
        return torch.sum(log_likelihood) / n_samples

    def backward(self,
                y_pred: torch.Tensor,
                y_true: torch.Tensor
                ):
        n_samples = y_pred.shape[0]
        softmax = Softmax()
        grad = softmax(y_pred, dim=1)
        grad[range(n_samples), y_true] -= 1
        grad = grad / n_samples
        return grad

    def paramerters(self):
        return []

    
    

In [8]:
import torch
from itertools import repeat
from typing import Tuple

class MaxPool2d:

    def __init__(self, kernel_size: int | Tuple[int, int], stride: int | Tuple[int, int]):
        self.kernel_size = tuple(kernel_size) if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = tuple(stride) if isinstance(stride, tuple) else (stride, stride)
        self.kh, self.kw = self.kernel_size
        self.sh, self.sw = self.stride
        self.padded_height, self.padded_width = None, None

    def prepare_submatrix(self, X: torch.Tensor):
        B, C, ih, iw = X.shape
        oh = (ih - self.kh) // self.sh + 1
        ow = (iw - self.kw) // self.sw + 1
        subM = X.unfold(2, self.kh, self.sh).unfold(3, self.kw, self.sw)
        return subM

    def __call__(self, X: torch.Tensor):
        self.X = X
        subM = self.prepare_submatrix(X)
        return subM.max(dim=-1).values.max(dim=-1).values

    def add_padding(self, x: torch.Tensor, padding: int):
        padding = tuple(repeat(padding, 4))
        batch_size, in_channels, original_height, original_width = x.size()
        padded_height = original_height + padding[0] + padding[1]
        padded_width = original_width + padding[2] + padding[3]

        if (self.padded_height and self.padded_width) is None:
            self.padded_height, self.padded_width = padded_height, padded_width

        padded_x = torch.zeros((batch_size, in_channels, padded_height, padded_width), dtype=x.dtype)
        padded_x[:, :, padding[0]:padding[0] + original_height, padding[2]:padding[2] + original_width] = x
        return padded_x

    def prepare_mask(self, subM: torch.Tensor):
        B, C, oh, ow, kh, kw = subM.shape
        a = torch.reshape(subM, (-1, kh * kw))
        idx = torch.argmax(a, dim=1)
        b = torch.zeros_like(a)
        b[torch.arange(b.shape[0]), idx] = 1
        mask = b.view(B, C, oh, ow, kh, kw)
        return mask

    def mask_dXp(self, mask: torch.Tensor, dz: torch.Tensor):
        dz_expanded = dz.unsqueeze(-1).unsqueeze(-1).expand_as(mask)
        dXp = dz_expanded * mask
        return dXp

    def maxpool_backprop(self, dZ: torch.Tensor, X: torch.Tensor):
        Xp = self.add_padding(X, self.kernel_size[0])
        subM = self.prepare_submatrix(Xp)
        mask = self.prepare_mask(subM)
        dXp = self.mask_dXp(mask, dZ)
        return dXp

    def padding_backward(self, dXp: torch.Tensor):
        B, C, ih, iw = self.X.shape
        dX = dXp[:, :, self.padded_height:ih, self.padded_width:iw]
        return dX

    def backward(self, dL_dout, lr):
        Batch, num_channels, input_height, input_width = self.X.shape
        dL_dinput = torch.zeros_like(self.X)
        output_height = (input_height - self.kh) // self.sh + 1
        output_width = (input_width - self.kw) // self.sw + 1

        # Extract patches from the input tensor
        subM = self.prepare_submatrix(self.X)

        # Create the mask for the max pooling operation
        mask = self.prepare_mask(subM)

        # Expand dL_dout to match the shape of mask and perform element-wise multiplication
        dL_dout_expanded = dL_dout.unsqueeze(-1).unsqueeze(-1).expand_as(mask)
        dL_dinput_unfolded = dL_dout_expanded * mask

        # Combine the unfolded gradients to form the final gradient
        dL_dinput = dL_dinput_unfolded.contiguous().view(Batch, num_channels, output_height, output_width, self.kh, self.kw)
        dL_dinput = dL_dinput.permute(0, 1, 2, 4, 3, 5).contiguous().view(Batch, num_channels, output_height * self.kh, output_width * self.kw)

        # Reduce the overlapping areas by summing them
        result = torch.zeros_like(self.X)
        for i in range(self.kh):
            for j in range(self.kw):
                result[:, :, i::self.kh, j::self.kw] += dL_dinput[:, :, i::self.kh, j::self.kw]

        return result

    def parameters(self):
        return []


In [173]:
# Custom Conv2d Layer
class Conv2d:
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 stride: int = 1,
                 padding: int = 0,
                 dilation: int = 1,
                 groups: int = 1):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self._check_parameters()
        self._n_tuple()
        self.weights, self.bias = self.initialise_weights()
        self.padded_height, self.padded_width = None, None

    def _n_tuple(self):
        self.kernel_size = (self.kernel_size, self.kernel_size)
        self.stride = (self.stride, self.stride)
        self.padding = (self.padding, self.padding)
        self.dilation = (self.dilation, self.dilation)

    def initialise_weights(self):
        return (torch.randn(self.out_channels, self.in_channels // self.groups, *self.kernel_size, requires_grad=True),
                torch.zeros(self.out_channels, requires_grad=True))

    def add_padding(self, x: torch.Tensor, padding: int):
        padding = tuple(repeat(padding, 4))
        batch_size, in_channels, original_height, original_width = x.size()
        padded_height = original_height + padding[0] + padding[1]
        padded_width = original_width + padding[2] + padding[3]

        if (self.padded_height and self.padded_width) is None:
            self.padded_height, self.padded_width = padded_height, padded_width

        padded_x = torch.zeros((batch_size, in_channels, padded_height, padded_width), dtype=x.dtype)
        if padded_x[:, :, padding[0]:padding[0] + original_height, padding[2]:padding[2] + original_width].shape == x.shape:
            padded_x[:, :, padding[0]:padding[0] + original_height, padding[2]:padding[2] + original_width] = x
        else:
            return x
        return padded_x

    def _check_parameters(self):
        if self.groups <= 0:
            raise ValueError('groups must be a positive integer')
        if self.in_channels % self.groups != 0:
            raise ValueError('in_channels should be divisible by groups')
        if self.out_channels % self.groups != 0:
            raise ValueError('out_channels should be divisible by groups')

    def __call__(self, x: torch.Tensor):
        self.X = x
        self.ih, self.iw = self.X.shape[-2], self.X.shape[-1]
        batch_size, in_channels, in_height, in_width = x.size()
        out_height = (in_height + 2 * self.padding[0] - self.dilation[0] * (self.kernel_size[0] - 1) - 1) // \
                     self.stride[0] + 1
        out_width = (in_width + 2 * self.padding[1] - self.dilation[1] * (self.kernel_size[1] - 1) - 1) // self.stride[
            1] + 1

        if self.padding[0] > 0 or self.padding[1] > 0:
            x = self.add_padding(x, self.padding[0])

        self.out = torch.zeros(batch_size, self.out_channels, out_height, out_width)

        for h in range(out_height):
            for w in range(out_width):
                h_start = h * self.stride[0]
                h_end = h_start + self.kernel_size[0]
                w_start = w * self.stride[1]
                w_end = w_start + self.kernel_size[1]
                receptive_field = x[:, :, h_start:h_end, w_start:w_end]

                self.out[:, :, h, w] = torch.sum(
                    receptive_field.unsqueeze(1) * self.weights.view(1, self.out_channels,
                                                                     self.in_channels // self.groups,
                                                                     *self.kernel_size),
                    dim=(2, 3, 4)
                ) + self.bias.view(1, self.out_channels)

        return self.out

    def prepare_subMatrix(self, X, Kh, Kw, s):
        B, C, ih, iw = X.shape
        sh, sw = s

        Oh = (ih - Kh) // sh + 1
        Ow = (iw - Kw) // sw + 1

        strides = (C * ih * iw, iw * ih, iw * sh, sw, iw, 1)
        subM = torch.as_strided(X,
                                size=(B, C, Oh, Ow, Kh, Kw),
                                stride=strides
                                )
        return subM

    def padding_backward(self,
                         dXp: torch.Tensor):

        B, C, ih, iw = self.X.shape
        dX = dXp[:, :, self.padded_height:ih, self.padded_width:iw]
        return dX

    def convolve(self, X: torch.Tensor,
                 K: torch.Tensor,
                 s: Tuple = (1, 1),
                 mode: str = 'back'):

        F, Kc, Kh, Kw = K.shape
        subM = self.prepare_subMatrix(X, Kh, Kw, s)

        if mode == 'front':
            return torch.einsum('fckl,mcijkl->mfij', K, subM)
        elif mode == 'back':
            return torch.einsum('fdkl,mcijkl->mdij', K, subM)
        elif mode == 'param':
            return torch.einsum('mfkl,mcijkl->fcij', K, subM)

    def dz_D_dx(self,
                dZ: torch.Tensor,
                ih: int,
                iw: int):
        _, _, Hd, Wd = dZ.shape
        ph = ih - Hd + self.kernel_size[0] - 1
        pw = iw - Wd + self.kernel_size[0] - 1

        dZ_Dp = self.add_padding(dZ, ph)

        # Rotate the Kernel by 180 degrees

        k_rotated = self.weights.flip([2,3])

        # convolve w.r.t k_rotated
        dXp = self.convolve(dZ_Dp, k_rotated, mode='back')

        dX = self.padding_backward(dXp)
        return dX

    def backward(self,
                 dZ: torch.Tensor):
        Xp: torch.Tensor = self.add_padding(self.X, self.padding[0])
        B, C, ih, iw = Xp.shape

        # dZ -> dZ_D_dX
        dX = self.dz_D_dx(dZ, ih, iw)

        # gradient dK
        _, _, Hd, Wd = dZ.shape
        ph = self.ih - Hd - self.kernel_size[0] + 1
        pw = self.iw - Wd - self.kernel_size[0] + 1

        print("padding hei :", ph)
        
        dZ_Dp = self.add_padding(dZ, padding= ph)

        print('dZ_Dp shape', dZ_Dp.shape)
        self.dweights = self.convolve(Xp, dZ_Dp, mode='param')

        # gradient db
        self.dbias = dZ.sum(0)

        return dX

    def parameters(self):
        return [self.weights] + ([] if self.bias is None else [self.bias])


In [174]:
model = Sequential([
    Conv2d(in_channels=3, out_channels= 10, kernel_size= 3, stride=1, padding=1),
    Relu(),
    Conv2d(in_channels=10, out_channels= 10, kernel_size=3, stride=1, padding=1),
    Relu(),
    MaxPool2d(2,2),
])
classifier = Sequential([
            Flatten(),
            Linear(fan_in=250,
                   fan_out=3)
])

In [175]:
# parameter init
with torch.no_grad():
  classifier.layers[-1].weight *= 0.1 

In [176]:
parameters = model.parameters() + classifier.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

1943


In [177]:
y = torch.randint(0,3, (10,))
y

tensor([2, 2, 1, 0, 0, 1, 1, 2, 1, 2])

In [178]:
sof = Softmax()
x = classifier(model(torch.randn(10,3,10,10)))

In [179]:
loss = CrossEntropyLoss()
loss(x,y)

tensor(nan, grad_fn=<DivBackward0>)

In [180]:
dl = loss.backward(x,y)
dl

tensor([[ 1.0000e-01,  1.2251e-20, -1.0000e-01],
        [ 9.9960e-02,  3.6758e-05, -9.9996e-02],
        [ 1.0000e-01, -1.0000e-01,  9.8889e-20],
        [-1.4961e-05,  2.4380e-16,  1.4958e-05],
        [ 0.0000e+00,  2.0308e-19,  5.9993e-16],
        [ 6.1225e-02, -9.9666e-02,  3.8441e-02],
        [ 1.0000e-01, -1.0000e-01,  1.3473e-08],
        [ 1.0000e-01,  1.0839e-12, -1.0000e-01],
        [ 9.6307e-02, -9.6333e-02,  2.5806e-05],
        [ 9.9998e-02,  2.0643e-06, -1.0000e-01]], grad_fn=<DivBackward0>)

In [181]:
dl1 = classifier.layers[-1].backward(dl)

In [182]:
dl1

tensor([[-1.0000e-02,  1.0000e-02,  1.0000e-02,  ..., -1.0000e-02,
         -1.0000e-02,  0.0000e+00],
        [-9.9960e-03,  9.9996e-03,  9.9960e-03,  ..., -9.9996e-03,
         -9.9960e-03,  0.0000e+00],
        [-1.0000e-02, -9.8889e-21,  1.0000e-02,  ..., -4.0978e-10,
         -1.0000e-02,  0.0000e+00],
        ...,
        [-1.0000e-02,  1.0000e-02,  1.0000e-02,  ..., -1.0000e-02,
         -1.0000e-02,  0.0000e+00],
        [-9.6307e-03, -2.5806e-06,  9.6307e-03,  ...,  2.5812e-06,
         -9.6307e-03,  0.0000e+00],
        [-9.9998e-03,  1.0000e-02,  9.9998e-03,  ..., -1.0000e-02,
         -9.9998e-03,  0.0000e+00]], grad_fn=<MmBackward0>)

In [183]:
df = classifier.layers[-2].backward(dl1)
df.shape

torch.Size([10, 10, 5, 5])

In [184]:
dmp = model.layers[-1].backward(df, lr= 0.1)
dmp.shape

torch.Size([10, 10, 10, 10])

In [185]:
model.layers[-2].__class__.__name__

'Relu'

In [186]:
dre = model.layers[-2].backward(dmp)
print(dre)

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.],
          [0., 0., 0.,  ..., 0., 1., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.],
          [0., 0., 0.,  ..., 0., 1., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [1., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.],
          ...,
          [1., 0., 0.,  ..., 0., 0., 1.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 1., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 1.]],

         ...,

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 

In [187]:
model.layers[-3].__class__.__name__

'Conv2d'

In [188]:
dconv = model.layers[-3].backward(dre)
dconv.shape

padding hei : -2
dZ_Dp shape torch.Size([10, 10, 14, 14])


RuntimeError: Storage size calculation overflowed with sizes=[10, 10, -1, -1, 14, 14] and strides=[1440, 144, 12, 1, 12, 1]

In [149]:
dconv

tensor([], size=(10, 10, 0, 0), grad_fn=<SliceBackward0>)

In [64]:
model.layers[-3].weights[:,:,::-1, ::-1]

ValueError: step must be greater than zero

In [65]:
model.layers[-3].weights.flip([2,3])

tensor([[[[ 8.4431e-01,  1.3617e+00,  8.8031e-01],
          [-1.1335e-01,  2.7982e-01,  2.9498e-01],
          [ 1.1909e-01, -3.1197e-02,  3.7460e-01]],

         [[-2.6358e-01,  1.3453e-01,  6.4400e-01],
          [-3.3922e-01, -7.9234e-01,  3.0908e-01],
          [ 5.0949e-01, -9.7007e-01, -1.0359e+00]],

         [[-2.5105e+00, -3.5322e-01,  5.7998e-01],
          [-7.5148e-02, -2.0340e+00, -1.3283e+00],
          [-7.0969e-01, -5.7985e-01,  6.7454e-01]],

         [[ 2.2485e-01, -6.0687e-01,  5.7450e-01],
          [ 6.9587e-01, -3.2927e-01,  1.3095e+00],
          [ 1.2135e+00,  2.2984e-01,  7.7001e-01]],

         [[ 1.0539e+00,  5.8449e-01, -2.7815e-01],
          [ 1.2321e+00,  5.9105e-01, -2.7712e-01],
          [-2.3643e-01, -6.5620e-01, -6.4189e-01]],

         [[-6.6779e-01,  1.5244e+00,  1.5131e-01],
          [ 9.5951e-01,  5.0216e-02,  1.1087e+00],
          [ 4.6378e-01,  8.1848e-01, -5.5698e-01]],

         [[ 5.1324e-01,  7.0831e-01,  1.1616e+00],
          [-1.2329e

In [66]:
model.layers[-3].weights

tensor([[[[ 3.7460e-01, -3.1197e-02,  1.1909e-01],
          [ 2.9498e-01,  2.7982e-01, -1.1335e-01],
          [ 8.8031e-01,  1.3617e+00,  8.4431e-01]],

         [[-1.0359e+00, -9.7007e-01,  5.0949e-01],
          [ 3.0908e-01, -7.9234e-01, -3.3922e-01],
          [ 6.4400e-01,  1.3453e-01, -2.6358e-01]],

         [[ 6.7454e-01, -5.7985e-01, -7.0969e-01],
          [-1.3283e+00, -2.0340e+00, -7.5148e-02],
          [ 5.7998e-01, -3.5322e-01, -2.5105e+00]],

         [[ 7.7001e-01,  2.2984e-01,  1.2135e+00],
          [ 1.3095e+00, -3.2927e-01,  6.9587e-01],
          [ 5.7450e-01, -6.0687e-01,  2.2485e-01]],

         [[-6.4189e-01, -6.5620e-01, -2.3643e-01],
          [-2.7712e-01,  5.9105e-01,  1.2321e+00],
          [-2.7815e-01,  5.8449e-01,  1.0539e+00]],

         [[-5.5698e-01,  8.1848e-01,  4.6378e-01],
          [ 1.1087e+00,  5.0216e-02,  9.5951e-01],
          [ 1.5131e-01,  1.5244e+00, -6.6779e-01]],

         [[-5.3266e-01, -8.0716e-01,  1.9894e+00],
          [ 1.9018e