## Tricks for Attention:
- Permute (a multi-dimensional transpose, followed by `.contiguous()`) is the safer bet for manipulating matrices; einsum works as well
- don't trust reshape or view on their own
- don't try to do more than one thing at a time with view; be explicit
- Keep note of Attn matrix (TxT specifies what your heads are looking at)
- For convolutions: stride 1 with padding = 1 keeps the same spatial size for a 3x3 kernel (in general padding = (k - 1) / 2 for an odd kernel size k), provided the padded image is at least as large as the kernel
- Don't expect coherent results trying to permute and view at the same time
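- expecting a (C=2, H=3, W=3) tensor to be laid out as (all 9 pixels, 2 channels) after .view(h3*w3, C2) is fallible: view only reinterprets the existing memory order, so permute to (H, W, C) and call .contiguous() first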

## Tensor Mastery:
- You can't view a permuted tensor in a different shape later on directly, unless you first lay it out contiguously
- a = a.permute(...) followed later by a = a.view(x, y) doesn't work
- a = a.permute(...).contiguous() followed by a = a.view(x, y) works

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from modules import GroupNorm
import math

In [29]:

a = torch.ones(4, 3, 3, requires_grad=True)
b = a.pow(2)
loss = b.sum()
z = torch.autograd.grad(loss, a, retain_graph=True)
z

(tensor([[[2., 2., 2.],
          [2., 2., 2.],
          [2., 2., 2.]],
 
         [[2., 2., 2.],
          [2., 2., 2.],
          [2., 2., 2.]],
 
         [[2., 2., 2.],
          [2., 2., 2.],
          [2., 2., 2.]],
 
         [[2., 2., 2.],
          [2., 2., 2.],
          [2., 2., 2.]]]),)

In [22]:
a = torch.tensor ([1, 2, 3, 4])
b = torch.tensor ([100, 100, 100, 5 ])

answer = torch.argmin (b-a, dim=0)
answer.item()

3

In [24]:
a = torch.tensor ([[3,1,1], [2,2,3]])
a = a.view(2,3).unsqueeze(1)
a[0,0,0]

tensor(3)

### Testing broadcasting for quantization differences: each of the BHW vectors must be broadcast VS times, giving VS differences per vector

In [34]:
# Broadcasting probe: subtract a (VS, C) table from a (BHW, 1, C) batch and
# check that every one of the BHW vectors is paired with all VS rows.
torch.manual_seed(32)
B = 1
H = 2
W = 2
BHW = 4  # B * H * W flattened positions
vocab_size = 3
C = 2
X = torch.randn (BHW, C).unsqueeze (1) # (BHW, 1, C)
emb = torch.randn (vocab_size, C)
# NOTE: despite its name, `zeros` is filled with ones (ones_like), so `diff` below is X - 1.
zeros = torch.ones_like(emb) # (VS, C)

# (BHW, 1, C) - (VS, C) broadcasts to (BHW, VS, C)
diff = X - zeros
print (X)
print ("\n\n\n\n\n\n")
# the VS rows for the first position: the same vector repeated vocab_size times, each minus one
print (diff[0,:,:])

tensor([[[ 0.8651,  0.0284]],

        [[ 0.5256, -0.3633]],

        [[-0.4169, -1.2650]],

        [[ 1.2367,  0.1980]]])







tensor([[-0.1349, -0.9716],
        [-0.1349, -0.9716],
        [-0.1349, -0.9716]])


In [38]:
# Nearest-codebook lookup: mean squared distance from every input vector to every embedding row.
X = torch.randn (BHW, C).unsqueeze (1) # (BHW, 1, C)
emb = torch.randn (vocab_size, C)
# NOTE: `zeros` holds ones (ones_like) and is not used by the rest of this cell.
zeros = torch.ones_like(emb)

# (BHW, 1, C) - (VS, C) broadcasts to (BHW, VS, C); the mean over dim=2 removes the channel axis
distances = (X - emb).pow(2).mean(dim=2) # BHW, VS (because mean along dim = 2)
# index of the closest embedding row for each input vector
encoding_indices = torch.argmin(distances, dim=1)
distances.shape, encoding_indices.shape # implies encoding indices lose the dim along which we find min

(torch.Size([4, 3]), torch.Size([4]))

In [45]:
test = torch.tensor ([[[11,11,11],[12,12,12]], [[21,21,21],[22,22,22]], [[31,31,31],[32,32,32]]])
test_permuted = test.permute (0, 2, 1).contiguous()
test_permuted.detach()

tensor([[[11, 12],
         [11, 12],
         [11, 12]],

        [[21, 22],
         [21, 22],
         [21, 22]],

        [[31, 32],
         [31, 32],
         [31, 32]]])

In [52]:
x = torch.tensor ([[[11,11,11],[12,12,12]],[[21,21,21],[22,22,22]]])
x=x.view(2*2*3).unsqueeze(1)
x.shape

torch.Size([12, 1])

In [2]:
x = torch.randn (4,16,16,1024)
x = x.view (4*16*16, 1024)
x.shape

torch.Size([1024, 1024])

In [3]:
test = nn.Conv2d (3, 3, 3, stride=1, padding=1)
test.weight.shape, test.bias.shape,

(torch.Size([3, 3, 3, 3]), torch.Size([3]))

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from modules import GroupNorm
import math
from quantizer import Quantizer, QuantizerConfig
from encoder import Encoder, EncoderConfig
from decoder import Decoder, DecoderConfig

# End-to-end shape smoke test: encoder -> quantizer -> decoder on a random batch.
# Each stage is constructed from the config object imported alongside it.
enc = Encoder (EncoderConfig)
snap = Quantizer (QuantizerConfig)
dec = Decoder (DecoderConfig)
# requires a CUDA device
enc.to('cuda')
snap.to('cuda')
dec.to('cuda')
X = torch.randn (4, 3, 256, 256)
X = X.to ('cuda')
# NOTE(review): another cell in this notebook unpacks the encoder output as
# `ze, mu, logvar = test (x)`; here it is bound to a single name. Confirm which
# return signature the current Encoder has.
ze = enc(X)
# the quantizer result is unpacked as (loss, quantized latents, code indices)
vqloss, zq, indices = snap(ze)
image = dec (zq)
ze.shape, zq.shape, image.shape


(torch.Size([4, 1024, 16, 16]),
 torch.Size([4, 1024, 16, 16]),
 torch.Size([4, 3, 256, 256]))

In [5]:
from encoder import Encoder
from encoder import EncoderConfig
test = Encoder (EncoderConfig)

x = torch.randn (4, 3, 256,256)
x = x.to ('cuda')
test.to('cuda')
#a0=test.latent_activation[0](x)
#a1 =  test.latent_activation[1](a0)
#a2 = test.latent_activation[2](a1)
#a2.shape
ze, mu, logvar = test (x)
ze.shape, mu.shape, logvar.shape

(torch.Size([4, 1024, 16, 16]),
 torch.Size([4, 1024, 16, 16]),
 torch.Size([4, 1024, 16, 16]))

In [21]:
class isolatedSelfAttention (nn.Module):
    """Single-head self-attention over the H*W positions of a feature map.

    Queries, keys and values come from three separate 3x3 convolutions, which
    makes this a reference implementation for layers that fuse them into one
    convolution. The attention branch is added back onto the input (residual).

    `GroupNorm` is imported from the project's `modules`; it is assumed to
    return a tensor of the same (B, C, H, W) shape -- TODO confirm.
    """

    def __init__ (self, channels):
        super().__init__()
        self.channels = channels

        # normalisation applied before the projections
        self.group_norm = GroupNorm (channels)
        # size-preserving 3x3 projections, registered in q, k, v order
        same_size_3x3 = dict (kernel_size=3, stride=1, padding=1)
        self.q = nn.Conv2d (channels, channels, **same_size_3x3)
        self.k = nn.Conv2d (channels, channels, **same_size_3x3)
        self.v = nn.Conv2d (channels, channels, **same_size_3x3)
        # 1x1 output projection
        self.conv_projection = nn.Conv2d (channels, channels, kernel_size=1, stride=1, padding=0)

        # output of the attention branch from the most recent forward pass
        self.y = None

    def forward (self, X):
        """Return `X` plus the projected attention output; `X` is (B, C, H, W)."""
        B, C, H, W = X.size ()
        n_positions = H * W

        normed = self.group_norm (X)

        # flatten the spatial grid so that every pixel becomes one token
        queries = self.q (normed).view (B, C, n_positions).transpose (1, 2) # (B, HW, C)
        keys = self.k (normed).view (B, C, n_positions) # (B, C, HW)
        values = self.v (normed).view (B, C, n_positions).transpose (1, 2) # (B, HW, C)

        # scaled dot-product scores, normalised over the key axis
        scores = queries @ keys * (1.0/math.sqrt(C)) # (B, HW, HW)
        weights = F.softmax (scores, dim=-1)

        # mix the values, then restore the channel-first image layout;
        # contiguous() is required before view() because of the transpose
        mixed = (weights @ values).transpose (1, 2).contiguous ().view (B, C, H, W)

        self.y = self.conv_projection (mixed)
        return X + self.y



In [23]:
# C H W
a = torch.tensor ([
                    [[1,1,1],[1,1,1],[1,1,1]], 
                    [[2,2,2],[2,2,2],[2,2,2]], 
                    [[3,3,3],[3,3,3],[3,3,3]],
                    [[4,4,4,],[4,4,4,],[4,4,4,]]
                ])
a


tensor([[[1, 1, 1],
         [1, 1, 1],
         [1, 1, 1]],

        [[2, 2, 2],
         [2, 2, 2],
         [2, 2, 2]],

        [[3, 3, 3],
         [3, 3, 3],
         [3, 3, 3]],

        [[4, 4, 4],
         [4, 4, 4],
         [4, 4, 4]]])

In [26]:
# 4, 3, 3
# C, H, W -> nh, hs, H, W
channels = 4
n_head = 2
h = 3
w = 3
b=a.view (n_head, channels//n_head, h*w)
b[1,:,:]

tensor([[3, 3, 3, 3, 3, 3, 3, 3, 3],
        [4, 4, 4, 4, 4, 4, 4, 4, 4]])

In [22]:
c = b.view (n_head, channels//n_head, h*w)
wrong = a.view (h*w, channels)
c[1, :, :], wrong[:,0]

(tensor([[3, 3, 3, 3, 3, 3, 3, 3, 3],
         [4, 4, 4, 4, 4, 4, 4, 4, 4]]),
 tensor([1, 1, 1, 2, 2, 3, 3, 4, 4]))

In [22]:
# testing einsum against permute into contiguous
# Refer to modules.py for kernel fused F.scaled_dot_product_attention
class SelfAttention (nn.Module):
    """Multi-head self-attention over the H*W positions of a feature map.

    A single 3x3 convolution emits queries, keys and values (in that order
    along the channel axis); the channels of each are split into `n_head`
    heads of `channels // n_head` channels. The attention output is projected
    with a 1x1 convolution and added back onto the input (residual).

    Refer to modules.py for the kernel-fused F.scaled_dot_product_attention
    variant. `GroupNorm` is imported from the project's `modules`; it is
    assumed to return a tensor of the same (B, C, H, W) shape -- TODO confirm.

    Args:
        channels: number of input and output channels.
        n_head: number of attention heads; must divide `channels`.
    """

    def __init__(self, channels, n_head):
        super().__init__()
        self.channels = channels
        self.n_head = n_head
        assert channels % n_head == 0, f"Specified channels:{channels} are not divisible by number of attention heads{n_head}"
        # norm
        self.group_norm = GroupNorm (channels)
        # attention: one conv produces q, k and v stacked along the channel axis
        self.conv_attention = nn.Conv2d (channels, 3 * channels, kernel_size=3, stride=1, padding=1)
        self.conv_projection = nn.Conv2d (channels, channels, kernel_size=1, stride=1, padding=0)

    def forward (self, X):
        """Return `X` plus the projected attention output; `X` is (B, C, H, W)."""
        B, C, H, W = X.size()
        # normalize X
        x_normalized = self.group_norm (X)
        # emit q, k, v
        qkv = self.conv_attention (x_normalized) # (B, 3C, H, W)
        q, k, v = qkv.split (self.channels, dim=1) # (B, C, H, W) x3

        head_size = C // self.n_head

        # Queries sit on the left of the product so that the softmax below
        # normalises over the key axis: row i holds what position i attends to.
        q = q.view (B, self.n_head, head_size, H*W).transpose (-1, -2) # (B, nh, HW, hs)
        k = k.view (B, self.n_head, head_size, H*W) # (B, nh, hs, HW)
        v = v.view (B, self.n_head, head_size, H*W).transpose (-1, -2) # (B, nh, HW, hs)

        att = q @ k * (1.0/math.sqrt(head_size)) # (B, nh, HW, HW)
        att = F.softmax (att, dim=-1)
        y = att @ v # (B, nh, HW, HW) @ (B, nh, HW, hs) -> (B, nh, HW, hs), same as (B, T, T) @ (B, T, C)

        # back to channel-first; contiguous() is required before view() after the transpose
        y = y.transpose (-1, -2).contiguous ().view (B, C, H, W)
        y = self.conv_projection (y)
        return X + y


In [25]:
# Compare the fused-qkv attention layer against the layer with separate q/k/v
# convolutions by copying every parameter from the former into the latter.
torch.manual_seed (32)
X = torch.randn (2,64, 5,5)
y = X.clone()
# isolatedSelfAttention attends over all channels at once (a single head), so
# only n_head=1 is comparable; n_head is also a required constructor argument.
_3xattn = SelfAttention (64, n_head=1)
manual_attention = isolatedSelfAttention (64)

# snapshot the fused layer's parameters
_3xattn_weights = (_3xattn.conv_attention.weight.data.clone())
_3xattn_bias = (_3xattn.conv_attention.bias.data.clone())
_3xattn_proj_weight = _3xattn.conv_projection.weight.data.clone()
_3xattn_proj_bias = _3xattn.conv_projection.bias.data.clone()
_3xattn_group_norm_weights = _3xattn.group_norm.group_norm.weight.data.clone()
_3xattn_group_norm_biases = _3xattn.group_norm.group_norm.bias.data.clone()

# the fused conv stacks its output channels as [q | k | v], so split dim 0 into three
wq, wk, wv = _3xattn_weights.split (manual_attention.channels, dim=0)
bq, bk, bv = _3xattn_bias.split (manual_attention.channels, dim=0)

# load the snapshot into the separate-conv layer
manual_attention.q.weight.data = wq
manual_attention.q.bias.data = bq
manual_attention.k.weight.data = wk
manual_attention.k.bias.data = bk
manual_attention.v.weight.data = wv
manual_attention.v.bias.data = bv
manual_attention.conv_projection.weight.data = _3xattn_proj_weight
manual_attention.conv_projection.bias.data = _3xattn_proj_bias
manual_attention.group_norm.group_norm.weight.data = _3xattn_group_norm_weights
manual_attention.group_norm.group_norm.bias.data = _3xattn_group_norm_biases

# run both layers on the same input; out1 / out2 are compared in a later cell
out1 = _3xattn(X)
out2 = manual_attention (X)



Fused vs Individual Attention | exact: True  | approximate: True  | maxdiff: 0.0


In [None]:
import torch
import os
from PIL import Image
import torch
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset

In [3]:
import random
random.seed(42)  # You can replace 42 with any integer of your choice

# Set seed for CPU and GPU (if using CUDA)
torch.manual_seed(42)  # For CPU

# If using GPU (CUDA), set the seed for CUDA operations as well:
torch.cuda.manual_seed(42)  # For the current GPU device
torch.cuda.manual_seed_all(42)  # For all GPUs if using multiple GPUs


In [57]:
# Custom Dataset Class for Loading Images
class ImageDataset(Dataset):
    """Dataset over the JPEG images found directly inside one folder.

    Args:
        folder_path: directory that is scanned (non-recursively) for files
            whose name ends in ``.jpg`` or ``.jpeg``, case-insensitively.
        transform: optional callable applied to every loaded PIL image.
    """

    def __init__(self, folder_path, transform=None):
        self.folder_path = folder_path
        self.transform = transform
        # os.listdir returns entries in arbitrary order; sort them so that a
        # given index maps to the same file on every run and machine.
        self.image_files = sorted(
            f for f in os.listdir(folder_path)
            if f.lower().endswith(('.jpeg', '.jpg'))
        )

    def __len__(self):
        """Return the number of images that were found."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load image `idx` as RGB and return it, transformed if a transform was given."""
        img_name = os.path.join(self.folder_path, self.image_files[idx])
        # convert() returns a new, fully loaded image, so the file can be
        # closed as soon as the with-block ends
        with Image.open(img_name) as img:
            image = img.convert('RGB')  # Convert to RGB if it's not already

        # Apply transformations if any
        if self.transform is not None:
            image = self.transform(image)

        return image

In [58]:
# Define image transformations (resizing, conversion to tensor, normalization)
transform = transforms.Compose([
    transforms.Resize((256, 256)),  # Resize to 256x256
    transforms.ToTensor(),          # Convert image to tensor
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # Normalize to [-1, 1]
])

# Create dataset instance
dataset = ImageDataset('./crushit2', transform=transform)

In [80]:
batches = DataLoader(dataset, batch_size=800, shuffle=False)

In [81]:
# Pull one batch out of the loader and report its shape.
for batch_idx, batch in enumerate(batches):
    print(f"Batch {batch_idx + 1}:")
    print(f"Shape of the batch: {batch.shape}")  # (batch_size, channels, height, width)

    # The batch can now be passed to a model, for example:
    # output = model(batch)

    # Stop after the first batch: this is only a shape check, and `batch`
    # then keeps referring to the first batch for the cells that follow.
    break

Batch 1:
Shape of the batch: torch.Size([800, 3, 256, 256])


In [84]:
Xtr = batch

In [85]:
Xtr.shape

torch.Size([800, 3, 256, 256])

In [92]:
Xtr = Xtr.permute (0, 2,3, 1)
Xtr.shape

torch.Size([800, 256, 256, 3])

### Let's work group norm:

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F


In [6]:
# Step 1: Input tensor (batch of RGB images)
B, C_in, H, W = 4, 3, 256, 256  # Input: RGB images
images = torch.randn(B, C_in, H, W)
# Step 2: Convolution to increase the number of channels
conv = nn.Conv2d(in_channels=C_in, out_channels=256, kernel_size=3, stride=1, padding = 1)
features = conv(images)  # Output: B x 256 x 256 x 256
features.shape

torch.Size([4, 256, 256, 256])

In [31]:
class GroupNorm2D (nn.Module):
    """Group normalisation for inputs of shape (B, C, *spatial).

    The C channels are split into `num_groups` groups; every (sample, group)
    pair is normalised with its own mean and variance taken over the group's
    channels and all spatial positions. The variance is the biased estimator,
    which is what `torch.nn.GroupNorm` uses.

    Args:
        num_groups: number of channel groups; must divide `num_channels`.
        num_channels: number of channels C expected in the input.
        num_spatial_channels: number of spatial dimensions after (B, C); it
            fixes the broadcast shape of the affine parameters.
        affine: when true, learn a per-channel scale (gamma) and shift (beta).
        eps: value added to the variance for numerical stability.
        verbose: when true (the default), print the intermediate tensor
            shapes on every forward pass.
    """

    def __init__ (self, num_groups, num_channels, num_spatial_channels, affine=True, eps=1e-5, verbose=True):
        super().__init__()
        self.num_channels = num_channels
        self.num_groups = num_groups
        self.affine = affine
        self.eps = eps
        self.verbose = verbose
        self.gamma = None
        self.beta = None
        assert self.num_channels % self.num_groups == 0, f"channels :{num_channels} are not divisible by {num_groups} groups"

        # Learnable parameters, shaped (1, C, 1, ..., 1) so they broadcast over batch and space
        if affine:
            # scale
            self.gamma = nn.Parameter(torch.ones(1, num_channels, *[1] * num_spatial_channels))
            # shift
            self.beta = nn.Parameter (torch.zeros (1, num_channels, *[1] * num_spatial_channels))

    def forward (self, X):
        """Normalise `X` group-wise and, if affine, apply gamma and beta."""
        B, C = X.shape[:2]
        spatial_dims = X.shape[2:]
        assert C == self.num_channels, f"Mismatch between input channels: {C} and num_channels: {self.num_channels} at initialization"

        # rearrange the input so that the groups form an extra leading dimension
        G = self.num_groups
        group_size = C // G
        X = X.view (B, G, group_size, *spatial_dims)
        if self.verbose:
            print (f"Now shape of X after arranging in groups is {X.shape}")

        # compute mean and variance across group and spatial dimensions
        dims_to_reduce = tuple (range (2, X.dim())) # All Dimensions except B and G
        X_mean = X.mean (dim=dims_to_reduce, keepdim=True)
        # unbiased=False: divide by N rather than N - 1, as nn.GroupNorm does
        X_var = X.var (dim=dims_to_reduce, unbiased=False, keepdim=True)
        if self.verbose:
            # flatten(1) gives the (B, G) shape a keepdim=False reduction would have,
            # without computing the statistics a second time
            print (f"Shapes of means: NoKeepDim: {X_mean.flatten(1).shape} KeepDim: {X_mean.shape}")
            print (f"Shapes of vars: NoKeepDim: {X_var.flatten(1).shape} KeepDim: {X_var.shape}")

        # normalize:
        X = (X - X_mean) / torch.sqrt(X_var+self.eps)
        if self.verbose:
            print (f"Intermediate shape just after normalization = {X.shape}")
        # reshape back to original shape
        X = X.view (B, C, *spatial_dims)
        if self.verbose:
            print (f"Final shape after norm = {X.shape}")

        # Scale and shift; skipped when the layer was built with affine=False
        if self.gamma is None:
            return X
        return self.gamma * X + self.beta


In [27]:
# utility function we will use later when comparing manual layers to pytorch layers
def cmp(s, custom, torch_version):
    """Print how closely `custom` matches `torch_version`.

    One line is printed, labelled `s`, reporting whether the two tensors are
    element-wise identical, whether they agree within rtol=1e-5 / atol=1e-7,
    and the largest absolute element-wise difference. Nothing is returned.
    """
    identical = torch.all(custom == torch_version).item()
    within_tolerance = torch.allclose(custom, torch_version, rtol=1e-5, atol=1e-7)
    gap = custom - torch_version
    worst = gap.abs().max().item()
    print(f'{s:15s} | exact: {str(identical):5s} | approximate: {str(within_tolerance):5s} | maxdiff: {worst}')


## Let's work Conv2D

In [28]:
class Conv2D(nn.Module):
    """2-D convolution written as unfold (im2col) followed by a matrix product.

    Args:
        in_channels: channels expected in the input.
        out_channels: number of filters / output channels.
        kernel_size: int or (height, width) pair.
        stride: int or (height, width) pair.
        padding: int or (height, width) pair of zero padding added on both sides.
        bias: when true, learn one additive bias per output channel.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True):
        super().__init__()

        # promote scalar hyper-parameters to (height, width) pairs
        kernel_size, stride, padding = (
            (value, value) if isinstance(value, int) else value
            for value in (kernel_size, stride, padding)
        )

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        # weights: standard normal scaled by 1/sqrt(fan_in); bias: standard normal
        fan_in = in_channels * kernel_size[0] * kernel_size[1]
        self.weight = nn.Parameter(
            torch.randn(out_channels, in_channels, *kernel_size) * (1.0 / fan_in) ** 0.5
        )
        self.bias = nn.Parameter(torch.randn(out_channels)) if bias else None

    def forward(self, X):
        """Convolve `X` of shape (B, Cin, H, W); returns (B, out_channels, H_out, W_out)."""
        batch, n_in, height, width = X.shape
        assert n_in == self.in_channels, (
            f"In channels of input tensor: {n_in} don't match the conv layer initialization channels: {self.in_channels}"
        )

        (k_h, k_w), (s_h, s_w), (p_h, p_w) = self.kernel_size, self.stride, self.padding

        # spatial size of the result, computed from the unpadded input size
        out_h = (height + 2 * p_h - k_h) // s_h + 1
        out_w = (width + 2 * p_w - k_w) // s_w + 1

        # F.pad takes the last dimension first: (left, right, top, bottom)
        padded = F.pad(X, (p_w, p_w, p_h, p_h))

        # Every sliding window becomes one column:
        # (B, Cin, H, W) -> (B, Cin * kh * kw, L), with L = out_h * out_w windows.
        # e.g. (4, 3, 9, 9) with a 3x3 kernel and stride 1 gives (4, 27, 49)
        patches = F.unfold(padded, kernel_size=self.kernel_size, stride=self.stride)

        # flatten each filter the same way the windows were flattened,
        # e.g. (6, 3, 3, 3) -> (6, 27)
        filters = self.weight.view(self.out_channels, -1)

        # Contract over the flattened window axis. The result is produced
        # directly as (B, out_channels, L), so the final view only splits L
        # into (out_h, out_w) and never has to swap axes.
        out = torch.einsum('bkl, ok -> bol', patches, filters)

        if self.bias is not None:
            # one bias per output channel, broadcast over batch and positions
            out = out + self.bias.view(1, self.out_channels, 1)

        return out.view(batch, self.out_channels, out_h, out_w)


In [29]:
# Check the hand-written Conv2D against nn.Conv2d on one random input.
B, Cin, H, W = 1, 4, 400, 400
out_channels, kernel_size = 3, (4, 4)
stride, padding = (1, 1), (0, 0)

torch.manual_seed(32)
# NOTE: `input` shadows the Python builtin of the same name for the rest of the session.
input = torch.randn(B, Cin, H, W)

convLayer = Conv2D(Cin, out_channels, kernel_size, stride=stride, padding=padding, bias=True)
myOut = convLayer(input)

# give the reference layer exactly the same parameters so that only the implementation differs
torchConv = nn.Conv2d(Cin, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=True)
torchConv.weight = nn.Parameter(convLayer.weight.data.clone())
torchConv.bias = nn.Parameter(convLayer.bias.data.clone())
torchOut = torchConv(input)

# Compare outputs
print("Max difference:", (myOut - torchOut).abs().max().item())


Max difference: 2.384185791015625e-06


In [32]:
# Part 1: group norm. Random input with 256 channels and four spatial dimensions
# (not RGB images), to exercise the N-dimensional path of GroupNorm2D.
B, C_in, H, W, Z, Y = 4, 256, 20, 20, 20, 20# (batch, channels, four spatial dims)
images = torch.randn(B, C_in, H, W, Z, Y)

# third argument = number of spatial dimensions of `images`
normModule = GroupNorm2D (32, 256, 4, affine=True)
torch_group_norm = nn.GroupNorm(num_groups=32, num_channels=256, affine=True)

answer = normModule (images)
torch_answer = torch_group_norm (images)

# Part 2: convolution. Same set-up as the standalone Conv2D check; note that
# B, H and W are rebound here.
B, Cin, H, W = 1, 4, 400, 400
out_channels, kernel_size = 3, (4, 4)
stride, padding = (1, 1), (0, 0)

torch.manual_seed(32)
# NOTE: `input` shadows the Python builtin of the same name.
input = torch.randn(B, Cin, H, W)

convLayer = Conv2D(Cin, out_channels, kernel_size, stride=stride, padding=padding, bias=True)
myConv2D = convLayer(input)

# the reference layer receives a copy of the custom layer's parameters
torchConv = nn.Conv2d(Cin, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=True)
torchConv.weight = nn.Parameter(convLayer.weight.data.clone())
torchConv.bias = nn.Parameter(convLayer.bias.data.clone())
torchConv2D = torchConv(input)


print ("\n\n")
# TODO: Documentation; change tolerances later for approx,


Now shape of X after arranging in groups is torch.Size([4, 32, 8, 20, 20, 20, 20])
Shapes of means: NoKeepDim: torch.Size([4, 32]) KeepDim: torch.Size([4, 32, 1, 1, 1, 1, 1])
Shapes of vars: NoKeepDim: torch.Size([4, 32]) KeepDim: torch.Size([4, 32, 1, 1, 1, 1, 1])
Intermediate shape just after normalization = torch.Size([4, 32, 8, 20, 20, 20, 20])
Final shape after norm = torch.Size([4, 256, 20, 20, 20, 20])





## Let's work UpSample and DownSample blocks

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [34]:
class UpSample (nn.Module):
    """Enlarge a feature map by `factor`, then refine it with a 3x3 convolution.

    Args:
        channels: number of input and output channels of the convolution.
        factor: spatial scale factor handed to F.interpolate.
    """

    def __init__(self, channels, factor=2):
        super().__init__()
        self.factor = factor
        # stride 1 with padding 1 keeps the (already enlarged) spatial size
        self.conv = nn.Conv2d (channels, channels, kernel_size=3, stride=1, padding=1)

    def forward (self, X):
        """Interpolate `X` with F.interpolate's default mode and convolve the result."""
        enlarged = F.interpolate (X, scale_factor=self.factor)
        return self.conv (enlarged)

In [35]:
aold = torch.randn (1,2, 2,2)
upsampler = UpSample (2, 2)


In [36]:
upsampleda = F.interpolate (aold, scale_factor=2.0)
aold, upsampleda

(tensor([[[[-2.3440, -0.3994],
           [-1.2656, -1.4913]],
 
          [[-0.4680,  1.5279],
           [-0.4927,  0.1401]]]]),
 tensor([[[[-2.3440, -2.3440, -0.3994, -0.3994],
           [-2.3440, -2.3440, -0.3994, -0.3994],
           [-1.2656, -1.2656, -1.4913, -1.4913],
           [-1.2656, -1.2656, -1.4913, -1.4913]],
 
          [[-0.4680, -0.4680,  1.5279,  1.5279],
           [-0.4680, -0.4680,  1.5279,  1.5279],
           [-0.4927, -0.4927,  0.1401,  0.1401],
           [-0.4927, -0.4927,  0.1401,  0.1401]]]]))

In [37]:
x = torch.tensor([[1, 2, 3], [2,3,4]])

In [38]:
x.shape

torch.Size([2, 3])

In [39]:
x= F.pad (x, (0,1,0,1), mode='constant')

In [40]:
x

tensor([[1, 2, 3, 0],
        [2, 3, 4, 0],
        [0, 0, 0, 0]])

In [41]:
x.shape

torch.Size([3, 4])

In [42]:
x.size()

torch.Size([3, 4])

In [43]:
a = torch.tensor(
    [[[1, 1, 1],[1, 1, 1],[1, 1, 1]], [[2, 2, 2],[2, 2, 2],[2, 2, 2]]]
)


In [44]:
a.shape

torch.Size([2, 3, 3])

In [45]:
a.view (3,3,2)

tensor([[[1, 1],
         [1, 1],
         [1, 1]],

        [[1, 1],
         [1, 2],
         [2, 2]],

        [[2, 2],
         [2, 2],
         [2, 2]]])

In [46]:
a = a.permute (1,2,0)
a, a.shape

(tensor([[[1, 2],
          [1, 2],
          [1, 2]],
 
         [[1, 2],
          [1, 2],
          [1, 2]],
 
         [[1, 2],
          [1, 2],
          [1, 2]]]),
 torch.Size([3, 3, 2]))

In [47]:
a = torch.tensor(
    [[[1, 1, 1],[1, 1, 1],[1, 1, 1]], [[2, 2, 2],[2, 2, 2],[2, 2, 2]]]
)


In [48]:
a.shape

torch.Size([2, 3, 3])

In [49]:
a

tensor([[[1, 1, 1],
         [1, 1, 1],
         [1, 1, 1]],

        [[2, 2, 2],
         [2, 2, 2],
         [2, 2, 2]]])

In [50]:
b = a.reshape (3,3,2)
a = a.permute (1,2,0)
b, a

(tensor([[[1, 1],
          [1, 1],
          [1, 1]],
 
         [[1, 1],
          [1, 2],
          [2, 2]],
 
         [[2, 2],
          [2, 2],
          [2, 2]]]),
 tensor([[[1, 2],
          [1, 2],
          [1, 2]],
 
         [[1, 2],
          [1, 2],
          [1, 2]],
 
         [[1, 2],
          [1, 2],
          [1, 2]]]))

#### Comparisons between implementations from scratch

In [51]:
cmp ("group_norm", answer, torch_answer)
cmp ("conv2d", myConv2D, torchConv2D)
cmp ('Fused vs Individual Attention',out1, out2)

group_norm      | exact: False | approximate: True  | maxdiff: 2.86102294921875e-06
conv2d          | exact: False | approximate: False | maxdiff: 2.384185791015625e-06
Fused vs Individual Attention | exact: True  | approximate: True  | maxdiff: 0.0
