#### Tensors & Manipulations

In [18]:
# A tensor is a multidimensional array (vectors and matrices are the 1-D and 2-D cases).
import torch

# Draw a 3-D tensor of shape (2, 3, 4) from the standard normal distribution.
x = torch.randn((2, 3, 4))
print(x)  # show the sampled values

tensor([[[-6.7551e-01,  1.0748e+00,  1.3845e+00,  6.5353e-01],
         [-6.3221e-01,  7.9054e-01, -9.1392e-01,  7.0415e-01],
         [ 1.3968e+00, -4.4675e-01,  2.2130e+00, -8.7841e-01]],

        [[ 1.0176e-01,  8.5011e-02,  1.2939e-04,  5.2281e-01],
         [-7.2002e-01,  3.4378e-01, -9.3164e-01,  1.1351e+00],
         [-8.5246e-01, -1.7654e+00, -7.4132e-01, -1.9917e+00]]])


##### Why do we manipulate tensor dimensions?

1. Batching - Models process multiple samples at once, so inputs carry a batch dimension.
2. Layer requirements - Layers expect their inputs in specific shapes.
3. Multi-head attention - Requires splitting and merging dimensions across heads.
4. Broadcasting - Element-wise operations such as addition/multiplication require compatible (broadcastable) shapes.

In [None]:
# Draw a 4-D tensor of shape (2, 3, 4, 5) from the standard normal distribution.
# Reading the shape from the outermost dimension inwards:
#   2 blocks, each holding 3 matrices of 4 rows x 5 columns.
# Equivalently: a 2 x 3 x 4 grid in which every entry is a vector of length 5.
x = torch.randn((2, 3, 4, 5))
print(x)  # show the sampled values

tensor([[[[ 1.9935, -1.0592,  0.2853,  1.8029, -1.4281],
          [-0.7983, -0.0774,  1.6071,  1.1049, -0.9329],
          [-1.0563,  0.6652, -0.1686,  0.3557,  1.3713],
          [ 0.8901, -0.2760,  0.2838, -1.6845, -0.3049]],

         [[-0.0922,  0.8959, -0.5861,  0.1803, -0.7828],
          [ 0.4372, -1.0142,  0.6472, -0.7854,  1.7937],
          [ 0.0085,  0.8860, -1.4004,  0.2979,  1.9493],
          [ 0.0380,  1.3593, -0.9947,  0.7738, -0.0056]],

         [[ 1.7288, -0.4158,  0.0079,  1.6452, -1.0752],
          [ 0.6869, -1.5170, -0.2340,  1.2625,  0.5970],
          [ 1.6960,  0.7483,  0.6303, -1.0809, -1.5471],
          [ 0.9458,  1.5987,  2.5899, -0.9784, -0.1785]]],


        [[[-0.0931,  1.5353, -1.3917,  0.7475, -0.4839],
          [ 0.3544,  0.4690,  0.8517, -0.5913, -0.5471],
          [-1.3679, -0.8743,  0.6295,  0.4105,  1.5556],
          [-0.1071,  1.1296,  1.4751, -0.1668, -0.2995]],

         [[-0.2317,  0.4635,  1.0820,  0.8397,  0.0129],
          [ 0.4140, -

#### CUDA vs MPS

1. torch.cuda.is_available() checks if a CUDA-capable NVIDIA GPU is available and if PyTorch can use it. CUDA is NVIDIA’s technology for running computations on their GPUs. This is used on most Windows and Linux systems with NVIDIA GPUs.

2. torch.mps.is_available() checks if Apple’s Metal Performance Shaders (MPS) backend is available; torch.backends.mps.is_available() performs the same check and also works on older PyTorch releases. MPS is Apple’s technology for running computations on Apple Silicon (M-series chips such as M1, M2, M3) and some Intel Macs with supported GPUs. CUDA does not work on Apple Silicon; MPS is used instead.

In [22]:
# Probe each GPU backend in turn; when one is present, create a small random
# tensor on it and report where the tensor lives.
#
# torch.backends.mps.is_available() is the long-standing, documented MPS check.
# torch.mps.is_available() is only present in recent PyTorch releases, so
# calling it on an older install fails with AttributeError.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    # Allocate directly on the device instead of creating on CPU and copying.
    x = torch.randn(3, 3, device=device)
    print("Running on MPS:", x.device)
else:
    print("MPS not available")

if torch.cuda.is_available():
    device = torch.device("cuda")
    # Allocate directly on the device instead of creating on CPU and copying.
    x = torch.randn(3, 3, device=device)
    print("Running on CUDA:", x.device)
else:
    print("CUDA not available")

Running on MPS: mps:0
CUDA not available


#### PyTorch Parameters (`.parameters()`)

In [10]:
import torch.nn as nn

class MyModel(nn.Module):
    """Minimal module that owns a single linear layer (4 inputs -> 2 outputs)."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(in_features=4, out_features=2)


# The linear layer owns two tensors: a weight of shape (2, 4) and a bias of shape (2,).
# Both are nn.Parameter objects with requires_grad=True, i.e. the values that
# get updated during training.
# model.parameters() iterates over these tensors.
model = MyModel()
for param in model.parameters():
    print(param.shape, param, sep="\n")

torch.Size([2, 4])
Parameter containing:
tensor([[-0.2013,  0.1039,  0.1993,  0.4580],
        [ 0.1079, -0.2671,  0.1110, -0.2548]], requires_grad=True)
torch.Size([2])
Parameter containing:
tensor([-0.0651, -0.4072], requires_grad=True)


In [None]:
import torch
import torch.nn as nn

# nn.Parameter is a subclass of torch.Tensor, so it behaves like any other tensor.
# What makes it special: when a Parameter is assigned as an attribute of an
# nn.Module, the module registers it as one of its learnable parameters.
w = nn.Parameter(data=torch.randn(2, 2))
print(isinstance(w, nn.Parameter))


# NOTE: this reuses the name MyModel and therefore replaces any MyModel class
# defined earlier in the notebook.
class MyModel(nn.Module):
    """Module whose only learnable state is one hand-made 2x2 parameter."""

    def __init__(self):
        super().__init__()
        # Assigning the Parameter as an attribute is what makes it show up
        # in .parameters().
        self.my_weight = nn.Parameter(data=torch.randn(2, 2))


model = MyModel()
print([*model.parameters()])


#### Self-Attention Layer (Transformers)

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F

tokens = ["The", " ", "cat", " ", "sat", " ", "on", " ", "the", " ", "mat", "."]
n_tokens = len(tokens)
d_k = 6

# Queries, keys and values: one d_k-dimensional row per token, each matrix drawn
# from the standard normal distribution (mean=0, std=1). Shape: n_tokens x d_k.
Q, K, V = (torch.randn(n_tokens, d_k) for _ in range(3))

# Dot product of every query with every key:
# (n_tokens x d_k) @ (d_k x n_tokens) = (n_tokens x n_tokens)
scores = torch.matmul(Q, K.t())

# Dot products grow with d_k, so divide by sqrt(d_k). This keeps their variance
# roughly constant and stops softmax from saturating.
# Shape is unchanged: (n_tokens x n_tokens).
scaled_score = scores / (d_k ** 0.5)

# Softmax over the last dimension, which indexes the keys: for each query (row),
# the scores across all keys become a probability distribution.
attn_weights = torch.softmax(scaled_score, dim=-1)

# Each output row is the attention-weighted sum of the value rows:
# (n_tokens x n_tokens) @ (n_tokens x d_k) = (n_tokens x d_k)
output_original = torch.matmul(attn_weights, V)

output_original

tensor([[ 0.3341, -0.5154, -1.2380, -0.2892, -0.4579, -0.2457],
        [-0.6077, -0.0793,  1.2263,  0.4887, -0.1040, -0.6966],
        [-0.2376,  0.6978, -0.2318, -0.5215,  0.0550, -0.2912],
        [-0.1975,  0.7894, -0.4018, -0.5038,  0.0247, -0.5158],
        [ 0.0433,  0.2650, -0.5138, -0.3914, -0.2075, -0.0951],
        [-0.3837,  0.6075,  0.4693, -0.1916, -0.1801, -0.0152],
        [ 0.2334, -0.6505, -1.1035, -0.1337, -0.4387, -0.4416],
        [-0.1326,  0.3576, -0.4958, -0.5872, -0.0889, -0.1419],
        [ 0.3356, -0.6253, -1.3418, -0.2246, -0.4712, -0.3358],
        [-0.1198,  0.5244, -0.3412, -0.5332, -0.0867, -0.0930],
        [-0.0488,  0.1532, -0.6066, -0.5057, -0.1759, -0.2830],
        [-0.1535,  0.2497, -0.4927, -0.4406, -0.1209, -0.2047]])

#### PyTorch Modules & Containers

In [None]:
import torch
import torch.nn as nn

class MyModule(nn.Module):
    """Stack of ``num_layers`` linear layers applied one after another.

    Args:
        num_layers: Number of linear layers in the stack. With 0 layers the
            module returns its input unchanged.
        input_dim: Feature size consumed by the first layer.
        output_dim: Feature size produced by every layer.
    """

    def __init__(self, num_layers, input_dim, output_dim):
        super().__init__()
        # Only the first layer sees input_dim features; every later layer
        # consumes the output_dim features produced by the layer before it.
        # Sizing all layers as (input_dim -> output_dim) would make forward()
        # fail with a shape mismatch whenever num_layers > 1 and
        # input_dim != output_dim.
        in_dims = [input_dim if i == 0 else output_dim for i in range(num_layers)]
        # nn.ModuleList holds the layers and registers each one as a submodule.
        self.layers = nn.ModuleList(
            [nn.Linear(in_dim, output_dim) for in_dim in in_dims]
        )

    def forward(self, x):
        """Pass ``x`` through each layer in order and return the final result."""
        for layer in self.layers:
            x = layer(x)
        return x

# ModuleList registers each layer as a submodule,
# so their parameters are included in model.parameters()

In [2]:
# nn.Sequential is an nn.Module subclass that defines a model as an ordered
# stack of layers: the output of each layer is fed to the next one.
model = nn.Sequential(
    nn.Linear(in_features=10, out_features=20),
    nn.ReLU(),
    nn.Linear(in_features=20, out_features=30),
)

print(model)

Sequential(
  (0): Linear(in_features=10, out_features=20, bias=True)
  (1): ReLU()
  (2): Linear(in_features=20, out_features=30, bias=True)
)


In [3]:
class DictModel(nn.Module):
    """Two linear layers (10 -> 20 -> 5) stored by name in an nn.ModuleDict."""

    def __init__(self):
        super().__init__()
        # nn.ModuleDict keeps the layers under string keys, in insertion order.
        self.layers = nn.ModuleDict(
            dict(
                fc1=nn.Linear(10, 20),
                fc2=nn.Linear(20, 5),
            )
        )

    def forward(self, x):
        """Apply ``fc1`` and then ``fc2`` to ``x``."""
        hidden = self.layers["fc1"](x)
        return self.layers["fc2"](hidden)


# ModuleDict is handy when layers should be looked up by name.
model = DictModel()
print(model)

DictModel(
  (layers): ModuleDict(
    (fc1): Linear(in_features=10, out_features=20, bias=True)
    (fc2): Linear(in_features=20, out_features=5, bias=True)
  )
)
