#### Tensors & Manipulations

In [18]:
# A tensor is a multidimensional array
import torch
# Draw a 2 x 3 x 4 tensor whose entries come from the standard normal distribution.
x = torch.randn((2, 3, 4))
# Display the sampled values.
print(x)

tensor([[[-6.7551e-01,  1.0748e+00,  1.3845e+00,  6.5353e-01],
         [-6.3221e-01,  7.9054e-01, -9.1392e-01,  7.0415e-01],
         [ 1.3968e+00, -4.4675e-01,  2.2130e+00, -8.7841e-01]],

        [[ 1.0176e-01,  8.5011e-02,  1.2939e-04,  5.2281e-01],
         [-7.2002e-01,  3.4378e-01, -9.3164e-01,  1.1351e+00],
         [-8.5246e-01, -1.7654e+00, -7.4132e-01, -1.9917e+00]]])


##### Why do we manipulate tensor dimensions?

1. Batching - Models process multiple samples at once (batch dimension).
2. Layer requirements - Layers expect their inputs in specific shapes.
3. Multi-head attention - Requires splitting and merging dimensions for the heads.
4. Broadcasting - Operations like addition/multiplication may require compatible shapes.

In [None]:
x = torch.randn(2, 3, 4)  # random tensor with shape (2, 3, 4)
# Read the shape (2, 3, 4) as [batch, sequence, feature]:
# 2 samples, each a sequence of 3 steps, each step a vector of 4 features.
print(x.shape)  # torch.Size([2, 3, 4])

# transpose(0, 1) swaps the first two dimensions, i.e. it switches between the
# [batch, sequence, feature] and [sequence, batch, feature] layouts.
y = x.transpose(0, 1)
print(y.shape)  # torch.Size([3, 2, 4])

# permute(1, 0, 2) gives the new order of all dimensions at once; here it swaps
# the first two back, returning to [batch, sequence, feature].
y = y.permute(1, 0, 2)
print(y.shape)  # torch.Size([2, 3, 4])

# The round trip must restore the original layout and values.
assert y.shape == x.shape
assert torch.equal(y, x)

torch.Size([2, 3, 4])
torch.Size([3, 2, 4])
torch.Size([2, 3, 4])


In [None]:
# Reshaping with view
# view() returns a tensor with a new shape over the same underlying data.
# Typical uses: flattening images, preparing batches.

x = torch.arange(6)          # shape [6]
x_reshaped = x.view((2, 3))  # shape [2, 3]

# Passing -1 for one dimension lets the library infer it, e.g. x.view(-1, 3).

# 1. Flattening for fully connected layers
# A linear layer expects [batch, features], not [batch, channels, height, width].
x = torch.randn((32, 3, 28, 28))  # [batch, channels, height, width]
x_flat = x.view(x.size(0), -1)    # [batch, features] = [32, 3 * 28 * 28]

# 2. Adding a batch dimension
# Needed when you hold a single sample but the model expects a batch.
x = torch.randn((10,))           # [features]
x_batch = torch.unsqueeze(x, 0)  # [1, features]

# 3. Preparing sequences for RNNs
# PyTorch RNNs take [seq, batch, features] input.
x = torch.randn((64, 10, 128))             # [batch, seq, features]
x_seq_first = torch.permute(x, (1, 0, 2))  # [seq, batch, features]

# 4. unsqueeze - inserts a new dimension of size 1 at the given position (axis).
# Useful for broadcasting and for matching the input shape a layer expects,
# e.g. the model wants [batch, features] but you hold a single sample.
x = torch.tensor([1, 2, 3])  # shape [3]
x1 = torch.unsqueeze(x, 0)   # shape [1, 3]
x2 = torch.unsqueeze(x, 1)   # shape [3, 1]

# 5. squeeze - removes dimensions of size 1
# (all of them by default, or only the one given), reducing the tensor's rank.
y = torch.randn(1, 3, 1, 5)  # shape [1, 3, 1, 5]
y1 = y.squeeze()   # shape [3, 5]    (removes every size-1 dimension)
y2 = y.squeeze(2)  # shape [1, 3, 5] (removes only dim 2)

In [None]:
# 6. cat - joins a sequence of tensors along a dimension that already exists
a = torch.randn((2, 3))
b = torch.randn((2, 3))

# dim=0 appends rows: [2, 3] + [2, 3] -> [4, 3]
cat0 = torch.cat((a, b), dim=0)
# dim=1 appends columns: [2, 3] + [2, 3] -> [2, 6]
cate1 = torch.cat((a, b), dim=1)

# 7. stack - joins a sequence of tensors along a brand-new dimension
# New dimension in front: result shape [2, 2, 3], stack0[i] is the i-th input.
stack0 = torch.stack((a, b), dim=0)
# New dimension at position 1: result shape [2, 2, 3], stack1[:, i] is the i-th input.
stack1 = torch.stack((a, b), dim=1)

In [None]:
# 8. split - divides a tensor into pieces of the given size(s) along a dimension
# Useful for cutting data into mini-batches or separating groups of features.
x = torch.arange(12).reshape(3, 4)  # shape [3, 4]
# The second argument is the size of each piece, not the number of pieces:
# 4 columns in pieces of width 2 -> two tensors of shape [3, 2].
splits = torch.split(x, 2, dim=1)

# 9. chunk - divides a tensor into the given number of pieces
# The pieces are equal when the dimension divides evenly; here 3 rows in
# 2 chunks give shapes [2, 4] and [1, 4].
chunks = torch.chunk(x, 2, dim=0)

In [None]:
# 10. Reduction operations (sum, mean, max, min)
# A reduction collapses one or more dimensions; `dim` names the one to collapse.
x = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
total = torch.sum(x)                    # 10. (every element)
row_sum = torch.sum(x, dim=0)           # [4., 6.]   dim 0 is collapsed
col_mean = torch.mean(x, dim=1)         # [1.5, 3.5] dim 1 is collapsed
max_val, max_idx = torch.max(x, dim=1)  # ([2., 4.], [1, 1]): max values and their indices

# Where reductions show up:
# loss calculation: loss = (pred - target).pow(2).mean()
# pooling layers: nn.MaxPool2d / F.max_pool2d
# normalization: x - x.mean(dim=0)

In [14]:
# 11. Matrix multiplication and dot product
# Matrices: the @ operator or torch.matmul. 1-D vectors: torch.dot.
# Where they appear:
#   linear layer     x @ W.T + b
#   attention score  Q @ K.T

a = torch.randn((2, 3))
b = torch.randn((3, 4))

# [2, 3] times [3, 4] gives [2, 4].
c = torch.matmul(a, b)

# Two vectors of shape [3] reduce to a single scalar: 1*4 + 2*5 + 3*6 = 32.
v1 = torch.tensor([1.0, 2.0, 3.0])
v2 = torch.tensor([4.0, 5.0, 6.0])
dot = v1.dot(v2)


#### CUDA vs MPS

1. torch.cuda.is_available() checks if a CUDA-capable NVIDIA GPU is available and if PyTorch can use it. CUDA is NVIDIA’s technology for running computations on their GPUs. This is used on most Windows and Linux systems with NVIDIA GPUs.

2. torch.mps.is_available() checks if Apple’s Metal Performance Shaders (MPS) backend is available. The same check is also exposed as torch.backends.mps.is_available(), which older PyTorch releases support as well. MPS is Apple’s technology for running computations on Apple Silicon (M1, M2, M3 chips) and some Intel Macs with supported GPUs. CUDA does not work on Apple Silicon; MPS is used instead.

In [22]:
# Apple GPUs. torch.backends.mps.is_available() is the documented MPS check;
# it is used here because not every PyTorch release provides
# torch.mps.is_available().
if torch.backends.mps.is_available():
    device = torch.device("mps")
    x = torch.randn(3, 3).to(device)  # create on CPU, then move to the GPU
    print("Running on MPS:", x.device)
else:
    print("MPS not available")

# NVIDIA GPUs. The two checks are independent, so both messages are printed.
if torch.cuda.is_available():
    device = torch.device("cuda")
    x = torch.randn(3, 3).to(device)  # create on CPU, then move to the GPU
    print("Running on CUDA:", x.device)
else:
    print("CUDA not available")

Running on MPS: mps:0
CUDA not available


#### PyTorch Parameters (`.parameters()`)

In [10]:
import torch.nn as nn

class MyModel(nn.Module):
    """Minimal module wrapping a single linear layer that maps 4 features to 2."""

    def __init__(self):
        super().__init__()
        # Storing the layer as an attribute makes its weight and bias part of
        # this module's parameters.
        self.linear = nn.Linear(in_features=4, out_features=2)

# The linear layer owns two tensors: a weight matrix and a bias vector.
# PyTorch flags both as trainable (requires_grad=True), which is what lets
# training update them; model.parameters() iterates over exactly these tensors.
model = MyModel()
for param in model.parameters():
    # Shape first, then the values, each on its own line.
    print(param.shape, param, sep="\n")

torch.Size([2, 4])
Parameter containing:
tensor([[-0.2013,  0.1039,  0.1993,  0.4580],
        [ 0.1079, -0.2671,  0.1110, -0.2548]], requires_grad=True)
torch.Size([2])
Parameter containing:
tensor([-0.0651, -0.4072], requires_grad=True)


In [None]:
import torch
import torch.nn as nn

# nn.Parameter is a subclass of torch.Tensor, so it behaves like any other
# tensor, but it marks the data as a learnable parameter of a neural network.
w = nn.Parameter(data=torch.randn((2, 2)))
print(isinstance(w, nn.Parameter))

# A Parameter stored as an attribute of an nn.Module is registered with that
# module automatically, so it shows up in .parameters().
class MyModel(nn.Module):
    """Module whose only learnable state is a hand-made 2x2 nn.Parameter.

    Note: this reuses the name ``MyModel`` from the earlier linear-layer
    example and therefore replaces that class from this cell onwards.
    """

    def __init__(self):
        super().__init__()
        initial = torch.randn(2, 2)
        # Attribute assignment of an nn.Parameter registers it with the module.
        self.my_weight = nn.Parameter(initial)

# Instantiate the module and show every parameter it has registered.
model = MyModel()
print([*model.parameters()])


#### Self-Attention Layer (Transformers)

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F

tokens = ["The", " ", "cat", " ", "sat", " ", "on", " ", "the", " ", "mat", "."]
n_tokens = len(tokens)
d_k = 6

# Random queries, keys and values drawn from the standard normal distribution
# (mean=0, std=1); each one is (n_tokens x d_k).
Q, K, V = (torch.randn(n_tokens, d_k) for _ in range(3))

# Raw similarity between every query and every key:
# (n_tokens x d_k) @ (d_k x n_tokens) -> (n_tokens x n_tokens)
scores = torch.matmul(Q, K.T)

# The dot products can become large, so they are divided by sqrt(d_k). This
# keeps their variance more consistent and prevents the softmax below from
# saturating. The shape stays (n_tokens x n_tokens).
scaled_score = scores / d_k ** 0.5

# Softmax over the last dimension, which corresponds to the keys: for each
# query (row) the scores over all keys become a probability distribution.
attn_weights = torch.softmax(scaled_score, dim=-1)

# Each output row is the attention-weighted sum of the value vectors:
# (n_tokens x n_tokens) @ (n_tokens x d_k) -> (n_tokens x d_k)
output_original = torch.matmul(attn_weights, V)

output_original

tensor([[ 0.3341, -0.5154, -1.2380, -0.2892, -0.4579, -0.2457],
        [-0.6077, -0.0793,  1.2263,  0.4887, -0.1040, -0.6966],
        [-0.2376,  0.6978, -0.2318, -0.5215,  0.0550, -0.2912],
        [-0.1975,  0.7894, -0.4018, -0.5038,  0.0247, -0.5158],
        [ 0.0433,  0.2650, -0.5138, -0.3914, -0.2075, -0.0951],
        [-0.3837,  0.6075,  0.4693, -0.1916, -0.1801, -0.0152],
        [ 0.2334, -0.6505, -1.1035, -0.1337, -0.4387, -0.4416],
        [-0.1326,  0.3576, -0.4958, -0.5872, -0.0889, -0.1419],
        [ 0.3356, -0.6253, -1.3418, -0.2246, -0.4712, -0.3358],
        [-0.1198,  0.5244, -0.3412, -0.5332, -0.0867, -0.0930],
        [-0.0488,  0.1532, -0.6066, -0.5057, -0.1759, -0.2830],
        [-0.1535,  0.2497, -0.4927, -0.4406, -0.1209, -0.2047]])

##### PyTorch Modules & Containers.

In [None]:
import torch
import torch.nn as nn

class MyModule(nn.Module):
    """Stack of linear layers held in an nn.ModuleList and applied in order.

    Args:
        num_layers: Number of linear layers in the stack.
        input_dim: Feature size expected by the first layer.
        output_dim: Feature size produced by every layer.
    """

    def __init__(self, num_layers, input_dim, output_dim):
        super().__init__()
        # Only the first layer sees input_dim; every later layer consumes the
        # output_dim features produced by the layer before it. Without this the
        # stack cannot be run when input_dim != output_dim and num_layers > 1.
        dims = [input_dim] + [output_dim] * num_layers
        # ModuleList holds the layers so that each one is a registered submodule.
        self.layers = nn.ModuleList(
            [nn.Linear(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:])]
        )

    def forward(self, x):
        """Pass ``x`` through every layer in order and return the result."""
        for layer in self.layers:
            x = layer(x)
        return x

# ModuleList registers each layer as a submodule,
# so their parameters are included in model.parameters()

In [2]:
# nn.Sequential is an nn.Module subclass for defining a model as a sequence of
# layers: the layers are stacked and applied in the order they are given.
model = nn.Sequential(
    nn.Linear(in_features=10, out_features=20),
    nn.ReLU(),
    nn.Linear(in_features=20, out_features=30),
)
print(model)

Sequential(
  (0): Linear(in_features=10, out_features=20, bias=True)
  (1): ReLU()
  (2): Linear(in_features=20, out_features=30, bias=True)
)


In [3]:
class DictModel(nn.Module):
    """Two linear layers (10 -> 20 -> 5) stored by name in an nn.ModuleDict."""

    def __init__(self):
        super().__init__()
        # Fill the named container entry by entry, fc1 first and fc2 second.
        self.layers = nn.ModuleDict()
        self.layers["fc1"] = nn.Linear(10, 20)
        self.layers["fc2"] = nn.Linear(20, 5)

    def forward(self, x):
        """Apply fc1 and then fc2 to ``x``, looking each layer up by its key."""
        hidden = self.layers["fc1"](x)
        return self.layers["fc2"](hidden)

# ModuleDict is useful when you want to access layers by name.
# Printing the model lists each ModuleDict entry under its key (fc1, fc2).
model = DictModel()
print(model)

DictModel(
  (layers): ModuleDict(
    (fc1): Linear(in_features=10, out_features=20, bias=True)
    (fc2): Linear(in_features=20, out_features=5, bias=True)
  )
)
