In [18]:
# A tensor is a multidimensional array.
import torch
# Sample a rank-3 tensor of shape (2, 3, 4) from the standard normal distribution.
x = torch.randn((2, 3, 4))
print(x)  # show the sampled values

tensor([[[-6.7551e-01,  1.0748e+00,  1.3845e+00,  6.5353e-01],
         [-6.3221e-01,  7.9054e-01, -9.1392e-01,  7.0415e-01],
         [ 1.3968e+00, -4.4675e-01,  2.2130e+00, -8.7841e-01]],

        [[ 1.0176e-01,  8.5011e-02,  1.2939e-04,  5.2281e-01],
         [-7.2002e-01,  3.4378e-01, -9.3164e-01,  1.1351e+00],
         [-8.5246e-01, -1.7654e+00, -7.4132e-01, -1.9917e+00]]])


#### CUDA vs MPS

1. torch.cuda.is_available() checks if a CUDA-capable NVIDIA GPU is available and if PyTorch can use it. CUDA is NVIDIA’s technology for running computations on their GPUs. This is used on most Windows and Linux systems with NVIDIA GPUs.

2. torch.mps.is_available() checks if Apple’s Metal Performance Shaders (MPS) backend is available; torch.backends.mps.is_available() is the equivalent check documented in PyTorch’s MPS backend notes. MPS is Apple’s technology for running computations on Apple Silicon (M1, M2, M3 and later chips) and some Intel Macs with supported GPUs. CUDA does not work on Apple Silicon; MPS is used instead.

In [22]:
# Probe each accelerator backend and report which ones this machine offers.
# `device` starts out as the CPU so it is always defined after this cell runs,
# even on a machine that has neither MPS nor CUDA.
device = torch.device("cpu")

# torch.backends.mps.is_available() is the documented MPS probe; older PyTorch
# versions may not provide the torch.mps.is_available() shortcut.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    x = torch.randn(3, 3).to(device)
    print("Running on MPS:", x.device)
else:
    print("MPS not available")

# CUDA is checked independently so both backends are always reported.
if torch.cuda.is_available():
    device = torch.device("cuda")
    x = torch.randn(3, 3).to(device)
    print("Running on CUDA:", x.device)
else:
    print("CUDA not available")

Running on MPS: mps:0
CUDA not available


In [10]:
import torch.nn as nn

class MyModel(nn.Module):
    """Minimal module holding a single linear layer (4 inputs -> 2 outputs).

    No ``forward`` is defined; the class exists so its parameters can be inspected.
    """

    def __init__(self):
        super().__init__()
        # Assigning the layer as an attribute registers its weight and bias
        # with this module.
        self.linear = nn.Linear(in_features=4, out_features=2)

# The linear layer inside MyModel creates two tensors: one for the weights
# and one for the biases.
# They are special because PyTorch marks them as values it should change
# during training.
# model.parameters() yields exactly these tensors.
model = MyModel()
for param in model.parameters():
    # Shape first, then the parameter itself, each on its own line.
    print(param.shape, param, sep="\n")

torch.Size([2, 4])
Parameter containing:
tensor([[-0.2013,  0.1039,  0.1993,  0.4580],
        [ 0.1079, -0.2671,  0.1110, -0.2548]], requires_grad=True)
torch.Size([2])
Parameter containing:
tensor([-0.0651, -0.4072], requires_grad=True)


In [None]:
import torch
import torch.nn as nn

# Wrap a random 2x2 tensor in nn.Parameter and confirm its type.
w = nn.Parameter(torch.randn((2, 2)))
print(isinstance(w, nn.Parameter))

# nn.Parameter is a subclass of torch.Tensor, so it behaves like a tensor.
# What makes it special: when one is assigned as an attribute of an nn.Module,
# it is automatically registered as a parameter of that module.
# It is used to define the learnable parameters of a neural network.

# An nn.Parameter stored as a module attribute shows up in .parameters().
# Note: this reuses the name MyModel, so it replaces the earlier class of the
# same name once this cell runs.
class MyModel(nn.Module):
    """Module whose only state is one directly declared 2x2 parameter."""

    def __init__(self):
        super().__init__()
        initial_values = torch.randn(2, 2)
        self.my_weight = nn.Parameter(initial_values)

# Instantiate the module and print every parameter it registered.
model = MyModel()
print([*model.parameters()])


True
[Parameter containing:
tensor([[-0.6255, -0.5323],
        [-1.7597,  0.5376]], requires_grad=True)]


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F

In [None]:
# Toy scaled dot-product attention over a short tokenized sentence.
tokens = ["The", " ", "cat", " ", "sat", " ", "on", " ", "the", " ", "mat", "."]
n_tokens = len(tokens)
d_k = 6

# Q, K and V are each (n_tokens x d_k), filled with samples from the standard
# normal distribution (mean=0, std=1). They are drawn in the order Q, K, V.
Q, K, V = (torch.randn(n_tokens, d_k) for _ in range(3))

# Similarity of every query with every key:
# (n_tokens x d_k) @ (d_k x n_tokens) = (n_tokens x n_tokens)
scores = torch.matmul(Q, K.t())

# Dot products can become large, so divide by sqrt(d_k). This keeps their
# variance more consistent and prevents softmax from saturating.
# The shape stays (n_tokens x n_tokens).
scaled_score = torch.div(scores, d_k ** 0.5)

# Softmax over the last dimension, which corresponds to the keys: for each
# query, the scores across all keys become a probability distribution
# (every row sums to 1).
attn_weights = torch.softmax(scaled_score, dim=-1)

# (n_tokens x n_tokens) @ (n_tokens x d_k) = (n_tokens x d_k)
# Each output row is a weighted sum of the value rows, weighted by that
# query's attention weights.
output_original = torch.matmul(attn_weights, V)

output_original

tensor([[-0.9442,  0.0367,  0.3134, -0.7202, -0.3174,  0.3125],
        [ 0.0459,  0.2607,  0.1560, -0.6120,  1.2428,  0.6578],
        [-0.8398, -0.1090,  0.1608, -0.4982, -0.4739,  0.0708],
        [-0.3803, -0.1351,  0.0249, -0.4601,  0.1185,  0.0723],
        [-0.4724,  0.0727,  0.0468, -0.5159,  0.2836,  0.1467],
        [-0.7485, -0.2667,  0.3595, -0.4836,  0.0442,  0.1493],
        [-0.3828, -0.1605,  0.1255, -0.2986,  0.1661,  0.0273],
        [-0.4709, -0.2345,  0.1679, -0.1312,  0.1520,  0.0548],
        [-0.5072, -0.2233,  0.1079, -0.4745,  0.1901, -0.1194],
        [-0.9217, -0.1705,  0.3464, -0.5381, -0.2762,  0.1383],
        [-1.0513, -0.1603,  0.4750, -0.6926, -0.2177,  0.2249],
        [-1.0403, -0.1923,  0.2474, -0.8217, -0.5872,  0.1576]])