In [2]:
import torch

In [None]:
# Broadcasting walkthrough: each pair below is either compatible or not,
# judged by lining the shapes up from the trailing (right-most) dimension.

# Case 1 - identical shapes are always broadcastable (every rule holds trivially).
x = torch.empty((5, 7, 3))
y = torch.empty((5, 7, 3))

# Case 2 - NOT broadcastable: x is 1-D but empty (its only dimension has size 0),
# and 0 neither equals y's trailing size 2 nor is it 1.
x = torch.empty(0)
y = torch.empty((2, 2))

# Case 3 - broadcastable once the trailing dimensions are lined up:
#   x: 5 3 4 1
#   y:   3 1 1
# 1st trailing dimension: both have size 1
# 2nd trailing dimension: y has size 1
# 3rd trailing dimension: x size == y size
# 4th trailing dimension: y dimension doesn't exist
x = torch.empty((5, 3, 4, 1))
y = torch.empty((3, 1, 1))

# Case 4 - NOT broadcastable: in the 3rd trailing dimension 2 != 3
# and neither size is 1.
x = torch.empty((5, 2, 4, 1))
y = torch.empty((3, 1, 1))

In [1]:
# Side-by-side comparison of exp() and softmax() on the same logits
import torch
import torch.nn.functional as F

# Three raw scores (logits) to feed through both functions
logits = torch.tensor([1.0, 2.0, 3.0])

# exp(): element-wise exponential, nothing is rescaled afterwards
exp_values = logits.exp()
print("exp():", exp_values)
print("Sum of exp():", torch.sum(exp_values))

# softmax(): exponentiate each element, then divide by the total so the
# entries along dim 0 add up to 1
softmax_values = F.softmax(logits, dim=0)
print("softmax():", softmax_values)
print("Sum of softmax():", torch.sum(softmax_values))

# Takeaways:
# - exp() only exponentiates; the results are positive but their sum is not 1.
# - softmax() exponentiates AND normalizes; the results are positive and sum
#   to 1, so they can be read as probabilities.

exp(): tensor([ 2.7183,  7.3891, 20.0855])
Sum of exp(): tensor(30.1929)
softmax(): tensor([0.0900, 0.2447, 0.6652])
Sum of softmax(): tensor(1.)


In [None]:
# L2 regularization (weight decay) folded into a loss value.
# The penalty is a small multiple of the mean squared weight, so large weights
# make the total loss bigger. That discourages large weights, which helps
# against overfitting and nudges the model towards simpler solutions.

import torch

# W plays the role of a weight matrix that is being trained
W = torch.tensor([[1.0, -2.0], [3.0, -4.0]], requires_grad=True)

# Stand-in for the data loss produced by the model
loss = torch.tensor(2.5)

# Penalty: mean of the squared weights, scaled by the strength 0.01
l2_reg = W.pow(2).mean() * 0.01

# Objective actually optimized: data loss plus the penalty
loss_total = l2_reg + loss

print("Original loss:", loss.item())
print("L2 regularization term:", l2_reg.item())
print("Total loss with L2 regularization:", loss_total.item())

# In practice the extra term helps the model generalize and lowers the risk
# of simply memorizing the training data.

Original loss: 2.5
L2 regularization term: 0.07499999552965164
Total loss with L2 regularization: 2.575000047683716


In [None]:

# Embedding lookup: map integer character indices to their embedding vectors.
#
# C is the lookup table, X holds the indices. X must be an integer tensor whose
# values are valid row numbers of C (0..26); indexing with a float tensor such
# as the output of torch.randn raises an IndexError.
C = torch.randn((27, 2))            # (27, 2) - 27 characters, each with 2D embedding
X = torch.randint(0, 27, (32, 3))   # (N, 3)  - N=32 samples, each with 3 character indices

# Pin the first two contexts so the rows described below can be checked by hand.
# The right-hand side is a tensor, not a Python list, for the row assignment.
X[0] = torch.tensor([0, 0, 5])    # first context
X[1] = torch.tensor([0, 5, 13])   # second context
# ... the remaining 30 rows keep their random indices

# Sanity-check the shapes the rest of the cell relies on.
assert X.shape == (32, 3)
assert C.shape == (27, 2)

# Indexing C with the whole index tensor looks up every entry of X at once.
emb = C[X]

# C[X] returns:
# Row 0: [C[0], C[0], C[5]]    → shape (3, 2)
# Row 1: [C[0], C[5], C[13]]   → shape (3, 2)
# ...
# Final shape: (32, 3, 2)
assert torch.equal(emb[0], torch.stack([C[0], C[0], C[5]]))
assert torch.equal(emb[1], torch.stack([C[0], C[5], C[13]]))

# This efficiently converts all your input sequences from character indices to
# their learned vector representations in one operation.
emb.shape  # (N, 3, 2)

In [3]:
# Seeded generator so the random embedding table is reproducible across runs.
g = torch.Generator()
g.manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)

# Indexing with an index tensor picks rows 5, 6 and 7 in a single operation ...
C[torch.tensor([5, 6, 7])]

# ... which gives the same result as stacking the rows one by one:
torch.stack((C[5], C[6], C[7]))

# The result is a (3, 2) tensor laid out like this (values are illustrative):
# tensor([[ 0.234, -1.123],   # embedding for character at index 5
#         [-0.456,  0.789],   # embedding for character at index 6
#         [ 1.012, -0.334]])  # embedding for character at index 7

tensor([[-0.4713,  0.7868],
        [-0.3284, -0.4330],
        [ 1.3729,  2.9334]])

In [4]:
# 18 consecutive integers, 0 through 17, as a 1-D tensor
a = torch.arange(0, 18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [5]:
# Look at the same elements of `a` arranged as a 3 x 3 x 2 tensor
a.view((3, 3, 2))

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [6]:
# Display the storage object behind `a`; the recorded output lists the values
# 0..17 in order as a flat TypedStorage of size 18 (dtype int64, on cpu).
# NOTE(review): the stray "a.storage()" echo at the top of the recorded output
# looks like the tail of a TypedStorage deprecation warning - confirm, and
# consider a.untyped_storage() on newer PyTorch releases.
a.storage()

  a.storage()


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]