In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

KeyboardInterrupt: 

In [None]:
# Load the training corpus: one name per line. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open('names.txt') as f:
    words = f.read().splitlines()

In [None]:
# Character vocabulary and the two lookup tables (char -> index, index -> char).
# Index 0 is reserved for '.', which marks both the start and the end of a word.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

In [None]:
# Build a small demo dataset from the first three words, printing every
# (context ---> target) pair so the sliding window is visible.

block_size = 3 # context length: number of preceding characters used to predict the next one
X, Y = [], []
for w in words[:3]:
  print(w)
  # Left-pad with block_size '.' tokens (index 0), then slide a window over the encoding.
  encoded = [0] * block_size + [stoi[ch] for ch in w + '.']
  for start in range(len(encoded) - block_size):
    context = encoded[start:start + block_size]
    ix = encoded[start + block_size]
    X.append(context)
    Y.append(ix)
    print(''.join(itos[i] for i in context), '--->', itos[ix])

X = torch.tensor(X)
Y = torch.tensor(Y)

In [None]:
# Sanity check: X holds one row of block_size context indices per example, Y one target index per example
X.shape, Y.shape

In [None]:
C = torch.randn(27, 2) # embedding matrix: one 2-dimensional row for each of the 27 vocabulary indices



In [None]:
C[5] # row 5 of C: the embedding of vocabulary index 5 (presumably 'e' — confirm against the printed itos)

In [None]:
# Same lookup written as a matrix product: a one-hot row vector (1 at position 5,
# 0 elsewhere) times C selects row 5 of C. one_hot returns an integer tensor, so
# it is cast to float before multiplying with the float matrix C.
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

In [None]:
# The two cells above give the same result: multiplying the one-hot vector for index 5 ('e') by the embedding matrix C keeps only row 5 of C, because every other entry of the one-hot vector is zero.


In [None]:
C[[5, 6, 7]].shape # indexing with a Python list gathers rows 5, 6 and 7 at once -> shape (3, 2)

In [None]:
C[torch.tensor([5, 6, 7,7,7])] # a tensor of indices works too and may repeat: rows 5, 6, 7, 7, 7 -> five rows

In [None]:
C[X].shape ,C[X][1] # indexing with the 2-D index tensor X looks up one embedding per entry; C[X][1] holds the embeddings of example 1's context

In [None]:
# Tensor	Shape	Meaning
# C	(vocab_size, embedding_dim)	Embedding matrix
# X	(batch_size, sequence_len)	Batch of token indices
# C[X]	(batch_size, sequence_len, embedding_dim)	Embeddings for each token

In [None]:
X[13][2]  , C[X][13][2] , C[1]   # example 13, context position 2 (0-based): C[X][13][2] is C[X[13][2]]; when that index is 1 it equals C[1], the row at index 1 of C

In [None]:
emb = C[X] # embedding of every context index in X -> shape (num_examples, block_size, 2)


In [None]:
# First (hidden) layer: 3 context positions x 2 embedding dims = 6 inputs, 100 hidden units.
W1, b1 = torch.randn(3 * 2, 100), torch.randn(100)


In [None]:
torch.cat([emb[:,0,: ] , emb[:,1,: ], emb[:,2,: ]] ,dim=1).shape # take each of the 3 context positions (each (N, 2)) and join them along the feature axis -> (N, 6)

### Example: Using `torch.unbind`

The function `torch.unbind(tensor, dim)` removes a given dimension and returns a tuple of slices along that dimension.

For example, given the tensor `emb` of shape `(16, 3, 2)`:

```python
emb.shape
# Output: torch.Size([16, 3, 2])
```

Unbinding along `dim=1`:

```python
emb0, emb1, emb2 = torch.unbind(emb, dim=1)
print(emb0.shape, emb1.shape, emb2.shape)
# Output: torch.Size([16, 2]) torch.Size([16, 2]) torch.Size([16, 2])
```

This splits `emb` into three tensors, each corresponding to one position in the context window.

In [None]:
torch.unbind(emb,dim=1)# tuple of the slices of emb along dim 1, one tensor per context position


In [None]:
torch.cat(torch.unbind(emb,dim=1), dim=1).shape # same concatenation as slicing each position by hand, without hard-coding the number of positions

In [None]:
a = torch.arange(18) # toy 1-D tensor 0..17 used to demonstrate .view

In [None]:
# The same 18 elements viewed as a 3x6 and as a 2x9 tensor
a.view(3, 6) , a.view (2,9)

`emb.view(-1, 6) @ W1` has shape 16 x 100.
`b1` has shape 100, which is treated as 1 x 100: it is broadcast across the 16 rows and added to each of them.

In [None]:
# Flatten each example's 3 two-dimensional embeddings into one row of 6 features, then apply the first layer
h = emb.view(-1,6)@ W1 + b1 # b1 (100 values) is broadcast across the rows
h.shape # one row of 100 hidden pre-activations per example

In [None]:
h = torch.tanh(h) # squash the hidden pre-activations into (-1, 1); this overwrites h, so re-running only this cell applies tanh a second time

In [None]:
# Output layer: map the 100 hidden units to one logit per vocabulary index (27).
W2, b2 = torch.randn(100, 27), torch.randn(27)
logits = h.matmul(W2) + b2
logits.shape # one row of 27 logits per example

In [None]:
# Softmax by hand: exponentiate the logits, then normalise each row so it sums to 1.
counts = torch.exp(logits)
prob = counts / counts.sum(dim=1, keepdim=True)
prob.shape , prob[0].sum() # the second value should be 1 (up to rounding)


In [None]:
Y # target index (the next character) for every context row in X

In [None]:
prob[torch.arange(len(Y)), Y] # for each row i, pick the probability the model assigned to its true target Y[i]

In [None]:
loss = -prob[torch.arange(len(Y)), Y].log().mean() # average negative log-likelihood of the true targets

In [None]:
loss # mean negative log-likelihood over the examples

More respectable

In [None]:
X.shape, Y.shape # shapes of the context matrix X and the target vector Y

In [None]:
# Seeded (re)initialisation of every model parameter so runs are reproducible.
# All tensors are drawn from the same generator, so the draw order below
# (C, W1, b1, W2, b2) determines the values each one receives.
g = torch.Generator().manual_seed(2147483647)
C, W1, b1, W2, b2 = (
    torch.randn(shape, generator=g)
    for shape in [
        (27, 2),      # C:  embedding matrix, one 2-d row per vocabulary index
        (2 * 3, 100), # W1: 3 context positions x 2 embedding dims -> 100 hidden units
        (100,),       # b1: hidden-layer bias
        (100, 27),    # W2: hidden units -> one logit per vocabulary index
        (27,),        # b2: output-layer bias
    ]
)
parameters = [C, W1, b1, W2, b2] # everything the optimiser will update

In [None]:
sum(p.nelement() for p in parameters) # total number of scalar values across all parameter tensors

In [None]:
# Full forward pass with a hand-written softmax and negative log-likelihood.
emb = C[X] # (num_examples, 3, 2) embeddings of every context index
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # hidden layer on the flattened 6-feature rows
logits = h @ W2 + b2 # one row of 27 logits per example
counts = torch.exp(logits) # unnormalised "counts"
prob = counts / counts.sum(dim=1, keepdim=True) # each row normalised to sum to 1
loss = -prob[torch.arange(len(Y)), Y].log().mean() # mean negative log-likelihood of the targets
loss

In [None]:
F.cross_entropy(logits, Y) # built-in equivalent of the manual exp / normalise / -log / mean steps, computed directly from the logits

In [None]:



# Build the dataset from every word: each example is a window of block_size
# character indices (X) and the index of the character that follows it (Y).

block_size = 3 # context length: number of preceding characters used to predict the next one
X, Y = [], []
for w in words:
  # Left-pad with block_size '.' tokens (index 0), then slide a window over the encoding.
  encoded = [0] * block_size + [stoi[ch] for ch in w + '.']
  for start in range(len(encoded) - block_size):
    context = encoded[start:start + block_size]
    ix = encoded[start + block_size]
    X.append(context)
    Y.append(ix)

X = torch.tensor(X)
Y = torch.tensor(Y)

In [None]:
# Seeded (re)initialisation of every model parameter so runs are reproducible.
# All tensors are drawn from the same generator, so the draw order below
# (C, W1, b1, W2, b2) determines the values each one receives.
g = torch.Generator().manual_seed(2147483647)
C, W1, b1, W2, b2 = (
    torch.randn(shape, generator=g)
    for shape in [
        (27, 2),      # C:  embedding matrix, one 2-d row per vocabulary index
        (2 * 3, 100), # W1: 3 context positions x 2 embedding dims -> 100 hidden units
        (100,),       # b1: hidden-layer bias
        (100, 27),    # W2: hidden units -> one logit per vocabulary index
        (27,),        # b2: output-layer bias
    ]
)
parameters = [C, W1, b1, W2, b2] # everything the optimiser will update

In [None]:
# Turn on gradient tracking for every parameter so loss.backward() fills in .grad
for p in parameters:
    p.requires_grad_(True)
    

In [None]:
# Full-batch gradient descent: every step uses the entire dataset (X, Y).
for _ in range(1): # number of optimisation steps
    # forward pass
    emb = C[X] # (num_examples, block_size, emb_dim)
    # Flatten per example by row count instead of hard-coding the width (6),
    # so the cell keeps working if block_size or the embedding size changes.
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(loss.item())

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # SGD update with learning rate 0.1. Updating in place under no_grad keeps
    # the step out of the autograd graph without reaching around autograd via .data.
    with torch.no_grad():
        for p in parameters:
            p -= 0.1 * p.grad

In [None]:
torch.randint(0,X.shape[0], (32,)) # 32 random row indices in [0, number of examples), i.e. one minibatch's worth

In [None]:



# Build the dataset from every word: each example is a window of block_size
# character indices (X) and the index of the character that follows it (Y).

block_size = 3 # context length: number of preceding characters used to predict the next one
X, Y = [], []
for w in words:
  # Left-pad with block_size '.' tokens (index 0), then slide a window over the encoding.
  encoded = [0] * block_size + [stoi[ch] for ch in w + '.']
  for start in range(len(encoded) - block_size):
    context = encoded[start:start + block_size]
    ix = encoded[start + block_size]
    X.append(context)
    Y.append(ix)

X = torch.tensor(X)
Y = torch.tensor(Y)

In [None]:
# Seeded (re)initialisation of every model parameter so runs are reproducible.
# All tensors are drawn from the same generator, so the draw order below
# (C, W1, b1, W2, b2) determines the values each one receives.
g = torch.Generator().manual_seed(2147483647)
C, W1, b1, W2, b2 = (
    torch.randn(shape, generator=g)
    for shape in [
        (27, 2),      # C:  embedding matrix, one 2-d row per vocabulary index
        (2 * 3, 100), # W1: 3 context positions x 2 embedding dims -> 100 hidden units
        (100,),       # b1: hidden-layer bias
        (100, 27),    # W2: hidden units -> one logit per vocabulary index
        (27,),        # b2: output-layer bias
    ]
)
parameters = [C, W1, b1, W2, b2] # everything the optimiser will update

In [None]:
# Turn on gradient tracking for every parameter so loss.backward() fills in .grad
for p in parameters:
    p.requires_grad_(True)
    

In [None]:
# Candidate learning rates for the sweep, evenly spaced on a log scale
lre = torch.linspace(-3, 0, 1000) # 1000 exponents from -3 to 0
lrs = 10 ** lre # the corresponding learning rates, from 0.001 up to 1

In [None]:
# Learning-rate sweep: take one minibatch SGD step per candidate rate in `lrs`
# (smallest to largest) and record the loss seen at each rate, to find the
# range in which the loss drops fastest before it becomes unstable.
lri = []   # learning rate used at each step (plain floats, for plotting)
lossi = [] # minibatch loss at each step
for i in range(len(lrs)): # visit every candidate; a shorter loop only explores the low end of the range
    # minibatch of 32 randomly chosen examples
    ix = torch.randint(0, X.shape[0], (32,))

    # forward pass on the minibatch
    emb = C[X[ix]] # (32, block_size, emb_dim)
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # SGD update using this step's candidate learning rate. The in-place update
    # under no_grad keeps the step out of the autograd graph without using .data.
    lr = lrs[i].item()
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    # track (per-step losses are kept here rather than printed one line per step)
    lri.append(lr)
    lossi.append(loss.item())

# The candidates are log-spaced, so a log x-axis spreads them evenly.
plt.plot(lri, lossi)
plt.xscale('log')
plt.xlabel('learning rate')
plt.ylabel('minibatch loss')
print(loss.item()) # loss of the final minibatch

In [None]:
# Loss over the whole dataset (not a minibatch), using the current parameters.
emb = C[X] # (num_examples, 3, 2)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # hidden layer on the flattened 6-feature rows
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
print(loss.item())

In [None]:
#bui