In [1]:
import torch 
import torch.nn.functional as F 
import matplotlib.pyplot as plt 
%matplotlib inline 

In [4]:
# Load the dataset: one name per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open until the
# garbage collector happens to reclaim it.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
len(words)  # total number of names read from names.txt

32033

In [16]:
# build vocabulary of characters
chars = sorted(set(''.join(words)))  # every distinct character that occurs in any name, sorted
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}  # characters get indices 1..len(chars)
stoi['.'] = 0  # index 0 is reserved for the special '.' token
itos = {idx: ch for ch, idx in stoi.items()}  # inverse mapping: index -> character
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [24]:
# build the dataset

block_size = 3  # context length: how many preceding characters are used to predict the next one
X, Y = [], []

for word in words[:5]:
    print(word)
    context = [0] * block_size  # every word starts from a window filled with index 0 ('.')
    for ch in word + '.':  # the trailing '.' makes the end of the word a prediction target too
        ix = stoi[ch]
        # record the (context, target) pair before sliding the window
        X.append(context)
        Y.append(ix)
        print(f"{''.join(itos[i] for i in context)} ------> {itos[ix]}")
        # slide the window: drop the oldest index, append the newest (builds a new list,
        # so the list already stored in X is not mutated)
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)

emma
... ------> e
..e ------> m
.em ------> m
emm ------> a
mma ------> .
olivia
... ------> o
..o ------> l
.ol ------> i
oli ------> v
liv ------> i
ivi ------> a
via ------> .
ava
... ------> a
..a ------> v
.av ------> a
ava ------> .
isabella
... ------> i
..i ------> s
.is ------> a
isa ------> b
sab ------> e
abe ------> l
bel ------> l
ell ------> a
lla ------> .
sophia
... ------> s
..s ------> o
.so ------> p
sop ------> h
oph ------> i
phi ------> a
hia ------> .


In [26]:
# Sanity check of the dataset tensors: shape and dtype of the inputs X and the targets Y.
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [31]:
C = torch.randn((27, 2)) # embedding table: 27 rows (one per character index 0..26), each a randomly initialised 2-dimensional vector

In [35]:
# 1. One way of getting an embedding vector is to pluck out the corresponding row of C by index.
C[5]

tensor([ 0.2433, -0.2403])

In [37]:
# 2. The other way: encode the index as a one-hot row vector and matrix-multiply it with C.
#    This selects the same row as C[5], but is inefficient because almost every
#    term in the product is a multiplication by zero.
xenc = F.one_hot(torch.tensor(5), num_classes=27).to(torch.float32)
torch.matmul(xenc, C)

tensor([ 0.2433, -0.2403])

In [39]:
C[X].shape # indexing with a multi-dimensional integer tensor looks up every entry at once --> num of examples x num of chars x embed dim

torch.Size([32, 3, 2])

In [45]:
# Check: looking up all of X and then picking element [13, 2] gives the same
# vector as looking up the single index X[13, 2] directly.
C[X][13,2] == C[X[13,2]]

tensor([True, True])

In [69]:
emb = C[X]  # embedding of every character in every context window
print(emb.shape)
# Two ways to flatten the per-character embeddings into one row per example:
# (32 x 3 x 2) --> unbind(dim=1) --> three (32 x 2) tensors --> cat along dim 1 --> (32 x 6)
# and view(-1, 6); the comparison below confirms they hold identical values.
print(torch.eq(emb.view(-1, 6), torch.cat(torch.unbind(emb, dim=1), dim=1)).all())
# view() is the more efficient option: it does not have to create, copy or move any data in memory.
print(emb.view(-1, 6).shape)

torch.Size([32, 3, 2])
tensor(True)
torch.Size([32, 6])


In [70]:
# Parameters of the first layer, randomly initialised.
W1 = torch.randn((6, 100))  # weights: 6 input features -> 100 units
b1 = torch.randn((100,))    # one bias per unit

In [74]:
(emb.view(-1,6) @ W1 + b1).shape
# How does a (32 x 100) matrix add to b1, which has shape (100,)?
# 32, 100
#  1, 100 <---- broadcasting rule: shapes are aligned starting from the right-most dimension.
#               The missing leading dimension of b1 is treated as size 1, and b1 is then
#               (virtually) repeated along it (1 --> 32), so the same bias is added to every row.

torch.Size([32, 100])

In [73]:
# Activations of the first layer: affine transform of the flattened embeddings, squashed by tanh.
h = (emb.view(-1, 6) @ W1 + b1).tanh()
h.shape

torch.Size([32, 100])

In [75]:
# Parameters of the second layer, randomly initialised.
W2 = torch.randn((100, 27))  # weights: 100 input features -> 27 outputs
b2 = torch.randn((27,))      # one bias per output

In [79]:
# Second layer followed by a hand-written softmax over the 27 outputs of each row.
logits = h @ W2 + b2
print(logits.shape)
# Shift every row so that its largest logit becomes 0 before exponentiating.
# Softmax is invariant to adding a per-row constant, so probs is mathematically
# unchanged, but exp() can no longer overflow to inf (which would turn the
# normalisation below into nan).
# Note: counts is consequently scaled by exp(-row_max) relative to a plain logits.exp().
counts = (logits - logits.max(1, keepdim=True).values).exp()
probs = counts / counts.sum(1, keepdim=True)  # normalise each row into a probability distribution
print(probs.sum(1))  # every row should sum to 1

torch.Size([32, 27])
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000])
