In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Read every name from the dataset file, one name per line.
# The `with` block guarantees the file handle is closed as soon as the read
# finishes instead of relying on garbage collection to release it.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
len(words)

32033

In [6]:
# Build the character vocabulary and the two lookup tables:
#   stoi: character -> integer id
#   itos: integer id -> character
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}  # characters found in the words get ids 1..len(chars)
stoi['.'] = 0  # id 0 is reserved for '.', used by the dataset cell as padding and end-of-word marker
itos = {idx: ch for ch, idx in stoi.items()}

In [9]:
# Build the dataset of (context window, next character) pairs.
block_size = 3  # context length: how many preceding characters are used to predict the next one
X, Y = [], []   # X collects the context windows, Y the id of the character following each window
for w in words[:5]:
    print(w)
    context = [0 for _ in range(block_size)]  # the window starts as all padding (id 0, i.e. '.')
    for ch in f'{w}.':  # the trailing '.' is the end-of-word target
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        shown = ''.join([itos[i] for i in context])
        print(shown, '--->', itos[ix])
        context = [*context[1:], ix]  # slide the window: drop the oldest id, append the newest

X, Y = torch.tensor(X), torch.tensor(Y)


emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [10]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [11]:
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

As we can see, 'e' is encoded as the integer 5. To feed these integer-encoded examples to a network, we map each integer to an embedding vector.

In [12]:
# C is the embedding lookup table: a 27 x 2 tensor holding one randomly
# initialised 2-dimensional vector per character id
# (27 rows presumably = 26 letters + '.'; confirm against len(stoi)).
C = torch.randn(27, 2)
C[5]  # the embedding row for id 5

tensor([-1.7960,  0.0437])

In [13]:
# One-hot encode id 5 over the 27 classes.
one_hot_5 = F.one_hot(torch.tensor(5), num_classes=27)
# Multiplying the (float) one-hot vector by the embedding matrix selects row 5 of C,
# so the result is the embedding of id 5.
one_hot_5.float() @ C

tensor([-1.7960,  0.0437])

i.e., we can either index row 5 of C directly, or treat "one-hot encoding @ C" as the first layer of the network; both give the same result.
So we just index into the embedding table and discard the one-hot encoding.

In [19]:
# Tensor indexing is flexible: the index can be an int, a list, or another tensor.
separator = '-' * 4
print(C[5])  # a single integer selects one row
print(separator)
print(C[[5, 6, 7]])  # a list of indices retrieves multiple rows
print(separator)
print(C[[5, 6, 7, 7, 7]])  # the same row may be requested more than once
print(separator)
print(C[torch.tensor([5, 6, 7])])  # an integer tensor works as an index as well
print(separator)
# The index can also be 2-D: X is 32x3, so C[X] looks up one row of C for every
# entry of X and the result has shape 32x3x2.
row, col = 13, 2
print(C[X].shape)
print(X[row, col])
print(C[X][row, col])  # ----(1) embed everything, then pick position (13, 2)
print(C[X[row, col]])  # ----(1a) pick the id at position (13, 2), then embed it
print(C[1])            # ----(2)
# (1) and (1a) print the same row as (2) because X[13,2] = 1


tensor([-1.7960,  0.0437])
----
tensor([[-1.7960,  0.0437],
        [ 1.2913, -0.6508],
        [ 0.1613,  1.0627]])
----
tensor([[-1.7960,  0.0437],
        [ 1.2913, -0.6508],
        [ 0.1613,  1.0627],
        [ 0.1613,  1.0627],
        [ 0.1613,  1.0627]])
----
tensor([[-1.7960,  0.0437],
        [ 1.2913, -0.6508],
        [ 0.1613,  1.0627]])
----
torch.Size([32, 3, 2])
tensor(1)
tensor([0.3512, 0.0508])
tensor([0.3512, 0.0508])
tensor([0.3512, 0.0508])


In [20]:
# The whole embedding step is a single lookup: indexing C with the integer
# tensor X fetches one row of C for every entry of X.
emb = C[X]
emb.size()

torch.Size([32, 3, 2])

In [22]:
# Hidden-layer parameters.
# 6 inputs = 2 embedding values for each of the 3 ('block_size') context characters.
n_in, n_hidden = 6, 100
W1 = torch.randn(n_in, n_hidden)  # weights
b1 = torch.randn(n_hidden)        # biases for this layer



In [21]:
# We cannot simply compute
#emb @ W1 + b1
# because emb is 32x3x2 while W1 is 6x100, so the shapes are incompatible.
# The plan: retrieve the embeddings of the three context characters and
# concatenate them.
emb[:, 0].shape  # embeddings of the first context position, for every example


torch.Size([32, 2])

In [None]:
#so, we 