In [1]:
import numpy as np
import pandas as pd
import requests as rqst
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [2]:
# Load the dataset: one name per line of names.txt.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection; the explicit encoding
# keeps the result independent of the platform's default locale.
with open('names.txt', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [8]:
# Alphabet of the dataset: every distinct character that occurs in any name,
# in sorted order.
chars = sorted({symbol for name in words for symbol in name})
chars

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [9]:
# Character <-> index lookup tables.
# Letters are numbered from 1 in alphabet order; index 0 is assigned to '.',
# the marker that the dataset-building cell appends to every word.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Inverse mapping (index -> character), built by flipping each stoi pair.
itos = {index: symbol for symbol, index in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [12]:
block_size = 3  # context length: how many preceding characters predict the next one


def build_dataset(words, stoi, block_size=3, verbose=False):
    """Turn words into (context, next-character) training pairs.

    Each word is terminated with '.', and its context window starts out as
    ``block_size`` copies of index 0 (the index this notebook's ``stoi``
    assigns to '.'). One example is emitted per character of the word plus
    one for the terminator; after each example the window slides by one.

    Args:
        words: iterable of strings whose characters (and '.') are keys of ``stoi``.
        stoi: mapping from character to integer index.
        block_size: number of context indices per example.
        verbose: when True, print each word followed by one
            ``context ----> target`` line per example.

    Returns:
        (X, Y): int64 tensors of shape (N, block_size) and (N,), where N is
        the total number of examples.

    Raises:
        KeyError: if a character of a word is missing from ``stoi``.
    """
    # Inverse mapping, needed only to render indices back to text when verbose.
    index_to_char = {index: char for char, index in stoi.items()}
    contexts, targets = [], []
    for word in words:
        if verbose:
            print(word)
        context = [0] * block_size
        for char in word + ".":
            ix = stoi[char]
            contexts.append(context)
            targets.append(ix)
            if verbose:
                print(''.join(index_to_char[i] for i in context), '---->', index_to_char[ix])
            # Slide the window: drop the oldest index, append the newest.
            # This builds a new list, so the one stored above is not mutated.
            context = context[1:] + [ix]
    # Explicit dtype and shape keep the result well-formed (int64, (0, block_size))
    # even when `words` is empty, where torch.tensor([]) alone would be float32, (0,).
    X = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), block_size)
    Y = torch.tensor(targets, dtype=torch.long)
    return X, Y


# Small verbose sample (first five names) so the sliding windows can be inspected by eye.
X, Y = build_dataset(words[:5], stoi, block_size, verbose=True)

emma
... ----> e
..e ----> m
.em ----> m
emm ----> a
mma ----> .
olivia
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
ava
... ----> a
..a ----> v
.av ----> a
ava ----> .
isabella
... ----> i
..i ----> s
.is ----> a
isa ----> b
sab ----> e
abe ----> l
bel ----> l
ell ----> a
lla ----> .
sophia
... ----> s
..s ----> o
.so ----> p
sop ----> h
oph ----> i
phi ----> a
hia ----> .


In [26]:
# Peek at the first two examples, then show shape and dtype of both tensors.
print(f" samples of X: {X[:2]}", f" samples of Y: {Y[:2]}", sep="\n")
(X.shape, X.dtype, Y.shape, Y.dtype)

 samples of X: tensor([[0, 0, 0],
        [0, 0, 5]])
 samples of Y: tensor([ 5, 13])


(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

## Embedding characters into a lower-dimensional space


In [17]:

# Embedding lookup table C: one row per token, so each of the 27 tokens
# (26 letters + '.') maps to a 2-dimensional vector.
VOCAB_SIZE = 27
EMBED_DIM = 2
# A seeded generator makes the table, and every output derived from it,
# reproducible under Restart Kernel -> Run All.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((VOCAB_SIZE, EMBED_DIM), generator=g)
# Embedding a single integer is plain row indexing.
C[5]

tensor([-0.4507,  0.5883])

In [21]:
# Same lookup expressed with one-hot encoding: a one-hot row vector times C
# selects a single row of C.
emb_five = F.one_hot(torch.tensor(5), num_classes=27)
# F.one_hot returns int64 (long), so it must be cast to float before it can be
# matrix-multiplied with the float table C. Assert it rather than leaving a
# bare expression whose value is silently discarded mid-cell.
assert emb_five.dtype == torch.int64
# The one-hot product and direct indexing must agree.
assert torch.allclose(emb_five.float() @ C, C[5])
emb_five.float() @ C

tensor([-0.4507,  0.5883])

## conclusion

---

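The result is the same as plain indexing (`C[5]`), because multiplying a one-hot row vector by `C` zeroes out every row of `C` except the one at the position of the 1, which is returned unchanged.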


In [23]:
# Torch tensors can be indexed with lists or integer tensors: C[X] looks up the
# embedding row for every entry of X in one operation.
print(C[X].size())
C[X]

torch.Size([32, 3, 2])


tensor([[[-0.2244, -2.2367],
         [-0.2244, -2.2367],
         [-0.2244, -2.2367]],

        [[-0.2244, -2.2367],
         [-0.2244, -2.2367],
         [-0.4507,  0.5883]],

        [[-0.2244, -2.2367],
         [-0.4507,  0.5883],
         [ 0.8921,  0.6443]],

        [[-0.4507,  0.5883],
         [ 0.8921,  0.6443],
         [ 0.8921,  0.6443]],

        [[ 0.8921,  0.6443],
         [ 0.8921,  0.6443],
         [ 0.4130, -1.7827]],

        [[-0.2244, -2.2367],
         [-0.2244, -2.2367],
         [-0.2244, -2.2367]],

        [[-0.2244, -2.2367],
         [-0.2244, -2.2367],
         [-0.4456, -1.0036]],

        [[-0.2244, -2.2367],
         [-0.4456, -1.0036],
         [ 0.2209,  0.8026]],

        [[-0.4456, -1.0036],
         [ 0.2209,  0.8026],
         [ 1.8280, -0.3603]],

        [[ 0.2209,  0.8026],
         [ 1.8280, -0.3603],
         [ 0.6267, -0.1460]],

        [[ 1.8280, -0.3603],
         [ 0.6267, -0.1460],
         [ 1.8280, -0.3603]],

        [[ 0.6267, -0

In [24]:
# Single element of X: example 13, context position 2.
X[13, 2]

tensor(1)