In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt  # for making figures

%matplotlib inline

In [2]:
# read in all the words (one name per line)
# the context manager closes the file handle as soon as the contents are read,
# and the explicit encoding keeps the result independent of the platform default
with open("names.txt", "r", encoding="utf-8") as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# number of names read from names.txt
len(words)

32033

In [4]:
# build the vocabulary of characters and the mappings to/from integers:
#   stoi: character -> integer id (sorted characters get ids 1, 2, 3, ...)
#   itos: integer id -> character (the inverse of stoi)
# id 0 is reserved for the special "." character

chars = sorted(set("".join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi["."] = 0
itos = dict(zip(stoi.values(), stoi.keys()))
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [5]:
# assemble the dataset of (context window, next character) examples

block_size = 3  # context length: how many characters do we take to predict the next one?
# X collects the context windows (inputs), Y the id of the character that follows (labels)
X, Y = [], []
for w in words[:5]:  # only the first five names, so every example can be printed
    print(w)
    context = block_size * [0]  # start from a window made entirely of "." (id 0)
    for ch in f"{w}.":  # the trailing "." is the last character to be predicted
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(f"{''.join(map(itos.__getitem__, context))} ------> {itos[ix]}")
        context = [*context[1:], ix]  # slide the window: drop the oldest id, append the newest
X, Y = torch.tensor(X), torch.tensor(Y)

emma
... ------> e
..e ------> m
.em ------> m
emm ------> a
mma ------> .
olivia
... ------> o
..o ------> l
.ol ------> i
oli ------> v
liv ------> i
ivi ------> a
via ------> .
ava
... ------> a
..a ------> v
.av ------> a
ava ------> .
isabella
... ------> i
..i ------> s
.is ------> a
isa ------> b
sab ------> e
abe ------> l
bel ------> l
ell ------> a
lla ------> .
sophia
... ------> s
..s ------> o
.so ------> p
sop ------> h
oph ------> i
phi ------> a
hia ------> .


In [6]:
# sanity check: shapes and dtypes of the input tensor X and the label tensor Y
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [7]:
# Build the lookup table C used for the embedding: row i of C is the
# 2-dimensional vector that character id i is mapped to. There are 27 rows
# because a predicted character has only 27 possibilities.
# The values come from a dedicated generator with a fixed seed, so that
# Restart Kernel -> Run All reproduces the same table (and everything computed
# from it) instead of drawing fresh random values on every run.
g_emb = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g_emb)

In [8]:
# Ways of indexing a tensor: a 1-D list/tensor of integers can be used as the index
# (this first expression is evaluated but not displayed; only the cell's last expression is)
C[[5, 6, 7]]  # C[torch.tensor([5, 6, 7])] gives the same result
# repeated indices are supported too: C[torch.tensor([5, 6, 7, 7, 7, 7, 7, 7])]
# a multi-dimensional integer tensor can be used as the index as well
C[torch.tensor([[1, 2], [3, 4]])]

tensor([[[ 1.3230, -0.2767],
         [ 0.4499,  0.7208]],

        [[ 1.5825, -0.5541],
         [ 1.8208, -0.3932]]])

In [9]:
# Exercise: think about why C[X].shape comes out this way
# (every integer in X is replaced by the corresponding row of C)
C.shape, X.shape, C[X].shape

(torch.Size([27, 2]), torch.Size([32, 3]), torch.Size([32, 3, 2]))

In [10]:
# embedding of the character at context position 2 (0-based) of the example at index 13
C[X][13, 2]

tensor([ 1.3230, -0.2767])

In [11]:
# integer id of the character at context position 2 (0-based) of the example at index 13
X[13, 2]

tensor(1)

In [12]:
# indexing C directly with that id (X[13, 2] is 1) yields the same embedding
# as C[X][13, 2]
C[1]

tensor([ 1.3230, -0.2767])

In [13]:
# C[5] and F.one_hot(torch.tensor(5), num_classes=27).float() @ C are equivalent operations,
# but taking the row directly with C[5] is the faster of the two
# create the embeddings for the whole dataset: one row of C per integer in X
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [14]:
# Build the hidden layer.
# Each example is predicted from 3 context characters and every character is
# embedded in 2 dimensions, so the layer receives 3 * 2 = 6 inputs; it has
# 100 neurons, hence W1 is (6, 100) and b1 holds one bias per neuron.
# The parameters are drawn from a dedicated generator with a fixed seed so the
# initialisation is identical on every Restart Kernel -> Run All.
g_hidden = torch.Generator().manual_seed(2147483648)
W1 = torch.randn((6, 100), generator=g_hidden)
b1 = torch.randn(100, generator=g_hidden)

In [15]:
# concatenate the embeddings of context positions 0, 1 and 2 along dim 1,
# turning each example's three 2-d vectors into one row of 6 values
# (the three slices hard-code a context length of 3)
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape

torch.Size([32, 6])

In [16]:
# Same concatenation without spelling out each position: unbind splits emb along dim 1.
# Concatenating like this is not very efficient, because torch.cat creates a new tensor
# (new memory), whereas .view() returns a tensor that shares the existing storage,
# so no data is copied.
torch.cat(torch.unbind(emb, 1), 1).shape

torch.Size([32, 6])

In [17]:
# hidden-layer activations: view each example's embeddings as one row of 6
# values, apply the affine map (W1, b1), then squash with tanh
h = (emb.view(-1, 6) @ W1 + b1).tanh()

In [18]:
# inspect the hidden activations (outputs of tanh, so values lie between -1 and 1)
h

tensor([[-9.9999e-01, -9.7743e-01, -8.6125e-01,  ...,  9.2551e-01,
          9.8394e-01, -9.1358e-01],
        [-9.9877e-01, -8.5105e-01, -9.3273e-01,  ...,  9.6424e-01,
         -8.5268e-02, -9.8647e-01],
        [-8.0745e-01, -9.9996e-01, -9.7997e-01,  ...,  4.1347e-01,
         -9.7300e-01, -9.7895e-01],
        ...,
        [-9.9997e-01,  9.9979e-01, -9.9894e-01,  ...,  9.9696e-01,
          8.6721e-01, -3.9351e-04],
        [ 9.7802e-01, -9.8434e-01,  1.9840e-01,  ...,  6.9779e-01,
         -9.9923e-01, -9.9948e-01],
        [ 1.8487e-01, -8.3643e-01,  9.5944e-01,  ...,  9.5108e-01,
          5.4847e-01,  9.9813e-01]])

In [19]:
# shape of the hidden activations
h.shape

torch.Size([32, 100])

In [20]:
# shape of the matrix product before the bias is added: (examples, 6) @ (6, 100)
# -1 lets view infer the number of examples instead of hard-coding 32, so this
# cell keeps working when the dataset is built from more than the first 5 names
(emb.view(-1, 6) @ W1).shape

torch.Size([32, 100])

In [21]:
# shape of the bias vector that is added to the matrix product
b1.shape

torch.Size([100])

In [22]:
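# Broadcasting steps when b1 is added to the (32, 100) matrix product:
# the shapes are aligned on the right, a leading dimension of size 1 is
# inserted for b1, and that dimension is then expanded across all 32 rows
# 32, 100 ---> 32, 100 ---> 32, 100
#     100 --->  1, 100 ---> 32, 100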