# Encoding

In [432]:
sentence = "the quick brown fox jumped over the lazy dog"
words = sentence.split(' ')  # 分词
print(words)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


注：set() 函数创建一个无序不重复元素集，可进行关系测试，删除重复数据，还可以计算交集、差集、并集等。

In [433]:
x = set('runoob') 
y = set('google')
print(x, y)  # 重复的‘o’被删除

{'n', 'b', 'r', 'u', 'o'} {'g', 'l', 'e', 'o'}


回归主题：

In [434]:
vocabl = list(set(words))  # 去除了重复的 ‘the’
print(vocabl)

['jumped', 'the', 'quick', 'brown', 'over', 'dog', 'lazy', 'fox']


In [435]:
print(len(words))
print(len(vocabl))

9
8


## One-hot Encoding Example
需要纬度高，编码稀疏（信息量少）等

In [436]:
# convert words to indexes
word_to_ix1 = {word : i for i, word in enumerate(vocabl)}  # convert to dict
print(word_to_ix1)

{'jumped': 0, 'the': 1, 'quick': 2, 'brown': 3, 'over': 4, 'dog': 5, 'lazy': 6, 'fox': 7}


In [437]:
import torch
from torch.nn.functional import one_hot

words = torch.tensor([word_to_ix1[w] for w in vocabl], dtype=torch.long)
print(words)
one_hot_encoding = one_hot(words)  # one-hot编码
print(vocabl)
print(one_hot_encoding)  # 信息量太少

tensor([0, 1, 2, 3, 4, 5, 6, 7])
['jumped', 'the', 'quick', 'brown', 'over', 'dog', 'lazy', 'fox']
tensor([[1, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 1]])


## Word Embedding Example

In [438]:
# Context is the number of words we are using as a context for the next word we want to predict.
CONTEXT_SIZE = 2

# Embedding dimension is the size of embedding vector
EMBEDDING_DIM = 10

# Size of the hidden Layer
HIDDEN_DIM = 256

In [439]:
# we will use Shakespeare Sonnet 2
test_sentence = """Tomorrow and tomorrow and tomorrow,
Creeps in this petty pace from day to day
To the last syllable of recorded time,
And all our yesterdays have lighted fools
The way to dusty death. Out, out, brief candle!
Life's but a walking shadow, a poor player
That struts and frets his hour upon the stage
And then is heard no more: it is a tale
Told by an idiot, full of sound and fury,
Signifying nothing.""".lower().split()

In [440]:
# Build a list of tuples. Each tuple is ([word_i-2, word_i-1], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
             for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[: 6])
vocab2 = list(set(test_sentence))
print('=='*50)
print(vocab2)
print('=='*50)
print(len(test_sentence))
print(len(vocab2))
word_to_ix2 = {word : i for i, word in enumerate(vocab2)}
print('=='*50)
print(word_to_ix2)

[(['tomorrow', 'and'], 'tomorrow'), (['and', 'tomorrow'], 'and'), (['tomorrow', 'and'], 'tomorrow,'), (['and', 'tomorrow,'], 'creeps'), (['tomorrow,', 'creeps'], 'in'), (['creeps', 'in'], 'this')]
['last', 'petty', 'by', 'in', 'from', 'out,', 'nothing.', 'recorded', 'yesterdays', 'fury,', 'a', 'but', 'no', 'more:', 'sound', 'the', 'pace', 'death.', 'that', 'then', 'dusty', 'brief', 'tale', 'this', 'time,', 'frets', 'signifying', 'his', 'walking', 'hour', 'poor', 'upon', 'is', 'to', 'syllable', 'candle!', 'have', 'heard', 'creeps', 'day', 'lighted', 'it', 'idiot,', "life's", 'struts', 'and', 'way', 'fools', 'our', 'stage', 'all', 'shadow,', 'tomorrow,', 'an', 'told', 'player', 'of', 'full', 'tomorrow']
75
59
{'last': 0, 'petty': 1, 'by': 2, 'in': 3, 'from': 4, 'out,': 5, 'nothing.': 6, 'recorded': 7, 'yesterdays': 8, 'fury,': 9, 'a': 10, 'but': 11, 'no': 12, 'more:': 13, 'sound': 14, 'the': 15, 'pace': 16, 'death.': 17, 'that': 18, 'then': 19, 'dusty': 20, 'brief': 21, 'tale': 22, 'this

In [441]:
import torch
import torch.autograd as autograd
import torch.nn as nn 
import torch.optim as optim
import torch.nn.functional as F 

In [442]:
class NGramLanguageModeler(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, HIDDEN_DIM)
        self.linear2 = nn.Linear(HIDDEN_DIM, vocab_size)

    def forward(self, inputs):
        embeds = self.embedding(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [443]:
learning_rate = 0.001
losses = []
loss_function = nn.NLLLoss()  # negative log likehood
model = NGramLanguageModeler(len(vocab2), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [444]:
from tqdm import tqdm  # 加上进度条

for epoch in range(50):
    total_loss = 0
    iterator = tqdm(trigrams)
    for context, target in iterator:
        # Step 1. Prepare the inputs to be passed to the model(i.e, turn the words into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix2[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a new instance,you need to zero out gradients from the old instance.
        model.zero_grad()

        # Step 3. Run the forward pass,getting log probabilities over next words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again,Torch wants the target word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix2[target]], dtype=torch.long))
        # Step 5. Do the backward pass and update the gradient
        loss.backward() 
        optimizer.step()

        # Get the Python number from a 1-elements Tensor calling tensor.item()
        total_loss += loss.item()
        iterator.set_postfix(loss=float(loss))  # 在进度条后面加上损失值 loss
    losses.append(total_loss)
    # add progress bar with epoch

100%|██████████| 73/73 [00:02<00:00, 34.55it/s, loss=4.5]
100%|██████████| 73/73 [00:02<00:00, 30.08it/s, loss=4.46]
100%|██████████| 73/73 [00:03<00:00, 22.43it/s, loss=4.42]
100%|██████████| 73/73 [00:01<00:00, 36.54it/s, loss=4.38]
100%|██████████| 73/73 [00:02<00:00, 29.75it/s, loss=4.34]
100%|██████████| 73/73 [00:01<00:00, 53.40it/s, loss=4.3]
100%|██████████| 73/73 [00:01<00:00, 37.98it/s, loss=4.26]
100%|██████████| 73/73 [00:02<00:00, 35.78it/s, loss=4.22]
100%|██████████| 73/73 [00:01<00:00, 46.44it/s, loss=4.18]
100%|██████████| 73/73 [00:02<00:00, 32.79it/s, loss=4.15]
100%|██████████| 73/73 [00:02<00:00, 33.60it/s, loss=4.11]
100%|██████████| 73/73 [00:02<00:00, 32.43it/s, loss=4.07]
100%|██████████| 73/73 [00:02<00:00, 34.24it/s, loss=4.03]
100%|██████████| 73/73 [00:01<00:00, 39.31it/s, loss=4]
100%|██████████| 73/73 [00:01<00:00, 38.18it/s, loss=3.96]
100%|██████████| 73/73 [00:02<00:00, 30.38it/s, loss=3.92]
100%|██████████| 73/73 [00:02<00:00, 31.14it/s, loss=3.88]
10

In [445]:
# check the structure of our model here
# model.eval()

In [446]:
import numpy

with torch.no_grad():
    context = ['creeps', 'in']
    context_idxs = torch.tensor([word_to_ix2[w] for w in context], dtype=torch.long)
    pred = model(context_idxs)
    print(pred)
    print(pred.type())
    index_of_prediction = numpy.argmax(pred)
    print(vocab2[index_of_prediction])

tensor([[-4.4783, -4.4191, -3.7023, -5.1273, -4.2793, -4.2370, -4.6083, -3.9433,
         -4.2768, -4.4262, -2.8790, -3.8211, -4.4354, -3.7915, -4.1032, -3.1344,
         -4.4880, -4.8107, -4.3386, -5.1267, -4.4028, -4.3076, -4.0304, -2.6152,
         -4.3099, -4.5722, -4.1308, -4.7391, -4.6606, -3.9616, -4.4295, -4.6521,
         -4.1112, -3.6293, -4.5858, -4.3190, -4.6619, -5.0203, -4.2950, -4.3610,
         -4.1950, -4.5477, -4.9065, -4.2715, -4.2793, -2.9366, -4.3579, -4.1131,
         -4.5759, -4.4559, -4.4987, -4.1404, -4.7564, -4.8629, -4.3767, -3.7872,
         -2.7306, -4.6732, -4.6658]])
torch.FloatTensor
this
