#### NLPM(Neural Probabilistic Language Model)
主要由三部分组成
- 输入层：将词汇映射到连续的词向量空间
- 隐藏层：通过非线形激活函数学习词与词之间的复杂关系
- 输出层：通过softmax函数产生下一个单词的概率分布

![image](../data/image/NLPM.jpg)


NLPM的历史意义在于开创性地把神经网络技术引入NLP领域。




In [56]:
import torch
import random
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#### 创建一个简单的用于演示的数据集

In [6]:
sentences = ['我 喜欢 玩具', '我 爱 爸爸', '我 讨厌 挨打']
word_list = list(set(' '.join(sentences).split()))
word_to_idx = {word:idx for idx, word in enumerate(word_list)}
idx_to_word = {idx:word for idx, word in enumerate(word_list)}
voc_size = len(word_list)

In [54]:
word_to_idx, idx_to_word, voc_size

({'挨打': 0, '我': 1, '爱': 2, '爸爸': 3, '讨厌': 4, '玩具': 5, '喜欢': 6},
 {0: '挨打', 1: '我', 2: '爱', 3: '爸爸', 4: '讨厌', 5: '玩具', 6: '喜欢'},
 7)

#### 生成NPLM训练数据集

In [20]:
batch_size = 2
def make_batch():
    #定义输入数据列表
    intput_batch = []
    # 定义输出数据列表
    target_batch = []
    #随机选取batch_size个句子
    selected_sentences = random.sample(sentences, batch_size)
    for sen in selected_sentences:
        word = sen.split()
        input = [word_to_idx[n] for n in word[:-1]]
        output = word_to_idx[word[-1]]
        intput_batch.append(input)
        target_batch.append(output)
    input_batch = torch.LongTensor(intput_batch)
    target_batch = torch.LongTensor(target_batch)
    return input_batch, target_batch

In [68]:
input,output = make_batch()
input_word = []
for input_idx in input:
    input_word.append([idx_to_word[idx.item()] for idx in input_idx])
print(f"模型输入数据{input},对应词{input_word}")
target_word = [idx_to_word[idx.item()] for idx in output]
print(f"模型输出数据{output.shape},对应词{target_word}")

模型输入数据tensor([[1, 4],
        [1, 2]]),对应词[['我', '讨厌'], ['我', '爱']]
模型输出数据torch.Size([2]),对应词['挨打', '爸爸']


#### 定义NPLM模型

In [71]:
class NPLM(nn.Module):
    def __init__(self, voc_size, embedding_size, n_setp, n_hidden):
        super(NPLM, self).__init__()
        self.embedding_size = embedding_size
        self.n_setp = n_setp
        self.embedding = nn.Embedding(voc_size, embedding_size)
        self.hidden = nn.Linear(embedding_size * n_setp, n_hidden, bias=True)
        self.output = nn.Linear(n_hidden, voc_size,bias=True)
        
    def forward(self, x):
        emb = self.embedding(x).view(-1,self.embedding_size * self.n_setp)
        hidden = F.tanh(self.hidden(emb))
        out = self.output(hidden)
        return out

In [67]:
input = torch.LongTensor([[1,4],[2,3]])
model = NPLM(7, 2, 2, 2)
model(input).shape

torch.Size([2, 4])


torch.Size([2, 7])

#### 训练NPLM模型

In [74]:
learning_rate = 0.1
epochs = 5000
criterion = nn.CrossEntropyLoss()
model = NPLM(7, 2, 2, 2)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
for epoch in range(epochs):
    optimizer.zero_grad()
    intput_batch, target_batch = make_batch()
    output = model(intput_batch)
    loss = criterion(output, target_batch)
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch:{epoch+1}, cost:{loss.item()}")
    loss.backward()
    optimizer.step()

Epoch:1000, cost:0.0006107345107011497
Epoch:2000, cost:0.00018952196114696562
Epoch:3000, cost:0.00011288482346571982
Epoch:4000, cost:5.7516826927894726e-05
Epoch:5000, cost:3.2007181289372966e-05


#### 使用NPLM预测

In [100]:
# 要预测的提示词
input_strs = [['我','讨厌'],['我','喜欢']]
# 转换为对应的索引
input_indices = [[word_to_idx[word] for word in seq] for seq in input_strs]
# 转化为张量
input_batch = torch.LongTensor(input_indices)
# 计算结果，并找到概率最大的
predict = model(input_batch).data.max(1).indices

predict_str = [idx_to_word[n.item()] for n in predict]

for input_seq, pred in zip(input_strs, predict_str):
    print(input_seq,'->',pred)

['我', '讨厌'] -> 挨打
['我', '喜欢'] -> 玩具
