# Deep Learning for Natural Language Processing with Pytorch

本教程将向您介绍使用Pytorch进行深度学习编程的关键思想。本教程旨在让您开始编写深度学习代码。
请注意，本教程关注的是模型(model)，而不是数据。对于所有模型，我只创建了一些维度较小的测试示例，以便您可以看到权重在训练时如何变化。如果您想要尝试一些真实数据，您应该能够从这个笔记本中取出任何一个模型，并将其用于这些真实数据。

In [3]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)

<torch._C.Generator at 0x205d60bc890>

# 1.介绍Torch的张量(tensor)库
所有的深度学习都是在张量上计算的。首先，让我们来看看用张量可以做什么。
## 创建张量
可以使用torch.tensor()函数创建张量

In [28]:
# Build a 1-D tensor (a vector) from a flat Python list.
V_data = [1, 2, 3]
V = torch.tensor(V_data)

# Build a 2-D tensor (a matrix) from a list of rows.
M_data = [[1, 2, 3], [4, 5, 6]]
M = torch.tensor(M_data)

# Build a 3-D tensor of size 2 x 2 x 2 from a doubly nested list.
T_data = [
    [[1, 2], [3, 4]],
    [[5, 6], [7, 8]],
]
T = torch.tensor(T_data)

# Display every tensor; indexing T with a single index selects its first
# 2 x 2 slice.
print('V:', V)
print('M:', M)
print('T:', T)
print('T[0]:', T[0])

V: tensor([1, 2, 3])
M: tensor([[1, 2, 3],
        [4, 5, 6]])
T: tensor([[[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]]])
T[0]: tensor([[1, 2],
        [3, 4]])


In [29]:
# A tensor of random values can be created from its dimensions alone; here
# the three sizes are passed as separate arguments.
x = torch.randn(3, 4, 5)
print(x)

tensor([[[-1.5256, -0.7502, -0.6540, -1.6095, -0.1002],
         [-0.6092, -0.9798, -1.6091, -0.7121,  0.3037],
         [-0.7773, -0.2515, -0.2223,  1.6871,  0.2284],
         [ 0.4676, -0.6970, -1.1608,  0.6995,  0.1991]],

        [[ 0.8657,  0.2444, -0.6629,  0.8073,  1.1017],
         [-0.1759, -2.2456, -1.4465,  0.0612, -0.6177],
         [-0.7981, -0.1316,  1.8793, -0.0721,  0.1578],
         [-0.7735,  0.1991,  0.0457,  0.1530, -0.4757]],

        [[-0.1110,  0.2927, -0.1578, -0.0288,  0.4533],
         [ 1.1422,  0.2486, -1.7754, -0.0255, -1.0233],
         [-0.5962, -1.0055,  0.4285,  1.4761, -1.7869],
         [ 1.6103, -0.7040, -0.1853, -0.9962, -0.8313]]])


## 张量的一些运算

In [31]:
# Element-wise addition of two float vectors.  The vectors are built with the
# torch.tensor() factory, the same way the other cells create tensors from
# existing data, instead of the legacy torch.Tensor constructor.
x = torch.tensor([1., 2., 3.])
y = torch.tensor([4., 5., 6.])
z = x + y
print(z)

tensor([5., 7., 9.])


我们稍后将使用的一个有用的操作叫连接(concatenation)

In [35]:
# Concatenate along the first axis (dim=0, the default): the row counts add
# up (2 + 3 = 5) while both inputs share the same 5 columns.
x_1 = torch.randn(2, 5)
y_1 = torch.randn(3, 5)
z_1 = torch.cat((x_1, y_1), dim=0)
print(x_1, y_1, z_1, sep="\n")

# Concatenate along the second axis (dim=1): the column counts add up
# (3 + 5 = 8) while both inputs share the same 2 rows.
x_2 = torch.randn(2, 3)
y_2 = torch.randn(2, 5)
z_2 = torch.cat((x_2, y_2), dim=1)
print(x_2, y_2, z_2, sep="\n")

tensor([[-0.1153,  0.3170,  0.5629,  0.8662, -0.3528],
        [ 0.3482,  1.1371, -0.3339, -1.4724,  0.7296]])
tensor([[-0.1312, -0.6368,  1.0429,  0.4903,  1.0318],
        [-0.5989,  1.6015, -1.0735, -1.2173,  0.6472],
        [-0.0412, -0.1775, -0.5000,  0.8673, -0.2732]])
tensor([[-0.1153,  0.3170,  0.5629,  0.8662, -0.3528],
        [ 0.3482,  1.1371, -0.3339, -1.4724,  0.7296],
        [-0.1312, -0.6368,  1.0429,  0.4903,  1.0318],
        [-0.5989,  1.6015, -1.0735, -1.2173,  0.6472],
        [-0.0412, -0.1775, -0.5000,  0.8673, -0.2732]])
tensor([[-0.4608, -0.0991,  0.4728],
        [ 1.0049, -0.2871, -1.1619]])
tensor([[ 0.0276,  0.5652, -0.0115,  0.6706, -0.4929],
        [ 1.5050, -2.3264,  1.6169, -0.9026,  0.1737]])
tensor([[-0.4608, -0.0991,  0.4728,  0.0276,  0.5652, -0.0115,  0.6706, -0.4929],
        [ 1.0049, -0.2871, -1.1619,  1.5050, -2.3264,  1.6169, -0.9026,  0.1737]])


## 重塑张量
使用.view()方法重塑张量

In [37]:
# A (2, 3, 4) tensor holds 2 * 3 * 4 = 24 elements.
x = torch.randn(2, 3, 4)
print(x)

# view() presents the same 24 elements as 2 rows of 12 columns.  Passing -1
# for a dimension lets it be inferred from the others, so both views printed
# below are the same 2 x 12 tensor.
print(x.view(2, 12), x.view(2, -1), sep="\n")

tensor([[[ 0.2936, -0.4139, -0.0960, -1.3281],
         [ 0.2324,  0.8615,  0.6218, -1.7812],
         [-0.9965,  0.8073,  1.1739, -0.9398]],

        [[ 0.3861,  1.0473, -0.7327, -0.9168],
         [ 0.6867,  0.4209, -1.0214,  0.9886],
         [ 0.7806, -2.2049, -1.4975, -0.9023]]])
tensor([[ 0.2936, -0.4139, -0.0960, -1.3281,  0.2324,  0.8615,  0.6218, -1.7812,
         -0.9965,  0.8073,  1.1739, -0.9398],
        [ 0.3861,  1.0473, -0.7327, -0.9168,  0.6867,  0.4209, -1.0214,  0.9886,
          0.7806, -2.2049, -1.4975, -0.9023]])
tensor([[ 0.2936, -0.4139, -0.0960, -1.3281,  0.2324,  0.8615,  0.6218, -1.7812,
         -0.9965,  0.8073,  1.1739, -0.9398],
        [ 0.3861,  1.0473, -0.7327, -0.9168,  0.6867,  0.4209, -1.0214,  0.9886,
          0.7806, -2.2049, -1.4975, -0.9023]])


# 2.计算图和自动微分
计算图的概念对于高效的深度学习编程至关重要，因为它让您不必自己编写反向传播的梯度计算。
计算图只是一个关于如何组合数据以得到输出的规范。由于计算图完全指定了哪些参数参与了哪些运算，因此它包含足够的信息来计算导数。这可能听起来很模糊，所以让我们看看Pytorch的基本类：autograd.Variable。（注：自PyTorch 0.4起，Variable已与Tensor合并，直接创建requires_grad=True的张量即可得到相同的效果。）

In [43]:
# Wrap two float vectors so autograd tracks the operations applied to them.
# The values must be floating point for gradients to be requested.
# NOTE: autograd.Variable is deprecated since PyTorch 0.4; it is kept here
# because this cell introduces the class by name.
x = autograd.Variable(torch.tensor([1., 2., 3.]), requires_grad=True)
y = autograd.Variable(torch.tensor([4., 5., 6.]), requires_grad=True)
print(x.data)

# The sum carries a grad_fn that records the operation which produced it.
z = x + y
print(z.data)
print(z.grad_fn)

tensor([1., 2., 3.])
tensor([5., 7., 9.])
<AddBackward0 object at 0x00000205CED65DD8>


In [55]:
# Differentiate a small scalar-valued computation with autograd.
# requires_grad=True asks autograd to record every operation applied to x;
# the tensor is created directly, without the deprecated Variable wrapper.
x = torch.tensor([2., 3.], requires_grad=True)
y = x + 2
z = y * y * 3
out = z.mean()  # scalar: mean of 3 * (x + 2)^2 over the two elements
print(out)

# backward() fills x.grad with d(out)/dx = 6 * (x + 2) / 2 = 3 * (x + 2).
out.backward()
print(x.grad)

tensor(61.5000, grad_fn=<MeanBackward1>)
tensor([12., 15.])


# 3. 用pytorch创建网络组件
在我们继续关注NLP之前，让我们先看一个带注释的示例，使用仿射映射和非线性函数在Pytorch中构建网络。我们还将看到如何使用Pytorch内置的负对数似然计算损失函数，并通过反向传播更新参数。
让我们编写一个带注释的网络示例，该网络接收稀疏的词袋(bag-of-words)表示，并输出两个标签“英语”和“西班牙语”上的概率分布。这个模型其实就是逻辑回归。

In [60]:
# Toy training corpus: each example pairs a tokenised sentence with the label
# of the language it is written in.
data = [
    ("me gusta comer en la cafeteria".split(), "SPANISH"),
    ("Give it to me".split(), "ENGLISH"),
    ("No creo que sea una buena idea".split(), "SPANISH"),
    ("No it is not a good idea to get lost at sea".split(), "ENGLISH"),
]

# Held-out examples in the same (tokens, label) format.
test_data = [
    ("Yo creo que si".split(), "SPANISH"),
    ("it is lost on me".split(), "ENGLISH"),
]

# word_to_ix maps each word in the vocab to a unique integer, which will be
# its index into the bag-of-words vector.  Words are numbered in order of
# first appearance across the training and test sentences; setdefault leaves
# the index of an already-seen word untouched.
word_to_ix = {}
for sentence, _ in data + test_data:
    for token in sentence:
        word_to_ix.setdefault(token, len(word_to_ix))
print(word_to_ix)

VOCAB_SIZE = len(word_to_ix)
print(VOCAB_SIZE)

{'on': 25, 'la': 4, 'good': 19, 'si': 24, 'sea': 12, 'to': 8, 'Give': 6, 'is': 16, 'get': 20, 'No': 9, 'Yo': 23, 'a': 18, 'una': 13, 'buena': 14, 'que': 11, 'idea': 15, 'comer': 2, 'cafeteria': 5, 'it': 7, 'at': 22, 'lost': 21, 'gusta': 1, 'creo': 10, 'en': 3, 'not': 17, 'me': 0}
26


In [61]:
class BoWClassifier(nn.Module):
    """Bag-of-words logistic-regression classifier.

    A single affine layer maps a bag-of-words count vector to one score per
    label; a log-softmax turns those scores into log probabilities.

    Args:
        num_labels: Number of output classes.
        vocab_size: Length of the bag-of-words input vector.
    """

    def __init__(self, num_labels, vocab_size):
        super().__init__()

        # The input has one feature per vocabulary word and the output has
        # one score per label, hence a vocab_size -> num_labels affine map.
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        """Return log probabilities over the labels for ``bow_vec``.

        Args:
            bow_vec: Float tensor whose last dimension has size
                ``vocab_size``.

        Returns:
            A tensor with the same leading dimensions as ``bow_vec`` and a
            last dimension of size ``num_labels``; exponentiating it gives
            values that sum to 1 along that last dimension.
        """
        # nn.Linear always puts the label scores on the last axis, so
        # normalise over dim=-1 explicitly.  Omitting ``dim`` is deprecated,
        # emits a warning, and picks axis 0 for 3-D input.
        return F.log_softmax(self.linear(bow_vec), dim=-1)

In [74]:
def make_bow_vector(sentence, word_to_ix):
    """Encode a tokenised sentence as a bag-of-words count vector.

    Args:
        sentence: Iterable of word tokens.
        word_to_ix: Mapping from each word to its index in the vector.

    Returns:
        A float tensor of shape ``(1, len(word_to_ix))`` whose entry at a
        word's index is the number of times that word occurs in
        ``sentence``.

    Raises:
        KeyError: If a token is not a key of ``word_to_ix``.
    """
    counts = torch.zeros(len(word_to_ix))
    for column in (word_to_ix[token] for token in sentence):
        counts[column] += 1
    # Reshape the flat count vector into a single row.
    return counts.view(1, -1)

def make_target(label, label_to_ix):
    """Return the class index of ``label`` as a one-element integer tensor.

    Args:
        label: Key to look up in ``label_to_ix``.
        label_to_ix: Mapping from each label to its integer class index.

    Returns:
        A ``torch.LongTensor`` holding the single index mapped to ``label``.

    Raises:
        KeyError: If ``label`` is not a key of ``label_to_ix``.
    """
    class_index = label_to_ix[label]
    return torch.LongTensor([class_index])

In [70]:
# Number of output classes the classifier distinguishes.
NUM_LABELS = 2

# Instantiate the classifier and print each of its freshly initialised
# parameters in turn.
model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)
for parameter in model.parameters():
    print(parameter)

Parameter containing:
tensor([[-0.0160,  0.0598, -0.1947,  0.1744, -0.0427, -0.1800,  0.1352, -0.0530,
          0.1781, -0.1332, -0.1821, -0.1473,  0.0850, -0.0220, -0.1817,  0.0364,
         -0.0125,  0.0709, -0.0998, -0.1400,  0.1337, -0.0944,  0.1380,  0.1590,
         -0.1372, -0.0904],
        [ 0.0236, -0.1849,  0.0602, -0.0817, -0.0870, -0.1516,  0.0917, -0.1250,
         -0.1568, -0.0516, -0.1092, -0.1907,  0.0790, -0.0746,  0.1374,  0.1380,
         -0.1251, -0.1143, -0.0696,  0.1149, -0.0068, -0.0701,  0.1215,  0.1860,
          0.1450, -0.1500]], requires_grad=True)
Parameter containing:
tensor([-0.0407, -0.0376], requires_grad=True)


In [73]:
# Run the untrained model on the first training example.
print(data)
sample = data[0]  # a (tokens, label) pair; only the tokens are used here
bow_vector = make_bow_vector(sample[0], word_to_ix)
# The tensor is passed to the model directly: wrapping it in the deprecated
# autograd.Variable is no longer needed.
log_probs = model(bow_vector)
print(log_probs)

[(['me', 'gusta', 'comer', 'en', 'la', 'cafeteria'], 'SPANISH'), (['Give', 'it', 'to', 'me'], 'ENGLISH'), (['No', 'creo', 'que', 'sea', 'una', 'buena', 'idea'], 'SPANISH'), (['No', 'it', 'is', 'not', 'a', 'good', 'idea', 'to', 'get', 'lost', 'at', 'sea'], 'ENGLISH')]
tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])
tensor([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])
tensor([1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])
tensor([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])
tensor([1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])
tensor([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])
tensor



In [75]:
# Integer class index assigned to each language label.
label_to_ix = {
    "SPANISH": 0,
    "ENGLISH": 1,
}

接下来开始训练。为此，我们将实例传入模型以获取对数概率，计算损失函数，计算损失函数的梯度，然后用一次梯度下降步骤更新参数。损失函数由Torch在nn包中提供，nn.NLLLoss()就是我们想要的负对数似然损失。优化函数则定义在torch.optim中，在这里我们将使用SGD。

In [81]:
# Run the model on the test data before training, as a baseline to compare
# with the predictions printed after training.
for instance, label in test_data:
    # The bag-of-words tensor goes to the model as-is; the deprecated
    # autograd.Variable wrapper is not needed.
    bow_vec = make_bow_vector(instance, word_to_ix)
    log_probs = model(bow_vec)
    print(log_probs)
# Optional: uncomment to print the weight column that belongs to "creo".
# print(next(model.parameters())[:, word_to_ix["creo"]])

tensor([[-0.1670, -1.8723]], grad_fn=<LogSoftmaxBackward>)
tensor([[-2.8061, -0.0623]], grad_fn=<LogSoftmaxBackward>)




In [80]:
# Negative log-likelihood loss consumes the log probabilities the model
# returns; plain SGD with a learning rate of 0.1 updates the parameters.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(100):
    for instance, label in data:
        # Step 1. PyTorch accumulates gradients, so clear them out before
        # each instance.
        model.zero_grad()

        # Step 2. Make our BOW vector and the target as an integer tensor.
        # For example, if the target is SPANISH, the target holds the
        # integer 0.  The loss function then knows that the 0th element of
        # the log probabilities is the log probability corresponding to
        # SPANISH.  Both tensors are used directly; the deprecated
        # autograd.Variable wrapper is not needed.
        bow_vec = make_bow_vector(instance, word_to_ix)
        target = make_target(label, label_to_ix)

        # Step 3. Run our forward pass.
        log_probs = model(bow_vec)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

# Run the trained model on the test data to compare with the predictions
# made before training.
for instance, label in test_data:
    bow_vec = make_bow_vector(instance, word_to_ix)
    log_probs = model(bow_vec)
    print(log_probs)



tensor([[-0.1670, -1.8723]], grad_fn=<LogSoftmaxBackward>)
tensor([[-2.8061, -0.0623]], grad_fn=<LogSoftmaxBackward>)
