# pytorch学习

## 一、pytorch 简介

In [5]:
import torch
# Draw a 2x3x4 tensor of standard-normal samples.
x = torch.randn(2, 3, 4)
print(x)
# view() reinterprets the same 24 elements as 2 rows of 12 values each.
print(x.view(2, 12)) 

tensor([[[-1.0962,  0.7281,  1.4302, -0.0948],
         [ 0.5363, -1.4412,  1.2011, -0.1106],
         [-0.6762, -0.1976, -0.3512, -0.2141]],

        [[ 1.0735, -0.0218,  0.1169,  1.3246],
         [-1.4683, -1.5913, -1.3995, -0.6038],
         [ 0.2400, -0.2608, -1.0707, -0.2362]]])
tensor([[-1.0962,  0.7281,  1.4302, -0.0948,  0.5363, -1.4412,  1.2011, -0.1106,
         -0.6762, -0.1976, -0.3512, -0.2141],
        [ 1.0735, -0.0218,  0.1169,  1.3246, -1.4683, -1.5913, -1.3995, -0.6038,
          0.2400, -0.2608, -1.0707, -0.2362]])


In [6]:
# Tensor factory methods have a ``requires_grad`` flag; setting it asks
# autograd to record the operations applied to the tensor.
x = torch.tensor([1., 2., 3], requires_grad=True)
print(x)
# With requires_grad=True, you can still do all the operations you previously
# could
y = torch.tensor([4., 5., 6], requires_grad=True)
print(y)

z = x + y
print(z)
# BUT z knows something extra: grad_fn references the operation (here an
# addition) that produced it.
print(z.grad_fn)

tensor([1., 2., 3.], requires_grad=True)
tensor([4., 5., 6.], requires_grad=True)
tensor([5., 7., 9.], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x11b825790>


In [7]:
# Let's sum up all the entries in z to get a scalar.
s = z.sum()
print(s)
# s was produced by a sum, so its grad_fn is the backward node of that sum.
print(s.grad_fn)

tensor(21., grad_fn=<SumBackward0>)
<SumBackward0 object at 0x11b825710>


In [8]:
# Backpropagate from the scalar s through the recorded graph.
s.backward()
# s = sum(x + y), so the derivative of s with respect to each entry of x is 1.
print(x.grad)

tensor([1., 1., 1.])


In [9]:
# Two fresh tensors created without asking for gradient tracking.
x = torch.randn(2, 2)
y = torch.randn(2, 2)
# By default, user created Tensors have ``requires_grad=False``
print(x.requires_grad, y.requires_grad)
z = x + y
# So you can't backprop through z: no operation was recorded and grad_fn is None
print(z.grad_fn)

# ``.requires_grad_( ... )`` changes an existing Tensor's ``requires_grad``
# flag in-place. The input flag defaults to ``True`` if not given.
x = x.requires_grad_()
y = y.requires_grad_()
# z is recomputed from the tracked inputs, so it now carries enough
# information to compute gradients
z = x + y
print(z.grad_fn)
# If any input to an operation has ``requires_grad=True``, so will the output
print(z.requires_grad)

# Now z has the computation history that relates itself to x and y
# Can we just take its values, and **detach** it from its history?
new_z = z.detach()

# ... does new_z have information to backprop to x and y?
# NO!
print(new_z.grad_fn)
# And how could it? ``z.detach()`` returns a tensor that shares the same storage
# as ``z``, but with the computation history forgotten. It doesn't know anything
# about how it was computed.
# In essence, we have broken the Tensor away from its past history

False False
None
<AddBackward0 object at 0x103e66390>
True
None


In [10]:
# x requires grad, so a result computed from it does too.
print(x.requires_grad)
print((x ** 2).requires_grad)

# Inside torch.no_grad() autograd does not record history, so the result of
# the same operation reports requires_grad=False.
with torch.no_grad():
    print((x ** 2).requires_grad)

True
True
False


## 二、使用 pytorch 进行深度学习

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x10bcf9dd0>

In [12]:
lin = nn.Linear(5, 3)  # maps from R^5 to R^3, parameters A, b
# data is 2x5.  A maps from 5 to 3... can we map "data" under A?
data = torch.randn(2, 5)
print(lin(data))  # yes: each of the 2 rows is mapped, giving a 2x3 tensor

tensor([[ 0.1755, -0.3268, -0.5069],
        [-0.6602,  0.2260,  0.1089]], grad_fn=<AddmmBackward>)


In [13]:
# In PyTorch, most non-linearities are in torch.nn.functional (imported as F).
# Note that non-linearities typically don't have parameters like affine maps do.
# That is, they don't have weights that are updated during training.
data = torch.randn(2, 2)
print(data)
# ReLU replaces negative entries with 0 and keeps positive entries unchanged.
print(F.relu(data))

tensor([[-0.5404, -2.2102],
        [ 2.1130, -0.0040]])
tensor([[0.0000, 0.0000],
        [2.1130, 0.0000]])


In [14]:
# Softmax is also in torch.nn.functional
data = torch.randn(5)
print(data)
# dim=0 normalises over the only dimension of this 1-D tensor.
print(F.softmax(data, dim=0))
print(F.softmax(data, dim=0).sum())  # Sums to 1 because it is a distribution!
print(F.log_softmax(data, dim=0))  # there's also log_softmax: the log of the softmax values

tensor([ 1.3800, -1.3505,  0.3455,  0.5046,  1.8213])
tensor([0.2948, 0.0192, 0.1048, 0.1228, 0.4584])
tensor(1.)
tensor([-1.2214, -3.9519, -2.2560, -2.0969, -0.7801])


### 1、逻辑回归词袋分类器

网络输出：$\log \text{Softmax}(Ax+b)$

#### （1）数据预处理

In [15]:
# Bag-of-words (BoW) features: a sentence is represented by the vector
# x = [Count(word_0), Count(word_1), ..., Count(word_{V-1})] over a vocabulary
# of V words.
# The toy task below is to decide whether a sentence is Spanish or English.
data = [
    ("me gusta comer en la cafeteria".split(), "SPANISH"),
    ("Give it to me".split(), "ENGLISH"),
    ("No creo que sea una buena idea".split(), "SPANISH"),
    ("No it is not a good idea to get lost at sea".split(), "ENGLISH"),
]

test_data = [
    ("Yo creo que si".split(), "SPANISH"),
    ("it is lost on me".split(), "ENGLISH"),
]

# word_to_ix maps each word in the vocab (training and test sentences alike)
# to a unique integer, which is its index into the bag-of-words vector.
# Indices follow first appearance: the first new word gets 0, the next 1, ...
word_to_ix = {}
for sent, _ in data + test_data:
    for word in sent:
        # setdefault leaves already-seen words untouched and gives an unseen
        # word the next free index (the current vocabulary size).
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}


#### （2）建立逻辑回归模型

In [16]:
VOCAB_SIZE = len(word_to_ix) # length of a bag-of-words vector: one slot per vocabulary word
NUM_LABELS = 2 # number of classes (SPANISH and ENGLISH)

# Build the model
class BoWClassifier(nn.Module):
    """Logistic-regression classifier over bag-of-words vectors.

    The network output is ``log_softmax(A x + b)``, where ``x`` is a
    bag-of-words count vector and ``A``, ``b`` are the weight and bias of a
    single linear layer.
    """

    def __init__(self, num_labels, vocab_size):
        """Create the affine map from vocabulary space to label space.

        Args:
            num_labels: Number of output classes.
            vocab_size: Length of the input bag-of-words vector.
        """
        # nn.Module has to be initialised before sub-modules are attached.
        super().__init__()

        # The only trainable parameters are A and b of the affine map.  Its
        # input has one count per vocabulary word (vocab_size) and its output
        # one score per label (num_labels).  The log-softmax applied in
        # forward() has no parameters, so nothing is declared for it here.
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        """Return the log-probability of every label for ``bow_vec``.

        Args:
            bow_vec: Tensor of bag-of-words counts whose last dimension is
                ``vocab_size``.  It needs a leading batch dimension because
                normalisation runs over dim 1; callers in this file pass
                shape ``(1, vocab_size)``.

        Returns:
            Tensor of log-probabilities (values <= 0) with one entry per
            label along dim 1; exponentiating a row gives probabilities that
            sum to 1.
        """
        scores = self.linear(bow_vec)
        # log_softmax, not softmax: the outputs are logs of probabilities,
        # which is the input format expected by nn.NLLLoss.
        return F.log_softmax(scores, dim=1)


def make_bow_vector(sentence, word_to_ix):
    """Turn a tokenised sentence into a bag-of-words count vector.

    Args:
        sentence: Iterable of words; every word must be a key of
            ``word_to_ix``.
        word_to_ix: Mapping from word to its index in the vector.

    Returns:
        Float tensor of shape ``(1, len(word_to_ix))`` whose entry
        ``[0, i]`` is the number of times the word with index ``i`` occurs
        in ``sentence``.

    Raises:
        KeyError: If a word is missing from ``word_to_ix``.
    """
    counts = torch.zeros(len(word_to_ix))
    for token in sentence:
        slot = word_to_ix[token]
        counts[slot] += 1
    # Add the leading batch dimension expected by the classifier.
    return counts.unsqueeze(0)


def make_target(label, label_to_ix):
    """Wrap the integer index of ``label`` in a 1-element int64 tensor.

    Args:
        label: Class label, a key of ``label_to_ix``.
        label_to_ix: Mapping from label to integer class index.

    Returns:
        Tensor of shape ``(1,)`` and dtype ``torch.long`` holding the class
        index, the target format used with ``nn.NLLLoss`` in this file.

    Raises:
        KeyError: If ``label`` is missing from ``label_to_ix``.
    """
    # torch.tensor(..., dtype=torch.long) replaces the legacy
    # torch.LongTensor constructor and matches the other cells of this file.
    return torch.tensor([label_to_ix[label]], dtype=torch.long)


model = BoWClassifier(NUM_LABELS, VOCAB_SIZE) # instantiate the classifier: VOCAB_SIZE inputs, NUM_LABELS outputs

In [17]:
# The model knows its parameters.  The first output below is A, the second is b.
# Assigning a module to an attribute in __init__, as was done with
#     self.linear = nn.Linear(...)
# registers it with the enclosing nn.Module, so the module
# (in this case, BoWClassifier) also reports the nn.Linear's parameters
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[ 0.1194,  0.0609, -0.1268,  0.1274,  0.1191,  0.1739, -0.1099, -0.0323,
         -0.0038,  0.0286, -0.1488, -0.1392,  0.1067, -0.0460,  0.0958,  0.0112,
          0.0644,  0.0431,  0.0713,  0.0972, -0.1816,  0.0987, -0.1379, -0.1480,
          0.0119, -0.0334],
        [ 0.1152, -0.1136, -0.1743,  0.1427, -0.0291,  0.1103,  0.0630, -0.1471,
          0.0394,  0.0471, -0.1313, -0.0931,  0.0669,  0.0351, -0.0834, -0.0594,
          0.1796, -0.0363,  0.1106,  0.0849, -0.1268, -0.1668,  0.1882,  0.0102,
          0.1344,  0.0406]], requires_grad=True)
Parameter containing:
tensor([0.0631, 0.1465], requires_grad=True)


In [18]:
# To run the model, pass in a BoW vector
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad(): 
    sample = data[0] # first training pair: (list of words, label)
    bow_vector = make_bow_vector(sample[0], word_to_ix) # turn the sentence into a bag-of-words vector
    log_probs = model(bow_vector) # log-probability of each class
    print(log_probs)

tensor([[-0.5378, -0.8771]])


In [19]:
# Which of the values above is the log-probability of ENGLISH and which of
# SPANISH? We never defined that, but a mapping has to be fixed before the
# model can be trained.
label_to_ix = {"SPANISH": 0, "ENGLISH": 1}

In [20]:
# Run on test data before we train, just to see a before-and-after
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix) # turn the test sentence into a bag-of-words vector
        log_probs = model(bow_vec) # log-probability of each class
        print(log_probs)

tensor([[-0.9297, -0.5020]])
tensor([[-0.6388, -0.7506]])


In [21]:
# Print the matrix column corresponding to "creo".
# next(model.parameters()) is the first parameter, the weight matrix A; its
# columns are indexed by word and its rows by label.
print(next(model.parameters())[:, word_to_ix["creo"]])

tensor([-0.1488, -0.1313], grad_fn=<SelectBackward>)


#### （3）训练模型

In [22]:
# Negative log-likelihood loss; it is fed the log-probabilities produced by
# the model's log_softmax output.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Usually you want to pass over the training data several times.
# 100 epochs is much more than you would use on a real data set, but real
# datasets have far more than the four instances used here.  Usually,
# somewhere between 5 and 30 epochs is reasonable.
for epoch in range(100):
    for instance, label in data:
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Make our BOW vector and also we must wrap the target in a
        # Tensor as an integer. For example, if the target is SPANISH, then
        # we wrap the integer 0. The loss function then knows that the 0th
        # element of the log probabilities is the log probability
        # corresponding to SPANISH
        bow_vec = make_bow_vector(instance, word_to_ix)
        target = make_target(label, label_to_ix)

        # Step 3. Run our forward pass.
        log_probs = model(bow_vec)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

In [23]:
# Run the trained model on the test sentences; no gradients are needed for
# evaluation, hence torch.no_grad().
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix)
        log_probs = model(bow_vec)
        print(log_probs)

tensor([[-0.2093, -1.6669]])
tensor([[-2.5330, -0.0828]])


我们得到了正确的答案！您可以看到，在第一个示例中，西班牙语的对数概率要高得多，而在第二个示例中，英语的对数概率要高得多，这应该是正确的。

In [24]:
# Weight column for "creo" after training: the entry for Spanish (index 0)
# goes up, the entry for English (index 1) goes down!
print(next(model.parameters())[:, word_to_ix["creo"]])

tensor([ 0.2803, -0.5605], grad_fn=<SelectBackward>)


### 2、N-Gram 语言建模

### 词嵌入
- One-hot 编码基本将所有单词视为独立的个体
- 我们想要利用语义相似性

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x10bcf9dd0>

In [26]:
word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings
# Embeddings are looked up by integer index, passed as an int64 tensor.
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)  # one 5-dimensional row per index looked up
print(hello_embed)

tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519]],
       grad_fn=<EmbeddingBackward>)


#### （1）数据预处理

In [27]:
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# A real pipeline would tokenize the text; splitting on whitespace keeps the
# punctuation attached to the words, which is good enough for this demo.
# Build a list of tuples ([word_i, word_i+1], word_i+2): two context words
# followed by the word to predict.  Zipping the sequence with itself shifted
# by one and by two positions yields every run of three consecutive words.
trigrams = [
    ([first, second], target)
    for first, second, target in zip(
        test_sentence, test_sentence[1:], test_sentence[2:]
    )
]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]


In [31]:
# The vocabulary is the set of distinct whitespace-separated tokens.
vocab = set(test_sentence)
# Pair every word with a running index.
# NOTE: the iteration order of a set of strings is not guaranteed to be the
# same from one interpreter run to the next, so the word -> index assignment
# printed below may differ between runs.
word_to_ix = dict(zip(vocab, range(len(vocab))))
print(word_to_ix)

{'deep': 0, 'weed': 1, 'treasure': 2, "totter'd": 3, 'make': 4, 'lusty': 5, 'more': 6, 'When': 7, 'answer': 8, 'by': 9, 'To': 10, 'proud': 11, 'shame,': 12, 'all': 13, 'trenches': 14, 'old,': 15, 'Where': 16, 'use,': 17, 'asked,': 18, 'a': 19, 'sunken': 20, 'to': 21, 'the': 22, 'made': 23, 'If': 24, 'and': 25, 'new': 26, 'own': 27, 'gazed': 28, 'on': 29, 'thine!': 30, 'within': 31, "youth's": 32, 'warm': 33, 'child': 34, 'small': 35, 'This': 36, 'where': 37, 'blood': 38, 'now,': 39, 'praise': 40, 'thine': 41, 'besiege': 42, "deserv'd": 43, 'see': 44, 'of': 45, 'forty': 46, 'praise.': 47, 'Proving': 48, 'thou': 49, 'so': 50, 'shall': 51, "feel'st": 52, 'be': 53, 'in': 54, 'sum': 55, 'his': 56, 'it': 57, 'all-eating': 58, 'days;': 59, 'much': 60, 'thriftless': 61, "'This": 62, 'held:': 63, 'Shall': 64, 'brow,': 65, 'say,': 66, "excuse,'": 67, 'succession': 68, "beauty's": 69, 'field,': 70, 'beauty': 71, 'cold.': 72, 'worth': 73, 'my': 74, 'old': 75, 'couldst': 76, 'dig': 77, 'being': 78,

#### （2）模型构建

In [32]:
CONTEXT_SIZE = 2  # number of context words fed to the model (the trigrams carry two)
EMBEDDING_DIM = 10  # size of each word-embedding vector

class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The context words are embedded, their embeddings are concatenated and
    passed through one hidden ReLU layer, and a final linear layer followed
    by log-softmax scores every vocabulary word as the next word.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: Number of distinct words; also the output size.
            embedding_dim: Size of each word-embedding vector.
            context_size: Number of context words given to ``forward``.
            hidden_dim: Width of the hidden layer.  Defaults to 128, the
                value that was previously hard-coded.
        """
        super().__init__()
        # Embedding layer: one embedding_dim-sized row per vocabulary word.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Hidden layer over the concatenated context embeddings.
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        # Output layer: one score per vocabulary word.
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of the next word.

        Args:
            inputs: Integer tensor holding the ``context_size`` word indices
                of one context.

        Returns:
            Tensor of shape ``(1, vocab_size)`` with the log-probability of
            each vocabulary word.
        """
        # Look up the embeddings and flatten them into a single row.
        embeds = self.embeddings(inputs).view((1, -1))
        # Hidden layer with ReLU activation.
        out = F.relu(self.linear1(embeds))
        # Output layer, normalised to log-probabilities over the vocabulary.
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


# One output score per distinct word of the sonnet.
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

#### （3）模型训练

In [33]:
losses = []  # total training loss of each epoch
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the parameters
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

[518.2574729919434, 515.7466995716095, 513.2517650127411, 510.77280926704407, 508.30748105049133, 505.8540885448456, 503.41428542137146, 500.9863142967224, 498.56913900375366, 496.16119599342346]


## 三、序列模型和长短期记忆网络（LSTM）

In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x10bcf9dd0>

In [36]:
# Each element is a 1x3 tensor: one time step with 3 features.
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5
inputs

[tensor([[ 0.2673, -0.4212, -0.5107]]),
 tensor([[-1.5727, -0.1232,  3.5870]]),
 tensor([[-1.8313,  1.5987, -1.2770]]),
 tensor([[ 0.3255, -0.4791,  1.3790]]),
 tensor([[ 2.5286,  0.4107, -0.9880]])]

In [37]:
# Initialize the hidden state as a pair of random 1x1x3 tensors.
# nn.LSTM takes this tuple as (h_0, c_0): the initial hidden state and the
# initial cell state.
hidden = (torch.randn(1, 1, 3),
          torch.randn(1, 1, 3))
hidden

(tensor([[[-0.9081,  0.5423,  0.1103]]]),
 tensor([[[-2.2590,  0.6067, -0.1383]]]))

In [42]:
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
for i in inputs:
    # Step through the sequence one element at a time.
    # i.view(1, 1, -1) turns the 1x3 step into a 1x1x3 tensor, i.e. a
    # sequence of length 1 with batch size 1.
    # after each step, hidden contains the hidden state, which is fed back in
    # on the next iteration.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
    print('out=')
    print(out)
    print('hidden=')
    print(hidden)
    print('\n')

out=
tensor([[[-0.0150,  0.0519,  0.0517]]], grad_fn=<StackBackward>)
hidden=
(tensor([[[-0.0150,  0.0519,  0.0517]]], grad_fn=<StackBackward>), tensor([[[-0.0260,  0.1683,  0.0925]]], grad_fn=<StackBackward>))


out=
tensor([[[-0.1650, -0.0318,  0.0311]]], grad_fn=<StackBackward>)
hidden=
(tensor([[[-0.1650, -0.0318,  0.0311]]], grad_fn=<StackBackward>), tensor([[[-0.8559, -0.0422,  0.1107]]], grad_fn=<StackBackward>))


out=
tensor([[[-0.1564,  0.2177, -0.0997]]], grad_fn=<StackBackward>)
hidden=
(tensor([[[-0.1564,  0.2177, -0.0997]]], grad_fn=<StackBackward>), tensor([[[-0.2084,  0.3100, -0.1684]]], grad_fn=<StackBackward>))


out=
tensor([[[-0.2287,  0.0275,  0.0541]]], grad_fn=<StackBackward>)
hidden=
(tensor([[[-0.2287,  0.0275,  0.0541]]], grad_fn=<StackBackward>), tensor([[[-0.6237,  0.0727,  0.1046]]], grad_fn=<StackBackward>))


out=
tensor([[[-0.1561,  0.0370,  0.3259]]], grad_fn=<StackBackward>)
hidden=
(tensor([[[-0.1561,  0.0370,  0.3259]]], grad_fn=<StackBackward>), ten

In [43]:
# Alternatively, we can do the entire sequence all at once.
# The first value returned by the LSTM is all of the hidden states throughout
# the sequence. The second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same).
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument to the lstm at a later time.
# The view() call adds the extra 2nd dimension, turning the five 1x3 steps
# into a single 5x1x3 tensor.
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

tensor([[[ 0.4252, -0.0237,  0.2770]],

        [[-0.1433, -0.1000,  0.0961]],

        [[-0.1402,  0.1732,  0.0462]],

        [[-0.2264,  0.0123,  0.1304]],

        [[-0.1573,  0.0320,  0.3576]]], grad_fn=<StackBackward>)
(tensor([[[-0.1573,  0.0320,  0.3576]]], grad_fn=<StackBackward>), tensor([[[-0.2055,  0.1972,  0.4843]]], grad_fn=<StackBackward>))


### 示例：用于词性标记的 LSTM

#### （1）数据准备

In [45]:
# Each training example pairs a tokenised sentence with one
# part-of-speech tag per word.
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
# Index the vocabulary in order of first appearance.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        # Only unseen words are inserted; each gets the next free index.
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [46]:
# Tag -> class index for the three tags used in training_data.
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
def prepare_sequence(seq, to_ix):
    """Convert a sequence of symbols into a tensor of their indices.

    Args:
        seq: Iterable of words or tags; each must be a key of ``to_ix``.
        to_ix: Mapping from symbol to integer index.

    Returns:
        1-D ``torch.long`` tensor with one index per element of ``seq``.

    Raises:
        KeyError: If an element of ``seq`` is missing from ``to_ix``.
    """
    return torch.tensor([to_ix[token] for token in seq], dtype=torch.long)

#### （2）创建模型

In [48]:
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6  # size of each word-embedding vector
HIDDEN_DIM = 6  # size of the LSTM hidden state

class LSTMTagger(nn.Module):
    """LSTM part-of-speech tagger.

    Word indices are embedded, run through an LSTM as a single sequence with
    batch size 1, and every hidden state is projected to tag scores that are
    normalised with log-softmax.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Build the embedding table, the LSTM and the output projection.

        Args:
            embedding_dim: Size of each word-embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            vocab_size: Number of distinct words.
            tagset_size: Number of distinct tags.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        # Embedding layer: word index -> embedding_dim-sized vector.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The LSTM consumes word embeddings and emits hidden states of
        # dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # Linear map from hidden-state space to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return per-word log-probabilities over the tags.

        Args:
            sentence: Tensor of word indices, one per word.

        Returns:
            Tensor with one row per word and one column per tag; entry
            ``[i, j]`` is the log-probability of tag ``j`` for word ``i``.
        """
        seq_len = len(sentence)
        embedded = self.word_embeddings(sentence)
        # Insert a batch dimension of size 1: (seq_len, 1, embedding_dim).
        hidden_states, _ = self.lstm(embedded.view(seq_len, 1, -1))
        # Drop the batch dimension again before projecting to tag scores.
        tag_logits = self.hidden2tag(hidden_states.view(seq_len, -1))
        return F.log_softmax(tag_logits, dim=1)

    
# Size the embedding table and the output layer from the two index mappings.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), 
                   len(tag_to_ix))

#### （3）训练模型

In [49]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)# tensor of word indices
    tag_scores = model(inputs)
    print(tag_scores)

# Start training
for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)# word indices
        targets = prepare_sequence(tags, tag_to_ix)# tag indices (the labels)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)# forward pass

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-1.3573, -1.1695, -0.8391],
        [-1.2872, -1.1532, -0.8956],
        [-1.2530, -1.3007, -0.8164],
        [-1.1299, -1.3814, -0.8540],
        [-1.2190, -1.3878, -0.7878]])
tensor([[-0.0203, -4.5639, -4.6403],
        [-4.1408, -0.0284, -4.4142],
        [-3.8771, -3.5853, -0.0497],
        [-0.0273, -4.3892, -4.2355],
        [-4.5757, -0.0218, -4.4850]])
