In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

### 张量维度

In [2]:
attn=torch.randn(size=(4,5))
mask=torch.ones_like(attn)
mask[1,:]=0
mask[3,:]=0
res=attn.masked_fill(mask == 0, -1e9) 
attn,res

(tensor([[-1.1723,  0.6560, -0.0198, -0.1243,  1.2857],
         [ 1.6174, -1.2217,  0.2966, -0.5486, -1.2536],
         [-0.5397,  1.2439,  1.0251,  0.1083,  1.0483],
         [-2.4084,  0.0065,  0.3842,  1.2833,  0.7698]]),
 tensor([[-1.1723e+00,  6.5604e-01, -1.9832e-02, -1.2426e-01,  1.2857e+00],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
         [-5.3971e-01,  1.2439e+00,  1.0251e+00,  1.0834e-01,  1.0483e+00],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]]))

In [3]:
a=torch.randn(size=(4,5,3))
a[:, :a.size(1)].clone().detach().shape,a[:].shape

(torch.Size([4, 5, 3]), torch.Size([4, 5, 3]))

## nn.embedding 模块

nn.Embedding是PyTorch中的一个常用模块，其主要作用是将输入的整数序列转换为密集向量表示。在自然语言处理（NLP）任务中，可以将每个单词表示成一个向量，从而方便进行下一步的计算和处理。

embeddings中的值是正态分布N(0,1)中随机取值。

In [None]:

# torch.nn.Embedding(num_embeddings, 字典中词的个数
#                    embedding_dim, embedding的维度
#                    padding_idx=None, 索引指定填充：如果给定，则遇到padding_idx中的索引，则将其位置填0（0是默认值，事实上随便填充什么值都可以）。
#                    max_norm=None, 
#                    norm_type=2.0, 
#                    scale_grad_by_freq=False, 
#                    sparse=False, 
#                    _weight=None, 
#                    _freeze=False, 
#                    device=None, 
#                    dtype=None)

比如有两个句子：

I want a plane

I want to travel to Beijing

将两个句子转化为ID映射：

{I：1，want：2，a：3，plane：4，to：5，travel：6，Beijing：7}

转化成ID表示的两个句子如下：

* 1,2,3,4
* 1,2,5,6,5,7


In [16]:
import torch
from torch import nn
 
# 创建最大词个数为10，每个词用维度为4表示
embedding = nn.Embedding(100, 4,padding_idx=1)
 
# 将第一个句子填充0，与第二个句子长度对齐
in_vector = torch.LongTensor([[1, 2, 3, 4, 0, 0], [1, 2, 5, 6, 5, 7]])
out_emb = embedding(in_vector)
print(in_vector.shape)
print((out_emb.shape))
print(out_emb)
print(embedding.weight.shape) #可以看到这个维度正式和之前定义的一致，也就是对于每一个词表中的词，都会建立一个长度固定的向量

torch.Size([2, 6])
torch.Size([2, 6, 4])
tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [-3.7617e-01,  1.4583e+00,  2.7081e-01,  1.5754e+00],
         [ 9.7307e-01,  1.5120e-01, -1.8328e+00,  3.0220e-01],
         [ 1.3211e-02,  1.2444e+00,  1.6113e-03,  5.7591e-01],
         [ 3.8019e-01,  1.1239e+00, -2.4857e+00,  8.9475e-01],
         [ 3.8019e-01,  1.1239e+00, -2.4857e+00,  8.9475e-01]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [-3.7617e-01,  1.4583e+00,  2.7081e-01,  1.5754e+00],
         [-6.1391e-01,  8.6846e-01,  1.0594e+00, -1.5858e+00],
         [ 9.3353e-01, -2.6515e-01,  1.2069e-03, -3.6509e-02],
         [-6.1391e-01,  8.6846e-01,  1.0594e+00, -1.5858e+00],
         [ 2.1252e+00, -5.1110e-01, -4.5761e-01, -6.5869e-01]]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([100, 4])


## nn.embedding的可学习性

In [26]:
import torch
from torch import nn
import copy
 
# 创建最大词个数为10，每个词用维度为4表示
embedding = nn.Embedding(10, 4)
# old_weight=embedding.weight.data.clone() # 这里需要注意。简单的等号赋值，只是浅拷贝，未做到复制的效果
old_weight=copy.deepcopy(embedding.weight.data)
# print(embedding.weight)
 
# 将第一个句子填充0，与第二个句子长度对齐
in_vector = torch.LongTensor([[1, 2, 3, 4, 0, 0], [1, 2, 5, 6, 5, 7]])
 
optimizer = torch.optim.SGD(embedding.parameters(), lr=0.01)
criteria = nn.MSELoss()
 
for i in range(1000):
    outputs = embedding(torch.LongTensor([1, 2, 3, 4]))
    loss = criteria(outputs, torch.ones(4, 4))
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
 
# print(embedding.weight)
print(old_weight==embedding.weight.data) # 前后的比较
new_output = embedding(in_vector)
print(new_output)

tensor([[ True,  True,  True,  True],
        [False, False, False, False],
        [False, False, False, False],
        [False, False, False, False],
        [False, False, False, False],
        [ True,  True,  True,  True],
        [ True,  True,  True,  True],
        [ True,  True,  True,  True],
        [ True,  True,  True,  True],
        [ True,  True,  True,  True]])
tensor([[[ 0.2158,  0.7156,  0.4839,  0.7790],
         [ 0.8220,  0.7616,  0.5950,  1.0292],
         [ 0.4477,  0.9220,  0.8966,  1.0344],
         [ 0.8997,  0.3897,  0.4733,  0.9383],
         [-0.7471,  0.0598,  1.2257,  0.0170],
         [-0.7471,  0.0598,  1.2257,  0.0170]],

        [[ 0.2158,  0.7156,  0.4839,  0.7790],
         [ 0.8220,  0.7616,  0.5950,  1.0292],
         [ 0.4900,  1.7632, -0.0985, -3.3767],
         [-1.6483, -0.4112,  0.5477, -0.3571],
         [ 0.4900,  1.7632, -0.0985, -3.3767],
         [-0.4942,  0.3563,  2.1549,  0.0706]]], grad_fn=<EmbeddingBackward0>)


可以发现embedding是可以通过梯度的反向传播进行学习的

## 掩码处理

In [27]:
def get_pad_mask(seq, pad_idx):
    # (batch, seqlen) -> (batch, 1, seqlen) 
    return (seq != pad_idx).unsqueeze(-2)

In [30]:
def get_subsequent_mask(seq):
    ''' For masking out the subsequent info. '''
    sz_b, len_s = seq.size()
    subsequent_mask = (1 - torch.triu(
        torch.ones((1, len_s, len_s), device=seq.device), diagonal=1)).bool()
    return subsequent_mask

In [28]:
import torch
a=torch.tensor([[1,2,3,4,0,0],[1,2,5,6,5,7]])
b=get_pad_mask(a,0)
print(b) #若和pad_index一致，则为false

tensor([[[ True,  True,  True,  True, False, False]],

        [[ True,  True,  True,  True,  True,  True]]])


In [33]:
a=torch.tensor([[1,2,3,4,0,0],[1,2,5,6,5,7]])
c=get_subsequent_mask(a)
print(c)
c.shape

tensor([[[ True, False, False, False, False, False],
         [ True,  True, False, False, False, False],
         [ True,  True,  True, False, False, False],
         [ True,  True,  True,  True, False, False],
         [ True,  True,  True,  True,  True, False],
         [ True,  True,  True,  True,  True,  True]]])


torch.Size([1, 6, 6])