In [1]:
import torch

In [2]:
from torch import nn
from torch.autograd import Variable

# 输入层

## embedding 

作用：将输入空间的一个值映射成特征空间中的高维向量，希望在高维空间捕捉词汇间的关系

In [3]:
import torch
from torch import nn

embd = nn.Embedding(10, 3)  # 输入空间10个元素，每个都映射成3维向量
input = torch.tensor([1,2,3,4,5])
embd(input)

tensor([[-1.1120,  0.2073,  0.7566],
        [-0.3458,  1.8638,  0.7209],
        [-0.1936, -0.1341,  1.1532],
        [-0.3584,  1.1800,  0.1187],
        [ 0.1320,  0.5332,  0.3781]], grad_fn=<EmbeddingBackward0>)

In [4]:
embd = nn.Embedding(10, 3, padding_idx=3)  # 将指定元素映射成 0 
input = torch.tensor([1,2,3,4,5])
embd(input)

tensor([[-0.0313,  1.5260, -0.8182],
        [-0.7808, -0.3657,  0.2140],
        [ 0.0000,  0.0000,  0.0000],
        [ 1.3338,  0.8547,  1.0599],
        [ 0.1208, -0.5930,  0.7166]], grad_fn=<EmbeddingBackward0>)

### embedding类

In [5]:
import math
from torch.autograd import Variable

class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        # d_model: 词嵌入的维度
        # vocab: 词表的大小
        super(Embeddings, self).__init__()
        # 定义Embdding层
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model
        
    def forward(self, x):
        # x为输入进模型的文本通过词汇映射后的数字张量
        return self.lut(x) * math.sqrt(self.d_model)


In [6]:
d_model = 3
vocab = 10
x = Variable(torch.LongTensor([[1,2], [3,4]]))
emb = Embeddings(d_model, vocab)
emb(x)

tensor([[[-0.7681,  0.6057,  1.6045],
         [ 0.0447,  1.5964,  2.6488]],

        [[ 0.1470,  0.6834,  3.0567],
         [-2.6747,  0.7002, -1.4043]]], grad_fn=<MulBackward0>)

## Position Encoding

<a>https://zhuanlan.zhihu.com/p/106644634</a>

dropout: 丢弃一部分值

In [7]:
from torch import nn

In [8]:
m = nn.Dropout(p=1)
input = torch.randn(4, 5)
print(input, "\n", m(input))

tensor([[ 0.8570,  0.1192, -0.9470,  0.2433, -1.0871],
        [-0.1387,  0.7731, -2.3681, -0.8907, -0.7810],
        [-0.5403,  1.0688, -0.0794, -0.1415, -0.2017],
        [ 0.2693,  0.0621, -0.1020, -0.6094, -1.0652]]) 
 tensor([[0., 0., -0., 0., -0.],
        [-0., 0., -0., -0., -0.],
        [-0., 0., -0., -0., -0.],
        [0., 0., -0., -0., -0.]])


unsquezze: 张量升维

In [9]:
import torch

In [10]:
x = torch.tensor([1, 2, 3, 4])
x.shape

torch.Size([4])

In [11]:
y = torch.unsqueeze(x, 0)
print(y, y.shape)

tensor([[1, 2, 3, 4]]) torch.Size([1, 4])


In [12]:
z = torch.unsqueeze(x, 1)
print(z, z.shape)

tensor([[1],
        [2],
        [3],
        [4]]) torch.Size([4, 1])


### Positional Encoding 类

给定一个长度为n的输入序列，让t表示词在序列中的位置，$\vec{p_t} \in R^d$ 表示t位置对应的向量，d是向量的纬度。  
$f:N \rightarrow R^d$ 是生成位置向量 $\vec{p_t}$ 的函数，定义如下：
$$\vec{p_t}^{(i)} = f(t)^{i} := 
\begin{cases}
sin(\omega_k \cdot t) & i = 2k \\
cos(\omega_k \cdot t) & i = 2k + 1
\end{cases}
$$
其中，频率w_k 定义如下：
$$\omega_k = \frac{1}{10000^{2k/d}}$$

In [13]:
from torch import unsqueeze
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        # d_model 词嵌入的纬度
        # dropout 置零比率
        # max_len 句子最大长度
        super(PositionalEncoding, self).__init__()
        
        # 实例化 dropout
        self.dropout = nn.Dropout(p=dropout)
        
        # 初始化一个位置编码矩阵， 大小是 maxlen * d_model
        pe = torch.zeros(max_len, d_model)
        
        # 初始化一个绝对位置矩阵 maxlen * 1
        position = torch.arange(0, max_len).unsqueeze(1)

        # 定义一个变化矩阵div_term, 跳跃式的初始化
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))

        # 将定义的变化矩阵进行奇数，偶数的分别赋值
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        # 增维
        pe = pe.unsqueeze(0)
        
        # 注册成bufer
        self.register_buffer("pe", pe)
        
    def forward(self, x):
        # x 代表文本序列的词嵌入表示（即张量表示）
        # 将 pe 截取到输入序列的长度
        # print(x.shape, self.pe[:, :x.size(1)].shape, self.pe.shape)
        x = x + Variable(self.pe[:, :x.size(1)], requires_grad = False)  # 不参与训练更新 
        return self.dropout(x)  # 返回携带了位置编码并丢弃一部分值的结果
    

In [14]:
d_model = 4  # 要是 2 的倍数
vocab = 10
dropout = 0.1
maxlen = 60
x = Variable(torch.LongTensor([[1,2,3], [1,3,5]]))
emb = Embeddings(d_model, vocab)
embx = emb(x)
pe = PositionalEncoding(d_model, dropout, maxlen)
pex = pe(embx)
print("origin:", x, x.shape)
print("emb:", embx, embx.shape)
print("pe:", pex, pex.shape)

origin: tensor([[1, 2, 3],
        [1, 3, 5]]) torch.Size([2, 3])
emb: tensor([[[-0.3166, -1.3563, -1.5514, -3.4348],
         [ 1.4588, -1.6432, -1.2367, -3.3309],
         [ 0.5503,  0.2891,  0.7986, -0.1928]],

        [[-0.3166, -1.3563, -1.5514, -3.4348],
         [ 0.5503,  0.2891,  0.7986, -0.1928],
         [-0.3156,  0.3774, -0.2601,  0.7235]]], grad_fn=<MulBackward0>) torch.Size([2, 3, 4])
pe: tensor([[[-0.3518, -0.3959, -1.7237, -0.0000],
         [ 2.5559, -1.2254, -1.3630, -2.5900],
         [ 1.6217, -0.1412,  0.9095,  0.8966]],

        [[-0.3518, -0.3959, -0.0000, -2.7053],
         [ 1.5464,  0.9216,  0.8984,  0.8968],
         [ 0.6597, -0.0430, -0.2668,  1.9147]]], grad_fn=<MulBackward0>) torch.Size([2, 3, 4])


### PE可视化

# 编码器

## 掩码张量

掩盖住输入的一部分信息使得模型不违反因果

In [15]:
import numpy as np

m = [[1,2,3], [4,5,6], [7,8,9]]
np.triu(m, 1)

array([[0, 2, 3],
       [0, 0, 6],
       [0, 0, 0]])

### 构建掩码张量的函数

In [16]:
def subsequent_mask(size):
    # size 代表掩码张量的后两个纬度，形成一个方阵
    attn_shape = (1, size, size)
    
    # 构建上三角矩阵
    subsequent_mask = np.triu(np.ones(attn_shape), k = 1).astype("uint8")  # 节省空间
    return torch.from_numpy(1 - subsequent_mask)  # 反转返回

In [17]:
size = 5
subsequent_mask(size)

tensor([[[1, 0, 0, 0, 0],
         [1, 1, 0, 0, 0],
         [1, 1, 1, 0, 0],
         [1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1]]], dtype=torch.uint8)

其中 0 代表被遮掩 1 代表没有被遮掩，可以看到随着处理的词汇增加，可以利用的词汇也增加，不会出现因果问题

In [18]:
# import matplotlib.pyplot as plt
# plt.figure(figsize=(5, 5))
# plt.imshow(subsequent_mask(20)[0])

## 注意力机制

Q K V 的比喻解释：  
假设现在需要对一段文本进行描述，为方便这个过程的进行，给出一些关键词，则文本可以看作query， 提示的关键词可看作key，最终需要我们得出的文本的描述信息被称为value，最初我们只知道这些关键词，即此时value和key基本相同，但随着我们对问题的深入理解，value开始逐渐变化，最终完成任务，得到文本的描述，这个过程就是注意力作用的过程。

一般情况下最初key与value相同但与query不同，即一般的注意力输入形式。但有一种特殊的注意力机制，此处query = key = value，即自注意力机制，比喻来说就是需要从给定的文本中抽取关键词来描述它，相当于对文本自身做了一次特征提取。

$$Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V$$

<div align="center">
<img src=./imgs/attention.png width=30% />
</div>

### 1. mask fill

In [19]:
x = Variable(torch.randn(5, 5))
x

tensor([[-1.2285,  1.0086, -0.8663, -0.1387, -0.2359],
        [ 1.1106, -0.0918, -0.6523, -0.8730, -0.0174],
        [-0.2812, -0.5392, -0.9327, -0.1785,  0.6068],
        [ 1.7927, -0.1893, -2.2222, -0.9944, -2.1909],
        [ 0.3817,  1.1343, -0.1169, -0.2870, -0.3145]])

In [20]:
mask = Variable(torch.zeros(5, 5))
mask

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

In [21]:
x.masked_fill(mask == 0, -1e9)  # 0 的位置都替换成很小的数，达到mask的效果

tensor([[-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]])

### 2. attention

In [22]:
import torch.nn.functional as F

def attention(query, key, value, mask = None, dropout = None):
    # 此处dropout为一个dropout层对象
    
    d_k = query.size(-1)  # query的最后一个纬度，即特征向量的纬度
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)  # 计算attention
    # print(scores.size())
    if mask is not None: scores = scores.masked_fill(mask == 0, -1e9)  # 掩码
    p_attn = F.softmax(scores, dim = -1)  # 对最后一格纬度做softmax，即得到attention在特征空间各个分量的分数
    if dropout is not None: p_attn = dropout(p_attn)
    
    # 完成attention的计算
    return torch.matmul(p_attn, value), p_attn

In [23]:
d_model = 2  # 要是 2 的倍数
vocab = 10
dropout = 0.1
maxlen = 60
x = Variable(torch.LongTensor([[1,2], [1,5]]))
emb = Embeddings(d_model, vocab)
embx = emb(x)
pe = PositionalEncoding(d_model, dropout, maxlen)
pex = pe(embx)
print("origin:", x, x.shape)
print("emb:", embx, embx.shape)
print("pe:", pex, pex.shape)

origin: tensor([[1, 2],
        [1, 5]]) torch.Size([2, 2])
emb: tensor([[[ 1.7049,  2.7717],
         [-1.7638,  1.7419]],

        [[ 1.7049,  2.7717],
         [-1.9604,  0.2723]]], grad_fn=<MulBackward0>) torch.Size([2, 2, 2])
pe: tensor([[[1.8943, 4.1908],
         [-0.0000, 2.5358]],

        [[1.8943, 0.0000],
         [-0.0000, 0.9029]]], grad_fn=<MulBackward0>) torch.Size([2, 2, 2])


In [24]:
query = key = value = pex  # 文本嵌入位置编码之后的输入
mask = Variable(torch.zeros((2,2,2)))
attention(query, key, value,mask)

(tensor([[[0.9472, 3.3633],
          [0.9472, 3.3633]],
 
         [[0.9472, 0.4515],
          [0.9472, 0.4515]]], grad_fn=<UnsafeViewBackward0>),
 tensor([[[0.5000, 0.5000],
          [0.5000, 0.5000]],
 
         [[0.5000, 0.5000],
          [0.5000, 0.5000]]], grad_fn=<SoftmaxBackward0>))

transpose 的语法

In [25]:
a = torch.tensor([[[1,2,3], [1,2,3]], [[3,4,5], [3,4,5]]])
a.shape

torch.Size([2, 2, 3])

In [26]:
a.transpose(-2, -1)

tensor([[[1, 1],
         [2, 2],
         [3, 3]],

        [[3, 3],
         [4, 4],
         [5, 5]]])

第一个通道都是channel，即有多少个形状一样的矩阵，对后两个维度转置即为对高维矩阵中的每一个二维矩阵进行转置。

### 3. 多头注意力

利用多个注意力机制使得每个结构都注意到原始特征的不同部分，均衡同一种注意力机制可能产生的偏差。

<div align="center">
<img src=./imgs/muti-attention.png width=30% />
</div>

这里V K Q在计算attention之前都进行了全连接层的操作。

#### torch.view() 改变矩阵形状

In [27]:
x = torch.randn((4, 4))
x.shape

torch.Size([4, 4])

In [28]:
x.view(16)

tensor([-0.8144,  0.2960, -1.4826, -0.2169,  1.0038, -1.2349,  0.9982,  1.8104,
        -0.7132, -1.3654, -0.2447,  0.2305,  0.0745,  0.7838, -0.9695, -0.8112])

In [29]:
x.view(-1, 4)  # -1 自动匹配

tensor([[-0.8144,  0.2960, -1.4826, -0.2169],
        [ 1.0038, -1.2349,  0.9982,  1.8104],
        [-0.7132, -1.3654, -0.2447,  0.2305],
        [ 0.0745,  0.7838, -0.9695, -0.8112]])

#### transpose

In [30]:
x = torch.randn(1, 2, 3, 4)
x.shape
x

tensor([[[[-0.1405,  0.0080, -0.4701,  1.6622],
          [-1.0113, -1.1622,  0.3613,  0.8428],
          [-1.5466, -1.7818, -0.2423, -0.8024]],

         [[ 1.8207,  0.5783,  0.3131,  0.3011],
          [-0.9022,  0.5600,  0.8024,  1.3328],
          [ 1.1283,  1.4759,  0.2779,  0.4628]]]])

In [31]:
y = x.transpose(1,2)  # 第一第二纬度转置 
y.shape

torch.Size([1, 3, 2, 4])

In [32]:
z = x.view(1, -1, 2, 4)  # 二者结果并不一致
z.shape

torch.Size([1, 3, 2, 4])

#### 实现clone函数

多头注意力需要用到多个结构相同的注意力模块

In [33]:
import copy
from torch.nn import ModuleList

def clones(module, N):
    return ModuleList([copy.deepcopy(module) for _ in range(N)])

#### 实现多头注意力机制的类

In [34]:
class MultiHeadAttention(nn.Module):
    def __init__(self, head, embdding_dim, dropout = 0.1):
        """
        head: 多头注意力的数量
        embdding_dim: 输入的特征纬度
        dropout: 置零的比率
        """
        super(MultiHeadAttention, self).__init__()
        
        assert embdding_dim % head == 0  # 确认特征纬度能被head整除
        
        self.head = head
        self.embdding_dim = embdding_dim
        self.d_k = embdding_dim // head  # 每个head分别处理的特征纬度
        # 生成结构中的线性层，共4个
        self.linears = clones(nn.Linear(embdding_dim, embdding_dim), 4)  # 线性层输入输出都是特征的维数
        self.attn = None  # 初始化注意力张量
        self.dropout = nn.Dropout(p = dropout)
        
    def forward(self, query, key, value, mask = None):
        """query, key, value 是注意力的三个输入张量，mask代表掩码张量"""
        if mask is not None: mask = mask.unsqueeze(1)  # 升维，代表多头中的第n个头
        batch_size = query.size(0)  # 获得batchsize，即channel
        
        """
        未切割之前每个纬度的含义：0 batchsize 即一次性喂了多少个数据； 
        1 length 即该数据(句子的长度)； 2 dim 句子中每个元素的特征维度
        model(x). 后面的处理：
        1. 将输出张量的最后一个纬度进行切割，即划分输入给为多个head的特征,此时 2 nth 代表第n个头 3 dim 代表第n个头处理的特征维度
        2. 进行transpose的操作 将 1 2 转置
        """
        query, key, value = \
        [model(x).view(batch_size, -1, self.head, self.d_k).transpose(1, 2)   
         for model, x in zip(self.linears, (query, key, value))]  # 分别对输入的Q K V进行全连接层的处理
        
        # 将每个头的输出传入到注意力层
        x, self.attn = attention(query, key, value, mask = mask, dropout = self.dropout)
        
        # 得到每个头的计算结果是4维张量，需要进行形状的转换
        # 前面已经将1，2两个维度进行过转置，在这里要重新转置回来
        # 经历了transpose方法后，必须要使用contiguous方法，不然无法使用view
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.embdding_dim)
        
        # 最后对输出进行全连接
        return self.linears[-1](x)

In [35]:
d_model = 6  # 要是 2 的倍数
vocab = 10
dropout = 0.2
maxlen = 20
head = 3
x = Variable(torch.LongTensor([[1,2,3], [1,5,7]]))
emb = Embeddings(d_model, vocab)
embx = emb(x)
pe = PositionalEncoding(d_model, dropout, maxlen)
pex = pe(embx)
print("origin:", x, x.shape)
print("emb:", embx, embx.shape)
print("pe:", pex, pex.shape)
query = key = value = pex  # 文本嵌入位置编码之后的输入
mask = Variable(torch.zeros((2,3,3)))  # mask: batchsize * length * length

mha = MultiHeadAttention(head = head, embdding_dim = d_model, dropout = dropout)
mhax = mha(query, key, value, mask)
print("mhax:", mhax, mhax.shape)

origin: tensor([[1, 2, 3],
        [1, 5, 7]]) torch.Size([2, 3])
emb: tensor([[[ 3.8845, -1.4806, -3.4664, -1.3254, -0.2796,  0.8907],
         [ 1.5759, -1.3199,  1.3154,  1.3322, -3.8599,  2.9264],
         [-2.0447,  0.2596,  0.1811, -3.0931, -0.3690,  0.4002]],

        [[ 3.8845, -1.4806, -3.4664, -1.3254, -0.2796,  0.8907],
         [ 0.1021, -1.1432,  0.5417,  1.5325, -2.7589,  1.4150],
         [ 0.6348, -1.7899, -1.8147, -1.1538,  5.7850,  0.0276]]],
       grad_fn=<MulBackward0>) torch.Size([2, 3, 6])
pe: tensor([[[ 4.8556, -0.6007, -4.3330, -0.4068, -0.3495,  2.3634],
         [ 3.0217, -0.0000,  1.7023,  0.0000, -0.0000,  0.0000],
         [-1.4192, -0.1957,  0.3422, -2.6217, -0.4558,  1.7502]],

        [[ 4.8556, -0.6007, -0.0000, -0.4068, -0.3495,  2.3634],
         [ 1.1794, -0.7536,  0.7351,  3.1643, -3.4459,  3.0187],
         [ 1.9302, -2.7575, -2.1525, -0.1977,  7.2366,  1.2845]]],
       grad_fn=<MulBackward0>) torch.Size([2, 3, 6])
mhax: tensor([[[-0.2177, -0.031

## 前馈全连接层

考虑到zhu

In [36]:
import torch
import torch.nn
import torch.nn.functional as F

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout = 0.1):
        """
        d_model: 词嵌入维度，特征维度
        d_ff: 第一个线性层的输出，第二个线性层的输入
        dropout: 置零比率
        """
        super(PositionwiseFeedForward, self).__init__()
        
        self.w1 = nn.Linear(d_model, d_ff)
        self.w2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(p = dropout)
        
    def forward(self, x):
        """
        x 代表上一层的输出
        """
        return self.w2(self.dropout(F.relu(self.w1(x))))

In [37]:
d_model = 6
d_ff = 6
dropout = 0.2

In [38]:
x = mhax
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
ffx = ff(x)
print(ffx, ffx.shape)

tensor([[[-0.2637, -0.0758, -0.4798,  0.5895,  0.4317, -0.5520],
         [-0.3093, -0.0863, -0.5154,  0.6792,  0.4811, -0.5909],
         [-0.3156, -0.0957, -0.5451,  0.6866,  0.5001, -0.6437]],

        [[-0.3063, -0.1257, -0.4345,  0.6693,  0.5353, -0.5732],
         [-0.2572, -0.0733, -0.4717,  0.5774,  0.4232, -0.5406],
         [-0.2948, -0.0777, -0.4875,  0.6540,  0.4573, -0.5468]]],
       grad_fn=<ViewBackward0>) torch.Size([2, 3, 6])


## 归一化层

使得网络的输出都维持在合理的范围之内，不会出现过大或过小的情况，方便训练。

In [39]:
class LayerNorm(nn.Module):
    def __init__(self, features, eps = 1e-6):
        """
        features: 词嵌入维度
        eps: 一个足够小的正数，用来在规范化计算公式的分母中防止除零操作
        """
        super(LayerNorm, self).__init__()
        
        # 初始化两个参数张量a2，b2 用于对结果做规范化计算
        # 将其用nn.Parameter进行封装，代表他们是模型中的参数
        self.a2 = nn.Parameter(torch.ones(features))
        self.b2 = nn.Parameter(torch.zeros(features))
        self.eps = eps
        
    def forward(self, x):
        mean = x.mean(-1, keepdim = True)  # 在最后一个维度即特征维求均值，保持维度
        std = x.std(-1, keepdim = True)  # 同上
        return self.a2 * (x - mean) / (std + self.eps) + self.b2

In [40]:
features = 6
x = ffx

In [41]:
ln = LayerNorm(features)
lnx = ln(x)
print(lnx, lnx.shape)

tensor([[[-0.4332, -0.0367, -0.8891,  1.3666,  1.0337, -1.0413],
         [-0.4785, -0.0557, -0.8691,  1.3955,  1.0200, -1.0122],
         [-0.4485, -0.0487, -0.8656,  1.3733,  1.0342, -1.0448]],

        [[-0.5137, -0.1665, -0.7601,  1.3624,  1.1046, -1.0268],
         [-0.4309, -0.0350, -0.8925,  1.3655,  1.0337, -1.0408],
         [-0.4908, -0.0568, -0.8759,  1.4055,  1.0123, -0.9943]]],
       grad_fn=<AddBackward0>) torch.Size([2, 3, 6])


## 子层连接结构

<div align="center">
<img src=./imgs/res.png width=25% />
</div>

即子层函数加残差连接

In [42]:
class SubLayerConnection(nn.Module):
    def __init__(self, size, dropout = 0.1):
        """size: 词嵌入维度 dropout: 置零比率"""
        super(SubLayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(p = dropout)
        self.size = size
        
    def forward(self, x, sublayer):
        """x: 上一层的输出 sublayer：该子层中的子层函数，如attention等"""
        return x + self.dropout(sublayer(self.norm(x)))  # 将上一层输出经过归一化送入子层函数，再dropout以及残差连接

In [43]:
size = d_model = 6  # 特征维度 6
head = 3  # 3 个头
dropout = 0.2

x = pex  # 位置编码后的输出
mask = Variable(torch.zeros(2, 3, 3))  # batchsize 2 length 3
self_attn = MultiHeadAttention(head, d_model)  # 多头自注意力

sublayer = lambda x: self_attn(x, x, x, mask) 

sc = SubLayerConnection(size, dropout)
scx = sc(x, sublayer)
print(scx, scx.shape)

tensor([[[ 4.9678, -0.8298, -4.3330, -0.8675,  0.3727,  2.4276],
         [ 3.1339, -0.2291,  1.7580, -0.4607,  0.7223,  0.0642],
         [-1.2618, -0.4020,  0.3099, -2.9680, -0.4558,  1.7502]],

        [[ 5.1224, -0.8391,  0.1135, -0.6318,  0.5595,  2.4885],
         [ 1.5108, -0.9266,  0.7010,  3.1008, -2.6581,  3.1745],
         [ 2.1970, -2.9959, -2.1525, -0.4227,  7.2366,  1.2845]]],
       grad_fn=<AddBackward0>) torch.Size([2, 3, 6])


## 编码器层

作为编码器的组成单元，每个编码器层完成一次对输入的特征提取，即编码过程。多个编码器层组成编码器。

<div align="center">
<img src=./imgs/res.png width=25% />
</div>

In [44]:
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        """
        size: 词嵌入维度
        self_attn: 多头注意力子层的实例化对象
        feed_forward: 前馈全连接层的实例化对象
        dropout: 置零比率
        """
        super(EncoderLayer, self).__init__()
        
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SubLayerConnection(size, dropout), 2)  # 复制两个子层结构
        
    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))  # 第一个自注意力层
        return self.sublayer[1](x, self.feed_forward)  # 第二个前馈全连接层

In [45]:
size = d_model = 6  # 词嵌入维度
head = 3  # 头数量
d_ff = 3  # 全连接层维度
x = pex
dropout = 0.2

In [46]:
self_attn = MultiHeadAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
mask = Variable(torch.ones(2, 3, 3))

el = EncoderLayer(size, self_attn, ff, dropout)
elx = el(x, mask)
print(elx, elx.shape)

tensor([[[ 5.0561, -0.5475, -4.2266, -1.0762, -0.7733,  2.2734],
         [ 3.0217, -0.0528,  1.7953, -0.7089,  0.2792,  0.1649],
         [-0.7983, -0.3059,  0.3841, -3.4623, -0.4840,  1.4067]],

        [[ 5.4026, -0.5363,  0.5002, -1.2378, -1.2746,  2.6544],
         [ 1.5407, -0.7895,  0.8690,  2.2927, -3.0183,  3.4550],
         [ 2.6608, -2.4170, -1.8692, -0.7672,  7.1582,  1.5987]]],
       grad_fn=<AddBackward0>) torch.Size([2, 3, 6])


## 编码器

编码器用于对输入进行指定的特征提取过程，由N个编码器层堆叠而成。  
编码器类的输出就是Transformer中编码器的特征提取表示，它将成为编码器的输出的一部分。

<div align="center">
<img src=./imgs/encoder-layer.png width=30% />
</div>

In [47]:
class Encoder(nn.Module):
    def __init__(self, layer, N):
        """layer: 代表解码器层 N:代表解码器中有几个layer"""
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)  # 复制 N 个编码器层
        self.norm = LayerNorm(layer.size)  # 初始化一个规范化层，作用在编码器的最后面
        
    def forward(self, x, mask):
        """x: 上一层输出张量， mask：掩码张量"""
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [48]:
size = d_model = 6
d_ff = 3
head = 3
dropout = 0.2
N = 6
c = copy.deepcopy
attn = MultiHeadAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
layer = EncoderLayer(size, c(attn), c(ff), dropout)
mask = Variable(torch.zeros(2, 3, 3))
x = pex

In [49]:
en = Encoder(layer, N)
enx = en(x, mask)
print(enx, enx.shape)

tensor([[[ 1.3469, -0.0511, -1.2224, -0.8977, -0.1091,  0.9335],
         [ 0.7794,  0.5000, -0.3535, -1.6574, -0.3424,  1.0739],
         [-0.6509,  0.4056, -0.3593, -1.3701,  0.5097,  1.4649]],

        [[ 1.4468,  0.3115, -0.5582, -1.2771, -0.6188,  0.6958],
         [ 0.3410,  0.3520, -1.1145, -0.3353, -0.8704,  1.6272],
         [ 1.1492, -0.3128, -1.0484, -1.2043,  0.8290,  0.5873]]],
       grad_fn=<AddBackward0>) torch.Size([2, 3, 6])


# 解码器部分

<div align="center">
<img src=./imgs/decoder.png width=30% />
</div>

## 解码器层

作为解码器的组成单元，每个解码器层根据给定的输入向目标方向进行特征提取操作，即解码过程。

In [50]:
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        """
        size: 词嵌入维度
        self_attn: 多头自注意力机制对象
        src_attn: 常规注意力机制对象
        feed_forward: 前馈全连接层
        dropout: 置零比率
        """
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.dropout = dropout
        self.sublayer = clones(SubLayerConnection(size, dropout), 3)  # 复制 3 个子层连接对象
    
    def forward(self, x, memory, source_mask, target_mask):
        """
        x: 上一层的输出
        memory：编码器得到的句子的语义
        source_mask: 源数据的掩码张量 -> 为了遮盖住对结果信息无用的数据
        target_mask: 解码时遮盖住未来的信息，不产生因果问题
        """
        m = memory
        # 对输入进行自注意力操作，同时用掩码遮盖
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, target_mask))  
        # 对输入进行常规注意力操作，遮盖住不重要的区域
        # 形象理解即此处使用编码层获得的语义信息对目标的文本进行语义的提取处理
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, source_mask))
        # 全连接层
        return self.sublayer[2](x, self.feed_forward)

In [51]:
size = d_model = 6
head = 3
d_ff = 3  # 全连接层中间维度
dropout = 0.2
self_attn = src_attn = MultiHeadAttention(head, d_model, dropout)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
x = pex  # 位置编码的 x
memory = enx  # 编码层输出的 x
mask = Variable(torch.zeros(2,3,3))
source_mask = target_mask = mask

In [52]:
dl = DecoderLayer(size, self_attn, src_attn, feed_forward = ff, dropout = dropout)
dlx = dl(x, memory, source_mask, target_mask)
print(dlx, dlx.shape)

tensor([[[ 6.6282,  0.2504, -4.2016, -1.3818,  0.3407,  1.8044],
         [ 4.5422,  0.9363,  2.4530, -0.7655,  0.9313, -0.2787],
         [-0.6684,  0.4788,  0.3806, -3.8594,  0.5337,  1.3313]],

        [[ 5.5136, -0.1112,  0.7617, -1.5233,  0.5750,  1.5778],
         [ 1.7645,  0.1676,  1.3989,  2.2690, -2.7667,  2.3048],
         [ 2.8819, -2.0632, -1.3684, -0.7084,  8.0000,  1.7142]]],
       grad_fn=<AddBackward0>) torch.Size([2, 3, 6])


## 解码器

根据编码器的结果以及上一次预测的结果，对下一次可能出现的值进行表征。

In [53]:
class Decoder(nn.Module):
    def __init__(self, layer, N):
        """layer: 解码器层的对象； N：堆叠层数"""
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)  # 即特征维度
    
    def forward(self, x, memory, source_mask, target_mask):
        """
        x: 上一层输出
        memory: 编码器的输出，即语义提取张量
        source_mask: 源数据的掩码张量
        target_mask: 目标数据的掩码张量
        """
        for layer in self.layers:
            x = layer(x, memory, source_mask, target_mask)
        return self.norm(x)  # 输出归一化的 x

In [54]:
size = d_model = 6
head = 3
d_ff = 3  # 全连接层中间维度
dropout = 0.2
N = 6
self_attn = src_attn = MultiHeadAttention(head, d_model, dropout)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
x = pex  # 位置编码的 x
memory = enx  # 编码层输出的 x
mask = Variable(torch.zeros(2,3,3))
source_mask = target_mask = mask
dl = DecoderLayer(size, self_attn, src_attn, feed_forward = ff, dropout = dropout)

In [55]:
de = Decoder(layer = dl, N = N)
dex = de(x, memory, source_mask, target_mask)
print(dex, dex.shape)

tensor([[[ 0.9365, -0.4618, -0.4322, -1.5576,  0.4847,  1.0304],
         [ 1.1654,  0.2689,  0.3806, -1.8065, -0.2880,  0.2795],
         [-0.0305,  0.1631,  0.8662, -1.8461, -0.0535,  0.9006]],

        [[ 1.4039, -0.1870, -0.0595, -1.6784,  0.1230,  0.3981],
         [-0.2392, -0.3472,  0.4160, -1.2336, -0.3330,  1.7369],
         [ 0.5674, -1.0952,  0.1276, -1.2342,  1.3665,  0.2679]]],
       grad_fn=<AddBackward0>) torch.Size([2, 3, 6])


# 输出部分实现

<div align="center">
<img src=./imgs/output.png width=25% />
</div>

- 线性层：通过对上一步的线性变化得到指定维度的输出，也就是转换为度的作用。
- softmax层：使最后一维的向量中的数字缩放到 0 - 1 的概率值域内，并满足它们的和为 1。

In [56]:
import torch.nn.functional as F

In [57]:
# 将线性层和softmax计算层一起实现，因为二者的共同目标是生成最后的结构
# 因此把类的名字叫做Generator， 生成器类
class Generator(nn.Module):
    def __init__(self, d_model, vocab_size):
        """d_model: 词嵌入维度；vocab_size: 词表大小"""
        super(Generator, self).__init__()
        self.project = nn.Linear(d_model, vocab_size)  # 映射到指定维度
        
    def forward(self, x):
        """x: 上一层的输出张量"""
        # return F.softmax(self.project(x), dim = -1)  # 在最后一个维度进行映射操作即特征映射
        return F.log_softmax(self.project(x), dim = -1)  # 另一种softmax

In [58]:
d_model = 6
vocab_size = 10
x = dex

In [59]:
gen = Generator(d_model, vocab_size)
genx = gen(x)
print(genx, genx.shape)

tensor([[[-2.3013, -2.1341, -2.7590, -1.7552, -3.4373, -2.7430, -2.5643,
          -1.9503, -2.7872, -1.7840],
         [-2.1983, -1.2795, -3.3211, -2.1018, -3.1781, -3.0560, -2.5728,
          -2.3910, -2.7137, -2.0434],
         [-2.7156, -1.4871, -2.4989, -2.4766, -2.8369, -2.5102, -2.6038,
          -2.1814, -2.3710, -2.1079]],

        [[-2.1210, -1.5872, -3.2404, -1.7843, -3.4087, -3.0195, -2.5256,
          -2.3069, -2.7902, -1.9267],
         [-2.9621, -2.2435, -2.0339, -2.4450, -2.9178, -2.0346, -2.5325,
          -2.0221, -2.7543, -1.8056],
         [-2.4831, -1.9012, -2.4460, -1.7923, -3.2660, -2.3698, -2.4966,
          -2.5117, -2.1690, -2.2588]]], grad_fn=<LogSoftmaxBackward0>) torch.Size([2, 3, 10])


# 模型构建

<div align="center">
<img src=./imgs/transformer.png width=30% />
</div>

In [60]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, source_embed, target_embed, generator):
        """
        encoder: 编码器对象
        decoder: 解码器对象
        source_embed: 源数据的嵌入函数
        target_embed:目标数据的嵌入函数
        generator: 输出部分类别生成器对象
        """
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = source_embed
        self.tgt_embed = target_embed
        self.generator = generator
    
    def forward(self, source, target, source_mask, target_mask):
        """
        source: 代表源数据
        target: 代表目标数据
        source_mask: 代表源数据的掩码张量
        target_mask: 代表目标数据的掩码张量
        """
        return self.decode(
            self.encode(source, source_mask),
            source_mask,
            target,
            target_mask
        )
    
    def encode(self, source, source_mask):
        # 对源输入进行词嵌入并带着 mask 进行 encode
        return self.encoder(self.src_embed(source), source_mask)
    
    def decode(self, memory, source_mask, target, target_mask):
        # 对目标的输出进行词嵌入，并且同编码器的输出一同进行解码
        return self.decoder(self.tgt_embed(target), memory, source_mask, target_mask)

In [61]:
vocab_size = 10
d_model = 6
encoder = en
decoder = de
source_embed = nn.Embedding(vocab_size, d_model)
target_embed = nn.Embedding(vocab_size, d_model)  # 演示用只进行嵌入没进行位置编码
gen = gen
source = target = Variable(torch.LongTensor([[1,2,3], [1,3,5]]))
source_mask = target_mask = Variable(torch.zeros(2,3,3))

In [62]:
ed = EncoderDecoder(encoder, decoder, source_embed, target_embed, gen)
edx = ed(source, target, source_mask, target_mask)
print(edx, edx.shape)

tensor([[[-0.5984, -0.1317,  0.7504, -1.5379,  0.2369,  1.2807],
         [ 0.2171, -0.4994,  0.5030, -1.7928,  0.7719,  0.8002],
         [-0.1979, -0.3047,  0.3110, -1.7074,  1.1127,  0.7862]],

        [[-0.1407, -0.0658,  0.8100, -1.8114,  0.2107,  0.9971],
         [-0.0019, -0.5077,  0.6756, -1.6698,  0.3236,  1.1802],
         [-0.0862, -0.0998,  0.9731, -1.8428,  0.7308,  0.3248]]],
       grad_fn=<AddBackward0>) torch.Size([2, 3, 6])


## make model

In [63]:
def make_model(source_vocab, target_vocab, N = 6, d_model = 512, d_ff = 1024, head = 8, dropout = 0.2):
    """
    source_vocab: 代表源数据的词汇总数
    target_vocab: 代表目标数据的词汇总数
    N: 代表编码器和解码器堆叠的层数
    d_model: 代表词嵌入的维度
    d_ff: 代表前馈全连接层中变换矩阵的维度
    head: 多头注意力机制中的头数
    dropout: 置零的比率
    """
    c = copy.deepcopy
    attn = MultiHeadAttention(head, d_model)  # 实例化多头注意力的对象
    ff = PositionwiseFeedForward(d_model, d_ff)  # 实例化全连接层对象
    pe = PositionalEncoding(d_model, dropout)  # 实例化位置编码器
    
    # 实例化模型 model，利用EncoderDecoder类
    # 编码器的结构中有 2 个子层，attention层和全连接层
    # 解码器中有 3 个子层，两个attention和一个全连接层
    # 都各自堆叠 N 次
    model = EncoderDecoder(
        Encoder(
            EncoderLayer(
                d_model,  # 词嵌入维度
                c(attn),  # 自注意力层  
                c(ff),   # 全连接层
                dropout
            ), N),  # 堆叠 N 层
        Decoder(
            DecoderLayer(
                d_model,
                c(attn),  
                c(attn),  # 这里的两个注意力对象不同，功能也不同
                c(ff),
                dropout
            ), N),  # 堆叠 N 层
        nn.Sequential(Embeddings(d_model, source_vocab), c(pe)),  # 输入文本的嵌入，加入位置编码
        nn.Sequential(Embeddings(d_model, target_vocab), c(pe)),  # 对目标文本进行词嵌入
        Generator(d_model, target_vocab)  # 由特征向量映射到目标词汇表
    )
    
    # 初始化模型参数
    for p in model.parameters():
        if p.dim() > 1: nn.init.xavier_uniform_(p)  # 进行均匀初始化
    
    return model

In [64]:
source_vocab = target_vocab = 10

In [65]:
make_model(source_vocab, target_vocab, N = 1)

EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadAttention(
          (linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w1): Linear(in_features=512, out_features=1024, bias=True)
          (w2): Linear(in_features=1024, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer): ModuleList(
          (0): SubLayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (1): SubLayerConnection(
            (norm): LayerNorm()
   

# 模型测试

> copy 任务：
> - 任务描述：针对数字序列进行学习，学习的最终目标是使得输出与输入的序列相同，如输入[1,2,3]，也输出[1,2,3]。
> - 任务意义：copy任务在模型基础测试中具有重要意义，因为copy操作对于模型来讲是一条明显规律，因此模型能否在短时间内，小数据集中学会它，可以帮助我们判定模型所有过程是否正常，是否已经具备基本学习能力。

> 使用copy任务进行模型基本测试的四步曲：
> 1. 构建数据集生成器
> 2. 获得Transformer模型及其优化器和损失函数
> 3. 运行模型进行训练和评估
> 4. 使用模型进行贪婪解码

## 数据集生成器

In [66]:
from pyitcast.transformer_utils import Batch, get_std_opt, LabelSmoothing, SimpleLossCompute, run_epoch, greedy_decode
import numpy as np
import torch
from torch.autograd import Variable

In [67]:
def data_generator(V, batch_size, batch_num):
    """
    V: 随即生成数据的上界 + 1
    batch_size: 一次喂给网络多少个数据样本
    batch_num: 一共喂多少次
    """
    for i in range(batch_num):
        data = torch.from_numpy(np.random.randint(1, V, size = (batch_size, 10), dtype = "int64"))  # 每一条数据里包含 10 个数
        data[:, 0] = 1  # 将数据的第一列全部设置为 1 ，作为起始标志
        source = Variable(data, requires_grad = False)
        target = Variable(data, requires_grad = False)  # 源数据与目标数据一致并且不随着梯度更新而改变
        
        yield Batch(source, target)

In [68]:
V = 10
batch_size = 2
batch_num = 2

In [69]:
data = data_generator(V, batch_size, batch_num)
for d in data:
    print(d.src)

tensor([[1, 3, 4, 7, 4, 7, 6, 6, 3, 5],
        [1, 1, 2, 3, 1, 6, 3, 9, 8, 5]])
tensor([[1, 8, 8, 2, 4, 5, 3, 2, 8, 3],
        [1, 3, 2, 9, 3, 5, 6, 8, 2, 9]])


## 优化器及损失函数

label smoothing 对标签进行平滑  

第一个参数 size 代表目标数据的词汇总数，也就是最后一层得到的张量最后一维的大小。  
第二个参数 padding_idx 表示要将哪些数字替换成 0 ，一般此项为 0 表示不进行替换。  
第三个参数 smoothing 表示标签的平滑程度，如标签为 1 则平滑后的值变为 [1 - smoothing, 1 + smoothing]

In [70]:
import matplotlib.pyplot as plt
crit = LabelSmoothing(size = 5, padding_idx = 0, smoothing = 0.5)
predict = Variable(torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0], [0, 0.2, 0.7, 0.1, 0], [0, 0.2, 0.7, 0.1, 0]]))
target = Variable(torch.LongTensor([2, 1, 0]))
crit(predict, target)
# plt.imshow(crit.true_dist)



tensor(-3.1182)

In [127]:
model = make_model(V, V, N = 2)  # 获得模型的实例化对象
model_optimizer = get_std_opt(model)  # 获得模型的优化器
criterion = LabelSmoothing(size = V, padding_idx = 0, smoothing = 0.0)  # 获得标签平滑对象
loss = SimpleLossCompute(model.generator, criterion, model_optimizer)  # 获得利用标签平滑的结果得到的损失计算方法

## 运行及评估

In [128]:
def run(model, loss, epochs = 10):
    """
    model: 要训练的模型
    loss: 使用的损失计算方法
    epochs: 模型训练的轮次
    """
    for _ in range(epochs):
        model.train()  # 训练模式，参数更新
        run_epoch(data_generator(V, 8, 20), model, loss)  
        
        model.eval()  # 评估模式，参数保留
        run_epoch(data_generator(V, 8, 5), model, loss)

In [129]:
run(model, loss)

Epoch Step: 1 Loss: 2.998564 Tokens per Sec: 573.702637
Epoch Step: 1 Loss: 2.735607 Tokens per Sec: 620.688049
Epoch Step: 1 Loss: 2.611072 Tokens per Sec: 618.025635
Epoch Step: 1 Loss: 2.237946 Tokens per Sec: 657.596252
Epoch Step: 1 Loss: 2.235550 Tokens per Sec: 607.594971
Epoch Step: 1 Loss: 1.853113 Tokens per Sec: 648.644836
Epoch Step: 1 Loss: 2.240161 Tokens per Sec: 620.688049
Epoch Step: 1 Loss: 1.720811 Tokens per Sec: 631.581787
Epoch Step: 1 Loss: 1.951455 Tokens per Sec: 623.377441
Epoch Step: 1 Loss: 1.598292 Tokens per Sec: 648.652466
Epoch Step: 1 Loss: 2.019941 Tokens per Sec: 620.681702
Epoch Step: 1 Loss: 1.514075 Tokens per Sec: 654.540405
Epoch Step: 1 Loss: 1.776644 Tokens per Sec: 620.737854
Epoch Step: 1 Loss: 1.529034 Tokens per Sec: 651.583862
Epoch Step: 1 Loss: 1.810929 Tokens per Sec: 626.089478
Epoch Step: 1 Loss: 1.553615 Tokens per Sec: 651.583862
Epoch Step: 1 Loss: 1.843961 Tokens per Sec: 634.356567
Epoch Step: 1 Loss: 1.433536 Tokens per Sec: 657

## 使用模型进行贪婪解码

> 导入greddy_decode, 该工具将对最终结果进行贪婪解码， 贪婪解码的方式是每次预测都选择概率最大的结果作为输出，它不一定能获得全局最优解，但却具有最高的执行效率。

In [130]:
V = 11
model = make_model(V, V, N = 2)
model_optimizer = get_std_opt(model)  # 获得模型的优化器
criterion = LabelSmoothing(size = V, padding_idx = 0, smoothing = 0.0)  # 获得标签平滑对象
loss = SimpleLossCompute(model.generator, criterion, model_optimizer)  # 获得利用标签平滑的结果得到的损失计算方法

In [131]:
def run(model, loss, epochs = 10):
    """
    model: 要训练的模型
    loss: 使用的损失计算方法
    epochs: 模型训练的轮次
    """
    for _ in range(epochs):
        model.train()  # 训练模式，参数更新
        run_epoch(data_generator(V, 8, 20), model, loss)  
        
        model.eval()  # 评估模式，参数保留
        run_epoch(data_generator(V, 8, 5), model, loss)
    
    model.eval()
    source = Variable(torch.LongTensor([[1,2,4,3,5,7,6,8,9,10]]))
    source_mask = Variable(torch.ones(1, 1, 10))  # 全 1 的掩码张量，无任何遮掩
    result = greedy_decode(model, source, source_mask, max_len = 10, start_symbol = 1)
    
    return result

In [132]:
run(model, loss)

Epoch Step: 1 Loss: 3.409424 Tokens per Sec: 564.709351
Epoch Step: 1 Loss: 2.483949 Tokens per Sec: 654.551758
Epoch Step: 1 Loss: 2.884865 Tokens per Sec: 597.511719
Epoch Step: 1 Loss: 2.422046 Tokens per Sec: 637.167969
Epoch Step: 1 Loss: 2.501607 Tokens per Sec: 585.367981
Epoch Step: 1 Loss: 2.121666 Tokens per Sec: 631.579163
Epoch Step: 1 Loss: 2.299432 Tokens per Sec: 620.686829
Epoch Step: 1 Loss: 1.949369 Tokens per Sec: 637.165283
Epoch Step: 1 Loss: 2.006244 Tokens per Sec: 623.372253
Epoch Step: 1 Loss: 1.832286 Tokens per Sec: 645.742554
Epoch Step: 1 Loss: 2.045516 Tokens per Sec: 620.698303
Epoch Step: 1 Loss: 1.619347 Tokens per Sec: 648.653198
Epoch Step: 1 Loss: 1.841399 Tokens per Sec: 620.691895
Epoch Step: 1 Loss: 1.619378 Tokens per Sec: 648.646179
Epoch Step: 1 Loss: 1.840465 Tokens per Sec: 618.025635
Epoch Step: 1 Loss: 1.614929 Tokens per Sec: 657.533203
Epoch Step: 1 Loss: 1.886616 Tokens per Sec: 612.763123
Epoch Step: 1 Loss: 1.483351 Tokens per Sec: 651

tensor([[1, 4, 3, 5, 2, 7, 6, 8, 9, 7]])

In [1]:
import numpy as np

In [18]:
a = np.ones((2, 3, 6))
a.shape

(2, 3, 6)

In [21]:
b = a.transpose(0, 2, 1)
b.shape

(2, 6, 3)

In [23]:
c = np.matmul(a, b)
c.shape

(2, 3, 3)

In [24]:
d = np.matmul(c, a)
d.shape

(2, 3, 6)