## BERT-pytorch简单实现

### 主要内容：
1. 搭建BERT模型
2. 训练BERT模型
3. 预测[MASK]
4. 预测下个句子是否是IsNext

### 1. 准备数据集


In [113]:
import re
import math
import torch
import numpy as np
from random import *
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

text1 = (
        "你是谁?\n"
        "谁?  谁只是代表了一个人罢了\n"
        "那么你呢?\n"
        "一个个戴面具的男人.\n"
        "我看得出来.\n"
        "我怀疑这不是你的观察力, 但这仅仅是问一个戴面具的人的矛盾性质. \n"
        "请告诉我，你喜欢音乐吗?\n"
        "我喜欢电影.\n"
        "你喜欢哪类电影?\n"
        "爱丽丝漫游仙境\n"
        "我希望我是疯狂帽客.\n"
        "你完全疯了。 但我会告诉你一个秘密。所有最棒的人都是.\n"
        )
text = ' '.join(list(text))
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n') # filter '.', ',', '?', '!'
# 准备词典，token-id的映射dict
word_list = list(set(" ".join(sentences).split())) # ['你', '是', '谁', '只',...]
word2idx = {'[PAD]' : 0, '[CLS]' : 1, '[SEP]' : 2, '[MASK]' : 3}
for i, w in enumerate(word_list):
    word2idx[w] = i + 4
idx2word = {i: w for i, w in enumerate(word2idx)}
vocab_size = len(word2idx)

# 准备预料
token_list = list()
for sentence in sentences:
    arr = [word2idx[s] for s in sentence.split()]
    token_list.append(arr)

### 2. 模型参数

- maxlen 表示同一个 batch 中的所有句子都由 30 个 token 组成，不够的补 PAD
- max_pred 表示最多需要预测多少个单词，即 BERT 中的完形填空任务
- n_layers 表示 Encoder Layer 的数量
- d_model 表示 Token Embeddings、Segment Embeddings、Position Embeddings 的维度
- d_ff 表示 Encoder Layer 中全连接层的维度
- n_segments 表示 Decoder input 由几句话组成


In [114]:
maxlen = 40
batch_size = 6
max_pred = 5 # max tokens of prediction
n_layers = 6
n_heads = 12
d_model = 768
d_ff = 768*4 # 4*d_model, FeedForward dimension
d_k = d_q = d_v = 64  # dimension of K(=Q), V
n_segments = 2

### 3. 数据预处理

1. 随机用[MASK]替换替换序列中15%的token
2. 拼接任意两句话

In [115]:
def make_data():
    """
    batch:
    """
    batch = []
    positive = negative = 0
    while positive != batch_size/2 or negative != batch_size/2:
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences)) # 随机选取两句话
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        input_ids = [word2idx['[CLS]']] + tokens_a + [word2idx['[SEP]']] + tokens_b + [word2idx['[SEP]']] # 拼接两句话
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # MASK LM
        n_pred =  min(max_pred, max(1, int(len(input_ids) * 0.15))) # 15 % of tokens in one sentence
        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != word2idx['[CLS]'] and token != word2idx['[SEP]']] # candidate masked position
        shuffle(cand_maked_pos)
        masked_tokens, masked_pos = [], []
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.8:  # 80%
                input_ids[pos] = word2idx['[MASK]'] # make mask
            elif random() > 0.9:  # 10%
                index = randint(0, vocab_size - 1) # random index in vocabulary
                while index < 4: # can't involve 'CLS', 'SEP', 'PAD'
                  index = randint(0, vocab_size - 1)
                input_ids[pos] = index # replace

        # Zero Paddings
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero Padding (100% - 15%) tokens
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
            negative += 1
    return batch

batch:  
input_id: [1, 50, 3, 62, 37, 39, 6, 74, 47, 38, 2, 13, 22, 77, 21, 3, 69, 25, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
masked_tokens: [53, 62, 0, 0, 0]  
masked_pos: [15, 2, 0, 0, 0]  
isNext: False  


### 4. 模型构建

1. Transformer 的 Encoder
2. BERT模型

#### 4. 1 transformer - encoder

<div>
<img src='bert1.png' width='800' height='800'/>
</div>

In [116]:

def gelu(x):
    """
      Implementation of the gelu activation function.
      For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
      0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
      Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size, seq_len, d_model]
        return enc_outputs
    
"""
多头注意力实现

说明：
1. 多头机制是先拆分成多份，再拼接回来。而不是输入维度不变，执行多次scaled_dot_product_attention，最后再缩放到原本维度。
2. 分拆后，每个头部的维度减少，因此总的计算成本与有全部维度的单头计算相同。
3. 多头只需拆最后一位维！attention在后面两维计算！
"""

    
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_q * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
    def forward(self, Q, K, V, attn_mask):
        """
        执行多头机制
        :param v: (batch_size, seq_len_q, d_model)
        :param k: [batch_size, seq_len_v, d_model]
        :param q: [batch_size, seq_len_q, d_model]
        :param mask:
        :return:
        output： (batch_size, seq_len_q, d_model)
        attention_weights: (batch_size, num_heads, seq_len_q, depth)
        """
        # 先多头分拆，再reshape回来
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        # 在最后一维分拆，attention在后面两维计算
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_q).transpose(1,2)  # q_s: [batch_size, n_heads, seq_len, d_q]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size, n_heads, seq_len, d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size, n_heads, seq_len, d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size, n_heads, seq_len, seq_len]

        # context: [batch_size, n_heads, seq_len, d_v], attn: [batch_size, n_heads, seq_len, seq_len]
        context = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size, seq_len, n_heads, d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual) # output: [batch_size, seq_len, d_model]

class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_ff) -> (batch_size, seq_len, d_model)
        return self.fc2(gelu(self.fc1(x)))

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        """
        计算注意力权重。
        要求：1. q,k,v，前面维度需要保持一样。
             2. k,v必须有匹配的倒数第二个维度，即：seq_len_k=seq_len_v
        :param q: shape(...,seq_len_q,depth)
        :param k: shape(...,seq_len_k,depth)
        :param v: shape(...,seq_len_v,depth_v)
        :param mask:  shape(...,seq_len_q,depth)
        :return:  输出注意力权重（...,seq_len_q,depth_v）
        """
        
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_q) # scores : [batch_size, n_heads, seq_len_q, seq_len_v]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)    # (..., seq_len_q, seq_len_k)
        context = torch.matmul(attn, V)      # (..., seq_len_q, depth_v)
        return context

#### 4.2 BERT模型


<table align='left'>
<tr>
<td><img src='bert-mask.png' width='600' height='600'/></td>
<td><img src='bert-nsp.png' width='600' height='600'/></td>
</tr>
</table>


In [117]:
def get_attn_pad_mask(seq_q, seq_k):
    # 构建mask矩阵
    batch_size, seq_len = seq_q.size()
    pad_attn_mask = seq_q.data.eq(0).unsqueeze(1)  # [batch_size, 1, seq_len]
    return pad_attn_mask.expand(batch_size, seq_len, seq_len)  # [batch_size, seq_len, seq_len]

# embedding层
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # [seq_len] -> [batch_size, seq_len]
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg) # 三者相加
        return self.norm(embedding)
    

class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.embedding = Embedding()  ##输入embedding
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)]) #n层encoder
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.Dropout(0.5),
            nn.Tanh(),
        )
        self.classifier = nn.Linear(d_model, 2) # nsp
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        # fc2 is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight
        self.fc2 = nn.Linear(d_model, vocab_size, bias=False) # mlm
        self.fc2.weight = embed_weight   #fc2层权重需要是tok_embed的权重

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)  # [bach_size, seq_len, d_model]
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids) # [batch_size, maxlen, maxlen]
        for layer in self.layers:
            # output: [batch_size, max_len, d_model]
            output = layer(output, enc_self_attn_mask)
        # 使用第一个位置（CLS）的隐向量来做NSP
        h_pooled = self.fc(output[:, 0])        # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled) # [batch_size, 2] predict isNext

        masked_pos = masked_pos[:, :, None].expand(-1, -1, d_model) # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos) # 选取mask的位置 [batch_size, max_pred, d_model]
        h_masked = self.activ2(self.linear(h_masked))  # [batch_size, max_pred, d_model]
        logits_lm = self.fc2(h_masked)                 # [batch_size, max_pred, vocab_size]
        return logits_lm, logits_clsf

In [118]:
model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.001)

### 5. 模型训练

#### 5.1 构造dataloader
1. 数据变成tensor
2. 继承Dataset，构造自定义Dataset, 实现__init__，__getitem__ 方法
3. 实现DataLoader，为训练作准备

In [122]:
batch = make_data()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = zip(*batch)
# 变tensor
input_ids, segment_ids, masked_tokens, masked_pos, isNext = \
    torch.LongTensor(input_ids),  torch.LongTensor(segment_ids), torch.LongTensor(masked_tokens),\
    torch.LongTensor(masked_pos), torch.LongTensor(isNext)

class MyDataSet(Data.Dataset):
  def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, isNext):
    self.input_ids = input_ids
    self.segment_ids = segment_ids
    self.masked_tokens = masked_tokens
    self.masked_pos = masked_pos
    self.isNext = isNext
  
  def __len__(self):
    return len(self.input_ids)
  
  def __getitem__(self, idx):
    # 有index取一个sample
    return self.input_ids[idx], self.segment_ids[idx], self.masked_tokens[idx], self.masked_pos[idx], self.isNext[idx]

loader = Data.DataLoader(MyDataSet(input_ids, segment_ids, masked_tokens, masked_pos, isNext), batch_size, shuffle=True)

#### 5.2 构造dataloader

开始训练，50各batch，没10个batch输出loss

In [120]:
for epoch in range(50):
    for input_ids, segment_ids, masked_tokens, masked_pos, isNext in loader:
      logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
      loss_lm = criterion(logits_lm.view(-1, vocab_size), masked_tokens.view(-1)) # masked LM
      loss_lm = (loss_lm.float()).mean()
      loss_clsf = criterion(logits_clsf, isNext) # nsp
      loss = loss_lm + loss_clsf
      if (epoch + 1) % 10 == 0:
          print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

Epoch: 0010 loss = 2.484647
Epoch: 0020 loss = 1.323910
Epoch: 0030 loss = 1.053848
Epoch: 0040 loss = 0.926646
Epoch: 0050 loss = 0.917656


### 6. 模型测试

1. 取部分数据
2. 用训练的BERT模型预测[MASK]和isNext

In [121]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = batch[1]
print('input_id:', input_ids)
print('segment_ids:', segment_ids)
print('masked_tokens:', masked_tokens)
print('masked_pos:', masked_pos)
print('isNext:', isNext)
print('================================')
print([idx2word[w] for w in input_ids if idx2word[w] != '[PAD]'])

logits_lm, logits_clsf = model(torch.LongTensor([input_ids]), \
                 torch.LongTensor([segment_ids]), torch.LongTensor([masked_pos]))
print(logits_lm.shape)
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('masked tokens list : ',[pos for pos in masked_tokens if pos != 0])
print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])

logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
print('isNext : ', True if isNext else False)
print('predict isNext : ',True if logits_clsf else False)

input_id: [1, 2, 5, 58, 3, 5, 48, 49, 20, 56, 26, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids: [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
masked_tokens: [66, 0, 0, 0, 0]
masked_pos: [4, 0, 0, 0, 0]
isNext: False
['[CLS]', '[SEP]', '我', '希', '[MASK]', '我', '是', '疯', '狂', '帽', '客', '[SEP]']
torch.Size([1, 5, 79])
masked tokens list :  [66]
predict masked tokens list :  [66]
isNext :  False
predict isNext :  False
