In [2]:
import math
import re
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## 1. Data

In [3]:
import spacy

with open("./data/wiki_king.txt", "r") as f:
    raw_text = f.read()

## 2. Data Preprocessing

### 2.1 Tokenization

In [4]:
import spacy
import re

# Assuming word2id is defined somewhere in your code
# word2id = {'example': 1, ...}

nlp = spacy.load("en_core_web_sm")
doc = nlp(raw_text)
sentences = [sentence.text.lower() for sentence in doc.sents]  # Convert sentences to lowercase
sentences = [re.sub("[.,!?\\-]", '', sentence) for sentence in sentences]  # Clean symbols from each sentence

#making vocabs - numericalization
word_list = list(set(" ".join(sentences).split()))
word2id   = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}

for i, w in enumerate(word_list):
    word2id[w] = i + 4 #reserve the first 0-3 for CLS, PAD
    id2word    = {i:w for i, w  in enumerate(word2id)}
    vocab_size = len(word2id)

token_list = []
for sentence in sentences:
    arr = [word2id.get(word, 0) for word in sentence.split()]  # Use .get() to avoid KeyError for unknown words
    token_list.append(arr)

print(token_list)


[[97, 85, 11, 30, 49, 69, 11, 74, 28, 68, 22, 25, 6, 44, 15, 62, 91, 88, 60, 81, 51, 99, 21, 46, 40, 18, 29, 34, 51, 32, 94, 40, 88, 35, 27, 7, 86, 41, 72, 96, 103, 94, 70], [11, 36, 35, 27, 23, 57, 51, 35, 16, 12, 17, 96, 39, 67, 51, 40, 24, 4, 71, 92, 82], [98, 62, 40, 101, 71, 54, 35, 27, 99, 51, 81, 97, 85, 11, 90, 50, 94, 51, 52, 105, 107, 8, 45, 75, 63, 33, 65, 27, 51, 78, 83, 35, 79], [11, 52, 40, 41, 20, 87, 84, 51, 38, 46, 40, 43, 47, 71, 40, 76, 39, 80, 37, 95], [10, 47, 71, 40, 48, 98, 52, 40, 77, 66, 7, 94, 66, 93, 51, 55, 26, 9, 71, 40, 24, 64, 14, 94, 100, 59], [11, 55, 26, 40, 64, 56, 19, 73, 5, 98, 106, 96, 53, 102, 89, 13, 58, 42, 19, 40, 31, 71, 40, 104, 61]]


## 3. Data Loader

In [11]:
#randomly choose two sentence
tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
tokens_a, tokens_b            = token_list[tokens_a_index], token_list[tokens_b_index]
print(f"tokens_a:{tokens_a}")
print(f"tokens_b:{tokens_b}")
        
#1. token embedding - add CLS and SEP
input_ids = [word2id['[CLS]']] + tokens_a + [word2id['[SEP]']] + tokens_b + [word2id['[SEP]']]
print(f"input_ids:{input_ids}")

#2. segment embedding - which sentence is 0 and 1
segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
print(f"segment_ids:{segment_ids}")

tokens_a:[11, 52, 40, 41, 20, 87, 84, 51, 38, 46, 40, 43, 47, 71, 40, 76, 39, 80, 37, 95]
tokens_b:[98, 62, 40, 101, 71, 54, 35, 27, 99, 51, 81, 97, 85, 11, 90, 50, 94, 51, 52, 105, 107, 8, 45, 75, 63, 33, 65, 27, 51, 78, 83, 35, 79]
input_ids:[1, 11, 52, 40, 41, 20, 87, 84, 51, 38, 46, 40, 43, 47, 71, 40, 76, 39, 80, 37, 95, 2, 98, 62, 40, 101, 71, 54, 35, 27, 99, 51, 81, 97, 85, 11, 90, 50, 94, 51, 52, 105, 107, 8, 45, 75, 63, 33, 65, 27, 51, 78, 83, 35, 79, 2]
segment_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [13]:
batch_size = 6
max_mask   = 5 #even though it does not reach 15% yet....maybe you can set this threshold
max_len    = 1000 #maximum length that my transformer will accept.....all sentence will be padded

In [14]:
def make_batch():
    batch = []
    positive = negative = 0
    while positive != batch_size / 2 or negative != batch_size / 2:
        
        #randomly choose two sentence
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        tokens_a, tokens_b            = token_list[tokens_a_index], token_list[tokens_b_index]
        
        #1. token embedding - add CLS and SEP
        input_ids = [word2id['[CLS]']] + tokens_a + [word2id['[SEP]']] + tokens_b + [word2id['[SEP]']]
        
        #2. segment embedding - which sentence is 0 and 1
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        
        #3 masking
        n_pred = min(max_mask, max(1, int(round(len(input_ids) * 0.15))))
        #get all the pos excluding CLS and SEP
        candidates_masked_pos = [i for i, token in enumerate(input_ids) if token != word2id['[CLS]'] 
                                 and token != word2id['[SEP]']]
        shuffle(candidates_masked_pos)
        masked_tokens, masked_pos = [], []
        #simply loop and mask accordingly
        for pos in candidates_masked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.1:  #10% replace with random token
                index = randint(0, vocab_size - 1)
                input_ids[pos] = word2id[id2word[index]]
            elif random() < 0.8:  #80 replace with [MASK]
                input_ids[pos] = word2id['[MASK]']
            else: 
                pass
            
        #4. pad the sentence to the max length
        n_pad = max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)
        
        #5. pad the mask tokens to the max length
        if max_mask > n_pred:
            n_pad = max_mask - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)
        
        #6. check whether is positive or negative
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])
            negative += 1
        
    return batch
        

In [15]:
batch = make_batch()
print(len(batch))

input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

6


In [16]:
input_ids.shape, segment_ids.shape, masked_tokens.shape, masked_pos.shape, isNext

(torch.Size([6, 1000]),
 torch.Size([6, 1000]),
 torch.Size([6, 5]),
 torch.Size([6, 5]),
 tensor([0, 0, 0, 1, 1, 1]))

In [27]:
# len(input_ids[0])
# input_ids[0]

In [17]:
masked_tokens

tensor([[ 89,  66,  98,  31,  40],
        [ 82,  11,  40,  35,  51],
        [ 88,  41,  49,  30,  51],
        [ 67, 101,  65,  52,  11],
        [ 59,  61,  40,  13,  66],
        [ 40,  51,  98,  71,  94]])

In [18]:
masked_pos

tensor([[15, 39, 32, 21, 23],
        [42, 22, 11, 29, 28],
        [33, 38, 49,  4, 21],
        [14, 26, 49, 41, 36],
        [26, 52,  4, 43, 10],
        [41,  8, 27, 14, 33]])

## 4. Modelling

Recall that BERT only uses the encoder.

BERT has the following components:

- Embedding layers
- Attention Mask
- Encoder layer
- Multi-head attention
- Scaled dot product attention
- Position-wise feed-forward network
- BERT (assembling all the components)

### 4.1 Embedding

In [None]:
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(max_len, d_model)      # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        #x, seg: (bs, len)
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (len,) -> (bs, len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

### 4.2 Attention Mask

In [28]:
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

In [32]:
# seq_q = torch.tensor([[1, 2, 3, 4], [1, 2, 3, 0]])
# seq_k = torch.tensor([[1, 2, 3, 4], [1, 2, 3, 0]])
# pad_mask = get_attn_pad_mask(seq_q, seq_k)
# pad_mask.shape

torch.Size([2, 4, 4])

In [35]:
print(get_attn_pad_mask(input_ids, input_ids).shape)

torch.Size([6, 1000, 1000])


### 4.3 Encoder

The encoder has two main components: 

- Multi-head Attention
- Position-wise feed-forward network

First let's make the wrapper called `EncoderLayer`

In [36]:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn       = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn