### 0. Import libraries

In [1]:
import torch
import torch.nn as nn
import numpy as np
import datasets
import nltk

import torch.optim as optim
import torch.nn.functional as F
from random import *

import time
import os
import math
import re

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

### 1. Load Data

In [2]:
import spacy

with open('./data/harrypotter.txt', 'r') as f:
    raw_text = f.read() # load the set

In [3]:
raw_text



### 2. Preprocessing

- tokenization

In [4]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(raw_text)

In [5]:
for sent in doc.sents:
    print(sent)

Harry Potter and the Sorcerer's Stone 

CHAPTER ONE 

THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.
They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. 


Mr. Dursley was the director of a firm called Grunnings, which made drills.
He was a big, beefy man with hardly any neck, although he did have a very large mustache.
Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors.
The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. 


The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it.
They didn't think they could bear it if anyone found out about the Potte

In [6]:
sentences = list(doc.sents) 
sentences

[Harry Potter and the Sorcerer's Stone 
 
 CHAPTER ONE 
 
 THE BOY WHO LIVED 
 
 Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.,
 They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. 
 ,
 Mr. Dursley was the director of a firm called Grunnings, which made drills.,
 He was a big, beefy man with hardly any neck, although he did have a very large mustache.,
 Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors.,
 The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. 
 ,
 The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it.,
 They didn't think they could bear it if anyone foun

In [7]:
len(sentences)

6184

- lower case and clean the symbols

In [8]:
# transform to lower text
lower_text = []

for x in sentences:
    lower_x = x.text.lower()
    lower_text.append(lower_x)

lower_text

["harry potter and the sorcerer's stone \n\nchapter one \n\nthe boy who lived \n\nmr. and mrs. dursley, of number four, privet drive, were proud to say that they were perfectly normal, thank you very much.",
 "they were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. \n\n",
 'mr. dursley was the director of a firm called grunnings, which made drills.',
 'he was a big, beefy man with hardly any neck, although he did have a very large mustache.',
 'mrs. dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors.',
 'the dursleys had a small son called dudley and in their opinion there was no finer boy anywhere. \n\n',
 'the dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it.',
 "they didn't think they could bea

In [9]:
# clean the symbols
cleaned_dataset = []

for x in lower_text:
    cleaned_x = re.sub('[.,!?\\\n&;@-]', '', x)
    cleaned_dataset.append(cleaned_x)

cleaned_dataset

["harry potter and the sorcerer's stone chapter one the boy who lived mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much",
 "they were the last people you'd expect to be involved in anything strange or mysterious because they just didn't hold with such nonsense ",
 'mr dursley was the director of a firm called grunnings which made drills',
 'he was a big beefy man with hardly any neck although he did have a very large mustache',
 'mrs dursley was thin and blonde and had nearly twice the usual amount of neck which came in very useful as she spent so much of her time craning over garden fences spying on the neighbors',
 'the dursleys had a small son called dudley and in their opinion there was no finer boy anywhere ',
 'the dursleys had everything they wanted but they also had a secret and their greatest fear was that somebody would discover it',
 "they didn't think they could bear it if anyone found out about the potters"

- numericalization

In [10]:
word_list = ' '.join(cleaned_dataset).split()
word_list = list(set(word_list))
word_list

['"when\'s',
 'fortune"',
 'mantelpiece',
 'ghost',
 'valuable"',
 'particularly',
 'chivalry',
 'doughnut',
 'proof"',
 '"let',
 '"wizards',
 'stool',
 'railview',
 "abou'",
 'sleeve',
 'grown',
 "game's",
 '"vernon"',
 'then"',
 'splendid',
 'echoes',
 'sadness',
 'hesitated',
 'mustached',
 "twins'",
 'pork',
 '"me"',
 'pure',
 'behavior',
 'placed',
 'main',
 'itself"',
 '"might',
 'fitted',
 'balls)',
 'smelled',
 'injured',
 'twanging',
 'bit',
 'moonlight',
 'multilevel',
 'volume',
 'strange"',
 'goodbye',
 'facedown',
 'leather',
 'similar)',
 'wishing',
 'hooknosed',
 'vain"',
 "studyin'",
 'counter',
 'suspicion',
 'slope',
 'both"',
 "dean's",
 'badger',
 'nails',
 'meanwhile',
 '"dursley"',
 '"coming',
 'gasp',
 'halloween',
 'flame',
 'beginning',
 'hot',
 'passages',
 'duffers',
 'yelps',
 'ask',
 'breakfast',
 "wasn't",
 'bushy',
 'torn',
 'office',
 'confessed',
 'hands',
 'advancing',
 'provoked',
 'iron',
 'fleet',
 'hand"',
 "takin'",
 'coming"',
 'cant',
 'ripped',

In [11]:
word2id = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3, '[UNK]': 4}

for i,w in enumerate(word_list):
    word2id[w] = i+5 # reserve the first 0-3 for PAD, CLS, SEP, MASK
                     # ATTENTION MASK DOESN'T PAY ATTENTION TO PADDING TOKENS

In [12]:
id2word = {}

for i,w in enumerate(word2id):
    id2word[i] = w

In [13]:
id2word

{0: '[PAD]',
 1: '[CLS]',
 2: '[SEP]',
 3: '[MASK]',
 4: '[UNK]',
 5: '"when\'s',
 6: 'fortune"',
 7: 'mantelpiece',
 8: 'ghost',
 9: 'valuable"',
 10: 'particularly',
 11: 'chivalry',
 12: 'doughnut',
 13: 'proof"',
 14: '"let',
 15: '"wizards',
 16: 'stool',
 17: 'railview',
 18: "abou'",
 19: 'sleeve',
 20: 'grown',
 21: "game's",
 22: '"vernon"',
 23: 'then"',
 24: 'splendid',
 25: 'echoes',
 26: 'sadness',
 27: 'hesitated',
 28: 'mustached',
 29: "twins'",
 30: 'pork',
 31: '"me"',
 32: 'pure',
 33: 'behavior',
 34: 'placed',
 35: 'main',
 36: 'itself"',
 37: '"might',
 38: 'fitted',
 39: 'balls)',
 40: 'smelled',
 41: 'injured',
 42: 'twanging',
 43: 'bit',
 44: 'moonlight',
 45: 'multilevel',
 46: 'volume',
 47: 'strange"',
 48: 'goodbye',
 49: 'facedown',
 50: 'leather',
 51: 'similar)',
 52: 'wishing',
 53: 'hooknosed',
 54: 'vain"',
 55: "studyin'",
 56: 'counter',
 57: 'suspicion',
 58: 'slope',
 59: 'both"',
 60: "dean's",
 61: 'badger',
 62: 'nails',
 63: 'meanwhile',
 64:

In [14]:
vocab_size = len(word2id)
vocab_size

7485

- token list

In [15]:
token_list = [] # word2id

for sentence in cleaned_dataset:
    arr = []
    for word in sentence.split():
        arr.append(word2id[word])
    token_list.append(arr)

In [16]:
len(token_list)

6184

### 3. DataLoader

- Token embeddings (randomly picked two sentences and then separate them with CLS and SEP)
- Next sentence prediction (segment embedding) (for seperating two sentences [0 0 0 0 1 1 1 1]). Do they really followed by each other (POSTIVE AND NEGATIVE)
- Masking (In the original paper, BERT randomly assigns to 15% of the sequence ==> 80% is replaced with masks, 10% is replaced with random tokens, 10% is original)
- Padding (transformer needs to limit seq length and once we mask, we will add padding)

In [17]:
batch_size = 6
max_mask   = 5 # calculated roughtly around 15% 
max_len    = 1000 # max length by transformer would accept # all sentences would be padded

In [18]:
def make_batch():
    batch = []
    positive = negative = 0
    while positive != batch_size / 2 or negative !=  batch_size / 2:

        # randomly choose two sentences 
        token_a_index, token_b_index = randrange(len(sentences)), randrange(len(sentences))
        token_a, token_b             = token_list[token_a_index], token_list[token_b_index]

        # 1. token embedding - ADD CLS AND SEP
        input_ids = [word2id['[CLS]']] + token_a + [word2id['[SEP]']] + token_b + [word2id['[SEP]']]
        # print('input_ids: ', input_ids)

        # 2. segment embedding - which sentence is 0 and 1
        segment_id = [0] * (1 + len(token_a) + 1) + [1] * (len(token_b) + 1)

        # 3. masking # masked 15%, but should be at least 1 but does not exceed max_mask
        n_pred = min(max_mask, max(1, int(round(len(input_ids) * 0.15))))

        # Calculate 15% of the total number of tokens
        # Round the percentage to the nearest integer
        # Ensure the rounded percentage is at least 1
        # Ensure n_pred does not exceed max_mask

        # get all the pos excluding CLS and SEP
        candidates_masked_pos = [i for i,token in enumerate(input_ids) if token != word2id['[CLS]'] and token != word2id['[SEP]']]

        # and shuffle them
        shuffle(candidates_masked_pos)

        masked_tokens, masked_pos = [], []
        # simply loop and mask accordingly

         #simply loop and change the input_ids to [MASK]
        for pos in candidates_masked_pos[:n_pred]:
            masked_pos.append(pos)  #remember the position
            masked_tokens.append(input_ids[pos]) #remember the tokens

            #80% replace with a [MASK], but 10% will replace with a random token
            if random() < 0.1:  # 10%

                index = randint(0, vocab_size - 1) # random index in vocabulary
                input_ids[pos] = word2id[id2word[index]] # replace

            elif random() < 0.8:  # 80%
                input_ids[pos] = word2id['[MASK]'] # make mask

            else:  #10% do nothing
                pass


        # 4. pad the sentence to the max length        # input = output
        n_pad = max_len - len(input_ids)
        
        input_ids.extend([0] * n_pad)
        segment_id.extend([0] * n_pad)

        # 5. pad the mask tokens to the max length     # output = input
        if max_mask > n_pred:
            n_pad = max_mask - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        # 6. check whether it is positive or negative which means first sentence really comes before the second sentence # make sure positive is exactly half the batch size

        if token_a_index + 1 == token_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_id, masked_tokens, masked_pos, True]) # isNext
            positive += 1
        elif token_a_index + 1 != token_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_id, masked_tokens, masked_pos, False]) # NotNext
            negative += 1
    return batch

In [19]:
batch = make_batch()
len(batch)

6

In [20]:
input_ids, segment_id, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

In [21]:
input_ids.shape, segment_id.shape, masked_tokens.shape, masked_pos.shape, isNext

(torch.Size([6, 1000]),
 torch.Size([6, 1000]),
 torch.Size([6, 5]),
 torch.Size([6, 5]),
 tensor([0, 0, 0, 1, 1, 1]))

In [22]:
masked_tokens

tensor([[6878, 5161, 4598, 3174, 6080],
        [1990, 2040, 1751,  350, 6485],
        [3108, 5116, 5094, 5453,  317],
        [5078,  172, 3111, 5092, 4998],
        [6949, 1687,    0,    0,    0],
        [6837, 2456, 3871, 1215,    0]])

In [23]:
id2word[4644]

'start"'

#### 3.1 Embedding

In [24]:
d_model    = 768  # Embedding Size
n_segments = 2

In [25]:
max_len

1000

In [26]:
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        
        self.tok_embed = nn.Embedding(vocab_size, d_model) # It maps token indices to continuous-valued vectors that can capture its semantic and syntactic properties.
        self.pos_embed = nn.Embedding(max_len, d_model) # max_len = sequence_max_len
        self.seg_embed = nn.Embedding(n_segments, d_model) 
        self.norm      = nn.LayerNorm(d_model)
        
    def forward(self, x, seg):
        
        seq_len   = x.size(1) # (bs, seq_len) # x input_ids
        pos       = torch.arange(seq_len, dtype=torch.long, device=device)
        pos       = pos.unsqueeze(0).expand_as(x)  # (len,) -> (bs, len) 
        #This expands the tensor pos to match the shape of tensor x. 
        #It repeats the tensor along each dimension to match the corresponding dimension of x.

        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

#### 3.2 Attention mask

- In summary, the get_attn_pad_mask function generates an attention mask that masks out padding tokens in the key sequences, ensuring that attention scores are not calculated for those positions during computation. This helps the model focus only on relevant tokens and improves computational efficiency.

In [27]:
def get_attn_pad_mask(seq_q, seq_k):
    
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    # .eq(0) if the corresponding element in seq_k is equal to zero (indicating a padding token), and False otherwise.
    
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

In [28]:
# testing the attention mask
print(get_attn_pad_mask(input_ids, input_ids).shape)

torch.Size([6, 1000, 1000])


### 4. Model

Encoder has two main components:

- Multi-head attention
- Position-wise feed-forward network

In [29]:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn       = PoswiseFeedForwardNet()

    # enc_self_attn_mask use get_attn_pad_mask function to ensure that attention is not paid to padding tokens during computation.

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs       = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        # print('enc_outputs: ', enc_outputs.shape)
        # print('enc_attn: ', attn.shape)
        return enc_outputs, attn

#### 4.1 ScaledDot Product

In [30]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores  = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn    = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        # print('scaled_attn: ', attn.shape)
        # print('context: ', context.shape)
        return context, attn 

#### 4.2 MultiHeadAttention

In [31]:
n_layers   = 6    # number of Encoder of Encoder Layer
n_heads    = 8    # number of heads in Multi-Head Attention
d_model    = 768  # Embedding Size
d_ff       = 768 * 4  # 4*d_model, FeedForward dimension
d_k = d_v  = 64  # dimension of K(=Q), V # head_dim = hid_dim / n*heads = 
n_segments = 2

In [32]:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()

        self.W_Q = nn.Linear(d_model, d_k * n_heads).to(device)
        self.W_K = nn.Linear(d_model, d_k * n_heads).to(device)
        self.W_V = nn.Linear(d_model, d_v * n_heads).to(device)

    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1).to(device) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context       = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output        = nn.Linear(n_heads * d_v, d_model).to(device)(context)

        # print('multi_head_attn: ', attn.shape)
        return nn.LayerNorm(d_model).to(device)(output + residual), attn # output: [batch_size x len_q x d_model]


#### 4.3 PostionWise FeedForward Layer

In [33]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)
        return self.fc2(F.gelu(self.fc1(x)))


#### 4.4 Putting them together

In [34]:
class BERT(nn.Module):
    def __init__(self):

        super(BERT, self).__init__()
        self.embedding  = Embedding().to(device)
        self.layers     = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        self.fc         = nn.Linear(d_model, d_model).to(device)
        self.activ      = nn.Tanh()
        self.linear     = nn.Linear(d_model, d_model).to(device)
        self.norm       = nn.LayerNorm(d_model).to(device)
        self.classifier = nn.Linear(d_model, 2).to(device)

        # decoder is shared with embedding layer
        embed_weight        = self.embedding.tok_embed.weight
        n_vocab, n_dim      = embed_weight.size()
        self.decoder        = nn.Linear(n_dim, n_vocab, bias=False).to(device)
        self.decoder.weight = embed_weight
        self.decoder_bias   = nn.Parameter(torch.zeros(n_vocab)).to(device)

    def forward(self, input_ids, segment_ids, masked_pos):
        
        input_ids = input_ids.to(self.embedding.tok_embed.weight.device)
        segment_ids = segment_ids.to(self.embedding.tok_embed.weight.device)
        masked_pos = masked_pos.to(self.embedding.tok_embed.weight.device)

        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)

        # # print('bert_output: ', output)
        # print('-----------------------')
        # print('bert_output_shape: ', output.shape)
        # # print('enc_self_attn: ', enc_self_attn)
        # print('enc_self_attn_shape: ', enc_self_attn.shape)
        # print('enc_attn_mask_shape ', enc_self_attn_mask.shape)
        # print('-----------------------')


        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]
        
        # 1. predict next sentence
        # it will be decided by first token(CLS)
        h_pooled   = self.activ(self.fc(output[:, 0])) # [batch_size, d_model]
        logits_nsp = self.classifier(h_pooled) # [batch_size, 2]

        # 2. predict the masked token
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked  = self.norm(F.gelu(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]

        return output, logits_lm, logits_nsp

### 5. Training

In [35]:
model = BERT()


In [36]:
num_epoch = 100
model     = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

model.to(device)

print_interval = 10

# Start the timer

start_time = time.time()


# Move tensors to the same device
# input_ids = input_ids.to(device)
# segment_ids = segment_ids.to(device)
# masked_tokens = masked_tokens.to(device)
# masked_pos = masked_pos.to(device)
# isNext = isNext.to(device)

for epoch in range(num_epoch):
    optimizer.zero_grad()

    batch = make_batch()


        # Move tensors to the same device
    input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

    input_ids     = input_ids.to(device)
    segment_ids   = segment_ids.to(device)
    masked_tokens = masked_tokens.to(device)
    masked_pos    = masked_pos.to(device)
    isNext        = isNext.to(device)

    # logits_lm = logits_lm.to(device)
    # logits_nsp = logits_nsp.to(device)
    #logits_lm: (bs, max_mask, vocab_size) ==> (6, 5, 34)
    #logits_nsp: (bs, yes/no) ==> (6, 2)

    _, logits_lm, logits_nsp = model(input_ids, segment_ids, masked_pos)  



    #1. mlm loss
    #logits_lm.transpose: (bs, vocab_size, max_mask) vs. masked_tokens: (bs, max_mask)
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
    loss_lm = (loss_lm.float()).mean()
    #2. nsp loss
    #logits_nsp: (bs, 2) vs. isNext: (bs, )
    loss_nsp = criterion(logits_nsp, isNext) # for sentence classification
    # loss_nsp = loss_nsp.to(device)
    
    #3. combine loss
    loss = loss_lm + loss_nsp

    loss.backward()
    optimizer.step()
    
    # Print the loss every 100 epochs
    if (epoch + 1) % print_interval == 0:
        current_time = time.time()
        elapsed_time = current_time - start_time
        mins, secs   = divmod(elapsed_time, 60)
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss.item():2.6f} | Time: {int(mins):02d}m {int(secs):02d}s")

# End of training
end_time   = time.time()
total_time = end_time - start_time
total_mins, total_secs = divmod(total_time, 60)
print(f"Training completed in {int(total_mins):02d}m {int(total_secs):02d}s")

Epoch     10 | Loss: 182.213898 | Time: 00m 06s
Epoch     20 | Loss: 80.424767 | Time: 00m 13s
Epoch     30 | Loss: 66.025452 | Time: 00m 20s
Epoch     40 | Loss: 56.642830 | Time: 00m 28s
Epoch     50 | Loss: 44.878376 | Time: 00m 35s
Epoch     60 | Loss: 37.654888 | Time: 00m 42s
Epoch     70 | Loss: 46.874786 | Time: 00m 50s
Epoch     80 | Loss: 31.160089 | Time: 00m 58s
Epoch     90 | Loss: 24.687035 | Time: 01m 05s
Epoch    100 | Loss: 13.886909 | Time: 01m 13s
Training completed in 01m 13s


In [37]:
# save the model
torch.save(model.state_dict(), './model/bert_from_scratch.pth')

In [38]:
Data = {
    'word_list': word_list,
    'word2id': word2id,
    'id2word': id2word,
    'token_list': token_list,
    'vocab_size': vocab_size,
    'max_mask' : max_mask,
    'max_len' : max_len,
    'batch_size' : batch_size
}

In [39]:
import pickle
pickle.dump(Data,open('./data/Data.pkl', 'wb'))

In [40]:
batch[2]

[[1,
  438,
  4271,
  1156,
  2,
  1168,
  5647,
  1215,
  775,
  6837,
  3,
  5120,
  4166,
  5116,
  3,
  3325,
  5647,
  4400,
  2528,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

### 6. Inference

In [1]:
# Predict mask tokens ans isNext
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[2]))
# input_ids, segment_id, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))
print([id2word[w.item()] for w in input_ids[0] if id2word[w.item()] != '[PAD]'])

_, logits_lm, logits_nsp = model(input_ids, segment_ids, masked_pos)
logits_lm  = logits_lm.cpu()
logits_nsp = logits_nsp.cpu()
#logits_lm:  (1, max_mask, vocab_size) ==> (1, 5, 34)
#logits_nsp: (1, yes/no) ==> (1, 2)

#predict masked tokens
#max the probability along the vocab dim (2), [1] is the indices of the max, and [0] is the first value
logits_lm = logits_lm.data.max(2)[1][0].data.numpy() 
#note that zero is padding we add to the masked_tokens
print('masked tokens (words) : ',[id2word[pos.item()] for pos in masked_tokens[0]])
print('masked tokens list : ',[pos.item() for pos in masked_tokens[0]])
print('masked tokens (words) : ',[id2word[pos.item()] for pos in logits_lm])
print('predict masked tokens list : ', [pos for pos in logits_lm])

#predict nsp
logits_nsp = logits_nsp.data.max(1)[1][0].data.numpy()
print(logits_nsp)
print('isNext : ', True if isNext else False)
print('predict isNext : ',True if logits_nsp else False)

NameError: name 'torch' is not defined