In [None]:
import random
# Seed Python's global `random` generator so the random draws made later in the
# notebook repeat from run to run.
# NOTE(review): only the stdlib generator is seeded in this cell; no torch/numpy
# seeding is visible in this part of the notebook.
random.seed(10)

In [None]:
import re
import math
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from random import *

# **Loading Text**

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

import os

# Drive folder that holds the notebook's input file (joined with the file name
# when the corpus is opened).
path = '/content/drive/MyDrive/Colab Notebooks/Cap11'

Mounted at /content/drive


In [None]:
# Read the whole corpus into memory. The context manager closes the file handle
# deterministically, and the explicit encoding keeps decoding independent of the
# runtime's locale default.
# NOTE(review): assumes texto.txt is UTF-8 encoded - confirm.
with open(os.path.join(path, 'texto.txt'), 'r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

# **Data Preprocessing**

In [None]:
# Lower-case the corpus, delete the characters , . ! ? - and keep one entry per
# line of the file (blank lines stay as empty strings).
sentences = (
    re.compile('[,.!?\\-]')
    .sub('', text.lower())
    .split('\n')
)

In [None]:
# Inspect the cleaned, lower-cased lines.
print(sentences)

In [None]:
# Vocabulary: every distinct whitespace-separated word of the corpus.
# Sorted because the iteration order of a set of strings changes between
# interpreter sessions (string hash randomization); an unsorted list would give
# the same word a different id after every kernel restart.
word_list = sorted(set(' '.join(sentences).split()))

In [None]:
# Inspect the vocabulary words.
print(word_list)

In [None]:
# Token -> id mapping. The reserved tokens take ids 0-3, in this order.
word_dict = {
    token: token_id
    for token_id, token in enumerate(['[PAD]', '[CLS]', '[SEP]', '[MASK]'])
}

In [None]:
# Display the token -> id mapping.
word_dict

In [None]:
# Corpus words receive the ids that follow the four reserved tokens (4, 5, ...),
# in word_list order.
word_dict.update(
    (word, word_id) for word_id, word in enumerate(word_list, start=4)
)

In [None]:
# Inspect the mapping after the corpus words have been added.
print(word_dict)

In [None]:
# Reverse lookup (id -> token). Enumerating the dict walks its keys in insertion
# order, which is the order in which the ids were handed out.
number_dict = dict(enumerate(word_dict))

In [None]:
# Inspect the id -> token mapping.
print(number_dict)

In [None]:
# Vocabulary size: the number of entries in word_dict (reserved tokens plus
# distinct corpus words).
vocab_size = len(word_dict)
print(vocab_size)

70


In [None]:
# Will hold one list of token ids per sentence.
token_list = []

In [None]:
# Encode every sentence as a list of token ids. Rebuilding the list in a single
# assignment (instead of appending to an existing list) keeps this cell
# idempotent: running it again cannot duplicate the sentences in token_list.
token_list = [
    [word_dict[word] for word in sentence.split()]
    for sentence in sentences
]

In [None]:
# Peek at the first 29 characters of the raw text.
text[0:29]

In [None]:
# Token ids of the first sentence.
token_list[0]

# **Hyperparameters**

In [None]:
# --- Batch construction ---
batch_size = 6   # examples per batch; split half/half between the two pair labels
max_len = 100    # every token-id sequence is zero-padded to this length
max_pred = 7     # upper bound on masked positions per example

# --- Embeddings ---
n_segments = 2   # rows of the segment embedding table (sentence A / sentence B)
d_model = 768    # embedding width

# --- Remaining model/training settings (not used by the cells shown up to the
# Embedding class; presumably consumed by later cells - confirm) ---
dropout = 0.2
n_layers = 6
n_heads = 12
d_ff = d_model * 4
d_k, d_v = 64, 64   # equals d_model / n_heads for the values above
NUM_EPOCHS = 50

In [None]:
def make_batch():
  """Build one balanced batch of BERT pre-training examples.

  Sentence pairs are drawn at random (using the module-level ``sentences`` and
  ``token_list``) until the batch holds ``batch_size // 2`` pairs in which
  sentence B directly follows sentence A in the corpus and ``batch_size // 2``
  pairs in which it does not. Each pair is laid out as ``[CLS] A [SEP] B [SEP]``.
  About 15% of its word positions (at least one when any exist, at most
  ``max_pred``) are chosen for prediction: 80% of them are replaced by
  ``[MASK]``, 10% by a random corpus word and 10% are left unchanged.

  Returns:
    A list of ``batch_size`` examples, each
    ``[input_ids, segment_ids, masked_tokens, masked_pos, is_next]``.
    ``input_ids`` and ``segment_ids`` are zero-padded to ``max_len``;
    ``masked_tokens`` (original ids) and ``masked_pos`` (their positions) are
    zero-padded to ``max_pred``; ``is_next`` is a bool.

  Raises:
    ValueError: if ``batch_size`` is odd, if there are fewer than two sentences
      (no "next sentence" pair can exist), or if a sampled pair is longer than
      ``max_len``.
  """
  # Imported under a private alias: the notebook runs both `import random` and
  # `from random import *`, so what the bare name `random` refers to depends on
  # cell execution order. These module-level functions share the generator that
  # `random.seed(...)` seeds.
  import random as _rng

  if batch_size % 2:
    # An odd size can never be split evenly; the loop below would not terminate.
    raise ValueError('batch_size must be even, got %d' % batch_size)
  half = batch_size // 2
  if half and len(sentences) < 2:
    raise ValueError('need at least two sentences to sample a next-sentence pair')

  cls_id, sep_id, mask_id = word_dict['[CLS]'], word_dict['[SEP]'], word_dict['[MASK]']
  n_special = 4  # ids 0-3 are reserved for [PAD], [CLS], [SEP], [MASK]

  batch = []
  positive = negative = 0

  while positive < half or negative < half:
    tokens_a_index = _rng.randrange(len(sentences))
    tokens_b_index = _rng.randrange(len(sentences))
    is_next = tokens_a_index + 1 == tokens_b_index

    # Check the quota first so no work is spent on a pair that would be dropped.
    if (positive if is_next else negative) >= half:
      continue

    tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
    input_ids = [cls_id] + tokens_a + [sep_id] + tokens_b + [sep_id]
    if len(input_ids) > max_len:
      # Such a pair cannot be padded to max_len and would make the batch ragged.
      raise ValueError(
          'sentence pair (%d, %d) needs %d tokens but max_len is %d'
          % (tokens_a_index, tokens_b_index, len(input_ids), max_len))
    # Segment 0 covers [CLS] A [SEP]; segment 1 covers B [SEP].
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

    n_pred = min(max_pred, max(1, round(len(input_ids) * 0.15)))
    cand_masked_pos = [i for i, token in enumerate(input_ids)
                       if token != cls_id and token != sep_id]
    _rng.shuffle(cand_masked_pos)

    masked_tokens, masked_pos = [], []
    for pos in cand_masked_pos[:n_pred]:
      masked_pos.append(pos)
      masked_tokens.append(input_ids[pos])

      draw = _rng.random()
      if draw < 0.8:
        input_ids[pos] = mask_id
      elif draw < 0.9:
        # Random replacement draws from corpus words only, never a reserved token.
        input_ids[pos] = _rng.randrange(n_special, vocab_size)
      # Otherwise (remaining 10%) the original token is kept.

    n_pad = max_len - len(input_ids)
    input_ids.extend([0] * n_pad)
    segment_ids.extend([0] * n_pad)

    # Pad by the number of positions actually selected: a pair with no word
    # tokens selects fewer than n_pred positions.
    n_pred_pad = max_pred - len(masked_tokens)
    masked_tokens.extend([0] * n_pred_pad)
    masked_pos.extend([0] * n_pred_pad)

    batch.append([input_ids, segment_ids, masked_tokens, masked_pos, is_next])
    if is_next:
      positive += 1
    else:
      negative += 1

  return batch

In [None]:
def get_attn_pad_masked(seq_q, seq_k):
  """Build the attention mask that flags padded key positions.

  Args:
    seq_q: 2-D tensor of query token ids; only its second dimension is used.
    seq_k: 2-D tensor of key token ids; id 0 marks padding.

  Returns:
    Boolean tensor of shape (rows of seq_k, query length, key length) that is
    True wherever the key token id is 0. Every query row of an example shares
    the same flags (expanded view, no copy).
  """
  _, query_len = seq_q.size()
  n_rows, key_len = seq_k.size()
  key_is_pad = seq_k.data.eq(0)
  return key_is_pad.unsqueeze(1).expand(n_rows, query_len, key_len)

In [None]:
# Draw one batch of training examples.
batch = make_batch()

In [None]:
# Transpose the batch (list of examples -> one column per field) and wrap each
# column in a LongTensor.
input_ids, segment_ids, masked_tokens, masked_pos, isNext = (
    torch.LongTensor(column) for column in zip(*batch)
)

In [None]:
# Token ids of the first example (masked and zero-padded).
input_ids[0]

In [None]:
# Segment ids of the first example.
segment_ids[0]

In [None]:
# Original ids of the tokens selected for prediction in the first example.
masked_tokens[0]

In [None]:
# Positions of those selected tokens inside input_ids[0].
masked_pos[0]

In [None]:
# Next-sentence label of the first example.
isNext[0]

In [None]:
# Show one row of the padding mask next to the first example's token ids: the
# mask is True exactly where the id is 0.
get_attn_pad_masked(input_ids, input_ids)[0][0], input_ids[0]

# **Creating Model**

In [None]:
# GeLU Activation Function
def gelu(x):
  """Exact, erf-based GELU: ``x * 0.5 * (1 + erf(x / sqrt(2)))``, element-wise.

  Args:
    x: tensor of any shape.

  Returns:
    Tensor with the same shape as ``x``.
  """
  erf_term = torch.erf(x / math.sqrt(2.0))
  half_x = x * 0.5
  return half_x * (1.0 + erf_term)

In [None]:
class Embedding(nn.Module):
  """Input embedding: token + position + segment embeddings, then LayerNorm.

  Table sizes come from the module-level hyperparameters ``vocab_size``,
  ``max_len``, ``n_segments`` and ``d_model``.
  """

  def __init__(self):
    super(Embedding, self).__init__()

    # One learned vector per vocabulary entry.
    self.tok_embed = nn.Embedding(vocab_size, d_model)

    # One learned vector per position; `max_len` is the padded sequence length
    # defined with the other hyperparameters.
    self.pos_embed = nn.Embedding(max_len, d_model)

    # One learned vector per segment id.
    self.seg_embed = nn.Embedding(n_segments, d_model)

    self.norm = nn.LayerNorm(d_model)

  def forward(self, x, seg):
    """Embed a batch of token ids.

    Args:
      x: 2-D LongTensor of token ids, (batch, seq_len); seq_len must not exceed
        the size of the position table.
      seg: LongTensor of segment ids with the same shape as ``x``.

    Returns:
      Tensor of shape (batch, seq_len, embedding width).
    """
    seq_len = x.size(1)

    # Positions 0 .. seq_len-1. torch.arange(a, b) means "from a up to b", so
    # the sequence length must be the only bound passed. Created on x's device
    # so the lookup also works when the inputs live on a GPU.
    pos = torch.arange(seq_len, dtype=torch.long, device=x.device)

    # Share the same position row across every example in the batch.
    pos = pos.unsqueeze(0).expand_as(x)

    embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)

    return self.norm(embedding)