In [1]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, BertModel
import torch.optim.lr_scheduler as lr_scheduler
from pytorch_model_summary import summary
from torch import optim
import torch.nn as nn
import torch

import numpy as np
import unicodedata
import random
import math
import re

2023-12-18 19:28:50.876784: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
device = "cuda"

# Process Data

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_encoder = BertModel.from_pretrained('bert-base-uncased')

In [4]:
tokenized = tokenizer.encode_plus("hello my name is nate",
                                  max_length=20,
                                  pad_to_max_length=True,
                                  return_attention_mask=True,
                                  return_tensors="pt")
with torch.no_grad():
    encodings = bert_encoder(**tokenized)
    
last_hidden_states = encodings.last_hidden_state
bert_encodings = last_hidden_states.mean(dim=1).squeeze().numpy()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [5]:
tokenizer.encode("/u Volume'ten'")

[101, 1013, 1057, 3872, 1005, 2702, 1005, 102]

In [6]:
tokenized

{'input_ids': tensor([[ 101, 7592, 2026, 2171, 2003, 8253,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [7]:
SOS_token = 101
EOS_token = 102

In [8]:
bert_encoder.parameters

<bound method Module.parameters of BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout

In [9]:
tokenized

{'input_ids': tensor([[ 101, 7592, 2026, 2171, 2003, 8253,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [10]:
last_hidden_states.shape

torch.Size([1, 20, 768])

In [11]:
def encodeString(text, tokenizer, encoder):
    indexed = tokenizer.encode_plus(text,
                                    max_length=20,
                                    pad_to_max_length=True,
                                    return_attention_mask=True,
                                    return_tensors="pt")
    attention_mask = indexed["attention_mask"]
    with torch.no_grad():
        encodings = encoder(**indexed)
        
    last_hidden_states = encodings.last_hidden_state
    return last_hidden_states, attention_mask

In [12]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import random

# nltk.download('wordnet')

def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    return list(set(synonyms))

def synonym_replacement(sentence, num_replacements=1):
    words = word_tokenize(sentence)
    
    for i in range(len(words)):
        if random.random() < num_replacements / len(words):
            synonyms = get_synonyms(words[i])
            if synonyms:
                words[i] = random.choice(synonyms)

    return ' '.join(words)

# Example usage:
original_sentence = "Set the volume to eight"
augmented_sentence = synonym_replacement(original_sentence, num_replacements=2)
print("Original Sentence:", original_sentence)
print("Augmented Sentence:", augmented_sentence)

Original Sentence: Set the volume to eight
Augmented Sentence: Set the volume to eighter_from_Decatur


In [13]:
def readLangs():
    print("Reading lines...")

    lines = []
    counter = 0

    with open("/media/nathanmon/389E28739E282BB6/Users/Natha/Datasets/MyJarvisConversation/conversation.txt", "r") as f:
        for line in f.readlines():
            if line[0] == "U":
                lines.append("")
                lines[counter] += line[6:] + "/t"
            elif line[0] == "J":
                line = line.replace("/u", "/u ")
                lines[counter] += line[8:]
                counter += 1
                
#                 lines.append("")
#                 lines[counter] += synonym_replacement(lines[counter-1].split("/t")[0], num_replacements=2) + "/t"
#                 lines[counter] += lines[counter-1].split("/t")[1]
#                 counter += 1
                

    return lines

In [14]:
MAX_LENGTH = 20

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [15]:
def prepareData():
    pairs = readLangs()
    pairs = filterPairs(pairs)
    for i, pair in enumerate(pairs):
        pairs[i] = pair.split("/t")
    return pairs

In [16]:
pairs = prepareData()
print(random.choice(pairs))

Reading lines...
['Why was Donald Trump arrested\n', "/u Wiki'Why was Donald Trump arrested'\n"]


In [17]:
def get_dataloader(tokenizer, encoder, encoding_size, batch_size):
    pairs = prepareData()
    train_pairs, val_pairs = pairs[:225], pairs[225:]

    train_n = len(train_pairs)
    val_n = len(val_pairs)
    train_input_ids = np.zeros((train_n, MAX_LENGTH, encoding_size), dtype=np.float32)
    train_target_ids = np.zeros((train_n, MAX_LENGTH), dtype=np.int32)
    train_attention_masks = np.zeros((train_n, MAX_LENGTH), dtype=np.int32)
    val_input_ids = np.zeros((val_n, MAX_LENGTH, encoding_size), dtype=np.float32)
    val_target_ids = np.zeros((val_n, MAX_LENGTH), dtype=np.int32)
    val_attention_masks = np.zeros((val_n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(train_pairs):
        inp_encoded = encodeString(inp, tokenizer, encoder)[0][0]
        tgt_encoded = tokenizer.encode_plus(tgt,
                                            max_length=20,
                                            pad_to_max_length=True,
                                            return_attention_mask=True,
                                            return_tensors="pt")
        tgt_tokenized = tgt_encoded['input_ids'][0]
#         tgt_masked = tgt_encoded['attention_mask'][0]

        targ_in = tgt_tokenized[:-1]
        targ_out = tgt_tokenized[1:]

        train_input_ids[idx, :len(inp_encoded)] = inp_encoded
        train_target_ids[idx, :len(targ_in)] = targ_in
        train_attention_masks[idx, :len(targ_out)] = targ_out
        
    for idx, (inp, tgt) in enumerate(val_pairs):
        inp_encoded = encodeString(inp, tokenizer, encoder)[0][0]
        tgt_encoded = tokenizer.encode_plus(tgt,
                                            max_length=20,
                                            pad_to_max_length=True,
                                            return_attention_mask=True,
                                            return_tensors="pt")
        tgt_tokenized = tgt_encoded['input_ids'][0]
#         tgt_masked = tgt_encoded['attention_mask'][0]

        targ_in = tgt_tokenized[:-1]
        targ_out = tgt_tokenized[1:]
        
        val_input_ids[idx, :len(inp_encoded)] = inp_encoded
        val_target_ids[idx, :len(targ_in)] = targ_in
        val_attention_masks[idx, :len(targ_out)] = targ_out

    train_data = TensorDataset(torch.FloatTensor(train_input_ids).to(device),
                               torch.LongTensor(train_target_ids).to(device),
                               torch.LongTensor(train_attention_masks).to(device))
    val_data = TensorDataset(torch.FloatTensor(val_input_ids).to(device),
                             torch.LongTensor(val_target_ids).to(device),
                             torch.LongTensor(val_attention_masks).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    val_sampler = RandomSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
    
    return train_dataloader, val_dataloader

In [18]:
train_ds, val_ds = get_dataloader(tokenizer, bert_encoder, last_hidden_states.shape[-1], 1)

Reading lines...




In [19]:
for data in train_ds:
    input_tensor, target_tensor, attention_mask = data
    print(target_tensor)
    break

tensor([[ 101, 1013, 1057, 9617, 8663, 2850, 1005, 2485, 1005,  102,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]], device='cuda:0')


# Create Model

### Positional Encoding

In [20]:
def positional_encoding(length, depth):
    depth = depth/2

    positions = torch.unsqueeze(torch.arange(length), 1)
    depths = torch.unsqueeze(torch.arange(depth), 0)/depth
#     positions = torch.arange(length)[:, np.newaxis]     # (seq, 1)
#     depths = torch.arange(depth)[np.newaxis, :]/depth   # (1, depth)

    angle_rates = 1 / (10000**depths)         # (1, depth)
    angle_rads = positions * angle_rates      # (pos, depth)

    pos_encoding = torch.concatenate(
      [torch.sin(angle_rads), torch.cos(angle_rads)],
      axis=-1) 

    return pos_encoding.to(device, dtype=torch.float32)

In [21]:
class PositionalEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        # The positional encoding is used to introduce sequence to a sentence by causing words near 
        # eachother to have similar vectors
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def forward(self, x):
        length = np.shape(x)[1]
        x = self.embedding(x)
        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= math.sqrt(torch.tensor(self.d_model).type(torch.float32))
        x = x + torch.unsqueeze(self.pos_encoding, 0)[:, :length]
#         x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

In [22]:
sample_positional_embedding = PositionalEmbedding(vocab_size=len(tokenizer.get_vocab()),
                                                  d_model=768).to(device)

In [23]:
print(summary(sample_positional_embedding, tgt_encoded, show_input=True))

NameError: name 'tgt_encoded' is not defined

### Attention

In [24]:
class BaseAttention(nn.Module):
    def __init__(self, d_model, **kwargs):
        super().__init__()
        self.num_heads = kwargs.get('num_heads')
        self.mha = nn.MultiheadAttention(**kwargs)
        self.layernorm = nn.LayerNorm(d_model)

In [25]:
class CrossAttention(BaseAttention):
    def forward(self, x, context):
        x_ = x.permute(1, 0, 2)
        context_ = context.permute(1, 0, 2)
        attn_output, attn_scores = self.mha(
            query=x_,
            key=context_,
            value=context_,
            need_weights=True)
        attn_output = attn_output.permute(1, 0, 2)
        attn_scores = attn_scores.permute(1, 0, 2)

        # Cache the attention scores for plotting later.
        self.last_attn_scores = attn_scores

        x =x + attn_output
        x = self.layernorm(x)

        return x
    
sample_ca = CrossAttention(d_model=256, embed_dim=128, num_heads=2, kdim=256)

In [26]:
class CausalSelfAttention(BaseAttention):
    def forward(self, x):
        x_ = x.permute(1, 0, 2)
        attention_mask = nn.Transformer.generate_square_subsequent_mask(x_.shape[0]).to(device)
        attention_mask = attention_mask.expand(x_.shape[1]*self.num_heads, -1, -1).to(device)
        
        attn_output = self.mha(
            query=x_,
            value=x_,
            key=x_,
            attn_mask=attention_mask,
            is_causal=True)[0]
        attn_output = attn_output.permute(1, 0, 2)
        x = x + attn_output
        x = self.layernorm(x)
        return x
    
sample_csa = CausalSelfAttention(d_model=256, embed_dim=128, 
                                 num_heads=2, kdim=256)

### Decoder

In [27]:
class FeedForward(nn.Module):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(d_model, dff),
            nn.ReLU(),
            nn.Linear(dff, d_model),
            nn.Dropout(dropout_rate)
        ).to(device)
        self.layer_norm = nn.LayerNorm(d_model).to(device)
        
    def forward(self, x):
        x = x + self.seq(x)
        x = self.layer_norm(x)
        return x
    
sample_ffn = FeedForward(28, 512)

In [28]:
class DecoderLayer(nn.Module):
    def __init__(self,
                   *,
                   d_model,
                   num_heads,
                   dff,
                   dropout_rate=0.1):
        super(DecoderLayer, self).__init__()

        self.causal_self_attention = CausalSelfAttention(
            d_model=d_model,
            embed_dim=d_model,
            num_heads=num_heads,
            kdim=d_model,
            dropout=dropout_rate).to(device)
        
        self.cross_attention = CrossAttention(
            d_model=d_model,
            embed_dim=d_model,
            num_heads=num_heads,
            kdim=d_model,
            dropout=dropout_rate).to(device)

        self.ffn = FeedForward(d_model, dff)

    def forward(self, x, context):
        x = self.causal_self_attention(x=x)
        x = self.cross_attention(x=x, context=context)

        # Cache the last attention scores for plotting later
        self.last_attn_scores = self.cross_attention.last_attn_scores

        x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
        return x

In [29]:
class Decoder(nn.Module):
    def __init__(self, *, emb_size, num_layers, d_model, num_heads, dff, vocab_size,
                   dropout_rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.linear = nn.Linear(emb_size, d_model)
        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                                 d_model=d_model).to("cuda")
        self.dropout = nn.Dropout(dropout_rate)
        self.dec_layers = [
            DecoderLayer(d_model=d_model, num_heads=num_heads,
                         dff=dff, dropout_rate=dropout_rate)
            for _ in range(num_layers)]
        self.dec_layers = nn.ModuleList(self.dec_layers)

        self.last_attn_scores = None

    def forward(self, x, context):
        # `x` is token-IDs shape (batch, target_seq_len)
#         context = self.linear(context)
        x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

        x = self.dropout(x)

        for i in range(self.num_layers):
            x  = self.dec_layers[i](x, context)

        self.last_attn_scores = self.dec_layers[-1].last_attn_scores

        # The shape of x is (batch_size, target_seq_len, d_model).
        return x

In [30]:
sample_decoder = Decoder(emb_size=768,
                         num_layers=2,
                         d_model=768,
                         num_heads=8,
                         dff=2048,
                         vocab_size=len(tokenizer.get_vocab())).to(device)

encoded = encodeString("hello world", tokenizer, bert_encoder)[0].to(device)
tgt_encoded = tokenizer.encode_plus("How's it going robot",
                                    return_tensors="pt")['input_ids'].to(device)
decoder_out = sample_decoder(tgt_encoded, encoded)

In [31]:
print(summary(sample_decoder, tgt_encoded, encoded, show_input=True))

---------------------------------------------------------------------------------------
            Layer (type)                   Input Shape         Param #     Tr. Param #
   PositionalEmbedding-1                        [1, 8]      23,440,896      23,440,896
               Dropout-2                   [1, 8, 768]               0               0
          DecoderLayer-3     [1, 8, 768], [1, 20, 768]       7,877,888       7,877,888
          DecoderLayer-4     [1, 8, 768], [1, 20, 768]       7,877,888       7,877,888
Total params: 39,196,672
Trainable params: 39,196,672
Non-trainable params: 0
---------------------------------------------------------------------------------------


### Transformer

In [32]:
class Transformer(nn.Module):
    def __init__(self, *, emb_size, num_layers, d_model, num_heads, 
                 dff, vocab_size, dropout_rate=0.1):
        super().__init__()
        self.decoder = Decoder(emb_size=emb_size, num_layers=num_layers, d_model=d_model,
                               num_heads=num_heads, dff=dff, vocab_size=vocab_size,
                               dropout_rate=dropout_rate)

        self.final_layer = nn.Linear(d_model, vocab_size)

    def forward(self, context, x):
        # To use a Keras model with `.fit` you must pass all your inputs in the
        # first argument.
        
        x = self.decoder(x, context)  # (batch_size, target_len, d_model)

        # Final linear layer output.
        logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

        # Return the final output and the attention weights.
        return logits

In [33]:
num_layers = 5
emb_size = 768
d_model = 768
dff = 128
num_heads = 8
dropout_rate = 0.5

transformer = Transformer(
    emb_size=emb_size,
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    vocab_size=len(tokenizer.get_vocab()),
    dropout_rate=dropout_rate).to(device)

In [34]:
print(summary(transformer, encodeString("hello there", tokenizer, bert_encoder)[0].to(device), 
                           torch.zeros((1, 20), dtype=torch.int32).to(device), show_input=True))

-----------------------------------------------------------------------------
      Layer (type)               Input Shape         Param #     Tr. Param #
         Decoder-1     [1, 20], [1, 20, 768]      48,665,728      48,665,728
          Linear-2              [1, 20, 768]      23,471,418      23,471,418
Total params: 72,137,146
Trainable params: 72,137,146
Non-trainable params: 0
-----------------------------------------------------------------------------


# Train Model

In [35]:
class CustomSchedule(object):
    def __init__(self, optimizer, d_model, warmup_steps=4000):
        super().__init__()

        self.optimizer = optimizer
        self.d_model = float(d_model)

        self.warmup_steps = warmup_steps
        
        self.iters = 0.0

    def step(self):
        self.iters += 1.0
        arg1 = 1 / math.sqrt(self.iters)
        arg2 = self.iters * (self.warmup_steps ** -1.5)
        
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = 1 / math.sqrt(self.d_model) * min(arg1, arg2)

In [36]:
def masked_loss(label, pred):
    #Assuming padding index is 0, adjust if necessary
    mask = label != 0

    # Calculate CrossEntropyLoss directly without one-hot encoding
    loss_object = nn.CrossEntropyLoss(ignore_index=0)
    
    # Flatten the prediction tensor and the label tensor along the sequence dimension
    pred_flat = pred.view(-1, pred.size(-1))
    label_flat = label.view(-1)
    
    # Apply the mask to both the prediction and label tensors
    pred_masked = pred_flat[mask.view(-1)]
    label_masked = label_flat[mask.view(-1)]
    
    # Calculate the cross-entropy loss
    loss = loss_object(pred_masked, label_masked)
    return loss


def masked_accuracy(label, pred):
    pred = torch.argmax(pred, axis=2)
    label = label.to(pred.dtype)
    match = label == pred

    mask = label != 0

    match = match & mask

    match = match.to(torch.float32)
    mask = mask.to(torch.float32)
    return torch.sum(match)/torch.sum(mask)

In [37]:
context = encodeString("hello world", tokenizer, bert_encoder)[0].to(device)
x = torch.tensor([[1, 2, 3, 4]]).to(device)
target_tensor_out = torch.tensor([[1, 2, 3, 4]]).to(device)

out = transformer(context, x)

loss = masked_loss(target_tensor_out, out)
loss

tensor(10.6722, device='cuda:0', grad_fn=<NllLossBackward0>)

In [38]:
def train_epoch(dataloader, trasformer, optimizer, scheduler, criterion, train=True):

    total_loss = 0
    for data in dataloader:
#         if torch.randint(low=0, high=5, size=()) == 0:
#             rand_batch = torch.randint(len(data[0]), ())
#             rand_word = torch.randint(len(data[0][0]), ())
#             data[0][rand_batch][rand_word] = OOV_token
        input_tensor, target_tensor_in, target_tensor_out = data

        optimizer.zero_grad()
        
        logits = transformer(input_tensor, target_tensor_in)
        loss = masked_loss(target_tensor_out, logits)
        
        if train:
            loss.backward()

            optimizer.step()
            scheduler.step()
    
        total_loss += loss.item()

    return total_loss / len(dataloader)

In [39]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [40]:
plot_train_losses = []

def train(train_dataloader, val_dataloader, transformer, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    global plot_train_losses
    global d_model
    print_train_loss_total = 0  # Reset every print_every
    plot_train_loss_total = 0  # Reset every plot_every
    
    print_val_loss_total = 0  # Reset every print_every

    optimizer = optim.Adam(transformer.parameters(), lr=learning_rate, 
                           betas=(0.9, 0.98), eps=1e-9)
    scheduler = CustomSchedule(optimizer, d_model, warmup_steps=6000)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        train_loss = train_epoch(train_dataloader, transformer, optimizer, scheduler, criterion)
        print_train_loss_total += train_loss
        plot_train_loss_total += train_loss
        
        # Evaluate validation dataloader
        val_loss = train_epoch(val_dataloader, transformer, optimizer, scheduler, criterion, train=False)
        print_val_loss_total += val_loss

        if epoch % print_every == 0:
            print_train_loss_avg = print_train_loss_total / print_every
            print_train_loss_total = 0
            print_val_loss_avg = print_val_loss_total / print_every
            print_val_loss_total = 0
            print('%s (%d %d%%) %.4f %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_train_loss_avg, print_val_loss_avg
                                             ))

        if epoch % plot_every == 0:
            plot_train_loss_avg = plot_train_loss_total / plot_every
            plot_train_losses.append(plot_train_loss_avg)
            plot_train_loss_total = 0

    showPlot(plot_train_losses)

In [41]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [49]:
batch_size = 32

train_dataloader, val_dataloader = get_dataloader(tokenizer, bert_encoder, 768, batch_size)

train(train_dataloader, val_dataloader, transformer, 200, learning_rate=1e-3, print_every=5, plot_every=5)

Reading lines...
0m 1s (- 0m 46s) (5 2%) 2.6297 4.5404
0m 2s (- 0m 45s) (10 5%) 1.5081 3.3736
0m 3s (- 0m 43s) (15 7%) 0.6422 2.8761
0m 4s (- 0m 42s) (20 10%) 0.3646 2.6548
0m 5s (- 0m 41s) (25 12%) 0.2045 2.6262
0m 6s (- 0m 39s) (30 15%) 0.1800 2.5209
0m 8s (- 0m 38s) (35 17%) 0.1094 2.6591
0m 9s (- 0m 37s) (40 20%) 0.0976 2.3830
0m 10s (- 0m 36s) (45 22%) 0.0721 2.7252
0m 11s (- 0m 35s) (50 25%) 0.0643 2.6067
0m 12s (- 0m 34s) (55 27%) 0.0584 2.4533
0m 14s (- 0m 33s) (60 30%) 0.0561 2.9603
0m 15s (- 0m 31s) (65 32%) 0.0591 2.7089
0m 16s (- 0m 30s) (70 35%) 0.0522 2.7283
0m 17s (- 0m 29s) (75 37%) 0.0817 2.5434
0m 19s (- 0m 28s) (80 40%) 0.0461 2.4571
0m 20s (- 0m 27s) (85 42%) 0.0453 2.7756
0m 21s (- 0m 26s) (90 45%) 0.0562 2.6533
0m 23s (- 0m 25s) (95 47%) 0.0416 2.8827
0m 24s (- 0m 24s) (100 50%) 0.0386 2.7423
0m 25s (- 0m 23s) (105 52%) 0.0579 2.7289
0m 27s (- 0m 22s) (110 55%) 0.0591 2.6370
0m 28s (- 0m 21s) (115 57%) 0.0681 2.7780
0m 29s (- 0m 19s) (120 60%) 0.0489 2.6655
0m 31s

In [50]:
for input, target_in, target_out in train_dataloader:
    index = -2
    input = torch.unsqueeze(input[0], 0)
    target_in = torch.unsqueeze(target_in[0][:index], 0)
    target_out = torch.unsqueeze(target_out[0][:index], 0)
    print(target_in)
    print(target_out)
    out = transformer(input, target_in)
    loss = masked_loss(target_out, out)
    print(loss)
    break

tensor([[  101,  1013,  1057, 15536,  3211,  1005,  2073,  2001,  1996,  2942,
          2162,  1005,   102,     0,     0,     0,     0,     0]],
       device='cuda:0')
tensor([[ 1013,  1057, 15536,  3211,  1005,  2073,  2001,  1996,  2942,  2162,
          1005,   102,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward0>)


In [51]:
def sentenceFromIndexes(encoded):
    words = []
    for word in encoded:
        words.append(list(tokenizer.get_vocab().keys())[word])
        
    return words

In [52]:
class Chatbot():
    def __init__(self, transformer):
        self.transformer = transformer

    def __call__(self, sentence):
#         if len(sentence.shape) == 0:
#             sentence = torch.unsqueeze(sentence, 1)

        sentence, attention_mask = encodeString(sentence, tokenizer, bert_encoder)
        encoder_input = sentence

        output_array = torch.tensor([[SOS_token]])
        
        with torch.no_grad():
            for i in range(MAX_LENGTH):
                output = torch.unsqueeze(torch.flatten(output_array), 0)
                predictions = self.transformer(encoder_input.to(device),
                                                output.to(device))
                predictions = predictions[:, -1:, :]
                
                predicted_id = torch.argmax(predictions, -1)
                
                output_array = torch.cat((output_array.to(device),
                                          torch.unsqueeze(predicted_id[0], 0)), 0)
                
                if predicted_id[0] == torch.tensor([EOS_token]).to(device):
                    break

        output = torch.unsqueeze(torch.flatten(output_array), 0)
        
        tokens = sentenceFromIndexes(output[0].tolist())
        text = ' '.join(tokens)

        with torch.no_grad():
            self.transformer(encoder_input.to(device), 
                              output[:,:-1].to(device))
            attention_weights = self.transformer.decoder.last_attn_scores

        return text, attention_weights.to(device)

In [53]:
chatbot = Chatbot(transformer)

In [54]:
def print_translation(sentence, tokens):
    print(f'{"Input:":15s}: {sentence}')
    print(f'\n{"Prediction":15s}: {tokens}')

In [56]:
sentence = "What's the temperature"

translated_text, attention_weights = chatbot(sentence)
print_translation(sentence, translated_text)

Input:         : What's the temperature

Prediction     : [CLS] today is / u date [SEP]
