In [1]:
import copy
import math
import time
import timeit

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

import text_data
from model import TransformerModel

# Data processing and model construction

In [2]:
# Load the corpus stored under "data/wikitext" (path is relative to the
# working directory). Later cells read its `dictionary` and its
# `train` / `valid` / `test` attributes.
corpus = text_data.Corpus("data/wikitext")

In [3]:
# Prefer the GPU when CUDA is usable and fall back to the CPU otherwise.
# `device` is what later cells use to place the model and the attention masks.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")

In [4]:
# --- Vocabulary and batching -------------------------------------------------
ntokens = len(corpus.dictionary)  # number of entries in the corpus dictionary
seq_length = 35                   # tokens per sample; also the default attention-mask size
tr_batch_size = 20                # samples per training batch
val_batch_size = 10               # samples per validation / test batch
epochs = 1                        # NOTE: the training cell reassigns `epochs` before looping

# --- Transformer hyperparameters ---------------------------------------------
emsize = 200   # embedding dimension
nhid = 200     # width of the feed-forward network inside nn.TransformerEncoder
nlayers = 2    # how many nn.TransformerEncoderLayer blocks are stacked
nhead = 2      # attention heads in each multi-head attention module
dropout = 0.2  # dropout probability

# Build the network and place it on the selected device.
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

In [5]:
# Wrap every corpus split in a TextDataset with the same windowing options
# (receptive_fields=1, sample_size=seq_length).
tr_data, val_data, test_data = (
    text_data.TextDataset(split, receptive_fields=1, sample_size=seq_length)
    for split in (corpus.train, corpus.valid, corpus.test)
)

In [6]:
# Batch each split in its stored order (shuffle is off everywhere). The test
# loader reuses the validation batch size.
train_loader, val_loader, test_loader = (
    torch.utils.data.DataLoader(dataset, batch_size=size, shuffle=False)
    for dataset, size in (
        (tr_data, tr_batch_size),
        (val_data, val_batch_size),
        (test_data, val_batch_size),
    )
)

In [7]:
# Optimisation setup: cross-entropy objective, plain SGD, and a StepLR
# schedule with step size 1 and decay factor 0.95 (advanced by calling
# scheduler.step()).
lr = 5.0  # initial learning rate
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.95)

import time
def train(log_interval=200):
    """Run one pass over ``train_loader``, updating ``model`` in place.

    Relies on notebook-level globals: ``model``, ``train_loader``,
    ``optimizer``, ``scheduler``, ``criterion``, ``ntokens``, ``seq_length``,
    ``tr_batch_size``, ``device`` and the current ``epoch`` (the latter is only
    read for the progress line).

    Args:
        log_interval: Print a progress line (learning rate, ms/batch, mean
            loss and perplexity of the window) every ``log_interval`` batches.

    Returns:
        None. Progress is reported on stdout.
    """
    model.train()  # Turn on the train mode
    window_loss = 0.
    window_batches = 0
    window_start = time.time()
    src_mask = model.generate_square_subsequent_mask(seq_length).to(device)

    for batch, (data, targets) in enumerate(train_loader):
        # The model and the mask live on `device`, so the batch must too
        # (a no-op when the tensors are already there). After the transpose,
        # dim 0 is the axis whose length the mask has to match.
        data = data.to(device).transpose(0, 1).contiguous()
        targets = targets.to(device).transpose(0, 1).contiguous().view(-1)

        # Compare with the mask currently held (not with seq_length) so the
        # mask is also rebuilt when the length returns to normal after an
        # odd-sized batch.
        if data.size(0) != src_mask.size(0):
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)

        optimizer.zero_grad()
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        window_loss += loss.item()
        window_batches += 1
        if batch % log_interval == 0 and batch > 0:
            # Average over the batches actually accumulated: the first window
            # holds log_interval + 1 batches because batch 0 never logs.
            cur_loss = window_loss / window_batches
            elapsed = time.time() - window_start
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_loader.dataset) // tr_batch_size,
                    # get_last_lr() reports the rate in effect; get_lr() is
                    # only meant to be called from inside scheduler.step().
                    scheduler.get_last_lr()[0],
                    elapsed * 1000 / window_batches,
                    cur_loss, math.exp(cur_loss)))
            window_loss = 0.
            window_batches = 0
            window_start = time.time()

def evaluate(eval_model, data_source):
    """Return the mean per-token loss of ``eval_model`` over ``data_source``.

    Relies on notebook-level globals ``criterion``, ``ntokens``,
    ``seq_length`` and ``device``.

    Args:
        eval_model: Model to score. It must provide
            ``generate_square_subsequent_mask`` and accept ``(data, mask)``.
        data_source: Iterable yielding ``(data, targets)`` batches, e.g. one of
            the notebook's DataLoaders.

    Returns:
        The loss averaged over every target token seen, as a float, so that
        ``math.exp`` of it is a perplexity. ``nan`` if ``data_source`` yields
        no tokens.
    """
    eval_model.eval()  # Turn on the evaluation mode
    total_loss = 0.
    total_tokens = 0
    # Build the mask from the model being evaluated, not the global `model`.
    src_mask = eval_model.generate_square_subsequent_mask(seq_length).to(device)
    with torch.no_grad():
        for data, targets in data_source:
            # Keep the batch on the same device as the mask (no-op if it
            # already is); after the transpose dim 0 must match the mask size.
            data = data.to(device).transpose(0, 1).contiguous()
            targets = targets.to(device).transpose(0, 1).contiguous().view(-1)
            if data.size(0) != src_mask.size(0):
                src_mask = eval_model.generate_square_subsequent_mask(data.size(0)).to(device)
            output_flat = eval_model(data, src_mask).view(-1, ntokens)
            # `criterion` returns the batch mean, so weight it by the number
            # of target tokens and normalise by the total token count. The
            # previous seq-length weight over (sample count - 1) did not
            # cancel out and inflated the reported loss.
            batch_tokens = targets.numel()
            total_loss += batch_tokens * criterion(output_flat, targets).item()
            total_tokens += batch_tokens
    if total_tokens == 0:
        return float("nan")
    return total_loss / total_tokens

# Training

In [9]:
best_val_loss = float("inf")
epochs = 4 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()  # reads the global `epoch` for its progress line
    val_loss = evaluate(model, val_loader)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Snapshot the weights. `model` keeps being trained in later epochs,
        # so a plain reference would always end up equal to the final model
        # rather than the best one.
        best_model = copy.deepcopy(model)

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()



| epoch   1 |   200/ 2983 batches | lr 5.00 | ms/batch 44.32 | loss  8.97 | ppl  7841.95
| epoch   1 |   400/ 2983 batches | lr 5.00 | ms/batch 42.90 | loss  7.67 | ppl  2151.15
| epoch   1 |   600/ 2983 batches | lr 5.00 | ms/batch 43.39 | loss  7.42 | ppl  1661.47
| epoch   1 |   800/ 2983 batches | lr 5.00 | ms/batch 42.96 | loss  7.01 | ppl  1106.95
| epoch   1 |  1000/ 2983 batches | lr 5.00 | ms/batch 42.93 | loss  6.83 | ppl   924.85
| epoch   1 |  1200/ 2983 batches | lr 5.00 | ms/batch 42.98 | loss  6.75 | ppl   855.23
| epoch   1 |  1400/ 2983 batches | lr 5.00 | ms/batch 42.97 | loss  6.61 | ppl   744.91
| epoch   1 |  1600/ 2983 batches | lr 5.00 | ms/batch 43.03 | loss  6.56 | ppl   704.91
| epoch   1 |  1800/ 2983 batches | lr 5.00 | ms/batch 43.36 | loss  6.55 | ppl   696.27
| epoch   1 |  2000/ 2983 batches | lr 5.00 | ms/batch 43.38 | loss  6.53 | ppl   683.97
| epoch   1 |  2200/ 2983 batches | lr 5.00 | ms/batch 46.80 | loss  6.47 | ppl   644.07
| epoch   1 |  2400/ 

# Evaluating

In [11]:
# Take just the first validation batch for inspection. next(iter(...)) avoids
# the loop-and-break, leaves no stray `batch` / `b` variables behind, and
# raises immediately if the loader is empty.
data, targets = next(iter(val_loader))
# Same layout as in train() / evaluate(): swap the first two axes and flatten
# the targets.
data = data.transpose(0, 1).contiguous()
targets = targets.transpose(0, 1).contiguous().view(-1)

In [12]:
# Index-to-word lookup; the cells below index it with token ids to turn
# tensors back into readable text.
decoder = corpus.dictionary.idx2word

In [13]:
print("Input:")
# Column 0 of the transposed batch, decoded token by token.
' '.join(decoder[token_id] for token_id in data[:, 0].tolist())

Input:


'<eos> = Homarus gammarus = <eos> <eos> Homarus gammarus , known as the European lobster or common lobster , is a species of clawed lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts'

In [15]:
print("Target:")
# `targets` was flattened from a (seq, batch) layout. Restore it with an
# inferred batch dimension so this also works when the first batch holds
# fewer than val_batch_size samples.
' '.join(decoder[token_id] for token_id in targets.view(seq_length, -1)[:, 0].tolist())

Target:


'= Homarus gammarus = <eos> <eos> Homarus gammarus , known as the European lobster or common lobster , is a species of clawed lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of'

In [28]:
i = 1  # which seq_length-sized chunk of the test split serves as the seed
# Use the shared `device` rather than a hard-coded .cuda(), so this cell also
# runs on CPU-only machines and always matches where the model lives.
sentence = corpus.test[i*seq_length:(i+1)*seq_length].unsqueeze(0).to(device)
# Swap the axes so the seed has the layout the model is fed elsewhere.
generated = sentence.transpose(0, 1)

In [29]:
print("Generating text with seed:")
# `generated` holds a single column; decode it token by token.
' '.join(decoder[token_id] for token_id in generated[:, 0].tolist())

Generating text with seed:


'This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television'

In [30]:
num_new_tokens = 30  # how many tokens to append to the seed

model.eval()
# Restart from the seed so that re-running this cell does not keep extending
# the previous result.
generated = sentence.transpose(0, 1)
# The model only ever sees the most recent `seq_length` tokens. A dedicated
# name is used so the `data` batch from the inspection cells is not clobbered.
context = generated[-seq_length:]
# Inference only: no_grad avoids building an autograd graph at every step.
with torch.no_grad():
    for _ in range(num_new_tokens):
        src_mask = model.generate_square_subsequent_mask(context.size(0)).to(device)
        output = model(context, src_mask)
        # Greedy decoding: pick the highest-scoring token at the last
        # position. Softmax is monotonic, so argmax on the raw scores selects
        # the same token without the extra pass.
        next_token = output.view(-1, ntokens)[-1].argmax()
        generated = torch.cat((generated, next_token.view(1, 1)), dim=0)
        context = generated[-seq_length:]

In [31]:
print("Generated text:")
# Seed plus the appended tokens, decoded from the single column of `generated`.
' '.join(decoder[token_id] for token_id in generated[:, 0].tolist())

Generated text:


"This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television starling 's common starling 's common starling 's common starling 's common starling 's common starling 's common starling 's common starling is a common starling 's common starling is"