In [1]:
from IPython.core.debugger import set_trace
from torchtext.datasets import WikiText2
import spacy
import re
import html
from torchtext import data
from spacy.symbols import ORTH
import torch
import torch.nn as nn
import torch.nn.functional as V
import torch.optim as optim

In [2]:
class LanguageModel(nn.Module):
    def __init__(self, ntoken, ninp,
                 nhid, nlayers, bsz,
                 dropout=0.5):
        super(LanguageModel, self).__init__()
        self.nhid, self.nlayers, self.bsz = nhid, nlayers, bsz
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid,ntoken)

        self.init_weights()
        self.hidden = self.init_hidden(bsz)
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)
 
    def forward(self, input):
        emb = self.drop(self.encoder(input))
        output, self.hidden = self.rnn(emb, self.hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        decoded_viewed = decoded.view(output.size(0), output.size(1), decoded.size(1))
        return decoded_viewed
 
    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return (torch.tensor(weight.new(self.nlayers, bsz, self.nhid).zero_().cuda()),
                torch.tensor(weight.new(self.nlayers, bsz, self.nhid).zero_()).cuda())
  
    def reset_history(self):
        self.hidden = tuple(torch.tensor(v.data) for v in self.hidden)

In [3]:
from tqdm import tqdm_notebook as tqdm

def validate_model(model, valid_iter, criterion, n_tokens, use_tqdm = False):
    val_loss = 0
    if use_tqdm:
        valid_iter = tqdm(valid_iter)
    for batch in valid_iter:
        model.reset_history()
        
        text, targets = batch.text, batch.target
        
        prediction = model(text)
        loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
        
        batch_loss = loss.item() * text.size(0) / len(valid.examples[0].text)
        val_loss += batch_loss
    return val_loss
    
def train_model(model, train_iter, criterion, optimizer, n_tokens, use_tqdm = False):
    epoch_loss = 0
    if use_tqdm:
        train_iter = tqdm(train_iter)
    for batch in train_iter:
        model.reset_history()
        optimizer.zero_grad()
        
        text, targets = batch.text, batch.target
        
        prediction = model(text)
        loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
        loss.backward()
        optimizer.step()
        
        batch_loss = loss.item() * prediction.size(0) * prediction.size(1) / len(train.examples[0].text)
        epoch_loss += batch_loss
    return epoch_loss

In [4]:
import math
from tqdm import tqdm_notebook as tqdm

def perplexity(text, prob_of):
    total_prob_log = 1
    index = 0
    for word in text:
        total_prob_log += math.log(prob_of(word, index))
        index += 1
    total_prob = math.exp(total_prob_log)
    try:
        return math.pow(total_prob, -1/len(text))
    except:
        return math.inf

def single_batch(words, length):
    return [[word] for word in words[:length]]

def prob_of_model(model, text, word, index):
    result_all = model(torch.tensor(single_batch(text, index + 1)).cuda())
    result_values = result_all[result_all.shape[0]-1,0,:];
    result_softmax = nn.Softmax(0)(result_values)
    result = result_softmax[word].item()
    return result
    
def model_perplexity(text, target, model):
    model.reset_history()
    model.hidden = model.init_hidden(1)
    prob_of = lambda word, index: prob_of_model(model, text, word, index)
    return perplexity(target, prob_of)

def model_perplexity_batch(model, test_iter, use_tqdm = False):
    if use_tqdm:
        test_iter = tqdm(test_iter)
    total_perplexity = 0
    counts = 0
    for batch in test_iter:
        model.reset_history()
        
        texts, targets = batch.text, batch.target
        results = model(texts)
        for i in range(model.bsz): #32, second index in results
            text = texts[:,i]
            target = targets[:,i]
            result = nn.Softmax(1)(results[:,i,:])
            prob_of = lambda word, index: result[index][word]
            
            text_perplexity = perplexity(target, prob_of)
            total_perplexity += text_perplexity
            counts += 1
            
    
    return total_perplexity / counts

In [5]:
def do_training(epochs, use_tqdm = False):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr = 1e-3, betas=(0.9,0.999))
    n_tokens = weight_matrix.size(0)
    print("Running...")
    for epoch in range(epochs):
        #reset size to work with testing batch sizes
        model.hidden = model.init_hidden(32)
        
        print(" Epoch {}/{}".format(epoch+1, epochs))
        
        print("   Running pre-validation")
        pre_val_loss = validate_model(model, valid_iter, criterion, n_tokens, use_tqdm)
        print("   Training")
        train_loss = train_model(model, train_iter, criterion, optimizer, n_tokens, use_tqdm)
        print("   Running post-validation")
        post_val_loss = validate_model(model, valid_iter, criterion, n_tokens, use_tqdm)
        print("   Calculating perplexity")
        perplexity = model_perplexity_batch(model, test_iter, use_tqdm)
        
        print("  Results {}/{}: Training Loss: {:.4f}, Validation Loss: {:.4f} -> {:.4f}, Perplexity: {:.4f}".format(epoch+1, epochs, train_loss, pre_val_loss, post_val_loss, perplexity))
    print("Complete")

In [6]:
spacy_en = spacy.load('en')

def tokenizer(x):
    return [tok.text for tok in spacy_en.tokenizer(x)]

TEXT = data.Field(lower=True, tokenize = tokenizer)

train, valid, test = WikiText2.splits(TEXT)

TEXT.build_vocab(train, vectors = "fasttext.en.300d")

train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=32,
    bptt_len=30, # this is where we specify the sequence length
    device = "cuda",
    repeat=False)

weight_matrix = TEXT.vocab.vectors
model = LanguageModel(weight_matrix.size(0),
weight_matrix.size(1), 200, 1, 32)
model.encoder.weight.data.copy_(weight_matrix)
model.cuda()

  "num_layers={}".format(dropout, num_layers))


LanguageModel(
  (drop): Dropout(p=0.5)
  (encoder): Embedding(28870, 300)
  (rnn): LSTM(300, 200, dropout=0.5)
  (decoder): Linear(in_features=200, out_features=28870, bias=True)
)

In [7]:
do_training(5)

Running...
 Epoch 1/5
   Running pre-validation
   Training
   Running post-validation
   Calculating perplexity
  Results 1/5: Training Loss: 5.8237, Validation Loss: 0.3211 -> 0.1577, Perplexity: 198.0116
 Epoch 2/5
   Running pre-validation
   Training
   Running post-validation
   Calculating perplexity
  Results 2/5: Training Loss: 5.2071, Validation Loss: 0.1577 -> 0.1513, Perplexity: 161.4524
 Epoch 3/5
   Running pre-validation
   Training
   Running post-validation
   Calculating perplexity
  Results 3/5: Training Loss: 4.9659, Validation Loss: 0.1514 -> 0.1482, Perplexity: 146.9086
 Epoch 4/5
   Running pre-validation
   Training
   Running post-validation
   Calculating perplexity
  Results 4/5: Training Loss: 4.8137, Validation Loss: 0.1482 -> 0.1467, Perplexity: 140.6538
 Epoch 5/5
   Running pre-validation
   Training
   Running post-validation
   Calculating perplexity
  Results 5/5: Training Loss: 4.7072, Validation Loss: 0.1465 -> 0.1459, Perplexity: 137.0490
Complete
