In [None]:
from IPython.core.debugger import set_trace
from torchtext.datasets import WikiText2, IMDB
import spacy
import re
import html
from torchtext import data, datasets
from spacy.symbols import ORTH
import torch
import torch.nn as nn
import torch.nn.functional as V
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateaui
import matplotlib.pyplot as plt

We first need segment our data into seperate words and puncuation. We use spacys english tokenizer to process our data into tokens and to throw away the miscellaneous and irelevant parts of our dataset.

In [2]:
spacy_en  = spacy.load('en')
def tokenizer(x):
    return [tok.text for tok in spacy_en.tokenizer(x)]

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
TEXT = data.Field(lower=True, tokenize = tokenizer)

Next we create our vocabulary from our tokens. This step creates a mapping from tokens to integers and a mapping from integers back to tokens, we also include a max vocabular size as words that aren't in the 60,000 most frequently occuring probably won't have much of an impact or enough training data, and best treated as unknown. We use the 300 dimensional fasttext vectors to represent our tokens. These word vectors were released in 2017 by facebook AI research. The word representations are learned by taking into account subword information and incorporates character n-grams into the skipgram model.

In [4]:
train, valid, test = WikiText2.splits(TEXT)
TEXT.build_vocab(train, vectors = "fasttext.en.300d")

Now we have to create a dataloader for our datasets. This dataloader will create batch each time we need one. When we specify that batch size we divide our training set, validation set and test set, into 64 different pieces of equivalent length. Backpropagation through time is specifys how long of sequences we will look at at a time. So when we specify 64 we specify that we won't consider tokens more than 64 words back when making a decision about what the next word most likely is.

In [5]:
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=16,
    bptt_len=30, # this is where we specify the sequence length
    device = "cuda",
    repeat=False)

Below we create our our language model.  To begin the model encodes the tokens based on the fasttext word vectors. The main part of the model is the LSTM. The LSTM is a type of recurrent neural network. A reccurent neural network sequentially processes the sentence. At each step it takes in a new input word, encoded from the words vectors, and the previous hidden state. It reuses the same weights at each time step. The LSTM is a type if recurrent neural network which allows long term dependency information to flow through an additional path. This path can keep track of depencieslike plurality or negation. Finally the output layer makes our predictions. This is a fully connected layer that makes a prediction for the next word based on the current hidden state.

In [6]:
class LanguageModel(nn.Module):
    
    def __init__(self, ntoken, ninp,
                 nhid, nlayers, bsz,
                 dropout=0.5):
        super(LanguageModel, self).__init__()
        self.nhid, self.nlayers, self.bsz = nhid, nlayers, bsz
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid,ntoken)

        self.init_weights()
        self.hidden = self.init_hidden(bsz)
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)
 
    def forward(self, input):
        emb = self.drop(self.encoder(input))
        output, self.hidden = self.rnn(emb, self.hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1))
 
    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return (torch.tensor(weight.new(self.nlayers, bsz, self.nhid).zero_().cuda()),
                torch.tensor(weight.new(self.nlayers, bsz, self.nhid).zero_()).cuda())
  
    def reset_history(self):
        self.hidden = tuple(torch.tensor(h.data) for h in self.hidden)


In [7]:
weight_matrix = TEXT.vocab.vectors
model = LanguageModel(weight_matrix.size(0),
weight_matrix.size(1), 200, 3, 16)
model.encoder.weight.data.copy_(weight_matrix)
model.cuda()

LanguageModel(
  (drop): Dropout(p=0.5)
  (encoder): Embedding(28870, 300)
  (rnn): LSTM(300, 200, num_layers=3, dropout=0.5)
  (decoder): Linear(in_features=200, out_features=28870, bias=True)
)

Here we train our model. We calculate the error using negative log likelihood loss and we propogate the error of our predictions back to the each of the weights in our network. Updating the weights by the error times a small learning rate. If our loss on the validation set does not improve for more than 3 epochs we decay the learning rate by 10. Because of dropout, randomly throwing away connections in our network with probability p, we are able to avoid overfitting.

In [9]:
from tqdm import tqdm_notebook as tqdm

def train_epoch(epoch,criterion, optimizer, n_tokens):
    epoch_loss = 0
    for batch in tqdm(train_iter):
        model.reset_history()
    
        optimizer.zero_grad()
        
        text, targets = batch.text, batch.target
        prediction = model(text)
        loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
        loss.backward()
        
        optimizer.step()
        #set_trace()
        batch_loss = loss.item() * prediction.size(0) * prediction.size(1)
        
        batch_loss /= len(train.examples[0].text)
        
        epoch_loss += batch_loss
    
    val_loss = 0
        
    for index, batch in enumerate(tqdm(valid_iter)):
        model.reset_history()
        text, targets = batch.text, batch.target
        prediction = model(text)
        loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
        batch_loss = loss.item() * prediction.size(0) * prediction.size(1)
        batch_loss /= len(valid.examples[0].text) 
        
        val_loss += batch_loss
        
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))
    return val_loss
        

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 1e-3, betas=(0.7,0.99))
n_tokens = weight_matrix.size(0)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=0, verbose = True)
for i in range(30):
    val_losses = []
    val_loss = train_epoch(i,criterion, optimizer, n_tokens)
    val_losses.append(val_loss)
    scheduler.step(val_loss)
    if val_loss == min(val_losses):
        save_path = 'lstm_min_loss.pt'
        torch.save(model.state_dict(), save_path)
        
model.load_state_dict(torch.load('lstm_min_loss.pt'))

## Classifier

The classifier uses the same structure as our language model except for the linear layer. Instead of output after each word. We only take the last output of the rnn. We run our linear layer on this last output to predict to probabilities, one for the positive imdb class and one for the negative.

In [21]:
class ClassifierModel(nn.Module):
    
    def __init__(self, ntoken, ninp,
                 nhid, nlayers, bsz, noutputs,
                 dropout=0.6):
        super(ClassifierModel, self).__init__()
        self.nhid, self.nlayers, self.bsz = nhid, nlayers, bsz
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.linear = nn.Linear(nhid,noutputs)

    def forward(self, input):
        bsz = input.size()[1]
        if bsz != self.bsz:
            self.bsz = bsz
        emb = self.drop(self.encoder(input))
        self.hidden = (torch.tensor(emb.data.new(*(self.nlayers, self.bsz, self.nhid)).zero_()),
                       torch.tensor(emb.data.new(*(self.nlayers, self.bsz, self.nhid)).zero_()))
        output, _ = self.rnn(emb, self.hidden)
        return self.linear(self.drop(output[-1]))


In [20]:
batch_size = 4

In [22]:
model2 = ClassifierModel(weight_matrix.size(0),
weight_matrix.size(1), 200, 3,batch_size,2)
model2.cuda()

ClassifierModel(
  (drop): Dropout(p=0.6)
  (encoder): Embedding(28870, 300)
  (rnn): LSTM(300, 200, num_layers=3, dropout=0.6)
  (linear): Linear(in_features=200, out_features=2, bias=True)
)

Load the weights from the pretrained language model, we make sure the state is in the new model and the dimensions match, if not we initialize these weights from scratch.

In [23]:
model1_state = model.state_dict()
model2_state = model2.state_dict()
pretrained_state = { k:v for k,v in model1_state.items() if k in model2_state and v.size() == model2_state[k].size() }
model2_state.update(pretrained_state)
model2.load_state_dict(model2_state)

We use the same text object from earlier to split our dataset, so we have the same int to string and string to int mapping as the previous dataset.

In [24]:
LABEL = data.LabelField(tokenize='spacy')
train2, test2 = datasets.IMDB.splits(TEXT,LABEL)
train_iter2, test_iter2 = data.BucketIterator.splits((train2, test2), device='cuda', batch_size=batch_size,shuffle=True)
LABEL.build_vocab(train2)

In [39]:
optimizer = optim.Adam(model2.parameters(),lr=1e-3)
loss_function = nn.CrossEntropyLoss()
loss_function = loss_function.cuda()
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, verbose = True)
def fit(epoch,model2,data_loader,phase='training'):
    if phase == 'training':
        model2.train()
    if phase == 'validation':
        model2.eval()
    running_loss = 0.0
    running_correct = 0
    run_total = 0
    for batch_idx , batch in enumerate(tqdm(data_loader)):
        text , target = batch.text , batch.label
        
        if phase == 'training':
            optimizer.zero_grad()
        output = model2(text)
        output = output.squeeze(1)
        loss = loss_function(output,target)
        preds = output.data.max(dim=1,keepdim=True)[1]
        running_correct += (preds.squeeze() == target).float().sum()
        run_total += len(target)
        running_loss += loss.detach()
        if phase == 'training':            
            loss.backward()
            optimizer.step()
    loss = running_loss/len(data_loader.dataset)
    accuracy = 100. * float(running_correct)/float(run_total)
    print(phase.capitalize())
    print('Epoch: {}, Loss: {:.4f}, Accuracy: {:.4f}'.format(epoch, loss, accuracy))
    return loss,accuracy
train_losses , train_accuracy = [],[]
val_losses , val_accuracy = [],[]

In [40]:
# from tqdm import tqdm_notebook as tqdm
for epoch in tqdm(range(30)):
    epoch_loss, epoch_accuracy = fit(epoch,model2,train_iter2,phase='training')
    val_epoch_loss , val_epoch_accuracy = fit(epoch,model2,test_iter2,phase='validation')
    scheduler.step(val_epoch_loss)
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6250), HTML(value='')))

KeyboardInterrupt: 

In [27]:
print("train_losses")
print(train_losses)
print("train_accuracy")
print(train_accuracy)
print("val_losses")
print(val_losses)
print("val_accuracy")
print(val_accuracy)

train_losses
[tensor(0.1638, device='cuda:0')]
train_accuracy
[57.848]
val_losses
[tensor(0.1137, device='cuda:0')]
val_accuracy
[81.168]


## Model without transfer learning

In [29]:
model3 = ClassifierModel(weight_matrix.size(0),
weight_matrix.size(1), 200, 3, batch_size,2,dropout = 0.6)
model3.cuda()
optimizer = optim.Adam(model3.parameters(),lr=1e-3)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, verbose = True)
loss_function = nn.CrossEntropyLoss()
loss_function = loss_function.cuda()
train_losses2 , train_accuracy2 = [],[]
val_losses2 , val_accuracy2 = [],[] 

for epoch in tqdm(range(30)):
    epoch_loss, epoch_accuracy = fit(epoch,model3,train_iter2,phase='training')
    val_epoch_loss , val_epoch_accuracy = fit(epoch,model3,test_iter2,phase='validation')
    scheduler.step(val_epoch_loss)
    train_losses2.append(epoch_loss)
    train_accuracy2.append(epoch_accuracy)
    val_losses2.append(val_epoch_loss)
    val_accuracy2.append(val_epoch_accuracy)

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6250), HTML(value='')))

Epoch: 0, Loss: 0.1738, Accuracy: 51.2120


HBox(children=(IntProgress(value=0, max=6250), HTML(value='')))

Epoch: 0, Loss: 0.1774, Accuracy: 58.1880


HBox(children=(IntProgress(value=0, max=6250), HTML(value='')))

KeyboardInterrupt: 

In [None]:
print("train_losses")
print(train_losses2)
print("train_accuracy")
print(train_accuracy2)
print("val_losses")
print(val_losses2)
print("val_accuracy")
print(val_accuracy2)