In [1]:
from IPython.core.debugger import set_trace
from torchtext.datasets import WikiText2, IMDB
import spacy
import re
import html
from torchtext import data, datasets
from spacy.symbols import ORTH
import torch
import torch.nn as nn
import torch.nn.functional as V
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [2]:
! ls data/

dogscats  dogscats.zip	wikitext-103  wikitext-2  wikitext-2-v1.zip


In [3]:
spacy_en  = spacy.load('en')

In [4]:
def tokenizer(x):
    return [tok.text for tok in spacy_en.tokenizer(x)]

In [5]:
TEXT = data.Field(lower=True, tokenize = tokenizer)


In [6]:
train, valid, test = WikiText2.splits(TEXT)

In [7]:
TEXT.build_vocab(train, vectors = "fasttext.en.300d")

In [8]:
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=16,
    bptt_len=30, # this is where we specify the sequence length
    device = "cuda",
    repeat=False)

In [9]:
class LanguageModel(nn.Module):
    
    def __init__(self, ntoken, ninp,
                 nhid, nlayers, bsz,
                 dropout=0.5):
        super(LanguageModel, self).__init__()
        self.nhid, self.nlayers, self.bsz = nhid, nlayers, bsz
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid,ntoken)

        self.init_weights()
        self.hidden = self.init_hidden(bsz)
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)
 
    def forward(self, input):
        emb = self.drop(self.encoder(input))
        output, self.hidden = self.rnn(emb, self.hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1))
 
    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return (torch.tensor(weight.new(self.nlayers, bsz, self.nhid).zero_().cuda()),
                torch.tensor(weight.new(self.nlayers, bsz, self.nhid).zero_()).cuda())
  
    def reset_history(self):
        self.hidden = tuple(torch.tensor(h.data) for h in self.hidden)


In [10]:
weight_matrix = TEXT.vocab.vectors
model = LanguageModel(weight_matrix.size(0),
weight_matrix.size(1), 200, 3, 16)
model.encoder.weight.data.copy_(weight_matrix)
model.cuda()

LanguageModel(
  (drop): Dropout(p=0.5)
  (encoder): Embedding(28870, 300)
  (rnn): LSTM(300, 200, num_layers=3, dropout=0.5)
  (decoder): Linear(in_features=200, out_features=28870, bias=True)
)

In [None]:
model.load_state_dict(torch.load('lstm_8.pt'))

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 1e-3, betas=(0.7,0.99))
n_tokens = weight_matrix.size(0)

In [12]:
from tqdm import tqdm_notebook as tqdm

def train_epoch(epoch):
    epoch_loss = 0
    for batch in tqdm(train_iter):
        model.reset_history()
    
        optimizer.zero_grad()
        
        text, targets = batch.text, batch.target
        prediction = model(text)
        loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
        loss.backward()
        
        optimizer.step()
        #set_trace()
        batch_loss = loss.item() * prediction.size(0) * prediction.size(1)
        
        batch_loss /= len(train.examples[0].text)
        
        epoch_loss += batch_loss
    
    val_loss = 0
        
    for index, batch in enumerate(tqdm(valid_iter)):
        model.reset_history()
        text, targets = batch.text, batch.target
        prediction = model(text)
        loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
        batch_loss = loss.item() * prediction.size(0) * prediction.size(1)
        batch_loss /= len(valid.examples[0].text) 
        
        val_loss += batch_loss
        
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))
    return val_loss
        

In [13]:
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=0)
for i in range(30):
    val_losses = []
    val_loss = train_epoch(i)
    val_losses.append(val_loss)
    scheduler.step(val_loss)
    if val_loss == min(val_losses):
        save_path = 'lstm_' + str(i) +'.pt'
        torch.save(model.state_dict(), save_path)





Epoch: 0, Training Loss: 6.0129, Validation Loss: 5.2913






Epoch: 1, Training Loss: 5.6318, Validation Loss: 5.1483






Epoch: 2, Training Loss: 5.4884, Validation Loss: 5.0862






Epoch: 3, Training Loss: 5.4244, Validation Loss: 5.0909






Epoch: 4, Training Loss: 5.3769, Validation Loss: 5.0361






Epoch: 5, Training Loss: 5.3623, Validation Loss: 5.0320






Epoch: 6, Training Loss: 5.3591, Validation Loss: 5.0381






Epoch: 7, Training Loss: 5.4166, Validation Loss: 4.9961






Epoch: 8, Training Loss: 5.4075, Validation Loss: 4.9906






Epoch: 9, Training Loss: 5.4041, Validation Loss: 4.9940






Epoch: 10, Training Loss: 5.4310, Validation Loss: 4.9833






Epoch: 11, Training Loss: 5.4255, Validation Loss: 4.9791






Epoch: 12, Training Loss: 5.4225, Validation Loss: 4.9790






Epoch: 13, Training Loss: 5.4229, Validation Loss: 4.9779






Epoch: 14, Training Loss: 5.4236, Validation Loss: 4.9772






Epoch: 15, Training Loss: 5.4226, Validation Loss: 4.9787






Epoch: 16, Training Loss: 5.4226, Validation Loss: 4.9782






Epoch: 17, Training Loss: 5.4230, Validation Loss: 4.9774






Epoch: 18, Training Loss: 5.4223, Validation Loss: 4.9768






Epoch: 19, Training Loss: 5.4222, Validation Loss: 4.9789






Epoch: 20, Training Loss: 5.4223, Validation Loss: 4.9767






Epoch: 21, Training Loss: 5.4238, Validation Loss: 4.9754






Epoch: 22, Training Loss: 5.4226, Validation Loss: 4.9776






Epoch: 23, Training Loss: 5.4225, Validation Loss: 4.9761






Epoch: 24, Training Loss: 5.4222, Validation Loss: 4.9771






Epoch: 25, Training Loss: 5.4225, Validation Loss: 4.9791






Epoch: 26, Training Loss: 5.4221, Validation Loss: 4.9780






Epoch: 27, Training Loss: 5.4215, Validation Loss: 4.9755






Epoch: 28, Training Loss: 5.4230, Validation Loss: 4.9777






Epoch: 29, Training Loss: 5.4228, Validation Loss: 4.9749


In [14]:
class ClassifierModel(nn.Module):
    
    def __init__(self, ntoken, ninp,
                 nhid, nlayers, bsz, noutputs,
                 dropout=0.5):
        super(ClassifierModel, self).__init__()
        self.nhid, self.nlayers, self.bsz = nhid, nlayers, bsz
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.linear = nn.Linear(nhid,noutputs)

    def forward(self, input):
        bsz = input.size()[1]
        if bsz != self.bsz:
            self.bsz = bsz
        emb = self.drop(self.encoder(input))
        self.hidden = (torch.tensor(emb.data.new(*(self.nlayers, self.bsz, self.nhid)).zero_()),
                       torch.tensor(emb.data.new(*(self.nlayers, self.bsz, self.nhid)).zero_()))
        output, _ = self.rnn(emb, self.hidden)
        return self.linear(output[-1])


In [15]:
model2 = ClassifierModel(weight_matrix.size(0),
weight_matrix.size(1), 200, 3, 4,2)
model2.cuda()

ClassifierModel(
  (drop): Dropout(p=0.5)
  (encoder): Embedding(28870, 300)
  (rnn): LSTM(300, 200, num_layers=3, dropout=0.5)
  (linear): Linear(in_features=200, out_features=2, bias=True)
)

In [16]:
model1_state = model.state_dict()
model2_state = model2.state_dict()
pretrained_state = { k:v for k,v in model1_state.items() if k in model2_state and v.size() == model2_state[k].size() }
model2_state.update(pretrained_state)
model2.load_state_dict(model2_state)

In [17]:
LABEL = data.LabelField(tokenize='spacy')
train2, test2 = datasets.IMDB.splits(TEXT,LABEL)

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:04<00:00, 18.2MB/s]


In [18]:
train_iter2, test_iter2 = data.BucketIterator.splits((train2, test2), device='cuda', batch_size=4,shuffle=True)
LABEL.build_vocab(train2)

In [19]:
optimizer = optim.Adam(model2.parameters(),lr=1e-3)
loss_function = nn.CrossEntropyLoss()
loss_function = loss_function.cuda()
def fit(epoch,model2,data_loader,phase='training'):
    if phase == 'training':
        model2.train()
    if phase == 'validation':
        model2.eval()
    running_loss = 0.0
    running_correct = 0
    run_total = 0
    for batch_idx , batch in enumerate(data_loader):
        text , target = batch.text , batch.label
        
        if phase == 'training':
            optimizer.zero_grad()
        output = model2(text)
        output = output.squeeze(1)
        loss = loss_function(output,target)
        preds = output.data.max(dim=1,keepdim=True)[1]
        running_correct += (preds.squeeze() == target).float().sum()
        run_total += len(target)
        running_loss += loss.detach()
        if phase == 'training':            
            loss.backward()
            optimizer.step()
    loss = running_loss/len(data_loader.dataset)
    accuracy = 100. * float(running_correct)/float(run_total)
    print('Epoch: {}, Loss: {:.4f}, Accuracy: {:.4f}'.format(epoch, loss, accuracy))
    return loss,accuracy
train_losses , train_accuracy = [],[]
val_losses , val_accuracy = [],[]

In [20]:
from tqdm import tqdm_notebook as tqdm
for epoch in tqdm(range(30)):
    epoch_loss, epoch_accuracy = fit(epoch,model2,train_iter2,phase='training')
    val_epoch_loss , val_epoch_accuracy = fit(epoch,model2,test_iter2,phase='validation')
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)

Epoch: 0, Loss: 0.1541, Accuracy: 62.6680
Epoch: 0, Loss: 0.0951, Accuracy: 84.5080
Epoch: 1, Loss: 0.0797, Accuracy: 87.0520
Epoch: 1, Loss: 0.0725, Accuracy: 88.6520
Epoch: 2, Loss: 0.0549, Accuracy: 91.5640
Epoch: 2, Loss: 0.0689, Accuracy: 88.8320
Epoch: 3, Loss: 0.0418, Accuracy: 93.8120
Epoch: 3, Loss: 0.0760, Accuracy: 88.1280
Epoch: 4, Loss: 0.0315, Accuracy: 95.5040
Epoch: 4, Loss: 0.0750, Accuracy: 88.8280
Epoch: 5, Loss: 0.0241, Accuracy: 96.7920
Epoch: 5, Loss: 0.0861, Accuracy: 88.3920
Epoch: 6, Loss: 0.0199, Accuracy: 97.4560
Epoch: 6, Loss: 0.0985, Accuracy: 87.9320
Epoch: 7, Loss: 0.0157, Accuracy: 97.9920
Epoch: 7, Loss: 0.1272, Accuracy: 87.1600
Epoch: 8, Loss: 0.0138, Accuracy: 98.2400
Epoch: 8, Loss: 0.1175, Accuracy: 87.1720
Epoch: 9, Loss: 0.0113, Accuracy: 98.5200
Epoch: 9, Loss: 0.1593, Accuracy: 86.4120
Epoch: 10, Loss: 0.0101, Accuracy: 98.8040
Epoch: 10, Loss: 0.1331, Accuracy: 86.6600
Epoch: 11, Loss: 0.0096, Accuracy: 98.7760
Epoch: 11, Loss: 0.1309, Accura

In [21]:
print("train_losses")
print(train_losses)
print("train_accuracy")
print(train_accuracy)
print("val_losses")
print(val_losses)
print("val_accuracy")
print(val_accuracy)

train_losses
[tensor(0.1541, device='cuda:0'), tensor(0.0797, device='cuda:0'), tensor(0.0549, device='cuda:0'), tensor(0.0418, device='cuda:0'), tensor(0.0315, device='cuda:0'), tensor(0.0241, device='cuda:0'), tensor(0.0199, device='cuda:0'), tensor(0.0157, device='cuda:0'), tensor(0.0138, device='cuda:0'), tensor(0.0113, device='cuda:0'), tensor(0.0101, device='cuda:0'), tensor(0.0096, device='cuda:0'), tensor(0.0081, device='cuda:0'), tensor(0.0080, device='cuda:0'), tensor(0.0076, device='cuda:0'), tensor(0.0065, device='cuda:0'), tensor(0.0066, device='cuda:0'), tensor(0.0061, device='cuda:0'), tensor(0.0063, device='cuda:0'), tensor(0.0059, device='cuda:0'), tensor(0.0060, device='cuda:0'), tensor(0.0056, device='cuda:0'), tensor(0.0060, device='cuda:0'), tensor(0.0056, device='cuda:0'), tensor(0.0054, device='cuda:0'), tensor(0.0063, device='cuda:0'), tensor(0.0064, device='cuda:0'), tensor(0.0056, device='cuda:0'), tensor(0.0061, device='cuda:0'), tensor(0.0063, device='cuda:0