# Init

In [0]:
# Execute this code block to install dependencies when running on colab
try:
    import torch
except:
    from os.path import exists
    from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
    platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
    cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
    accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

    !pip install -q http://download.pytorch.org/whl/{accelerator}/torch-1.0.0-{platform}-linux_x86_64.whl torchvision

try: 
    import torchbearer
except:
    !pip install torchbearer

Collecting torchbearer
[?25l  Downloading https://files.pythonhosted.org/packages/ff/e9/4049a47dd2e5b6346a2c5d215b0c67dce814afbab1cd54ce024533c4834e/torchbearer-0.5.3-py3-none-any.whl (138kB)
[K     |██▍                             | 10kB 17.6MB/s eta 0:00:01[K     |████▊                           | 20kB 4.6MB/s eta 0:00:01[K     |███████▏                        | 30kB 6.6MB/s eta 0:00:01[K     |█████████▌                      | 40kB 5.0MB/s eta 0:00:01[K     |███████████▉                    | 51kB 6.1MB/s eta 0:00:01[K     |██████████████▎                 | 61kB 7.1MB/s eta 0:00:01[K     |████████████████▋               | 71kB 8.0MB/s eta 0:00:01[K     |███████████████████             | 81kB 8.9MB/s eta 0:00:01[K     |█████████████████████▍          | 92kB 9.8MB/s eta 0:00:01[K     |███████████████████████▊        | 102kB 8.5MB/s eta 0:00:01[K     |██████████████████████████      | 112kB 8.5MB/s eta 0:00:01[K     |████████████████████████████▌   | 122kB 8.5MB/

# Prepare Penn Treebank dataset

In [0]:
# automatically reload external modules if they change
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data
from torchtext import vocab
from torchtext import datasets

import numpy as np
from matplotlib import pyplot as plt

from tqdm import tqdm

In [0]:
# Hyperparameters shared by the cells below.
embeddings_length = 300
hidden_size = 256
batch_size = 32


def tokenize(x):
    """Split a raw line into whitespace-separated tokens."""
    return x.split()


# Lower-cased, whitespace-tokenised text field used for all three PTB splits.
TEXT = data.Field(
    sequential=True,
    tokenize=tokenize,
    lower=True,
    batch_first=True,
)
train_dataset, val_dataset, test_dataset = datasets.PennTreebank.splits(TEXT)

# Build the vocabulary from the training split and attach 300-d GloVe vectors.
TEXT.build_vocab(train_dataset, vectors=vocab.GloVe(name='6B', dim=300))

word_embeddings = TEXT.vocab.vectors
vocab_size = len(TEXT.vocab)
print(vocab_size)
print(word_embeddings.size())

downloading ptb.train.txt


ptb.train.txt: 5.10MB [00:00, 54.2MB/s]                   


downloading ptb.valid.txt


ptb.valid.txt: 400kB [00:00, 24.9MB/s]                   


downloading ptb.test.txt


ptb.test.txt: 450kB [00:00, 25.0MB/s]                   
.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 399957/400000 [00:38<00:00, 10514.72it/s]

10001
torch.Size([10001, 300])


In [0]:
# BPTT iterators over the three splits. The batch size comes from the shared
# `batch_size` setting because the training loop sizes the LSTM hidden state
# with the same variable; the two must always agree.
train_iter, val_iter, test_iter = data.BPTTIterator.splits(
    (train_dataset, val_dataset, test_dataset),
    batch_size=batch_size,
    bptt_len=30,
    repeat=False,
)

In [0]:
class LstmLangModel(nn.Module):
    """Single-layer LSTM language model: embedding -> LSTM -> linear decoder.

    Args:
        batch_size: Stored on the instance as ``self.batch_size``; not used
            by the layers themselves.
        hidden_size: Number of LSTM hidden units.
        vocab_size: Number of rows in the embedding table and number of
            output logits per position.
        embeddings_length: Dimensionality of each embedding vector.
        weights: Tensor copied into the embedding table as its initial value.
    """

    def __init__(self, batch_size, hidden_size, vocab_size, embeddings_length, weights):
        super(LstmLangModel, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embed = nn.Embedding(vocab_size, embeddings_length)
        # Initialise from the supplied vectors without recording the copy in
        # autograd; the embedding remains trainable afterwards.
        with torch.no_grad():
            self.embed.weight.copy_(weights)
        self.lstm = nn.LSTM(embeddings_length, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h=None):
        """Run the model over a batch of token-id sequences.

        Args:
            x: Integer token ids laid out batch-first, i.e. (batch, seq).
            h: Optional ``(h_0, c_0)`` LSTM state. When omitted, ``nn.LSTM``
                starts from an all-zero state.

        Returns:
            ``(logits, (h_n, c_n))`` where ``logits`` has shape
            (batch * seq, vocab_size) and ``(h_n, c_n)`` is the final LSTM
            state.
        """
        embedded = self.embed(x)
        output_seq, (h, c) = self.lstm(embedded, h)
        # Merge the batch and time axes so every position becomes one row
        # for the decoder.
        flat = output_seq.reshape(-1, output_seq.size(2))
        logits = self.fc(flat)
        return logits, (h, c)



In [0]:
# Model hyperparameters plus the vocabulary-derived values read from TEXT.
embeddings_length, hidden_size, batch_size = 300, 256, 32

word_embeddings = TEXT.vocab.vectors
vocab_size = len(TEXT.vocab)
print(vocab_size)
print(word_embeddings.size())

10001
torch.Size([10001, 300])


In [0]:
# Instantiate the language model and put it in evaluation mode; the trailing
# expression makes the notebook display the module summary.
model = LstmLangModel(
    batch_size=batch_size,
    hidden_size=hidden_size,
    vocab_size=vocab_size,
    embeddings_length=embeddings_length,
    weights=word_embeddings,
)
model.eval()

LstmLangModel(
  (embed): Embedding(10001, 300)
  (lstm): LSTM(300, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=10001, bias=True)
)

# Torchbearer Section

In [0]:
# Train the LSTM language model with truncated back-propagation through time,
# recording the average loss and perplexity of every epoch and writing
# periodic checkpoints.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
net_lstm = LstmLangModel(batch_size, hidden_size, vocab_size, embeddings_length, word_embeddings)
net_lstm = net_lstm.to(device)
criterion = nn.CrossEntropyLoss()
# NOTE(review): this rebinds `optim`, shadowing the `torch.optim` module alias
# imported at the top of the notebook. The name is kept so that any other cell
# referring to `optim` as the optimizer keeps working.
optim = torch.optim.Adam(filter(lambda p: p.requires_grad, net_lstm.parameters()),
                         lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=True)
num_epochs = 200
checkpoint_every = 100  # save a checkpoint every this many epochs
epoch_list = []  # never filled in this cell; kept so the name stays defined
train_loss_lstm_list = []
train_perp_lstm_list = []


def detach(states):
    """Return the LSTM state tensors cut off from the autograd graph.

    The result is a tuple so that it has the same ``(h, c)`` form the LSTM
    itself returns; detaching stops gradients from flowing back into earlier
    batches.
    """
    return tuple(state.detach() for state in states)


for epoch in range(num_epochs):
    train_loss = 0
    # Fresh zero state at the start of every epoch, created on the target device.
    states = (torch.zeros(1, batch_size, hidden_size, device=device),
              torch.zeros(1, batch_size, hidden_size, device=device))
    net_lstm.train()

    for i, batch in enumerate(train_iter):
        # Assumes the iterator yields (bptt_len, batch) tensors, so the permute
        # produces the (batch, bptt_len) layout expected by the batch_first
        # LSTM - TODO confirm for the installed torchtext version.
        text = batch.text.to(device).permute(1, 0)
        labels = batch.target.to(device).permute(1, 0)

        optim.zero_grad()
        # Carry the state values across batches but not their gradient history.
        states = detach(states)
        outputs, states = net_lstm(text, states)
        loss = criterion(outputs, labels.reshape(-1))
        train_loss += loss.item()
        loss.backward()
        optim.step()

    avg_train_loss = train_loss / len(train_iter)
    perplexity = np.exp(avg_train_loss)
    print('Epoch [{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'.format(epoch + 1, num_epochs, avg_train_loss, perplexity))
    train_loss_lstm_list.append(avg_train_loss)
    train_perp_lstm_list.append(perplexity)

    # One combined condition, so the final epoch is not written twice when it
    # also falls on a periodic checkpoint.
    if epoch % checkpoint_every == 0 or epoch == num_epochs - 1:
        torch.save(net_lstm.state_dict(), r"./LSTM_" + str(epoch) + r".pth")

100%|█████████▉| 399957/400000 [00:50<00:00, 10514.72it/s]

Epoch [1/200], Loss: 5.7943, Perplexity: 328.42
Epoch [2/200], Loss: 5.0785, Perplexity: 160.53
Epoch [3/200], Loss: 4.7854, Perplexity: 119.74
Epoch [4/200], Loss: 4.5722, Perplexity: 96.76
Epoch [5/200], Loss: 4.4042, Perplexity: 81.79
Epoch [6/200], Loss: 4.2633, Perplexity: 71.04
Epoch [7/200], Loss: 4.1399, Perplexity: 62.79
Epoch [8/200], Loss: 4.0291, Perplexity: 56.21
Epoch [9/200], Loss: 3.9284, Perplexity: 50.83
Epoch [10/200], Loss: 3.8350, Perplexity: 46.29
Epoch [11/200], Loss: 3.7477, Perplexity: 42.42
Epoch [12/200], Loss: 3.6647, Perplexity: 39.04
Epoch [13/200], Loss: 3.5860, Perplexity: 36.09
Epoch [14/200], Loss: 3.5120, Perplexity: 33.51
Epoch [15/200], Loss: 3.4421, Perplexity: 31.25
Epoch [16/200], Loss: 3.3762, Perplexity: 29.26
Epoch [17/200], Loss: 3.3126, Perplexity: 27.46
Epoch [18/200], Loss: 3.2520, Perplexity: 25.84
Epoch [19/200], Loss: 3.1947, Perplexity: 24.40
Epoch [20/200], Loss: 3.1403, Perplexity: 23.11
Epoch [21/200], Loss: 3.0868, Perplexity: 21.9

In [0]:
import pandas as pd

# Collect the per-epoch training metrics into one table and save it as CSV.
# Column names are passed to the constructor so that empty metric lists still
# produce a valid (empty) two-column frame instead of failing when the column
# labels are assigned afterwards.
frame = pd.DataFrame(
    list(zip(train_loss_lstm_list, train_perp_lstm_list)),
    columns=['train_loss', 'train_perp'],
)
frame.to_csv('1_LSTM(AmsGrad).csv')

# plt.figure(figsize=(10,6))
# plt.title('SGD default lr, 200 epochs, reduce lr on plateau on 150 epoch')
# plt.plot(range(3),frame['train_loss'])


# plt.figure(figsize=(10,6))
# plt.title('SGD default lr, 200 epochs, reduce lr on plateau on 150 epoch')
# plt.plot(range(3),frame['train_perp'])