In [1]:
from IPython.core.debugger import set_trace
from torchtext.datasets import WikiText2
import spacy
import re
import html
from torchtext import data
from spacy.symbols import ORTH
import torch
import torch.nn as nn
import torch.nn.functional as V
import torch.optim as optim

In [2]:
! ls data/

dogscats  dogscats.zip	wikitext-103  wikitext-2  wikitext-2-v1.zip


In [3]:
spacy_en  = spacy.load('en')

In [4]:
def tokenizer(x):
    return [tok.text for tok in spacy_en.tokenizer(x)]

In [5]:
TEXT = data.Field(lower=True, tokenize = tokenizer)


In [6]:
train, valid, test = WikiText2.splits(TEXT)

In [7]:
train

<torchtext.datasets.language_modeling.WikiText2 at 0x7f8095f20710>

In [8]:
for ex in train.examples:
    print(ex.text[0:10])

[' ', '<eos>', ' ', '=', 'valkyria', 'chronicles', 'iii', '=', '<eos>', ' ']


In [9]:
TEXT.build_vocab(train, vectors = "fasttext.en.300d")

In [10]:
TEXT

<torchtext.data.field.Field at 0x7f8095f205f8>

In [11]:
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=32,
    bptt_len=30, # this is where we specify the sequence length
    device = "cuda",
    repeat=False)

In [12]:
for batch in train_iter:
    print("text")
    print(batch.text.data)
    print("target")
    print(batch.target.data)
    break

text
tensor([[   12,  1934, 20015,    15,    29,    21, 11667,     2,     3,     3,
         11200,    13,    56,    15,  1710,  4475,    10,    18,    10,    19,
            15,     2,     2,    19,  1286,    75,    13, 26206,    19,     4,
             3,     2],
        [   13,    10,    30,    15,     4,     5,  4195,  3768,     2,    55,
            26,    12,  3886,    15,     3,     5,     2,   140,   489,  2282,
            15, 13895,    16,     2,     4,   655,    12,   532,     2,  1249,
           149, 16441],
        [   12,    32,     2,    15,  3062,    64,    27,    11,   121,    11,
            20,    13,     4,    13,   129,    64,   107,  2991,  2613,    21,
            15,     5,  7840, 19860,    13,   182,    13,    36,   939,     6,
          3948,  1307],
        [   15,   472, 10782,  1957,    22,   992,   207,    35,  3289,   194,
            50,    12,   946,    12,     2,   462,     4,     5,     3,  5023,
            13,     2,     9,     5,    12,   312,    

In [13]:
batch.text.data[:,0]

tensor([   12,    13,    12,    15,  3875,  3895,   889,    15,    13,    12,
           13,    12, 20060,    93,  3875,    98,    52,     6,     8,     7,
         3895,    27,   789,    52, 28868,     3,  6216,     4,  3875,     5],
       device='cuda:0')

In [14]:
batch.target.data[:,0]

tensor([   13,    12,    15,  3875,  3895,   889,    15,    13,    12,    13,
           12, 20060,    93,  3875,    98,    52,     6,     8,     7,  3895,
           27,   789,    52, 28868,     3,  6216,     4,  3875,     5,     2],
       device='cuda:0')

In [51]:
class LanguageModel(nn.Module):
    def __init__(self, ntoken, ninp,
                 nhid, nlayers, bsz,
                 dropout=0.5):
        super(LanguageModel, self).__init__()
        self.nhid, self.nlayers, self.bsz = nhid, nlayers, bsz
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid,ntoken)

        self.init_weights()
        self.hidden = self.init_hidden(bsz)
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)
 
    def forward(self, input):
        emb = self.drop(self.encoder(input))
        output, self.hidden = self.rnn(emb, self.hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1))
 
    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return (torch.tensor(weight.new(self.nlayers, bsz, self.nhid).zero_().cuda()),
                torch.tensor(weight.new(self.nlayers, bsz, self.nhid).zero_()).cuda())
  
    def reset_history(self):
        self.hidden = tuple(torch.tensor(v.data) for v in self.hidden)

In [52]:
weight_matrix = TEXT.vocab.vectors
model = LanguageModel(weight_matrix.size(0),
weight_matrix.size(1), 200, 1, 32)
model.encoder.weight.data.copy_(weight_matrix)
model.cuda()

  "num_layers={}".format(dropout, num_layers))


LanguageModel(
  (drop): Dropout(p=0.5)
  (encoder): Embedding(28870, 300)
  (rnn): LSTM(300, 200, dropout=0.5)
  (decoder): Linear(in_features=200, out_features=28870, bias=True)
)

In [58]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 1e-3, betas=(0.9,0.999))
n_tokens = weight_matrix.size(0)

In [61]:
from tqdm import tqdm
def train_epoch(epoch):
    epoch_loss = 0
    for batch in tqdm(train_iter):
        model.reset_history()
    
        optimizer.zero_grad()
        
        text, targets = batch.text, batch.target
        prediction = model(text)
        loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
        loss.backward()
        
        optimizer.step()
        
        batch_loss = loss.item() * prediction.size(0) * prediction.size(1)
        
        batch_loss /= len(train.examples[0].text)
        
        epoch_loss += batch_loss
        

    val_loss = 0
    for batch in valid_iter:
        model.reset_history()
        text, targets = batch.text, batch.target
        prediction = model(text)
        loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
        batch_loss = loss.item() * text.size(0)
        batch_loss /= len(valid.examples[0].text) 
        val_loss += batch_loss
        
        
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))
        

In [62]:
for i in range(2):
    train_epoch(i)


  0%|          | 0/2330 [00:00<?, ?it/s][A
  0%|          | 1/2330 [00:00<19:06,  2.03it/s][A
  0%|          | 6/2330 [00:00<03:56,  9.81it/s][A
  0%|          | 11/2330 [00:00<02:34, 15.01it/s][A
  1%|          | 15/2330 [00:00<02:13, 17.38it/s][A
  1%|          | 20/2330 [00:00<01:53, 20.32it/s][A
  1%|          | 25/2330 [00:01<01:42, 22.60it/s][A
  1%|▏         | 30/2330 [00:01<01:34, 24.42it/s][A
Exception in thread Thread-10:
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 2330/2330 [00:56<00:00, 40.96it/s]
  0%|          

Epoch: 0, Training Loss: 5.3230, Validation Loss: 0.1537


100%|██████████| 2330/2330 [00:57<00:00, 40.82it/s]


Epoch: 1, Training Loss: 5.2084, Validation Loss: 0.1515


In [63]:
for i in range(5):
    train_epoch(i)

100%|██████████| 2330/2330 [00:56<00:00, 41.02it/s]
  0%|          | 0/2330 [00:00<?, ?it/s]

Epoch: 0, Training Loss: 5.1372, Validation Loss: 0.1505


100%|██████████| 2330/2330 [00:57<00:00, 40.82it/s]
  0%|          | 0/2330 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 5.0868, Validation Loss: 0.1498


100%|██████████| 2330/2330 [00:57<00:00, 40.87it/s]
  0%|          | 0/2330 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 5.0499, Validation Loss: 0.1494


100%|██████████| 2330/2330 [00:56<00:00, 40.89it/s]
  0%|          | 0/2330 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 5.0243, Validation Loss: 0.1491


100%|██████████| 2330/2330 [00:58<00:00, 39.77it/s]


Epoch: 4, Training Loss: 5.0054, Validation Loss: 0.1490
