In [1]:
import os
import torch 
import torch.nn as nn
import numpy as np
from torch.autograd import Variable


In [2]:
# Hyper Parameters (module-level globals read by the cells below)
embed_size = 128  # width of each word-embedding vector (nn.Embedding output / LSTM input)
hidden_size = 1024  # LSTM hidden-state width; also the input width of the output Linear layer
num_layers = 1  # number of stacked LSTM layers
num_epochs = 5  # passes of the training loop over `ids`
num_samples = 1000   # number of words to be sampled
batch_size = 20  # number of rows the token-id stream is reshaped into by Corpus.get_data
seq_length = 30  # time steps per input/target window in the training loop
learning_rate = 0.002  # `lr` passed to torch.optim.Adam


## Data processing

In [3]:
class Dictionary(object):
    """Two-way lookup between words and consecutive integer ids (0, 1, 2, ...)."""

    def __init__(self):
        # Forward (word -> id) and reverse (id -> word) tables, kept in sync by add_word.
        self.word2idx, self.idx2word = {}, {}
        # Id that the next previously unseen word will receive.
        self.idx = 0

    def add_word(self, word):
        """Register `word` under the next free id; an already known word is left untouched."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __len__(self):
        """Return the number of distinct words registered so far."""
        return len(self.word2idx)

In [4]:
class Corpus(object):
    """Tokenises a whitespace-separated text file into a batched matrix of word ids."""

    def __init__(self, path='data'):
        """Create a corpus with an empty vocabulary.

        Args:
            path: Accepted for backward compatibility; it is not used here.
                The file to read is passed to `get_data` instead.
        """
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        """Read `path`, grow the vocabulary, and return the token ids as a 2-D tensor.

        Every line is split on whitespace and terminated with an '<eos>' token.
        Words not yet in `self.dictionary` are added in first-seen order.

        Args:
            path: Text file to tokenise.
            batch_size: Number of rows of the returned tensor.

        Returns:
            A LongTensor of shape (batch_size, total_tokens // batch_size). Trailing
            tokens that do not fill a complete column across all rows are dropped.
        """
        # Single pass over the file: register each word and record its id right away.
        # Ids never change once assigned, so this yields the same sequence as a
        # separate "build vocabulary" pass followed by a "look up ids" pass, without
        # reading the file twice or writing into an uninitialised tensor per element.
        token_ids = []
        with open(path, 'r') as f:
            for line in f:
                for word in line.split() + ['<eos>']:
                    self.dictionary.add_word(word)
                    token_ids.append(self.dictionary.word2idx[word])

        ids = torch.LongTensor(token_ids)

        # Keep only as many tokens as divide evenly into `batch_size` rows.
        num_batches = ids.size(0) // batch_size
        ids = ids[:num_batches*batch_size]
        return ids.view(batch_size, -1)

In [5]:
# Scratch check of torch.LongTensor(n) and .view().
# NOTE(review): LongTensor(10) only sizes the tensor and assigns no values; the
# arbitrary numbers in the recorded output below are consistent with uninitialised memory.
a = torch.LongTensor(10)
print(a)
# First element only.
print(a[0])
# Same 10 elements viewed as 2 rows; -1 lets the column count be inferred (5 here).
print(a.view(2, -1))


 2.3058e+18
 2.3058e+18
 5.0000e+00
 0.0000e+00
 0.0000e+00
 0.0000e+00
 0.0000e+00
 0.0000e+00
 2.3058e+18
 1.4161e+15
[torch.LongTensor of size 10]

2305843009213693952

 2.3058e+18  2.3058e+18  5.0000e+00  0.0000e+00  0.0000e+00
 0.0000e+00  0.0000e+00  0.0000e+00  2.3058e+18  1.4161e+15
[torch.LongTensor of size 2x5]



In [7]:
# Load Penn Treebank Dataset
# Training text, read by Corpus.get_data below.
train_path = 'language_model/data/train.txt'
# Output file that the sampling cell writes the generated words to.
sample_path = 'language_model/data/sample.txt'
corpus = Corpus()
# Token ids reshaped by get_data into batch_size rows.
ids = corpus.get_data(train_path, batch_size)
# Number of distinct tokens seen in the training file (including '<eos>').
vocab_size = len(corpus.dictionary)
# Number of whole seq_length windows that fit along dimension 1 of `ids`.
num_batches = ids.size(1) // seq_length

In [8]:
# Inspect the shape of the id matrix: (batch_size, tokens per row).
ids.size()

torch.Size([20, 46479])

In [9]:
# Report the vocabulary size and the number of seq_length windows per row of `ids`.
print('vocab_size: ', vocab_size)
print('num_batches: ', num_batches)

vocab_size:  10000
num_batches:  1549


In [10]:
class RNNLM(nn.Module):
    """LSTM language model: word embedding -> LSTM -> linear projection onto the vocabulary."""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        """Build the three layers and apply the custom weight initialisation.

        Args:
            vocab_size: Number of distinct word ids (embedding rows and output logits).
            embed_size: Width of each embedding vector.
            hidden_size: Width of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        # batch_first=True: the LSTM consumes and produces (batch, seq, feature) tensors.
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Draw embedding and output weights uniformly from [-0.1, 0.1]; zero the output bias."""
        init_range = 0.1
        for layer in (self.embed, self.linear):
            layer.weight.data.uniform_(-init_range, init_range)
        self.linear.bias.data.zero_()

    def forward(self, inputs, hidden):
        """Run one window of word ids through the model.

        Args:
            inputs: Word ids of shape (batch, seq).
            hidden: LSTM state handed straight to `self.lstm`.

        Returns:
            (logits, hidden): logits of shape (batch * seq, vocab_size) and the
            LSTM state after the last time step.
        """
        # Word ids -> embedding vectors.
        embedded = self.embed(inputs)

        # Hidden state of every time step plus the final LSTM state.
        lstm_out, hidden = self.lstm(embedded, hidden)

        # Merge the batch and time axes so the linear layer scores every step at once.
        n_batch, n_steps, n_hidden = lstm_out.size()
        flat = lstm_out.contiguous().view(n_batch * n_steps, n_hidden)

        return self.linear(flat), hidden

In [12]:
# Instantiate the network from the hyperparameters and the vocabulary size of the loaded corpus.
language_model = RNNLM(vocab_size, embed_size, hidden_size, num_layers)
# Move the parameters to the GPU; the training and sampling cells also move their tensors with .cuda().
language_model.cuda()

# Show the layer structure.
print(language_model)

RNNLM(
  (embed): Embedding(10000, 128)
  (lstm): LSTM(128, 1024, batch_first=True)
  (linear): Linear(in_features=1024, out_features=10000, bias=True)
)


In [13]:
# Cross-entropy between the model's per-step vocabulary logits and the target word ids.
criterion = nn.CrossEntropyLoss()
# Adam over every parameter of the language model, using the configured learning rate.
optimizer = torch.optim.Adam(language_model.parameters(), lr=learning_rate)

In [14]:
# truncated backpropagation
def detach(states):
    """Return a list holding a detached copy of every tensor in `states`.

    The training loop calls this before each window so that gradients are not
    propagated back through earlier windows.
    """
    detached = []
    for tensor in states:
        detached.append(tensor.detach())
    return detached


In [15]:
# Shape check of the training windows: this cell only slices `ids` and prints sizes;
# it never calls the model, the loss, or the optimizer.
# NOTE(review): `c` and `states` are assigned but not used anywhere in this cell.
c = 0
for epoch in range(num_epochs):
    # initial hidden states and memory states
    states = (Variable(torch.zeros(num_layers, batch_size, hidden_size)), 
              Variable(torch.zeros(num_layers, batch_size, hidden_size)))
    
    # NOTE(review): one line is printed per window per epoch, hence the very long output below.
    for i in range(0, ids.size(1)-seq_length, seq_length):
        # get batch inputs and targets
        # inputs: seq_length consecutive columns starting at i.
        inputs = Variable(ids[:, i:i+seq_length])
        # targets: the same window shifted one column to the right (the next word at each step).
        targets = Variable(ids[:, i+1:i+1+seq_length].contiguous())
        print(inputs.size(), targets.size())

torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([2

torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([2

torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([2

torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([2

torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([2

torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([2

torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([2

torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([2

torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([2

torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([2

torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([20, 30])
torch.Size([20, 30]) torch.Size([2

In [13]:
# training
# Global iteration counter across all epochs; it drives the logging interval below.
c = 0
for epoch in range(num_epochs):
    # initial hidden states and memory states: zeros of shape
    # (num_layers, batch_size, hidden_size), created afresh at the start of every epoch
    states = (Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda(),
              Variable(torch.zeros(num_layers, batch_size, hidden_size)).cuda())

    # Put the network in training mode. This used to read `model.train()`, but no
    # cell shown here defines `model` (NameError on a fresh kernel); the network
    # is bound to `language_model`.
    language_model.train()
    for i in range(0, ids.size(1)-seq_length, seq_length):
        # get batch inputs and targets: targets are the inputs shifted one step ahead
        inputs = Variable(ids[:, i:i+seq_length]).cuda()
        targets = Variable(ids[:, i+1:i+1+seq_length].contiguous()).cuda()

        # forward, backward, optimize
        language_model.zero_grad()
        # Detach the carried-over state so backpropagation stops at the window
        # boundary (truncated backpropagation through time).
        states = detach(states)
        outputs, states = language_model(inputs, states)
        # outputs is (batch*seq, vocab); flatten targets to match.
        loss = criterion(outputs, targets.view(-1))
        loss.backward()
        # Rescale gradients so their total norm does not exceed 0.5.
        torch.nn.utils.clip_grad_norm(language_model.parameters(), 0.5)
        optimizer.step()

        c += 1
        if c % 20 == 0:
            # Python float of the scalar loss: `.item()` where the tensor offers it,
            # otherwise the older `loss.data[0]` indexing this notebook was written with.
            loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
            print("Epoch: {}/{}...".format(epoch+1, num_epochs),
                  "Iterations: {}...".format(c),
                  "Loss: {:5.4f}".format(loss_value),
                  "Perplexity: {:5.2f}".format(np.exp(loss_value)))

Epoch: 1/5... Iterations: 20... Loss: 6.8328 Perplexity: 927.75
Epoch: 1/5... Iterations: 40... Loss: 6.5764 Perplexity: 717.91
Epoch: 1/5... Iterations: 60... Loss: 6.5998 Perplexity: 734.96
Epoch: 1/5... Iterations: 80... Loss: 6.3462 Perplexity: 570.31
Epoch: 1/5... Iterations: 100... Loss: 6.1673 Perplexity: 476.88
Epoch: 1/5... Iterations: 120... Loss: 6.1071 Perplexity: 449.02
Epoch: 1/5... Iterations: 140... Loss: 5.8026 Perplexity: 331.15
Epoch: 1/5... Iterations: 160... Loss: 6.0299 Perplexity: 415.68
Epoch: 1/5... Iterations: 180... Loss: 5.6619 Perplexity: 287.70
Epoch: 1/5... Iterations: 200... Loss: 5.9642 Perplexity: 389.24
Epoch: 1/5... Iterations: 220... Loss: 5.8442 Perplexity: 345.21
Epoch: 1/5... Iterations: 240... Loss: 5.7608 Perplexity: 317.60
Epoch: 1/5... Iterations: 260... Loss: 5.8342 Perplexity: 341.79
Epoch: 1/5... Iterations: 280... Loss: 5.4582 Perplexity: 234.67
Epoch: 1/5... Iterations: 300... Loss: 5.7311 Perplexity: 308.31
Epoch: 1/5... Iterations: 320

Epoch: 2/5... Iterations: 2540... Loss: 4.4608 Perplexity: 86.55
Epoch: 2/5... Iterations: 2560... Loss: 4.5195 Perplexity: 91.79
Epoch: 2/5... Iterations: 2580... Loss: 4.5868 Perplexity: 98.18
Epoch: 2/5... Iterations: 2600... Loss: 4.3641 Perplexity: 78.58
Epoch: 2/5... Iterations: 2620... Loss: 4.4506 Perplexity: 85.68
Epoch: 2/5... Iterations: 2640... Loss: 4.6757 Perplexity: 107.31
Epoch: 2/5... Iterations: 2660... Loss: 4.5305 Perplexity: 92.81
Epoch: 2/5... Iterations: 2680... Loss: 4.4191 Perplexity: 83.02
Epoch: 2/5... Iterations: 2700... Loss: 4.2477 Perplexity: 69.94
Epoch: 2/5... Iterations: 2720... Loss: 4.4351 Perplexity: 84.36
Epoch: 2/5... Iterations: 2740... Loss: 4.2698 Perplexity: 71.51
Epoch: 2/5... Iterations: 2760... Loss: 4.1526 Perplexity: 63.60
Epoch: 2/5... Iterations: 2780... Loss: 4.5697 Perplexity: 96.52
Epoch: 2/5... Iterations: 2800... Loss: 3.9035 Perplexity: 49.58
Epoch: 2/5... Iterations: 2820... Loss: 4.0166 Perplexity: 55.51
Epoch: 2/5... Iterations

Epoch: 4/5... Iterations: 5080... Loss: 3.8197 Perplexity: 45.59
Epoch: 4/5... Iterations: 5100... Loss: 3.4889 Perplexity: 32.75
Epoch: 4/5... Iterations: 5120... Loss: 3.5738 Perplexity: 35.65
Epoch: 4/5... Iterations: 5140... Loss: 3.5470 Perplexity: 34.71
Epoch: 4/5... Iterations: 5160... Loss: 3.5901 Perplexity: 36.24
Epoch: 4/5... Iterations: 5180... Loss: 3.4525 Perplexity: 31.58
Epoch: 4/5... Iterations: 5200... Loss: 3.4787 Perplexity: 32.42
Epoch: 4/5... Iterations: 5220... Loss: 3.7744 Perplexity: 43.57
Epoch: 4/5... Iterations: 5240... Loss: 3.7538 Perplexity: 42.68
Epoch: 4/5... Iterations: 5260... Loss: 3.6153 Perplexity: 37.16
Epoch: 4/5... Iterations: 5280... Loss: 3.6235 Perplexity: 37.47
Epoch: 4/5... Iterations: 5300... Loss: 3.6385 Perplexity: 38.03
Epoch: 4/5... Iterations: 5320... Loss: 3.5055 Perplexity: 33.30
Epoch: 4/5... Iterations: 5340... Loss: 3.7164 Perplexity: 41.12
Epoch: 4/5... Iterations: 5360... Loss: 3.6282 Perplexity: 37.65
Epoch: 4/5... Iterations:

Epoch: 5/5... Iterations: 7620... Loss: 2.9670 Perplexity: 19.43
Epoch: 5/5... Iterations: 7640... Loss: 2.9306 Perplexity: 18.74
Epoch: 5/5... Iterations: 7660... Loss: 2.8656 Perplexity: 17.56
Epoch: 5/5... Iterations: 7680... Loss: 2.9138 Perplexity: 18.43
Epoch: 5/5... Iterations: 7700... Loss: 3.0475 Perplexity: 21.06
Epoch: 5/5... Iterations: 7720... Loss: 2.9476 Perplexity: 19.06
Epoch: 5/5... Iterations: 7740... Loss: 3.0340 Perplexity: 20.78


In [14]:
# sampling: generate num_samples words one at a time and write them to sample_path
with open(sample_path, 'w') as f:
    # Zero LSTM state (hidden, memory) for a batch that holds a single sequence.
    hidden_init = Variable(torch.zeros(num_layers, 1, hidden_size)).cuda()
    memory_init = Variable(torch.zeros(num_layers, 1, hidden_size)).cuda()
    states = (hidden_init, memory_init)

    # Starting word: one id drawn with equal weight for every vocabulary entry.
    prob = torch.ones(vocab_size)
    start_id = torch.multinomial(prob, num_samples=1).unsqueeze(1)
    inputs = Variable(start_id, volatile=True).cuda()

    for i in range(num_samples):
        # One LSTM step on the current word, carrying the state forward.
        outputs, states = language_model(inputs, states)

        # Exponentiated logits act as sampling weights for the next word id.
        prob = outputs.squeeze().data.exp().cpu()
        word_id = torch.multinomial(prob, 1)[0]

        # The sampled id becomes the input of the next step (written in place).
        inputs.data.fill_(word_id)

        # Write the word: '<eos>' ends the line, any other word is followed by a space.
        word = corpus.dictionary.idx2word[word_id]
        if word == '<eos>':
            word = '\n'
        else:
            word = word + ' '
        f.write(word)

        # Progress message after every 100 sampled words.
        if (i + 1) % 100 == 0:
            print("sampled [%d/%d] words and save to %s" % (i+1, num_samples, sample_path))

sampled [100/1000] words and save to data/sample.txt
sampled [200/1000] words and save to data/sample.txt
sampled [300/1000] words and save to data/sample.txt
sampled [400/1000] words and save to data/sample.txt
sampled [500/1000] words and save to data/sample.txt
sampled [600/1000] words and save to data/sample.txt
sampled [700/1000] words and save to data/sample.txt
sampled [800/1000] words and save to data/sample.txt
sampled [900/1000] words and save to data/sample.txt
sampled [1000/1000] words and save to data/sample.txt


In [15]:
# Persist only the learned parameters (the state_dict), not the module object itself.
torch.save(language_model.state_dict(), 'language_model.pkl')