In [5]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.cuda as cuda
import torch.optim as optim
import numpy as np

In [6]:
import os

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
    
    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]
    
    def __len__(self):
        return len(self.idx2word)
    
class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        # only english language characters
        self.whitelist = [chr(i) for i in range(32,127)]
        
        self.train = self.tokenize(os.path.join(path))
    
    def tokenize(self, path):
        '''Tokenize the text file'''
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding = 'utf8') as f:
            tokens = 0
            for line in f:
                line = ''.join([c for c in line if c in self.whitelist])
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)
        ids = torch.LongTensor(tokens)
        
        # Tokenize file content
        with open(path, 'r',  encoding="utf8") as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                line = ''.join([c for c in line if c in self.whitelist])
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1
        return ids

In [7]:
corpus = Corpus('C:\\Users\\user\\Python_Code\\HomeWorks\\sonnets.txt')

In [8]:
print(corpus.dictionary.idx2word[10])
print(corpus.dictionary.word2idx['That'])

<eos>
92


In [9]:
print(corpus.train.size())

torch.Size([23730])


In [10]:
id = corpus.train[112]
corpus.dictionary.idx2word[id]

'Produced'

In [11]:
vocab_size = len(corpus.dictionary)
print(vocab_size)

5664


### The RNN model(GRU cell)

In [75]:
import torch.nn as nn
from torch.autograd import Variable

class RNNModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.5):
        super(RNNModel, self).__init__()
        
        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.drop1 = nn.Dropout(dropout)
        self.drop2 = nn.Dropout(dropout)
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, vocab_size)
        
        self.init_weights()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)
    
    def forward(self, input, hidden):
        emb = self.drop1(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop2(output)

        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return Variable(weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
    

In [62]:
def batchify(data, batch_size):
    nbatch = data.size(0) // batch_size
    data = data.narrow(0, 0, nbatch*batch_size)

    data = data.view(batch_size, -1).t().contiguous()
    if cuda.is_available():
        data = data.cuda()
    return data

In [63]:
dummy_data = "And made my self a motley to the view"
dummy_data_idx = [corpus.dictionary.word2idx[w] for w in dummy_data.split()]
dummy_tensor = torch.LongTensor(dummy_data_idx) 
op = batchify(dummy_tensor, 2)
for row in op:
    print("%10s %10s" %  (corpus.dictionary.idx2word[row[0]], corpus.dictionary.idx2word[row[1]]))

       And          a
      made     motley
        my         to
      self        the


In [64]:
bs_train = 20 # batch_size for the training set
bs_valid = 20 # batch size for the validation set
bptt_size = 35 # number of times to unroll the graph for back prop through time
clip = 0.25 # gradient clipping to prevent gradient explosion
embed_size = 200 # size of embedding vector
hidden_size = 200 # size of the hidden state in the RNN
num_layers = 2 # number of RNN layers to use
dropout_pct = 0.5 # %age of neurond to dropout for regularization

In [65]:
train_data = batchify(corpus.train, bs_train)
val_data = train_data[900:]
train_data = train_data[:900]

In [66]:
train_data.shape

torch.Size([900, 20])

In [67]:
model = RNNModel(vocab_size, embed_size, hidden_size, num_layers, dropout_pct)
if cuda.is_available():
    model.cuda()

In [68]:
criterion = nn.CrossEntropyLoss()

In [69]:
def get_batch(source, i, evaluation = False):
    seq_len = min(bptt_size, len(source) - 1 - i)
    data = Variable(source[i:i+seq_len], volatile = evaluation)
    target = Variable(source[i+1: i+1+seq_len].view(-1))
    if cuda.is_available():
        data = data.cuda()
        target = target.cuda()
    return data, target

In [70]:
data, target = get_batch(train_data,0)
print(data.shape)
print(target.shape)

torch.Size([35, 20])
torch.Size([700])


### Training the model

In [71]:
def train(data_source, lr):
    # turn on training mode that enables dropout
    
    model.train()
    total_loss = 0
    hidden = model.init_hidden(bs_train)
    optimizer = optim.Adam(model.parameters(), lr = lr)
    
    for batch, i in enumerate(range(0, data_source.size(0) - 1, bptt_size)):
        
        data, targets = get_batch(data_source, i)
        
        # Starting each batch, we detach the hidden state from how it was previously produced
        # so that model doesen't ry to backprop to all the way start of the dataset
        # unrolling of the graph will go from the last iteration to the first iteration
        hidden = Variable(hidden.data)
        if cuda.is_available():
            hidden = hidden.cuda()
        optimizer.zero_grad()
        
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, vocab_size), targets)
        loss.backward()
        
        # clip_grad_norm to prevent gradient explosion
        torch.nn.utils.clip_grad_norm(model.parameters(), clip)
        
        optimizer.step()
        total_loss += len(data) * loss.data
        # return accumulated loss for all the iterations
        return total_loss / len(data_source)

In [72]:
def evaluate(data_source):
    # turn on evaluation to disable dropout
    model.eval()
    total_loss = 0
    hidden = model.init_hidden(bs_valid)
    
    for i in range(0, data_source.size(0) - 1, bptt_size):
        data, targets = get_batch(data_source, i, evaluation = True)
        
        if cuda.is_available():
            hidden = hidden.cuda()
        
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, vocab_size)
        
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = Variable(hidden.data)
        
    return total_loss/len(data_source)

### Run the Epochs

In [73]:
best_val_loss = None
def run(epochs, lr):
    global best_val_loss
    for epoch in range(0, epochs):
        train_loss = train(train_data, lr)
        val_loss = evaluate(val_data)
        print("Train Loss: ", train_loss, " Validation Loss: ", val_loss)

        if not best_val_loss or val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "./4.model.pth")

In [74]:
run(500, 0.001)

AttributeError: 'tuple' object has no attribute 'data'

### Text Generation

In [54]:
num_words = 200
temperature = 0.9

In [55]:
model = RNNModel(vocab_size, embed_size, hidden_size, num_layers, dropout_pct)
model.load_state_dict(torch.load("./4.model.pth"))

if cuda.is_available():
    model.cuda()

model.eval()

RNNModel(
  (encoder): Embedding(5664, 200)
  (drop1): Dropout(p=0.5)
  (drop2): Dropout(p=0.5)
  (rnn): GRU(200, 200, num_layers=2, dropout=0.5)
  (decoder): Linear(in_features=200, out_features=5664, bias=True)
)

In [59]:
hidden = model.init_hidden(1)
idx = corpus.dictionary.word2idx['I']
input = Variable(torch.LongTensor([[idx]]).long(), volatile = True)

if cuda.is_available():
    input.data = input.data.cuda()

print(corpus.dictionary.idx2word[idx],'',end='')

for i in range(num_words):
    output, hidden = model(input, hidden)
    word_weights = output.squeeze().data.div(temperature).exp().cpu()
    word_idx = torch.multinomial(word_weights, 1)[0]
    input.data.fill_(word_idx)
    word = corpus.dictionary.idx2word[word_idx]
    
    if word == '<eos>':
        print('')
    else:
        print(word + ' ',end='')

  This is separate from the ipykernel package so we can avoid doing imports until


I CXXI deaf how party breathes, condemned 
Myself Project 

whatsoever. tongues, divine, work 
sort, rage lie! forgotten. Presents 


Still 

LXIII dwell: in shake 
my 
spur, Give large spirits 
I in copies gone, do of mind, Till like 

haste 
be That Sometime of bonds guilty 

work In 
like import physicians quite, rearward Where proceeds. 
leisure in 
fairly Project Nothing, 

LIABILITY, wantonness; 
thy notice 
head, interest, 
thy When You crowned 
mistress his 

restrictions Which I like 


which 
and while on crying memory 

that thee 
turn 
the shown. The have woeful kindness choose belied, may 
loves be 

veins 
And 
ornament, masked 
do 


brand 
compound 
purpose, 

abhor, I maiden by no 
speak, is boughs Shakespeare runs tie or B. 
leisure copy Receiving I William a 
o'erworn; 

Unlook'd, belong and I me still 

shames waste haste 
brave 
comment; respose prizing proceed? temptation to, gbnewby@pglaf.org should hours you To 