In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
# open text file and read in data as `text`
with open('/Users/mayurjain/Desktop/anna.txt', 'r') as f:
    text = f.read()

In [16]:
# encode the text and map each character to an integer and vice versa

# we create two dictonaries:
# 1. int2char, which maps integers to characters
# 2. char2int, which maps characters to unique integers
words = tuple(set(text.split()))
int2words = dict(enumerate(words))
words2int = {ch: ii for ii, ch in int2words.items()}
encoded = np.array([words2int[ch] for ch in text.split()])

In [17]:
encoded

array([22291,    64, 12751, ..., 26993, 26372,  9320])

In [18]:
text.split()[:100]

['Chapter',
 '1',
 'Happy',
 'families',
 'are',
 'all',
 'alike;',
 'every',
 'unhappy',
 'family',
 'is',
 'unhappy',
 'in',
 'its',
 'own',
 'way.',
 'Everything',
 'was',
 'in',
 'confusion',
 'in',
 'the',
 "Oblonskys'",
 'house.',
 'The',
 'wife',
 'had',
 'discovered',
 'that',
 'the',
 'husband',
 'was',
 'carrying',
 'on',
 'an',
 'intrigue',
 'with',
 'a',
 'French',
 'girl,',
 'who',
 'had',
 'been',
 'a',
 'governess',
 'in',
 'their',
 'family,',
 'and',
 'she',
 'had',
 'announced',
 'to',
 'her',
 'husband',
 'that',
 'she',
 'could',
 'not',
 'go',
 'on',
 'living',
 'in',
 'the',
 'same',
 'house',
 'with',
 'him.',
 'This',
 'position',
 'of',
 'affairs',
 'had',
 'now',
 'lasted',
 'three',
 'days,',
 'and',
 'not',
 'only',
 'the',
 'husband',
 'and',
 'wife',
 'themselves,',
 'but',
 'all',
 'the',
 'members',
 'of',
 'their',
 'family',
 'and',
 'household,',
 'were',
 'painfully',
 'conscious',
 'of',
 'it.',
 'Every']

In [19]:
encoded[:100]

array([22291,    64, 12751,  2113, 27996, 24564,  8542, 14895, 15446,
        1329, 24200, 15446,   583, 27299, 23086,  7807, 24824,  4254,
         583, 23761,   583, 13503, 25506,  6062, 27506,   681,  6509,
        3933,  3823, 13503, 19517,  4254,  9291,  8216, 19411,  7897,
       29142,  5710, 10945,  1190, 19105,  6509, 20121,  5710,  9148,
         583,  2862, 21000,  9753, 24839,  6509,  5701,  8902,  4763,
       19517,  3823, 24839, 19151, 16340, 19983,  8216, 18492,   583,
       13503, 25311, 16274, 29142,  7845, 25338, 15417,  3304, 14065,
        6509,  4213, 26955,  3786, 25238,  9753, 16340,  5134, 13503,
       19517,  9753,   681, 20645, 28851, 24564, 13503, 16501,  3304,
        2862,  1329,  9753, 21430, 16307, 10526, 29218,  3304, 26837,
        3617])

In [21]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [22]:
def get_batches(arr, n_seqs, n_steps):
    '''Create a generator that returns batches of size
       n_seqs x n_steps from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       n_seqs: Batch size, the number of sequences per batch
       n_steps: Number of sequence steps per batch
    '''
    
    batch_size = n_seqs * n_steps
    n_batches = len(arr)//batch_size
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size]
    # Reshape into n_seqs rows
    arr = arr.reshape((n_seqs, -1))
    
    for n in range(0, arr.shape[1], n_steps):
        # The features
        x = arr[:, n:n+n_steps]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+n_steps]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [23]:
batches = get_batches(encoded, 10, 50)
x, y = next(batches)

In [24]:
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[22291    64 12751  2113 27996 24564  8542 14895 15446  1329]
 [27154   583 13503  9706  3304 22817 16978 26748  5436  2572]
 [ 6509  8168  6416 15588 26083 18586 15404  7755 13504  9385]
 [28597   583  5710  5442 15367 21086  6509 12400  8902 21436]
 [10662  6838 24715 13503  4988  2813 15859 12486 13503  4993]
 [11037  5710 11368  3304  5087 16892 27506 21187 21086  6509]
 [11188 26528  9753  8757  8902 13503 14557  3304 27893  9753]
 [13503  2891   302 23737 15203 21675 21058  3945 26993 23435]
 [13503  7894  1817  3304 19411  4988 18645  2852  1672  4254]
 [ 1190 19734   123 23061  2572  7056 10841 14938 27506 13485]]

y
 [[   64 12751  2113 27996 24564  8542 14895 15446  1329 24200]
 [  583 13503  9706  3304 22817 16978 26748  5436  2572  1946]
 [ 8168  6416 15588 26083 18586 15404  7755 13504  9385 23950]
 [  583  5710  5442 15367 21086  6509 12400  8902 21436 15815]
 [ 6838 24715 13503  4988  2813 15859 12486 13503  4993 29142]
 [ 5710 11368  3304  5087 16892 27506 21187 210

In [25]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_steps=100, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        ## TODO: define the LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## TODO: define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## TODO: define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))
        
        # initialize the weights
        self.init_weights()
      
    
    def forward(self, x, hc):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hc`. '''
        
        ## TODO: Get x, and the new hidden state (h, c) from the lstm
        x, (h, c) = self.lstm(x, hc)
        
        ## TODO: pass x through a droupout layer
        x = self.dropout(x)
        
        # Stack up LSTM outputs using view
        x = x.view(x.size()[0]*x.size()[1], self.n_hidden)
        
        ## TODO: put x through the fully-connected layer
        x = self.fc(x)
        
        # return x and the hidden state (h, c)
        return x, (h, c)
    
    
    def predict(self, char, h=None, cuda=False, top_k=None):
        ''' Given a character, predict the next character.
        
            Returns the predicted character and the hidden state.
        '''
        if cuda:
            self.cuda()
        else:
            self.cpu()
        
        if h is None:
            h = self.init_hidden(1)
        
        x = np.array([[self.char2int[char]]])
        x = one_hot_encode(x, len(self.chars))
        inputs = torch.from_numpy(x)
        if cuda:
            inputs = inputs.cuda()
        
        h = tuple([each.data for each in h])
        out, h = self.forward(inputs, h)

        p = F.softmax(out, dim=1).data
        if cuda:
            p = p.cpu()
        
        if top_k is None:
            top_ch = np.arange(len(self.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
            
        return self.int2char[char], h
    
    def init_weights(self):
        ''' Initialize weights for fully connected layer '''
        initrange = 0.1
        
        # Set bias tensor to all zeros
        self.fc.bias.data.fill_(0)
        # FC weights as random uniform
        self.fc.weight.data.uniform_(-1, 1)
        
    def init_hidden(self, n_seqs):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x n_seqs x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        return (weight.new(self.n_layers, n_seqs, self.n_hidden).zero_(),
                weight.new(self.n_layers, n_seqs, self.n_hidden).zero_())

In [26]:
def train(net, data, epochs=10, n_seqs=10, n_steps=50, lr=0.001, clip=5, val_frac=0.1, cuda=False, print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        n_seqs: Number of mini-sequences per mini-batch, aka batch size
        n_steps: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        cuda: Train with CUDA on a GPU
        print_every: Number of steps for printing training and validation loss
    
    '''
    
    net.train()
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if cuda:
        net.cuda()
    
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        h = net.init_hidden(n_seqs)
        for x, y in get_batches(data, n_seqs, n_steps):
            counter += 1
            
            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if cuda:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            net.zero_grad()
            
            output, h = net.forward(inputs, h)
            loss = criterion(output, targets.view(n_seqs*n_steps))

            loss.backward()
            
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            opt.step()
            
            if counter % print_every == 0:
                
                # Get validation loss
                val_h = net.init_hidden(n_seqs)
                val_losses = []
                for x, y in get_batches(val_data, n_seqs, n_steps):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    if cuda:
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net.forward(inputs, val_h)
                    val_loss = criterion(output, targets.view(n_seqs*n_steps))
                
                    val_losses.append(val_loss.item())
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [27]:
if 'net' in locals():
    del net

In [28]:
# define and print the net
net = CharRNN(words, n_hidden=512, n_layers=2)
print(net)

CharRNN(
  (lstm): LSTM(29230, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=29230, bias=True)
)


In [33]:
n_seqs, n_steps = 128, 100

# you may change cuda to True if you plan on using a GPU!
# also, if you do, please INCREASE the epochs to 25
train(net, encoded, epochs=10, n_seqs=n_seqs, n_steps=n_steps, lr=0.001, cuda=False, print_every=10)

Epoch: 1/10... Step: 10... Loss: 8.6368... Val Loss: 8.6551
Epoch: 1/10... Step: 20... Loss: 8.3034... Val Loss: 8.2754
Epoch: 2/10... Step: 30... Loss: 7.9721... Val Loss: 8.0434
Epoch: 2/10... Step: 40... Loss: 7.8390... Val Loss: 7.8640
Epoch: 3/10... Step: 50... Loss: 7.6038... Val Loss: 7.7103
Epoch: 3/10... Step: 60... Loss: 7.4621... Val Loss: 7.5508
Epoch: 3/10... Step: 70... Loss: 7.3454... Val Loss: 7.4271
Epoch: 4/10... Step: 80... Loss: 7.1729... Val Loss: 7.3142
Epoch: 4/10... Step: 90... Loss: 7.0925... Val Loss: 7.2414
Epoch: 5/10... Step: 100... Loss: 6.9424... Val Loss: 7.1809
Epoch: 5/10... Step: 110... Loss: 6.7779... Val Loss: 7.1187
Epoch: 5/10... Step: 120... Loss: 6.7744... Val Loss: 7.0846
Epoch: 6/10... Step: 130... Loss: 6.5740... Val Loss: 7.1443
Epoch: 6/10... Step: 140... Loss: 6.5954... Val Loss: 7.0446
Epoch: 7/10... Step: 150... Loss: 6.4174... Val Loss: 7.0076
Epoch: 7/10... Step: 160... Loss: 6.4295... Val Loss: 7.0014
Epoch: 8/10... Step: 170... Loss:

In [34]:
# change the name, for saving multiple files
model_name = 'rnn_1_epoch.net'

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

In [37]:
def sample(net, size, prime='The', top_k=None, cuda=False):
        
    if cuda:
        net.cuda()
    else:
        net.cpu()

    net.eval()
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = net.predict(ch, h, cuda=cuda, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = net.predict(chars[-1], h, cuda=cuda, top_k=top_k)
        chars.append(char)

    return ' '.join(chars)

In [38]:
print(sample(net, 200, prime='Anna', top_k=5, cuda=False))

A n n a that he had come to him. She was at his own face of his head and at the same time as to his face was a of the same time as though the most of the most old man and his face of the same time as though he had been his own And he was not in the first but he was a long in his own But the most of which he had been on the other same as though they were were not a man in a same The time he had been in the same same as though his were were a man of the same same time with his eyes of the same were his eyes and a old new He had not been his own The first time for his head but was not to his head and to be at his own own The position in the first for his face of his own own and his own feeling of his own own his eyes and his face was a long of his old and was at the same time of the same He was his own face that had not been for his
