In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
with open("data.txt", 'r') as f:
    text = f.read()

In [3]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [4]:
#TOKENIZATION
# encode the text and map each character to an integer and vice versa
# 1. int2char, which maps integers to characters
# 2. char2int, which maps characters to unique integers
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}
# encode the text
encoded = np.array([char2int[ch] for ch in text])
encoded[:100]

array([32,  7, 35,  5,  0, 21, 54, 15, 11, 42, 42, 42, 31, 35,  5,  5, 59,
       15, 38, 35, 79, 65, 45, 65, 21,  6, 15, 35, 54, 21, 15, 35, 45, 45,
       15, 35, 45, 65,  1, 21, 62, 15, 21, 25, 21, 54, 59, 15, 70, 46,  7,
       35,  5,  5, 59, 15, 38, 35, 79, 65, 45, 59, 15, 65,  6, 15, 70, 46,
        7, 35,  5,  5, 59, 15, 65, 46, 15, 65,  0,  6, 15, 51, 50, 46, 42,
       50, 35, 59, 56, 42, 42, 27, 25, 21, 54, 59,  0,  7, 65, 46])

In [5]:

def one_hot_encode(arr, n_labels):
    # Initialize the the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    return one_hot


# check that the function works as expected
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


In [6]:

def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    
    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [7]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[32  7 35  5  0 21 54 15 11 42]
 [ 6 51 46 15  0  7 35  0 15 35]
 [21 46 13 15 51 54 15 35 15 38]
 [ 6 15  0  7 21 15 75  7 65 21]
 [15  6 35 50 15  7 21 54 15  0]
 [75 70  6  6 65 51 46 15 35 46]
 [15 58 46 46 35 15  7 35 13 15]
 [80 44 45 51 46  6  1 59 56 15]]

y
 [[ 7 35  5  0 21 54 15 11 42 42]
 [51 46 15  0  7 35  0 15 35  0]
 [46 13 15 51 54 15 35 15 38 51]
 [15  0  7 21 15 75  7 65 21 38]
 [ 6 35 50 15  7 21 54 15  0 21]
 [70  6  6 65 51 46 15 35 46 13]
 [58 46 46 35 15  7 35 13 15  6]
 [44 45 51 46  6  1 59 56 15 39]]


In [8]:
train_on_gpu = torch.cuda.is_available()


In [9]:
class CharRNN(nn.Module):
    def __init__(self, tokens, n_hidden=256, n_layers=2, drop_prob =0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        # LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, len(self.chars))
        
    def forward(self, x, hidden):
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)
        return out, hidden
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden   
        

In [10]:
# define and print the net
n_hidden=512
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [11]:
batch_size = 128
seq_length = 100
epochs = 5
lr = 0.001
val_frac = 0.1
clip=5
print_every=10

In [12]:
net.train()
    
opt = torch.optim.Adam(net.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# create training and validation data
val_idx = int(len(encoded)*(1-val_frac))
data, val_data = encoded[:val_idx], encoded[val_idx:]

if(train_on_gpu):
    net.cuda()

counter = 0
n_chars = len(net.chars)
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    for x, y in get_batches(data, batch_size, seq_length):
        counter += 1

        # One-hot encode our data and make them Torch tensors
        x = one_hot_encode(x, n_chars)
        inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

        if(train_on_gpu):
            inputs, targets = inputs.cuda(), targets.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output, targets.view(batch_size*seq_length).long())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        opt.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for x, y in get_batches(val_data, batch_size, seq_length):
                # One-hot encode our data and make them Torch tensors
                x = one_hot_encode(x, n_chars)
                x, y = torch.from_numpy(x), torch.from_numpy(y)

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs, targets = x, y
                if(train_on_gpu):
                    inputs, targets = inputs.cuda(), targets.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output, targets.view(batch_size*seq_length).long())

                val_losses.append(val_loss.item())

            net.train() # reset to train mode after iterationg through validation data

            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}".format(np.mean(val_losses)))

Epoch: 1/5... Step: 10... Loss: 3.2640... Val Loss: 3.1859
Epoch: 1/5... Step: 20... Loss: 3.1517... Val Loss: 3.1286
Epoch: 1/5... Step: 30... Loss: 3.1476... Val Loss: 3.1204
Epoch: 1/5... Step: 40... Loss: 3.1118... Val Loss: 3.1195
Epoch: 1/5... Step: 50... Loss: 3.1452... Val Loss: 3.1182
Epoch: 1/5... Step: 60... Loss: 3.1214... Val Loss: 3.1154
Epoch: 1/5... Step: 70... Loss: 3.1071... Val Loss: 3.1138
Epoch: 1/5... Step: 80... Loss: 3.1215... Val Loss: 3.1095
Epoch: 1/5... Step: 90... Loss: 3.1164... Val Loss: 3.0991
Epoch: 1/5... Step: 100... Loss: 3.0867... Val Loss: 3.0748
Epoch: 1/5... Step: 110... Loss: 3.0310... Val Loss: 3.0079
Epoch: 1/5... Step: 120... Loss: 2.9134... Val Loss: 2.9153
Epoch: 1/5... Step: 130... Loss: 2.8668... Val Loss: 2.8378
Epoch: 2/5... Step: 140... Loss: 2.8028... Val Loss: 2.7482
Epoch: 2/5... Step: 150... Loss: 2.6989... Val Loss: 2.6615
Epoch: 2/5... Step: 160... Loss: 2.6058... Val Loss: 2.5726
Epoch: 2/5... Step: 170... Loss: 2.5309... Val Lo

In [13]:
# change the name, for saving multiple files
model_name = 'rnn_20_epoch.net'

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

In [14]:
def predict(net, char, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        # tensor inputs
        x = np.array([[net.char2int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        # get top characters
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        # return the encoded value of the predicted char and the hidden state
        return net.int2char[char], h

In [15]:
def sample(net, size, prime='The', top_k=None):
        
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [16]:
print(sample(net, 1000, prime='Anna', top_k=5))

Annants, to teen her harse, at a mine to husbing her and at to she cell than the somathing he come, and all the counse and was anden to
say he had to her, to that that that the convicinos of her that to har to the pospos of the celssity of she was and, and hould not
word at the pliectess
and seed it would
so dingress...

At he soot, but his fact,
wish her.

She wanded his some all of his hore, and wele and the
form on
the parsing head of she was new had alang whone had
not her hard, and sacined, and his a mort that as the pees in the mish a contriagian of and sone was say, and had becoused
the betine on the pitter
on take this
stording to the prossess and she seethed if imparsed the crost time
the
peasing of the mesting. He this trought of what and so see that inte to be, when he had not sitting to him, as had
stellent the
peeter
the pained was a seadinc the bating,
and a linger his had had at his foor, that he would her a sill sont of the sumpech of that he cannow the
sontel the for o

In [17]:

# Here we have loaded in a model that trained over 20 epochs `rnn_20_epoch.net`
with open('rnn_20_epoch.net', 'rb') as f:
    checkpoint = torch.load(f)
    
loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [18]:
print(sample(loaded, 2000, top_k=5, prime="And Levin said"))

And Levin said.

A tears on the
minets of
the portally, and have his sumpersiag to the persicully with the prisce shall of
the chish the martion and which soud and the casinor with a coms in to be soul,
but
she had net that she sanding her
for his forting over. That
he
was not that it was till than his back of the since was no darsed, and allay houre had thing of
thome his amare to he censided that he has him her. She was stopped taking of she had and though the
saming at her take and a ment in him, and that the sance some on the
sind and all of her tally had
becoul and the mond of shile of the pronint to that the samors her and asked and the
cealed be one sact in the room. "In that all her, whine as her all of the same ther who care to say the court of this with of the roon, but the parsen of his confeating him
tell, and this sement and
center to the roals that the roon steally the mare of at one have his for she, and she seat the petition of the connice to him has hunders, he talking