In [1]:
import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F

### Loading Data

In [2]:
# Read the whole corpus in binary mode: `text` is a bytes object, so iterating
# over it (as the tokenization cells do) yields integer byte values rather than
# one-character strings.
with open("../Data/anna_karenina.txt", "rb") as obj:
    text = obj.read()

In [3]:
# Preview the first 100 bytes of the corpus to confirm the file was read.
text[:100]

b'Chapter 1\r\n\r\n\r\nHappy families are all alike; every unhappy family is unhappy in its own\r\nway.\r\n\r\nEve'

### Tokenization
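The vocabulary is stored in two lookup dictionaries:
1. `int2char` -> maps each integer index to its character (a byte value here, because the file is read in binary mode)
2. `char2int` -> maps each character to its unique integer index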

In [14]:
# Vocabulary of distinct symbols found in the corpus.
# Sorting pins the symbol -> index assignment so it is identical on every run;
# the iteration order of a bare set is an implementation detail and must not be
# what defines the encoding used by the model.
chars = tuple(sorted(set(text)))
print(chars)

(10, 13, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122)


In [15]:
# Index -> symbol lookup: the symbol at position i of `chars` gets key i.
int2char = {index: symbol for index, symbol in enumerate(chars)}
print(int2char)

{0: 10, 1: 13, 2: 32, 3: 33, 4: 34, 5: 36, 6: 37, 7: 38, 8: 39, 9: 40, 10: 41, 11: 42, 12: 44, 13: 45, 14: 46, 15: 47, 16: 48, 17: 49, 18: 50, 19: 51, 20: 52, 21: 53, 22: 54, 23: 55, 24: 56, 25: 57, 26: 58, 27: 59, 28: 63, 29: 64, 30: 65, 31: 66, 32: 67, 33: 68, 34: 69, 35: 70, 36: 71, 37: 72, 38: 73, 39: 74, 40: 75, 41: 76, 42: 77, 43: 78, 44: 79, 45: 80, 46: 81, 47: 82, 48: 83, 49: 84, 50: 85, 51: 86, 52: 87, 53: 88, 54: 89, 55: 90, 56: 95, 57: 96, 58: 97, 59: 98, 60: 99, 61: 100, 62: 101, 63: 102, 64: 103, 65: 104, 66: 105, 67: 106, 68: 107, 69: 108, 70: 109, 71: 110, 72: 111, 73: 112, 74: 113, 75: 114, 76: 115, 77: 116, 78: 117, 79: 118, 80: 119, 81: 120, 82: 121, 83: 122}


In [20]:
# Symbol -> index lookup, obtained by inverting `int2char`.
char2int = {symbol: index for index, symbol in int2char.items()}
print(char2int)

{10: 0, 13: 1, 32: 2, 33: 3, 34: 4, 36: 5, 37: 6, 38: 7, 39: 8, 40: 9, 41: 10, 42: 11, 44: 12, 45: 13, 46: 14, 47: 15, 48: 16, 49: 17, 50: 18, 51: 19, 52: 20, 53: 21, 54: 22, 55: 23, 56: 24, 57: 25, 58: 26, 59: 27, 63: 28, 64: 29, 65: 30, 66: 31, 67: 32, 68: 33, 69: 34, 70: 35, 71: 36, 72: 37, 73: 38, 74: 39, 75: 40, 76: 41, 77: 42, 78: 43, 79: 44, 80: 45, 81: 46, 82: 47, 83: 48, 84: 49, 85: 50, 86: 51, 87: 52, 88: 53, 89: 54, 90: 55, 95: 56, 96: 57, 97: 58, 98: 59, 99: 60, 100: 61, 101: 62, 102: 63, 103: 64, 104: 65, 105: 66, 106: 67, 107: 68, 108: 69, 109: 70, 110: 71, 111: 72, 112: 73, 113: 74, 114: 75, 115: 76, 116: 77, 117: 78, 118: 79, 119: 80, 120: 81, 121: 82, 122: 83}


In [21]:
# Translate every symbol of the corpus into its integer index.
encode_text = np.array([char2int[symbol] for symbol in text])
encode_text[:100]

array([32, 65, 58, 73, 77, 62, 75,  2, 17,  1,  0,  1,  0,  1,  0, 37, 58,
       73, 73, 82,  2, 63, 58, 70, 66, 69, 66, 62, 76,  2, 58, 75, 62,  2,
       58, 69, 69,  2, 58, 69, 66, 68, 62, 27,  2, 62, 79, 62, 75, 82,  2,
       78, 71, 65, 58, 73, 73, 82,  2, 63, 58, 70, 66, 69, 82,  2, 66, 76,
        2, 78, 71, 65, 58, 73, 73, 82,  2, 66, 71,  2, 66, 77, 76,  2, 72,
       80, 71,  1,  0, 80, 58, 82, 14,  1,  0,  1,  0, 34, 79, 62])

### Data Pre-Processing

In [22]:
def one_hot_encode(arr, n_labels):
    """One-hot encode an array of integer labels.

    Args:
        arr: NumPy array of integer class indices, of any shape.
        n_labels: Number of classes; this becomes the size of the new
            trailing axis.

    Returns:
        A float32 array of shape ``arr.shape + (n_labels,)`` that is 1 at the
        position given by each label and 0 everywhere else.
    """
    labels = arr.flatten()
    # Work on a 2-D table with one row per label, then restore the input shape.
    encoded = np.zeros((arr.size, n_labels), dtype=np.float32)
    row_ids = np.arange(encoded.shape[0])
    encoded[row_ids, labels] = 1.
    return encoded.reshape(*arr.shape, n_labels)

In [23]:
# test_case

test_seq = np.array([1, 2, 3, 4, 5])
one_hot = one_hot_encode(test_seq, 10)
print(one_hot)

# Verify the encoding instead of relying on reading the printout:
# one row per label, exactly one hot entry per row, located at the label index.
assert one_hot.shape == (test_seq.size, 10)
assert (one_hot.sum(axis=1) == 1).all()
assert (one_hot.argmax(axis=1) == test_seq).all()

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]


In [28]:
def get_batches(arr, batch_size, seq_length):
    """Generate (input, target) mini-batches from a 1-D encoded sequence.

    The sequence is truncated to a whole number of batches and reshaped into
    ``batch_size`` rows. Each yielded pair covers ``seq_length`` consecutive
    columns of that layout.

    Args:
        arr: 1-D NumPy array of encoded symbols.
        batch_size: Number of rows (parallel sequences) in every batch.
        seq_length: Number of time steps in every batch.

    Yields:
        ``(x, y)`` where ``x`` is the current window of the reshaped data and
        ``y`` is ``x`` shifted one step to the left. The last column of ``y``
        is the column following the window, or column 0 of the reshaped data
        when the window is the final one.
    """
    chars_per_batch = batch_size * seq_length
    n_batches = len(arr) // chars_per_batch

    # Drop the tail that cannot fill a complete batch, then lay the data out
    # as batch_size rows.
    rows = arr[:n_batches * chars_per_batch].reshape(batch_size, -1)
    row_len = rows.shape[1]

    for start in range(0, row_len, seq_length):
        stop = start + seq_length
        x = rows[:, start:stop]
        y = np.zeros_like(x)
        # Column that follows the window; the final window wraps to column 0.
        following = rows[:, stop] if stop < row_len else rows[:, 0]
        y[:, :-1] = x[:, 1:]
        y[:, -1] = following
        yield x, y

In [33]:
# test_case

batches = get_batches(encode_text, 8, 50)
x, y = next(batches)

print("x = \n", x[:10, :10], "\n")
print("y = \n", y[:10, :10])

# Verify the batch instead of relying on reading the printout:
# both arrays are (batch_size, seq_length) and the targets are the inputs
# shifted one step to the left.
assert x.shape == y.shape == (8, 50)
assert (y[:, :-1] == x[:, 1:]).all()

x = 
 [[32 65 58 73 77 62 75  2 17  1]
 [ 0 77 65 72 78 64 65 77 76  2]
 [78 71 64  2 70 58 71 28  4  1]
 [71 61  2 77 72  2 75 62 77 66]
 [75 12  2 65 62  2 65 58 61  2]
 [72  2 61 66 76 60 78 76 76 66]
 [ 2 33 72 69 69 82 12  2 61 58]
 [66 71  2 77 65 62  2 60 72 71]] 

y = 
 [[65 58 73 77 62 75  2 17  1  0]
 [77 65 72 78 64 65 77 76  2 72]
 [71 64  2 70 58 71 28  4  1  0]
 [61  2 77 72  2 75 62 77 66 75]
 [12  2 65 62  2 65 58 61  2 71]
 [ 2 61 66 76 60 78 76 76 66 72]
 [33 72 69 69 82 12  2 61 58 75]
 [71  2 77 65 62  2 60 72 71 60]]


### Training the Model

In [35]:
# Detect CUDA once; later cells read this flag to decide where tensors go.
use_gpu = torch.cuda.is_available()
print("Training on GPU." if use_gpu else "Using CPU.")

Training on GPU.


In [40]:
class CharRNN(nn.Module):
    """Character-level LSTM language model.

    The network is a (possibly multi-layer) LSTM over one-hot encoded inputs,
    followed by dropout and a linear layer that produces one score per
    vocabulary entry for every time step.

    Args:
        tokens: Sequence of vocabulary symbols. Its length fixes both the
            input width of the LSTM and the output width of the final layer.
        n_hidden: Number of hidden units in each LSTM layer.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability, used both inside the LSTM and on its
            output.
        lr: Learning rate. It is only stored on the instance; the class itself
            does not use it.
    """

    def __init__(self, tokens, n_hidden=256, n_layers=2, drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: count for count, ch in self.int2char.items()}
        # define LSTM (batch_first=True: inputs are (batch, seq, features))
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        # define dropout layer
        self.dropout = nn.Dropout(drop_prob)
        # define final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))

    def forward(self, x, hidden):
        """Run one batch through the network.

        Args:
            x: Input tensor passed to the batch-first LSTM.
            hidden: ``(h, c)`` state tuple for the LSTM.

        Returns:
            ``(out, hidden)`` where ``out`` has one row per (sequence, step)
            pair and ``len(self.chars)`` columns, and ``hidden`` is the
            updated LSTM state.
        """
        r_out, hidden = self.lstm(x, hidden)
        out = self.dropout(r_out)
        # stack up LSTM outputs so the linear layer sees one row per time step
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h, c)`` state for a batch of the given size.

        Both tensors have shape ``(n_layers, batch_size, n_hidden)`` and are
        allocated with the dtype and device of the model's own parameters, so
        the state always lives where the model lives regardless of any global
        GPU flag.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [41]:
def _validation_losses(net, val_data, batch_size, seq_length, criterion):
    """Return the list of per-batch losses of `net` over `val_data`.

    The network is put in eval mode and evaluated without gradient tracking;
    train mode is restored before returning, even if evaluation is interrupted.
    """
    n_chars = len(net.chars)
    losses = []
    net.eval()
    try:
        # No graph is needed for evaluation, which saves memory and time.
        with torch.no_grad():
            val_hidden = net.init_hidden(batch_size)
            for x, y in get_batches(val_data, batch_size, seq_length):
                inputs = torch.from_numpy(one_hot_encode(x, n_chars))
                targets = torch.from_numpy(y)
                if use_gpu:
                    inputs, targets = inputs.cuda(), targets.cuda()

                output, val_hidden = net(inputs, val_hidden)
                val_loss = criterion(output, targets.view(batch_size * seq_length).long())
                losses.append(val_loss.item())
    finally:
        net.train()
    return losses


def train_network(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    """Train `net` on an encoded sequence, printing loss statistics as it goes.

    Args:
        net: Model exposing ``chars``, ``init_hidden(batch_size)`` and a
            forward pass ``net(inputs, hidden) -> (output, hidden)``.
        data: 1-D array of encoded symbols; the last ``val_frac`` share is
            held out for validation.
        epochs: Number of passes over the training split.
        batch_size: Number of sequences per mini-batch.
        seq_length: Number of time steps per mini-batch.
        lr: Learning rate for the Adam optimizer.
        clip: Maximum gradient norm used for gradient clipping.
        val_frac: Fraction of `data` reserved for validation.
        print_every: Number of optimization steps between reports.

    Returns:
        None. The model is updated in place and statistics are printed.
    """
    # train on gpu if available; move the model first so the optimizer is
    # built over the parameters that are actually trained
    if use_gpu:
        net.cuda()

    net.train()
    optimizer = optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # create training and validation data
    val_idx = int(len(data) * (1 - val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        hidden_state = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1

            inputs = torch.from_numpy(one_hot_encode(x, n_chars))
            targets = torch.from_numpy(y)
            if use_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Carry the state values into the next batch but cut the graph,
            # so backpropagation does not reach into earlier batches.
            hidden_state = tuple(each.detach() for each in hidden_state)

            optimizer.zero_grad()

            output, hidden_state = net(inputs, hidden_state)

            # calculate loss and perform backpropagation
            loss = criterion(output, targets.view(batch_size * seq_length).long())
            loss.backward()
            # using clip_grad_norm to avoid exploding gradient problem
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            optimizer.step()

            # stats
            if counter % print_every == 0:
                val_losses = _validation_losses(net, val_data, batch_size, seq_length, criterion)
                # An empty validation split (too short for one full batch)
                # is reported as nan rather than triggering a NumPy warning.
                mean_val_loss = np.mean(val_losses) if val_losses else float("nan")

                print("{}\n".format("-" * 50),
                      "Step : {}\n".format(counter),
                      "Loss : {}\n".format(loss.item()),
                      "Validation Loss : {}\n".format(mean_val_loss))

In [42]:
# Model size: units per LSTM layer and number of stacked LSTM layers.
n_hidden = 512
n_layers = 2

# `chars` supplies the vocabulary, which sets the input and output width.
net = CharRNN(chars, n_hidden, n_layers)
# Bare expression so the notebook displays the module summary.
net

CharRNN(
  (lstm): LSTM(84, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=84, bias=True)
)

In [44]:
# Training hyperparameters: sequences per batch, time steps per batch, and
# number of passes over the training split.
batch_size = 128
seq_length = 100
n_epochs = 20

# Train the model, printing training and validation loss every 100 steps.
train_network(net, encode_text, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=100)

--------------------------------------------------
 Step : 100
 Loss : 1.5830848217010498
 Validation Loss : 1.5134381214777628

--------------------------------------------------
 Step : 200
 Loss : 1.5317405462265015
 Validation Loss : 1.47554456392924

--------------------------------------------------
 Step : 300
 Loss : 1.4878602027893066
 Validation Loss : 1.440233047803243

--------------------------------------------------
 Step : 400
 Loss : 1.454048752784729
 Validation Loss : 1.4058611313501994

--------------------------------------------------
 Step : 500
 Loss : 1.4014155864715576
 Validation Loss : 1.3825571060180664

--------------------------------------------------
 Step : 600
 Loss : 1.3742811679840088
 Validation Loss : 1.3528647502263387

--------------------------------------------------
 Step : 700
 Loss : 1.3665306568145752
 Validation Loss : 1.3328427950541177

--------------------------------------------------
 Step : 800
 Loss : 1.3282127380371094
 Validation

KeyboardInterrupt: 