In [39]:
import time
from collections import namedtuple

import numpy as np
import tensorflow as tf

Parse the text into a integer dictionary

In [40]:
with open('anna.txt', 'r') as f:
    text = f.read()
vocab = set(text)

In [41]:
vocab_to_int = {c: i for i, c in enumerate(vocab)}

In [42]:
chars = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

In [43]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [44]:
chars[:100]

array([58, 82, 37, 13,  0, 48, 57, 10, 19, 44, 44, 44, 56, 37, 13, 13, 79,
       10, 45, 37, 24, 64, 63, 64, 48, 77, 10, 37, 57, 48, 10, 37, 63, 63,
       10, 37, 63, 64, 81, 48, 35, 10, 48, 67, 48, 57, 79, 10, 23,  1, 82,
       37, 13, 13, 79, 10, 45, 37, 24, 64, 63, 79, 10, 64, 77, 10, 23,  1,
       82, 37, 13, 13, 79, 10, 64,  1, 10, 64,  0, 77, 10, 68,  2,  1, 44,
        2, 37, 79, 70, 44, 44,  7, 67, 48, 57, 79,  0, 82, 64,  1])

Number of classes

In [45]:
np.max(chars)+1

83

In [46]:
len(vocab_to_int)

83

## Making training and validation batches

In [47]:
def split_data(chars, batch_size, num_steps, split_frac=0.9):
    """ 
    Split character data into training and validation sets, inputs and targets for each set.
    
    Arguments
    ---------
    chars: character array
    batch_size: Size of examples in each of batch
    num_steps: Number of sequence steps to keep in the input and pass to the network
    split_frac: Fraction of batches to keep in the training set
    
    
    Returns train_x, train_y, val_x, val_y
    """
    
    slice_size = batch_size * num_steps
    n_batches = int(len(chars)/ slice_size)
    
    # Drop the last few characters to make only full batches
    x = chars[: n_batches*slice_size]
    y = chars[1: n_batches*slice_size + 1]
    
    # Split the data into batch_size slices, then stack them into a 2D matrix 
    x = np.stack(np.split(x, batch_size))
    y = np.stack(np.split(y, batch_size))
    # Now x and y are arrays with dimensions batch_size x n_batches*num_steps
    # Split into training and validation sets, keep the first split_frac batches for training
    split_idx = int(n_batches*split_frac)
    train_x, train_y = x[:, :split_idx* num_steps], y[:, :split_idx * num_steps]
    val_x, val_y = x[:, split_idx*num_steps:], y[:, split_idx*num_steps:]
    
    return train_x, train_y, val_x, val_y

Now I'll make my data sets and we can check out what's going on here. Here I'm going to use a batch size of 10 and 50 sequence steps.

In [48]:
train_x, train_y, val_x, val_y = split_data(chars, 10, 50)


In [49]:
train_x.shape

(10, 178650)

In [50]:
len(chars)

1985223

In [51]:
train_x.shape[1]

178650

In [52]:
len(chars)/train_x.shape[1]

11.112359361880772

Looking at the size of this array, we see that we have rows equal to the batch size. When we want to get a batch out of here, we can grab a subset of this array that contains all the rows but has a width equal to the number of steps in the sequence. The first batch looks like this:

In [79]:
train_x[:,:5]

array([[58, 82, 37, 13,  0],
       [10, 37, 24, 10,  1],
       [67, 64,  1, 70, 44],
       [ 1, 10, 31, 23, 57],
       [10, 64,  0, 10, 64],
       [10, 12,  0, 10,  2],
       [82, 48,  1, 10, 28],
       [35, 10, 25, 23,  0],
       [ 0, 10, 64, 77,  1],
       [10, 77, 37, 64, 31]])

In [78]:
train_x.shape

(10, 178650)

In [80]:
def get_batch(arrs, num_steps):
    batch_size, slice_size = arrs[0].shape
    
    n_batches = int(slice_size/num_steps)
    for b in range(n_batches):
        yield [x[:, b*num_steps: (b+1)*num_steps] for x in arrs]