
# RNN for Text Generation


## Imports

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Get Text Data

In [2]:
with open('../Data Science Projects/shakespeare.txt','r',encoding='utf8') as f:
    text = f.read()

In [3]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

In [4]:
len(text)

5445609

## Encode Text

In [5]:
# get all used characters
all_characters = set(text)

In [6]:
# decode num to characters 
decoder = dict(enumerate(all_characters))

In [7]:
# encode chars to num
encoder = {char: ind for ind,char in decoder.items()}

In [8]:
# encode all text
encoded_text = np.array([encoder[char] for char in text])

In [9]:
#encoded_text = torch.tensor([encoder[char] for char in text])

In [10]:
# display text when encoded
encoded_text[:500]

array([24, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
       29, 29, 29, 29, 29, 16, 24, 29, 29, 53,  9, 79, 45, 29, 63, 75, 56,
        9, 38, 82, 55, 29, 15,  9, 38, 75, 55, 52,  9, 38, 82, 29, 10, 38,
       29, 20, 38, 82, 56,  9, 38, 29, 56, 22, 15,  9, 38, 75, 82, 38, 35,
       24, 29, 29, 25, 34, 75, 55, 29, 55, 34, 38,  9, 38, 36, 37, 29, 36,
       38, 75, 52, 55, 37, 23, 82, 29,  9, 79, 82, 38, 29, 45, 56, 78, 34,
       55, 29, 22, 38, 77, 38,  9, 29, 20, 56, 38, 35, 24, 29, 29, 39, 52,
       55, 29, 75, 82, 29, 55, 34, 38, 29,  9, 56,  5, 38,  9, 29, 82, 34,
       79, 52,  0, 20, 29, 36, 37, 29, 55, 56, 45, 38, 29, 20, 38, 15, 38,
       75, 82, 38, 35, 24, 29, 29,  4, 56, 82, 29, 55, 38, 22, 20, 38,  9,
       29, 34, 38, 56,  9, 29, 45, 56, 78, 34, 55, 29, 36, 38, 75,  9, 29,
       34, 56, 82, 29, 45, 38, 45, 79,  9, 37, 42, 24, 29, 29, 39, 52, 55,
       29, 55, 34, 79, 52, 29, 15, 79, 22, 55,  9, 75, 15, 55, 38, 20, 29,
       55, 79, 29, 55, 34

## One Hot Encoding

We need to one-hot encode the data inorder for it to work with the network structure. 

In [11]:
def one_hot_encoder(encoded_text, num_uni_chars):
    '''
    encoded_text : batch of encoded text
    
    num_uni_chars = number of unique characters (len(set(text)))
    '''
    
    # METHOD FROM:
    # https://stackoverflow.com/questions/29831489/convert-encoded_textay-of-indices-to-1-hot-encoded-numpy-encoded_textay
      
    # Create a placeholder for zeros.
    one_hot = np.zeros((encoded_text.size, num_uni_chars))
    
    # Convert data type for later use with pytorch (errors if we dont!)
    one_hot = one_hot.astype(np.float32)

    # Using fancy indexing fill in the 1s at the correct index locations
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.0
    

    # Reshape it so it matches the batch sahe
    one_hot = one_hot.reshape((*encoded_text.shape, num_uni_chars))
    
    return one_hot

In [12]:
one_hot_encoder(np.array([1,2,0]),3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

--------------
---------------
# Creating Training Batches

We need to create a function that will generate batches of characters along with the next character in the sequence as a label.

-----------------
------------

In [13]:
example_text = np.arange(10)

In [14]:
example_text

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [15]:
# If we wanted 5 batches
example_text.reshape((5,-1))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [16]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):
    
    '''
    Generate (using yield) batches for training.
    
    X: Encoded Text of length seq_len
    Y: Encoded Text shifted by one
    
    Example:
    
    X:
    
    [[1 2 3]]
    
    Y:
    
    [[ 2 3 4]]
    
    encoded_text : Complete Encoded Text to make batches from
    batch_size : Number of samples per batch
    seq_len : Length of character sequence
       
    '''
    
    # Total number of characters per batch
    # Example: If samp_per_batch is 2 and seq_len is 50, then 100
    # characters come out per batch.
    char_per_batch = samp_per_batch * seq_len
    
    
    # Number of batches available to make
    # Use int() to roun to nearest integer
    num_batches_avail = int(len(encoded_text)/char_per_batch)
    
    # Cut off end of encoded_text that
    # won't fit evenly into a batch
    encoded_text = encoded_text[:num_batches_avail * char_per_batch]
    
    
    # Reshape text into rows the size of a batch
    encoded_text = encoded_text.reshape((samp_per_batch, -1))
    

    # Go through each row in array.
    for n in range(0, encoded_text.shape[1], seq_len):
        
        # Grab feature characters
        x = encoded_text[:, n:n+seq_len]
        
        # y is the target shifted over by 1
        y = np.zeros_like(x)
       
        #
        try:
            y[:, :-1] = x[:, 1:]
            y[:, -1]  = encoded_text[:, n+seq_len]
            
        # FOR POTENTIAL INDEXING ERROR AT THE END    
        except:
            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, 0]
            
        yield x, y

### Example of generating a batch

In [17]:
sample_text = encoded_text[:20]

In [18]:
sample_text

array([24, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
       29, 29, 29])

In [19]:
batch_generator = generate_batches(sample_text,samp_per_batch=2,seq_len=5)

In [20]:
# Grab first batch
x, y = next(batch_generator)

In [21]:
x

array([[24, 29, 29, 29, 29],
       [29, 29, 29, 29, 29]])

In [22]:
y

array([[29, 29, 29, 29, 29],
       [29, 29, 29, 29, 29]])

--------

## GPU Check

Remember this will take a lot longer on CPU!

In [23]:
torch.cuda.is_available()

True

# Creating the LSTM Model

**Note! We will have options for GPU users and CPU users. CPU will take MUCH LONGER to train and you may encounter RAM issues depending on your hardware. If that is the case, consider using cloud services like AWS, GCP, or Azure. Note, these may cost you money to use!**

In [24]:
class CharModel(nn.Module):
    
    def __init__(self, all_chars, num_hidden=256, num_layers=4,drop_prob=0.5,use_gpu=False):
        
        
        # SET UP ATTRIBUTES
        super().__init__()
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.use_gpu = use_gpu
        
        #CHARACTER SET, ENCODER, and DECODER
        self.all_chars = all_chars
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char: ind for ind,char in decoder.items()}
        
        
        self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(drop_prob)
        
        self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))
      
    
    def forward(self, x, hidden):
                  
        
        lstm_output, hidden = self.lstm(x, hidden)
        
        
        drop_output = self.dropout(lstm_output)
        
        drop_output = drop_output.contiguous().view(-1, self.num_hidden)
        
        
        final_out = self.fc_linear(drop_output)
        
        
        return final_out, hidden
    
    
    def hidden_state(self, batch_size):
        '''
        Used as separate method to account for both GPU and CPU users.
        '''
        
        if self.use_gpu:
            
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda(),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda())
        else:
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden))
        
        return hidden
        

## Instance of the Model

In [25]:
model = CharModel(
    all_chars=all_characters,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=True,
)

In [26]:
total_param  = []
for p in model.parameters():
    total_param.append(int(p.numel()))

Try to make the total_parameters be roughly the same magnitude as the number of characters in the text.

In [27]:
sum(total_param)

5470292

In [28]:
len(encoded_text)

5445609

### Optimizer and Loss

In [29]:
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

## Training Data and Validation Data

In [30]:
# percentage of data to be used for training
train_percent = 0.1

In [31]:
len(encoded_text)

5445609

In [32]:
int(len(encoded_text) * (train_percent))

544560

In [33]:
train_ind = int(len(encoded_text) * (train_percent))

In [34]:
train_data = encoded_text[:train_ind]
val_data = encoded_text[train_ind:]

# Training the Network

## Variables

Feel free to play around with these values!

In [35]:
## VARIABLES

# Epochs to train for
epochs = 50
# batch size 
batch_size = 128

# Length of sequence
seq_len = 100

# for printing report purposes
# always start at 0
tracker = 0

# number of characters in text
num_char = max(encoded_text)+1
num_char 

84

------

In [36]:
# Set model to train
model.train()


# Check to see if using GPU
if model.use_gpu:
    model.cuda()

for i in range(epochs):
    
    hidden = model.hidden_state(batch_size)
    
    
    for x,y in generate_batches(train_data,batch_size,seq_len):
        
        tracker += 1
        
        # One Hot Encode incoming data
        x = one_hot_encoder(x,num_char)
        
        # Convert Numpy Arrays to Tensor      
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)
        
        # Adjust for GPU if necessary     
        if model.use_gpu:
            
            inputs = inputs.cuda()
            targets = targets.cuda()
            
        # Reset Hidden State
        # If we dont' reset we would backpropagate through all training history
        hidden = tuple([state.data for state in hidden])
        
        model.zero_grad()
        
        lstm_output, hidden = model.forward(inputs,hidden)
        loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
        loss.backward()
        
        # POSSIBLE EXPLODING GRADIENT PROBLEM!
        # LET"S CLIP JUST IN CASE
        nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)
        
        optimizer.step()
        
        
        
        ###################################
        ### CHECK ON VALIDATION SET ######
        #################################
        
        if tracker % 25 == 0:
            
            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()
            
            for x,y in generate_batches(val_data,batch_size,seq_len):
                
                # One Hot Encode incoming data
                x = one_hot_encoder(x,num_char)
                
                # Convert Numpy Arrays to Tensor
                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)

                # Adjust for GPU if necessary

                if model.use_gpu:

                    inputs = inputs.cuda()
                    targets = targets.cuda()
                    
                # Reset Hidden State
                # If we dont' reset we would backpropagate through 
                # all training history
                val_hidden = tuple([state.data for state in val_hidden])
                
                lstm_output, val_hidden = model.forward(inputs,val_hidden)
                val_loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
                val_losses.append(val_loss.item())
            
            # Reset to training model after val for loop
            model.train()
            
            print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")

Epoch: 0 Step: 25 Val Loss: 3.2387239933013916
Epoch: 1 Step: 50 Val Loss: 3.2339115142822266
Epoch: 1 Step: 75 Val Loss: 3.2166528701782227
Epoch: 2 Step: 100 Val Loss: 3.0681467056274414
Epoch: 2 Step: 125 Val Loss: 2.968043327331543
Epoch: 3 Step: 150 Val Loss: 2.803651809692383
Epoch: 4 Step: 175 Val Loss: 2.698382616043091
Epoch: 4 Step: 200 Val Loss: 2.5822930335998535
Epoch: 5 Step: 225 Val Loss: 2.4610958099365234
Epoch: 5 Step: 250 Val Loss: 2.3478972911834717
Epoch: 6 Step: 275 Val Loss: 2.2662200927734375
Epoch: 7 Step: 300 Val Loss: 2.2091410160064697
Epoch: 7 Step: 325 Val Loss: 2.158705472946167
Epoch: 8 Step: 350 Val Loss: 2.118455410003662
Epoch: 8 Step: 375 Val Loss: 2.0884435176849365
Epoch: 9 Step: 400 Val Loss: 2.057347536087036
Epoch: 10 Step: 425 Val Loss: 2.0339696407318115
Epoch: 10 Step: 450 Val Loss: 2.0089664459228516
Epoch: 11 Step: 475 Val Loss: 1.9839364290237427
Epoch: 11 Step: 500 Val Loss: 1.9574096202850342
Epoch: 12 Step: 525 Val Loss: 1.9453810453414

-------

## Saving the Model

In [44]:
model_name = 'example.net'

In [45]:
torch.save(model.state_dict(),model_name)

## Load Model

In [46]:
# MUST MATCH THE EXACT SAME SETTINGS AS MODEL USED DURING TRAINING!
model = CharModel(
    all_chars=all_characters,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=True,
)

In [47]:
model.load_state_dict(torch.load(model_name))
model.eval()

CharModel(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc_linear): Linear(in_features=512, out_features=84, bias=True)
)

# Generating Predictions

--------

In [48]:
def predict_next_char(model, char, hidden=None, k=1):
        
        # Encode raw letters with model
        encoded_text = model.encoder[char]
        
        # set as numpy array for one hot encoding
        # NOTE THE [[ ]] dimensions!!
        encoded_text = np.array([[encoded_text]])
        
        # One hot encoding
        encoded_text = one_hot_encoder(encoded_text, len(model.all_chars))
        
        # Convert to Tensor
        inputs = torch.from_numpy(encoded_text)
        
        # Check for CPU
        if(model.use_gpu):
            inputs = inputs.cuda()
        
        
        # Grab hidden states
        hidden = tuple([state.data for state in hidden])
        
        
        # Run model and get predicted output
        lstm_out, hidden = model(inputs, hidden)
      
        # Convert lstm_out to probabilities
        probs = F.softmax(lstm_out, dim=1).data
              
        if(model.use_gpu):
            # move back to CPU to use with numpy
            probs = probs.cpu()
        
              
        # Return k largest probabilities in tensor. k determines how many characters to consider
        # for our probability choice.
        probs, index_positions = probs.topk(k)
        
        
        index_positions = index_positions.numpy().squeeze()
        
        # Create array of probabilities
        probs = probs.numpy().flatten()
        
        # Convert to probabilities per index
        probs = probs/probs.sum()
        
        # randomly choose a character based on probabilities
        char = np.random.choice(index_positions, p=probs)
       
        # return the encoded value of the predicted char and the hidden state
        return model.decoder[char], hidden

In [49]:
def generate_text(model, size, seed='The', k=1):
         
    # CHECK FOR GPU
    if(model.use_gpu):
        model.cuda()
    else:
        model.cpu()
    
    # Evaluation mode
    model.eval()
    
    # begin output from initial seed
    output_chars = [c for c in seed]
    
    # intiate hidden state
    hidden = model.hidden_state(1)
    
    # predict the next character for every character in seed
    for char in seed:
        char, hidden = predict_next_char(model, char, hidden, k=k)
    
    # add initial characters to output
    output_chars.append(char)
    
    # Now generate for size requested
    for i in range(size):
        
        # predict based off very last letter in output_chars
        char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k)
        
        # add predicted character
        output_chars.append(char)
    
    # return string of predicted text
    return ''.join(output_chars)

In [50]:
print(generate_text(model, 1000, seed='The ', k=3))

The beauty of my
    Shallow to have strong thing that I will neving stand
    And shall the beauty which they she died,
    The words of trust to the breath the strong and service
    That she hath bear me would I was a man.
    In my sweet sight thou hast but to my signt,
    Whom I will bear the cheeks of more.
                                                                                 [Aligum]
  CHARMIAN. Where they have spoken to trispess to thee which they have.
  CLEOPATRA. I have not song to bear him.
    The strength and tree, and there that should
    In this that with my son were the state
    Than they are stronger than the such that shall be
    The beauty's promise of his state of his hands.
    The gods shall see my father she was strong
    With trees the better shame of self.
  CLEOPATRA. Welcome, that we do not be the world with thee.  
  COUNTESS. I am, and the strange short as the stronger prayers,
    there is not so true than thou shouldst stronger with thee 