In [None]:
# Load the raw script text from disk.
# `helper` is a module provided by Udacity (the course this project was made for);
# any `helper.*` call in this notebook is a function supplied by Udacity.
import helper
data_dir = './data/Seinfeld_Scripts.txt'
text = helper.load_data(data_dir)

### Understanding the data
Printing some data statistics to understand it better and printing some paragraphs as well

In [None]:
import numpy as np

# (start, stop) slice of lines to preview at the end of this cell.
view_line_range = (0, 10)

# Split once and reuse the result for both the statistics and the preview.
lines = text.split('\n')
word_count_line = [len(line.split()) for line in lines]

print('Dataset Stats')
print('number of unique words: {}'.format(len(set(text.split()))))
print('number of lines: {}'.format(len(lines)))
print('average number of words in each line: {}'.format(np.mean(word_count_line)))

print()
first_line, last_line = view_line_range
print('the lines {} to {}:'.format(first_line, last_line))
print('\n'.join(lines[first_line:last_line]))

### Creating a lookup table
The lookup table assigns each word a number so it can be an input to the network

In [None]:
def create_lookup_tables(text):
    """Build the word <-> integer id lookup tables for the vocabulary.

    Ids are assigned by descending word frequency (ties keep first-seen
    order), so the mapping is identical on every run. The previous version
    enumerated a ``set``, whose iteration order for strings changes between
    interpreter sessions; ids could then stop matching a model trained in an
    earlier session.

    :param text: iterable of hashable tokens (presumably the script already
        split into words — confirm against the caller)
    :return: tuple ``(vocab_to_int, int_to_vocab)``
    """
    from collections import Counter

    # most_common() sorts by count with a stable sort, so equal counts keep
    # the order in which the words first appeared.
    ordered_words = [word for word, _ in Counter(text).most_common()]
    int_to_vocab = dict(enumerate(ordered_words))
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}
    return (vocab_to_int, int_to_vocab)

In [None]:
from string import punctuation

def token_lookup():
    """Return the mapping from punctuation characters to placeholder tokens.

    Each symbol maps to a ``||Name||`` string so it can be handled as a
    separate word instead of staying attached to its neighbour.
    """
    return {
        "!": "||Exclamation||",
        ".": "||Period||",
        ",": "||comma||",
        '"': "||Quotation||",  # same character as string.punctuation[1]
        ";": "||Semicolon||",
        "?": "||Question||",
        "(": "||Left_Parentheses||",
        ")": "||Right_Parentheses||",
        "-": "||Dash||",
        "\n": "||Return||",
    }

### Checking for gpu availability

In [None]:
import torch

# Record whether CUDA is usable; later cells read `train_on_gpu`.
train_on_gpu = torch.cuda.is_available()
print("GPU BABYYYYY!!!" if train_on_gpu
      else 'No GPU found. Please use a GPU to train your neural network.')

### Preprocessing data
The helper function was provided by Udacity and it helped me preprocess the data using the functions I made above

In [None]:
import helper

# Load the preprocessed corpus through the Udacity helper. The four names below
# are whatever it returns, in order (presumably the text as word ids, both lookup
# tables and the punctuation-token dictionary — confirm against helper.load_preprocess).
int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

### Batching data
Creates a function to separate all the text into sequences, where a certain number of words
are put into a features list and the subsequent word is saved as the target. These are then
wrapped in a TensorDataset, which is put into a DataLoader.

In [None]:
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size):
    """Split a stream of word ids into (features, target) pairs in a DataLoader.

    The words are cut into consecutive, non-overlapping chunks of
    ``sequence_length`` ids. In each chunk the first ``sequence_length - 1``
    ids are the features and the last id is the target. Trailing words that
    do not fill a whole chunk are dropped.

    :param words: sequence of integer word ids (list, range or array)
    :param sequence_length: number of ids per chunk, must be >= 1
    :param batch_size: number of chunks per batch yielded by the loader
    :return: DataLoader yielding ``(x, y)`` with ``x`` of shape
        ``(batch, sequence_length - 1)`` and ``y`` of shape ``(batch,)``,
        both int64
    :raises ValueError: if ``sequence_length`` is smaller than 1
    """
    if sequence_length < 1:
        raise ValueError('sequence_length must be a positive integer')

    # only full chunks
    n_sequences = len(words) // sequence_length
    usable = words[:n_sequences * sequence_length]

    # nn.Embedding only accepts Long indices. Letting NumPy infer the dtype
    # gives int32 on some platforms and float64 for empty input, so the dtype
    # is pinned to int64 here.
    chunks = np.asarray(usable, dtype=np.int64).reshape(n_sequences, sequence_length)

    # every column but the last is a feature; the last column is the target
    x = torch.from_numpy(np.ascontiguousarray(chunks[:, :-1]))
    y = torch.from_numpy(np.ascontiguousarray(chunks[:, -1]))

    # wrap in a dataset so the loader yields aligned (features, target) batches
    data = TensorDataset(x, y)
    dataloader = DataLoader(data, batch_size=batch_size)
    return dataloader

In [None]:
# Smoke-test the dataloader on a small, predictable input.

test_text = range(50)
t_loader = batch_data(test_text, sequence_length=5, batch_size=10)

data_iter = iter(t_loader)
# Use the built-in next(): DataLoader iterators no longer expose a .next() method.
sample_x, sample_y = next(data_iter)

print(sample_x.shape)
print(sample_x)
print()
print(sample_y.shape)
print(sample_y)

### Defining the RNN parameters
LSTM cells are used instead of plain RNN cells because they are better at handling the vanishing gradient problem and can be trained to remember more important words and forget less important ones, which makes for more accurate predictions. LSTM was chosen over GRU because LSTM cells have three gates (input, output, and forget) while GRU cells have two (reset and update). This makes GRU cells better suited to smaller datasets and LSTM cells better suited to larger datasets, such as the one we are working with.

In [None]:
from torch import nn

class RNN(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> dropout -> linear.

    Given a batch of word-id sequences, the model returns one score per
    vocabulary word for the word that follows each sequence.

    The constructor previously could not run (misspelled ``batch_first``
    keyword, non-integer layer sizes) and defined ``lstm1``/``lstm2``/
    ``fc1``/``fc2`` while ``forward`` used ``self.lstm`` and ``self.fc``.
    The layers now match what ``forward`` and ``init_hidden`` expect.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """
        :param vocab_size: number of distinct word ids accepted as input
        :param output_size: number of scores produced per sequence
        :param embedding_dim: size of each word embedding vector
        :param hidden_dim: size of the LSTM hidden state
        :param n_layers: number of stacked LSTM layers
        :param dropout: dropout probability, used between LSTM layers and
            before the fully connected layer
        """
        super(RNN, self).__init__()

        # set class variables
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.output_size = output_size

        # define model layers
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout only exists with more than one layer; passing a
        # non-zero value for a single layer just triggers a warning.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        # maps every LSTM output vector to one score per output word
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """Run one batch through the network.

        :param nn_input: LongTensor of word ids, shape ``(batch, seq_len)``
        :param hidden: ``(h, c)`` tuple as produced by ``init_hidden``
        :return: ``(scores, hidden)`` where ``scores`` has shape
            ``(batch, output_size)`` and holds the scores of the last time step
        """
        batch_size = nn_input.size(0)

        # embedding and lstm layer
        x = self.embed(nn_input)
        lstm_out, hidden = self.lstm(x, hidden)

        # flatten (batch, seq, hidden) so every time step goes through the
        # fully connected layer as its own row
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully connected
        fc_out = self.fc(self.dropout(lstm_out))

        # restore the (batch, seq, output_size) layout
        output = fc_out.view(batch_size, -1, self.output_size)

        # keep only the prediction made after the last word of each sequence
        out = output[:, -1]

        # return one batch of output word scores and the hidden state
        return out, hidden

    def init_hidden(self, batch_size):
        """Create a zeroed ``(h, c)`` LSTM state for ``batch_size`` sequences.

        The tensors are created from an existing parameter, so they share the
        model's dtype and device and the method works on both CPU and GPU.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

### Creates training functions

In [None]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """Run one training step (forward pass, backward pass, optimizer step).

    :param rnn: the model; called as ``rnn(inp, hidden)`` and expected to
        return ``(output, hidden)``
    :param optimizer: optimizer over ``rnn``'s parameters
    :param criterion: loss function called as ``criterion(output, target)``
    :param inp: batch of input tensors
    :param target: batch of target tensors
    :param hidden: tuple of hidden-state tensors from the previous step
    :return: ``(loss, hidden)`` — the batch loss as a Python float and the
        hidden state produced by the model
    """
    # Follow the model's device instead of calling .cuda() unconditionally,
    # so the same step also runs on CPU-only machines.
    device = next(rnn.parameters()).device

    # Detach the hidden state from its history so backprop stops at this batch.
    hidden = tuple(each.detach().to(device) for each in hidden)
    inp, target = inp.to(device), target.to(device)

    # clear gradients accumulated by the previous step
    rnn.zero_grad()

    # forward pass and loss
    output, hidden = rnn(inp, hidden)
    loss = criterion(output, target)

    # backward pass
    loss.backward()

    # clip the gradient norm to limit exploding gradients
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)
    optimizer.step()

    # return the loss over a batch and the hidden state produced by our model
    return loss.item(), hidden

In [None]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100,
              loader=None, checkpoint_path='model.pt'):
    """Train ``rnn`` for ``n_epochs`` epochs and return it.

    Every ``show_every_n_batches`` batches the average loss since the last
    report is printed. If that average is the best seen so far, the model's
    state dict is written to ``checkpoint_path``. The earlier version saved
    whenever a batch loss was *higher* than the previous batch's, which kept
    checkpoints of the model getting worse.

    :param rnn: model exposing ``init_hidden(batch_size)``
    :param batch_size: batch size the loader was built with
    :param optimizer: optimizer over ``rnn``'s parameters
    :param criterion: loss function
    :param n_epochs: number of passes over the data
    :param show_every_n_batches: reporting / checkpoint interval in batches
    :param loader: DataLoader to train on; defaults to the notebook-level
        ``train_loader`` for backward compatibility
    :param checkpoint_path: file the best state dict is saved to
    :return: the trained model (same object as ``rnn``)
    """
    if loader is None:
        loader = train_loader

    # Only completely full batches are used: the hidden state is sized for
    # `batch_size` rows, so a shorter final batch would not fit.
    n_batches = len(loader.dataset) // batch_size

    best_loss = np.inf
    batch_losses = []

    rnn.train()
    for epoch_i in range(1, n_epochs + 1):

        # initialize hidden state
        hidden = rnn.init_hidden(batch_size)

        for batch_i, (inputs, labels) in enumerate(loader, 1):
            if batch_i > n_batches:
                break

            # forward, back prop
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)

            # record loss
            batch_losses.append(loss)

            # printing loss stats
            if batch_i % show_every_n_batches == 0:
                average_loss = np.average(batch_losses)
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, average_loss))

                # checkpoint only when the windowed loss actually improved
                if average_loss < best_loss:
                    best_loss = average_loss
                    torch.save(rnn.state_dict(), checkpoint_path)
                    print('Model Trained and Saved')
                batch_losses = []

    # returns a trained rnn
    return rnn

### Batch parameters

In [None]:
# Data params
# Sequence length: number of words in each sequence handed to batch_data
sequence_length = 10  # of words in a sequence
# Batch size: number of sequences per batch
batch_size = 120
# NOTE(review): `in_text` (first 390 ids) is not used by the loader below, which is
# built from the full `int_text` — looks like a leftover from debugging; confirm.
in_text = int_text[:390]
# data loader - do not change
train_loader = batch_data(int_text, sequence_length, batch_size)

### Network parameters

In [None]:
# Training parameters
# Number of epochs (full passes over the training loader)
num_epochs = 8
# Learning rate handed to the optimizer
learning_rate = 0.001

# Model parameters
# Vocab size: one id per entry of the lookup table
vocab_size = len(int_to_vocab)
# Output size: the network scores every vocabulary word, so it equals vocab_size
output_size = vocab_size
# Embedding dimension
embedding_dim = 500
# Hidden dimension
hidden_dim = 100
# Number of RNN layers
n_layers = 2

# Show stats for every n number of batches
show_every_n_batches = 300

### Training

In [None]:
import os

rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)

# Continue training from a previously saved checkpoint when one exists.
# On a fresh run there is no 'model.pt' yet, so training starts from scratch
# instead of failing. map_location='cpu' lets a checkpoint written on a GPU
# machine load anywhere; the model is moved to the GPU below.
checkpoint_path = 'model.pt'
if os.path.exists(checkpoint_path):
    rnn.load_state_dict(torch.load(checkpoint_path, map_location='cpu'), strict=True)

if train_on_gpu:
    rnn.cuda()

# defining loss and optimization functions for training
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# training the model
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

# saving the trained model
helper.save_model('./save/trained_rnn', trained_rnn)
print('Model Trained and Saved')

In [None]:
# Reload the lookup tables and the saved model, so generation can run without
# re-running the training cell. The first value returned by load_preprocess is
# discarded here.
_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
trained_rnn = helper.load_model('./save/trained_rnn') ### helper code provided by Udacity

### Generating text using the RNN

In [None]:
import torch.nn.functional as F

def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100,
             top_k=1, seq_length=None):
    """Generate text with the trained network, starting from ``prime_id``.

    A sliding window of word ids (padded with ``pad_value``) is fed to the
    model, the next word is chosen from the model's scores, and the window
    shifts by one word. Punctuation placeholder tokens are converted back to
    their symbols at the end.

    :param rnn: trained model exposing ``init_hidden(batch_size)``
    :param prime_id: id of the word that starts the text
    :param int_to_vocab: dict mapping word ids to words
    :param token_dict: dict mapping punctuation symbols to placeholder tokens
    :param pad_value: id used to fill the window before the first word
    :param predict_len: number of words to generate after the prime word
    :param top_k: number of most likely words to sample from; ``1`` (default)
        always takes the most likely word
    :param seq_length: window length; defaults to the notebook-level
        ``sequence_length``
    :return: the generated text as a single string
    """
    if seq_length is None:
        seq_length = sequence_length

    rnn.eval()
    # follow the model's device so this works on both CPU and GPU
    device = next(rnn.parameters()).device

    # create a sequence (batch_size=1) with the prime_id in the last slot
    current_seq = np.full((1, seq_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    with torch.no_grad():
        for _ in range(predict_len):
            # Keep current_seq as a NumPy array and build a fresh tensor each
            # step; rebinding it to a tensor broke the np.roll call below.
            seq_tensor = torch.as_tensor(current_seq, dtype=torch.long, device=device)

            # initialize the hidden state
            hidden = rnn.init_hidden(seq_tensor.size(0))

            # get the output of the rnn
            output, _ = rnn(seq_tensor, hidden)

            # get the next word probabilities
            p = F.softmax(output, dim=1).cpu()

            # candidates: the k most likely next words
            k = max(1, min(top_k, p.size(1)))
            top_p, top_i = p.topk(k)
            top_p = top_p.numpy().squeeze(0).astype(np.float64)
            top_i = top_i.numpy().squeeze(0)

            if k == 1:
                word_i = int(top_i[0])
            else:
                # sample among the candidates in proportion to their probability
                word_i = int(np.random.choice(top_i, p=top_p / top_p.sum()))

            # retrieve that word from the dictionary
            predicted.append(int_to_vocab[word_i])

            # the generated word becomes the last element of the next window
            current_seq = np.roll(current_seq, -1, 1)
            current_seq[-1][-1] = word_i

    gen_sentences = ' '.join(predicted)

    # Replace punctuation tokens (they appear lower-cased in the generated text)
    for key, token in token_dict.items():
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')

    # return all the sentences
    return gen_sentences

In [None]:
gen_length = 400 # number of words to generate; modify the length to your preference
prime_word = 'jerry' # name for starting the script

# The padding word comes from the Udacity helper's SPECIAL_WORDS table.
pad_word = helper.SPECIAL_WORDS['PADDING']
# The prime word is looked up with a trailing ':' (i.e. 'jerry:'), so that exact
# key must exist in vocab_to_int.
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)