## Preprocessing EDA

In [15]:
import torch
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print("Training on GPU")
else:
    print("Training on CPU")

Training on GPU


In [16]:
with open("friends10.txt", "r") as f:
    dialogs = f.read()
text = dialogs[:]
print(dialogs[:100])
print(text[:100])
print(len(dialogs))


[Scene: Barbados, Monica and Chandler's Room. They both enter from Ross's room. Monica still has her
[Scene: Barbados, Monica and Chandler's Room. They both enter from Ross's room. Monica still has her
416013


## Explore the Data

In [17]:
import numpy as np

view_line_range = (0, 10)
#roughly the number of unique words
print("Roughly the number of unique words: {}".format(len({word:None for word in dialogs.split()})))
lines = dialogs.split("\n")
print("Number of lines: {}".format(len(lines)))
word_count_line = [len(line.split()) for line in lines]
print("Average number of word in each line: {}".format(np.average(word_count_line)))
print()
print("The lines from {} to {}".format(*view_line_range))
print("\n".join(dialogs.split("\n")[view_line_range[0]:view_line_range[1]]))

Roughly the number of unique words: 10760
Number of lines: 12389
Average number of word in each line: 5.975704253773508

The lines from 0 to 10
[Scene: Barbados, Monica and Chandler's Room. They both enter from Ross's room. Monica still has her big, frizzy hair.]

Monica: Oh, the way you crushed Mike at ping pong was such a turn-on.You wanna...? (plays with her finger on Chandlers chest)

Chandler: You know, I'd love to, but I'm a little tired.

Monica: I'll put a pillowcase over my head.

Chandler: You're on!



### Preprocessing Data

In [18]:
from collections import Counter

def create_lookup_tables(text):
    counts = Counter(text)
    vocab = sorted(counts, key=counts.get, reverse=True)
    vocab_to_int = {word:ii for ii, word in enumerate(vocab, 1)}
    int_to_vocab = {ii: word for word, ii in vocab_to_int.items()}
    return (vocab_to_int, int_to_vocab)
    

In [19]:
def token_lookup():
    symbols = [".", ",", "\"", ";", "!", "?", "(", ")", "-", "\n"]
    punctuations = ["||Period||",
                    "||Comma||",
                    "||Quotation_Mark||",
                    "||Semicolon||",
                    "||Exclamation_Mark||",
                    "||Question_Mark||",
                    "||Left_Parentheses||",
                    "||Right_Parentheses||",
                    "||Dash||",
                    "||Return||"
                   ]
    d = {symbol:token for symbol, token in zip(symbols, punctuations)}
    return d

In [20]:
import pickle
import os
SPECIAL_WORDS = {'PADDING': '<PAD>'}
token_dict = token_lookup()

if os.path.exists('preprocessfriends.p'):
    int_text, vocab_to_int, int_to_vocab, token_dict = pickle.load(open('preprocessfriends.p', mode='rb'))
else:
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    text = text.lower()
    text = text.split()

    vocab_to_int, int_to_vocab = create_lookup_tables(text + list(SPECIAL_WORDS.values()))
    int_text = [vocab_to_int[word] for word in text]
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), open('preprocessfriends.p', 'wb'))

## Batching

In [21]:
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size):
    """
    Batch the neural network data using DataLoader
    :param words: The word ids of the Friends TV scripts
    :param sequence_length: The sequence length of each batch
    :param batch_size: The size of each batch; the number of sequences in a batch
    :return: DataLoader with batched data
    """
    # TODO: Implement function
    total_batches = batch_size*sequence_length
    n_batches = len(words)//total_batches
    words = words[:n_batches*total_batches]

    features, target = [], []
    for ii in range(0, len(words)):
        if ii+sequence_length < len(words):
            features.append(words[ii:ii+sequence_length])
            target.append(words[ii+sequence_length])
    
    data = TensorDataset(torch.tensor(features), torch.tensor(target))
    data_loader = DataLoader(data, shuffle = True, batch_size=batch_size)
    
    # return a dataloader
    return data_loader

## Test Data Loader

In [22]:
test_text = range(50)
t_loader = batch_data(test_text, sequence_length=5, batch_size=10)

data_iter = iter(t_loader)
sample_x, sample_y = data_iter.next()

print(sample_x.shape)
print(sample_x)
print()
print(sample_y.shape)
print(sample_y)

torch.Size([10, 5])
tensor([[30, 31, 32, 33, 34],
        [20, 21, 22, 23, 24],
        [11, 12, 13, 14, 15],
        [21, 22, 23, 24, 25],
        [24, 25, 26, 27, 28],
        [15, 16, 17, 18, 19],
        [23, 24, 25, 26, 27],
        [38, 39, 40, 41, 42],
        [ 2,  3,  4,  5,  6],
        [22, 23, 24, 25, 26]])

torch.Size([10])
tensor([35, 25, 16, 26, 29, 20, 28, 43,  7, 27])


## Build the Neural Network

In [24]:
import torch.nn as nn

class RNN(nn.Module):
    
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """
        Initialize the PyTorch RNN Module
        :param vocab_size: The number of input dimensions of the neural network (the size of the vocabulary)
        :param output_size: The number of output dimensions of the neural network
        :param embedding_dim: The size of embeddings, should you choose to use them        
        :param hidden_dim: The size of the hidden layer outputs
        :param dropout: dropout to add in between LSTM/GRU layers
        """
        super(RNN, self).__init__()
        # TODO: Implement function
        # set class variables
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        #embedding and LSTM layers: Given the big amount of words is better to use embedding layer as a lookup table
        # define model layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim, 
                            n_layers,
                            dropout = dropout,
                            batch_first=True)
        #dropout layer
        self.dropout = nn.Dropout(0.3)
        
        #linear layer
        self.fc = nn.Linear(hidden_dim, output_size)
        #self.sig = nn.Sigmoid()
        
    
    
    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network
        :param nn_input: The input to the neural network
        :param hidden: The hidden state        
        :return: Two Tensors, the output of the neural network and the latest hidden state
        """
        # TODO: Implement function   
        batch_size = nn_input.size(0)
        
        #embedding and lstm_out
        embeds = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        #stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        #dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        #sigmoid function
        #sig_out = self.sig(out)
        
        #reshape to the batch_size first
        out = out.view(batch_size, -1, self.output_size)
        out = out[:,-1] # get the last batch of labels

        # return one batch of output word scores and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of an LSTM/GRU
        :param batch_size: The batch_size of the hidden state
        :return: hidden state of dims (n_layers, batch_size, hidden_dim)
        '''
        # Implement function
        
        # initialize hidden state with zero weights, and move to GPU if available
        
        weight = next(self.parameters()).data
        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
            
        return hidden


## Define Forward and backpropagation

In [25]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """
    Forward and backward propagation on the neural network
    :param rnn: The PyTorch Module that holds the neural network
    :param optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param inp: A batch of input to the neural network
    :param target: The target output for the batch of input
    :return: The loss and the latest hidden state Tensor
    """
    
    # TODO: Implement Function
    
    # move data to GPU, if available
    if train_on_gpu:
        #rnn = rnn.cuda()
        inp, target = inp.cuda(), target.cuda()
    
    # perform backpropagation and optimization
    hidden = tuple([each.data for each in hidden])
    rnn.zero_grad()
    output, hidden = rnn.forward(inp, hidden)
    loss = criterion(output.squeeze(), target.long())
    loss.backward()
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)
    optimizer.step()
    
    # return the loss over a batch and the hidden state produced by our model
    return loss.item(), hidden

## Train RNN

In [26]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100):
    batch_losses = []
    
    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):
        
        # initialize hidden state
        hidden = rnn.init_hidden(batch_size)
        
        for batch_i, (inputs, labels) in enumerate(train_loader, 1):
            
            # make sure you iterate over completely full batches, only
            n_batches = len(train_loader.dataset)//batch_size
            if(batch_i > n_batches):
                break
            
            # forward, back prop
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)          
            # record loss
            batch_losses.append(loss)

            # printing loss stats
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

    # returns a trained rnn
    return rnn

## Optimization And Model Hyperparameters

* Set sequence_length to the length of a sequence.
* Set batch_size to the batch size.
* Set num_epochs to the number of epochs to train for.
* Set learning_rate to the learning rate for an Adam optimizer.
* Set vocab_size to the number of unique tokens in our vocabulary.
* Set output_size to the desired size of the output.
* Set embedding_dim to the embedding dimension; smaller than the vocab_size.
* Set hidden_dim to the hidden dimension of your RNN.
* Set n_layers to the number of layers/cells in your RNN.
* Set show_every_n_batches to the number of batches at which the neural network should print progress.

In [30]:
# config1
# Sequence Length
sequence_length = 50  # of words in a sequence
# Batch Size
batch_size = 128

# data loader - do not change
train_loader = batch_data(int_text, sequence_length, batch_size)
# Training parameters
# Number of Epochs
num_epochs = 5
# Learning Rate
learning_rate = 0.01

# Model parameters
# Vocab size
vocab_size = len(vocab_to_int) + 1
# Output size
output_size = len(set(vocab_to_int))+1
# Embedding Dimension
embedding_dim = 400
# Hidden Dimension
hidden_dim = 256
# Number of RNN Layers
n_layers = 2
# Show stats for every n number of batches
show_every_n_batches = 500

In [31]:
import time
start = time.time()

# create model and move to gpu if available
rnn1 = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if train_on_gpu:
    rnn1.cuda()

# defining loss and optimization functions for training
optimizer = torch.optim.Adam(rnn1.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# training the model
trained_rnn1 = train_rnn(rnn1, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

end = time.time()
elapsed_time = end - start
print("Elapsed time: {} min, {} sec".format(elapsed_time//60, elapsed_time))

# saving the trained model
torch.save(trained_rnn1, 'trained_rnn_config1')
print('Model Trained and Saved')

Training for 5 epoch(s)...
Epoch:    1/5     Loss: 4.938925834655762

Epoch:    2/5     Loss: 4.528153651454283

Epoch:    3/5     Loss: 4.310648696863469

Epoch:    4/5     Loss: 4.175825493917027

Epoch:    5/5     Loss: 4.108616657054887

Elapsed time: 2.0 min, 144.26672387123108 sec
Model Trained and Saved


  "type " + obj.__name__ + ". It won't be checked "


In [35]:
# Data params
# Sequence Length
sequence_length = 10  # of words in a sequence
# Batch Size
batch_size = 256

# data loader - do not change
train_loader = batch_data(int_text, sequence_length, batch_size)
# Training parameters
# Number of Epochs
num_epochs = 10
# Learning Rate
learning_rate = 0.001

# Model parameters
# Vocab size
vocab_size = len(vocab_to_int) + 1
# Output size
output_size = len(set(vocab_to_int))+1
# Embedding Dimension
embedding_dim = 400
# Hidden Dimension
hidden_dim = 256
# Number of RNN Layers
n_layers = 2
# Show stats for every n number of batches
show_every_n_batches = 500

In [36]:
import time
start = time.time()

# create model and move to gpu if available
rnn2 = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if train_on_gpu:
    rnn2.cuda()

# defining loss and optimization functions for training
optimizer = torch.optim.Adam(rnn2.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# training the model
trained_rnn2 = train_rnn(rnn2, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

end = time.time()
elapsed_time = end - start
print("Elapsed time: {} min, {} sec".format(elapsed_time//60, elapsed_time))

# saving the trained model
torch.save(trained_rnn2, 'trained_rnn_config2')
print('Model Trained and Saved')

Training for 10 epoch(s)...
Elapsed time: 1.0 min, 63.37096166610718 sec
Model Trained and Saved
