In [1]:
# importing...
import numpy as np
from torch import nn,optim

In [2]:
# Read the whole Gintama script into memory as one string.
with open('text.txt', 'r') as script_file:
    text = script_file.read()

In [3]:
# exploring data
# (start, stop) slice bounds of the script lines printed at the end of this cell
view_line_range = (0, 10)

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
import numpy as np

print('Dataset Stats')
# "Roughly": tokens are split on whitespace only, so punctuation and letter case
# still distinguish otherwise identical words.
print('Roughly the number of unique words: {}'.format(len({word: None for word in text.split()})))

# One entry per newline-separated line of the raw script.
lines = text.split('\n')
print('Number of lines: {}'.format(len(lines)))
# Whitespace-separated word count of every line (blank lines count as 0).
word_count_line = [len(line.split()) for line in lines]
print('Average number of words in each line: {}'.format(np.average(word_count_line)))

print()
print('The lines {} to {}:'.format(*view_line_range))
# Show the selected slice of lines exactly as they appear in the file.
print('\n'.join(text.split('\n')[view_line_range[0]:view_line_range[1]]))

Dataset Stats
Roughly the number of unique words: 41322
Number of lines: 56992
Average number of words in each line: 7.141774284110051

The lines 0 to 10:
Kagura: There were also many other \Nstaff who were affected.
Kagura: and *****-san.
All: We're very sorry!
GINTOKI: Time to commit seppuku.
GINTOKI: You're dressed in white \Nso you're ready to die.
shinpachi: Why only me?!
shinpachi: We're all dressed in white!
Kagura: What's wrong?
Kagura: It's time for you to take responsibility
Kagura: for turning the eighth \Nepisode of the second series


In [5]:
from collections import Counter

def create_lookup_tables(text):
    """
    Build the two vocabulary mappings used to encode and decode words.

    Ids are handed out by descending frequency, so the most common word gets
    id 0; words with equal counts keep their first-seen order.
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    vocab_to_int = {}
    int_to_vocab = {}
    # most_common() yields (word, count) pairs from most to least frequent
    for idx, (word, _count) in enumerate(Counter(text).most_common()):
        int_to_vocab[idx] = word
        vocab_to_int[word] = idx
    return vocab_to_int, int_to_vocab


In [6]:
def token_lookup():
    """
    Build the punctuation-to-token table used during preprocessing.

    Each punctuation mark maps to a distinct ``||name||`` placeholder.
    :return: Tokenized dictionary where the key is the punctuation and the value is the token
    """
    return {
        '.': '||period||',
        ',': '||comma||',
        '"': '||quotation_mark||',
        ';': '||semicolon||',
        '!': '||exclam_mark||',
        '?': '||question_mark||',
        '(': '||left_par||',
        ')': '||right_par||',
        '-': '||dash||',
        '\n': '||return||',
    }

In [7]:
# Vocabulary entries that never occur in the raw script itself.
SPECIAL_WORDS = {'PADDING': '<PAD>'}

# Surround every punctuation mark's placeholder with spaces so that the
# whitespace split below treats it as a word of its own.
token_dict = token_lookup()
for key, token in token_dict.items():
    text = text.replace(key, ' ' + token + ' ')

# NOTE: `text` is rebound from a string to a list of words here, so this cell
# cannot be re-run without first re-running the cell that loads the file.
text = text.lower().split()

# Build the vocabulary (including the padding word) and encode the script as ids.
vocab_to_int, int_to_vocab = create_lookup_tables(text + list(SPECIAL_WORDS.values()))
int_text = list(map(vocab_to_int.__getitem__, text))

In [8]:
## check access to gpu
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import torch

# Check for a GPU
# `train_on_gpu` is a notebook-wide flag read by later cells to decide whether
# tensors and the model are moved to CUDA.
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    # Only a warning is printed; nothing here stops execution without a GPU.
    print('No GPU found. Please use a GPU to train your neural network.')
    

In [9]:
from torch.utils.data import TensorDataset, DataLoader


def batch_data(words, sequence_length, batch_size):
    """
    Batch the neural network data using DataLoader.

    Every window of `sequence_length` consecutive word ids becomes one feature
    row, and the word id that immediately follows the window is its target.
    :param words: The word ids of the TV scripts
    :param sequence_length: The sequence length of each batch
    :param batch_size: The size of each batch; the number of sequences in a batch
    :return: DataLoader with batched data (shuffled; features and targets are int64)
    :raises ValueError: if `words` is too short to yield one window plus its target
    """
    word_ids = np.asarray(words, dtype='int64')
    n_samples = len(word_ids) - sequence_length
    if n_samples < 1:
        raise ValueError(
            'Need more than {} word ids to build a sequence of length {}, got {}'.format(
                sequence_length, sequence_length, len(word_ids)))

    # Broadcasting a column of window start positions against a row of in-window
    # offsets gives an (n_samples, sequence_length) index grid, so all windows
    # are gathered in one indexing operation instead of a Python loop.
    window_idx = np.arange(n_samples)[:, None] + np.arange(sequence_length)[None, :]
    data_x = word_ids[window_idx]
    # The target of window i is the word at position i + sequence_length.
    # Copy so the tensor never shares memory with a caller-supplied array.
    data_y = word_ids[sequence_length:].copy()

    dataset = TensorDataset(torch.from_numpy(data_x), torch.from_numpy(data_y))
    # return a dataloader
    return DataLoader(dataset, shuffle=True, batch_size=batch_size)

In [10]:
import torch.nn as nn

class RNN(nn.Module):
    """
    Word-level language model: embedding -> stacked LSTM -> linear layer.

    Given a batch of word-id sequences it returns one score per vocabulary
    word for the position that follows each sequence.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """
        Initialize the PyTorch RNN Module
        :param vocab_size: The number of input dimensions of the neural network (the size of the vocabulary)
        :param output_size: The number of output dimensions of the neural network
        :param embedding_dim: The size of embeddings, should you choose to use them
        :param hidden_dim: The size of the hidden layer outputs
        :param n_layers: The number of stacked LSTM layers
        :param dropout: dropout to add in between LSTM/GRU layers
        """
        super(RNN, self).__init__()
        # set class variables
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # define model layers
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network
        :param nn_input: The input to the neural network, word ids of shape (batch_size, seq_len)
        :param hidden: The hidden state, a (h, c) tuple as produced by init_hidden
        :return: Two Tensors, the output of the neural network with shape
                 (batch_size, output_size) and the latest hidden state
        """
        nn_input = nn_input.long()
        embedded = self.embed(nn_input)
        lstm_out, hidden = self.lstm(embedded, hidden)

        # Only the scores for the final time step are returned, so the linear
        # layer is applied to that step alone rather than to every step and
        # then sliced.
        output = self.fc(lstm_out[:, -1])

        # return one batch of output word scores and the hidden state
        return output, hidden

    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of an LSTM/GRU
        :param batch_size: The batch_size of the hidden state
        :return: hidden state, a tuple (h, c) of zero tensors each of dims
                 (n_layers, batch_size, hidden_dim)
        '''
        # new_zeros() inherits dtype and device from the model's own parameters,
        # so the state always lives where the model does (CPU or GPU) without
        # consulting any notebook-level flag.
        weight = next(self.parameters())
        return (weight.new_zeros(self.n_layers, batch_size, self.hidden_dim),
                weight.new_zeros(self.n_layers, batch_size, self.hidden_dim))

In [11]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """
    Forward and backward propagation on the neural network
    :param rnn: The PyTorch Module that holds the neural network; called as
                rnn(inp, hidden) and expected to return (scores, hidden) with
                scores of shape (batch_size, n_classes)
    :param optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param inp: A batch of input to the neural network
    :param target: The target output for the batch of input
    :param hidden: The hidden state (tuple of Tensors) carried over from the previous batch
    :return: The loss and the latest hidden state Tensor
    """
    # Place the batch on whatever device the model's parameters live on.
    device = next(rnn.parameters()).device
    inp = inp.to(device)
    target = target.to(device)

    # Detach the carried-over state so gradients do not flow back into the
    # graphs of earlier batches.
    hidden = tuple(each.detach() for each in hidden)

    # perform backpropagation and optimization
    rnn.zero_grad()
    output, hidden = rnn(inp, hidden)
    # The scores are passed as-is: squeezing would also drop the batch
    # dimension when the batch holds a single sample and break the loss call.
    loss = criterion(output, target)
    loss.backward()

    # Clip the gradient norm to guard against exploding gradients.
    nn.utils.clip_grad_norm_(rnn.parameters(), 4)
    optimizer.step()

    # return the loss over a batch and the hidden state produced by our model
    return loss.item(), hidden


In [12]:

def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100):
    """
    Train the network on the batches of the notebook-level `train_loader`.

    :param rnn: The PyTorch Module to train; must provide init_hidden(batch_size)
    :param batch_size: The number of sequences per batch used to build `train_loader`
    :param optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param n_epochs: The number of passes over the training data
    :param show_every_n_batches: Print the average loss every this many batches
    :return: The trained rnn (the same object that was passed in)
    """
    batch_losses = []

    rnn.train()

    # The hidden state is created for exactly `batch_size` sequences, so a
    # trailing partial batch must be skipped. The number of full batches does
    # not change during training, so it is computed once up front.
    n_batches = len(train_loader.dataset) // batch_size

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):

        # initialize hidden state
        hidden = rnn.init_hidden(batch_size)

        for batch_i, (inputs, labels) in enumerate(train_loader, 1):

            # make sure you iterate over completely full batches, only
            if batch_i > n_batches:
                break

            # forward, back prop
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            # record loss
            batch_losses.append(loss)

            # printing loss stats
            # (losses are only cleared here, so an average can include batches
            # from the end of the previous epoch)
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

    # returns a trained rnn
    return rnn

In [14]:
# Data parameters
sequence_length = 10  # number of words in each input sequence
batch_size = 128      # number of sequences per batch

# Shuffled DataLoader over every (sequence, next word) pair of the encoded script
train_loader = batch_data(int_text, sequence_length=sequence_length, batch_size=batch_size)

In [15]:
# --- Training parameters ---
num_epochs = 15
learning_rate = 0.001

# --- Model parameters ---
# The network reads and predicts over the same vocabulary, so the input and
# output sizes are identical.
vocab_size = len(vocab_to_int)
output_size = vocab_size
embedding_dim = 256
hidden_dim = embedding_dim * 3 // 2  # 1.5 x 256 = 384
n_layers = 2  # number of stacked RNN layers

# --- Logging ---
# Print the average loss every this many batches
show_every_n_batches = 2000

In [16]:
# Build the network and place it on the GPU when one is available
rnn = RNN(vocab_size=vocab_size,
          output_size=output_size,
          embedding_dim=embedding_dim,
          hidden_dim=hidden_dim,
          n_layers=n_layers,
          dropout=0.5)
if train_on_gpu:
    rnn.cuda()

# Optimiser and loss used for training
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Run the training loop
trained_rnn = train_rnn(rnn=rnn,
                        batch_size=batch_size,
                        optimizer=optimizer,
                        criterion=criterion,
                        n_epochs=num_epochs,
                        show_every_n_batches=show_every_n_batches)

# Persist the whole trained module to disk
torch.save(trained_rnn, 'trained_rnn.pt')
print('Model Trained and Saved')

Training for 15 epoch(s)...
Epoch:    1/15    Loss: 5.145742391586304

Epoch:    1/15    Loss: 4.596281274080276

Epoch:    2/15    Loss: 4.288210717727797

Epoch:    2/15    Loss: 4.182242734193802

Epoch:    3/15    Loss: 3.9634728665879972

Epoch:    3/15    Loss: 3.913464971899986

Epoch:    4/15    Loss: 3.7257964204373435

Epoch:    4/15    Loss: 3.710173046588898

Epoch:    5/15    Loss: 3.5420918574522076

Epoch:    5/15    Loss: 3.548871638417244

Epoch:    6/15    Loss: 3.3951078974246593

Epoch:    6/15    Loss: 3.4106796110868456

Epoch:    7/15    Loss: 3.270848271050758

Epoch:    7/15    Loss: 3.311476106762886

Epoch:    8/15    Loss: 3.1743665154942025

Epoch:    8/15    Loss: 3.224072265267372

Epoch:    9/15    Loss: 3.086100681423659

Epoch:    9/15    Loss: 3.1342916858196257

Epoch:   10/15    Loss: 3.0234283503738255

Epoch:   10/15    Loss: 3.0779945914745332

Epoch:   11/15    Loss: 2.9589432012485513

Epoch:   11/15    Loss: 3.0191894228458405

Epoch:   12/15 

AttributeError: module 'torch' has no attribute 'save_model'

In [17]:
# Serialise the entire trained module object (not just its weights) to
# 'trained_rnn.pt'. This repeats the save step at the end of the training cell.
torch.save(trained_rnn,'trained_rnn.pt')
print('Model Trained and Saved')

  "type " + obj.__name__ + ". It won't be checked "


Model Trained and Saved


In [18]:
import torch.nn.functional as F

def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100):
    """
    Generate text using the neural network
    :param rnn: The PyTorch Module that holds the trained neural network
    :param prime_id: The word id to start the first prediction
    :param int_to_vocab: Dict of word id keys to word values
    :param token_dict: Dict of punctuation keys to punctuation token values
    :param pad_value: The value used to pad a sequence
    :param predict_len: The length of text to generate
    :return: The generated text

    The input window length is taken from the notebook-level `sequence_length`.
    """
    rnn.eval()

    # Feed the network on whatever device its parameters live on.
    device = next(rnn.parameters()).device
    top_k = 5

    # create a sequence (batch_size=1) with the prime_id
    # The rolling window stays a NumPy array for the whole loop; a fresh tensor
    # is built from it on every step. This keeps np.roll away from torch
    # tensors, which cannot be converted to NumPy while they are on the GPU.
    current_seq = np.full((1, sequence_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for _ in range(predict_len):
            seq_tensor = torch.LongTensor(current_seq).to(device)

            # initialize the hidden state
            hidden = rnn.init_hidden(seq_tensor.size(0))

            # get the output of the rnn
            output, _ = rnn(seq_tensor, hidden)

            # get the next word probabilities
            p = F.softmax(output, dim=1).cpu()

            # use top_k sampling to get the index of the next word
            # (never ask for more candidates than there are words)
            p, top_i = p.topk(min(top_k, p.size(1)))
            top_i = top_i.numpy().reshape(-1)

            # select the likely next word index with some element of randomness
            p = p.numpy().reshape(-1)
            word_i = np.random.choice(top_i, p=p/p.sum())

            # retrieve that word from the dictionary
            word = int_to_vocab[word_i]
            predicted.append(word)

            # the generated word becomes the next "current sequence" and the cycle can continue
            current_seq = np.roll(current_seq, -1, 1)
            current_seq[-1][-1] = word_i

    gen_sentences = ' '.join(predicted)

    # Replace punctuation tokens
    for key, token in token_dict.items():
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')

    # return all the sentences
    return gen_sentences

In [86]:
# Generation settings
gen_length = 100       # number of words to generate; modify to your preference
prime_word = 'kagura'  # speaker name that starts the script
pad_word = SPECIAL_WORDS['PADDING']

# Speaker tags are stored in the vocabulary with their trailing colon.
prime_id = vocab_to_int[prime_word + ':']
pad_id = vocab_to_int[pad_word]

generated_script = generate(trained_rnn, prime_id, int_to_vocab, token_dict, pad_id, gen_length)
print(generated_script)

kagura: you can do!
shinpachi: what are you going to do?
shinpachi: what do we do?
gintoki: don't you understand?
kagura: you should go to sleep.
kagura: i don't care what happens\nto you geezers anymore.
hijikata: i won't allow you to beat\nme to this world.
kagura: that's why i told you to stop following my hair suffer.
shinpachi: the benizakura arc has\nnothing...
yamazaki:... a woman with a lady that\nhelps children.
gintoki: what?
kagura: i


In [87]:
# Generate again with the same prime word and settings; generate() samples each
# next word randomly, so the resulting script differs between runs.
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)

kagura: you can go!
gintoki: don't give me that shit!
kagura: i won't let him!
kagura: i'm sorry. i have no idea \nwhat i should\ndo.
gintoki: what kind of education are you going to keep talking to us!
shinpachi: what?!
shinpachi: why do we have to be stuck with\nmr?
kagura: it's the 31st.
shinpachi: i don't know how to show\nmy appreciation or that...
gintoki: i didn't want you to die.
gintoki: i don't remember asking you.


In [88]:
# Another sample with identical arguments; the output varies because the next
# word is drawn at random inside generate().
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)

kagura:?!
gintoki: don't give me your copy!
shinpachi: that's not the point!
shinpachi: what?!
shinpachi: what do you think of\nsomething like that is?
gintoki: what kind of errands are you?
gintoki: it's a pretty girl meeting.
gintoki: i don't want to see you through this thing.
shinpachi: you don't have to say it...
shinpachi: that's enough.
shinpachi: that's not the issue.
shinpachi: i don't know anything about the chain shoots.
gintoki: huh?
