In [1]:
import numpy as np
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.distributions.categorical import Categorical

from textdataset_generation import TextDataset
from rnn import RNN

Reading in the desired parts of the books

In [2]:
# Helpers
def preprocessing(filepath, text, end=False):
    """
    Cut a raw book text down to its body.

    The start of the body is located with a per-book marker string, chosen by
    looking for an author keyword in `filepath`. The body ends at the first
    "*** END" marker found after the start, unless an explicit `end` index
    is given.

    Parameters
    -----------
    filepath (str)
        Path (or name) of the book file. Must contain one of the known
        keywords, e.g. 'austen' or 'kafka'.

    text (str)
        Full contents of the file.

    end (int or bool)
        Index at which to stop. Any falsy value (the default) means
        "search for the '*** END' marker".

    Returns
    -------
    str
        The slice of `text` from the start marker up to (not including) `end`.

    Raises
    ------
    Exception
        If `filepath` matches none of the known books.
    ValueError
        If the book is known but its start marker does not occur in `text`.
    """
    # Checked in order; the first keyword contained in `filepath` wins.
    start_markers = (
        ('austen', "Chapter I.]"),
        ('dostoyevsky', "CHAPTER I"),
        ('god', "1:1"),
        ('kafka', "Chapter One"),
        ('shelley', "_To"),
        ('tolstoy', "Chapter 1"),
        ('sturluson', "PREFACE OF SNORRE STURLASON."),
        ('cervantes', "Idle reader:"),
    )

    for keyword, marker in start_markers:
        if keyword in filepath:
            break
    else:
        raise Exception("This book is not in our library!")

    start = text.find(marker)
    if start == -1:
        # str.find signals "not found" with -1; slicing from -1 would silently
        # return (almost) nothing, so fail loudly instead.
        raise ValueError(
            f"Start marker {marker!r} not found in the text of {filepath!r}"
        )

    if not end:
        # Only look for the end marker after the start of the body.
        end = text.find("*** END", start)
        if end == -1:
            # No end marker: keep everything up to the end of the text
            # (an index of -1 would drop the final character).
            end = len(text)

    return text[start:end]


Class implementation of the code outlined in Raschka:

In [26]:
class TextGenerationModel:
    """
    Character-level text generator wrapping the project's `RNN` module.

    On construction the book at `filepath` is read and encoded to integers,
    wrapped in a `TextDataset`/`DataLoader`, and the network, loss function
    and optimizer are created. Use `train_model` to fit the network and
    `sample` to generate text from it.
    """

    def __init__(self, 
                 filepath, 
                 device='cpu',
                 seq_length=40, 
                 embed_dim=256, 
                 rnn_hidden_size=512, 
                 batch_size=64, 
                 lr=0.001
                 ):
        """
        Parameters
        -----------
        filepath (str)
            Path to the text file used as training data.

        device (str)
            Device the model and the batches are moved to. Default is 'cpu'.

        seq_length (int)
            Number of characters in each input sequence. Default is 40.

        embed_dim (int)
            Passed to `RNN` as the embedding dimension. Default is 256.

        rnn_hidden_size (int)
            Passed to `RNN` as the hidden size. Default is 512.

        batch_size (int)
            Number of sequences per training batch. Default is 64.

        lr (float)
            Learning rate of the Adam optimizer. Default is 0.001.
        """
        self.device = device
        self.seq_length = seq_length
        self.embed_dim = embed_dim
        self.rnn_hidden_size = rnn_hidden_size
        self.batch_size = batch_size
        self.lr = lr

        # Preprocess text - pick out the relevant part of the text
        self.encode_text(filepath)
        
        # Create dataset and dataloader
        self.create_dataloader()

        # Initialize model, loss function, and optimizer
        self.initialize_model()

    def encode_text(self, filepath):
        """
        Reads in the given file and picks out the relevant parts of the text given
        start and stop defined in `preprocessing`.
        Creates a set of the characters and a dictionary with correspondence between
        character and an integer and uses this to encode the text from strings to ints.
        Splits the encoded text into overlapping chunks of `seq_length + 1`
        characters (stride 1).

        Parameters
        -----------
        filepath (str)
            Path to file one wishes to read in and encode

        Raises
        ------
        ValueError
            If the preprocessed text is shorter than one chunk.
        """
        with open(filepath, 'r', encoding="utf8") as fp:
            text = fp.read()

        text = preprocessing(filepath, text)
        char_set = set(text)

        # create dictionary and encode text
        self.char_array = np.array(sorted(char_set))
        self.char2int = {ch: i for i, ch in enumerate(self.char_array)}
        text_encoded = np.array([self.char2int[ch] for ch in text], dtype=np.int32)
        
        # split text into chunks; the extra character lets each chunk hold
        # both an input sequence and its one-step-shifted target
        chunk_size = self.seq_length + 1
        if len(text_encoded) < chunk_size:
            raise ValueError(
                f"Text has {len(text_encoded)} characters, "
                f"but at least {chunk_size} are needed for seq_length={self.seq_length}"
            )
        self.text_chunks = [text_encoded[i:i + chunk_size] for i in range(len(text_encoded) - chunk_size + 1)]

    def create_dataloader(self):
        """
        Create dataset and dataloader of text chunks.
        """
        seq_dataset = TextDataset(torch.tensor(np.array(self.text_chunks)))
        # drop_last keeps every batch at exactly `batch_size` rows, which is
        # the size the hidden state is initialised with in `train_model`.
        self.seq_dl = DataLoader(seq_dataset, batch_size=self.batch_size, shuffle=True, drop_last=True)

    def initialize_model(self):
        """
        Initializing the model by setting up the neural net with its nodes, and
        defining the loss function and the optimizer.
        """
        self.vocab_size = len(self.char_array)
        self.model = RNN(self.vocab_size, self.embed_dim, self.rnn_hidden_size)
        # Move the network right away so `sample` also works on a non-CPU
        # device when it is called before `train_model`.
        self.model.to(self.device)
        self.loss_fn = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
    
    def train_model(self, num_epochs=100, log_every=500):
        """ 
        Train the model given the model parameters specified in 
        `initialize_model` for a number of epochs. 

        Note that one "epoch" here is a single optimizer step on one randomly
        drawn batch, not a full pass over the dataset.

        Parameters
        ----------
        num_epochs (int):
            Number of training epochs. Default is 100.

        log_every (int):
            Print the mean per-character loss every `log_every` epochs.
            A falsy value disables printing. Default is 500.
        """
        torch.manual_seed(1)
        self.model.to(self.device)
        # `sample` switches the model to eval mode; always train in train mode.
        self.model.train()

        for epoch in range(num_epochs):
            hidden, cell = self.model.init_hidden(self.batch_size)
            # Same device handling as in `sample`; a no-op if the states are
            # already on the right device.
            hidden, cell = hidden.to(self.device), cell.to(self.device)

            # A fresh iterator over the shuffled loader yields one random batch.
            seq_batch, target_batch = next(iter(self.seq_dl))
            seq_batch, target_batch = seq_batch.to(self.device), target_batch.to(self.device)

            self.optimizer.zero_grad()
            loss = 0
            # Feed the sequence one character position at a time and
            # accumulate the loss over all positions.
            for c in range(self.seq_length):
                pred, hidden, cell = self.model(seq_batch[:, c], hidden, cell)
                loss += self.loss_fn(pred, target_batch[:, c])
            loss.backward()
            self.optimizer.step()
            mean_loss = loss.item() / self.seq_length

            if log_every and epoch % log_every == 0:
                print(f'Epoch {epoch} loss: {mean_loss:.4f}')

    def save_model(self, path):
        """ 
        Saves the model with the trained weights 
        to the specified location.

        The whole model object is serialized (not only its state dict), so
        loading it later requires the `RNN` class to be importable.

        Parameters:
        -----------
        path (str)
            Filepath to desired saving location with name of model.
        """
        torch.save(self.model, path)

    def sample(self, starting_str, len_generated_text=500, scale_factor=2.0):
        """
        Generate a sample of text that is (ideally) similar to the training
        inputs.

        Parameters:
        -----------
        starting_str (str)
            Non-empty string starting out the text generation. Every
            character must occur in the training text.

        len_generated_text (int)
            Number of characters generated after `starting_str`. Default is 500.

        scale_factor (float)
            Controls the level of randomness of the text generation: the
            logits are multiplied by this factor before sampling, so larger
            values make the output more predictable. Default is 2.0.

        Returns
        -------
        str
            `starting_str` followed by the generated characters.

        Raises
        ------
        ValueError
            If `starting_str` is empty.
        KeyError
            If `starting_str` contains a character not seen in the training text.
        """
        if not starting_str:
            raise ValueError("starting_str must contain at least one character")

        encoded_input = torch.tensor([self.char2int[s] for s in starting_str])
        encoded_input = torch.reshape(encoded_input, (1, -1))

        self.model.eval()
        hidden, cell = self.model.init_hidden(1)
        hidden, cell = hidden.to(self.device), cell.to(self.device)
        encoded_input = encoded_input.to(self.device)

        generated_chars = [starting_str]

        # No gradients are needed for generation; without this an autograd
        # graph would be kept alive across all generated characters.
        with torch.no_grad():
            # Warm up the hidden state on all but the last prompt character.
            for c in range(len(starting_str) - 1):
                _, hidden, cell = self.model(encoded_input[:, c].view(1), hidden, cell)

            last_char = encoded_input[:, -1]
            for _ in range(len_generated_text):
                logits, hidden, cell = self.model(last_char.view(1), hidden, cell)
                logits = torch.squeeze(logits, 0)
                scaled_logits = logits * scale_factor
                last_char = Categorical(logits=scaled_logits).sample()
                # .item() gives a plain Python int, so the NumPy lookup does
                # not depend on which device the tensor lives on.
                generated_chars.append(str(self.char_array[last_char.item()]))

        return ''.join(generated_chars)

Creating an instance of the class

In [None]:
# Build the character-level generator on the Austen text (CPU run).
model = TextGenerationModel(
    filepath='../Texts/austen.txt',
    device='cpu',
    batch_size=64,
    seq_length=40,
    embed_dim=256,
    rnn_hidden_size=512,
)

# Run the optimisation loop for 10,000 steps.
model.train_model(num_epochs=10_000)

# Optionally persist the trained network:
#model.save_model('austen_generator.pt')

# Continue the seed string with 500 sampled characters and show the result.
generated_text = model.sample(
    starting_str='Darcy',
    len_generated_text=500,
)
print(generated_text)

Epoch 0 loss: 4.4536
Victore in the part of his convinces delighted him to her talked her as such all the scarcely and make intemption the had not concere concent of seelings at the subject, and she was not be the event that I have been her feel that her something and they concending her the would accontent the former of Mr. Bennet which must attention of compoming a seld in the part, and been me, at it is not be mether in the subject of her
in my deserved and the should not such a sention of all the was mention you are o
