# Shaw's Last Act

In [1]:
import json
import os
import re
import string

import numpy as np
import torch
import torch.functional as functional
import torch.nn as nn
from gensim.corpora.dictionary import Dictionary
from gensim.utils import tokenize
from torch.utils.data import DataLoader

from ShawsDataset import ShawsDataset



In [2]:
# read the whole script corpus into memory as a single string
original_text_path = 'data/original_scripts.txt'
with open(original_text_path, mode="r", encoding="utf8") as corpus_file:
    raw = corpus_file.read()

In [3]:
# text statistics

# distinct characters appearing anywhere in the corpus
unique_chars = set(raw)
print(f'There are {len(unique_chars)} unique characters in the text')

# word counts are approximate: the text is split on single spaces only,
# so the number of pieces is the number of spaces plus one
n_words = raw.count(' ') + 1
print(f'There are approximately {n_words} words in the text')

n_unique_words = len(set(raw.split(' ')))
print(f'There are approximately {n_unique_words} unique words in the text')

# splitting on newlines yields one more piece than there are newlines
n_lines = raw.count('\n') + 1
print(f'There are {n_lines} lines in the text')

print(f'On average, there are {n_words / n_lines} words per line')

# every script starts with a "Title: <name>" header line; keep only the name
titles = [
    header.replace('\n', '').replace('Title: ', '')
    for header in re.findall('Title:.*\n', raw)
]
print(f'There are {len(titles)} different scripts in the text\n')
print('The text contains the scripts for the titles:', *titles, sep='\n  - ')

There are 95 unique characters in the text
There are approximately 284240 words in the text
There are approximately 49976 unique words in the text
There are 35742 lines in the text
On average, there are 7.952548822114039 words per line
There are 9 different scripts in the text

The text contains the scripts for the titles:
  - Pygmalion
  - Major Barbara
  - Saint Joan
  - Arms and the Man
  - Man And Superma
  - Mrs. Warren’s Profession
  - Heartbreak House
  - Caesar and Cleopatra
  - You Never Can Tell


In [122]:
# preprocess raw text

def text_special_characters(text):
    '''
    Collect the characters that should be treated as special symbols.

    Every distinct character of ``text`` that is not an ASCII letter, a
    whitespace character or a colon is gathered and merged with the ASCII
    punctuation from ``string.punctuation`` (which itself contains ':', so
    the colon is still part of the result).

    :param text: string to scan; may be empty
    :return: sorted list of unique single-character strings
    '''
    # drop ASCII letters, whitespace and ':' -- whatever is left is "special"
    leftover_chars = re.sub(r'[a-zA-Z\s:]', '', ''.join(set(text)))

    # always include ASCII punctuation, even when it is absent from the text
    punctuations = set(string.punctuation)
    punctuations.update(leftover_chars)

    # sort so the result does not depend on set iteration order
    return sorted(punctuations)


# load the special-character -> token mapping (and its inverse) from a JSON file

def special_characters_json(filepath):
    '''
    Read a JSON object from ``filepath`` and return it together with its inverse.

    :param filepath: path to a UTF-8 encoded JSON file holding a single object
    :return: tuple (char2token, token2char), where token2char maps every
             value of char2token back to its key
    '''
    with open(filepath, encoding='utf8') as json_file:
        char2token = json.load(json_file)

    # invert the mapping: each value becomes a key pointing at its original key
    token2char = dict((value, key) for key, value in char2token.items())
    return char2token, token2char

# tokenize special characters
def tokenize_special_characters(text, special2token=None):
    '''
    Replace special characters with space-padded named tokens.

    :param text: string to process
    :param special2token: optional dict mapping each special character (or
        character sequence) to its replacement token; when omitted, the
        mapping is loaded from 'character_dictionary.json' through
        special_characters_json
    :return: the text with every special character replaced by ' token '
        and every run of whitespace collapsed to a single space
    '''
    if special2token is None:
        # load the special characters to tokenize
        special2token, _ = special_characters_json('character_dictionary.json')

    # longest keys first so multi-character specials win over their prefixes;
    # empty keys are skipped because they would match at every position
    specials = sorted((key for key in special2token if key), key=len, reverse=True)

    if specials:
        # a single regex pass never re-scans text inserted by a replacement,
        # so a token that itself contains a special character stays intact
        pattern = re.compile('|'.join(re.escape(key) for key in specials))
        text = pattern.sub(lambda match: f' {special2token[match.group(0)]} ', text)

    # replace multiple whitespaces with single whitespace
    text = re.sub(r"\s+", " ", text)

    return text

# tokenize the raw corpus (the function defined above is
# tokenize_special_characters; `replace_special_characters` does not exist)
text = tokenize_special_characters(raw)

# persist the tokenized corpus; written as UTF-8 to match how the corpus was
# read, instead of relying on the platform's default encoding
# NOTE(review): whatever reads this file (e.g. ShawsDataset) should open it
# as UTF-8 as well -- confirm
tokenized_path = 'data/tokenized_scripts.txt'
with open(tokenized_path, "w", encoding="utf8") as tokenized_file:
    tokenized_file.write(text)

!)—2]“[6”8ïæ5_"?î½0’1-}'9.,3ôÉé;74{çè(&


In [None]:
import torch.nn as nn

class ShawsLSTM(nn.Module):
    """
    Sequence model made of an embedding layer, a stacked LSTM and a linear
    output layer. The forward pass returns scores for the last time step only.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """
        Initialize the PyTorch RNN Module
        :param vocab_size: The number of input dimensions of the neural network (the size of the vocabulary)
        :param output_size: The number of output dimensions of the neural network
        :param embedding_dim: The size of the embeddings
        :param hidden_dim: The size of the hidden layer outputs
        :param n_layers: The number of stacked LSTM layers
        :param dropout: dropout to add in between LSTM layers
        """
        super().__init__()

        # sizes needed later by forward() and init_hidden()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.output_size = output_size

        # define the embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # define the LSTM layer; nn.LSTM applies dropout only between stacked
        # layers, so it is disabled for a single layer (where it has no effect
        # and only triggers a warning)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)

        # define fully-connected layer
        self.dense = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network
        :param nn_input: The input to the neural network, indexed as (batch, sequence)
        :param hidden: The hidden state, e.g. as produced by init_hidden
        :return: Two Tensors, the output of the neural network for the last
                 time step, of shape (batch, output_size), and the latest hidden state
        """
        # ensure embedding layer gets a LongTensor input
        nn_input = nn_input.long()

        embed = self.embedding(nn_input)
        output, state = self.lstm(embed, hidden)

        # batch_first=True, so the LSTM output is (batch, sequence, hidden_dim);
        # keep only the last time step before the dense layer. The result width
        # is output_size (the dense layer's size), not vocab_size.
        last_step = output[:, -1, :]
        output = self.dense(last_step)

        # return one batch of output scores and the hidden state
        return output, state

    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of an LSTM
        :param batch_size: The batch_size of the hidden state
        :return: tuple (hidden state, cell state), each a zero tensor of dims
                 (n_layers, batch_size, hidden_dim)
        '''
        # new_zeros creates the tensors with the dtype and device of the
        # model's parameters, so the state always lives where the model does
        # (call this after moving the model to its device)
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)

        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [97]:
# data-loading configuration
SEQUENCE_LENGTH = 10
BATCH_SIZE = 1

# build the dataset from the tokenized corpus and wrap it in a loader
dataset = ShawsDataset(tokenized_path, SEQUENCE_LENGTH)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)

In [98]:
# take the first batch the loader yields
sentence, target = next(iter(dataloader))

# convert both tensors to numpy arrays and squeeze out singleton dimensions
sentence = sentence.numpy().squeeze()
target = target.numpy().squeeze()

In [63]:
# from torch.utils.data import Dataset
# from collections import Counter
# from gensim.utils import tokenize
# from gensim.corpora.dictionary import Dictionary

# class Test(Dataset):
#     '''
#     Creates a custom PyTorch Dataset class
#     args:
#         filepath: string, path to text (UTF8)
#         sequence_length: integer, sequence length
#     '''
#     def __init__(self, filepath, sequence_length):
#         super(Test, self).__init__()
#         self.filepath = filepath
#         self.sequence_length = sequence_length
        
#         self.words = self.load_text()        
#         self.tokens = list(tokenize(self.words, token_pattern='\S+'))
#         self.token_dict = Dictionary([self.tokens])
        
#         self.words_indexes = [self.token_dict.token2id[token] for token in self.tokens]
    
#     def load_text(self):
#         with open(self.filepath, "r") as line:
#             text = line.read()
            
#         # replace multiple whitespaces with single whitespace
#         text = re.sub(r"\s+", " ", text)
#         return text

#     def __len__(self):
#         return len(self.words_indexes) - self.sequence_length

#     def __getitem__(self, index):
#         return (
#             torch.tensor(self.words_indexes[index : index+self.sequence_length]),
#             torch.tensor(self.words_indexes[index+self.sequence_length]),
#         )

In [121]:
# decode the sampled sentence: vocabulary index -> word, then named token ->
# original special character

# token -> special character mapping (second value returned by the loader)
_, token2special = special_characters_json('character_dictionary.json')

index = dataset.index_to_word
new = []
for word_id in sentence:
    word = index[word_id]
    # swap a special-character token for its character; other words are
    # kept unchanged
    new.append(token2special.get(word, word))

print(' '.join(new))

Title : Pygmalion 
 ACT I 
 
 
 Covent


In [89]:
# tokenizer experiment on the first 200 characters of the corpus
work_sentence = re.sub(r"\s+", " ", raw[:200])

punctuations = string.punctuation
token_boundaries = [' ', '-']
delimiter_token = '<SPLIT>'

# pad every ASCII punctuation mark with spaces so it becomes its own token
for punctuation in punctuations:
    work_sentence = work_sentence.replace(punctuation, " " + punctuation + " ")

# turn every boundary character into the delimiter; '-' is itself a boundary,
# so hyphens are consumed here rather than kept as tokens
for delimiter in token_boundaries:
    work_sentence = work_sentence.replace(delimiter, delimiter_token)

# split once, after all boundaries have been replaced, dropping empty pieces
tokens = [x.strip() for x in work_sentence.split(delimiter_token) if x != '']

tokens

['Title',
 ':',
 'Pygmalion',
 'ACT',
 'I',
 'Covent',
 'Garden',
 'at',
 '11',
 '.',
 '15',
 'p',
 '.',
 'm',
 '.',
 'Torrents',
 'of',
 'heavy',
 'summer',
 'rain',
 '.',
 'Cab',
 'whistles',
 'blowing',
 'frantically',
 'in',
 'all',
 'directions',
 '.',
 'Pedestrians',
 'running',
 'for',
 'shelter',
 'into',
 'the',
 'market',
 'and',
 'under',
 'the',
 'po']