# Shaw's Last Act

In [1]:
import json
import os
import re

import numpy as np
import torch
import torch.functional as functional
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm

from ShawsDataset import ShawsDataset

In [2]:
# read the complete script collection into memory as one string
original_text_path = 'data/original_scripts.txt'
with open(original_text_path, mode="r", encoding="utf8") as script_file:
    raw = script_file.read()

In [3]:
# text statistics

# every distinct character that occurs in the raw text
unique_chars = set(raw)
print(f'There are {len(unique_chars)} unique characters in the text')

# "words" are approximated by splitting on single spaces only
space_separated = raw.split(' ')
n_words = len(space_separated)
print(f'There are approximately {n_words} words in the text')

n_unique_words = len(set(space_separated))
print(f'There are approximately {n_unique_words} unique words in the text')

# one more line than there are newline characters
n_lines = raw.count('\n') + 1
print(f'There are {n_lines} lines in the text')

print(f'On average, there are {n_words / n_lines} words per line')

# every line fragment that starts at "Title:" marks one script
title_lines = re.findall('Title:.*\n', raw)
titles = [match.replace('\n', '').replace('Title: ', '') for match in title_lines]
print(f'There are {len(titles)} different scripts in the text\n')
print('The text contains the scripts for the titles:', *titles, sep='\n  - ')

There are 95 unique characters in the text
There are approximately 284240 words in the text
There are approximately 49976 unique words in the text
There are 35742 lines in the text
On average, there are 7.952548822114039 words per line
There are 9 different scripts in the text

The text contains the scripts for the titles:
  - Pygmalion
  - Major Barbara
  - Saint Joan
  - Arms and the Man
  - Man And Superma
  - Mrs. Warren’s Profession
  - Heartbreak House
  - Caesar and Cleopatra
  - You Never Can Tell


In [4]:
# preprocess raw text
def text_special_characters(text):
    '''
    Returns the special characters that occur in a text
    inputs:
        text: string, the text to inspect
    returns:
        string, every distinct character of `text` that is not an ASCII letter,
        a whitespace character or a colon, sorted by code point
    '''
    # sort the distinct characters so the result is identical on every run
    # (plain set iteration order changes between interpreter sessions)
    unique_chars = ''.join(sorted(set(text)))

    # remove letters, whitespace and colons; the pattern is a raw string so that
    # `\s` is not treated as an (invalid) string escape, and it no longer
    # contains a literal `+`, which previously dropped plus signs by accident
    unique_chars = re.sub(r'[a-zA-Z\s:]', '', unique_chars)

    return unique_chars

def special_characters_json(filepath):
    '''
    Loads a JSON dictionary from disk and builds its inverse
    inputs:
        filepath: string, path to a UTF-8 encoded JSON file holding one object
    returns:
        tuple (char2token, token2char): the mapping exactly as stored in the
        file, and the same mapping with keys and values swapped
    '''
    with open(filepath, encoding='utf8') as json_file:
        char2token = json.load(json_file)

    # swap keys and values to get the reverse lookup
    token2char = dict((value, key) for key, value in char2token.items())
    return char2token, token2char

# tokenize special characters
def tokenize_special_characters(text, dictionary_path='character_dictionary.json'):
    '''
    Replaces special characters in a text with space-separated tokens
    inputs:
        text: string, the text to tokenize
        dictionary_path: string, path to the JSON file that maps each special
            character to its token (default= 'character_dictionary.json')
    returns:
        string, the text with every listed special character replaced by
        ' <token> ' and every run of whitespace collapsed to a single space
    '''
    # load the special characters to tokenize (the inverse mapping is not needed here)
    special2token, _ = special_characters_json(dictionary_path)

    # replace special characters with the new tokens; the padding spaces make
    # each token a separate whitespace-delimited word
    for special, token in special2token.items():
        text = text.replace(special, f' {token} ')

    # replace multiple whitespaces (including newlines) with single whitespace
    text = re.sub(r"\s+", " ", text)

    return text

text = tokenize_special_characters(raw)

path_tokenized = 'data/tokenized_scripts.txt'
# write with the same explicit encoding the source text was read with, so the
# result does not depend on the platform's default encoding
# NOTE(review): confirm that ShawsDataset reads this file as UTF-8 as well
with open(path_tokenized, "w", encoding="utf8") as tokenized_file:
    tokenized_file.write(text)

JSONDecodeError: Expecting property name enclosed in double quotes: line 29 column 1 (char 690)

In [None]:
# use the GPU when PyTorch can see one, otherwise stay on the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [None]:
import torch.nn as nn

class ShawsLSTM(nn.Module):
    '''
    Sequence model made of an embedding layer, a (stacked) LSTM and a
    fully-connected layer. For every input sequence it returns one score
    vector, computed from the LSTM output at the last time step.
    '''

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        '''
        Initialize the PyTorch RNN Module
        inputs:
            vocab_size: integer, number of input dimensions (the size of the vocabulary)
            output_size: integer, number of output dimensions (the size of the vocabulary)
            embedding_dim: integer, word embedding dimensions
            hidden_dim: integer, number hidden layer output nodes
            n_layers: integer, number of stacked LSTM layers
            dropout: float, range between 0 and 1 to describe the chance of LSTM dropout layer (default= 0.5)
        '''
        super(ShawsLSTM, self).__init__()

        # sizes needed later to build the hidden state
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.output_size = output_size

        # define the embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # define the LSTM layer; batch_first=True means inputs and outputs are
        # laid out as (batch, sequence, features)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=dropout, batch_first=True)

        # define fully-connected layer
        self.dense = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        '''
        Returns the model output and the latest hidden state as Tensors
        inputs:
            nn_input: model inputs, a (batch, sequence) tensor of indices
            hidden: the last hidden state, a tuple (hidden state, cell state)
        returns:
            tuple (output, state): output has shape (batch, output_size) and
            holds the scores for the last time step of every sequence; state is
            the (hidden state, cell state) tuple returned by the LSTM
        '''
        # ensure embedding layer gets a LongTensor input
        nn_input = nn_input.long()

        embed = self.embedding(nn_input)
        output, state = self.lstm(embed, hidden)

        # Only the last time step is returned, so select it before the dense
        # layer. This gives the same scores as projecting every step and
        # slicing afterwards, but it no longer depends on a batch size stored
        # by init_hidden() and it is correct when output_size != vocab_size.
        output = self.dense(output[:, -1])

        # return one batch of output word scores and the hidden state
        return output, state

    def init_hidden(self, batch_size, device=None):
        '''
        Initialize the hidden state of an LSTM in the shape (n_layers, batch_size, hidden_dim)
        inputs:
            batch_size: integer, the batch_size of the hidden state
            device: device the state is created on; when omitted, the device of
                the model parameters is used
        returns:
            tuple (hidden state, cell state), both filled with zeros
        '''
        # a parameter tensor supplies the dtype (and the default device)
        weight = next(self.parameters()).detach()
        # kept for code that reads `batch_size` from the model
        self.batch_size = batch_size

        if device is None:
            device = weight.device

        shape = (self.n_layers, batch_size, self.hidden_dim)

        # zeros for the hidden state and the cell state of the LSTM
        hidden = (weight.new_zeros(shape, device=device),
                  weight.new_zeros(shape, device=device))

        return hidden

In [None]:
def backpropagation(rnn, optimizer, criterion, inputs, target, hidden, device):
    '''
    Completes the forward and backward propagation for one batch, and
    returns the train loss and the final hidden state
    inputs:
        rnn: ShawsLSTM instance, PyTorch class
        optimizer: torch.optim, PyTorch optimizer
        criterion: loss function class, PyTorch (or custom) loss function
        inputs: torch Tensor, a batch of input to the neural network
        target: torch Tensor, the target output for the batch of inputs
        hidden: tuple of torch Tensors, the hidden state from the previous batch
        device: torch.device, the device the batch is processed on
    returns:
        tuple (loss, hidden): the batch loss as a Python float and the hidden
        state produced by the model
    '''

    # move model to GPU, if available
    rnn.to(device)

    # move data to GPU, if available
    inputs, target = inputs.to(device), target.to(device)

    # detach the hidden states to prevent backprop through entire training history
    hidden = tuple(hid.detach().to(device) for hid in hidden)

    # zero accumulated gradients
    rnn.zero_grad()

    # get the output and hidden state from the model
    output, hidden = rnn(inputs, hidden)

    # Flatten to (samples, classes) and (samples,). A bare squeeze() would also
    # drop the batch dimension when the batch holds a single sample, which the
    # loss function then rejects.
    scores = output.reshape(-1, output.size(-1))
    loss = criterion(scores, target.long().reshape(-1))

    # perform backpropagation
    loss.backward()

    # clip to prevent gradients from becoming too large before optimizing
    nn.utils.clip_grad_value_(rnn.parameters(), 4)
    optimizer.step()

    # return the loss over a batch and the hidden state produced by our model
    return loss.item(), hidden

In [None]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100,
              loader=None, device=None):
    '''
    Trains the model for a number of epochs and returns it
    inputs:
        rnn: ShawsLSTM instance, the model to train
        batch_size: integer, the batch size the data loader was built with
        optimizer: torch.optim, PyTorch optimizer
        criterion: loss function class, PyTorch (or custom) loss function
        n_epochs: integer, number of passes over the data
        show_every_n_batches: integer, print the average loss after this many batches (default= 100)
        loader: DataLoader yielding (inputs, labels) batches; when omitted, the
            notebook-level variable `train_loader` is used
        device: torch.device to train on; when omitted, CUDA is used if available, else the CPU
    returns:
        the trained rnn
    '''
    if loader is None:
        # fall back to the notebook-level loader this function used to rely on
        loader = train_loader
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # only completely full batches are used, because the hidden state is
    # created for exactly `batch_size` sequences
    n_batches = len(loader.dataset) // batch_size

    batch_losses = []

    # switch to training mode; train() returns the module itself and cannot be
    # used as a context manager
    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in tqdm(range(1, n_epochs + 1)):

        # initialize hidden state
        hidden = rnn.init_hidden(batch_size, device)

        for batch_i, (inputs, labels) in enumerate(loader, 1):

            # stop before a trailing, partially filled batch
            if batch_i > n_batches:
                break

            # forward, back prop
            loss, hidden = backpropagation(rnn, optimizer, criterion, inputs, labels, hidden, device)

            # record loss
            batch_losses.append(loss)

            # printing loss stats
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

    # returns a trained rnn
    return rnn

In [None]:
BATCH_SIZE = 1
SEQUENCE_LENGTH = 10
# `path_tokenized` is the tokenized text file written by the preprocessing cell
# (the name `tokenized_path` used here before is not defined anywhere)
dataset = ShawsDataset(path_tokenized, SEQUENCE_LENGTH)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)

In [None]:
# take the first batch and turn both tensors into squeezed numpy arrays
first_batch = next(iter(dataloader))
sentence, target = (tensor.numpy().squeeze() for tensor in first_batch)

In [None]:
# inspect the squeezed input array taken from the first batch
sentence

In [None]:
# from torch.utils.data import Dataset
# from collections import Counter
# from gensim.utils import tokenize
# from gensim.corpora.dictionary import Dictionary

# class Test(Dataset):
#     '''
#     Creates a custom PyTorch Dataset class
#     args:
#         filepath: string, path to text (UTF8)
#         sequence_length: integer, sequence length
#     '''
#     def __init__(self, filepath, sequence_length):
#         super(Test, self).__init__()
#         self.filepath = filepath
#         self.sequence_length = sequence_length
        
#         self.words = self.load_text()        
#         self.tokens = list(tokenize(self.words, token_pattern='\S+'))
#         self.token_dict = Dictionary([self.tokens])
        
#         self.words_indexes = [self.token_dict.token2id[token] for token in self.tokens]
    
#     def load_text(self):
#         with open(self.filepath, "r") as line:
#             text = line.read()
            
#         # replace multiple whitespaces with single whitespace
#         text = re.sub(r"\s+", " ", text)
#         return text

#     def __len__(self):
#         return len(self.words_indexes) - self.sequence_length

#     def __getitem__(self, index):
#         return (
#             torch.tensor(self.words_indexes[index : index+self.sequence_length]),
#             torch.tensor(self.words_indexes[index+self.sequence_length]),
#         )

In [None]:
# translate the first input sequence back into readable text
index = dataset.index_to_word

# token -> special character lookup (second element returned by
# special_characters_json); this name was previously never defined
token2special = special_characters_json('character_dictionary.json')[1]

new = []
for word in sentence:
    word = index[word]

    # replace punctuation tokens by their character, keep every other word
    word = token2special.get(word, word)

    new.append(word)

print(' '.join(new).strip())

In [None]:
import string

# work on a short excerpt with all whitespace runs collapsed to single spaces
work_sentence = raw[:200]
work_sentence = re.sub(r"\s+", " ", work_sentence)

punctuations = string.punctuation
token_boundaries = [' ', '-']
delimiter_token = '<SPLIT>'

# surround every punctuation mark with spaces so it becomes a token of its own
for punctuation in punctuations:
    work_sentence = work_sentence.replace(punctuation, " " + punctuation + " ")

# turn every token boundary into the same delimiter
for delimiter in token_boundaries:
    work_sentence = work_sentence.replace(delimiter, delimiter_token)

# split once, after all boundaries have been replaced, and drop empty pieces
tokens = [x.strip() for x in work_sentence.split(delimiter_token) if x != '']

tokens

In [None]:
def text_special_characters(text):
    '''
    Returns the special characters that occur in a text
    (this cell redefines the function of the same name from the preprocessing cell)
    inputs:
        text: string, the text to inspect
    returns:
        string, every distinct character of `text` that is not an ASCII letter,
        a whitespace character or a colon, sorted by code point
    '''
    # sort the distinct characters so the result is identical on every run
    # (plain set iteration order changes between interpreter sessions)
    unique_chars = ''.join(sorted(set(text)))

    # remove letters, whitespace and colons; the pattern is a raw string so that
    # `\s` is not treated as an (invalid) string escape
    unique_chars = re.sub(r'[a-zA-Z\s:]', '', unique_chars)

    return unique_chars

# characters of the raw text that text_special_characters() keeps
a = text_special_characters(raw)

# number them: {position in the string: character}
dict(enumerate(a))

In [None]:
class Classic:
    '''
    Scratch class for trying out the "attribute must exist before use" check:
    forward() only succeeds after inital() has stored a batch size.
    '''

    def __init__(self):
        self.mine = 0

    def inital(self, batch_size):
        '''Store the batch size on the instance.'''
        self.batch_size = batch_size

    def forward(self):
        '''Return 1; fails the assertion when no batch size has been stored yet.'''
        assert hasattr(self, "batch_size"), 'Initalize hidden weights first!'
        return 1
        

In [None]:
# fresh instance; __init__ does not set batch_size
c = Classic()

In [None]:
# store batch_size = 6 on the instance so that forward() passes its assertion
c.inital(6)

In [None]:
# returns 1 when batch_size has been set; otherwise the assertion in forward() fails
c.forward()

In [None]:
# check whether a plain string compares equal to a torch.device object
'cpu' == torch.device('cpu')


In [None]:
from tqdm import tqdm

# try out a tqdm progress bar (ascii=True) around a single-iteration outer loop
for i in tqdm(range(1), ascii=True):
    str(i)  # result is discarded
    for n in range(100):
        float(n)  # result is discarded; placeholder work inside the loop

In [None]:
# list the attribute names available on tqdm
dir(tqdm)
#help(tqdm.in_place)