# Shaw's Last Act

In [92]:
import collections
import os
import re
import string

import torch
import torch.nn as nn
import torch.functional as functional
from torch.utils.data import Dataset, DataLoader

In [93]:
# load text
# Location of the raw script text, relative to the notebook's working directory.
path = 'data/original_scripts.txt'

# Read the whole file into memory as one string; the context manager closes
# the handle as soon as the read finishes (or fails).
with open(path, "r", encoding="utf8") as script_file:
    raw = script_file.read()

In [94]:
# text statistics

# Size of the character-level vocabulary.
unique_chars = len(set(raw))
print(f'There are {unique_chars} unique characters in the text')

# Word counts are approximate: splitting on a single space leaves punctuation
# attached to words and does not separate words joined only by a newline.
space_tokens = raw.split(' ')

n_words = len(space_tokens)
print(f'There are approximately {n_words} words in the text')

n_unique_words = len(set(space_tokens))
print(f'There are approximately {n_unique_words} unique words in the text')

n_lines = len(raw.split('\n'))
print(f'There are {n_lines} lines in the text')

print(f'On average, there are {n_words / n_lines} words per line')

# Every "Title:..." match is counted as one script; strip the marker and the
# trailing newline so only the title itself remains.
titles = [
    header.replace('\n', '').replace('Title: ', '')
    for header in re.findall('Title:.*\n', raw)
]
print(f'There are {len(titles)} different scripts in the text\n')
print('The text contains the scripts for the titles:', *titles, sep='\n  - ')

There are 95 unique characters in the text
There are approximately 256334 words in the text
There are approximately 45243 unique words in the text
There are 31278 lines in the text
On average, there are 8.195344970906069 words per line
There are 8 different scripts in the text

The text contains the scripts for the titles:
  - Pygmalion
  - Major Barbara
  - Saint Joan
  - Arms and the Man
  - Man And Superman
  - Mrs. Warren’s Profession
  - Heartbreak House
  - Caesar and Cleopatra


In [95]:
# identify special characters to replace with word tokens
# All ASCII punctuation plus tab and newline. The variable keeps its original
# spelling because other cells may reference it by this name.
special_characther = list(string.punctuation)
special_characther.extend(['\t', '\n'])

# Symbol -> placeholder word token. Every symbol needs its own distinct token;
# two symbols sharing a token cannot be told apart when mapping tokens back.
tokens_dict = {
    '.': '||period||',
    # NOTE(review): presumably meant '||comma||'; left unchanged in case saved
    # vocabularies or preprocessed data already contain this token - confirm.
    ',': '||come||',
    '"': '||doublequote||',
    "'": '||singlequote||',
    ':': '||colon||',
    ';': '||semicolon||',
    '!': '||exclamation||',
    '?': '||questionmark||',
    '(': '||lparenth||',
    ')': '||rparenth||',
    # Previously mapped to '||rparenth||', which collided with ')'.
    '&': '||ampersand||',
    # '--' is listed before '-' so that a replace pass walking this dict in
    # insertion order sees the double dash before its characters are consumed
    # by the single-dash rule (assumes in-order replacement - verify against
    # the cell that applies this mapping).
    '--': '||dashdash||',
    '-': '||dash||',
    '\n': '||newline||',
    '\t': '||tab||'
}

In [96]:
def create_lookup_tables(text):
    '''
    Create lookup tables and return a tuple of dicts (vocab_to_int, int_to_vocab).

    Ids are assigned by descending frequency, so the most common word gets
    id 0; words with equal counts keep the order in which they first appear.

    inputs:
        text: list, text split into words

    returns:
        (vocab_to_int, int_to_vocab): word -> id and id -> word dicts.
        Both are empty when `text` is empty.
    '''
    # Count and sort the corpus (most frequent first)
    word_counts = collections.Counter(text)
    sorted_counts = word_counts.most_common()

    # create the look up dictionaries; the second one is the exact inverse
    # of the first, so the two always agree
    int_to_vocab = {idx: word for idx, (word, _count) in enumerate(sorted_counts)}
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}

    return (vocab_to_int, int_to_vocab)