# Welcome to the Modern English -> Shakespeare Translator
To get things started click on the code cell below and press shift + enter to run it. Be sure the cell finishes running before you try to move on. A number in brackets will appear in the upper left corner of the cell when it is finished.

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import matplotlib.pyplot as plt
import requests
from unicodedata import normalize
import tensorflow as tf
from mpl_toolkits.mplot3d import Axes3D
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas
import nltk
from collections import Counter
nltk.download('punkt')
print('Great Job!\nYou\'re on your way to becoming a data scientist' )
#from google.colab import drive
#drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Great Job!
You're on your way to becoming a data scientist


# Run the below cell to wipe all collected data
If you don't know what you're doing this cell is not for you. Skip to the next section. 

In [None]:
with open('Modern.txt', 'w') as f:
    f.write('')
    
with open('Original.txt', 'w') as f:
    f.write('')

# The next three cells handle sending webserver requests and saving our precious data
If you would like to collect data make sure to run these cells!
If you are only here to use the translator (*i.e. you were supplied with data*) this section can be ignored.

In [2]:
def try_connection(curr_url, headers=None): 
    try:
        if headers:
            response = requests.get(curr_url, headers=headers)
        else:
            response = requests.get(curr_url)
    except Exception as e:
        self.elog(e,'try_connection')
        print('Connection Error With Request: ...')
        return None    
    for i in range(15):
        if response.status_code == 200:
            return response
        elif response.status_code == 404:
            return response       
        print("Possible Bad Connection. Retrying in 1 min")
        time.sleep(60)
        try:
            if headers:
                response = requests.get(curr_url, headers=headers)
            else:
                response = requests.get(curr_url)
        except Exception as e:
            return None
    print(response.status_code)   
    print("Maximum Attempts used: returning response as None")   
    return None

In [3]:
def scrape_text(url, extensions=['']):
    punctuation = ',/:;-\"!?()1234567890'
    for ext in extensions:
        raw = try_connection(url + str(ext))
        print(f'Request for {url + str(ext)} result: {raw}')
        soup = BeautifulSoup(raw.content, 'html.parser')
        results = soup.find_all('td')
        cntr = 0
        for item in results:
            if cntr:
                mod_text = normalize('NFD', item.get_text()).encode('ascii', 'ignore').decode('UTF-8')
                tokens = []
                for word in mod_text.split():
                    tokens.append(''.join(str(letter.lower()) for letter in word if letter not in punctuation))
                with open('Modern.txt', 'a+') as f:
                    f.write(' '.join(tokens) + '$')
                cntr -= 1
            else:
                orig_text = normalize('NFD', item.get_text()).encode('ascii', 'ignore').decode('UTF-8')
                tokens = []
                for word in orig_text.split():
                    tokens.append(''.join(str(letter.lower()) for letter in word if letter not in punctuation))
                with open('Original.txt', 'a+') as f:
                    f.write(' '.join(tokens) + '$')
                cntr += 1

In [None]:
#Mid Summer Night's Dream
extensions = np.linspace(2, 146, 73, dtype=int)
scrape_text('https://www.sparknotes.com/nofear/shakespeare/msnd/page_', extensions=extensions)

#Richard iii
extensions = np.linspace(2, 342, 171, dtype=int)
scrape_text('https://www.sparknotes.com/nofear/shakespeare/richardiii/page_', extensions=extensions)

#Romeo and Juliet
extensions = np.linspace(2, 260, 130, dtype=int)
scrape_text('https://www.sparknotes.com/nofear/shakespeare/romeojuliet/page_', extensions=extensions)


# This section reads collected data from the target files and organizes it for the model
In order for the model to work These cells must be run. If you forgot how to run just click on the cell and press Shift + Enter. Be sure to wait untill the previous cells finish before running another.

In [2]:
def get_clean_data(file1='Modern.txt', file2='Original.txt'):
  with open(file1, 'r') as f:
    modern_phrases = f.read().split('$')
  with open(file2, 'r') as f:
    original_phrases = f.read().split('$')
  return (modern_phrases, original_phrases)

In [26]:
def compile_encoding(corpus):
  lookup = {}
  encodings = []
  length = 1
  for phrase in corpus:
    words = word_tokenize(phrase)
    enc = []
    for i in range(330):
      if i >= len(words):
        enc.append(0)
      elif words[i] in lookup:
        enc.append(lookup[words[i]])
      else:
        lookup[words[i]] = length
        enc.append(length)
        length += 1
    encodings.append(np.array([enc]))
  return (np.array(encodings), lookup)

def encode_phrase(phrase, lookup):
  words = word_tokenize(phrase)
  print(f'Tokenization {words}')
  enc = np.zeros((1,330))
  if len(words) > 330:
    print('Oversized Input Exiting (max 330 words)')
    return
  for i, word in enumerate(words):
    try:
      enc[0][i] = lookup[word]
    except:
      print(f'{word} is not recognized')
  return enc

In [4]:
modern, original = get_clean_data()
modern_data = compile_encoding(modern)
original_data = compile_encoding(original)

print(f'{len(modern_data[0])} Samples loaded')
print(f'{len(modern_data[1])} Total Modern Words found')
print(f'{len(original_data[1])} Total original Words found')


max = 0
ind = -1
for i, phrase in enumerate(modern_data[0]):
  if phrase[0].shape[0] >= max:
    max = phrase[0].shape[0]
    ind = i
print(f'{ind} Longest modern phrase {max} (by words)')

max = 0
ind = -1
for i, phrase in enumerate(original_data[0]):
  if phrase[0].shape[0] >= max:
    max = phrase[0].shape[0]
    ind = i
print(f'{ind} Longest orignal phrase {max} (by words)')

3238 Samples loaded
6156 Total Modern Words found
7021 Total original Words found
3237 Longest modern phrase 330 (by words)
3237 Longest orignal phrase 330 (by words)


# This cell contains the models architecture and training cell
Again these three cells are for Training the model and need not be tampered with

In [None]:
def model(input_shape=330):
  model = keras.Sequential()
  model.add(LSTM(units=500, return_sequences = True, input_shape = (1, input_shape)))
  model.add(LSTM(units=500, return_sequences=True))
  model.add(LSTM(units=500, return_sequences=True))
  model.add(LSTM(units=500, return_sequences=True))
  model.add(Dense(units=330))
  adam = Adam(learning_rate=1e-2)
  model.compile(optimizer='adam', loss='mean_squared_error')
  model.summary()
  return model

In [9]:
translator = load_model('drive/My Drive/Models/LSTM-v2-1.h5')

In [None]:
input_data = np.array(modern_data[0])
print(input_data.shape)
translator.fit(input_data, original_data[0], epochs=2000)
translator.save('drive/My Drive/Models/LSTM-v2-1.h5')

### This Cell is where the Translations happen
To try a translation run the below cell, enter a phrase in the box and press enter. Make sure to only use lowercase and no punctuation allowed.

In [28]:
translator = load_model('drive/My Drive/Models/LSTM-v2.h5')
phrase = input('Enter your modern phrase here: ')
encoding = encode_phrase(phrase, modern_data[1])
test_data = np.array([encoding])
prediction = translator.predict(test_data)
s = ''
for num in prediction[0][0]:
  c = [key for key, value in original_data[1].items() if value == int(num)]
  if len(c) > 0:
    s += c[0] + ' '
print(f'Translation: {s}')

Enter your modern phrase here: this is lysander
Tokenization ['this', 'is', 'lysander']
Translation: roaring daughter than turn bosoms judgment quickly old mans nuptial others revenue wanes turn long revenue lingers wanes lysander be philostrate steep our shall bring philostrate and lingers a wanes theseus text happy shall nuptial solemnities wanes my four withering on draws four to stir days draws dowager of enter now oh oh now . heaven text philostrate methinks happy draws fair oh stepdame theseus and nuptial theseus now fair apace our she time or happy hippolyta but nuptial in . hippolyta theseus apace text happy apace theseus now philostrate on but and now hippolyta others text philostrate and others now text with others now philostrate text theseus text now now original now philostrate now original hippolyta others now now now now now now now text now enter now now now theseus now now text now hippolyta original now and others now now now now now now text now now now hippolyta ori