# Welcome to the Modern English -> Shakespeare translator
To get things started click on the code cell below and press shift + enter to run it

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import matplotlib.pyplot as plt
import requests
from unicodedata import normalize
import tensorflow as tf
from mpl_toolkits.mplot3d import Axes3D
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas
import nltk
from collections import Counter
nltk.download('punkt')
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Run the below cell to wipe all collected data
If you don't know what you're doing this cell is not for you.

In [None]:
with open('Modern.txt', 'w') as f:
    f.write('')
    
with open('Original.txt', 'w') as f:
    f.write('')

# The next three cells handle sending webserver requests and saving our precious data
If you would like to collect data make sure to run these cells!
If you are only here to use the translator this section can be ignored.

In [2]:
def try_connection(curr_url, headers=None): 
    try:
        if headers:
            response = requests.get(curr_url, headers=headers)
        else:
            response = requests.get(curr_url)
    except Exception as e:
        self.elog(e,'try_connection')
        print('Connection Error With Request: ...')
        return None    
    for i in range(15):
        if response.status_code == 200:
            return response
        elif response.status_code == 404:
            return response       
        print("Possible Bad Connection. Retrying in 1 min")
        time.sleep(60)
        try:
            if headers:
                response = requests.get(curr_url, headers=headers)
            else:
                response = requests.get(curr_url)
        except Exception as e:
            return None
    print(response.status_code)   
    print("Maximum Attempts used: returning response as None")   
    return None

In [3]:
def scrape_text(url, extensions=['']):
    punctuation = ',/:;-\"!?()1234567890'
    for ext in extensions:
        raw = try_connection(url + str(ext))
        print(f'Request for {url + str(ext)} result: {raw}')
        soup = BeautifulSoup(raw.content, 'html.parser')
        results = soup.find_all('td')
        cntr = 0
        for item in results:
            if cntr:
                mod_text = normalize('NFD', item.get_text()).encode('ascii', 'ignore').decode('UTF-8')
                tokens = []
                for word in mod_text.split():
                    tokens.append(''.join(str(letter.lower()) for letter in word if letter not in punctuation))
                with open('Modern.txt', 'a+') as f:
                    f.write(' '.join(tokens) + '$')
                cntr -= 1
            else:
                orig_text = normalize('NFD', item.get_text()).encode('ascii', 'ignore').decode('UTF-8')
                tokens = []
                for word in orig_text.split():
                    tokens.append(''.join(str(letter.lower()) for letter in word if letter not in punctuation))
                with open('Original.txt', 'a+') as f:
                    f.write(' '.join(tokens) + '$')
                cntr += 1

In [4]:
#Mid Summer Night's Dream
extensions = np.linspace(2, 146, 73, dtype=int)
scrape_text('https://www.sparknotes.com/nofear/shakespeare/msnd/page_', extensions=extensions)

#Richard iii
extensions = np.linspace(2, 342, 171, dtype=int)
scrape_text('https://www.sparknotes.com/nofear/shakespeare/richardiii/page_', extensions=extensions)

#Romeo and Juliet
extensions = np.linspace(2, 260, 130, dtype=int)
scrape_text('https://www.sparknotes.com/nofear/shakespeare/romeojuliet/page_', extensions=extensions)


Request for https://www.sparknotes.com/nofear/shakespeare/msnd/page_2 result: <Response [200]>
Request for https://www.sparknotes.com/nofear/shakespeare/msnd/page_4 result: <Response [200]>
Request for https://www.sparknotes.com/nofear/shakespeare/msnd/page_6 result: <Response [200]>
Request for https://www.sparknotes.com/nofear/shakespeare/msnd/page_8 result: <Response [200]>
Request for https://www.sparknotes.com/nofear/shakespeare/msnd/page_10 result: <Response [200]>
Request for https://www.sparknotes.com/nofear/shakespeare/msnd/page_12 result: <Response [200]>
Request for https://www.sparknotes.com/nofear/shakespeare/msnd/page_14 result: <Response [200]>
Request for https://www.sparknotes.com/nofear/shakespeare/msnd/page_16 result: <Response [200]>
Request for https://www.sparknotes.com/nofear/shakespeare/msnd/page_18 result: <Response [200]>
Request for https://www.sparknotes.com/nofear/shakespeare/msnd/page_20 result: <Response [200]>
Request for https://www.sparknotes.com/nofea

# This section reads collected data from the target files and organizes it for the model
This cell is only used for model training, or for those interested in general corpus information

In [5]:
def get_clean_data(file1='Modern.txt', file2='Original.txt'):
  with open(file1, 'r') as f:
    modern_phrases = f.read().split('$')
  with open(file2, 'r') as f:
    original_phrases = f.read().split('$')
  return (modern_phrases, original_phrases)

In [7]:
def compile_encoding(corpus):
  lookup = {}
  encodings = []
  length = 1
  for phrase in corpus:
    words = word_tokenize(phrase)
    enc = []
    for i in range(330):
      if i >= len(words):
        enc.append(0)
      elif words[i] in lookup:
        enc.append(lookup[words[i]])
      else:
        lookup[words[i]] = length
        enc.append(length)
        length += 1
    encodings.append(np.array([enc]))
  return (np.array(encodings), lookup)

In [8]:
modern, original = get_clean_data()
modern_data = compile_encoding(modern)
original_data = compile_encoding(original)

print(f'{len(modern_data[0])} Samples loaded')
print(f'{len(modern_data[1])} Total Modern Words found')
print(f'{len(original_data[1])} Total original Words found')


max = 0
ind = -1
for i, phrase in enumerate(modern_data[0]):
  if phrase[0].shape[0] >= max:
    max = phrase[0].shape[0]
    ind = i
print(f'{ind} Longest modern phrase {max} (by words)')

max = 0
ind = -1
for i, phrase in enumerate(original_data[0]):
  if phrase[0].shape[0] >= max:
    max = phrase[0].shape[0]
    ind = i
print(f'{ind} Longest orignal phrase {max} (by words)')

3238 Samples loaded
6156 Total Modern Words found
7021 Total original Words found
3237 Longest modern phrase 330 (by words)
3237 Longest orignal phrase 330 (by words)


# This cell contains the models architecture and training cell
Again these three cells are for Training the model and need not be tampered with

In [None]:
def model(input_shape=330):
  model = keras.Sequential()
  model.add(LSTM(units=500, return_sequences = True, input_shape = (1, input_shape)))
  model.add(LSTM(units=500, return_sequences=True))
  model.add(LSTM(units=500, return_sequences=True))
  model.add(LSTM(units=500, return_sequences=True))
  model.add(Dense(units=330))
  adam = Adam(learning_rate=1e-2)
  model.compile(optimizer='adam', loss='mean_squared_error')
  model.summary()
  return model

In [9]:
#translator = model()
translator = load_model('drive/My Drive/Models/LSTM-v2.h5')

In [None]:
input_data = np.array(modern_data[0])
print(input_data.shape)
translator.fit(input_data, original_data[0], epochs=5000)
translator.save('drive/My Drive/Models/LSTM-v2-1.h5')

(3238, 1, 330)
Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
Epoch 41/5000
Epoch 42/5000
Epoch 43/5000
Epoch 44/5000
Epoch 45/5000
Epoch 46/5000
Epoch 47/5000
Epoch 48/5000
Epoch 49/5000
Epoch 50/5000
Epoch 51/5000
Epoch 52/5000
Epoch 53/5000
Epoch 54/5000
Epoch 55/5000
Epoch 56/5000
Epoch 57/5000
Epoch 58/5000
Epoch 59/5000
Epoch 60/5000
Epoch 61/5000
Epoch 62/5000
Epoch 63/5000
Epoch 64/5000
Epoch 65/5000
Epoch 66/5000
Epoch 67/5000
Epoch 68/5000
Epoch 69/5000
Epoch 70/5000
Epoch 71/5000


In [None]:
sample_num = 25
input_data = np.array(modern_data[0])
test_data = np.array([input_data[sample_num]])
print(test_data.shape)
prediction = translator.predict(test_data)

s0 = ''
for num in test_data[0][0]:
  c = [key for key, value in modern_data[1].items() if value == int(num)]
  if len(c) > 0:
    s0 += c[0] + ' '

s = ''
for num in prediction[0][0]:
  c = [key for key, value in original_data[1].items() if value == int(num)]
  if len(c) > 0:
    s += c[0] + ' '

s2 = ''
for num in original_data[0][sample_num][0]:
  c = [key for key, value in original_data[1].items() if value == int(num)]
  if len(c) > 0:
    s2 += c[0] + ' '

print(f'Input: {s0}')
print(f'Prediction: {s}')
print(f'Excpected: {s2}')

(1, 1, 330)
Input: egeus its true rude lysander i do love him . thats why im giving him my daughter . shes mine and im giving her to demetrius . 
Prediction: egeus have his estate worthy hath wanes wooed and what is as lingers love new possessed within apace and desires funerals either theseus life a have solemnities hath exit rather let unto demetrius apace text original text original original enter hippolyta text original hippolyta original text original text enter text text and and original original enter text enter original original text theseus original original original original original original original original original 
Excpected: egeus scornful lysander true he hath my love and what is mine my love shall render him . and she is mine and all my right of her i do estate unto demetrius . 
