In [43]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
import warnings
import re

# Download the raw corpus and decode it as UTF-8, silently dropping any bytes
# that cannot be decoded.
html = urlopen('https://storm.cis.fordham.edu/~yli/data/MyShakespeare.txt').read().decode('utf-8','ignore')
# Parse the downloaded text with BeautifulSoup using the lxml parser.
soup = BeautifulSoup(html, features='lxml')

# Despite the name, this collects every <p> element found by the parser.
all_href = soup.find_all('p')


# The substitutions below are order-dependent:
# 1. protect blank-line separators (\r\n\r\n) with the placeholder "[P]",
#    working on the string form of the whole result list;
l = re.sub(r'\r\n\r\n','[P]',str(all_href))
# 2. strip anything that looks like a tag (non-greedy; no DOTALL flag, so a
#    "tag" that spans a line break is not matched);
l = re.sub(r'<.*?>','',l)
# 3. turn every remaining line break into a single space;
l = re.sub(r'\r\n',' ',l)
# 4. turn the placeholders back into blank-line separators.
l = re.sub(r'\[P\]','\r\n\r\n',l)

# Write the cleaned text to data.txt, dropping the first and last character
# (presumably the "[" and "]" from the list's string form -- verify).
with open('data.txt','w')as f:
    f.write(l[1:-1])

In [44]:
import os 
def load_data(path):
    """Read a text file and return its full contents as one string.

    Args:
        path: Location of the file to read.

    Returns:
        The complete file contents, read in text mode with the platform's
        default encoding.
    """
    with open(os.path.join(path), "r") as handle:
        return handle.read()

# Load the cleaned corpus written to data.txt into memory as a single string.
text = load_data('data.txt')

In [45]:
# Basic corpus statistics: how many newline-separated lines there are and how
# many whitespace-separated words an average line holds.
lines = text.split('\n')
word_count_line = [len(words) for words in map(str.split, lines)]
print('Number of lines: {}'.format(len(lines)))
print('Average number of words in each line: {}'.format(np.average(word_count_line)))

Number of lines: 725
Average number of words in each line: 12.32


In [46]:

# Extra vocabulary entries; their values are appended to the corpus tokens
# before the lookup tables are built, so each one receives an id.
SPECIAL_WORDS = {'PADDING': '<PAD>'}
def create_lookup_tables(text):
    """Build the two vocabulary lookup tables for a sequence of tokens.

    Ids start at 1 and are handed out in order of first appearance in
    ``text``. The tables are therefore identical on every run; iterating a
    ``set`` of strings gives an order that changes between interpreter
    sessions because of hash randomization.

    Args:
        text: Iterable of hashable tokens (duplicates allowed).

    Returns:
        A tuple ``(vocab_to_int, int_to_vocab)`` where ``vocab_to_int`` maps
        each distinct token to its id and ``int_to_vocab`` is the inverse
        mapping.
    """
    # dict.fromkeys drops duplicates while preserving first-occurrence order
    # (guaranteed for dicts since Python 3.7).
    unique_tokens = dict.fromkeys(text)
    vocab_to_int = {token: index for index, token in enumerate(unique_tokens, start=1)}
    int_to_vocab = {index: token for token, index in vocab_to_int.items()}
    return (vocab_to_int, int_to_vocab)

def token_lookup():
    """Return the mapping from punctuation symbols to placeholder tokens.

    Returns:
        A dict whose keys are punctuation characters (plus the newline) and
        whose values are the ``||Name||`` tokens that stand in for them.
    """
    replacements = (
        ('.', '||Period||'),
        (',', '||Comma||'),
        ('"', '||Quotation_Mark||'),
        (';', '||Semicolon||'),
        ('!', '||Exclamation_mark||'),
        ('?', '||Question_mark||'),
        ('(', '||Left_Parentheses||'),
        (')', '||Right_Parentheses||'),
        ('\n', '||Return||'),
        ('-', '||Dash||'),
    )
    return dict(replacements)

In [47]:
def preprocess_and_save_data(dataset_path, token_lookup, create_lookup_tables):
    """Tokenize a text file, encode it as integer ids and pickle the result.

    Args:
        dataset_path: Path of the text file to preprocess.
        token_lookup: Callable returning a dict that maps punctuation symbols
            to placeholder tokens.
        create_lookup_tables: Callable that takes a list of tokens and returns
            ``(vocab_to_int, int_to_vocab)``.

    Side effects:
        Writes ``(int_text, vocab_to_int, int_to_vocab, token_dict)`` to
        ``preprocess.p`` in the current working directory.
    """
    # Imported here so the function does not rely on another cell having
    # imported pickle first.
    import pickle

    text = load_data(dataset_path)

    # Surround every punctuation symbol's placeholder with spaces so that the
    # whitespace split below treats it as a token of its own.
    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    # Lowercasing happens after the substitution, so placeholders end up
    # lower-cased in the vocabulary while token_dict keeps its original case.
    text = text.lower()
    text = text.split()

    vocab_to_int, int_to_vocab = create_lookup_tables(text + list(SPECIAL_WORDS.values()))
    int_text = [vocab_to_int[word] for word in text]

    # The context manager guarantees the file is flushed and closed even if
    # pickling fails.
    with open('preprocess.p', 'wb') as out_file:
        pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), out_file)



In [67]:
class NgramModel(object):
    """Holds the vocabulary and integer-encoded text for an n-gram model.

    The model starts empty; call ``update`` with the path of a text file to
    build the vocabulary tables and the encoded token sequence.
    """

    def __init__(self,n = 3):
        """Create an empty model.

        Args:
            n: Order of the n-gram; must be greater than 1.

        Raises:
            AssertionError: If ``n`` is not greater than 1.
        """
        # Validate first so that a rejected argument leaves no half-built state.
        assert n > 1, "N should larger than 1 !!!!!"
        super(NgramModel,self).__init__()
        self.n = n
        # Tokenization helpers are stored on the instance so they can be
        # swapped per model.
        self.create_lookup_tables = create_lookup_tables
        self.token_lookup = token_lookup
        # Filled in by update().
        self.vocab_to_int = None
        self.int_to_vocab = None
        self.int_text = None

    @staticmethod
    def load_data(path):
        """Read the text file at ``path`` and return its contents as a string."""
        input_file = os.path.join(path)
        with open(input_file, "r") as f:
            data = f.read()
        return data

    def update(self,text):
        """Build the vocabulary and encoded text from a file.

        Args:
            text: Path of the text file to read (despite the name, this is a
                path, not the text itself).

        Side effects:
            Replaces ``vocab_to_int``, ``int_to_vocab`` and ``int_text``.
        """
        # Use the class's own loader so the method does not depend on a
        # module-level load_data being defined in another cell.
        text = self.load_data(text)

        # Surround each punctuation placeholder with spaces so the whitespace
        # split below yields it as a separate token.
        token_dict = self.token_lookup()
        for key, token in token_dict.items():
            text = text.replace(key, ' {} '.format(token))

        text = text.lower()
        text = text.split()

        self.vocab_to_int, self.int_to_vocab = self.create_lookup_tables(text + list(SPECIAL_WORDS.values()))
        self.int_text = [self.vocab_to_int[word] for word in text]

    def get_vocab(self):
        """Return the token-to-id table, or ``None`` if ``update`` has not run."""
        return self.vocab_to_int

    def size_vocab(self):
        """Return the number of vocabulary entries (0 before ``update`` has run)."""
        if self.vocab_to_int is None:
            return 0
        return len(self.vocab_to_int)

In [68]:
# Build a bigram (n=2) model and populate its vocabulary from the cleaned corpus.
ng = NgramModel(2)
ng.update('data.txt')