In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
import warnings
import re

html = urlopen('https://storm.cis.fordham.edu/~yli/data/MyShakespeare.txt').read().decode('utf-8','ignore')
soup = BeautifulSoup(html, features='lxml')

all_href = soup.find_all('p')


l = re.sub(r'\r\n\r\n','[P]',str(all_href))
l = re.sub(r'<.*?>','',l)
l = re.sub(r'\r\n',' ',l)
l = re.sub(r'\[P\]','\r\n\r\n',l)

with open('data.txt','w')as f:
    f.write(l[1:-1])

In [2]:
import os 


In [3]:
SPECIAL_WORDS = {'PADDING': '<PAD>'}
def create_lookup_tables(text):
    vocab_to_int = { v:i+2 for i,v in enumerate(set(text))}
    vocab_to_int['<START>'] = 0
    vocab_to_int['<end>'] = 1
    int_to_vocab = { v:k for k,v in vocab_to_int.items()}
    # return tuple
    return (vocab_to_int, int_to_vocab)

def token_lookup():
    
    answer = {'.' : '||Period||',
              ',' : '||Comma||',
              '"' : '||Quotation_Mark||',
              ';' : '||Semicolon||',
              '!' : '||Exclamation_mark||',
              '?' : '||Question_mark||',
              '(' : '||Left_Parentheses||',
              ')' : '||Right_Parentheses||',
              #'\n': '||Return||',
              '-' : '||Dash||'}
    
        
    return answer

In [33]:

from collections import Counter
class NgramModel(object):
    def __init__(self,n = 3):
        super(NgramModel,self).__init__()
        self.n = n
        self.create_lookup_tables = create_lookup_tables
        self.token_lookup = token_lookup
        self.vocab_to_int = None
        self.int_to_vocab = None
        self.word_counter = None
        self.int_text = None
        self.corpus = None
        self.ngram_matrix = None
        self.gram_counter = None
        assert self.n > 1, "N should larger than 1 !!!!!"
        
    
    def load_data(self,path):
        input_file = os.path.join(path)
        with open(input_file, "r") as f:
            data = f.read()
        return data    
    
    def update(self,text):
        text = self.load_data(text)
        
        
        token_dict = self.token_lookup()
        for key, token in token_dict.items():
            text = text.replace(key, ' {} '.format(token))

        text = text.lower()
        self.corpus = ['<START> ' + t + ' <END>' for t in text.split('\n\n')]
        text = text.split()
        
        self.word_counter = Counter(text)
        self.vocab_to_int, self.int_to_vocab = self.create_lookup_tables(text + list(SPECIAL_WORDS.values()))
        self.int_text = [self.vocab_to_int[word] for word in text]
    def get_vocab(self):
        return self.vocab_to_int
    
    def size_vocab(self):
        return len(self.vocab_to_int)
    
    def get_gram(self):
        m = []
    
        for i in self.corpus:
            try:
                if len(i.split()) < self.n:
                    ng = self.pad(i.split)
                    m.append(tuple(ng))
                else:
                    for j in range(len(i.split())-self.n):
                        ng = i.split()[j:j+self.n]
                        m.append(tuple(ng))
            except:
                KeyboardInterrupt
        
           
        self.ngram_matrix = m
    
    def len_text(self):
        return len(self.text.split())-2
    
    def len_ngram(self):
        return len(self.ngram_matrix)
    
    def word_freq(self,word):
        print(self.counter[word])
            
        
    def pad(self,text,):
        l = len(text)
        n = self.n-l
        for _ in range(n):
            text.append('<PAD>')
        return text
    
    def len_gram(self):
        return len(self.ngram_matrix)
    
    def ngram_freq(self,gram):
        answer = {'.' : '||Period||',
              ',' : '||Comma||',
              '"' : '||Quotation_Mark||',
              ';' : '||Semicolon||',
              '!' : '||Exclamation_mark||',
              '?' : '||Question_mark||',
              '(' : '||Left_Parentheses||',
              ')' : '||Right_Parentheses||',
              #'\n': '||Return||',
              '-' : '||Dash||'}
        
        for key, token in answer.items():
            gram = gram.replace(key, ' {} '.format(token))
        test = [ i for i in gram.lower().split()]
        
        assert len(test) == self.n, 'It seems the length of you input is not match !!'
        try:
            if self.gram_counter == None:
                self.gram_counter = Counter(self.ngram_matrix)
            
            if self.gram_counter[tuple(test)] == 0 :
                print('Come on, we dont have these combo !!')
                pro = 1/(self.size_vocab()*2)
                print('Probobility is {a}'.format(a=pro))
            else:                     
                print(self.gram_counter[tuple(test)])
        except:
            KeyboardInterrupt
                
                
    def text_generate(self,gram, min_length, max_length):
        
        
        answer = {'.' : '||Period||',
              ',' : '||Comma||',
              '"' : '||Quotation_Mark||',
              ';' : '||Semicolon||',
              '!' : '||Exclamation_mark||',
              '?' : '||Question_mark||',
              '(' : '||Left_Parentheses||',
              ')' : '||Right_Parentheses||',
              #'\n': '||Return||',
              '-' : '||Dash||'}
        
        for key, token in answer.items():
            gram = gram.replace(key, ' {} '.format(token))
        test = [ i for i in gram.lower().split()]
        outcomt = [test]
        
        assert len(test) >= self.n, 'You are too short to gen !!!!'
        while len(outcomt[-1]) <= max_length:
            gen = test[-2:]
            test.append(self.findCondition(gen))
            outcomt.append(test)
            
        return outcomt[-1]
        
        
        
        
    def findCondition(self,n_1gram):
        candidate = [i for i in self.ngram_matrix if i[:self.n] == n_1gram]
        c = Counter(candidate)
        print(c.most_common(1))
        next_word = c.most_common(1)[0][0][-1]
        
        return next_word
        
        
        
        
        

In [34]:
ng = NgramModel(3)
ng.update('data.txt')

In [35]:
ng.get_gram()
ng.text_generate('members, th',10,20)

[]


IndexError: list index out of range

In [20]:
ng.gram_counter.most_common(1)[0][0][-1]

'citizen:'

In [11]:
def load_data(path):
        input_file = os.path.join(path)
        with open(input_file, "r") as f:
            data = f.read()
        return data
text = load_data('data.txt')
lines = text.split('\n')
print('Number of lines: {}'.format(len(lines)))
word_count_line = [len(line.split()) for line in lines]
print('Average number of words in each line: {}'.format(np.average(word_count_line)))

Number of lines: 725
Average number of words in each line: 12.32


In [96]:
Counter(ng.ngram_matrix)

Counter({('<START>', 'first', 'citizen:'): 45,
         ('first', 'citizen:', 'before'): 2,
         ('citizen:', 'before', 'we'): 2,
         ('before', 'we', 'proceed'): 2,
         ('we', 'proceed', 'any'): 2,
         ('proceed', 'any', 'further'): 2,
         ('any', 'further', '||comma||'): 2,
         ('further', '||comma||', 'hear'): 2,
         ('||comma||', 'hear', 'me'): 2,
         ('hear', 'me', 'speak'): 2,
         ('me', 'speak', '||period||'): 2,
         ('<START>', 'all:', 'speak'): 2,
         ('all:', 'speak', '||comma||'): 2,
         ('speak', '||comma||', 'speak'): 2,
         ('||comma||', 'speak', '||period||'): 2,
         ('first', 'citizen:', 'you'): 2,
         ('citizen:', 'you', 'are'): 2,
         ('you', 'are', 'all'): 2,
         ('are', 'all', 'resolved'): 2,
         ('all', 'resolved', 'rather'): 2,
         ('resolved', 'rather', 'to'): 2,
         ('rather', 'to', 'die'): 2,
         ('to', 'die', 'than'): 2,
         ('die', 'than', 'to'): 2,
  

In [35]:
text

"First Citizen: Before we proceed any further, hear me speak.\n\nAll: Speak, speak.\n\nFirst Citizen: You are all resolved rather to die than to famish?\n\nAll: Resolved. resolved.\n\nFirst Citizen: First, you know Caius Marcius is chief enemy to the people.\n\nAll: We know't, we know't.\n\nFirst Citizen: Let us kill him, and we'll have corn at our own price. Is't a verdict?\n\nAll: No more talking on't; let it be done: away, away!\n\nSecond Citizen: One word, good citizens.\n\nFirst Citizen: We are accounted poor citizens, the patricians good. What authority surfeits on would relieve us: if they would yield us but the superfluity, while it were wholesome, we might guess they relieved us humanely; but they think we are too dear: the leanness that afflicts us, the object of our misery, is as an inventory to particularise their abundance; our sufferance is a gain to them Let us revenge this with our pikes, ere we become rakes: for the gods know I speak this in hunger for bread, not in th