In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
import warnings
import os 
import re

html = urlopen('https://storm.cis.fordham.edu/~yli/data/MyShakespeare.txt').read().decode('utf-8','ignore')
soup = BeautifulSoup(html, features='lxml')
all_href = soup.find_all('p')
l = re.sub(r'\r\n\r\n','[P]',str(all_href))
l = re.sub(r'<.*?>','',l)
l = re.sub(r'\r\n',' ',l)
l = re.sub(r'\[P\]','\r\n\r\n',l)

with open('data.txt','w')as f:
    f.write(l[1:-1])
    

In [2]:
SPECIAL_WORDS = {'PADDING': '<PAD>'}


In [204]:

from collections import Counter
class NgramModel(object):
    def __init__(self,n = 3):
        super(NgramModel,self).__init__()
        self.n = n # Tell me how many gram you want?
        self.create_lookup_tables = self.create_lookup_tables
        self.token_lookup = self._token_lookup()
        self.vocab_to_int = None
        self.int_to_vocab = None
        self.word_counter = None
        self.int_text = None
        self.corpus = None
        self.ngram_matrix = None
        self.gram_counter = None
        self.ngram_1matrix = None
        assert self.n > 1, "N should larger than 1 !!!!!"
        
    
    def load_data(self,path):
        input_file = os.path.join(path)
        with open(input_file, "r") as f:
            data = f.read()
        return data    
    
    def _token_lookup(self):
        answer = {'.' : '||period||',
                  ',' : '||comma||',
                  '"' : '||quotation_mark||',
                  ';' : '||semicolon||',
                  '!' : '||exclamation_mark||',
                  '?' : '||question_mark||',
                  '(' : '||left_Parentheses||',
                  ')' : '||right_Parentheses||',
                  #'\n': '||return||',
                  '-' : '||dash||'}
        return answer
    
    def update(self,text):
        text = self.load_data(text)
        text = self.preprocessing(text).lower()
        self.corpus = ['<START> '* (self.n-1) + t + ' <END>' * (self.n-1) for t in text.split('\n\n')]
        
        text = text.split()
        self.word_counter = Counter(text)
        self.vocab_to_int, self.int_to_vocab = self.create_lookup_tables(text + list(SPECIAL_WORDS.values()))
        self.int_text = [self.vocab_to_int[word] for word in text]
        
        
    def preprocessing(self,text):
        for key, token in self.token_lookup.items():
            text = text.replace(key, ' {} '.format(token))
        return text
    
    def create_lookup_tables(self,text):
        vocab_to_int = { v:i+2 for i,v in enumerate(set(text))}
        vocab_to_int['<START>'] = 0
        vocab_to_int['<end>'] = 1
        int_to_vocab = { v:k for k,v in vocab_to_int.items()}
        # return tuple
        return (vocab_to_int, int_to_vocab)
        
    
    def get_vocab(self):
        return self.vocab_to_int
    
    def size_vocab(self):
        return len(self.vocab_to_int)
    
    def get_gram(self):
        self._n_1gram()
        m = []
        for i in self.corpus:
            try:
                if len(i.split()) < self.n:
                    ng = self.pad(i.split)
                    m.append(tuple(ng))
                else:
                    for j in range(len(i.split())-self.n):
                        ng = i.split()[j:j+self.n]
                        m.append(tuple(ng))
            except:
                KeyboardInterrupt
        self.ngram_matrix = m
        
    def _n_1gram(self):
        m = []
        for i in self.corpus:
            try:
                if len(i.split()) < self.n:
                    ng = self.pad(i.split)
                    m.append(tuple(ng))
                else:
                    for j in range(len(i.split())-self.n+1):
                        ng = i.split()[j:j+self.n-1]
                        m.append(tuple(ng))
            except:
                KeyboardInterrupt
        self.ngram_1matrix = m
        
    
    def len_text(self):
        return len(self.text.split())-2
    
    def len_ngram(self):
        return len(self.ngram_matrix)
    
    def word_freq(self,word):
        print(self.counter[word])
            
        
    def pad(self,text,):
        l = len(text)
        n = self.n-l
        for _ in range(n):
            text.append('<PAD>')
        return text
    
    def len_gram(self):
        return len(self.ngram_matrix)
    
    def ngram_freq(self,gram):
        gram = self.preprocessing(gram)
        test = [ i for i in gram.lower().split()]
        assert len(test) == self.n, 'It seems the length of you input is not match !!'
        try:
            if self.gram_counter == None:
                self.gram_counter = Counter(self.ngram_matrix)
            
            if self.gram_counter[tuple(test)] == 0 :
                print('Come on, we dont have these combo !!')
                pro = 1/(self.size_vocab()*2)
                print('Probobility is {a}'.format(a=pro))
            else:                     
                #print(self.gram_counter[tuple(test)])
                return self.gram_counter[tuple(test)]
        except:
            KeyboardInterrupt
            
    def n_1gram_freq(self,gram):
        gram = self.preprocessing(gram)
        test = [ i for i in gram.lower().split()]
        test.pop()
        try:
            gram_counter = Counter(self.ngram_1matrix)
            
            if gram_counter[tuple(test)] == 0 :
                print('Come on, we dont have these combo !!')
            else:                     
                #print(self.gram_counter[tuple(test)])
                return gram_counter[tuple(test)]
        except:
            KeyboardInterrupt
            
    def get_pro(self,ngram):
        return self.ngram_freq(ngram)/self.ngram_freq(ngram)
        
                
                
    def text_generate(self,gram, min_length, max_length):
        gram = self.preprocessing(gram)
        test = [ i for i in gram.lower().split()]
        #print(test)
        assert len(test) >= self.n -1, 'You are too short to gen !!!!'
        while len(test) <= max_length:
            gen = tuple(test[-2:])
            #print(gen)
            test.append(self.findCondition(gen))
            if self.findCondition(gen) == '<END>':
                break
            
        return self.parse(test)
        
    def findCondition(self,n_1gram):
        candidate = [i for i in self.ngram_matrix if i[:self.n-1] == n_1gram]
        #print(candidate)
        c = Counter(candidate)
        print(len(c.most_common(1)))
        try:
            if len(c.most_common(1))>0:
                next_word = c.most_common(1)[0][0][-1]
                return next_word
            return
        except:
            KeyboardInterrupt
        
    def parse(self,text):
        text = " ".join(text)
        for token, key in self.token_lookup.items():
            text = text.replace(key, ' {} '.format(token))
        return " ".join(re.findall(r'.+(?=\s\<END\>)',text)[0].split())    
        

In [205]:
ng = NgramModel(3)
ng.update('data.txt')
ng.get_gram()


In [206]:
ng.ngram_freq('members, the')
ng.get_pro('members, the')




1.0

In [207]:
ng.text_generate('Alex is ',10,20)


0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


TypeError: sequence item 2: expected str instance, NoneType found