In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
import warnings
import os 
import re

html = urlopen('https://storm.cis.fordham.edu/~yli/data/MyShakespeare.txt').read().decode('utf-8','ignore')
soup = BeautifulSoup(html, features='lxml')
all_href = soup.find_all('p')
l = re.sub(r'\r\n\r\n','[P]',str(all_href))
l = re.sub(r'<.*?>','',l)
l = re.sub(r'\r\n',' ',l)
l = re.sub(r'\[P\]','\r\n\r\n',l)

with open('data.txt','w')as f:
    f.write(l[1:-1])
    

In [2]:
SPECIAL_WORDS = {'PADDING': '<PAD>'}


In [198]:

from collections import Counter
class NgramModel(object):
    def __init__(self,n = 3):
        super(NgramModel,self).__init__()
        self.n = n # Tell me how many gram you want?
        self.create_lookup_tables = self.create_lookup_tables
        self.token_lookup = self._token_lookup()
        self.vocab_to_int = None
        self.int_to_vocab = None
        self.word_counter = None
        self.int_text = None
        self.corpus = None
        self.ngram_matrix = None
        self.gram_counter = None
        self.ngram_1matrix = None
        self.pp = None
    
        assert self.n > 1, "N should larger than 1 !!!!!"
        
    
    def load_data(self,path):
        input_file = os.path.join(path)
        with open(input_file, "r") as f:
            data = f.read()
        return data    
    
    def _token_lookup(self):
        answer = {'.' : '||period||',
                  ',' : '||comma||',
                  '"' : '||quotation_mark||',
                  ';' : '||semicolon||',
                  '!' : '||exclamation_mark||',
                  '?' : '||question_mark||',
                  '(' : '||left_Parentheses||',
                  ')' : '||right_Parentheses||',
                  #'\n': '||return||',
                  '-' : '||dash||'}
        return answer
    
    def update(self,text):
        text = self.load_data(text)
        text = self.preprocessing(text).lower()
        self.corpus = ['<START> '* (self.n-1) + t + ' <END>' * (self.n-1) for t in text.split('\n\n')]
        
        text = text.split()
        self.word_counter = Counter(text)
        self.vocab_to_int, self.int_to_vocab = self.create_lookup_tables(text + list(SPECIAL_WORDS.values()))
        self.int_text = [self.vocab_to_int[word] for word in text]
        
        
    def preprocessing(self,text):
        for key, token in self.token_lookup.items():
            text = text.replace(key, ' {} '.format(token))
        return text
    
    def create_lookup_tables(self,text):
        vocab_to_int = { v:i+2 for i,v in enumerate(set(text))}
        vocab_to_int['<START>'] = 0
        vocab_to_int['<end>'] = 1
        int_to_vocab = { v:k for k,v in vocab_to_int.items()}
        # return tuple
        return (vocab_to_int, int_to_vocab)
        
    
    def get_vocab(self):
        return self.vocab_to_int
    
    def size_vocab(self):
        return len(self.vocab_to_int)
    
    def get_gram(self):
        self._n_1gram()
        m = []
        for i in self.corpus:
            try:
                if len(i.split()) < self.n:
                    ng = self.pad(i.split)
                    m.append(tuple(ng))
                else:
                    for j in range(len(i.split())-self.n):
                        ng = i.split()[j:j+self.n]
                        m.append(tuple(ng))
            except:
                KeyboardInterrupt
        self.ngram_matrix = m
    
        
    def _n_1gram(self):
        m = []
        for i in self.corpus:
            try:
                if len(i.split()) < self.n:
                    ng = self.pad(i.split)
                    m.append(tuple(ng))
                else:
                    for j in range(len(i.split())-self.n+1):
                        ng = i.split()[j:j+self.n-1]
                        m.append(tuple(ng))
            except:
                KeyboardInterrupt
        self.ngram_1matrix = m
        
    
    def len_text(self):
        return len(self.text.split())-2
    
    def len_ngram(self):
        return len(self.ngram_matrix)
    
    def word_freq(self,word):
        print(self.counter[word])
            
        
    def pad(self,text,):
        l = len(text)
        n = self.n-l
        for _ in range(n):
            text.append('<PAD>')
        return text
    
    def len_gram(self):
        return len(self.ngram_matrix)
    
    def ngram_freq(self,gram):
        gram = self.preprocessing(gram)
        test = [ i for i in gram.lower().split()]
        assert len(test) == self.n, 'It seems the length of you input is not match !!'
        try:
            if self.gram_counter == None:
                self.gram_counter = Counter(self.ngram_matrix)
            
            if self.gram_counter[tuple(test)] == 0 :
                print('Come on, we dont have these combo !!')
                pro = 1/(self.size_vocab()*2)
                print('Probobility is {a}'.format(a=pro))
            else:                     
                #print(self.gram_counter[tuple(test)])
                return self.gram_counter[tuple(test)]
        except:
            KeyboardInterrupt
            
    def n_1gram_freq(self,gram):
        gram = self.preprocessing(gram)
        test = [ i for i in gram.lower().split()]
        test.pop()
        try:
            gram_counter = Counter(self.ngram_1matrix)
            
            if gram_counter[tuple(test)] == 0 :
                print('Come on, we dont have these combo !!')
                return 
            else:                     
                #print(self.gram_counter[tuple(test)])
                return gram_counter[tuple(test)]
        except:
            KeyboardInterrupt
            
    def get_pro(self,ngram):
        if self.ngram_freq(ngram):
            return self.ngram_freq(ngram)/self.n_1gram_freq(ngram)
        else:
            return 1/(len(self.vocab_to_int)+1)
        
                
                
    def text_generate(self,gram, min_length, max_length):
        gram = self.preprocessing(gram)
        test = [ i for i in gram.lower().split()]
        print(test)
        
        
        assert len(test) >= self.n -1, 'You are too short to gen !!!!'
        while len(test) <= max_length:
            
            gen = tuple(test[-(self.n-1):])
            #print(gen)
            test.append(self.findCondition(gen))
            #print(test)
            if self.findCondition(gen) == '<END>':
                return self.parse(test)
        return self.parse(test)
        
    def findCondition(self,n_1gram):
        candidate = [i for i in self.ngram_matrix if i[:self.n-1] == n_1gram]
        #print(candidate)
        c = Counter(candidate)
        #print(len(c.most_common(1)))
        try:
            if len(c.most_common(1))>0:
                next_word = c.most_common(1)[0][0][-1]
                return next_word
            return
        except:
            KeyboardInterrupt
        
    def parse(self,text):
        #print(text)
        if text[-1] == '<END>':
            text = " ".join(text)
            for token, key in self.token_lookup.items():
                text = text.replace(key, ' {} '.format(token))
                #print(text)
            return " ".join(re.findall(r'.+(?=\s\<END\>)',text)[0].split()) 
        
        else:
            text = " ".join(text)
            for token, key in self.token_lookup.items():
                text = text.replace(key, ' {} '.format(token))
            return text
        
    def perplexity(self,text):
        gram = '<START> '* (self.n - 1) + self.preprocessing(text)
        test = [ i for i in gram.lower().split()]
        #print(len(test))
        pp = 1
        import math
        for i in range(len(test)-self.n+1):
            ngram = " ".join(test[i:i+self.n])
            #print(1/self.get_pro(ngram))
            pp *= 1/self.get_pro(ngram)
        print(pp**(1/((len(test)-2))))
        
    

In [199]:
ng = NgramModel(2)
ng.update('data.txt')
ng.get_gram()


In [200]:
ng.ngram_freq('members, the'),ng.get_pro('members, the')


AssertionError: It seems the length of you input is not match !!

In [201]:
ng.text_generate('our business',10,30)


['our', 'business']


'our business is  ,  and hear me  ,  and hear me  ,  and hear me  ,  and hear me  ,  and hear me  ,  and hear me  ,  and hear me'

In [196]:
ng.perplexity('make you a sword for me')

Come on, we dont have these combo !!
Probobility is 0.0003244646333549643
Come on, we dont have these combo !!
Probobility is 0.0003244646333549643
Come on, we dont have these combo !!
Probobility is 0.0003244646333549643
774.6404268972499


In [175]:
len(ng.vocab_to_int)

1541

In [332]:
# 随机生成 vocab 为10 的 sequence
a = [str(np.random.randint(0,10)) for _ in range(1000)]
print(a)

# 2, 3, 比较 
tokens2 = ("<START>\n" * 1 + "\n".join(a)).split('\n')
tokens3 = ("<START>\n" * 2 + "\n".join(a)).split('\n')

['0', '1', '9', '2', '7', '0', '7', '4', '0', '9', '8', '8', '1', '8', '3', '0', '2', '0', '8', '0', '0', '6', '0', '5', '3', '5', '5', '3', '8', '6', '8', '3', '1', '4', '1', '5', '4', '0', '2', '6', '4', '8', '1', '0', '2', '4', '8', '2', '1', '2', '6', '0', '0', '4', '4', '4', '2', '7', '0', '7', '9', '2', '4', '0', '4', '7', '1', '9', '6', '3', '2', '8', '6', '9', '9', '6', '0', '2', '7', '7', '8', '3', '7', '9', '4', '9', '7', '0', '0', '7', '3', '3', '6', '9', '3', '6', '0', '0', '8', '1', '4', '7', '5', '1', '9', '5', '7', '6', '2', '6', '3', '0', '3', '9', '7', '6', '1', '0', '7', '3', '1', '3', '1', '4', '2', '0', '4', '4', '5', '8', '3', '9', '5', '9', '3', '7', '5', '7', '9', '8', '1', '4', '7', '7', '8', '2', '3', '2', '8', '0', '6', '2', '9', '8', '5', '0', '8', '5', '7', '9', '6', '6', '8', '7', '7', '9', '9', '3', '5', '7', '2', '1', '1', '2', '6', '4', '2', '6', '8', '4', '7', '5', '1', '6', '6', '2', '3', '3', '0', '9', '4', '3', '1', '3', '9', '3', '9', '4', '0', '7',

In [378]:
from copy import deepcopy
def get_gram(token,n):
    m = []
    for i in range(len(token)-n+1):
        ng = token[i:i+n]
        m.append(tuple(ng))
    return m

def ngram_freq(test=('6','7'),counter=counter):

    if counter[tuple(test)] ==0:
        return 0
    else:                     
        return counter[tuple(test)]
    
def pro(gram,counter,jianyicounter):
    leng = len(gram)-1
    num = gram[:leng]
    #print(num)
    if ngram_freq(gram,counter=counter)!= 0 and ngram_freq(num,counter=jianyicounter)!=0:
        return ngram_freq(gram,counter=counter)/ngram_freq(num,counter=jianyicounter)
    else:
        return 0.1

        
gram2 = get_gram(tokens2,2)
counter2 =Counter(gram)
gram3 = get_gram(tokens3,3)
counter3 =Counter(gram)
counter1 =Counter([tuple(i) for i in a])
test = [str(np.random.randint(0,10)) for _ in range(6)]
test3 = ("<START>\n" * N + "\n".join(test)).split('\n')
test2 = ("<START>\n" * 1 + "\n".join(test)).split('\n')
print(test)
test3gram = get_gram(test3,3)
test2gram = get_gram(test2,2)

['8', '2', '1', '6', '3', '4']


In [379]:
print(test3gram)
print()

print(test2gram)

[('<START>', '<START>', '8'), ('<START>', '8', '2'), ('8', '2', '1'), ('2', '1', '6'), ('1', '6', '3'), ('6', '3', '4')]

[('<START>', '8'), ('8', '2'), ('2', '1'), ('1', '6'), ('6', '3'), ('3', '4')]


In [382]:
# 3gram 结果 
a =1
for i in test3gram:
    a *=(1/pro(i,counter3,counter2))
a

1000000.0

In [383]:
# 2 gram 结果 
b=1
for i in test2gram:
    b *=(1/pro(i,counter2,counter1))
b

620804.029090909