In [1]:
from nltk.tokenize import sent_tokenize
import codecs
import regex as re
from sklearn.model_selection import train_test_split
import numpy as np

# Read the whole corpus. The context manager closes the handle as soon as the
# text is in memory instead of leaving it open for the life of the kernel.
with codecs.open('speeches.txt', 'r', 'UTF-8') as file:
    raw_text = file.read()
# NOTE(review): each CRLF is removed without inserting a space, so the text on
# either side of a line break is joined directly (a sentence-ending period can
# end up glued to the next word) -- confirm this is intended.
raw_text = raw_text.replace("\r\n","")
# Drop every digit 0-9 from the text.
raw_text = re.sub(r"[0-9]","",raw_text)

# Split into sentences, lower-case each one and wrap it in <s> ... </s> markers.
sent_tokenize_list = sent_tokenize(raw_text)
sentences = []
for s in sent_tokenize_list:
    sent = '<s> ' + s.lower() + ' </s>'
    sentences.append(sent)

# Sequential (unshuffled) split: the first 80% of sentences train, the rest test.
split_index = int(len(sentences)*0.8)
train_list,test_list = sentences[:split_index], sentences[split_index:]
train_list[10:12]

['<s> i love the people of iowa. </s>', "<s> so that's the way it is. </s>"]

## Classical Approach

In [9]:
def nGramCounter(train,n):
    """Tally every contiguous window of ``n`` tokens in the given sentences.

    Args:
        train: Iterable of sentence strings; each one is split on whitespace.
        n: Number of tokens per window.

    Returns:
        A dict mapping each n-gram (a tuple of tokens) to the number of times
        it occurs. Windows never span two sentences, and a sentence with fewer
        than ``n`` tokens contributes nothing.
    """
    tally = {}
    for sentence in train:
        words = sentence.split()
        last_start = len(words) - n
        for start in range(last_start + 1):
            gram = tuple(words[start:start + n])
            tally[gram] = tally.get(gram, 0) + 1
    return tally

def nGramMLE(inp,counts):
    """Relative-frequency estimate for the n-gram spelled out in ``inp``.

    Args:
        inp: Whitespace-separated tokens forming the n-gram to score.
        counts: Dict mapping n-gram tuples to their occurrence counts.

    Returns:
        ``0`` when the n-gram is not a key of ``counts``. For a single token,
        its count divided by the sum of all counts. Otherwise its count divided
        by the summed counts of every key that shares the same leading tokens
        (all tokens but the last).
    """
    ngram = tuple(inp.split())
    if ngram not in counts:
        return 0
    if len(ngram) == 1:
        return counts[ngram] / sum(counts.values())
    context = ngram[:-1]
    context_total = sum(
        occurrences for key, occurrences in counts.items() if key[:-1] == context
    )
    return counts[ngram] / context_total

# Count tables for n-gram orders 1 through 4, all built from the training split.
counts1, counts2, counts3, counts4 = [
    nGramCounter(train_list, order) for order in (1, 2, 3, 4)
]

# Print the estimate for one sample phrase per order, lowest order first.
for phrase, table in (
    ("the", counts1),
    ("going to", counts2),
    ("we're going to", counts3),
    ("<s> we're going to", counts4),
):
    print(nGramMLE(phrase, table))

0.03004506102287208
0.9093093093093093
0.88
1.0


In [14]:
def nGramPredictor(n,counts,initial):
    """Randomly pick the next word, weighted by n-gram counts.

    Args:
        n: Model order. ``1`` ignores ``initial`` and considers every key of
            ``counts``; any other value keeps only the keys whose leading
            tokens (all but the last) equal the tokens of ``initial``.
        counts: Dict mapping n-gram tuples to occurrence counts, e.g. the
            result of ``nGramCounter``.
        initial: Whitespace-separated context tokens.

    Returns:
        The chosen word as a string, i.e. the last token of the selected
        n-gram. This holds for ``n == 1`` as well, so the result can always be
        compared with ``'</s>'`` and joined into text.

    Raises:
        ValueError: If no n-gram in ``counts`` can follow the given context
            (including an empty ``counts``).
    """
    if(n==1):
        # Unigram model: every entry is a candidate, whatever the context.
        candidates = list(counts.keys())
    else:
        context = tuple(initial.split())
        candidates = [key for key in counts if key[:-1] == context]

    if not candidates:
        # Without this guard numpy fails on an empty probability vector with a
        # message that says nothing about the unseen context.
        raise ValueError("no n-gram in counts continues the context %r" % (initial,))

    weights = [counts[key] for key in candidates]
    total = sum(weights)  # computed once, not once per candidate
    probs = [weight/total for weight in weights]

    # Three draws from the distribution; the candidate drawn most often wins
    # (np.argmax returns the earliest index when several are tied).
    draws = np.random.multinomial(3,probs,size=None)
    return candidates[np.argmax(draws)][-1]

def nGramGenerator(n,counts,maxLength,initial):
    """Extend ``initial`` one predicted word at a time.

    Args:
        n: Model order; the last ``n - 1`` tokens of the text generated so far
            are passed to ``nGramPredictor`` as context.
        counts: N-gram count dict handed through to ``nGramPredictor``.
        maxLength: Upper bound on the number of tokens in the result. Text in
            ``initial`` is never shortened, so an ``initial`` that is already
            at or over the bound is returned as is.
        initial: Whitespace-separated seed text; may be empty.

    Returns:
        The seed tokens plus the generated words, joined by single spaces.
        Generation stops after ``'</s>'`` is produced (or if the seed already
        ends with it) or once the text holds ``maxLength`` tokens.
    """
    gen_text = initial.split()
    # An empty seed has no last word; None simply lets generation start.
    next_word = gen_text[-1] if gen_text else None
    # Strict '<' keeps the result within maxLength tokens; '<=' allowed one extra.
    while(next_word != '</s>' and len(gen_text) < maxLength):
        # For n == 1 no context is needed (a bare [-0:] slice would pass everything).
        context = gen_text[-(n - 1):] if n > 1 else []
        next_word = nGramPredictor(n,counts,' '.join(context))
        gen_text.append(next_word)
    return ' '.join(gen_text)

# [nGramGenerator(n=1,counts = counts1,maxLength=10,initial="<s>") for i in range(5)]
# Five sampled continuations per model order; one printed list per order.
for order, table, seed_text in (
    (2, counts2, "the"),
    (3, counts3, "going to"),
    (4, counts4, "we're going to"),
):
    print([
        nGramGenerator(n=order, counts=table, maxLength=100, initial=seed_text)
        for i in range(5)
    ])

["we're going to start winning again.so, second amendment protected. </s>",
 "we're going to do it fast; we have to stop doing things for some people, but for this country, and what’s going wrong with our country – massive companies – because they can’t get the banks to give you the names right now, which is not a testament to me but a testament to me but a testament to all of those bernie sanders voters who have been left out in the battlefield, trying to kill us from coming in. </s>",
 "we're going to do it, folks. </s>",
 "we're going to do is look at his credit card. </s>",
 "we're going to nevada.i lead -- i lead with the hispanics. </s>"]

## Neural Approach

In [None]:
# Load the corpus as a single lower-cased ASCII string: newlines are removed
# and any character that cannot be encoded as ASCII is dropped.
with open('speeches.txt','r',encoding='utf-8') as f:
    sent = (
        f.read()
        .replace("\n","")
        .encode('ascii','ignore')
        .decode('ascii')
        .lower()
    )
