In [1]:
import codecs

import numpy as np
import regex as re
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import SimpleRNN
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Read the whole corpus. The context manager guarantees the handle is closed,
# even if reading or decoding fails.
with codecs.open('speeches.txt', 'r', 'UTF-8') as file:
    raw_text = file.read()

# NOTE(review): line breaks are removed without inserting a space, so the last
# word of one line is fused to the first word of the next. Left unchanged here
# because it affects every count computed downstream -- confirm this is intended.
raw_text = raw_text.replace("\r\n","")
# Strip all digits from the text.
raw_text = re.sub(r"[0-9]","",raw_text)

# Lower-case each sentence and wrap it in start/end-of-sentence markers.
sent_tokenize_list = sent_tokenize(raw_text)
sentences = ['<s> ' + s.lower() + ' </s>' for s in sent_tokenize_list]

# Sequential (unshuffled) 80/20 split; the boundary is computed once so both
# slices are guaranteed to use the same index.
split_at = int(len(sentences)*0.8)
train_list,test_list = sentences[:split_at], sentences[split_at:]
train_list[10:12]

['<s> i love the people of iowa. </s>', "<s> so that's the way it is. </s>"]

## Classical Approach

In [3]:
def nGramCounter(train,n):
    """Count every n-gram occurring in a collection of sentences.

    Each sentence is split on whitespace and a window of ``n`` consecutive
    tokens is slid across it; windows never span two sentences.

    Args:
        train: iterable of sentence strings.
        n: number of tokens per n-gram.

    Returns:
        dict mapping each n-gram (a tuple of ``n`` token strings) to the
        number of times it was seen, in order of first appearance.
    """
    counts = {}
    for sentence in train:
        words = sentence.split()
        # A sentence shorter than n yields an empty range and adds nothing.
        for start in range(len(words) - n + 1):
            gram = tuple(words[start:start + n])
            counts[gram] = counts.get(gram, 0) + 1
    return counts

def nGramMLE(inp,counts):
    """Maximum-likelihood estimate for an n-gram from a table of counts.

    For a single token the estimate is its count divided by the total of all
    counts. For a longer n-gram it is its count divided by the summed counts
    of every n-gram in the table that shares the same leading tokens (all but
    the last one).

    Args:
        inp: whitespace-separated n-gram, e.g. ``"going to"``.
        counts: dict mapping n-gram tuples to counts.

    Returns:
        The estimate as a float, or the integer 0 when the n-gram does not
        appear in ``counts``.
    """
    gram = tuple(inp.split())
    if gram not in counts:
        return 0
    if len(gram) == 1:
        return counts[gram] / sum(counts.values())
    history = gram[:-1]
    history_total = sum(freq for key, freq in counts.items() if key[:-1] == history)
    return counts[gram] / history_total

# Count tables for unigrams through 4-grams over the training sentences.
counts1, counts2, counts3, counts4 = [
    nGramCounter(train_list, order) for order in (1, 2, 3, 4)
]

# Spot-check the MLE on one phrase per n-gram order.
for phrase, table in [
    ("the", counts1),
    ("going to", counts2),
    ("we're going to", counts3),
    ("<s> we're going to", counts4),
]:
    print(nGramMLE(phrase, table))

0.03004506102287208
0.9093093093093093
0.88
1.0


In [4]:
def nGramPredictor(n,counts,initial):
    """Randomly pick the next token given a context, using n-gram counts.

    For ``n == 1`` every unigram in ``counts`` is a candidate and ``initial``
    is ignored. Otherwise the candidates are the n-grams whose leading
    ``n - 1`` tokens equal the whitespace-split ``initial``. Candidates are
    weighted by their relative counts.

    Args:
        n: order of the n-gram table.
        counts: dict mapping n-gram tuples to counts.
        initial: whitespace-separated context (the previous ``n - 1`` tokens).

    Returns:
        The chosen next token as a string (the last element of the selected
        n-gram), for every value of ``n``.

    Raises:
        ValueError: if no n-gram in ``counts`` continues the given context.
    """
    if n == 1:
        candidates = list(counts.keys())
        weights = list(counts.values())
    else:
        context = tuple(initial.split())
        candidates = []
        weights = []
        for gram, freq in counts.items():
            if gram[:-1] == context:
                candidates.append(gram)
                weights.append(freq)

    if not candidates:
        # Without this guard the failure surfaces as an opaque NumPy error
        # about taking argmax of an empty sequence.
        raise ValueError(
            "no %d-gram in counts continues the context %r" % (n, initial)
        )

    # Normalise once; recomputing the total for every element is quadratic.
    total = sum(weights)
    probs = [weight / total for weight in weights]

    # NOTE(review): this takes three multinomial draws and returns the
    # candidate drawn most often (ties go to the earliest candidate). That is
    # not the same as a single draw from the distribution; kept as-is so the
    # sampling behaviour is unchanged.
    draws = np.random.multinomial(3, probs, size=None)
    # [-1] extracts the token itself, so unigrams yield 'word', not ('word',).
    return candidates[np.argmax(draws)][-1]

def nGramGenerator(n,counts,maxLength,initial):
    """Generate text by repeatedly predicting the next token.

    Starting from the tokens of ``initial``, tokens are appended one at a time
    via ``nGramPredictor`` until the end-of-sentence marker ``'</s>'`` is
    produced or the text holds ``maxLength`` tokens.

    Args:
        n: order of the n-gram table.
        counts: dict mapping n-gram tuples to counts.
        maxLength: maximum number of tokens in the result, seed tokens
            included. A seed that is already longer is returned unchanged.
        initial: whitespace-separated seed text.

    Returns:
        The seed followed by the generated tokens, joined by single spaces.
    """
    gen_text = initial.split()
    # An empty seed has no last token; None never equals '</s>'.
    next_word = gen_text[-1] if gen_text else None
    # Strict '<' so the result never exceeds maxLength tokens.
    while next_word != '</s>' and len(gen_text) < maxLength:
        # The context is the last n-1 tokens; a unigram model has none.
        context = gen_text[-(n - 1):] if n > 1 else []
        next_word = nGramPredictor(n, counts, ' '.join(context))
        gen_text.append(next_word)
    return ' '.join(gen_text)

# Generation samples from NumPy's global RNG; seed it so Restart & Run All
# reproduces the same sentences.
SEED = 42
np.random.seed(SEED)

# [nGramGenerator(n=1,counts = counts1,maxLength=10,initial="<s>") for i in range(5)]
# Five sample sentences per model order, each seeded with n-1 context tokens.
print([nGramGenerator(n=2,counts = counts2,maxLength=100,initial="the") for i in range(5)])
print([nGramGenerator(n=3,counts = counts3,maxLength=100,initial="going to") for i in range(5)])
print([nGramGenerator(n=4,counts = counts4,maxLength=100,initial="we're going to") for i in range(5)])

['the country. </s>', 'the best way to make our military so unfair. </s>', 'the hell out of you, you can’t do you can keep it was a couple of the next years. </s>', 'the only the people were radicalized and i think it’s going to do it doesn’t work. </s>', 'the united states. </s>']
['going to forget about school. </s>', 'going to happen. </s>', 'going to do it themselves, or it will be by both parties — democrats, republicans, independents, everybody, as well as you know what he did one deal. </s>', 'going to do is i’m going to get the five people that want to help the christians, nothing, and we don’t know what that means, he has a baby, lives in asia or lives in asia or lives in mexico if i was with my wife, melania, and we have to open up things, press a computer, takes you minutes.well, the same thing, you have to be privatization. </s>', 'going to be thrilled to be a great quarterback that’s pretty interesting – i don’t have to know what is that the president deems detrimental to 

## Neural Approach

In [None]:
# Every pair of adjacent tokens within each training sentence, as a
# two-element list [previous_token, next_token].
seq = [
    words[pos:pos + 2]
    for words in (sentence.split() for sentence in train_list)
    for pos in range(len(words) - 1)
]

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(seq)
# texts_to_sequences returns a plain list of lists, which cannot be sliced by
# column. Each pair holds exactly two tokens seen during fitting, so the
# result converts to a rectangular (num_pairs, 2) integer array.
encoded = np.array(tokenizer.texts_to_sequences(seq))

# First column is the context token id, last column the token to predict.
X = encoded[:,:-1]
y = encoded[:,-1]
# Token ids start at 1 (the Tokenizer never assigns 0), so the one-hot width
# must be len(word_index) + 1 to fit the largest id.
vocab_size = len(tokenizer.word_index) + 1
length = vocab_size
y = to_categorical(y,num_classes = length)

In [None]:
# Number of distinct ids the network must handle: one per token known to the
# fitted tokenizer, plus id 0, which the Tokenizer never assigns to a word.
vocab_size = len(tokenizer.word_index) + 1

# Next-token model: embed the single context token, run it through a simple
# recurrent layer, and output a softmax over the vocabulary.
model_rnn = Sequential()
model_rnn.add(Embedding(vocab_size,100 , input_length=1))
model_rnn.add(SimpleRNN(300))
model_rnn.add(Dropout(0.2))
model_rnn.add(Dense(vocab_size, activation='softmax'))

model_rnn.summary()

In [None]:
# Configure model_rnn for one-hot targets (categorical cross-entropy) with the
# Adam optimizer, then train for 20 epochs in mini-batches of 128.
model_rnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_rnn.fit(x=X, y=y, epochs=20, batch_size=128)

In [None]:
# Number of distinct ids the network must handle: one per token known to the
# fitted tokenizer, plus id 0, which the Tokenizer never assigns to a word.
vocab_size = len(tokenizer.word_index) + 1
# The training pairs are bigrams (one context token, one target token), so
# each example feeds n_gram - 1 = 1 token to the network.
n_gram = 2

# Same architecture as the simple-RNN model, with an LSTM recurrent layer.
model_lstm = Sequential()
model_lstm.add(Embedding(vocab_size, 100, input_length=n_gram-1))
model_lstm.add(LSTM(300))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(vocab_size, activation='softmax'))

model_lstm.summary()

In [None]:
# Configure model_lstm for one-hot targets (categorical cross-entropy) with the
# Adam optimizer, then train for 20 epochs in mini-batches of 128.
model_lstm.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_lstm.fit(x=X, y=y, epochs=20, batch_size=128)