In [37]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import copy

import random


In [38]:
corpus = []
f = open('alice_in_wonderland.txt','r')
while(1):
    line =  f.readline()
    if len(line) == 0: break
    corpus.extend(line.split())
        
f.close()
corpus = ' '.join(corpus)

def clean_word(word):
    word = word.lower()
    for punctuation in ['"',"'",'.',',','-','?','!',';',':','—','(',')','[',']']:
        word = word.split(punctuation)[0]
    return word



corpus = [clean_word(word) for word in corpus.split()]
corpus = [word for word in corpus if len(word) > 0]
print(corpus[:25])
D = len(corpus)
print('corpus len: ',D)

['alice', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', 'the', 'millennium', 'fulcrum', 'edition', '3', 'contents', 'chapter', 'i', 'down', 'the', 'rabbit', 'chapter', 'ii', 'the', 'pool', 'of', 'tears', 'chapter']
corpus len:  25320


In [3]:
tokenize = {}
dictionary = []
token = 0
for word in corpus:
    if word not in tokenize.keys():
        tokenize[word] = token
        dictionary.append(word)
        token += 1
    
V = len(dictionary)
print('dictionary size (number of distinct words): ', V)



dictionary size (number of distinct words):  2637


In [31]:
#past word as feature

posterior_1word = np.zeros((V, V))
for k in range(1,D):
    word = corpus[k]
    token = tokenize[word]
    
    past_word = corpus[k-1]
    past_token = tokenize[past_word]
    posterior_1word[past_token, token] += 1
    

prior = np.zeros(V)
for k in range(D-1):
    word = corpus[k]
    token = tokenize[word]
    
    prior[token] += 1
prior = prior / np.sum(prior)

def get_likelihood_2gram(word):
    
    token = tokenize[word]
    likelihood = posterior_1word[token,:]*prior
    return(likelihood)
def pred_2gram(word):
    likelihood = get_likelihood_2gram(word)
    i = np.argmax(likelihood)
    return(dictionary[i], likelihood[i])
print(pred_2gram('alice'))
print(pred_2gram('the'))
print(pred_2gram('cheshire'))
print(pred_2gram('mock'))
    

('and', 0.4351672656897982)
('queen', 0.21011888305225324)
('cat', 0.007306765670050159)
('turtle', 0.1305738773253288)


In [53]:
#get accuracy of bigram classifier
correct = 0.
for k in range(1,D):
    word = corpus[k]
    prev_word = corpus[k-1]
    pred_word = pred_2gram(prev_word)[0]
    if word == pred_word: correct += 1
print(correct / (D-1))

0.22366602156483273


In [43]:
#past 2 words as features

posterior_2words = np.zeros((V, V))
for k in range(2,D):
    word = corpus[k]
    token = tokenize[word]
    
    past_word = corpus[k-2]
    past_token = tokenize[past_word]
    posterior_2words[past_token,token] += 1

posterior_2gram = np.vstack([posterior_1word,posterior_2words])


def get_likelihood_3gram(word2ago,word1ago):
    
    token1ago = tokenize[word1ago]
    token2ago = tokenize[word2ago]
    likelihood = posterior_2gram[token1ago,:] * posterior_2gram[token2ago + V,:]  * prior
    return likelihood
def pred_3gram(word2ago,word1ago):
    likelihood = get_likelihood_3gram(word2ago,word1ago)
    i = np.argmax(likelihood)
    
    return dictionary[i], likelihood[i]
print(pred_3gram('pack','of'))
print(pred_3gram('the','mad'))
print(pred_3gram('she','jumped'))

    

('cards', 0.0010663928275208342)
('you', 0.707018444646313)
('up', 0.19716418499940755)


In [54]:
#get accuracy of bigram classifier
correct = 0.
for k in range(2,D):
    word = corpus[k]
    prev_word = corpus[k-1]
    prev2_word = corpus[k-2]
    pred_word = pred_3gram(prev2_word,prev_word)[0]
    if word == pred_word: correct += 1
print(correct / (D-1))

0.36087523203918004


In [55]:
word = 'alice'
gen_2gram = [word]
for i in range(25):
    word = pred_2gram(word)[0]
    gen_2gram.append(word)
print(' '.join(gen_2gram))


word_bigram = ['alice','was']
gen_3gram = word_bigram
for i in range(25):
    word = pred_3gram(word_bigram[0],word_bigram[1])[0]
    gen_3gram.append(word)
    word_bigram = [word_bigram[1],word]
print(' '.join(gen_3gram))



alice and the queen and the queen and the queen and the queen and the queen and the queen and the queen and the queen and
alice was the little of the queen and the queen and the queen and the queen and the queen and the queen and the queen and the


In [56]:
word = 'alice'
gen_2gram = [word]
for i in range(25):
    likelihood = get_likelihood_2gram(word)
    word = random.choices(dictionary,likelihood)[0]
    gen_2gram.append(word)
print(' '.join(gen_2gram))


word_bigram = ['alice','was']
gen_3gram = word_bigram
for i in range(25):
    likelihood = get_likelihood_3gram(word_bigram[0],word_bigram[1])
    word = random.choices(dictionary,likelihood)[0]
    gen_3gram.append(word)
    word_bigram = [word_bigram[1],word]
print(' '.join(gen_3gram))



alice it said the mouse the mock turtle to the queen had a little alice was the poor alice as the gryphon the hatter the other
alice was the white and that the gryphon and the other and i to be a very little of it was the little the mock turtle in
