In [2]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import copy

import random


In [3]:
# Load the raw book text and collapse every run of whitespace (line breaks
# included) into a single space, leaving `corpus` as one long string.
# The `with` block closes the file even if reading raises.
with open('alice_in_wonderland.txt','r') as f:
    corpus = ' '.join(f.read().split())

def clean_word(word):
    """Lower-case `word` and cut it off at the first punctuation mark.

    Everything from the earliest occurrence of any listed mark onwards is
    discarded, so "don't" becomes "don" and a token that starts with a mark
    becomes the empty string.

    Parameters
    ----------
    word : str
        A single whitespace-delimited token.

    Returns
    -------
    str
        The lower-cased prefix that precedes the first punctuation mark.
    """
    cleaned = word.lower()
    for mark in ('"', "'", '.', ',', '-', '?', '!', ';', ':', '—', '(', ')', '[', ']'):
        # Keep only what precedes the first occurrence of this mark.
        cleaned, _, _ = cleaned.partition(mark)
    return cleaned



# Normalise each whitespace-separated token and keep only the non-empty
# results. `corpus` is rebound here from one long string to a list of words.
corpus = [token for token in map(clean_word, corpus.split()) if token]
print(corpus[:25])
D = len(corpus)  # total number of word tokens in the cleaned corpus
print('corpus len: ',D)

['alice', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', 'the', 'millennium', 'fulcrum', 'edition', '3', 'contents', 'chapter', 'i', 'down', 'the', 'rabbit', 'chapter', 'ii', 'the', 'pool', 'of', 'tears', 'chapter']
corpus len:  25320


In [4]:
# Give every distinct word an integer id in order of first appearance.
# `tokenize` maps word -> id and `wordlist` maps id -> word.
tokenize = {}
wordlist = []
for word in corpus:
    if word in tokenize:
        continue
    tokenize[word] = len(wordlist)
    wordlist.append(word)

V = len(wordlist)
token = V  # next unused id
print('word list size (number of distinct words): ', V)



word list size (number of distinct words):  2501


In [5]:
# counts_2gram[cur, prev] is the number of times the word with id `cur`
# comes directly after the word with id `prev` in the corpus.
counts_2gram = np.zeros((V,V))
for prev_word, cur_word in zip(corpus, corpus[1:]):
    counts_2gram[tokenize[cur_word], tokenize[prev_word]] += 1
print(counts_2gram)
    

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [9. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [73]:
#past word as feature

# Zero-initialised module-level placeholders (a V x V matrix and a length-V
# vector). NOTE(review): nothing in the visible part of the notebook reads
# these two globals; confirm they are unused before removing them.
posterior_1word = np.zeros((V, V))
prior = np.zeros(V)

def get_likelihood_2gram(word):
    """Score every vocabulary word as the direct successor of `word`.

    Entry j of the result is::

        counts_2gram[j, t] / counts_2gram[:, t].sum() * (n / D)

    where ``t = tokenize[word]`` and ``n`` is the number of times `word`
    occurs in `corpus`.

    Parameters
    ----------
    word : str
        The preceding word; must be a key of `tokenize`.

    Returns
    -------
    numpy.ndarray
        A new array with one score per vocabulary word. `counts_2gram`
        itself is not modified.

    Raises
    ------
    KeyError
        If `word` is not in `tokenize`.

    Notes
    -----
    If the selected column of `counts_2gram` sums to zero (the word is never
    followed by anything), the division is 0/0 and every entry is NaN.
    """
    token = tokenize[word]
    # Only one column of the count matrix is needed, so slice it instead of
    # copying and transposing the whole V x V matrix on every call.
    followers = counts_2gram[:, token]
    # Share of all corpus tokens that are `word`.
    word_share = corpus.count(word) / D
    return followers / followers.sum() * word_share
def pred_2gram(word):
    """Return the top-scoring successor of `word` and its score.

    Parameters
    ----------
    word : str
        The preceding word, passed straight to `get_likelihood_2gram`.

    Returns
    -------
    tuple
        ``(best_word, best_score)`` where `best_word` comes from `wordlist`
        at the position of the largest score.
    """
    scores = get_likelihood_2gram(word)
    best = int(np.argmax(scores))
    return wordlist[best], scores[best]

def classification_accuracy():
    """Accuracy of the one-word-context predictor over the corpus.

    For every adjacent pair ``(previous, actual)`` in `corpus`, the
    prediction is the word in `wordlist` with the largest score from
    ``get_likelihood_2gram(previous)``; a hit is counted when it equals
    `actual`.

    The prediction depends only on `previous`, so it is computed once per
    distinct word and reused, instead of being recomputed at every position.

    Returns
    -------
    float
        ``hits / (len(corpus) - 1)``.

    Raises
    ------
    ZeroDivisionError
        If `corpus` contains exactly one word.
    """
    predicted_after = {}  # word -> its top-scoring successor
    hits = 0
    for previous, actual in zip(corpus, corpus[1:]):
        if previous not in predicted_after:
            best = np.argmax(get_likelihood_2gram(previous))
            predicted_after[previous] = wordlist[best]
        if predicted_after[previous] == actual:
            hits += 1
    return hits / (len(corpus) - 1)
        
        
# Spot-check the one-word-context predictor on a few words, then report its
# accuracy over the whole corpus.
for probe in ('alice', 'the', 'cheshire', 'mock', 'cat', 'turtle'):
    print(pred_2gram(probe))
print("The accuracy for n=1")
print(classification_accuracy())

    

('was', 0.0007109004739336493)
('queen', 0.0027646129541864135)
('cat', 0.00019747235387045816)
('turtle', 0.0022511848341232226)
('and', 0.00015797788309636652)
('said', 0.00015797788309636652)
The accuracy for n=1
0.2453493423910897


In [70]:
#past 2 words as features

# NOTE(review): not read anywhere in the visible part of the notebook.
posterior_2words = np.zeros((V, V))


def _offset_counts(offset):
    """Count word pairs that sit exactly `offset` positions apart.

    Returns a V x V matrix ``M`` where ``M[cur, ago]`` is the number of
    positions ``i >= offset`` with ``corpus[i]`` having id `cur` and
    ``corpus[i - offset]`` having id `ago`.
    """
    counts = np.zeros((V,V))
    # zip pairs corpus[offset + j] with corpus[j] and stops at the shorter list.
    for cur_word, ago_word in zip(corpus[offset:], corpus):
        counts[tokenize[cur_word], tokenize[ago_word]] += 1
    return counts


# One matrix per look-back distance from 2 to 10 (distance 1 is counts_2gram).
pre2_gram = _offset_counts(2)
pre3_gram = _offset_counts(3)
pre4_gram = _offset_counts(4)
pre5_gram = _offset_counts(5)
pre6_gram = _offset_counts(6)
pre7_gram = _offset_counts(7)
pre8_gram = _offset_counts(8)
pre9_gram = _offset_counts(9)
pre10_gram = _offset_counts(10)
    
def get_likelihood_3gram(word2ago,word1ago):
    """Score every vocabulary word given the two preceding words.

    With ``t1 = tokenize[word1ago]``, ``t2 = tokenize[word2ago]``, ``c`` the
    per-word occurrence counts over `corpus` and ``share = c / c.sum()``,
    entry j of the result is the product of::

        counts_2gram[j, t1] / counts_2gram[:, t1].sum() * (c[t1] / D)
        pre2_gram[j, t2]   / pre2_gram[:, t2].sum()   * (c[t2] / D) / share[j]

    Parameters
    ----------
    word2ago, word1ago : str
        The words two and one positions back; both must be keys of `tokenize`.

    Returns
    -------
    numpy.ndarray
        A new array with one score per vocabulary word. The count matrices
        are not modified.

    Raises
    ------
    KeyError
        If either word is not in `tokenize`.

    Notes
    -----
    If a selected column sums to zero the division is 0/0 and every entry of
    the result is NaN.
    """
    i1 = tokenize[word1ago]
    i2 = tokenize[word2ago]

    # Occurrence count of every word, gathered in a single pass.
    c = np.zeros(V)
    for word in corpus:
        c[tokenize[word]] += 1
    share = c / c.sum()

    # Slice the one needed column of each matrix instead of copying the
    # whole V x V matrices.
    followers1 = counts_2gram[:, i1]
    followers2 = pre2_gram[:, i2]

    likelihood1 = followers1 / followers1.sum() * (c[i1] / D)
    likelihood2 = followers2 / followers2.sum() * (c[i2] / D) / share
    return likelihood1 * likelihood2
def pred_3gram(word2ago,word1ago):
    """Return the top-scoring next word after a two-word context.

    Both arguments are forwarded unchanged to `get_likelihood_3gram`.

    Returns
    -------
    tuple
        ``(best_word, best_score)`` taken at the position of the largest score.
    """
    scores = get_likelihood_3gram(word2ago,word1ago)
    best = int(np.argmax(scores))
    return wordlist[best], scores[best]
#print(pred_3gram('pack','of'))
#print(pred_3gram('the','mad'))
#print(pred_3gram('she','jumped'))
#print(pred_3gram('four','thousand'))


def get_likelihood_4gram(word4ago,word3ago,word2ago,word1ago):
    """Score every vocabulary word given the four preceding words.

    For each look-back distance k (1 to 4) with context token ``t_k`` and
    count matrix ``M_k`` (`counts_2gram`, `pre2_gram`, `pre3_gram`,
    `pre4_gram`), a term is formed::

        M_k[:, t_k] / M_k[:, t_k].sum() * (c[t_k] / D)

    where ``c`` holds the per-word occurrence counts over `corpus`. The four
    terms are multiplied together and the product is divided three times by
    ``c / c.sum()``.

    Parameters
    ----------
    word4ago, word3ago, word2ago, word1ago : str
        Context words, furthest first; each must be a key of `tokenize`.

    Returns
    -------
    numpy.ndarray
        A new array with one score per vocabulary word. The count matrices
        are not modified.

    Raises
    ------
    KeyError
        If a context word is not in `tokenize`.

    Notes
    -----
    If a selected column sums to zero the division is 0/0 and every entry of
    the result is NaN.
    """
    # Nearest word first, each paired with the matrix for its distance.
    context = (
        (word1ago, counts_2gram),
        (word2ago, pre2_gram),
        (word3ago, pre3_gram),
        (word4ago, pre4_gram),
    )
    tokens = [tokenize[word] for word, _ in context]

    # Occurrence count of every word, gathered in a single pass.
    c = np.zeros(V)
    for word in corpus:
        c[tokenize[word]] += 1
    share = c / c.sum()

    likelihood = None
    for token, (_, matrix) in zip(tokens, context):
        # One column is enough; no need to copy the whole V x V matrix.
        followers = matrix[:, token]
        term = followers / followers.sum() * (c[token] / D)
        likelihood = term if likelihood is None else likelihood * term

    # One division by the word share for every context word beyond the first.
    for _ in range(len(context) - 1):
        likelihood = likelihood / share
    return likelihood
def pred_4gram(word4ago,word3ago,word2ago,word1ago):
    """Return the top-scoring next word after a four-word context.

    All arguments are forwarded unchanged to `get_likelihood_4gram`.

    Returns
    -------
    tuple
        ``(best_word, best_score)`` taken at the position of the largest score.
    """
    scores = get_likelihood_4gram(word4ago,word3ago,word2ago,word1ago)
    best = int(np.argmax(scores))
    return wordlist[best], scores[best]

#print(pred_4gram ('what', 'an', 'ignorant', 'little'))
    
def get_likelihood_5gram(word5ago,word4ago,word3ago,word2ago,word1ago):
    """Score every vocabulary word given the five preceding words.

    For each look-back distance k (1 to 5) with context token ``t_k`` and
    count matrix ``M_k`` (`counts_2gram`, `pre2_gram` ... `pre5_gram`), a
    term is formed::

        M_k[:, t_k] / M_k[:, t_k].sum() * (c[t_k] / D)

    where ``c`` holds the per-word occurrence counts over `corpus`. The five
    terms are multiplied together and the product is divided four times by
    ``c / c.sum()``.

    Parameters
    ----------
    word5ago ... word1ago : str
        Context words, furthest first; each must be a key of `tokenize`.

    Returns
    -------
    numpy.ndarray
        A new array with one score per vocabulary word. The count matrices
        are not modified.

    Raises
    ------
    KeyError
        If a context word is not in `tokenize`.

    Notes
    -----
    If a selected column sums to zero the division is 0/0 and every entry of
    the result is NaN.
    """
    # Nearest word first, each paired with the matrix for its distance.
    context = (
        (word1ago, counts_2gram),
        (word2ago, pre2_gram),
        (word3ago, pre3_gram),
        (word4ago, pre4_gram),
        (word5ago, pre5_gram),
    )
    tokens = [tokenize[word] for word, _ in context]

    # Occurrence count of every word, gathered in a single pass.
    c = np.zeros(V)
    for word in corpus:
        c[tokenize[word]] += 1
    share = c / c.sum()

    likelihood = None
    for token, (_, matrix) in zip(tokens, context):
        # One column is enough; no need to copy the whole V x V matrix.
        followers = matrix[:, token]
        term = followers / followers.sum() * (c[token] / D)
        likelihood = term if likelihood is None else likelihood * term

    # One division by the word share for every context word beyond the first.
    for _ in range(len(context) - 1):
        likelihood = likelihood / share
    return likelihood
def pred_5gram(word5ago,word4ago,word3ago,word2ago,word1ago):
    """Return the top-scoring next word after a five-word context.

    All arguments are forwarded unchanged to `get_likelihood_5gram`.

    Returns
    -------
    tuple
        ``(best_word, best_score)`` taken at the position of the largest score.
    """
    scores = get_likelihood_5gram(word5ago,word4ago,word3ago,word2ago,word1ago)
    best = int(np.argmax(scores))
    return wordlist[best], scores[best]

def get_likelihood_t3gram(word3ago,word2ago,word1ago):
    """Score every vocabulary word given the three preceding words.

    For each look-back distance k (1 to 3) with context token ``t_k`` and
    count matrix ``M_k`` (`counts_2gram`, `pre2_gram`, `pre3_gram`), a term
    is formed::

        M_k[:, t_k] / M_k[:, t_k].sum() * (c[t_k] / D)

    where ``c`` holds the per-word occurrence counts over `corpus`. The three
    terms are multiplied together and the product is divided twice by
    ``c / c.sum()``.

    Parameters
    ----------
    word3ago, word2ago, word1ago : str
        Context words, furthest first; each must be a key of `tokenize`.

    Returns
    -------
    numpy.ndarray
        A new array with one score per vocabulary word. The count matrices
        are not modified.

    Raises
    ------
    KeyError
        If a context word is not in `tokenize`.

    Notes
    -----
    If a selected column sums to zero the division is 0/0 and every entry of
    the result is NaN.
    """
    # Nearest word first, each paired with the matrix for its distance.
    context = (
        (word1ago, counts_2gram),
        (word2ago, pre2_gram),
        (word3ago, pre3_gram),
    )
    tokens = [tokenize[word] for word, _ in context]

    # Occurrence count of every word, gathered in a single pass.
    c = np.zeros(V)
    for word in corpus:
        c[tokenize[word]] += 1
    share = c / c.sum()

    likelihood = None
    for token, (_, matrix) in zip(tokens, context):
        # One column is enough; no need to copy the whole V x V matrix.
        followers = matrix[:, token]
        term = followers / followers.sum() * (c[token] / D)
        likelihood = term if likelihood is None else likelihood * term

    # One division by the word share for every context word beyond the first.
    for _ in range(len(context) - 1):
        likelihood = likelihood / share
    return likelihood
def pred_t3gram(word3ago,word2ago,word1ago):
    """Return the top-scoring next word after a three-word context.

    All arguments are forwarded unchanged to `get_likelihood_t3gram`.

    Returns
    -------
    tuple
        ``(best_word, best_score)`` taken at the position of the largest score.
    """
    scores = get_likelihood_t3gram(word3ago,word2ago,word1ago)
    best = int(np.argmax(scores))
    return wordlist[best], scores[best]

#print(pred_t3gram('the', 'mad', 'hatter'))
def text_gene(word3ago,word2ago,word1ago):
    """Print 25 words by greedily extending a three-word context.

    At each step the word with the largest `get_likelihood_t3gram` score is
    printed and appended to the context, and the oldest context word is
    dropped. Nothing is returned.
    """
    window = [word3ago, word2ago, word1ago]
    scores = get_likelihood_t3gram(*window)
    for _ in range(25):
        chosen = wordlist[np.argmax(scores)]
        print(chosen)
        # Slide the window forward by one word and re-score.
        window = window[1:] + [chosen]
        scores = get_likelihood_t3gram(*window)

def text_gene1(word3ago,word2ago,word1ago):
    """Print 25 words sampled at random from a sliding three-word context.

    At each step one word is drawn from `wordlist` with `random.choices`,
    using the `get_likelihood_t3gram` scores as weights. The drawn word is
    printed and appended to the context, and the oldest context word is
    dropped. Nothing is returned.
    """
    weights = get_likelihood_t3gram(word3ago,word2ago,word1ago)
    window = (word3ago, word2ago, word1ago)
    for _ in range(25):
        # k = 1 gives a one-element list; unpack its single word.
        (sampled,) = random.choices(wordlist, weights = weights, k = 1)
        print(sampled)
        window = (window[1], window[2], sampled)
        weights = get_likelihood_t3gram(*window)
    
#text_gene('the', 'mad', 'hatter')

#print(pred_5gram('falling', 'down', 'a', 'very', 'deep'))
#text_gene1('the', 'mad', 'hatter')

def get_likelihood_10gram(word10ago,word9ago,word8ago,word7ago,word6ago,word5ago,word4ago,word3ago,word2ago,word1ago):
    """Score every vocabulary word given the ten preceding words.

    For each look-back distance k (1 to 10) with context token ``t_k`` and
    count matrix ``M_k`` (`counts_2gram`, `pre2_gram` ... `pre10_gram`), a
    term is formed::

        M_k[:, t_k] / M_k[:, t_k].sum() * (c[t_k] / D)

    where ``c`` holds the per-word occurrence counts over `corpus`. The ten
    terms are multiplied together and the product is divided nine times by
    ``c / c.sum()``.

    Parameters
    ----------
    word10ago ... word1ago : str
        Context words, furthest first; each must be a key of `tokenize`.

    Returns
    -------
    numpy.ndarray
        A new array with one score per vocabulary word. The count matrices
        are not modified.

    Raises
    ------
    KeyError
        If a context word is not in `tokenize`.

    Notes
    -----
    If a selected column sums to zero the division is 0/0 and every entry of
    the result is NaN.
    """
    # Nearest word first, each paired with the matrix for its distance.
    context = (
        (word1ago, counts_2gram),
        (word2ago, pre2_gram),
        (word3ago, pre3_gram),
        (word4ago, pre4_gram),
        (word5ago, pre5_gram),
        (word6ago, pre6_gram),
        (word7ago, pre7_gram),
        (word8ago, pre8_gram),
        (word9ago, pre9_gram),
        (word10ago, pre10_gram),
    )
    tokens = [tokenize[word] for word, _ in context]

    # Occurrence count of every word, gathered in a single pass.
    c = np.zeros(V)
    for word in corpus:
        c[tokenize[word]] += 1
    share = c / c.sum()

    likelihood = None
    for token, (_, matrix) in zip(tokens, context):
        # One column is enough; no need to copy the whole V x V matrix.
        followers = matrix[:, token]
        term = followers / followers.sum() * (c[token] / D)
        likelihood = term if likelihood is None else likelihood * term

    # One division by the word share for every context word beyond the first.
    for _ in range(len(context) - 1):
        likelihood = likelihood / share
    return likelihood

def classification_n3accuracy():
    """Accuracy of the three-word-context predictor over the corpus.

    Starting at the fourth word, each word is compared with the top-scoring
    guess from `get_likelihood_t3gram` applied to the three actual words
    before it.

    Returns
    -------
    float
        ``hits / (len(corpus) - 3)``.
    """
    # Seed the window with the first three corpus words.
    window = [corpus[0], corpus[1], corpus[2]]
    hits = 0
    for actual in corpus[3:]:
        guess = wordlist[np.argmax(get_likelihood_t3gram(*window))]
        if guess == actual:
            hits += 1
        # Slide forward using the real word, not the guess.
        window = window[1:] + [actual]
    return hits / (len(corpus) - 3)
    


def classification_n5accuracy():
    """Accuracy of the five-word-context predictor over the corpus.

    Starting at the sixth word, each word is compared with the top-scoring
    guess from `get_likelihood_5gram` applied to the five actual words
    before it.

    Returns
    -------
    float
        ``hits / (len(corpus) - 5)``.
    """
    # Seed the window with the first five corpus words.
    window = [corpus[0], corpus[1], corpus[2], corpus[3], corpus[4]]
    hits = 0
    for actual in corpus[5:]:
        guess = wordlist[np.argmax(get_likelihood_5gram(*window))]
        if guess == actual:
            hits += 1
        # Slide forward using the real word, not the guess.
        window = window[1:] + [actual]
    return hits / (len(corpus) - 5)
    


def classification_n10accuracy():
    """Accuracy of the ten-word-context predictor over the corpus.

    Starting at the eleventh word, each word is compared with the
    top-scoring guess from `get_likelihood_10gram` applied to the ten actual
    words before it.

    Returns
    -------
    float
        ``hits / (len(corpus) - 10)``.
    """
    # Seed the window with the first ten corpus words.
    window = [corpus[k] for k in range(10)]
    hits = 0
    for actual in corpus[10:]:
        guess = wordlist[np.argmax(get_likelihood_10gram(*window))]
        if guess == actual:
            hits += 1
        # Slide forward using the real word, not the guess.
        window = window[1:] + [actual]
    return hits / (len(corpus) - 10)
    
#print(pred_3gram('pack','of'))
#print(pred_3gram('the','mad'))
#print(pred_3gram('she','jumped'))
# Single-word predictions from two-, four- and five-word contexts.
print(pred_3gram('four','thousand'))
print(pred_4gram ('what', 'an', 'ignorant', 'little'))
print(pred_5gram('falling', 'down', 'a', 'very', 'deep'))

# 25-word continuations of the same prompt: greedy first, then sampled.
for heading, generate in (("Answer for 4c", text_gene), ("Answer for 4d", text_gene1)):
    print(heading)
    generate('the', 'mad', 'hatter')

# Accuracy over the corpus for the three longer context sizes.
for heading, evaluate in (
    ("The accuracy for n=3", classification_n3accuracy),
    ("The accuracy for n=5", classification_n5accuracy),
    ("The accuracy for n=10", classification_n10accuracy),
):
    print(heading)
    print(evaluate())

('miles', 1.3164823591363875e-05)
('girl', 1.8513033175355453e-06)
('well', 4.7027976363754134e-11)
Answer for 4c
with
this
as
she
could
guess
she
was
now
about
two
feet
high
even
then
they
walked
off
together
alice
heard
a
little
pattering
of
Answer for 4d
with
this
said
the
king
saves
a
world
of
trouble
you
know
you
know
why
do
you
call
it
sad
and
she
kept
on
puzzling
The accuracy for n=3
0.7499703756369238
The accuracy for n=5
0.9401935611297649
The accuracy for n=10
0.9960489924930858


In [91]:
def text_gene1(word3ago,word2ago,word1ago):
    """Print 25 randomly sampled words that continue a three-word context.

    Each step draws one word from `wordlist` via `random.choices`, weighted
    by the `get_likelihood_t3gram` scores for the current context. The word
    is printed, the context slides forward by one, and the scores are
    recomputed. Nothing is returned.
    """
    older, old, last = word3ago, word2ago, word1ago
    weights = get_likelihood_t3gram(older, old, last)
    for _ in range(25):
        # k = 1 gives a one-element list; take its only item.
        drawn = random.choices(wordlist, weights = weights, k = 1)[0]
        print(drawn)
        older, old, last = old, last, drawn
        weights = get_likelihood_t3gram(older, old, last)
        
# Draw another random 25-word continuation of the prompt. No random seed is
# set in the visible cells, so the printed words can differ on every run.
text_gene1('the', 'mad', 'hatter')

with
this
as
she
was
shrinking
rapidly
so
she
set
to
work
throwing
everything
within
her
reach
at
the
duchess
and
seemed
ready
to
agree
