## Importing Libraries

In [1]:
import numpy as np
import re

Using Cornell Movie dialog corpous for the model

Access corpous at https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

### Data Cleaning

In [2]:
data=[]
with open('movie_lines.txt', 'rb') as f:
    for line in f:
        data.append(line.decode(errors='ignore'))

In [3]:
data=[x.split('+++$+++')[-1].replace('\n','') for x in data]

In [4]:
len(data)

304713

In [5]:
# joining all the lines to form a single text corpus
data=" ".join(data)

In [6]:
data[:100]

" They do not!  They do to!  I hope so.  She okay?  Let's go.  Wow  Okay -- you're gonna need to lear"

In [7]:
def clean(string):
    string = re.sub(r"\'s", " ", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)

    return string.strip().lower()

In [8]:
clean_data= clean(data)
clean_data[:100]

"they do not!  they do to!  i hope so.  she okay?  let  go.  wow  okay -- you 're gonna need to learn"

In [9]:
# create a dictionary of unique words in the corpus
word_dct= list(set(re.split(r'(?<!\d)\.(?!\d)|[\?`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,/<>?\s]', clean_data)))
word_dct.remove("")
len(word_dct)

49576

In [10]:
word_dct[:10]

['mechanisms',
 'currents',
 'nickel',
 'undetectable',
 'skua',
 'lernt',
 'hypnotize',
 'morema',
 'gottingen',
 'presumption']

In [11]:
#reverse dictionary for mapping word to indices
rev_dct = {j:i for i,j in enumerate(word_dct)}

#checking the index of a random word from corpus
rev_dct['hi'],rev_dct['unforeseen']

(38948, 39985)

In [12]:
w_count=len(word_dct)

In [13]:
#2-d to store count of next word based on current word
uni_count=np.zeros((w_count,w_count))

#count of current word
word_count=np.zeros(w_count)

#tokenisation based on spaces and special characters with decimal being taken care of
all_words=list(re.split(r'(?<!\d)\.(?!\d)|[\?`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,/<>?\s]', clean_data))
# while '' in all_words: all_words.remove('')
all_words=list(filter(('').__ne__, all_words))

In [14]:
#iterating throughout the complete corpus to store the count
for i in range(0,len(all_words)-1):
    prev_index=rev_dct[all_words[i]]
    word_count[prev_index]+=1
    cur_index =rev_dct[all_words[i+1]]
    uni_count[prev_index,cur_index]+=1

word_count[rev_dct[all_words[-1]]]+=1

## Probability of next word

Formula to calculate the probability of next word W’ if the current word is W in the corpus is

$$ P(W'|W) = \frac{\text{Count}(W,W')}{\text{Count}(W)} $$

where, $ \text{Count}(W,W’)$ is number of times $W’$ follows $W$ and Count$(W)$ is total occurrence of $W$ in the
corpus.


In [15]:
#calculating probability matrix using word count and next word count using the formula
for i in range(len(uni_count)):
    uni_count[i]=uni_count[i]/word_count[i]

In [16]:
# np.savetxt("prob_matrix.csv", uni_count, delimiter=",")

In [17]:
#function to predict next words based on a single word.
def predict(word,limit=10):
    
    cur_word=word.lower()
    print (cur_word,end=' ')
    for i in range(limit):
        #index of next word using probability matrix
        cur_prob_idx = uni_count[rev_dct[cur_word]]
        
#         nextW_idx= np.argmax(cur_prob_idx)
        nextW_idx = np.random.choice(np.argwhere(cur_prob_idx == np.amax(cur_prob_idx)).ravel())
        #index to word using dictionary of words
        nextW=word_dct[nextW_idx]
        cur_word=nextW
        print (nextW,end=' ')
    print ()

In [18]:
predict('great')
predict("enemy")
predict("Brucie")
predict('former')
predict('Did')
predict('Harry')
predict('dressing')
predict('creative')
predict('Already')
predict('feminist')

great i m not a little girl i m not a 
enemy is n t know what i m not a little 
brucie best friend of the way to be a little girl 
former commander i m not a little girl i m not 
did n t know what i m not a little girl 
harry i m not a little girl i m not a 
dressing room and i m not a little girl i m 
creative power to be a little girl i m not a 
already know what i m not a little girl i m 
feminist prose i m not a little girl i m not 
