# Markov Chain Generation
In this notebook, we generate a Markov chain for each author; these chains are used as likelihood functions during the identification process.

In [36]:
import string
import sys
import re

class MarkovChain(object):
    """
    First-order (word-pair) Markov chain built from one author's text.

    author:      name of the author (string).
    chain:       nested dictionary; chain[prev][word] is the number of
                 occurrences of word immediately after prev.
    wordCount:   dictionary; wordCount[prev] is the total number of words
                 that have occurred after prev.

    The key '*' is reserved to mean "beginning of a sentence".
    """

    def __init__(self, author):
        """
        Parameter:
            - author: name of author (string)
        """
        self.author = author
        self.chain = {'*': {}}
        self.wordCount = {'*': 0}

    def addWord(self, prevWord, word):
        """
        Record one occurrence of word following prevWord.
        Takes the previous word and the current word as strings.
        """
        # setdefault lets addWord be called with a prevWord that has not been
        # seen yet, mirroring the .get() already used for wordCount.
        followers = self.chain.setdefault(prevWord, {})
        followers[word] = followers.get(word, 0) + 1
        self.wordCount[prevWord] = self.wordCount.get(prevWord, 0) + 1

        # Every word is also a potential prefix, so make sure it has an entry.
        self.chain.setdefault(word, {})

    def addSentence(self, sentence):
        """
        Add every adjacent word pair of a "sentence" to the chain.
        Takes a sentence as a list of lowercase words; an empty list is a no-op.
        The words must not contain '*', which is reserved (see below).
        """
        # '*' represents the beginning of a sentence in the chain.
        # This way, we can determine the probability of a word starting a sentence.
        words = ['*'] + sentence
        for prevWord, word in zip(words, words[1:]):
            self.addWord(prevWord, word)

    def getProb(self, prevWord, word):
        """
        Return the probability (float) of getting word given prevWord.
        Takes two strings.
        Raises KeyError if the pair (prevWord, word) was never observed.
        """
        # float() forces true division; on Python 2 int / int would truncate
        # the probability to 0.
        return float(self.chain[prevWord][word]) / self.wordCount[prevWord]
    
def processGutenberg(fileName, author, make=True):
    """
    Process a Project Gutenberg text file.

    Only the text between the "*** START OF THIS PROJECT" and
    "*** END OF THIS PROJECT" marker lines is used. It is split into
    sentences on '.', '?' and '!'; each sentence is lowercased and stripped
    of every character that is not an ASCII letter, a digit or a space.

    fileName: string, path of the text file
    author: string, name passed on to makeMarkov
    make: if true (default) build and return a Markov chain; otherwise
          return the parsed sentences
    returns a MarkovChain object, or a list of sentences (each a list of
    lowercase words) when make is false.
    """
    # Built once instead of once per character. ascii_letters exists on both
    # Python 2 and 3, unlike the locale-dependent string.letters.
    allowed = frozenset(string.ascii_letters + string.digits + ' ')

    pieces = []
    # The with-block closes the file even if reading fails part-way.
    with open(fileName) as f:
        # Skip to the beginning of the actual text
        for line in f:
            if line.startswith("*** START OF THIS PROJECT"):
                break

        # Collect the body, continuing from where the loop above stopped
        for line in f:
            # Stop when hitting the end of the book
            if line.startswith("*** END OF THIS PROJECT"):
                break
            # Newlines are filtered out below, so the extra space is what
            # keeps the last word of a line apart from the first of the next.
            pieces.append(line)
            pieces.append(' ')

    text = ''.join(pieces)

    # Separate the text into a list of raw sentences
    sentences = re.split(r'[.?!]', text)

    listOSentences = []
    for sentence in sentences:
        # Make all words lowercase and strip off punctuation
        words = ''.join(ch for ch in sentence if ch in allowed).lower().split()

        # Drop fragments that contained no words at all
        if words:
            listOSentences.append(words)

    if make:
        return makeMarkov(listOSentences, author)
    return listOSentences

def makeMarkov(sentenceList, author):
    """
    Build a Markov chain for author from already-parsed sentences.
    sentenceList: list of sentences, each a list of words
    author: string
    returns a MarkovChain object.
    """
    result = MarkovChain(author)
    # Feed the sentences in one at a time; the chain accumulates the counts
    for words in sentenceList:
        result.addSentence(words)
    return result

In [43]:
# Build one Markov chain per known author from its Project Gutenberg text file.
greatExp = processGutenberg('GreatExpectations.txt', 'Charles Dickens')
frank = processGutenberg('Frankenstein.txt', 'Mary Shelley')
romeoJuliet = processGutenberg('RomeoAndJuliet.txt', 'Shakespeare')

In [37]:
# make=False returns the parsed sentences (lists of words) instead of a chain.
unknown = processGutenberg('testText.txt', 'Unknown', make=False)
print(unknown)

[['o', 'that', 'this', 'too', 'too', 'solid', 'flesh', 'would', 'melt', 'thaw', 'and', 'resolve', 'itself', 'into', 'a', 'dew'], ['or', 'that', 'the', 'everlasting', 'had', 'not', 'fixd', 'his', 'canon', 'gainst', 'selfslaughter'], ['o', 'god'], ['god'], ['how', 'weary', 'stale', 'flat', 'and', 'unprofitable', 'seem', 'to', 'me', 'all', 'the', 'uses', 'of', 'this', 'world'], ['fie', 'ont'], ['ah', 'fie'], ['tis', 'an', 'unweeded', 'garden', 'that', 'grows', 'to', 'seed', 'things', 'rank', 'and', 'gross', 'in', 'nature', 'possess', 'it', 'merely'], ['that', 'it', 'should', 'come', 'to', 'this'], ['but', 'two', 'months', 'dead'], ['nay', 'not', 'so', 'much', 'not', 'two'], ['so', 'excellent', 'a', 'king', 'that', 'was', 'to', 'this', 'hyperion', 'to', 'a', 'satyr', 'so', 'loving', 'to', 'my', 'mother', 'that', 'he', 'might', 'not', 'beteem', 'the', 'winds', 'of', 'heaven', 'visit', 'her', 'face', 'too', 'roughly'], ['heaven', 'and', 'earth'], ['must', 'i', 'remember'], ['why', 'she', 'wo

In [46]:
# Bare expression: only displayed by the notebook when it is the last line of
# a cell, so here it has no visible effect.
greatExp.wordCount
# Print every prefix word followed by more than 10000 words, counting them.
# NOTE(review): the loop inspects romeoJuliet while the lines around it read
# greatExp -- confirm which chain was meant.
count = 0
# items() works on both Python 2 and 3; iteritems() exists only on Python 2.
for key, val in romeoJuliet.wordCount.items():
    if val > 10000:
        print(key, val)
        count += 1
# wordCount['*'] is the number of words recorded after the sentence-start marker.
print(greatExp.wordCount['*'])

('the', 16187)
('and', 12056)
('*', 25974)
25974


In [39]:
from pickle import dump

def pickleDump(filename, todump):
    """
    Pickle each object in todump, in order, into the file filename.

    filename: string, path of the output file (overwritten if it exists)
    todump: iterable of picklable objects

    The objects are written as consecutive pickles, so they must be read
    back with one pickle.load call per object, in the same order.
    """
    # The with-block closes the file even if dump() raises part-way through.
    with open(filename, 'wb+') as out:
        for d in todump:
            dump(d, out)

In [47]:
# Save each author's model as three consecutive pickles: chain, wordCount, author.
pickleDump('GreatExp.dat', [greatExp.chain, greatExp.wordCount, greatExp.author])
pickleDump('Frankenstein.dat', [frank.chain, frank.wordCount, frank.author])
pickleDump('RomeoJuliet.dat', [romeoJuliet.chain, romeoJuliet.wordCount, romeoJuliet.author])

In [41]:
# Save the unknown text's parsed sentences as a single pickle.
pickleDump('testText.dat', [unknown])

True