# Markov Chain Generation
In this notebook, we generate the necessary Markov Chains for each author which will be used as likelihood functions during the identification process.

In [7]:
import string
import sys
import re

class MarkovChain():
    """
    Markov Chain class. 
    chain:       nested dictionary representing the number of occurences of a word given the previous word.
    wordCount:   dictionary of the number of total number of words (value) that have occured after the previous word (key).
    """
    chain = {}
    wordCount = 0
    
    def addWord(self, prevWord, word):
        """
        Add a word to the Markov Chain.
        Takes the previous word and the current word as strings
        """
        self.chain[(prevWord, word)] = self.chain.get((prevWord, word), 0) + 1
        self.wordCount += 1
        
#         # If the word isn't in the prefix's dictionary, add it
#         if not self.chain[prevWord].get(word):
#             self.chain[prevWord][word] = 0
            
#         if not self.chain.wordCount[prevWord]:
#             self.wordCount[prevWord] = 0
                
#         self.chain[prevWord][word] += 1
#         self.wordCount[prevWord] += 1
        
#         if not self.chain.get(prevWord): # If the word doesn't already exist, add one to the total number of words
#             self.wordCount[prevWord] = 1
#             self.wordCount[prevWord] += 1
#         self.chain[prevWord][word] += 1
        
    def addSentence(self, sentence):
        """
        Process a "sentence" to produce a Markov Chain. Takes a sentence as a list of lowercase words.
        NO ASTERISKS. SERIOUSLY.
        """
        # '*' represents the beginning of a sentence in the chain. 
        # This way, we can determine the probability of a word starting a sentence
        modifiedSentence = ['*'] + sentence
        
        if len(sentence) > 1:
            for i in [i + 1 for i in range(len(sentence))]: # Start at the second word
                self.addWord(sentence[i-1], sentence[1])
        
    def getProb(self, prevWord, word):
        """
        Return the probability of getting word given prevWord.
        Takes two strings.
        """
        return self.chain[(prevWord, word)]/self.wordCount
    
def processGutenberg(fileName):
    """
    Process a Gutenberg text file.
    fileName: string
    returns a markovChain object.
    """
    f = open(fileName)
    
    #Skip to the beginning of the actual text
    for line in f:
        if line.startswith("*** START OF THIS PROJECT"):
            break
    
    text = ''
    markovChain = MarkovChain()
    
    # Put the text into one big string
    for line in f:
        # Stop when hitting the end of the book
        if line.startswith("*** END OF THIS PROJECT"):
            break
        
        text += line + ' '
        
    sentences = re.split('[.?!]', text) # Seperate text into a list of sentences sentence
    
    for sentence in sentences:
        # Make all words lowercase and strip off punctuation
        sentenceList = ''.join(char for char in sentence if char in set(string.letters + string.digits + ' ')).lower().split()
        
        # Process the sentence in the Markov Chain
        markovChain.addSentence(sentenceList)
    
    f.close()
    
    return markovChain

In [8]:
greatExp = processGutenberg('GreatExpectations.txt')

In [9]:
greatExp.chain

{('instantly', 'miss'): 1,
 ('would', 'stood'): 1,
 ('gate', 'heard'): 1,
 ('twist', 'i'): 2,
 ('it', 'couldnt'): 1,
 ('mrs', 'lady'): 1,
 ('halfyearly', 'it'): 1,
 ('hart', 'i'): 1,
 ('about', 'hung'): 1,
 ('hunter', 'i'): 1,
 ('have', 'made'): 3,
 ('gifthorses', 'handel'): 1,
 ('it', 'younger'): 1,
 ('write', 'rather'): 1,
 ('think', 'are'): 1,
 ('about', 'then'): 2,
 ('circumstances', 'mean'): 1,
 ('for', 'made'): 4,
 ('gird', 'was'): 1,
 ('didnt', 'two'): 1,
 ('done', 'say'): 1,
 ('knew', 'had'): 7,
 ('my', 'construction'): 3,
 ('house', 'heavily'): 1,
 ('equally', 'xxviii'): 1,
 ('going', 'as'): 3,
 ('moaned', 'miss'): 3,
 ('cork', 'a'): 1,
 ('sounds', 'statement'): 1,
 ('with', 'gargery'): 2,
 ('ran', 'imperceptible'): 1,
 ('money', 'since'): 1,
 ('half', 'then'): 1,
 ('to', 'accidentally'): 1,
 ('in', 'up'): 2,
 ('opinion', 'hesitated'): 1,
 ('bacon', 'we'): 1,
 ('nor', 'giv'): 1,
 ('exacted', 'he'): 1,
 ('put', 'her'): 1,
 ('fortunes', 'my'): 1,
 ('made', 'boast'): 2,
 ('making