# Author Identification
#### Carl Moser, Matthew Beaudouin-Lafon

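This notebook uses Bayesian updating to guess who wrote a test text. Each candidate author is a hypothesis; the evidence for each author comes from a Markov chain and a word-count table built from a text that author is known to have written (loaded from pickled `.dat` files). The test text is fed to the model one word at a time, and the posterior probability of each author is tracked and plotted.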

In [10]:
from thinkbayes2 import Suite
from pickle import load
import thinkplot

def getPickle(fileName):
    """
    Load one author's pickled model from disk.

    The file must contain three consecutive pickles, which are read in this
    order: the chain, the word count, and the author.

    fileName: path of the pickle file to read

    returns: tuple (chain, wordCount, author)

    raises: whatever open() or pickle.load() raise (e.g. FileNotFoundError,
            EOFError if the file holds fewer than three pickles)

    NOTE: pickle can execute arbitrary code while loading, so only call this
    on files that come from a trusted source.
    """
    # Read-only mode is enough here (the original 'rb+' needlessly required
    # write permission); the with-block closes the file even if a load fails.
    with open(fileName, 'rb') as infile:
        chain = load(infile)
        wordCount = load(infile)
        author = load(infile)
    return (chain, wordCount, author)

class AuthorId(Suite):
    """
    Bayesian model for author identification.

    Each hypothesis in the suite is an author. The likelihood of a word under
    an author is taken from data built from texts whose author is known.

    self.markovChains maps author -> (chain, wordCount). Only wordCount is
    read by the methods of this class; the chain is stored but not used here.
    """
    def __init__(self):
        # Created before Suite.__init__ runs, same ordering as before.
        self.markovChains = {}
        Suite.__init__(self)

    def isWorthChecking(self, word):
        """
        Tell whether `word` has a truthy entry in every author's word count.

        word: the key to look up in each author's wordCount

        returns: True if wordCount.get(word) is truthy for every hypothesis
                 in the suite (also True when the suite is empty), else False
        """
        # all() stops at the first author that lacks the word.
        return all(
            bool(self.markovChains[author][1].get(word))
            for author, _ in self.Items()
        )

    def Likelihood(self, data, hypo):
        """
        Likelihood of seeing one word under one author.

        data: the word that was observed (a key of the author's wordCount)
        hypo: the author; must be a key of self.markovChains

        returns: the author's wordCount entry for the word, or 0 if the
                 author has no entry for it
        """
        wordCount = self.markovChains[hypo][1]

        # NOTE(review): if wordCount holds raw counts rather than frequencies,
        # authors with longer source texts get systematically larger
        # likelihoods; dividing by the author's total may be needed. Confirm
        # against the code that builds the pickles.
        # .get mirrors isWorthChecking and avoids a KeyError on unseen words.
        return wordCount.get(data, 0)

In [11]:
# One pickle file per known author.
files = [
    'Frankenstein.dat',
    'GreatExpectations.dat',
    'RomeoAndJuliet.dat',
    'MobyDick.dat',
    "The Hitch Hiker's Guide to the Galaxy.dat",
    'Twilight.dat',
]

authorId = AuthorId()

for file in files:
    chain, wordCount, author = getPickle(file)
    # Keep the author's data next to the suite and register the author as a
    # hypothesis with weight 1.
    authorId.markovChains[author] = (chain, wordCount)
    authorId[author] = 1

In [12]:
# NOTE: pickle can execute arbitrary code while loading; only unpickle files
# that come from a trusted source.
# The with-block closes the file even if load() raises.
with open('testText.dat', 'rb') as f:
    testText = load(f)

In [13]:
# Put every author back to the same weight, then normalize, so re-running this
# cell starts from the same prior each time.
for author, prob in authorId.Items():
    authorId[author] = 1
authorId.Normalize()

# Posterior of three selected authors, sampled once per processed word.
maryV, charlesV, shakeV = [], [], []
totalWords = 0

for sentence in testText:
    # '*' is put in front of every sentence before its words are processed
    # (presumably a sentence-start marker; confirm against the pickle builder).
    sentence = ['*'] + sentence
    for word in sentence:
        totalWords = totalWords + 1

        # Only update on words that every author's word count contains.
        if authorId.isWorthChecking(word):
            authorId.Update(word)

        # Recorded for every word, whether or not an update happened.
        maryV.append(authorId['Mary Shelley'])
        charlesV.append(authorId['Charles Dickens'])
        shakeV.append(authorId['Shakespeare'])

authorId.Print()

Charles Dickens 4.860197789227251e-43
Douglas Adams 0.0
Herman Melville 1.0
Mary Shelley 6.334662243176631e-212
Shakespeare 1.5844206093958314e-246
Stephenie Meyer 9.450347529934758e-162


In [2]:
import matplotlib.pyplot as plt

# Plot how the posterior of three authors evolves over the test text.
# NOTE: maryV, charlesV and shakeV are filled by the update-loop cell; running
# this cell before that one (e.g. on a fresh kernel) raises NameError.
fig, ax = plt.subplots()

x = range(len(maryV))
ax.plot(x, maryV, label='Mary')
ax.plot(x, charlesV, label='Charles')
ax.plot(x, shakeV, label='Shakespeare')

ax.set_xlabel('Update')
ax.set_ylabel('Probability')
ax.set_title('Posterior probability per author')

# Legend sits below the axes so it does not cover the curves.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
          fancybox=True, shadow=True, ncol=5)
plt.show()

NameError: name 'maryV' is not defined

Questions:
    - What should happen with words that are missing from one or more authors' word counts? (They are currently skipped.)
    - "To" appears far more often than other words. Should it be weighted differently?
    - Integrate sentence lengths into the model.