In [1]:
from lxml import etree
import nltk
import readability
from scipy import stats

In [27]:
class Character():
    """One speaker's dialog pulled out of an XML tree, plus derived statistics.

    Attributes set by the constructor:
        name        -- the speaker name passed in.
        xpath       -- the path expression that actually matched.
        element     -- the list of matched elements.
        lines       -- the leading text of each matched element.
        text        -- all lines joined with newlines.
        sents       -- sentences from nltk.sent_tokenize(text).
        sentWords   -- each sentence re-joined as space-separated tokens.
        stats       -- the result of readability.getmeasures(sentWords).
        kincaid     -- stats['readability grades']['Kincaid'].
        flesch      -- stats['readability grades']['FleschReadingEase'].
        ttr         -- stats['sentence info']['type_token_ratio'].
        words       -- tokens from nltk.word_tokenize(text).
        wordsLower  -- the same tokens, lowercased.
        uniquewords -- the distinct (case-sensitive) tokens, in arbitrary order.
        freqDist    -- nltk.FreqDist over words.
        probs       -- see makeProbs().
    """

    def __init__(self, tree, name):
        """Collect and analyse every speech attributed to `name` in `tree`.

        Parameters:
            tree -- a parsed XML tree (anything exposing findall()).
            name -- the speaker's name as it appears in the markup.

        Raises:
            ValueError -- if neither supported markup form yields any element.
        """
        self.name = name
        # First format: <sp> elements with a <speaker> child whose text is the
        # name; the speech itself is in the <p> children.
        self.xpath = ".//sp[speaker='%s']/p" % name
        self.element = tree.findall(self.xpath)
        if len(self.element) == 0:
            # Something's wrong. Let's try the other format:
            # <said who="#NAME"> elements.
            self.xpath = ".//said[@who='#%s']" % name
            self.element = tree.findall(self.xpath)
        if len(self.element) == 0:
            raise ValueError("Can't find any dialog!")
        # An element's .text is None when it has no leading text (e.g. it is
        # empty or starts with a child element); substitute '' so the join
        # below cannot fail on a None.
        # NOTE(review): .text only holds the text before the first child
        # element, so speeches with nested markup may be truncated -- confirm
        # against the corpus.
        self.lines = [line.text or '' for line in self.element]
        self.text = '\n'.join(self.lines)
        self.sents = nltk.sent_tokenize(self.text)
        # This seems weird, but it's the required format for the readability module
        self.sentWords = [' '.join(nltk.word_tokenize(sent)) for sent in self.sents]
        self.stats = readability.getmeasures(self.sentWords)
        self.kincaid = self.stats['readability grades']['Kincaid']
        self.flesch = self.stats['readability grades']['FleschReadingEase']
        self.ttr = self.stats['sentence info']['type_token_ratio']
        # Tokenize once and derive the lowercase list from the same tokens.
        self.words = nltk.word_tokenize(self.text)
        self.wordsLower = [w.lower() for w in self.words]
        self.uniquewords = list(set(self.words))
        self.freqDist = nltk.FreqDist(self.words)
        self.makeProbs()

    def makeProbs(self):
        """Build self.probs: lowercased word -> token after its first occurrence.

        Despite the name, no probabilities are computed. A word whose first
        (and therefore only) occurrence is the very last token has no
        successor and gets no entry. Values keep their original casing.
        """
        table = {}
        # Single pass over every position that has a successor; only the
        # first occurrence of each lowercased word is recorded.
        for idx in range(len(self.wordsLower) - 1):
            key = self.wordsLower[idx]
            if key not in table:
                table[key] = self.words[idx + 1]
        self.probs = table

In [28]:
class Comparison():
    """Contrast the vocabularies of two Character objects.

    Attributes set by the constructor:
        char1, char2      -- the two characters being compared.
        allWords          -- char1's tokens followed by char2's tokens.
        allFreqs          -- {word: [count in char1, count in char2]}.
        distances         -- {word: count1 - count2} for every word.
        adjustedDistances -- like distances, but each count is first divided
                             by the smaller of the two counts; words that one
                             character never uses are left out.
    """

    def __init__(self, char1, char2):
        """Takes as input two Character objects."""
        self.char1 = char1
        self.char2 = char2
        self.allWords = char1.words + char2.words
        # dict.fromkeys() drops repeated tokens so each word is looked up once.
        self.allFreqs = {w: [char1.freqDist[w], char2.freqDist[w]]
                         for w in dict.fromkeys(self.allWords)}
        self.distances = self.getDistances(raw=True)
        self.adjustedDistances = self.getDistances()

    def getDistances(self, raw=False):
        """Computes distances (i.e. how many more times a word occurs in
        one character's speech than another's) and adjusts them for whether
        they're common.

        Parameters:
            raw -- if True, return the plain count differences; otherwise
                   return the adjusted differences.

        Returns:
            A dict keyed by word. Positive values lean towards char1,
            negative values towards char2.
        """
        distances = {}
        aDistances = {}
        for w in self.allFreqs:
            char1Freq = self.allFreqs[w][0]
            char2Freq = self.allFreqs[w][1]
            distances[w] = char1Freq - char2Freq
            # Figure out which one is smaller
            smallerFreq = min([char1Freq, char2Freq])
            # Divide by the smaller one to adjust for whether the word is common.
            # Compare by value: `is not 0` tests object identity, which only
            # happens to work because of integer caching.
            if smallerFreq != 0:
                aDistances[w] = (char1Freq/smallerFreq) - (char2Freq/smallerFreq)
        if raw:
            return distances
        else:
            return aDistances

    def compare(self, n=20, raw=False):
        """
        Shows a comparison of two characters.
        N is the top number of words to show.
        Raw is whether to use the raw list or the adjusted list.
        """

        if raw:
            distances = self.distances
        else:
            distances = self.adjustedDistances

        # Highest scores are most distinctive of char1, lowest of char2.
        # n is forwarded so the caller's limit is actually honored.
        self.getStats(self.char1, distances, reverse=True, n=n)
        self.getStats(self.char2, distances, reverse=False, n=n)

    def getStats(self, char, distances, reverse, n=20):
        """Print summary statistics for `char` and its top `n` words.

        Parameters:
            char      -- the Character to describe.
            distances -- {word: score} as produced by getDistances().
            reverse   -- True to list the highest scores first, False for
                         the lowest first.
            n         -- how many words to print.
        """
        print('\n%s speaks %s words, of which %s are unique.' %
              (char.name, len(char.words), len(char.uniquewords)) )
        print('Type/token ratio: %s' % char.ttr)
        print('Kincaid grade reading level: %s' % char.kincaid)
        print('Flesch reading level: %s\n' % char.flesch)
        print('\nWords distinctive of %s:\n' % char.name)

        for w in sorted(distances, key=distances.get, reverse=reverse)[:n]:
            print(w, distances[w])
    
class MultiCompare(Comparison):
    """Compare word usage across any number of Character objects.

    Attributes set by the constructor:
        chars             -- the list of characters, in the order given.
        allWords          -- every character's tokens, concatenated.
        allFreqs          -- {word: [count for each character, in order]}.
        distances         -- {word: [z-score for each character, in order]}.
        adjustedDistances -- the same object as distances.

    getStats() is inherited from Comparison.
    """

    def __init__(self, charList):
        """ Takes as input a list of character objects. """
        self.chars = charList
        self.allWords = []
        # Concatenate wordlists
        for char in charList:
            self.allWords += char.words
        # Compile frequencies, once per distinct word rather than per token.
        self.allFreqs = {w: [char.freqDist[w] for char in charList]
                         for w in dict.fromkeys(self.allWords)}
        self.distances = self.getDistances()
        self.adjustedDistances = self.distances

    def getDistances(self, raw=False):
        """ Scores each word's frequency in one character against that
        word's frequency among the other characters.

        Ex: Word | Socrates | Phaedrus | Timaeus
           truth |    5     |    1     |    1
            the  |   557    |   126    |   365

        Each row of counts is converted to z-scores with scipy.stats.zscore,
        so a word's score for a character says how far that character's count
        sits from the mean count across all characters.

        Not implemented here -- kept as a note on an alternative weighting
        ("Aaron's algorithm"):

        (total # of given word in speaker / total # words spoken by speaker ) *
        (total # of words used by author / total # words in corpus )

        Parameters:
            raw -- accepted only so the signature matches
                   Comparison.getDistances; it has no effect.

        Returns:
            {word: [z-score per character]}; words with identical counts for
            every character are omitted.
        """

        distances = {}
        for word in self.allFreqs:
            freqs = self.allFreqs[word]
            # Ignore lists with identical items,
            # Since scipy will call this [nan, nan, nan]
            # Since it's trying to divide by a std of 0
            if len(set(freqs)) > 1:
                # Normalize by converting frequencies to
                # z-scores.
                centeredFreqs = stats.zscore(freqs).tolist()
                distances[word] = centeredFreqs
        return distances

    def compare(self, n=20, raw=False):
        """
        Shows, for every character, the words with the highest z-scores.
        N is the top number of words to show.
        Raw is accepted for compatibility with Comparison.compare and has
        no effect: there is only one list of scores.

        Side effect: each character gets an `mdw` attribute holding the full
        list of (word, score) pairs, highest score first.
        """

        distances = self.distances

        for i, char in enumerate(self.chars):
            # Pick out this character's column of scores.
            charScores = {word: scores[i] for word, scores in distances.items()}
            sortedScores = sorted(charScores, key=charScores.get, reverse=True)
            char.mdw = [(w, charScores[w]) for w in sortedScores]
            print(char.name, char.mdw[:n])

In [29]:
# Load phaedrus.xml and build one Character per speaker.
tree = etree.parse('phaedrus.xml')
soc = Character(tree, 'Socrates')
phae = Character(tree, 'Phaedrus')
# Keep the Comparison object itself (compare() only prints and returns None);
# a later cell inspects c.allFreqs.
c = Comparison(soc, phae)

In [30]:
# Show the table built by Character.makeProbs for Socrates:
# lowercased word -> the token that follows its first occurrence.
soc.probs

{'confute': 'me',
 'dividing': 'things',
 'ago': ',',
 'niceties': 'of',
 'silent': '.',
 'high': 'speculation',
 'law': 'courts',
 'doubtful': 'things',
 'superhuman': 'wonder',
 'proofs': ',',
 'cross': 'over',
 'most': 'of',
 'delivered': 'to',
 'leading': 'them',
 'incomplete': '.',
 'you': 'come',
 'dissimilarity': 'of',
 'duplication': 'and',
 'is': 'right',
 'break': 'any',
 'pushed': 'her',
 'ordinary': 'man',
 'voted': 'by',
 'off': 'the',
 'branch': 'of',
 'sometimes': 'making',
 'averted': '.',
 'dependent': 'on',
 'improper': 'for',
 'praising': 'the',
 'conversation': '?',
 'stated': 'that',
 'middle': 'and',
 'utterly': 'simple',
 'clearness': 'and',
 'holy': 'place',
 'shady': 'willow',
 'water': 'is',
 'course': '.',
 'impression': 'of',
 'begetting': 'and',
 'prove': 'that',
 'alternative': 'You',
 'sows': 'in',
 'talk': ',',
 'helter-skelter': '?',
 'fight': 'against',
 'achelous': ',',
 'such': 'matters',
 'terrible': 'and',
 'preserve': 'a',
 'greatly': 'mistaken',


In [16]:
# Per-word counts as [first character, second character];
# assumes `c` is a Comparison of soc and phae.
c.allFreqs

{'had': [6, 4],
 'wisdom': [2, 0],
 'understand': [5, 2],
 'water': [3, 1],
 'need': [2, 2],
 'turn': [2, 0],
 'finely': [1, 0],
 'other': [23, 9],
 'feel': [1, 0],
 'crazy': [0, 1],
 'also': [7, 4],
 'longer': [1, 0],
 'say': [25, 15],
 'noble': [3, 3],
 'called': [1, 3],
 'through': [4, 1],
 'exercised': [0, 1],
 'wind': [1, 0],
 'sit': [1, 2],
 'learns': [1, 0],
 'presented': [1, 0],
 'mistaken': [1, 0],
 'niceties': [1, 0],
 'seriousness': [1, 0],
 'dissimilarity': [1, 0],
 'exposition': [1, 0],
 'too': [4, 3],
 'counsel': [1, 0],
 'customary': [1, 0],
 'effect': [1, 0],
 'quite': [4, 4],
 'mind': [6, 1],
 'compel': [2, 0],
 'speaker': [2, 0],
 'poverty': [1, 0],
 'lawsuits': [0, 1],
 'hand': [2, 1],
 'whoever': [1, 0],
 'motives': [0, 1],
 'there': [13, 6],
 'defend': [2, 0],
 'so-and-so': [1, 0],
 'put': [3, 0],
 'flatter': [0, 1],
 'mere': [2, 0],
 'divisions': [1, 0],
 'best': [3, 1],
 'oak': [1, 0],
 'seems': [7, 3],
 'hands': [2, 0],
 'let': [10, 5],
 'an': [17, 7],
 'hermes'

In [8]:
# Build a multi-character comparison and look up the counts of 'truth',
# one entry per character in the order they were passed in.
mc = MultiCompare([soc, phae])
mc.allFreqs['truth']

[15, 1]

In [9]:
# Switch to timaeus.xml. `tree` and `soc` are rebound here, so from this point
# on `soc` holds Socrates' lines from timaeus.xml rather than phaedrus.xml.
tree = etree.parse('timaeus.xml')
tim = Character(tree, 'Timaeus')
soc = Character(tree, 'Socrates')
crit = Character(tree, 'Critias')
# Print summary statistics and distinctive words for Socrates vs. Critias.
Comparison(soc, crit).compare()


Socrates speaks 1456 words, of which 512 are unique.
Type/token ratio: 0.4052590873936582
Kincaid grade reading level: 13.678252330361868
Flesch reading level: 57.60084167786057


Words distinctive of Socrates:

? 5.5
may 3.0
should 3.0
their 3.0
nature 3.0
good 3.0
proper 2.0
because 2.0
feast 2.0
friends 2.0
speech 2.0
themselves 2.0
seeing 2.0
thus 2.0
must 1.5
they 1.2000000000000002
trained 1.0
turn 1.0
class 1.0
go 1.0

Critias speaks 2474 words, of which 748 are unique.
Type/token ratio: 0.335278276481149
Kincaid grade reading level: 19.596691692508568
Flesch reading level: 44.75282519993473


Words distinctive of Critias:

he -13.0
has -8.0
was -5.333333333333333
also -5.0
mind -5.0
man -5.0
here -4.0
being -4.0
many -4.0
citizens -4.0
up -4.0
after -4.0
at -3.666666666666667
are -3.666666666666667
had -3.0
indeed -3.0
very -3.0
story -2.5
from -2.4
were -2.25


In [56]:
# Full set of measures returned by readability.getmeasures for Phaedrus' lines.
phae.stats

OrderedDict([('readability grades',
              OrderedDict([('Kincaid', 2.6027610196494955),
                           ('ARI', 2.6016580368206768),
                           ('Coleman-Liau', 4.916065883364315),
                           ('FleschReadingEase', 94.65420570012394),
                           ('GunningFogIndex', 6.366705611612675),
                           ('LIX', 24.002265887767745),
                           ('SMOGIndex', 7.174754056057845),
                           ('RIX', 1.4095238095238096)])),
             ('sentence info',
              OrderedDict([('characters_per_word', 4.014405204460966),
                           ('syll_per_word', 1.2030669144981412),
                           ('words_per_sentence', 10.247619047619047),
                           ('sentences_per_paragraph', 210.0),
                           ('type_token_ratio', 0.2908921933085502),
                           ('characters', 8639),
                           ('syllables', 2589),
    

In [30]:
# Kincaid grade level of Phaedrus' lines. Character stores this as `kincaid`;
# it has no `readingLevel` attribute.
phae.kincaid

2.6027610196494955