# A Socratic Dialogue Generator

This generator analyzes a character's speech from a Platonic dialogue encoded in TEI XML, and generates new speech in that character's style using Markov chains. Scroll to the bottom to see it in action.

In [95]:
# Library for parsing XML
from lxml import etree

# We'll mostly use NLTK for tokenizing. 
import nltk

# Randomly choose things. 
from random import choice as pick

# Display things nicely. 
from IPython.display import display, Markdown

In [106]:
class Character():
    """
    Analyzes one speaker's lines in a TEI XML dialogue and generates new
    speech in that speaker's style with a first-order Markov chain.

    Attributes set by the constructor:
        name        -- speaker name used for the lookup
        xpath       -- path expression that located the speech elements
        element     -- list of matching speech elements
        lines       -- text of each non-blank utterance
        lineWords   -- tokens of each utterance
        lineLens    -- token count of each utterance
        text        -- all utterances joined by newlines
        sents       -- sentences of `text`
        sentWords   -- tokens of each sentence
        words       -- tokens of `text`
        wordsLower  -- lowercased copy of `words`
        uniquewords -- distinct tokens (case-sensitive)
        firstWords  -- first token of every sentence
        probs       -- follower table built by makeProbs()
    """

    # Tokens that close a generated sentence.
    SENTENCE_ENDS = ('?', '.', '!')
    # Upper bound on the words added while looking for a sentence end, so
    # a corpus without a reachable terminator cannot loop forever.
    MAX_TAIL_WORDS = 1000

    def __init__(self, tree, name):
        """
        Gets a character's speech from the TEI XML, and breaks it up
        by utterance, sentence, and word.

        tree -- parsed XML tree (anything offering findall())
        name -- speaker name as it appears in <speaker> or in @who

        Raises ValueError when no non-blank speech is found for `name`.
        """
        self.name = name
        # Get dialogue by speaker from the TEI.
        self.xpath = ".//sp[speaker='%s']/p" % name
        self.element = tree.findall(self.xpath)
        if not self.element:
            # Nothing matched. Let's try the other format.
            self.xpath = ".//said[@who='#%s']" % name
            self.element = tree.findall(self.xpath)
        texts = (self._elementText(el) for el in self.element)
        # Blank utterances carry no tokens, so leave them out.
        self.lines = [t for t in texts if t.strip()]
        if not self.lines:
            raise ValueError("Can't find any dialog!")
        self.lineWords = [nltk.word_tokenize(line) for line in self.lines]
        self.lineLens = [len(line) for line in self.lineWords]
        self.text = '\n'.join(self.lines)
        self.sents = nltk.sent_tokenize(self.text)
        # Per-sentence token lists; used below to collect sentence openers.
        self.sentWords = [nltk.word_tokenize(sent) for sent in self.sents]
        # Tokenize the full text once and derive the lowercase copy from it.
        self.words = nltk.word_tokenize(self.text)
        self.wordsLower = [w.lower() for w in self.words]
        self.uniquewords = list(set(self.words))
        self.firstWords = [s[0] for s in self.sentWords if s]
        self.makeProbs()

    @staticmethod
    def _elementText(element):
        """
        Returns the complete text inside an element.

        `element.text` alone stops at the first child element (and is None
        when the element starts with one), so the text of the whole subtree
        is joined instead. Text of nested elements is included.
        """
        return ''.join(element.itertext())

    def makeProbs(self):
        """
        Builds `self.probs`: a dict mapping each lowercased token to the
        list of tokens (original casing) that directly follow it.
        Some tokens are actually punctuation marks. A follower is listed
        once per occurrence, so picking uniformly from the list follows
        the observed frequencies.
        """
        table = {}
        # Pair every token with its successor; the last token has none.
        for current, following in zip(self.wordsLower, self.words[1:]):
            table.setdefault(current, []).append(following)
        self.probs = table

    def _nextWord(self, word):
        """Picks a follower of `word`, or returns None if it has none."""
        followers = self.probs.get(word.lower())
        if not followers:
            return None
        return pick(followers)

    def _buildChain(self, n):
        """
        Returns a list of tokens: a random sentence opener, then `n` more
        tokens, then further tokens until a sentence-ending token is
        reached. Stops early at a token without followers, and after
        MAX_TAIL_WORDS tokens in the final phase.
        """
        tokens = [pick(self.firstWords)]
        # Now get a bunch of subsequent words.
        for _ in range(n):
            following = self._nextWord(tokens[-1])
            if following is None:
                return tokens
            tokens.append(following)
        # Keep going until the end of the sentence.
        for _ in range(self.MAX_TAIL_WORDS):
            if tokens[-1] in self.SENTENCE_ENDS:
                break
            following = self._nextWord(tokens[-1])
            if following is None:
                break
            tokens.append(following)
        return tokens

    def chain(self, n):
        """
        Chains together words according to the "probs" dictionary and
        displays the result as Markdown. `n` is the number of words
        generated after the opener before a sentence end is sought.
        """
        display(Markdown(self.untokenize(self._buildChain(n))))

    def untokenize(self, chain):
        """
        Stitches a list of tokens back together into a string.

        Tokens starting with a letter, a digit or an opening curly quote
        are preceded by a space; everything else (punctuation) is attached
        to the previous token. The result has no leading whitespace.
        """
        pieces = []
        for word in chain:
            # Just skip parentheses, since they hardly end up closing.
            # Empty tokens have nothing to contribute.
            if not word or word in ("(", ")"):
                continue
            first = word[0]
            # The tokenizer splits "don't" into "do" + "n't"; glue it back.
            isContraction = word.lower() == "n't"
            if not isContraction and (first.isalpha() or first == "“" or first.isdigit()):
                pieces.append(' ' + word)
            else:
                pieces.append(word)
        return ''.join(pieces).lstrip()

In [107]:
class Dialogue():
    """
    This class analyzes and generates dialogue-level speech between two
    characters of one TEI XML file.

    The speakers take turns. For every utterance, the requested length is
    drawn at random from the token counts of that character's own
    utterances in the source text.
    """
    def __init__(self, filename, char1, char2):
        """
        Parses `filename` and builds a Character for each speaker name.
        `char1` speaks first.
        """
        tree = etree.parse(filename)
        self.c1 = Character(tree, char1)
        self.c2 = Character(tree, char2)

    def generate(self, n):
        """
        Displays `n` utterances in total, alternating between the two
        characters and starting with the first one. Odd values of `n`
        end on the first character's turn.
        """
        speakers = (self.c1, self.c2)
        for turn in range(n):
            self.makeDialogue(n, speakers[turn % 2])

    def makeDialogue(self, n, c):
        """
        Displays the speaker label of character `c` followed by one
        generated utterance. `n` is accepted for compatibility with
        existing callers and is not used.
        """
        display(Markdown('\n**' + c.name + "**: "))
        lineLen = pick(c.lineLens)
        c.chain(lineLen)

In [108]:
# Demo: four alternating utterances for Socrates and Phaedrus, built from the Phaedrus file.
Dialogue('texts/phaedrus.xml', 'Socrates', 'Phaedrus').generate(4)


**Socrates**: 

 Perhaps must be silent. On your friend. Why he met a pen with intelligence, Phaedrus, will, Then he was in this, and bounds.


**Phaedrus**: 

 You say; for us stay and where have said just so it is shade there and leave writings behind them, according to cause to my walk along and will go along and pure and clear that he ought to it.


**Socrates**: 

 Yes, this that he will never will listen to be unacquainted with the thing Read, Phaedrus, my boy, what of procedure would write, one can not at any appreciable knowledge of discourse because they not the souls and before whom you for it seems, I; for this order?


**Phaedrus**: 

 Certainly. They also, the Olympieum.

In [109]:
# Demo: four alternating utterances for Socrates and Timaeus, built from the Timaeus file.
Dialogue('texts/timaeus.xml', 'Socrates', 'Timaeus').generate(4)


**Socrates**: 

 Bounteous and has no one proper and all should go about the soul of the imitative and into the best of its own inability of words; and deed. You requested me yesterday, ready for the extent and children all the allotment to treat their wage in many fine discourses of yesterday, and still harder in dealing with the same opinion about to behold them and our State, to do you have formed the fourth Then, from without or was this city and female, what my feeling is requisite in this city and certainly we discover other pursuits.


**Timaeus**: 

 And the Four in this one Universe to be that exist within itself around it separated off; whereas every object which is to the work He assigned unto it has its nature is the splendid hospitality we might be so far as it being detergent in a man of prophets upon by straying all, so likewise we have now if in conjunction therewith, moreover,' supported thereon it smooth with dense composition than Him, and surrounded by any one kind contracts, was not encircled by condensation, and her announcement, his right, causes it fire, and it is congealed by the following.


**Socrates**: 

 And exhibiting all the present as are a suitable war and sisters, by dividing off the qualities of you and sisters, in order that they would I gratified you requested me back again, in festal garb, that those of mine there any point omitted, three, shares the deserving amongst them back again, following Solons report, seeing that as children all men?


**Timaeus**: 

 Every disease of an excess; just as long as a different kind of fire or any of corn fall short thereof; which is defeated and the navel, which passes out gladly; for smallness; while if, unto it disperses them center, in the form that creature, to discern by all actions and all creatures, because it were of the rapidity of shellfish and extended far away by the nature is of the food and, like, like those last term becomes cooled again, and the necessary demonstration; “was” or investigation, partly to form the series of you will theirs to Nature thereof which exists between the last mentioned; but the “living creature.” Whenever he thought, one and dissolution, sometimes of the begetters more free will never one sole and their own new air, desire mingled with it receives its body, but formerly the head and the causes the art of the light in due to this part of Time.