In [1]:
import nltk
import re
import difflib
from nltk.util import ngrams
from difflib import SequenceMatcher
from string import punctuation
from termcolor import colored

In [2]:
class Text: 
    def __init__(self, filename): 
        self.filename = filename
        self.trigrams = self.ngrams(3)
        
    @property
    def text(self):
        """ Reads the file in memory. """
        return open(self.filename).read()

    @property
    def tokens(self): 
        """ Tokenizes the text, breaking it up into words, removing punctuation. """
        tokenizer = nltk.RegexpTokenizer('[a-zA-Z]\w+\'?\w*') # A custom regex tokenizer. 
        self.spans = list(tokenizer.span_tokenize(self.text))
        return tokenizer.tokenize(self.text)
    
    def ngrams(self, n): 
        """ Returns ngrams for the text."""
        return list(ngrams(self.tokens, n))

In [3]:
class Matcher: 
    def __init__(self, fileA, fileB, threshold, ngramSize):
        """
        Gets the texts from the files, tokenizes them, 
        cleans them up as necessary. 
        """
        self.threshold = threshold
        self.ngramSize = ngramSize
        
        self.textA, self.textB = Text(fileA), Text(fileB)
        
        self.textAgrams = self.textA.ngrams(ngramSize)
        self.textBgrams = self.textB.ngrams(ngramSize)

    def getContext(self, text, start, length, context): 
        match = self.getTokensText(text, start, length)
        before = self.getTokensText(text, start-context, context)
        after = self.getTokensText(text, start+length, context)
        match = colored(match, 'red')
        out = " ".join([before, match, after])
        out = out.replace('\n', ' ') # Replace newlines with spaces. 
        out = re.sub('\s+', ' ', out)
        return out

    def getTokensText(self, text, start, length):  
        """ Looks up the passage in the original text, using its spans. """
        matchTokens = text.tokens[start:start+length]
        spans = text.spans[start:start+length]
        passage = text.text[spans[0][0]:spans[-1][-1]]
        return passage 

    def getMatch(self, match, textA, textB, context): 
        wordsA = self.getContext(textA, match.a, match.size, context)
        wordsB = self.getContext(textB, match.b, match.size, context)
        line1 = ('%s: %s' % (colored(textA.filename, 'green'), wordsA) )
        line2 = ('%s: %s' % (colored(textB.filename, 'green'), wordsB) )
        return line1 + '\n' + line2

    def match(self): 
        """
        This does the main work of finding matching n-gram sequences between
        the texts.
        """
        sequence = SequenceMatcher(None,self.textAgrams,self.textBgrams)
        matchingBlocks = sequence.get_matching_blocks()

        # Only return the matching sequences that are higher than the 
        # threshold given by the user. 
        highMatchingBlocks = [match for match in matchingBlocks if match.size > self.threshold]
    
        numBlocks = len(highMatchingBlocks)
        
        print('%s total matches found.' % numBlocks)
        for num, match in enumerate(highMatchingBlocks): 
            out = self.getMatch(match, self.textA, self.textB, 3)
            print('\n')
            print('match %s:' % (num+1))
            print(out)

In [4]:
myMatch = Matcher('texts/milton.txt', 'texts/kjv.txt', 2, 3)
myMatch.match()

23 total matches found.


match 1:
[32mtexts/milton.txt[0m: wrest his Glass [31mout of his[0m hand, and break
[32mtexts/kjv.txt[0m: away the jawbone [31mout of his[0m hand, and called


match 2:
[32mtexts/milton.txt[0m: things, hold fast [31mthat which is[0m good. And he
[32mtexts/kjv.txt[0m: them, and speak [31mthat which is[0m good. 22:14 And Micaiah


match 3:
[32mtexts/milton.txt[0m: of their judgement [31mthat it was[0m not the not
[32mtexts/kjv.txt[0m: the chariots perceived [31mthat it was[0m not the king


match 4:
[32mtexts/milton.txt[0m: Temple, but a direction [31mfor the courses[0m of the Priests
[32mtexts/kjv.txt[0m: dedicated things: 28:13 Also [31mfor the courses[0m of the priests


match 5:
[32mtexts/milton.txt[0m: the Priests and [31mLevites, and for[0m all the worke
[32mtexts/kjv.txt[0m: priests and the [31mLevites, and for[0m all the work


match 6:
[32mtexts/milton.txt[0m: Israel hath beene [31mwithout the true God, and[0m

In [None]:
myMatch = Matcher('texts/yeats.txt', 'texts/kjv.txt', 2, 4)
myMatch.match()