In [2]:
import nltk
import re
from nltk.util import ngrams
from difflib import SequenceMatcher
from string import punctuation
from termcolor import colored
from fuzzysearch import find_near_matches

In [3]:
class Matcher: 
    def __init__(self, fileA, fileB, threshold, ngramSize):
        """
        Gets the texts from the files, tokenizes them, 
        cleans them up as necessary. 
        """
        self.threshold = threshold
        
        self.filenameA = fileA
        self.filenameB = fileB
        
        self.textA = self.readFile(fileA)
        self.textB = self.readFile(fileB)
        
        textATokens = self.tokenize(self.textA)
        textBTokens = self.tokenize(self.textB)
        
        self.textAgrams = list(ngrams(textATokens, ngramSize))
        self.textBgrams = list(ngrams(textBTokens, ngramSize))
        
    def readFile(self, filename): 
        """ Reads the file in memory. """
        return open(filename).read()

    def tokenize(self, text): 
        """ Tokenizes the text, breaking it up into words. """
        return nltk.word_tokenize(text.lower())

    def gramsToString(self, grams): 
        """
        Takes a list of tuples (3-grams, 4-grams, etc.) 
        and stitches it back together into a string, so that
        we can search the non-tokenized text for the string later. 
        """
        string = " ".join(grams[0][:-1])
        for gram in grams:
            lastGram = gram[-1]
            if lastGram not in punctuation: 
                string += " " + lastGram
            else: 
                string += lastGram
        return string
    
    def getMatch(self, match, textA, textB): 
        """ 
        Takes the match object returned by get_matching_blocks() and
        gets the matched n-gram. It uses gramsToString() to 
        reformat this into a string.
        """
        textAs, textBs = [], []
        for i in range(match.size):
            textAs.append(textA[match.a+i])
            textBs.append(textB[match.b+i])
        return (self.gramsToString(textAs), self.gramsToString(textBs))

    def match(self): 
        """
        This does the main work of finding matching n-gram sequences between
        the texts.
        """
        sequence = SequenceMatcher(None,self.textAgrams,self.textBgrams)
        matchingBlocks = sequence.get_matching_blocks()

        # Only return the matching sequences that are higher than the 
        # threshold given by the user. 
        highMatchingBlocks = [match for match in matchingBlocks if match.size > self.threshold]

        for match in highMatchingBlocks: 
            out = self.getMatch(match, self.textAgrams, self.textBgrams)
            print('\n', out)
            self.findInText(out[0], self.textA, self.filenameA, 20)
            self.findInText(out[1], self.textB, self.filenameB, 20)
        
    def findInText(self, needle, haystack, haystackName, context):
        """
        This takes the matches found by match() and tries to find that match
        again in the text, so that we can return some context. Uses the
        fuzzysearch library, because I couldn't find anything better.
        """
        m = find_near_matches(needle, haystack, max_l_dist=2)
        
        if len(m) > 0: 
            m = m[0] # just get the first match for now. TODO: get all of them
            
            before = haystack[m.start-context:m.start]
            match  = colored(haystack[m.start:m.end], 'red')
            after  = haystack[m.end:m.end+context]    
            
            contextualized = before + match + after
            cleaned = re.sub( '\s+', ' ', contextualized ).strip()
            print(colored(haystackName, 'green') + ": " + cleaned)
        else: 
            print('Couldn\'t find this match in file: ', haystackName)
        

In [5]:
myMatch = Matcher('texts/milton.txt', 'texts/kjv.txt', 2, 3)
myMatch.match()


 (', and thou shalt be', ', and thou shalt be')
[32mtexts/milton.txt[0m: e of streaming light[31m, And thou shalt be[0m our star of Arcady,
[32mtexts/kjv.txt[0m: hall bruise thy head[31m, and thou shalt br[0muise his heel. 3:16

 (', and herds, and', ', and herds, and')
[32mtexts/milton.txt[0m: eerfull haunt of men[31m, and herds, And[0m sits as safe as in
[32mtexts/kjv.txt[0m: th Abram, had flocks[31m, and herds, and[0m tents. 13:6 And th

 (', as he said,', ', as he said,')
[32mtexts/milton.txt[0m: in another Countrey[31m, as he said,[0m Bore a bright golde
[32mtexts/kjv.txt[0m: one giving him drink[31m, she said,[0m I will draw water f

 ('out of his hand, and', 'out of his hand, and')
[32mtexts/milton.txt[0m: wn, wrest his Glass [31mout of his hand, and[0m break it against th
[32mtexts/kjv.txt[0m: he cast the tables [31mout of his hands, and[0m brake them beneath

 (', if it seem good to', ', if it seem good to')
[32mtexts/milton.txt[0m: amp: Impr

In [6]:
myMatch = Matcher('texts/portrait.txt', 'texts/kjv.txt', 3, 4)

In [7]:
myMatch.match()


 (', we beseech thee, o lord,', ', we beseech thee, o lord,')
Couldn't find this match in file:  texts/portrait.txt
Couldn't find this match in file:  texts/kjv.txt

 ('better for him that a millstone were', 'better for him that a millstone were')
Couldn't find this match in file:  texts/portrait.txt
[32mtexts/kjv.txt[0m: ieve in me, it were [31mbetter for him that a millstone were[0m hanged about his ne

 ('depart from me, ye cursed, into everlasting fire', 'depart from me, ye cursed, into everlasting fire')
[32mtexts/portrait.txt[0m: it lay, he said: --[31mDepart from me, ye cursed, into everlasting fire[0m! Taking Stephen's
[32mtexts/kjv.txt[0m: m on the left hand, [31mDepart from me, ye cursed, into everlasting fire[0m, prepared for the d

 ('prepared for the devil and his angels', 'prepared for the devil and his angels')
Couldn't find this match in file:  texts/portrait.txt
[32mtexts/kjv.txt[0m: o everlasting fire, [31mprepared for the devil and his angels[0m: 25:

In [9]:
myMatch = Matcher('texts/yeats.txt', 'texts/kjv.txt', 3, 4)
myMatch.match()


 ('eat , drink, and be merry.', 'eat , drink, and be merry.')
[32mtexts/yeats.txt[0m: eal with us Shall [31meat, drink, and be merry.[0m SHEMUS (_to_ MAR
[32mtexts/kjv.txt[0m: s; take thine ease, [31meat, drink, and be merry.[0m 12:20 But God said
