In [3]:
import json
from collections import Counter
import pandas as pd
from nltk.corpus import names
import nltk
import re 
import os
import difflib 
import logging
import itertools
from nltk.util import ngrams 
from difflib import SequenceMatcher
from string import punctuation
from termcolor import colored
from IPython.display import clear_output
%matplotlib inline

In [4]:
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [5]:
with open('txt/e1a.json') as f: 
    rawData = f.read()

df = pd.read_json(rawData)

In [6]:
test1 = df.loc[0]['ocr']

In [7]:
tests = df['ocr']

In [8]:
class Text: 
    def __init__(self, raw_text, label, removeStopwords=True): 
        if type(raw_text) == list: 
            # JSTOR critical works come in lists, where each item represents a page. 
            self.text = ' \n '.join(raw_text)
        else: 
            self.text = raw_text
        self.label = label
        self.tokens = self.getTokens(removeStopwords)
        self.trigrams = self.ngrams(3)
        
    def getTokens(self, removeStopwords=True): 
        """ Tokenizes the text, breaking it up into words, removing punctuation. """
        tokenizer = nltk.RegexpTokenizer('[a-zA-Z]\w+\'?\w*') # A custom regex tokenizer. 
        #tokenizer = nltk.RegexpTokenizer('\w+|\$[\d\.]+|\S+') # A custom regex tokenizer. 
        spans = list(tokenizer.span_tokenize(self.text))
        # Take note of how many spans there are in the text
        #print(spans)
        self.length = spans[-1][-1] 
        tokens = tokenizer.tokenize(self.text)
        tokens = [ token.lower() for token in tokens ] # make them lowercase
        if not removeStopwords: 
            self.spans = spans
            return tokens
        tokenSpans = list(zip(tokens, spans)) # zip it up
        stopwords = nltk.corpus.stopwords.words('english') # get stopwords
        tokenSpans = [ token for token in tokenSpans if token[0] not in stopwords ] # remove stopwords from zip
        self.spans = [ x[1] for x in tokenSpans ] # unzip; get spans
        return [ x[0] for x in tokenSpans ] # unzip; get tokens
    
    def ngrams(self, n): 
        """ Returns ngrams for the text."""
        return list(ngrams(self.tokens, n))

class Matcher: 
    def __init__(self, textObjA, textObjB, threshold=5, ngramSize=3, removeStopwords=True):
        """
        Takes as input two Text() objects, and matches between them.
        """
        self.threshold = threshold
        self.ngramSize = ngramSize
        
        #self.textA, self.textB = Text(fileA, removeStopwords=removeStopwords), \
        #        Text(fileB, removeStopwords=removeStopwords)
        self.textA = textObjA
        self.textB = textObjB 
        
        self.textAgrams = self.textA.ngrams(ngramSize)
        self.textBgrams = self.textB.ngrams(ngramSize)

        self.locationsA = []
        self.locationsB = []

    def getContext(self, text, start, length, context): 
        match = self.getTokensText(text, start, length)
        before = self.getTokensText(text, start-context, context)
        after = self.getTokensText(text, start+length, context)
        match = colored(match, 'red')
        out = " ".join([before, match, after])
        out = out.replace('\n', ' ') # Replace newlines with spaces. 
        out = re.sub('\s+', ' ', out)
        return out

    def getTokensText(self, text, start, length):  
        """ Looks up the passage in the original text, using its spans. """
        matchTokens = text.tokens[start:start+length]
        spans = text.spans[start:start+length]
        if len(spans) == 0: 
            # Don't try to get text or context beyond the end of a text. 
            passage = ""
        else: 
            passage = text.text[spans[0][0]:spans[-1][-1]]
        return passage 

    def getLocations(self, text, start, length, asPercentages=False): 
        """ Gets the numeric locations of the match. """
        spans = text.spans[start:start+length]
        if asPercentages: 
            locations = (spans[0][0]/text.length, spans[-1][-1]/text.length)
        else: 
            locations = (spans[0][0], spans[-1][-1])
        return locations

    def getMatch(self, match, textA, textB, context): 
        length = match.size + self.ngramSize - 1 # offset according to nGram size 
        wordsA = self.getContext(textA, match.a, length, context)
        wordsB = self.getContext(textB, match.b, length, context)
        spansA = self.getLocations(textA, match.a, length)
        spansB = self.getLocations(textB, match.b, length)
        self.locationsA.append(spansA)
        self.locationsB.append(spansB)
        line1 = ('%s: %s %s' % (colored(textA.label, 'green'), spansA, wordsA) )
        line2 = ('%s: %s %s' % (colored(textB.label, 'green'), spansB, wordsB) )
        return line1 + '\n' + line2

    def match(self): 
        """
        This does the main work of finding matching n-gram sequences between
        the texts.
        """
        sequence = SequenceMatcher(None,self.textAgrams,self.textBgrams)
        matchingBlocks = sequence.get_matching_blocks()

        # Only return the matching sequences that are higher than the 
        # threshold given by the user. 
        highMatchingBlocks = [match for match in matchingBlocks if match.size > self.threshold]
    
        numBlocks = len(highMatchingBlocks)
        self.numMatches = numBlocks
        
        if numBlocks > 0: 
            print('%s total matches found.' % numBlocks, flush=True)

        for num, match in enumerate(highMatchingBlocks): 
            print('match: ', match)
            out = self.getMatch(match, self.textA, self.textB, 5)
            print('\n')
            print('match %s:' % (num+1), flush=True)
            print(out, flush=True)

        return self.numMatches, self.locationsA, self.locationsB



In [9]:
test1Text = Text(test1, 'test1')

In [10]:
mm = Text(open('middlemarch.txt').read(), 'Middlemarch')

In [18]:
class Match(): 
    """ Replacement for SequenceMatcher that does fuzzy text matching."""
    def __init__(self, textAgrams, textBgrams): 
        self.textAgrams = textAgrams
        self.textBgrams = textBgrams
    
    @property
    def initialMatches(self): 
        matches = set(self.textAgrams).intersection(self.textBgrams)
        return list(matches)
    
    @property
    def extendedMatches(self): 
        extendedMatches = []
        for match in self.initialMatches: 
            # Extend the match as far as possible exactly, first.
            match = self.extendedExactMatch(match)
                
            # Now extend the match as far as possible fuzzily. 
            match = self.extendedFuzzyMatch(match)
            
            extendedMatches.append(match)
        return extendedMatches
    
    def extendedExactMatch(self, match): 
        extended = [match]
        print(match)
        aLocation = self.textAgrams.index(match)
        bLocation = self.textBgrams.index(match)
        print('aLocation: ', aLocation)
        print('bLocation: ', bLocation)
        go = True
        pi, ni = 1, 1
        counter = 0
        while go: 
            prevA = self.textAgrams[aLocation-pi]
            prevB = self.textBgrams[bLocation-pi]
            nextA = self.textAgrams[aLocation+ni]
            nextB = self.textBgrams[bLocation+ni]
            print('prevA: ', prevA)
            print('prevB: ', prevB)
            print('nextA: ', nextA)
            print('nextB: ', nextB)
            if prevA == prevB: 
                print('prevA is same as prev B')
                extended = [prevA] + extended 
                pi += 1
            elif nextA == nextB:
                print('next A is same as next B')
                extended = extended + [nextA]
                ni += 1
            else: 
                go = False   
            counter += 1 
            if counter > 15: 
                go = False
        return extended
        
    def extendedFuzzyMatch(self, match): 
        pass

In [19]:
Match(test1Text.trigrams, mm.trigrams).extendedMatches

('reason', 'think', 'asks')
aLocation:  1710
bLocation:  14753
prevA:  ('chief', 'reason', 'think')
prevB:  ('chief', 'reason', 'think')
nextA:  ('think', 'asks', 'large')
nextB:  ('think', 'asks', 'large')
prevA is same as prev B
prevA:  ('eyes', 'chief', 'reason')
prevB:  ('eyes', 'chief', 'reason')
nextA:  ('think', 'asks', 'large')
nextB:  ('think', 'asks', 'large')
prevA is same as prev B
prevA:  ('important', 'eyes', 'chief')
prevB:  ('important', 'eyes', 'chief')
nextA:  ('think', 'asks', 'large')
nextB:  ('think', 'asks', 'large')
prevA is same as prev B
prevA:  ('lot', 'important', 'eyes')
prevB:  ('lot', 'important', 'eyes')
nextA:  ('think', 'asks', 'large')
nextB:  ('think', 'asks', 'large')
prevA is same as prev B
prevA:  ('less', 'lot', 'important')
prevB:  ('doubtless', 'lot', 'important')
nextA:  ('think', 'asks', 'large')
nextB:  ('think', 'asks', 'large')
next A is same as next B
prevA:  ('less', 'lot', 'important')
prevB:  ('doubtless', 'lot', 'important')
nextA:  ('

IndexError: list index out of range

In [32]:
Matcher(test1Text, mm, ngramSize=2).match()

25 total matches found.
match:  Match(a=925, b=515, size=30)


match 1:
[32mtest1[0m: (10484, 10893) closely, as the first description of Dorothea shows [31mmind was theoretic, and yearned by its nature after some lofty conception of the world which might frankly include the parish of Tipton and her own rule of conduct there; she was enamoured of intensity and greatness, and rash in embracing whatever seemed to her to have those aspects; likely to seek martyrdom, to make retractations, and then to incur martyrdom after all in a quarter where she had not sought[0m central issue of marriage is raised immediately
[32mMiddlemarch[0m: (5809, 6218) interest in gimp and artificial protrusions of drapery [31mmind was theoretic, and yearned by its nature after some lofty conception of the world which might frankly include the parish of Tipton and her own rule of conduct there; she was enamoured of intensity and greatness, and rash in embracing whatever seemed to her to have those aspects

(25,
 [(10484, 10893),
  (11022, 11122),
  (11610, 11765),
  (12440, 12527),
  (13752, 13883),
  (29395, 29580),
  (29659, 30092),
  (34726, 34866),
  (35477, 35916),
  (35930, 36003),
  (37725, 37845),
  (42065, 42515),
  (42700, 43207),
  (43222, 43347),
  (43367, 43466),
  (44773, 44882),
  (44892, 44989),
  (45831, 46249),
  (46338, 46542),
  (47322, 47456),
  (47469, 47699),
  (47713, 47862),
  (49114, 49317),
  (49335, 49454),
  (50226, 50531)],
 [(5809, 6218),
  (8751, 8851),
  (8890, 9046),
  (57013, 57100),
  (83868, 83999),
  (116900, 117085),
  (117160, 117594),
  (192301, 192441),
  (195148, 195588),
  (195600, 195673),
  (402604, 402726),
  (411725, 412177),
  (449403, 449911),
  (449924, 450049),
  (450145, 450244),
  (1575265, 1575374),
  (1576340, 1576437),
  (1648987, 1649408),
  (1649495, 1649704),
  (1688955, 1689089),
  (1689915, 1690146),
  (1690158, 1690307),
  (1708999, 1709204),
  (1709221, 1709342),
  (1793142, 1793447)])

In [47]:
Matcher(test1Text, mm, threshold=2, ngramSize=1).match()

33 total matches found.
match:  Match(a=157, b=0, size=3)


match 1:
[32mtest1[0m: (1742, 1772) inspiration whose final result is the novel [31mMiddlemarch. When George Eliot[0m embarked on "Miss Brooke" in November 1870, she was undertaking
[32mMiddlemarch[0m: (3, 33)  [31mMiddlemarch By George Eliot[0m PRELUDE Who that cares much to know the history
match:  Match(a=925, b=515, size=31)


match 2:
[32mtest1[0m: (10484, 10893) closely, as the first description of Dorothea shows [31mmind was theoretic, and yearned by its nature after some lofty conception of the world which might frankly include the parish of Tipton and her own rule of conduct there; she was enamoured of intensity and greatness, and rash in embracing whatever seemed to her to have those aspects; likely to seek martyrdom, to make retractations, and then to incur martyrdom after all in a quarter where she had not sought[0m central issue of marriage is raised immediately
[32mMiddlemarch[0m: (5809, 6218) inter

(33,
 [(1742, 1772),
  (10484, 10893),
  (11022, 11122),
  (11610, 11765),
  (12082, 12119),
  (12440, 12527),
  (13752, 13883),
  (29395, 29580),
  (29597, 29647),
  (29659, 30092),
  (34726, 34866),
  (35477, 35916),
  (35930, 36003),
  (37725, 37845),
  (42065, 42515),
  (42700, 43207),
  (43222, 43347),
  (43367, 43466),
  (43860, 43910),
  (44423, 44444),
  (44773, 44882),
  (44892, 44989),
  (45203, 45259),
  (45831, 46249),
  (46269, 46311),
  (46338, 46542),
  (47322, 47456),
  (47469, 47699),
  (47713, 47862),
  (49041, 49107),
  (49114, 49317),
  (49335, 49454),
  (50226, 50531)],
 [(3, 33),
  (5809, 6218),
  (8751, 8851),
  (8890, 9046),
  (28768, 28805),
  (57013, 57100),
  (83868, 83999),
  (116900, 117085),
  (117100, 117150),
  (117160, 117594),
  (192301, 192441),
  (195148, 195588),
  (195600, 195673),
  (402604, 402726),
  (411725, 412177),
  (449403, 449911),
  (449924, 450049),
  (450145, 450244),
  (585739, 585789),
  (1327698, 1327719),
  (1575265, 1575374),
  (15

In [39]:
test9text = Text(tests[9], 'test9')

In [55]:
Matcher(test9text, mm).match()

16 total matches found.
match:  Match(a=1145, b=341, size=16)


match 1:
[32mtest9[0m: (13769, 13998) opening passage of Middlemarch: [Dorothea Brooke [31musually spoken of (1) as being remarkably clever, but with the addition that her sister Celia had more common sense. Nevertheless, Celia wore scarcely more trimmings; and it was only to close observers (2) that her dress differed[0m sister’s, and had a shade of coquetry in its arrangements; for Miss
[32mMiddlemarch[0m: (3790, 4011) elder poets,--in a paragraph of to-day's newspaper [31musually spoken of as being remarkably clever, but with the addition that her sister Celia had more common-sense. Nevertheless, Celia wore scarcely more trimmings; and it was only to close observers that her dress differed[0m sister's, and had a shade of coquetry in its arrangements; for Miss
match:  Match(a=1172, b=367, size=28)


match 2:
[32mtest9[0m: (14106, 14474) Miss Brooke’s plain dress- ing [31mdue to mixed conditions, in most of whi

(16,
 [(13769, 13998),
  (14106, 14474),
  (14482, 14586),
  (14667, 14856),
  (14877, 15044),
  (15058, 15264),
  (15324, 15593),
  (15603, 15803),
  (15835, 16003),
  (16013, 16117),
  (16146, 16450),
  (16460, 16695),
  (16752, 16887),
  (17110, 17307),
  (17503, 17652),
  (17667, 17762)],
 [(3790, 4011),
  (4117, 4481),
  (4703, 4807),
  (4883, 5069),
  (5809, 5972),
  (6224, 6426),
  (6856, 7123),
  (7134, 7324),
  (7962, 8125),
  (8136, 8241),
  (8450, 8746),
  (9229, 9456),
  (9502, 9628),
  (10016, 10203),
  (12014, 12164),
  (12179, 12276)])

In [44]:
Matcher(test9text, mm, threshold=2).match()

24 total matches found.
match:  Match(a=1145, b=341, size=16)


match 1:
[32mtest9[0m: (13769, 13998) opening passage of Middlemarch: [Dorothea Brooke [31musually spoken of (1) as being remarkably clever, but with the addition that her sister Celia had more common sense. Nevertheless, Celia wore scarcely more trimmings; and it was only to close observers (2) that her dress differed[0m sister’s, and had a shade of coquetry in its arrangements; for Miss
[32mMiddlemarch[0m: (3790, 4011) elder poets,--in a paragraph of to-day's newspaper [31musually spoken of as being remarkably clever, but with the addition that her sister Celia had more common-sense. Nevertheless, Celia wore scarcely more trimmings; and it was only to close observers that her dress differed[0m sister's, and had a shade of coquetry in its arrangements; for Miss
match:  Match(a=1172, b=367, size=28)


match 2:
[32mtest9[0m: (14106, 14474) Miss Brooke’s plain dress- ing [31mdue to mixed conditions, in most of whi

(24,
 [(13769, 13998),
  (14106, 14474),
  (14482, 14586),
  (14667, 14856),
  (14877, 15044),
  (15058, 15264),
  (15324, 15593),
  (15603, 15803),
  (15835, 16003),
  (16013, 16117),
  (16146, 16450),
  (16460, 16695),
  (16752, 16887),
  (16923, 16982),
  (16992, 17054),
  (17110, 17307),
  (17349, 17401),
  (17418, 17479),
  (17503, 17652),
  (17667, 17762),
  (17776, 17832),
  (17851, 17900),
  (17916, 18002),
  (33013, 33102)],
 [(3790, 4011),
  (4117, 4481),
  (4703, 4807),
  (4883, 5069),
  (5809, 5972),
  (6224, 6426),
  (6856, 7123),
  (7134, 7324),
  (7962, 8125),
  (8136, 8241),
  (8450, 8746),
  (9229, 9456),
  (9502, 9628),
  (9656, 9715),
  (9725, 9787),
  (10016, 10203),
  (11867, 11920),
  (11936, 11992),
  (12014, 12164),
  (12179, 12276),
  (12290, 12346),
  (12364, 12409),
  (12425, 12511),
  (321957, 322046)])

In [57]:
Matcher(test9text, mm, threshold=2, removeStopwords=False).match()

24 total matches found.
match:  Match(a=1145, b=341, size=16)


match 1:
[32mtest9[0m: (13769, 13998) opening passage of Middlemarch: [Dorothea Brooke [31musually spoken of (1) as being remarkably clever, but with the addition that her sister Celia had more common sense. Nevertheless, Celia wore scarcely more trimmings; and it was only to close observers (2) that her dress differed[0m sister’s, and had a shade of coquetry in its arrangements; for Miss
[32mMiddlemarch[0m: (3790, 4011) elder poets,--in a paragraph of to-day's newspaper [31musually spoken of as being remarkably clever, but with the addition that her sister Celia had more common-sense. Nevertheless, Celia wore scarcely more trimmings; and it was only to close observers that her dress differed[0m sister's, and had a shade of coquetry in its arrangements; for Miss
match:  Match(a=1172, b=367, size=28)


match 2:
[32mtest9[0m: (14106, 14474) Miss Brooke’s plain dress- ing [31mdue to mixed conditions, in most of whi

(24,
 [(13769, 13998),
  (14106, 14474),
  (14482, 14586),
  (14667, 14856),
  (14877, 15044),
  (15058, 15264),
  (15324, 15593),
  (15603, 15803),
  (15835, 16003),
  (16013, 16117),
  (16146, 16450),
  (16460, 16695),
  (16752, 16887),
  (16923, 16982),
  (16992, 17054),
  (17110, 17307),
  (17349, 17401),
  (17418, 17479),
  (17503, 17652),
  (17667, 17762),
  (17776, 17832),
  (17851, 17900),
  (17916, 18002),
  (33013, 33102)],
 [(3790, 4011),
  (4117, 4481),
  (4703, 4807),
  (4883, 5069),
  (5809, 5972),
  (6224, 6426),
  (6856, 7123),
  (7134, 7324),
  (7962, 8125),
  (8136, 8241),
  (8450, 8746),
  (9229, 9456),
  (9502, 9628),
  (9656, 9715),
  (9725, 9787),
  (10016, 10203),
  (11867, 11920),
  (11936, 11992),
  (12014, 12164),
  (12179, 12276),
  (12290, 12346),
  (12364, 12409),
  (12425, 12511),
  (321957, 322046)])

In [86]:
Matcher(test9text, mm, ngramSize=1, threshold=2).match()

27 total matches found.
match:  Match(a=1145, b=341, size=18)


match 1:
[32mtest9[0m: (13769, 13998) opening passage of Middlemarch: [Dorothea Brooke [31musually spoken of (1) as being remarkably clever, but with the addition that her sister Celia had more common sense. Nevertheless, Celia wore scarcely more trimmings; and it was only to close observers (2) that her dress differed[0m sister’s, and had a shade of coquetry in its arrangements; for Miss
[32mMiddlemarch[0m: (3790, 4011) elder poets,--in a paragraph of to-day's newspaper [31musually spoken of as being remarkably clever, but with the addition that her sister Celia had more common-sense. Nevertheless, Celia wore scarcely more trimmings; and it was only to close observers that her dress differed[0m sister's, and had a shade of coquetry in its arrangements; for Miss
match:  Match(a=1164, b=360, size=4)


match 2:
[32mtest9[0m: (14028, 14075) close observers (2) that her dress differed from her sister [31mshade of co

(27,
 [(13769, 13998),
  (14028, 14075),
  (14106, 14474),
  (14482, 14586),
  (14603, 14650),
  (14667, 14856),
  (14877, 15044),
  (15058, 15264),
  (15279, 15298),
  (15324, 15593),
  (15603, 15803),
  (15835, 16003),
  (16013, 16117),
  (16146, 16450),
  (16460, 16695),
  (16752, 16887),
  (16923, 16982),
  (16992, 17054),
  (17110, 17307),
  (17349, 17401),
  (17418, 17479),
  (17503, 17652),
  (17667, 17762),
  (17776, 17832),
  (17851, 17900),
  (17916, 18002),
  (33013, 33102)],
 [(3790, 4011),
  (4041, 4088),
  (4117, 4481),
  (4703, 4807),
  (4823, 4866),
  (4883, 5069),
  (5809, 5972),
  (6224, 6426),
  (6827, 6846),
  (6856, 7123),
  (7134, 7324),
  (7962, 8125),
  (8136, 8241),
  (8450, 8746),
  (9229, 9456),
  (9502, 9628),
  (9656, 9715),
  (9725, 9787),
  (10016, 10203),
  (11867, 11920),
  (11936, 11992),
  (12014, 12164),
  (12179, 12276),
  (12290, 12346),
  (12364, 12409),
  (12425, 12511),
  (321957, 322046)])