In [6]:
import sys
import mmh3
import os
import inspect, os
import json
from gensim.parsing import preprocessing as genPreProc
from gensim.parsing.preprocessing import preprocess_string
from spacy import load

def loadDoc(dataPath):
    with open(dataPath, 'r') as loadedDoc:
        doc = loadedDoc.read().replace('\n', '')
    return doc

def preprocess(datafolder):
    docs = {}
    nlp = load('en')
    
    for file in os.listdir(datafolder):
        filepath = os.path.join(datafolder, file)
        if not file.startswith('.'):
            document = loadDoc(filepath)

            genSettings2 = [lambda x: x.lower(), genPreProc.remove_stopwords, genPreProc.stem]
            step1preprocess = ' '.join(preprocess_string(document, filters=genSettings2))
            sentenceSplit = list(nlp(step1preprocess).sents) #splitting document into sentences

            genSettings3 = [lambda x: genPreProc.strip_non_alphanum(x), genPreProc.strip_multiple_whitespaces]
            sentencePreprocess = [' '.join(preprocess_string(str(ite), filters=genSettings3)) for ite in sentenceSplit]

            docs[os.path.basename(filepath)] = sentencePreprocess
    return docs

def listhash(l,seed): 
    val = 0
    for e in l:
        val = val ^ mmh3.hash(e, seed)
    return val 

def ngram(shingle_length, string):
    tokens = string.split()
    shingles = [tokens[i:i+shingle_length] for i in range(len(tokens) - shingle_length + 1)]
    
    return shingles

def minhash(shingles, k):
    min_hashes = [sys.maxsize] * k
    for i in range(k):
        for shingle in shingles:
            shingle_hash = listhash(shingle, i)
            if(shingle_hash < min_hashes[i]):
                min_hashes[i] = shingle_hash
    return min_hashes

def signature(docs, q, k):
    docsSignature = {}
    for file_name, sentences in docs.items():
        signatures = []
        for sentence in sentences:
            signatures.append(minhash(ngram(q, sentence), k))
        docsSignature[file_name] = signatures
    return docsSignature

def lshSentence(docsSignature, b, k):
    r = int(k/b)
    M = [{}]*b
    for name, signatures in docsSignature.items():
        for signature in signatures:
            for band in range(0, k, r):
                signatureBand = tuple(signature[band:band+r])
                index = int(band/r)
                sentenceName = name + '_' + str(signatures.index(signature)) + '_' + str(len(signatures))
                if tuple(signatureBand) not in M[index]:
                    M[index][signatureBand] = {sentenceName}
                else:
                    M[index][signatureBand].add(sentenceName)
                    
    with open('data.json', 'w') as fp:
        json.dump(M, fp)

def checkfileSentence(filename, b, k, M):
    nlp = load('en')
    srcfolder = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))
    datafolder = os.path.join(srcfolder, "testDocuments")   # change to ats_corpus for large data set
    filepath = os.path.join(datafolder, filename)
    
    document = loadDoc(filepath)

    genSettings2 = [lambda x: x.lower(), genPreProc.remove_stopwords, genPreProc.stem]
    step1preprocess = ' '.join(preprocess_string(document, filters=genSettings2))
    sentenceSplit = list(nlp(step1preprocess).sents)

    genSettings3 = [lambda x: genPreProc.strip_non_alphanum(x), genPreProc.strip_multiple_whitespaces]
    sentencePreprocess = [' '.join(preprocess_string(str(ite), filters=genSettings3)) for ite in sentenceSplit]

    textFile = sentencePreprocess

    print("Looking for documents similar to " + filename)
    
    similarDocs = set([])
    r = int(k/b)
    signatures = []
    for sentence in textFile:
        signatures.append(minhash(ngram(q, sentence), k))
    
    for signature in signatures:
        for band in range(0, k, r):      
            index = int(band/r)
            signatureBand = tuple(signature[band:band+r])
            sentenceName = filename + '_' + str(signatures.index(signature)) + '_' + str(len(signatures))
            if signatureBand in M[index]:
                for item in M[index][signatureBand]:
                    similarDocs.add((sentenceName,item))
            
    if len(similarDocs):
        print('Similar sentences')
        print(similarDocs)
    else:
        print("no similar sentences")
    return textFile

In [2]:
q = 3 # length of shingle
k = 100 # number of minhashes
b = 5 # number of bands

srcfolder = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))
datafolder = os.path.join(srcfolder, "ats_corpus_small")   # change to ats_corpus for large data set

docs = preprocess(datafolder)
docsSignature = signature(docs, q, k)
M = lshSentence(docsSignature, b, k)

In [7]:
checkfileSentence('testDoc1.txt', b, k, M)

Looking for documents similar to testDoc1.txt
Similar sentences
{('testDoc1.txt_3305_6776', 'calltounconv00baxt.txt_3305_6776'), ('testDoc1.txt_5118_6776', 'calltounconv00baxt.txt_5118_6776'), ('testDoc1.txt_3623_6776', 'calltounconv00baxt.txt_3623_6776'), ('testDoc1.txt_5688_6776', 'calltounconv00baxt.txt_5688_6776'), ('testDoc1.txt_4116_6776', 'calltounconv00baxt.txt_4116_6776'), ('testDoc1.txt_5173_6776', 'calltounconv00baxt.txt_5173_6776'), ('testDoc1.txt_6240_6776', 'calltounconv00baxt.txt_6240_6776'), ('testDoc1.txt_4711_6776', 'calltounconv00baxt.txt_4711_6776'), ('testDoc1.txt_5535_6776', 'calltounconv00baxt.txt_5535_6776'), ('testDoc1.txt_5073_6776', 'calltounconv00baxt.txt_5073_6776'), ('testDoc1.txt_711_6776', 'calltounconv00baxt.txt_711_6776'), ('testDoc1.txt_933_6776', 'calltounconv00baxt.txt_933_6776'), ('testDoc1.txt_6054_6776', 'calltounconv00baxt.txt_6054_6776'), ('testDoc1.txt_6687_6776', 'calltounconv00baxt.txt_6687_6776'), ('testDoc1.txt_1594_6776', 'calltounconv00b

In [None]:
docs[testDoct]

In [10]:
import pickle
with open("test", "wb") as file:
    pickle.dump(M, file)

In [9]:
M

[{(-1326658912,
   -36836084,
   -2062854568,
   -2138420332,
   -1308648664,
   -1128005952,
   -1894567608,
   -1891331900,
   -1128834791,
   -2142051412,
   -1826189946,
   -1717801022,
   -1836419327,
   -1435561661,
   -1588599230,
   -2073148056,
   -1347106453,
   -2009205190,
   -1748639923,
   -2018372133): {'practicalthought00nev.txt_0_8676'},
  (-2070260564,
   -290098985,
   -1500525075,
   -1438072711,
   -2141037704,
   -2121591785,
   -1502088651,
   -1011575530,
   -444219821,
   -2121744093,
   -1893676991,
   -426768964,
   -1246594401,
   -593826426,
   -1812249303,
   -2031691108,
   -1682172386,
   -2102510239,
   -1826487114,
   -1339852268): {'practicalthought00nev.txt_0_8676'},
  (-1209732284,
   -1956366558,
   -1354026651,
   -1406029702,
   -1810763673,
   -843234052,
   -1107344685,
   -808305118,
   -1996249473,
   6880057,
   -1926643530,
   -1139348506,
   -558901031,
   -491110747,
   -1299516895,
   -1324359882,
   -679398809,
   -1543443789,
   -15692