In [8]:
import sys
import inspect, os
import mmh3

#################### Utilities ######################
#hashes a list of strings
def listhash(l,seed): 
    val = 0
    for e in l:
        val = val ^ mmh3.hash(e, seed)
    return val 

def ngram(shingle_length, string):
    tokens = string.split()
    shingles = [tokens[i:i+shingle_length] for i in range(len(tokens) - shingle_length + 1)]
    
    return shingles

def minhash(shingles, k):
    min_hashes = [sys.maxsize] * k
    for i in range(k):
        for shingle in shingles:
            shingle_hash = listhash(shingle, i)
            if(shingle_hash < min_hashes[i]):
                min_hashes[i] = shingle_hash
    return min_hashes

def signatures(docs, q, k):
    for file_name, text in docs.items():
        docs[file_name] = minhash(ngram(q, text), k)

def jaccard(Ssig,Tsig):
    k = len(Ssig)
    eq = 0
    for i in range(0,k):
        if Ssig[i] == Tsig[i]:
            eq += 1
    return eq/k  

def lsh(docs, b, k):
    r = int(k/b)
    M = [{}]*b
    for name, signature in docs.items():
        for band in range(0, k, r):
            signatureBand = tuple(signature[band:band+r])
            index = int(band/r)
            if tuple(signatureBand) not in M[index]:
                M[index][signatureBand] = {name}
            else:
                M[index][signatureBand].add(name)
    return M

def checkfile(filename, b, k, M):
    srcfolder = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))
    datafolder = os.path.join(srcfolder, "testDocuments")   # change to ats_corpus for large data set
    print(datafolder)
    filepath = os.path.join(datafolder, filename)
    f = open(filepath, 'rb')
    textFile = f.read()
    print("Looking for documents similar to " + filename)
    f.close()
    
    r = int(k/b)
    signature = minhash(ngram(q, textFile), k)
    similarDocs = set([])
    
    for band in range(0, k, r):      
        index = int(band/r)
        signatureBand = tuple(signature[band:band+r])
        
        if signatureBand in M[index]:
            for item in M[index][signatureBand]:
                similarDocs.add(item)
                
    if len(similarDocs):
        print('Similar documents')
        print(similarDocs)
    else:
        print("no similar files")
        
################### Similarity ######################
q = 3 # length of shingle
k = 100 # number of minhashes
docs = {} #dictionary mapping document id to document contents

# read data sets

srcfolder = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))
datafolder = os.path.join(srcfolder, "ats_corpus_small")   # change to ats_corpus for large data set

for file in os.listdir(datafolder):
    filepath = os.path.join(datafolder, file)
    if not file.startswith('.'):
        f = open(filepath, 'rb')
        docs[file] = f.read()
        print("read document " + file)
        f.close()

read document practicalthought00nev.txt
read document calltounconv00baxt.txt
read document remembermeorholy00palm.txt
read document thoughtsonpopery00nevi.txt
read document remember00palm.txt
read document lifeofrevrichard00baxt.txt
read document gospeltruth00whit.txt
read document memoirjamesbrai00ricegoog.txt


In [2]:
signatures(docs, q, k)

In [3]:
M = lsh(docs, 5, 100)

In [4]:
#printing the dictionary
for item in M:
    for ele in item.values():
        print(ele)
    print('\n')

{'practicalthought00nev.txt'}
{'practicalthought00nev.txt'}
{'practicalthought00nev.txt'}
{'practicalthought00nev.txt'}
{'practicalthought00nev.txt'}
{'newDoc.txt', 'calltounconv00baxt.txt'}
{'newDoc.txt', 'calltounconv00baxt.txt'}
{'newDoc.txt', 'calltounconv00baxt.txt'}
{'newDoc.txt', 'calltounconv00baxt.txt'}
{'newDoc.txt', 'calltounconv00baxt.txt'}
{'remembermeorholy00palm.txt'}
{'remembermeorholy00palm.txt'}
{'remembermeorholy00palm.txt'}
{'remembermeorholy00palm.txt'}
{'remembermeorholy00palm.txt'}
{'thoughtsonpopery00nevi.txt', 'thoughtsonpopery00nevi copy.txt'}
{'thoughtsonpopery00nevi.txt', 'thoughtsonpopery00nevi copy.txt'}
{'thoughtsonpopery00nevi.txt'}
{'thoughtsonpopery00nevi.txt', 'thoughtsonpopery00nevi copy.txt'}
{'thoughtsonpopery00nevi.txt', 'thoughtsonpopery00nevi copy.txt'}
{'remember00palm.txt'}
{'remember00palm.txt'}
{'remember00palm.txt'}
{'remember00palm.txt'}
{'remember00palm.txt'}
{'lifeofrevrichard00baxt.txt'}
{'lifeofrevrichard00baxt.txt'}
{'lifeofrevrichard

In [9]:
checkfile('testDoc1.txt', 5, 100, M)

/Users/jacob/Desktop/CTBD/Week 8/DataandTemplate/testDocuments
Looking for documents similar to testDoc1.txt
Similar documents
{'newDoc.txt', 'calltounconv00baxt.txt'}
