In [11]:
# One-liner to start the debugger here.
#from IPython.core.debugger import Tracer; Tracer()()

import scipy.sparse as sp
import numpy as np
import time
import hashlib


In [None]:

def chunk_generator(fname, chunk_size=1000):
    """Lazily read a text file of whitespace-separated integers in chunks.

    Parameters
    ----------
    fname : str
        Path of the file to read. Every non-blank line must consist of
        integers separated by whitespace.
    chunk_size : int, optional
        Maximum number of parsed lines per yielded chunk (default 1000).

    Yields
    ------
    list of list of int
        Up to ``chunk_size`` parsed lines, in file order. Only the last
        chunk may be shorter, and an empty chunk is never yielded.

    Raises
    ------
    ValueError
        When iteration starts with ``chunk_size`` smaller than 1, or when a
        token cannot be parsed as an integer.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    data = []
    with open(fname, 'r') as ifile:
        for line in ifile:
            # split() without an argument tolerates tabs, repeated blanks and
            # trailing whitespace, all of which trip up split(" ").
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            data.append([int(p) for p in fields])
            if len(data) == chunk_size:
                yield data
                data = []

    # Flush the last, partially filled chunk.
    if data:
        yield data

class ShingleFileParser:
    """Build a binary word-by-document incidence matrix from a docword file.

    The file is read through ``chunk_generator``. Its first line supplies the
    number of documents ``D`` and its second line the vocabulary size ``W``
    (first integer of each line). The third line is skipped (presumably the
    non-zero count of the bag-of-words format -- confirm against the data
    description). Every following line is an entry whose first two integers
    are a 1-based document id and a 1-based word id; anything after them is
    ignored.
    """

    def __init__(self, fname, chunk_size=40000):
        """Parse ``fname`` into ``self.X``.

        Parameters
        ----------
        fname : str
            Path of the docword text file.
        chunk_size : int, optional
            Number of lines requested from ``chunk_generator`` at a time
            (default 40000). It only affects buffering, not the result.

        Raises
        ------
        ValueError
            If the file has fewer than the two header lines needed to size
            the matrix.
        """
        # Number of documents (matrix columns), from header line 1.
        self.D = None
        # Vocabulary size (matrix rows), from header line 2.
        self.W = None
        # W x D incidence matrix; LIL while it is being filled.
        self.X = None

        # The header is tracked by absolute line number so that parsing does
        # not depend on where chunk boundaries happen to fall.
        line_no = 0
        for chunk in chunk_generator(fname, chunk_size):
            for item in chunk:
                if line_no == 0:
                    self.D = item[0]
                elif line_no == 1:
                    self.W = item[0]
                    self.X = sp.lil_matrix((self.W, self.D))
                elif line_no >= 3:
                    # line_no == 2 is the skipped third header line.
                    self.add_to_matrix(item)
                line_no += 1

        if self.X is None:
            raise ValueError("file has no usable header (need D and W lines): "
                             + str(fname))

    def add_to_matrix(self, item):
        """Mark the (word, document) pair of one entry line as present.

        ``item[0]`` is the 1-based document id and ``item[1]`` the 1-based
        word id; both are shifted to 0-based matrix indices.
        """
        doc_idx = item[0] - 1
        word_idx = item[1] - 1
        self.X[word_idx, doc_idx] = 1

    def save_csc(self, filename):
        """Convert ``self.X`` to CSC and store its raw arrays with ``np.savez``.

        The archive holds the arrays ``data``, ``indices``, ``indptr`` and
        ``shape``. As a side effect ``self.X`` is replaced by its CSC form.

        Parameters
        ----------
        filename : str
            Target path; ``np.savez`` appends ``.npz`` when it is missing.
        """
        print("converting to csc (column-access optimized) sparse format ...")
        self.X = sp.csc_matrix(self.X)
        print("saving matrix with " + str(len(self.X.data)) + " elements")
        np.savez(filename, data=self.X.data, indices=self.X.indices,
                 indptr=self.X.indptr, shape=self.X.shape)
        # Report the path that actually exists on disk, suffix included.
        saved_as = filename if filename.endswith('.npz') else filename + '.npz'
        print("content saved to " + saved_as)
            


In [None]:
# Parse the raw docword file into a sparse matrix and report the wall-clock time.
start = time.time()
sf_parser =  ShingleFileParser('../data/docword.kos.txt')
end = time.time()
print("Parsing the file took " + str(end - start) + " seconds")
# np.savez appends ".npz", so this writes ../data/csc_kos.npz (the file the
# loading cell reads back).
sf_parser.save_csc('../data/csc_kos');
# Drop the parser together with the matrix it holds.
del sf_parser

In [4]:
def load_sparse_csc(filename):
    """Load a CSC sparse matrix from an ``.npz`` archive.

    Parameters
    ----------
    filename : str
        Path of an archive containing the arrays ``data``, ``indices``,
        ``indptr`` and ``shape`` (the layout written by
        ``ShingleFileParser.save_csc``).

    Returns
    -------
    scipy.sparse.csc_matrix
        The matrix rebuilt from the stored arrays.
    """
    # np.load returns an NpzFile that keeps the archive open and reads its
    # members lazily; pull everything out inside the context manager so the
    # file handle is closed before returning.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # Stored as an array; hand scipy a plain tuple of Python ints.
        shape = tuple(int(dim) for dim in loader['shape'])
    return sp.csc_matrix((data, indices, indptr), shape=shape)
# Reload the stored matrix from disk; the cells below read it as X.
X = load_sparse_csc('../data/csc_kos.npz')

In [8]:
def jaccard_similarity(A):
    """Pairwise Jaccard similarity between the columns of a sparse matrix.

    Parameters
    ----------
    A : scipy.sparse matrix
        One document per column. Stored entries are expected to be exactly 1
        (set membership); with other values ``A.T.dot(A)`` no longer counts
        shared elements.

    Returns
    -------
    Dense ``(n_docs, n_docs)`` matrix ``J`` with
    ``J[a, b] = |a & b| / |a | b|``. ``J`` is symmetric. A pair of two empty
    documents evaluates 0/0 and comes out as NaN.
    """
    A_csc = sp.csc_matrix(A)
    # Number of shared elements for every pair of documents, densified once.
    intersect = A_csc.T.dot(A_csc).todense()
    # |d| for every document: count of stored entries per column, obtained in
    # one call instead of slicing the matrix column by column.
    doc_sizes = np.asarray(A_csc.getnnz(axis=0), dtype=float)
    # Broadcasting a column vector against a row vector yields |a| + |b| for
    # all pairs without materialising two full mesh grids.
    cardinality_sum = doc_sizes[:, np.newaxis] + doc_sizes[np.newaxis, :]
    # |union(a, b)| == |a| + |b| - |intersection(a, b)|
    union = cardinality_sum - intersect
    J = np.divide(intersect, union)
    return J

In [9]:
start = time.time()
# J holds every pair twice, once as J[a,b] and once as J[b,a].
J = jaccard_similarity(X)
end = time.time()
print("Computing Jaccard similarity matrix took " + str(end - start) + " seconds")
# NOTE(review): np.triu only zeroes the entries below the diagonal, and np.mean
# then averages over every cell of the square matrix. The zeroed lower triangle
# and the diagonal (each document compared with itself) are therefore still
# counted, so this value is not the mean similarity over distinct document pairs.
np.mean(np.triu(J))

Computing Jaccard similarity matrix took 1.52885699272 seconds


0.017038137668006964

In [10]:
def find_next_prime(n):
    """Return the smallest prime ``p`` with ``p >= n``.

    ``n`` itself is returned when it is already prime. Any ``n`` below 2
    yields 2, the smallest prime (0 and 1 are not primes).

    Parameters
    ----------
    n : int
        Lower bound of the search.

    Returns
    -------
    int
        The first prime at or above ``n``.
    """
    def is_prime(p):
        # Trial division; a composite p always has a divisor d with d*d <= p,
        # so there is no need to test candidates beyond that.
        if p < 2:
            return False
        d = 2
        while d * d <= p:
            if p % d == 0:
                return False
            d += 1
        return True

    candidate = max(n, 2)
    # The search always ends: there is a prime between m and 2*m for every
    # m > 1 (Bertrand's postulate).
    while not is_prime(candidate):
        candidate += 1
    return candidate

In [12]:
def row2sig(A, i):
    """Hash the content of row ``i`` of sparse matrix ``A`` into one integer.

    Every entry of the row is truncated to an int and written in decimal; the
    resulting strings of all columns are concatenated and fed to SHA-1.

    Parameters
    ----------
    A : scipy.sparse matrix
        Matrix supporting row indexing and ``todense()``.
    i : int
        Row index.

    Returns
    -------
    int
        The SHA-1 digest interpreted as a (160-bit) integer.
    """
    # ravel() keeps a 1-D array even when the matrix has a single column,
    # where squeeze() would collapse to a 0-d array that cannot be joined.
    row = np.asarray(A[i, :].todense()).ravel()
    big = ''.join(row.astype(int).astype(str))
    # hashlib digests bytes, not text; the string only holds decimal digits
    # (and possibly minus signs), so ASCII encoding is lossless.
    hsh = int(hashlib.sha1(big.encode('ascii')).hexdigest(), 16)
    return hsh


In [13]:
# Example: signature of the matrix row with index 1. The value is kept in
# `hsh` and reused by the histogram cell further down.
hsh = row2sig(X,1)
hsh

1437417051751051074730810493049826714766962945964L

In [14]:

def N_universal_hashfunctions(N, n_functions=100, seed=None):
    """Draw random hash functions ``h(x) = ((a*x + b) mod p) mod N``.

    ``p`` is the prime returned by ``find_next_prime(N)``; the multiplier
    ``a`` is drawn from ``[1, p)`` and the offset ``b`` from ``[0, p)``.

    Parameters
    ----------
    N : int
        Number of buckets; every returned function maps into ``range(N)``.
    n_functions : int, optional
        How many functions to draw (default 100, the value that used to be
        hard-coded).
    seed : int or None, optional
        Seed of a private ``numpy.random.RandomState``. With ``None``
        (default) the coefficients come from NumPy's global random state.

    Returns
    -------
    list of callable
        ``n_functions`` functions taking an integer ``x`` and returning a
        Python int. Each also accepts ``a`` and ``b`` as keyword overrides.

    Raises
    ------
    ValueError
        If ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError("N must be a positive integer")

    # max(..., 2): a modulus of 1 would leave no non-zero multiplier to draw.
    N_prime = max(find_next_prime(N), 2)
    rng = np.random if seed is None else np.random.RandomState(seed)
    multipliers = rng.randint(low=1, high=N_prime, size=n_functions)
    offsets = rng.randint(low=0, high=N_prime, size=n_functions)

    fn = []
    for a_i, b_i in zip(multipliers, offsets):
        # Bind the coefficients as plain Python ints. The inputs can be huge
        # (e.g. 160-bit SHA-1 values), and mixing them with fixed-width NumPy
        # integer scalars overflows under NumPy 2 promotion rules.
        def h(x, a=int(a_i), b=int(b_i)):
            return int((a * x + b) % N_prime % N)
        fn.append(h)
    return fn



In [None]:
# check out the distribution of these random hash functions
import matplotlib.pyplot as plt

# Draw the hash functions for 100 buckets and apply each of them to the
# example signature `hsh` computed in an earlier cell.
fn = N_universal_hashfunctions(100)
mods = [f(hsh) for f in fn]
# Histogram of the resulting bucket values.
plt.hist(mods)
plt.show()

In [None]:
# Min-hash signatures: SGN[k, d] ends up as the smallest value that hash
# function k takes over the rows (words) occurring in document d.
# -1 marks "no row seen yet" and remains for documents without any entry.
N = 100
fn = N_universal_hashfunctions(N)
D = X.shape[1]
W = X.shape[0]
# One signature column per document, one signature row per hash function.
SGN = np.full([len(fn), D], -1)
# Rows are visited one at a time; row slicing is slow on CSC, so use a CSR copy.
X_rows = sp.csr_matrix(X)
for i_r in range(0, W):
    # Documents (columns) in which row i_r has a non-zero entry.
    docs = X_rows[i_r, :].nonzero()[1]
    if len(docs) == 0:
        # The row occurs in no document, so it cannot change any signature.
        continue
    rowcontent = row2sig(X_rows, i_r)
    # Hash values of this row as a column vector, one entry per hash function.
    hshs = np.array([f(rowcontent) for f in fn])[:, np.newaxis]
    current = SGN[:, docs]
    # Keep the minimum per (hash function, document); -1 counts as "unset".
    SGN[:, docs] = np.where((current == -1) | (hshs < current), hshs, current)
        




In [None]:
# Display the signature matrix filled in by the previous cell.
SGN