In [18]:
# One-liner to start the debugger here.
#from IPython.core.debugger import Tracer; Tracer()()

def chunk_generator(fname, chunk_size=1000):
    """Lazily read a whitespace-separated integer file in fixed-size chunks.

    Each non-blank line of the file is parsed into a list of ints. Lines are
    grouped into lists of at most ``chunk_size`` rows, which are yielded one
    at a time so the whole file never has to be held in memory.

    Parameters
    ----------
    fname : str
        Path of the text file to read.
    chunk_size : int, optional
        Maximum number of parsed lines per yielded chunk (default 1000).
        Must be at least 1.

    Yields
    ------
    list of list of int
        Up to ``chunk_size`` parsed lines. Only the last chunk can be shorter.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1, or if a field is not an integer.
    """
    if chunk_size < 1:
        # A non-positive size can never be filled and would lose rows.
        raise ValueError("chunk_size must be a positive integer")

    data = []
    with open(fname, 'r') as ifile:
        for line in ifile:
            # split() without an argument tolerates repeated spaces, tabs and
            # the trailing newline; it returns [] for a blank line.
            fields = line.split()
            if not fields:
                continue  # skip blank lines (e.g. an empty line at EOF)
            data.append([int(p) for p in fields])
            if len(data) == chunk_size:
                yield data
                data = []

        # Hand out whatever is left after the last full chunk.
        if data:
            yield data
    
import scipy.sparse as sp
import numpy as np

class ShingleFileParser:
    """Parse a "docword"-style text file into a sparse 0/1 matrix.

    The first chunk of the file starts with a three-line header: the first
    value of line 1 is stored as ``D``, the first value of line 2 as ``W``,
    and line 3 is skipped. (Presumably these are the document count, the
    vocabulary size and the non-zero count -- confirm against the data set
    description.) Every following line contributes one matrix entry.

    Attributes set by the constructor:
        D: number of columns of ``X``.
        W: number of rows of ``X``.
        X: ``scipy.sparse.lil_matrix`` of shape ``(W, D)``; replaced by a
           CSC matrix once ``save_csc`` has been called.
    """

    def __init__(self, fname):
        """Read ``fname`` chunk by chunk and populate ``self.X``."""
        for chunk_number, rows in enumerate(chunk_generator(fname, 40000)):
            if chunk_number == 0:
                # Header lives at the top of the very first chunk.
                self.D = rows[0][0]
                self.W = rows[1][0]
                self.X = sp.lil_matrix((self.W, self.D))
                entries = rows[3:]
            else:
                entries = rows
            for entry in entries:
                self.add_to_matrix(entry)

    def add_to_matrix(self, item):
        """Mark one entry of ``self.X`` as present.

        ``item[0]`` selects the column and ``item[1]`` the row, both 1-based
        in the file and shifted to 0-based here. Any further values in
        ``item`` are ignored; the stored value is always 1.
        """
        column, row = item[0] - 1, item[1] - 1
        self.X[row, column] = 1

    def save_csc(self, filename):
        """Convert ``self.X`` to CSC format and store its arrays via ``np.savez``.

        The arrays are saved under the keys ``data``, ``indices``, ``indptr``
        and ``shape``. ``self.X`` is rebound to the converted CSC matrix.
        """
        print("converting to csc (column-access optimized) sparse format ...")
        converted = sp.csc_matrix(self.X)
        self.X = converted
        np.savez(
            filename,
            data=converted.data,
            indices=converted.indices,
            indptr=converted.indptr,
            shape=converted.shape,
        )
        print ("content saved to " + filename)
            

def load_sparse_csc(filename):
    """Rebuild a CSC sparse matrix from an ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr``
    and ``shape`` (the layout written by ``ShingleFileParser.save_csc``).

    Parameters
    ----------
    filename : str
        Path of the ``.npz`` file, including the extension.

    Returns
    -------
    scipy.sparse.csc_matrix
        The matrix described by the stored arrays.
    """
    # np.load on an .npz returns a lazily-read archive that keeps the file
    # open; the context manager closes it once the arrays have been read.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # Stored as a small integer array; hand scipy a plain tuple of ints.
        shape = tuple(int(dim) for dim in loader['shape'])
    return sp.csc_matrix((data, indices, indptr), shape=shape)


In [19]:
import time

# Time only the parsing step (reading the text file and filling the sparse
# matrix); the CSC conversion and save below are not included in the figure.
start = time.time()
sf_parser =  ShingleFileParser('../data/docword.kos.txt')
end = time.time()
print("Parsing the file took " + str(end - start) + " seconds")
# np.savez appends ".npz" when the name lacks it, so the file on disk is
# '../data/csc_kos.npz'.
sf_parser.save_csc('../data/csc_kos');
# Drop the parser (and the matrix it holds) from the kernel namespace.
del sf_parser

Parsing the file took 2.85614800453 seconds
converting to csc (column-access optimized) sparse format ...
content saved to ../data/csc_kos


In [20]:
# Reload the saved matrix from disk as a CSC sparse matrix.
X = load_sparse_csc('../data/csc_kos.npz')

In [105]:
def jaccard_similarity(A):
    """Pairwise Jaccard similarity between the columns of a sparse 0/1 matrix.

    Every column of ``A`` is treated as one document and every row as one
    possible set element; a 1 at ``A[i, j]`` means element ``i`` belongs to
    document ``j``.

    Parameters
    ----------
    A : scipy.sparse matrix
        Binary matrix with one column per document.

    Returns
    -------
    Dense square matrix ``J`` with one row and one column per document,
    where ``J[a, b] = |a & b| / |a | b|``. ``J`` is symmetric. A pair of
    documents that are both empty has a zero union, so its entry is the
    result of 0/0 (nan).
    """
    # For 0/1 columns the dot product counts the elements two documents share.
    intersect = A.T.dot(A)
    # Size of every document's set = number of ones in its column. This is
    # derived from A itself so the function does not depend on a global that
    # may be missing or describe a different matrix.
    D_len = np.asarray(A.sum(axis=0)).ravel()
    # Broadcasting builds the table of |a| + |b| for every pair of documents.
    cardinality_sum = D_len[:, np.newaxis] + D_len[np.newaxis, :]
    # |union(a, b)| == |a| + |b| - |intersection(a, b)|
    union = cardinality_sum - intersect
    J = np.divide(intersect.todense(), union)
    return J

In [107]:
J = jaccard_similarity(X)
# Caveat: this mean is taken over every entry of J, so each pair is counted
# twice (J[a,b] == J[b,a]) and the diagonal (each document compared with
# itself) is included as well. To get the mean similarity between distinct
# documents, the diagonal would have to be excluded.
np.mean(J)

0.03378473014650963