diff --git a/Makefile b/Makefile index e36325e..df824c8 100644 --- a/Makefile +++ b/Makefile @@ -20,12 +20,12 @@ clean: # Tests... # -test-code: in +test-code: $(NOSETESTS) -s mrec test-coverage: $(NOSETESTS) -s --with-coverage --cover-html --cover-html-dir=coverage \ --cover-package=mrec mrec -test: test-code test-doc +test: test-code diff --git a/mrec/sparse.py b/mrec/sparse.py index 3ecd58e..0c581ce 100644 --- a/mrec/sparse.py +++ b/mrec/sparse.py @@ -13,7 +13,6 @@ def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_o Parameters ---------- - filepath : file or str File containing simply formatted row,col,val sparse matrix data. comments : str, optional @@ -33,7 +32,6 @@ def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_o Returns ------- - mat : scipy.sparse.coo_matrix The sparse matrix. """ @@ -46,7 +44,7 @@ def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_o shape = (max(row)+1,max(col)+1) return coo_matrix((data,(row,col)),shape=shape) -def savez(d,filepath): +def savez(d,file): """ Save a sparse matrix to file in numpy binary format. @@ -54,21 +52,28 @@ def savez(d,filepath): ---------- d : scipy sparse matrix The sparse matrix to save. - filepath : str - The filepath to write to. + file : str or file + Either the file name (string) or an open file (file-like object) + where the matrix will be saved. If file is a string, the ``.npz`` + extension will be appended to the file name if it is not already there. """ - np.savez(filepath,row=d.row,col=d.col,data=d.data,shape=d.shape) + np.savez(file,row=d.row,col=d.col,data=d.data,shape=d.shape) -def loadz(filepath): +def loadz(file): """ Load a sparse matrix saved to file with savez. Parameters ---------- - filepath : str - The filepath to read from. + file : str or file + The open file or filepath to read from. + + Returns + ------- + mat : scipy.sparse.coo_matrix + The sparse matrix. 
""" - y = np.load(filepath) + y = np.load(file) return coo_matrix((y['data'],(y['row'],y['col'])),shape=y['shape']) class fast_sparse_matrix(object): @@ -113,22 +118,56 @@ def __init__(self,X,col_view=None): @property def shape(self): - """Return the shape of the underlying matrix.""" + """ + Return the shape of the underlying matrix. + """ return self.X.shape def fast_get_col(self,j): - """Return column j.""" + """ + Return column j of the underlying matrix. + + Parameters + ---------- + j : int + Index of column to get. + + Returns + ------- + col : scipy.sparse.csc_matrix + Copy of column j of the matrix. + """ col = self.col_view[:,j].copy() col.data = self.X.data[col.data] return col def fast_update_col(self,j,vals): - """Update values of existing non-zeros in column j.""" + """ + Update values of existing non-zeros in column j + of the underlying matrix. + + Parameters + ---------- + j : int + Index of the column to update. + vals : array like + The new values to be assigned, must satisfy + len(vals) == X[:,j].nnz i.e. this method can + only change the value of existing non-zero entries + of column j, it cannot add new ones. + """ dataptr = self.col_view[:,j].data self.X.data[dataptr] = vals def save(self,filepath): - """Save to file as arrays in numpy binary format.""" + """ + Save to file as arrays in numpy binary format. + + Parameters + ---------- + filepath : str + The filepath to write to. + """ d = self.X.tocoo(copy=False) v = self.col_view.tocoo(copy=False) np.savez(filepath,row=d.row,col=d.col,data=d.data,shape=d.shape, @@ -138,6 +177,11 @@ def save(self,filepath): def load(filepath): """ Load a fast_sparse_matrix from file written by fast_sparse_matrix.save(). + + Parameters + ---------- + filepath : str + The filepath to load. 
""" y = np.load(filepath,mmap_mode='r') X = coo_matrix((y['data'],(y['row'],y['col'])),shape=y['shape']) @@ -152,7 +196,6 @@ def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_o Parameters ---------- - filepath : file or str File containing simply formatted row,col,val sparse matrix data. comments : str, optional @@ -169,6 +212,11 @@ def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_o Offset applied to the row and col indices in the input data (default: 1). The default offset is chosen so that 1-indexed data on file results in a fast_sparse_matrix holding 0-indexed matrices. + + Returns + ------- + mat : mrec.sparse.fast_sparse_matrix + A fast_sparse_matrix holding the data in the file. """ X = loadtxt(filepath,comments=comments,delimiter=delimiter,skiprows=skiprows,usecols=usecols) return fast_sparse_matrix(X) @@ -180,9 +228,13 @@ def loadmm(filepath): Parameters ---------- - filepath : file or str The matrixmarket file to read. + + Returns + ------- + mat : mrec.sparse.fast_sparse_matrix + A fast_sparse_matrix holding the data in the file. 
""" X = mmread(filepath) return fast_sparse_matrix(X) diff --git a/mrec/tests/test_sparse.py b/mrec/tests/test_sparse.py new file mode 100644 index 0000000..e773e6c --- /dev/null +++ b/mrec/tests/test_sparse.py @@ -0,0 +1,74 @@ +import tempfile +import os +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_array_equal + +from mrec.testing import get_random_coo_matrix + +from mrec.sparse import loadtxt +from mrec.sparse import savez +from mrec.sparse import loadz +from mrec.sparse import fast_sparse_matrix + +def test_loadtxt(): + X = get_random_coo_matrix() + f,path = tempfile.mkstemp(suffix='.npz') + with open(path,'w') as f: + for i,j,v in zip(X.row,X.col,X.data): + print >>f,'{0}\t{1}\t{2}'.format(i+1,j+1,v) + Y = loadtxt(path) + os.remove(path) + assert_array_equal(X.toarray(),Y.toarray()) + +def test_savez_loadz(): + m = get_random_coo_matrix() + f,path = tempfile.mkstemp(suffix='.npz') + savez(m,path) + n = loadz(path) + os.remove(path) + assert_array_equal(n.toarray(),m.toarray()) + +def test_init_fast_sparse_matrix(): + X = get_random_coo_matrix() + Y = X.tocsr() + Z = X.tocsc() + for M in [X,Y,Z]: + m = fast_sparse_matrix(M) + assert_array_equal(m.X.toarray(),M.toarray()) + assert_equal(m.shape,M.shape) + +def test_fast_get_col(): + X = get_random_coo_matrix().tocsc() + m = fast_sparse_matrix(X) + rows,cols = X.shape + for j in xrange(cols): + assert_array_equal(m.fast_get_col(j).toarray(),X[:,j].toarray()) + +def test_fast_update_col(): + X = get_random_coo_matrix().tocsc() + m = fast_sparse_matrix(X) + rows,cols = X.shape + for j in xrange(cols): + col = m.fast_get_col(j) + new_vals = [] + for i in X[:,j].indices: + new_vals.append(X[i,j]+1) + m.fast_update_col(j,new_vals) + expected = X[:,j].toarray() + for i in xrange(expected.shape[0]): + if expected[i] > 0: + expected[i] += 1 + assert_array_equal(m.fast_get_col(j).toarray(),expected) + +def test_save_load(): + """Save to file as arrays in numpy binary format.""" 
+ X = get_random_coo_matrix() + m = fast_sparse_matrix(X) + f,path = tempfile.mkstemp(suffix='.npz') + m.save(path) + n = fast_sparse_matrix.load(path) + os.remove(path) + assert_equal(m.shape,n.shape) + assert_array_equal(m.X.toarray(),n.X.toarray()) + assert_array_equal(m.col_view.toarray(),n.col_view.toarray()) +