Merge 7f90231 into 5a63d0f

KrishnaswamyLab · Oct 4, 2019 · commit 84d7ef4
2 parents: 5a63d0f + 7f90231
Show file tree

Hide file tree

Showing 28 changed files with 541 additions and 189 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 numpy>=1.10.0
 scipy>=0.18.0
 scikit-learn>=0.19.1
-pandas>=0.19.0,<0.24
+pandas>=0.25
 decorator>=4.3.0
diff --git a/scprep/__init__.py b/scprep/__init__.py
@@ -15,11 +15,13 @@
 import scprep.reduce
 import scprep.run
 
-import pandas as _pd
-if int(_pd.__version__.split(".")[1]) < 24:
-    import numpy as _np
-
-    def __rmatmul__(self, other):
-        """ Matrix multiplication using binary `@` operator in Python>=3.5 """
-        return self.dot(_np.transpose(other))
-    _pd.core.series.Series.__rmatmul__ = __rmatmul__
+import pandas as pd
+if int(pd.__version__.split('.')[1]) < 26:
+    def fill_value(self):
+        # Used in reindex_indexer
+        try:
+            return self.values.dtype.fill_value
+        except AttributeError:
+            return self.values.dtype.na_value
+    from pandas.core.internals.blocks import ExtensionBlock
+    setattr(ExtensionBlock, 'fill_value', property(fill_value))
diff --git a/scprep/filter.py b/scprep/filter.py
@@ -28,14 +28,14 @@ def remove_empty_cells(data, *extra_data, sample_labels=None):
     warnings.warn("`scprep.filter.remove_empty_cells` is deprecated. "
                   "Use `scprep.filter.filter_empty_cells` instead.",
                   DeprecationWarning)
-    return filter_empty_cells(data, *extra_data)
+    return filter_empty_cells(data, *extra_data, sample_labels=sample_labels)
 
 
 def remove_duplicates(data, *extra_data, sample_labels=None):
     warnings.warn("`scprep.filter.remove_duplicates` is deprecated. "
                   "Use `scprep.filter.filter_duplicates` instead.",
                   DeprecationWarning)
-    return filter_duplicates(data, *extra_data)
+    return filter_duplicates(data, *extra_data, sample_labels=sample_labels)
 
 
 def filter_empty_genes(data, *extra_data):
@@ -288,7 +288,7 @@ def filter_gene_set_expression(data, *extra_data, genes=None,
         Filtered extra data, if passed.
     """
     cell_sums = measure.gene_set_expression(
-        data, genes,
+        data, genes=genes,
         starts_with=starts_with, ends_with=ends_with,
         exact_word=exact_word, regex=regex,
         library_size_normalize=library_size_normalize)
@@ -315,6 +315,8 @@ def _find_unique_cells(data):
     """
     if isinstance(data, pd.SparseDataFrame):
         unique_idx = _find_unique_cells(data.to_coo())
+    elif utils.is_sparse_dataframe(data):
+        unique_idx = _find_unique_cells(data.sparse.to_coo())
     elif isinstance(data, pd.DataFrame):
         unique_idx = ~data.duplicated()
     elif isinstance(data, np.ndarray):

diff --git a/scprep/io/csv.py b/scprep/io/csv.py
@@ -4,13 +4,14 @@
 import pandas as pd
 
 from .utils import _matrix_to_data_frame
+from .. import utils
 
 
 def _read_csv_sparse(filename, chunksize=1000000, fill_value=0.0, **kwargs):
-    """Read a csv file into a pandas.SparseDataFrame
+    """Read a csv file into a pd.DataFrame[pd.SparseArray]
     """
     chunks = pd.read_csv(filename, chunksize=chunksize, **kwargs)
-    data = pd.concat(chunk.to_sparse(fill_value=fill_value)
+    data = pd.concat(utils.dataframe_to_sparse(chunk, fill_value=fill_value)
                      for chunk in chunks)
     return data
 
@@ -36,15 +37,15 @@ def load_csv(filename, cell_axis='row', delimiter=',',
         If `True`, we assume cell names are in the first row/column. Otherwise
         expects a filename or an array containing a list of cell barcodes.
     sparse : bool, optional (default: False)
-        If True, loads the data as a pd.SparseDataFrame. This uses less memory
+        If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory
         but more CPU.
     **kwargs : optional arguments for `pd.read_csv`.
 
     Returns
     -------
     data : array-like, shape=[n_samples, n_features]
         If either gene or cell names are given, data will be a pd.DataFrame or
-        pd.SparseDataFrame. If no names are given, data will be a np.ndarray
+        pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray
         or scipy.sparse.spmatrix
     """
     if cell_axis not in ['row', 'column', 'col']:
@@ -113,15 +114,15 @@ def load_tsv(filename, cell_axis='row', delimiter='\t',
         If `True`, we assume cell names are in the first row/column. Otherwise
         expects a filename or an array containing a list of cell barcodes.
     sparse : bool, optional (default: False)
-        If True, loads the data as a pd.SparseDataFrame. This uses less memory
+        If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory
         but more CPU.
     **kwargs : optional arguments for `pd.read_csv`.
 
     Returns
     -------
     data : array-like, shape=[n_samples, n_features]
         If either gene or cell names are given, data will be a pd.DataFrame or
-        pd.SparseDataFrame. If no names are given, data will be a np.ndarray
+        pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray
         or scipy.sparse.spmatrix
     """
     return load_csv(filename, cell_axis=cell_axis, delimiter=delimiter,

diff --git a/scprep/io/fcs.py b/scprep/io/fcs.py
@@ -242,7 +242,7 @@ def load_fcs(filename, gene_names=True, cell_names=True,
         If `True`, we assume cell names are contained in the file. Otherwise
         expects a filename or an array containing a list of cell barcodes.
     sparse : bool, optional (default: None)
-        If True, loads the data as a pd.SparseDataFrame. This uses less memory
+        If True, loads the data as a pd.DataFrame[SparseArray]. This uses less memory
         but more CPU.
     metadata_channels : list-like, optional, shape=[n_meta] (default: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
         Channels to be excluded from the data
@@ -273,7 +273,7 @@ def load_fcs(filename, gene_names=True, cell_names=True,
         Values from metadata channels
     data : array-like, shape=[n_samples, n_features]
         If either gene or cell names are given, data will be a pd.DataFrame or
-        pd.SparseDataFrame. If no names are given, data will be a np.ndarray
+        pd.DataFrame[SparseArray]. If no names are given, data will be a np.ndarray
         or scipy.sparse.spmatrix
     """
     if cell_names is True:

diff --git a/scprep/io/mtx.py b/scprep/io/mtx.py
@@ -21,14 +21,14 @@ def load_mtx(mtx_file, cell_axis='row',
     cell_names : `str`, array-like, or `None` (default: None)
         Expects a filename or an array containing a list of cell barcodes.
     sparse : bool, optional (default: None)
-        If True, loads the data as a pd.SparseDataFrame. This uses less memory
+        If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory
         but more CPU.
 
     Returns
     -------
     data : array-like, shape=[n_samples, n_features]
         If either gene or cell names are given, data will be a pd.DataFrame or
-        pd.SparseDataFrame. If no names are given, data will be a np.ndarray
+        pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray
         or scipy.sparse.spmatrix
     """
     if cell_axis not in ['row', 'column', 'col']:

diff --git a/scprep/io/tenx.py b/scprep/io/tenx.py
@@ -95,7 +95,7 @@ def load_10X(data_dir, sparse=True, gene_labels='symbol',
     Returns
     -------
     data: array-like, shape=[n_samples, n_features]
-        If sparse, data will be a pd.SparseDataFrame. Otherwise, data will
+        If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
         be a pd.DataFrame.
     """
 
@@ -168,7 +168,7 @@ def load_10X_zip(filename, sparse=True, gene_labels='symbol',
     Returns
     -------
     data: array-like, shape=[n_samples, n_features]
-        If sparse, data will be a pd.SparseDataFrame. Otherwise, data will
+        If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
         be a pd.DataFrame.
     """
 
@@ -247,7 +247,7 @@ def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol',
     Returns
     -------
     data: array-like, shape=[n_samples, n_features]
-        If sparse, data will be a pd.SparseDataFrame. Otherwise, data will
+        If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
         be a pd.DataFrame.
     """
 

diff --git a/scprep/io/utils.py b/scprep/io/utils.py
@@ -6,6 +6,8 @@
 import warnings
 import numpy as np
 
+from .. import utils
+
 
 def _parse_header(header, n_expected, header_type="gene_names"):
     """
@@ -93,37 +95,40 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None):
         # dataframe with index and/or columns
         if sparse is None:
             # let the input data decide
-            sparse = isinstance(data, pd.SparseDataFrame) or sp.issparse(data)
+            sparse = utils.is_sparse_dataframe(data) or sp.issparse(data)
         if sparse and gene_names is not None and \
                 len(np.unique(gene_names)) < len(gene_names):
             warnings.warn(
                 "Duplicate gene names detected! Forcing dense matrix",
                 RuntimeWarning)
             sparse = False
         if sparse:
-            # return pandas.SparseDataFrame
+            # return pandas.DataFrame[SparseArray]
             if isinstance(data, pd.DataFrame):
                 if gene_names is not None:
                     data.columns = gene_names
                 if cell_names is not None:
                     data.index = cell_names
-                if not isinstance(data, pd.SparseDataFrame):
-                    data = data.to_sparse(fill_value=0.0)
+                if not utils.is_sparse_dataframe(data):
+                    data = utils.dataframe_to_sparse(data, fill_value=0.0)
+            elif sp.issparse(data):
+                data = pd.DataFrame.sparse.from_spmatrix(data, index=cell_names, columns=gene_names)
             else:
-                data = pd.SparseDataFrame(data, default_fill_value=0.0)
-                data.index = cell_names
-                data.columns = gene_names
+                data = pd.DataFrame(data, index=cell_names, columns=gene_names)
+                data = utils.dataframe_to_sparse(data, fill_value=0.0)
         else:
             # return pandas.DataFrame
             if isinstance(data, pd.DataFrame):
                 if gene_names is not None:
                     data.columns = gene_names
                 if cell_names is not None:
                     data.index = cell_names
-                if isinstance(data, pd.SparseDataFrame):
-                    data = data.to_dense()
+                if utils.is_sparse_dataframe(data):
+                    data = data.sparse.to_dense()
             else:
                 if sp.issparse(data):
                     data = data.toarray()
                 data = pd.DataFrame(data, index=cell_names, columns=gene_names)
+        # convert data to float
+        data = data.astype(float)
         return data
diff --git a/scprep/normalize.py b/scprep/normalize.py
@@ -64,13 +64,15 @@ def library_size_normalize(data, rescale='median',
     """
     # pandas support
     columns, index = None, None
-    if isinstance(data, pd.SparseDataFrame) or \
-            pd.api.types.is_sparse(data):
+    if isinstance(data, pd.DataFrame):
         columns, index = data.columns, data.index
-        data = data.to_coo()
-    elif isinstance(data, pd.DataFrame):
-        columns, index = data.columns, data.index
-        data = data.values
+        if utils.is_sparse_dataframe(data):
+            data = data.sparse.to_coo()
+        elif isinstance(data, pd.SparseDataFrame):
+            data = data.to_coo()
+        else:
+            # dense data
+            data = data.to_numpy()
 
     calc_libsize = sparse.issparse(data) and (return_library_size or
                                               data.nnz > 2**31)
@@ -91,7 +93,7 @@ def library_size_normalize(data, rescale='median',
     if columns is not None:
         # pandas dataframe
         if sparse.issparse(data_norm):
-            data_norm = pd.SparseDataFrame(data_norm, default_fill_value=0.0)
+            data_norm = utils.SparseDataFrame(data_norm, default_fill_value=0.0)
         else:
             data_norm = pd.DataFrame(data_norm)
         data_norm.columns = columns
@@ -120,7 +122,7 @@ def batch_mean_center(data, sample_idx=None):
     data : array-like, shape=[n_samples, n_features]
         Batch mean-centered output data.
     """
-    if sparse.issparse(data) or isinstance(data, pd.SparseDataFrame):
+    if sparse.issparse(data) or isinstance(data, pd.SparseDataFrame) or utils.is_sparse_dataframe(data):
         raise ValueError("Cannot mean center sparse data. "
                          "Convert to dense matrix first.")
     if sample_idx is None:

diff --git a/scprep/reduce.py b/scprep/reduce.py
@@ -279,8 +279,10 @@ def pca(data, n_components=100, eps=0.3,
     # handle dataframes
     if isinstance(data, pd.SparseDataFrame):
         data = data.to_coo()
+    elif utils.is_sparse_dataframe(data):
+        data = data.sparse.to_coo()
     elif isinstance(data, pd.DataFrame):
-        data = data.values
+        data = data.to_numpy()
 
     # handle sparsity
     if sparse.issparse(data):

diff --git a/scprep/select.py b/scprep/select.py
@@ -5,6 +5,7 @@
 import warnings
 import re
 import sys
+
 from . import utils
 
 if int(sys.version.split(".")[1]) < 7:
@@ -218,7 +219,7 @@ def get_gene_set(data, starts_with=None, ends_with=None,
     """
     if not _is_1d(data):
         try:
-            data = data.columns.values
+            data = data.columns.to_numpy()
         except AttributeError:
             raise TypeError("data must be a list of gene names or a pandas "
                             "DataFrame. Got {}".format(type(data).__name__))
@@ -255,7 +256,7 @@ def get_cell_set(data, starts_with=None, ends_with=None,
     """
     if not _is_1d(data):
         try:
-            data = data.index.values
+            data = data.index.to_numpy()
         except AttributeError:
             raise TypeError("data must be a list of cell names or a pandas "
                             "DataFrame. Got {}".format(type(data).__name__))
@@ -329,21 +330,37 @@ def select_cols(data, *extra_data, idx=None,
         _check_idx_1d(idx)
         idx = idx.flatten()
 
+    if isinstance(data, pd.SparseDataFrame):
+        # evil deprecated dataframe; get rid of it
+        data = utils.SparseDataFrame(data)
     if isinstance(data, pd.DataFrame):
         try:
-            data = data.loc[:, idx]
+            if isinstance(idx, (numbers.Integral, str)):
+                data = data.loc[:, idx]
+            else:
+                if np.issubdtype(idx.dtype, np.dtype(bool).type):
+                    # temporary workaround for pandas error
+                    raise TypeError
+                data = data.loc[:, idx]
         except (KeyError, TypeError):
+            if isinstance(idx, str):
+                raise
             if isinstance(idx, numbers.Integral) or \
-                    issubclass(np.array(idx).dtype.type, numbers.Integral):
+                    np.issubdtype(idx.dtype, np.dtype(int)) or \
+                    np.issubdtype(idx.dtype, np.dtype(bool)):
                 data = data.loc[:, np.array(data.columns)[idx]]
             else:
                 raise
     elif isinstance(data, pd.Series):
         try:
+            if np.issubdtype(idx.dtype, np.dtype(bool).type):
+                # temporary workaround for pandas error
+                raise TypeError
             data = data.loc[idx]
         except (KeyError, TypeError):
             if isinstance(idx, numbers.Integral) or \
-                    issubclass(np.array(idx).dtype.type, numbers.Integral):
+                    np.issubdtype(idx.dtype, np.dtype(int)) or \
+                    np.issubdtype(idx.dtype, np.dtype(bool)):
                 data = data.loc[np.array(data.index)[idx]]
             else:
                 raise
@@ -432,18 +449,36 @@ def select_rows(data, *extra_data, idx=None,
         _check_idx_1d(idx)
         idx = idx.flatten()
 
+    if isinstance(data, pd.SparseDataFrame):
+        # evil deprecated dataframe; get rid of it
+        data = utils.SparseDataFrame(data)
     if isinstance(data, (pd.DataFrame, pd.Series)):
         try:
-            with warnings.catch_warnings():
-                warnings.filterwarnings(
-                    "error", "Passing list-likes to .loc")
+            if isinstance(idx, (numbers.Integral, str)):
                 data = data.loc[idx]
+            else:
+                if np.issubdtype(idx.dtype, np.dtype(bool).type):
+                    # temporary workaround for pandas error
+                    raise TypeError
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "error", "Passing list-likes to .loc")
+                    data = data.loc[idx]
         except (KeyError, TypeError, FutureWarning):
+            if isinstance(idx, str):
+                raise
             if isinstance(idx, numbers.Integral) or \
-                    issubclass(np.array(idx).dtype.type, numbers.Integral):
-                data = data.iloc[idx]
+                    np.issubdtype(idx.dtype, np.dtype(int)) or \
+                    np.issubdtype(idx.dtype, np.dtype(bool)):
+                data = data.loc[np.array(data.index)[idx]]
             else:
                 raise
+        # temporary workaround for https://github.com/pandas-dev/pandas/issues/27781 
+        if utils.is_sparse_dataframe(data):
+            for col in np.where(data.isna().any())[0]:
+                colname = data.columns[col]
+                if utils.is_sparse_series(data[colname]) and data[colname].isna().all():
+                    data[colname] = data[colname].fillna(data[colname].sparse.fill_value)
     elif _is_1d(data):
         if isinstance(data, list):
             # can't numpy index a list