Skip to content

Commit

Permalink
Merge 7f90231 into 5a63d0f
Browse files Browse the repository at this point in the history
  • Loading branch information
scottgigante committed Oct 4, 2019
2 parents 5a63d0f + 7f90231 commit 84d7ef4
Show file tree
Hide file tree
Showing 28 changed files with 541 additions and 189 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
numpy>=1.10.0
scipy>=0.18.0
scikit-learn>=0.19.1
pandas>=0.19.0,<0.24
pandas>=0.25
decorator>=4.3.0
18 changes: 10 additions & 8 deletions scprep/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
import scprep.reduce
import scprep.run

import pandas as _pd
if int(_pd.__version__.split(".")[1]) < 24:
import numpy as _np

def __rmatmul__(self, other):
""" Matrix multiplication using binary `@` operator in Python>=3.5 """
return self.dot(_np.transpose(other))
_pd.core.series.Series.__rmatmul__ = __rmatmul__
import pandas as pd
if int(pd.__version__.split('.')[1]) < 26:
def fill_value(self):
# Used in reindex_indexer
try:
return self.values.dtype.fill_value
except AttributeError:
return self.values.dtype.na_value
from pandas.core.internals.blocks import ExtensionBlock
setattr(ExtensionBlock, 'fill_value', property(fill_value))
8 changes: 5 additions & 3 deletions scprep/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ def remove_empty_cells(data, *extra_data, sample_labels=None):
warnings.warn("`scprep.filter.remove_empty_cells` is deprecated. "
"Use `scprep.filter.filter_empty_cells` instead.",
DeprecationWarning)
return filter_empty_cells(data, *extra_data)
return filter_empty_cells(data, *extra_data, sample_labels=sample_labels)


def remove_duplicates(data, *extra_data, sample_labels=None):
warnings.warn("`scprep.filter.remove_duplicates` is deprecated. "
"Use `scprep.filter.filter_duplicates` instead.",
DeprecationWarning)
return filter_duplicates(data, *extra_data)
return filter_duplicates(data, *extra_data, sample_labels=sample_labels)


def filter_empty_genes(data, *extra_data):
Expand Down Expand Up @@ -288,7 +288,7 @@ def filter_gene_set_expression(data, *extra_data, genes=None,
Filtered extra data, if passed.
"""
cell_sums = measure.gene_set_expression(
data, genes,
data, genes=genes,
starts_with=starts_with, ends_with=ends_with,
exact_word=exact_word, regex=regex,
library_size_normalize=library_size_normalize)
Expand All @@ -315,6 +315,8 @@ def _find_unique_cells(data):
"""
if isinstance(data, pd.SparseDataFrame):
unique_idx = _find_unique_cells(data.to_coo())
elif utils.is_sparse_dataframe(data):
unique_idx = _find_unique_cells(data.sparse.to_coo())
elif isinstance(data, pd.DataFrame):
unique_idx = ~data.duplicated()
elif isinstance(data, np.ndarray):
Expand Down
13 changes: 7 additions & 6 deletions scprep/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
import pandas as pd

from .utils import _matrix_to_data_frame
from .. import utils


def _read_csv_sparse(filename, chunksize=1000000, fill_value=0.0, **kwargs):
"""Read a csv file into a pandas.SparseDataFrame
"""Read a csv file into a pd.DataFrame[pd.SparseArray]
"""
chunks = pd.read_csv(filename, chunksize=chunksize, **kwargs)
data = pd.concat(chunk.to_sparse(fill_value=fill_value)
data = pd.concat(utils.dataframe_to_sparse(chunk, fill_value=fill_value)
for chunk in chunks)
return data

Expand All @@ -36,15 +37,15 @@ def load_csv(filename, cell_axis='row', delimiter=',',
If `True`, we assume cell names are in the first row/column. Otherwise
expects a filename or an array containing a list of cell barcodes.
sparse : bool, optional (default: False)
If True, loads the data as a pd.SparseDataFrame. This uses less memory
If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory
but more CPU.
**kwargs : optional arguments for `pd.read_csv`.
Returns
-------
data : array-like, shape=[n_samples, n_features]
If either gene or cell names are given, data will be a pd.DataFrame or
pd.SparseDataFrame. If no names are given, data will be a np.ndarray
pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray
or scipy.sparse.spmatrix
"""
if cell_axis not in ['row', 'column', 'col']:
Expand Down Expand Up @@ -113,15 +114,15 @@ def load_tsv(filename, cell_axis='row', delimiter='\t',
If `True`, we assume cell names are in the first row/column. Otherwise
expects a filename or an array containing a list of cell barcodes.
sparse : bool, optional (default: False)
If True, loads the data as a pd.SparseDataFrame. This uses less memory
If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory
but more CPU.
**kwargs : optional arguments for `pd.read_csv`.
Returns
-------
data : array-like, shape=[n_samples, n_features]
If either gene or cell names are given, data will be a pd.DataFrame or
pd.SparseDataFrame. If no names are given, data will be a np.ndarray
pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray
or scipy.sparse.spmatrix
"""
return load_csv(filename, cell_axis=cell_axis, delimiter=delimiter,
Expand Down
4 changes: 2 additions & 2 deletions scprep/io/fcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def load_fcs(filename, gene_names=True, cell_names=True,
If `True`, we assume cell names are contained in the file. Otherwise
expects a filename or an array containing a list of cell barcodes.
sparse : bool, optional (default: None)
If True, loads the data as a pd.SparseDataFrame. This uses less memory
If True, loads the data as a pd.DataFrame[SparseArray]. This uses less memory
but more CPU.
metadata_channels : list-like, optional, shape=[n_meta] (default: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
Channels to be excluded from the data
Expand Down Expand Up @@ -273,7 +273,7 @@ def load_fcs(filename, gene_names=True, cell_names=True,
Values from metadata channels
data : array-like, shape=[n_samples, n_features]
If either gene or cell names are given, data will be a pd.DataFrame or
pd.SparseDataFrame. If no names are given, data will be a np.ndarray
pd.DataFrame[SparseArray]. If no names are given, data will be a np.ndarray
or scipy.sparse.spmatrix
"""
if cell_names is True:
Expand Down
4 changes: 2 additions & 2 deletions scprep/io/mtx.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ def load_mtx(mtx_file, cell_axis='row',
cell_names : `str`, array-like, or `None` (default: None)
Expects a filename or an array containing a list of cell barcodes.
sparse : bool, optional (default: None)
If True, loads the data as a pd.SparseDataFrame. This uses less memory
If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory
but more CPU.
Returns
-------
data : array-like, shape=[n_samples, n_features]
If either gene or cell names are given, data will be a pd.DataFrame or
pd.SparseDataFrame. If no names are given, data will be a np.ndarray
pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray
or scipy.sparse.spmatrix
"""
if cell_axis not in ['row', 'column', 'col']:
Expand Down
6 changes: 3 additions & 3 deletions scprep/io/tenx.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def load_10X(data_dir, sparse=True, gene_labels='symbol',
Returns
-------
data: array-like, shape=[n_samples, n_features]
If sparse, data will be a pd.SparseDataFrame. Otherwise, data will
If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
be a pd.DataFrame.
"""

Expand Down Expand Up @@ -168,7 +168,7 @@ def load_10X_zip(filename, sparse=True, gene_labels='symbol',
Returns
-------
data: array-like, shape=[n_samples, n_features]
If sparse, data will be a pd.SparseDataFrame. Otherwise, data will
If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
be a pd.DataFrame.
"""

Expand Down Expand Up @@ -247,7 +247,7 @@ def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol',
Returns
-------
data: array-like, shape=[n_samples, n_features]
If sparse, data will be a pd.SparseDataFrame. Otherwise, data will
If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
be a pd.DataFrame.
"""

Expand Down
23 changes: 14 additions & 9 deletions scprep/io/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import warnings
import numpy as np

from .. import utils


def _parse_header(header, n_expected, header_type="gene_names"):
"""
Expand Down Expand Up @@ -93,37 +95,40 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None):
# dataframe with index and/or columns
if sparse is None:
# let the input data decide
sparse = isinstance(data, pd.SparseDataFrame) or sp.issparse(data)
sparse = utils.is_sparse_dataframe(data) or sp.issparse(data)
if sparse and gene_names is not None and \
len(np.unique(gene_names)) < len(gene_names):
warnings.warn(
"Duplicate gene names detected! Forcing dense matrix",
RuntimeWarning)
sparse = False
if sparse:
# return pandas.SparseDataFrame
# return pandas.DataFrame[SparseArray]
if isinstance(data, pd.DataFrame):
if gene_names is not None:
data.columns = gene_names
if cell_names is not None:
data.index = cell_names
if not isinstance(data, pd.SparseDataFrame):
data = data.to_sparse(fill_value=0.0)
if not utils.is_sparse_dataframe(data):
data = utils.dataframe_to_sparse(data, fill_value=0.0)
elif sp.issparse(data):
data = pd.DataFrame.sparse.from_spmatrix(data, index=cell_names, columns=gene_names)
else:
data = pd.SparseDataFrame(data, default_fill_value=0.0)
data.index = cell_names
data.columns = gene_names
data = pd.DataFrame(data, index=cell_names, columns=gene_names)
data = utils.dataframe_to_sparse(data, fill_value=0.0)
else:
# return pandas.DataFrame
if isinstance(data, pd.DataFrame):
if gene_names is not None:
data.columns = gene_names
if cell_names is not None:
data.index = cell_names
if isinstance(data, pd.SparseDataFrame):
data = data.to_dense()
if utils.is_sparse_dataframe(data):
data = data.sparse.to_dense()
else:
if sp.issparse(data):
data = data.toarray()
data = pd.DataFrame(data, index=cell_names, columns=gene_names)
# convert data to float
data = data.astype(float)
return data
18 changes: 10 additions & 8 deletions scprep/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,15 @@ def library_size_normalize(data, rescale='median',
"""
# pandas support
columns, index = None, None
if isinstance(data, pd.SparseDataFrame) or \
pd.api.types.is_sparse(data):
if isinstance(data, pd.DataFrame):
columns, index = data.columns, data.index
data = data.to_coo()
elif isinstance(data, pd.DataFrame):
columns, index = data.columns, data.index
data = data.values
if utils.is_sparse_dataframe(data):
data = data.sparse.to_coo()
elif isinstance(data, pd.SparseDataFrame):
data = data.to_coo()
else:
# dense data
data = data.to_numpy()

calc_libsize = sparse.issparse(data) and (return_library_size or
data.nnz > 2**31)
Expand All @@ -91,7 +93,7 @@ def library_size_normalize(data, rescale='median',
if columns is not None:
# pandas dataframe
if sparse.issparse(data_norm):
data_norm = pd.SparseDataFrame(data_norm, default_fill_value=0.0)
data_norm = utils.SparseDataFrame(data_norm, default_fill_value=0.0)
else:
data_norm = pd.DataFrame(data_norm)
data_norm.columns = columns
Expand Down Expand Up @@ -120,7 +122,7 @@ def batch_mean_center(data, sample_idx=None):
data : array-like, shape=[n_samples, n_features]
Batch mean-centered output data.
"""
if sparse.issparse(data) or isinstance(data, pd.SparseDataFrame):
if sparse.issparse(data) or isinstance(data, pd.SparseDataFrame) or utils.is_sparse_dataframe(data):
raise ValueError("Cannot mean center sparse data. "
"Convert to dense matrix first.")
if sample_idx is None:
Expand Down
4 changes: 3 additions & 1 deletion scprep/reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,8 +279,10 @@ def pca(data, n_components=100, eps=0.3,
# handle dataframes
if isinstance(data, pd.SparseDataFrame):
data = data.to_coo()
elif utils.is_sparse_dataframe(data):
data = data.sparse.to_coo()
elif isinstance(data, pd.DataFrame):
data = data.values
data = data.to_numpy()

# handle sparsity
if sparse.issparse(data):
Expand Down
55 changes: 45 additions & 10 deletions scprep/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import warnings
import re
import sys

from . import utils

if int(sys.version.split(".")[1]) < 7:
Expand Down Expand Up @@ -218,7 +219,7 @@ def get_gene_set(data, starts_with=None, ends_with=None,
"""
if not _is_1d(data):
try:
data = data.columns.values
data = data.columns.to_numpy()
except AttributeError:
raise TypeError("data must be a list of gene names or a pandas "
"DataFrame. Got {}".format(type(data).__name__))
Expand Down Expand Up @@ -255,7 +256,7 @@ def get_cell_set(data, starts_with=None, ends_with=None,
"""
if not _is_1d(data):
try:
data = data.index.values
data = data.index.to_numpy()
except AttributeError:
raise TypeError("data must be a list of cell names or a pandas "
"DataFrame. Got {}".format(type(data).__name__))
Expand Down Expand Up @@ -329,21 +330,37 @@ def select_cols(data, *extra_data, idx=None,
_check_idx_1d(idx)
idx = idx.flatten()

if isinstance(data, pd.SparseDataFrame):
# evil deprecated dataframe; get rid of it
data = utils.SparseDataFrame(data)
if isinstance(data, pd.DataFrame):
try:
data = data.loc[:, idx]
if isinstance(idx, (numbers.Integral, str)):
data = data.loc[:, idx]
else:
if np.issubdtype(idx.dtype, np.dtype(bool).type):
# temporary workaround for pandas error
raise TypeError
data = data.loc[:, idx]
except (KeyError, TypeError):
if isinstance(idx, str):
raise
if isinstance(idx, numbers.Integral) or \
issubclass(np.array(idx).dtype.type, numbers.Integral):
np.issubdtype(idx.dtype, np.dtype(int)) or \
np.issubdtype(idx.dtype, np.dtype(bool)):
data = data.loc[:, np.array(data.columns)[idx]]
else:
raise
elif isinstance(data, pd.Series):
try:
if np.issubdtype(idx.dtype, np.dtype(bool).type):
# temporary workaround for pandas error
raise TypeError
data = data.loc[idx]
except (KeyError, TypeError):
if isinstance(idx, numbers.Integral) or \
issubclass(np.array(idx).dtype.type, numbers.Integral):
np.issubdtype(idx.dtype, np.dtype(int)) or \
np.issubdtype(idx.dtype, np.dtype(bool)):
data = data.loc[np.array(data.index)[idx]]
else:
raise
Expand Down Expand Up @@ -432,18 +449,36 @@ def select_rows(data, *extra_data, idx=None,
_check_idx_1d(idx)
idx = idx.flatten()

if isinstance(data, pd.SparseDataFrame):
# evil deprecated dataframe; get rid of it
data = utils.SparseDataFrame(data)
if isinstance(data, (pd.DataFrame, pd.Series)):
try:
with warnings.catch_warnings():
warnings.filterwarnings(
"error", "Passing list-likes to .loc")
if isinstance(idx, (numbers.Integral, str)):
data = data.loc[idx]
else:
if np.issubdtype(idx.dtype, np.dtype(bool).type):
# temporary workaround for pandas error
raise TypeError
with warnings.catch_warnings():
warnings.filterwarnings(
"error", "Passing list-likes to .loc")
data = data.loc[idx]
except (KeyError, TypeError, FutureWarning):
if isinstance(idx, str):
raise
if isinstance(idx, numbers.Integral) or \
issubclass(np.array(idx).dtype.type, numbers.Integral):
data = data.iloc[idx]
np.issubdtype(idx.dtype, np.dtype(int)) or \
np.issubdtype(idx.dtype, np.dtype(bool)):
data = data.loc[np.array(data.index)[idx]]
else:
raise
# temporary workaround for https://github.com/pandas-dev/pandas/issues/27781
if utils.is_sparse_dataframe(data):
for col in np.where(data.isna().any())[0]:
colname = data.columns[col]
if utils.is_sparse_series(data[colname]) and data[colname].isna().all():
data[colname] = data[colname].fillna(data[colname].sparse.fill_value)
elif _is_1d(data):
if isinstance(data, list):
# can't numpy index a list
Expand Down

0 comments on commit 84d7ef4

Please sign in to comment.