From 40099a75a43ffc033f45c5947d3bad6b480f293e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 29 Oct 2019 12:17:11 -0400 Subject: [PATCH 01/10] fix 10x docs --- scprep/io/tenx.py | 13 +++++++------ test/test_io.py | 6 +++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/scprep/io/tenx.py b/scprep/io/tenx.py index aaa8e8e8..6e6f949f 100644 --- a/scprep/io/tenx.py +++ b/scprep/io/tenx.py @@ -81,8 +81,8 @@ def load_10X(data_dir, sparse=True, gene_labels='symbol', ---------- data_dir: string path to input data directory - expects 'matrix.mtx', 'genes.tsv', 'barcodes.tsv' to be present and - will raise an error otherwise + expects 'matrix.mtx(.gz)', '[genes/features].tsv(.gz)', 'barcodes.tsv(.gz)' + to be present and will raise an error otherwise sparse: boolean If True, a sparse Pandas DataFrame is returned. gene_labels: string, {'id', 'symbol', 'both'} optional, default: 'symbol' @@ -127,7 +127,7 @@ def load_10X(data_dir, sparse=True, gene_labels='symbol', except (FileNotFoundError, IOError): raise FileNotFoundError( - "'matrix.mtx', 'genes.tsv', and 'barcodes.tsv' must be present " + "'matrix.mtx(.gz)', '[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)' must be present " "in {}".format(data_dir)) cell_names = barcodes[0] @@ -205,12 +205,13 @@ def load_10X_zip(filename, sparse=True, gene_labels='symbol', dirname = files[0].strip("/") subdir_files = [f.split("/")[-1] for f in files] valid = (("barcodes.tsv" in subdir_files or "barcodes.tsv.gz" in subdir_files) and - ("genes.tsv" in subdir_files or "genes.tsv.gz" in subdir_files) and + (("genes.tsv" in subdir_files or "genes.tsv.gz" in subdir_files) or + ("features.tsv" in subdir_files or "features.tsv.gz" in subdir_files)) and ("matrix.mtx" in subdir_files or "matrix.mtx.gz" in subdir_files)) if not valid: raise ValueError( - "Expected a single zipped folder containing 'matrix.mtx', " - "'genes.tsv', and 'barcodes.tsv'. Got {}".format(files)) + "Expected a single zipped folder containing 'matrix.mtx(.gz)', " + "'[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)'. Got {}".format(files)) handle.extractall(path=tmpdir) data = load_10X(os.path.join(tmpdir, dirname)) shutil.rmtree(tmpdir) diff --git a/test/test_io.py b/test/test_io.py index a3218a79..7c34edf8 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -163,7 +163,7 @@ def test_10X(): os.path.join(data.data_dir, "test_10X.zip")) assert_raise_message( FileNotFoundError, - "'matrix.mtx', 'genes.tsv', and 'barcodes.tsv' must be present " + "'matrix.mtx(.gz)', '[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)' must be present " "in {}".format(data.data_dir), scprep.io.load_10X, data.data_dir) @@ -187,8 +187,8 @@ def test_10X_zip(): gene_labels='invalid') assert_raise_message( ValueError, - "Expected a single zipped folder containing 'matrix.mtx', " - "'genes.tsv', and 'barcodes.tsv'. Got ", + "Expected a single zipped folder containing 'matrix.mtx(.gz)', " + "'[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)'. 
Got ", scprep.io.load_10X_zip, os.path.join(data.data_dir, "test_10X_invalid.zip") ) From 882eeb9da7d0ff266512f0021306d03bc2946fe0 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 29 Oct 2019 12:29:38 -0400 Subject: [PATCH 02/10] format with black --- .travis.yml | 1 + doc/source/conf.py | 81 ++- scprep/__init__.py | 8 +- scprep/_lazyload.py | 44 +- scprep/filter.py | 185 +++-- scprep/io/csv.py | 71 +- scprep/io/download.py | 18 +- scprep/io/fcs.py | 184 ++--- scprep/io/hdf5.py | 29 +- scprep/io/mtx.py | 20 +- scprep/io/tenx.py | 187 ++--- scprep/io/utils.py | 45 +- scprep/measure.py | 48 +- scprep/normalize.py | 46 +- scprep/plot/colors.py | 33 +- scprep/plot/histogram.py | 206 ++++-- scprep/plot/jitter.py | 159 +++-- scprep/plot/marker.py | 168 +++-- scprep/plot/scatter.py | 616 ++++++++++------- scprep/plot/scree.py | 22 +- scprep/plot/tools.py | 177 +++-- scprep/plot/utils.py | 54 +- scprep/plot/variable_genes.py | 54 +- scprep/reduce.py | 127 ++-- scprep/run/r_function.py | 46 +- scprep/run/slingshot.py | 110 +-- scprep/run/splatter.py | 109 ++- scprep/sanitize.py | 8 +- scprep/select.py | 271 +++++--- scprep/stats.py | 274 +++++--- scprep/transform.py | 54 +- scprep/utils.py | 237 ++++--- setup.py | 118 ++-- test/_test_lazyload.py | 16 +- test/test_filter.py | 339 +++++---- test/test_hdf5.py | 51 +- test/test_io.py | 319 +++++---- test/test_lazyload.py | 19 +- test/test_measure.py | 72 +- test/test_normalize.py | 109 ++- test/test_patch.py | 8 +- test/test_plot.py | 1219 +++++++++++++++++++-------------- test/test_reduce.py | 167 +++-- test/test_run.py | 410 ++++++----- test/test_sanitize.py | 30 +- test/test_select.py | 589 ++++++++++------ test/test_stats.py | 383 +++++++---- test/test_transform.py | 102 ++- test/test_utils.py | 517 +++++++++----- test/tools/__init__.py | 1 + test/tools/data.py | 9 +- test/tools/matrix.py | 82 +-- test/tools/utils.py | 18 +- 53 files changed, 5094 insertions(+), 3176 deletions(-) diff --git a/.travis.yml b/.travis.yml index 96ed5e04..bdf123fa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,6 +42,7 @@ script: - pip install -U .[test] + - black . --check --diff - python setup.py test - pip install -U .[doc] - cd doc; make html diff --git a/doc/source/conf.py b/doc/source/conf.py index 4dc748f4..e1143950 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -22,14 +22,13 @@ import glob import shutil -root_dir = os.path.abspath(os.path.join(os.path.dirname( - __file__), '..', '..')) +root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) sys.path.insert(0, root_dir) # print(sys.path) # Copy ipython notebooks dest_dir = "examples" -for file in glob.glob(os.path.join(root_dir, 'examples', '*.ipynb')): +for file in glob.glob(os.path.join(root_dir, "examples", "*.ipynb")): print("Copy {} to {}".format(file, dest_dir)) shutil.copy(file, dest_dir) @@ -42,44 +41,45 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
-extensions = ['sphinx.ext.napoleon', - 'sphinx.ext.doctest', - 'sphinx.ext.coverage', - 'sphinx.ext.viewcode', - 'nbsphinx', - 'sphinx.ext.mathjax', - 'autodocsumm', - 'IPython.sphinxext.ipython_console_highlighting'] +extensions = [ + "sphinx.ext.napoleon", + "sphinx.ext.doctest", + "sphinx.ext.coverage", + "sphinx.ext.viewcode", + "nbsphinx", + "sphinx.ext.mathjax", + "autodocsumm", + "IPython.sphinxext.ipython_console_highlighting", +] autodoc_mock_imports = ["h5py", "tables", "rpy2", "fcsparser"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['ytemplates'] +templates_path = ["ytemplates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = ['.rst', '.ipynb'] +source_suffix = [".rst", ".ipynb"] # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'scprep' -copyright = '2018 Krishnaswamy Lab, Yale University' -author = 'Scott Gigante, Jay Stanley, Daniel Burkhardt' +project = "scprep" +copyright = "2018 Krishnaswamy Lab, Yale University" +author = "Scott Gigante, Jay Stanley, Daniel Burkhardt" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # -version_py = os.path.join(root_dir, 'scprep', 'version.py') +version_py = os.path.join(root_dir, "scprep", "version.py") # The full version, including alpha/beta/rc tags. -release = open(version_py).read().strip().split( - '=')[-1].replace('"', '').strip() +release = open(version_py).read().strip().split("=")[-1].replace('"', "").strip() # The short X.Y version. -version = release.split('-')[0] +version = release.split("-")[0] # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -91,10 +91,10 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', '**.ipynb_checkpoints'] +exclude_patterns = ["_build", "**.ipynb_checkpoints"] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -105,7 +105,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'default' +html_theme = "default" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -116,13 +116,13 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['ystatic'] +html_static_path = ["ystatic"] # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. -htmlhelp_basename = 'scprepdoc' +htmlhelp_basename = "scprepdoc" # -- Options for LaTeX output --------------------------------------------- @@ -131,15 +131,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). 
# # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -149,8 +146,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'scprep.tex', 'scprep Documentation', - 'Scott Gigante, Jay Stanley, Daniel Burkhardt', 'manual'), + ( + master_doc, + "scprep.tex", + "scprep Documentation", + "Scott Gigante, Jay Stanley, Daniel Burkhardt", + "manual", + ), ] @@ -158,10 +160,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'scprep', 'scprep Documentation', - [author], 1) -] +man_pages = [(master_doc, "scprep", "scprep Documentation", [author], 1)] # -- Options for Texinfo output ------------------------------------------- @@ -170,9 +169,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'scprep', 'scprep Documentation', - author, 'scprep', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "scprep", + "scprep Documentation", + author, + "scprep", + "One line description of project.", + "Miscellaneous", + ), ] nbsphinx_execute = "always" diff --git a/scprep/__init__.py b/scprep/__init__.py index a7a8ffdd..52e5c53d 100644 --- a/scprep/__init__.py +++ b/scprep/__init__.py @@ -16,12 +16,16 @@ import scprep.run import pandas as pd -if int(pd.__version__.split('.')[1]) < 26: + +if int(pd.__version__.split(".")[1]) < 26: + def fill_value(self): # Used in reindex_indexer try: return self.values.dtype.fill_value except AttributeError: return self.values.dtype.na_value + from pandas.core.internals.blocks import ExtensionBlock - setattr(ExtensionBlock, 'fill_value', property(fill_value)) + + setattr(ExtensionBlock, "fill_value", property(fill_value)) diff --git a/scprep/_lazyload.py b/scprep/_lazyload.py index 876106ec..dcdfd833 100644 --- a/scprep/_lazyload.py +++ b/scprep/_lazyload.py @@ -6,39 +6,49 @@ # each module loads submodules on initialization but is only imported # and loads methods/classes when these are accessed _importspec = { - 'matplotlib': ['colors', 'pyplot', 'animation', 'cm', - 'axes', 'lines', 'ticker', 'transforms'], - 'mpl_toolkits': ['mplot3d'], - 'fcsparser': ['api'], - 'rpy2': [{'robjects': ['numpy2ri', 'packages', 'vectors', 'conversion']}, - 'rinterface', - {'rinterface_lib': ['callbacks']}], - 'h5py': [], - 'tables': [], - 'requests': [], + "matplotlib": [ + "colors", + "pyplot", + "animation", + "cm", + "axes", + "lines", + "ticker", + "transforms", + ], + "mpl_toolkits": ["mplot3d"], + "fcsparser": ["api"], + "rpy2": [ + {"robjects": ["numpy2ri", "packages", "vectors", "conversion"]}, + "rinterface", + {"rinterface_lib": ["callbacks"]}, + ], + "h5py": [], + "tables": [], + "requests": [], } class AliasModule(object): - def __init__(self, name, members=None): # easy access to AliasModule members to avoid recursionerror super_setattr = super().__setattr__ if members is None: members = [] - builtin_members = ['__class__', '__doc__'] - super_setattr('__module_name__', name) + builtin_members = ["__class__", "__doc__"] + super_setattr("__module_name__", name) # create submodules submodules = [] for member in members: if isinstance(member, dict): for submodule, submembers in member.items(): - super_setattr(submodule, AliasModule( - "{}.{}".format(name, submodule), submembers)) + super_setattr( + submodule, 
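                        # each nested entry becomes its own AliasModule so
                        # that e.g. rpy2.robjects.packages resolves lazily,
                        # without importing rpy2 until an attribute of it
                        # is first accessed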
+ AliasModule("{}.{}".format(name, submodule), submembers), + ) submodules.append(submodule) else: - super_setattr(member, AliasModule( - "{}.{}".format(name, member))) + super_setattr(member, AliasModule("{}.{}".format(name, member))) submodules.append(member) super_setattr("__submodules__", submodules) super_setattr("__builtin_members__", builtin_members) diff --git a/scprep/filter.py b/scprep/filter.py index 0723648c..9a9de2c4 100644 --- a/scprep/filter.py +++ b/scprep/filter.py @@ -12,31 +12,38 @@ def remove_empty_genes(data, *extra_data): - warnings.warn("`scprep.filter.remove_empty_genes` is deprecated. " - "Use `scprep.filter.filter_empty_genes` instead.", - DeprecationWarning) + warnings.warn( + "`scprep.filter.remove_empty_genes` is deprecated. " + "Use `scprep.filter.filter_empty_genes` instead.", + DeprecationWarning, + ) return filter_empty_genes(data, *extra_data) def remove_rare_genes(data, *extra_data, cutoff=0, min_cells=5): - warnings.warn("`scprep.filter.remove_rare_genes` is deprecated. " - "Use `scprep.filter.filter_rare_genes` instead.", - DeprecationWarning) - return filter_rare_genes(data, *extra_data, - cutoff=cutoff, min_cells=min_cells) + warnings.warn( + "`scprep.filter.remove_rare_genes` is deprecated. " + "Use `scprep.filter.filter_rare_genes` instead.", + DeprecationWarning, + ) + return filter_rare_genes(data, *extra_data, cutoff=cutoff, min_cells=min_cells) def remove_empty_cells(data, *extra_data, sample_labels=None): - warnings.warn("`scprep.filter.remove_empty_cells` is deprecated. " - "Use `scprep.filter.filter_empty_cells` instead.", - DeprecationWarning) + warnings.warn( + "`scprep.filter.remove_empty_cells` is deprecated. " + "Use `scprep.filter.filter_empty_cells` instead.", + DeprecationWarning, + ) return filter_empty_cells(data, *extra_data, sample_labels=sample_labels) def remove_duplicates(data, *extra_data, sample_labels=None): - warnings.warn("`scprep.filter.remove_duplicates` is deprecated. " - "Use `scprep.filter.filter_duplicates` instead.", - DeprecationWarning) + warnings.warn( + "`scprep.filter.remove_duplicates` is deprecated. " + "Use `scprep.filter.filter_duplicates` instead.", + DeprecationWarning, + ) return filter_duplicates(data, *extra_data, sample_labels=sample_labels) @@ -112,9 +119,11 @@ def filter_empty_cells(data, *extra_data, sample_labels=None): Filtered extra data, if passed. """ if sample_labels is not None: - warnings.warn("`sample_labels` is deprecated. " - "Passing `sample_labels` as `extra_data`.", - DeprecationWarning) + warnings.warn( + "`sample_labels` is deprecated. " + "Passing `sample_labels` as `extra_data`.", + DeprecationWarning, + ) extra_data = list(extra_data) + [sample_labels] cell_sums = measure.library_size(data) keep_cells_idx = cell_sums > 0 @@ -122,12 +131,17 @@ def filter_empty_cells(data, *extra_data, sample_labels=None): return data -def filter_values(data, *extra_data, values=None, - cutoff=None, percentile=None, - keep_cells='above', - return_values=False, - sample_labels=None, - filter_per_sample=None): +def filter_values( + data, + *extra_data, + values=None, + cutoff=None, + percentile=None, + keep_cells="above", + return_values=False, + sample_labels=None, + filter_per_sample=None +): """Remove all cells with `values` above or below a certain threshold It is recommended to use :func:`~scprep.plot.histogram` to @@ -168,29 +182,35 @@ def filter_values(data, *extra_data, values=None, Filtered extra data, if passed. 
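
    Examples
    --------
    A minimal sketch of typical usage; the counts matrix and the
    per-cell values below are hypothetical stand-ins:

    >>> import numpy as np
    >>> import scprep
    >>> data = np.random.poisson(1, (100, 50))  # hypothetical counts
    >>> values = data[:, :5].sum(axis=1)  # hypothetical per-cell values
    >>> # keep only cells below the 90th percentile of `values`
    >>> data = scprep.filter.filter_values(
    ...     data, values=values, percentile=90, keep_cells="below")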
""" if sample_labels is not None: - warnings.warn("`sample_labels` is deprecated. " - "Passing `sample_labels` as `extra_data`.", - DeprecationWarning) + warnings.warn( + "`sample_labels` is deprecated. " + "Passing `sample_labels` as `extra_data`.", + DeprecationWarning, + ) extra_data = list(extra_data) + [sample_labels] if filter_per_sample is not None: - warnings.warn("`filter_per_sample` is deprecated. " - "Filtering as a single sample.", - DeprecationWarning) + warnings.warn( + "`filter_per_sample` is deprecated. " "Filtering as a single sample.", + DeprecationWarning, + ) assert values is not None - keep_cells_idx = utils._get_filter_idx(values, - cutoff, percentile, - keep_cells) + keep_cells_idx = utils._get_filter_idx(values, cutoff, percentile, keep_cells) if return_values: extra_data = [values] + list(extra_data) data = select.select_rows(data, *extra_data, idx=keep_cells_idx) return data -def filter_library_size(data, *extra_data, cutoff=None, percentile=None, - keep_cells=None, - return_library_size=False, - sample_labels=None, - filter_per_sample=None): +def filter_library_size( + data, + *extra_data, + cutoff=None, + percentile=None, + keep_cells=None, + return_library_size=False, + sample_labels=None, + filter_per_sample=None +): """Remove all cells with library size above or below a certain threshold It is recommended to use :func:`~scprep.plot.plot_library_size` to @@ -229,23 +249,35 @@ def filter_library_size(data, *extra_data, cutoff=None, percentile=None, Filtered extra data, if passed. """ cell_sums = measure.library_size(data) - return filter_values(data, *extra_data, values=cell_sums, - cutoff=cutoff, percentile=percentile, - keep_cells=keep_cells, - return_values=return_library_size, - sample_labels=sample_labels, - filter_per_sample=filter_per_sample) - - -def filter_gene_set_expression(data, *extra_data, genes=None, - starts_with=None, ends_with=None, - exact_word=None, regex=None, - cutoff=None, percentile=None, - library_size_normalize=False, - keep_cells=None, - return_expression=False, - sample_labels=None, - filter_per_sample=None): + return filter_values( + data, + *extra_data, + values=cell_sums, + cutoff=cutoff, + percentile=percentile, + keep_cells=keep_cells, + return_values=return_library_size, + sample_labels=sample_labels, + filter_per_sample=filter_per_sample + ) + + +def filter_gene_set_expression( + data, + *extra_data, + genes=None, + starts_with=None, + ends_with=None, + exact_word=None, + regex=None, + cutoff=None, + percentile=None, + library_size_normalize=False, + keep_cells=None, + return_expression=False, + sample_labels=None, + filter_per_sample=None +): """Remove cells with total expression of a gene set above or below a certain threshold It is recommended to use :func:`~scprep.plot.plot_gene_set_expression` to @@ -295,20 +327,28 @@ def filter_gene_set_expression(data, *extra_data, genes=None, Filtered extra data, if passed. 
""" if keep_cells is None: - if isinstance(cutoff, numbers.Number) or \ - isinstance(percentile, numbers.Number): - keep_cells = 'below' + if isinstance(cutoff, numbers.Number) or isinstance(percentile, numbers.Number): + keep_cells = "below" cell_sums = measure.gene_set_expression( - data, genes=genes, - starts_with=starts_with, ends_with=ends_with, - exact_word=exact_word, regex=regex, - library_size_normalize=library_size_normalize) - return filter_values(data, *extra_data, values=cell_sums, - cutoff=cutoff, percentile=percentile, - keep_cells=keep_cells, - return_values=return_expression, - sample_labels=sample_labels, - filter_per_sample=filter_per_sample) + data, + genes=genes, + starts_with=starts_with, + ends_with=ends_with, + exact_word=exact_word, + regex=regex, + library_size_normalize=library_size_normalize, + ) + return filter_values( + data, + *extra_data, + values=cell_sums, + cutoff=cutoff, + percentile=percentile, + keep_cells=keep_cells, + return_values=return_expression, + sample_labels=sample_labels, + filter_per_sample=filter_per_sample + ) def _find_unique_cells(data): @@ -336,8 +376,7 @@ def _find_unique_cells(data): elif sparse.issparse(data): _, unique_data = np.unique(data.tolil().data, return_index=True) _, unique_index = np.unique(data.tolil().rows, return_index=True) - unique_idx = np.sort( - list(set(unique_index).union(set(unique_data)))) + unique_idx = np.sort(list(set(unique_index).union(set(unique_data)))) return unique_idx @@ -360,9 +399,11 @@ def filter_duplicates(data, *extra_data, sample_labels=None): Filtered extra data, if passed. """ if sample_labels is not None: - warnings.warn("`sample_labels` is deprecated. " - "Passing `sample_labels` as `extra_data`.", - DeprecationWarning) + warnings.warn( + "`sample_labels` is deprecated. " + "Passing `sample_labels` as `extra_data`.", + DeprecationWarning, + ) extra_data = list(extra_data) + [sample_labels] unique_idx = _find_unique_cells(data) data = select.select_rows(data, *extra_data, idx=unique_idx) diff --git a/scprep/io/csv.py b/scprep/io/csv.py index bf17bc55..940ac6b0 100644 --- a/scprep/io/csv.py +++ b/scprep/io/csv.py @@ -11,14 +11,21 @@ def _read_csv_sparse(filename, chunksize=1000000, fill_value=0.0, **kwargs): """Read a csv file into a pd.DataFrame[pd.SparseArray] """ chunks = pd.read_csv(filename, chunksize=chunksize, **kwargs) - data = pd.concat(utils.dataframe_to_sparse(chunk, fill_value=fill_value) - for chunk in chunks) + data = pd.concat( + utils.dataframe_to_sparse(chunk, fill_value=fill_value) for chunk in chunks + ) return data -def load_csv(filename, cell_axis='row', delimiter=',', - gene_names=True, cell_names=True, - sparse=False, **kwargs): +def load_csv( + filename, + cell_axis="row", + delimiter=",", + gene_names=True, + cell_names=True, + sparse=False, + **kwargs +): """Load a csv file Parameters @@ -48,26 +55,26 @@ def load_csv(filename, cell_axis='row', delimiter=',', pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray or scipy.sparse.spmatrix """ - if cell_axis not in ['row', 'column', 'col']: + if cell_axis not in ["row", "column", "col"]: raise ValueError( - "cell_axis {} not recognized. Expected 'row' or 'column'".format( - cell_axis)) + "cell_axis {} not recognized. 
Expected 'row' or 'column'".format(cell_axis) + ) - if 'index_col' in kwargs: + if "index_col" in kwargs: # override - index_col = kwargs['index_col'] + index_col = kwargs["index_col"] cell_names = None - del kwargs['index_col'] + del kwargs["index_col"] elif cell_names is True: index_col = 0 cell_names = None else: index_col = None - if 'header' in kwargs: + if "header" in kwargs: # override - header = kwargs['header'] - del kwargs['header'] + header = kwargs["header"] + del kwargs["header"] gene_names = None elif gene_names is True: header = 0 @@ -80,22 +87,28 @@ def load_csv(filename, cell_axis='row', delimiter=',', read_fun = _read_csv_sparse else: read_fun = pd.read_csv - data = read_fun(filename, delimiter=delimiter, - header=header, index_col=index_col, - **kwargs) + data = read_fun( + filename, delimiter=delimiter, header=header, index_col=index_col, **kwargs + ) - if cell_axis in ['column', 'col']: + if cell_axis in ["column", "col"]: data = data.T data = _matrix_to_data_frame( - data, gene_names=gene_names, - cell_names=cell_names, sparse=sparse) + data, gene_names=gene_names, cell_names=cell_names, sparse=sparse + ) return data -def load_tsv(filename, cell_axis='row', delimiter='\t', - gene_names=True, cell_names=True, - sparse=False, **kwargs): +def load_tsv( + filename, + cell_axis="row", + delimiter="\t", + gene_names=True, + cell_names=True, + sparse=False, + **kwargs +): """Load a tsv file Parameters @@ -125,6 +138,12 @@ def load_tsv(filename, cell_axis='row', delimiter='\t', pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray or scipy.sparse.spmatrix """ - return load_csv(filename, cell_axis=cell_axis, delimiter=delimiter, - gene_names=gene_names, cell_names=cell_names, - sparse=sparse, **kwargs) + return load_csv( + filename, + cell_axis=cell_axis, + delimiter=delimiter, + gene_names=gene_names, + cell_names=cell_names, + sparse=sparse, + **kwargs + ) diff --git a/scprep/io/download.py b/scprep/io/download.py index a43ae714..14c9a5c4 100644 --- a/scprep/io/download.py +++ b/scprep/io/download.py @@ -8,23 +8,23 @@ _CHUNK_SIZE = 32768 _GOOGLE_DRIVE_URL = "https://docs.google.com/uc?export=download" -_FAKE_HEADERS = [('User-Agent', 'Mozilla/5.0')] +_FAKE_HEADERS = [("User-Agent", "Mozilla/5.0")] def _save_response_content(response, destination): global _CHUNK_SIZE if isinstance(destination, str): - with open(destination, 'wb') as handle: + with open(destination, "wb") as handle: _save_response_content(response, handle) else: for chunk in response.iter_content(_CHUNK_SIZE): - if chunk: # filter out keep-alive new chunks + if chunk: # filter out keep-alive new chunks destination.write(chunk) def _google_drive_confirm_token(response): for key, value in response.cookies.items(): - if key.startswith('download_warning'): + if key.startswith("download_warning"): return value return None @@ -35,12 +35,12 @@ def _GET_google_drive(id): global _GOOGLE_DRIVE_URL with requests.Session() as session: - response = session.get(_GOOGLE_DRIVE_URL, params = { 'id' : id }, stream = True) + response = session.get(_GOOGLE_DRIVE_URL, params={"id": id}, stream=True) token = _google_drive_confirm_token(response) if token: - params = { 'id' : id, 'confirm' : token } - response = session.get(_GOOGLE_DRIVE_URL, params = params, stream = True) + params = {"id": id, "confirm": token} + response = session.get(_GOOGLE_DRIVE_URL, params=params, stream=True) return response @@ -73,7 +73,7 @@ def download_url(url, destination): File to which to save the downloaded data """ if 
isinstance(destination, str): - with open(destination, 'wb') as handle: + with open(destination, "wb") as handle: download_url(url, handle) else: # destination is File @@ -102,7 +102,7 @@ def unzip(filename, destination=None, delete=True): destination = os.path.dirname(filename) elif not os.path.isdir(destination): os.mkdir(destination) - with zipfile.ZipFile(filename, 'r') as handle: + with zipfile.ZipFile(filename, "r") as handle: handle.extractall(destination) if delete: os.unlink(filename) diff --git a/scprep/io/fcs.py b/scprep/io/fcs.py index 5c91fe86..7ffdf023 100644 --- a/scprep/io/fcs.py +++ b/scprep/io/fcs.py @@ -15,8 +15,7 @@ def _channel_names_from_meta(meta, channel_numbers, naming="N"): try: - return tuple([meta['$P{0}{1}'.format(i, naming)] - for i in channel_numbers]) + return tuple([meta["$P{0}{1}".format(i, naming)] for i in channel_numbers]) except KeyError: return [] @@ -30,28 +29,31 @@ def _get_channel_names(meta, channel_numbers, channel_naming="$PnS"): names_s = _channel_names_from_meta(meta, channel_numbers, "S") # Figure out which channel names to use - if channel_naming == '$PnS': + if channel_naming == "$PnS": channel_names, channel_names_alternate = names_s, names_n - elif channel_naming == '$PnN': + elif channel_naming == "$PnN": channel_names, channel_names_alternate = names_n, names_s else: - raise ValueError("Expected channel_naming in ['$PnS', '$PnN']. " - "Got '{}'".format(channel_naming)) + raise ValueError( + "Expected channel_naming in ['$PnS', '$PnN']. " + "Got '{}'".format(channel_naming) + ) if len(channel_names) == 0: channel_names = channel_names_alternate if len(set(channel_names)) != len(channel_names): - warnings.warn('The default channel names (defined by the {} ' - 'parameter in the FCS file) were not unique. To avoid ' - 'problems in downstream analysis, the channel names ' - 'have been switched to the alternate channel names ' - 'defined in the FCS file. To avoid ' - 'seeing this warning message, explicitly instruct ' - 'the FCS parser to use the alternate channel names by ' - 'specifying the channel_naming parameter.'.format( - channel_naming), - RuntimeWarning) + warnings.warn( + "The default channel names (defined by the {} " + "parameter in the FCS file) were not unique. To avoid " + "problems in downstream analysis, the channel names " + "have been switched to the alternate channel names " + "defined in the FCS file. 
To avoid " + "seeing this warning message, explicitly instruct " + "the FCS parser to use the alternate channel names by " + "specifying the channel_naming parameter.".format(channel_naming), + RuntimeWarning, + ) channel_names = channel_names_alternate return channel_names @@ -67,36 +69,37 @@ def _reformat_meta(meta, channel_numbers): channel_properties = [] for key, value in meta.items(): - if key[:3] == '$P1': + if key[:3] == "$P1": if key[3] not in string.digits: channel_properties.append(key[3:]) # Capture all the channel information in a list of lists -- used to create # a data frame channel_matrix = [ - [meta.get('$P{0}{1}'.format(ch, p)) for p in channel_properties] + [meta.get("$P{0}{1}".format(ch, p)) for p in channel_properties] for ch in channel_numbers ] # Remove this information from the dictionary for ch in channel_numbers: for p in channel_properties: - key = '$P{0}{1}'.format(ch, p) + key = "$P{0}{1}".format(ch, p) if key in meta: meta.pop(key) - num_channels = meta['$PAR'] - column_names = ['$Pn{0}'.format(p) for p in channel_properties] + num_channels = meta["$PAR"] + column_names = ["$Pn{0}".format(p) for p in channel_properties] - df = pd.DataFrame(channel_matrix, columns=column_names, - index=(1 + np.arange(num_channels))) + df = pd.DataFrame( + channel_matrix, columns=column_names, index=(1 + np.arange(num_channels)) + ) - if '$PnE' in column_names: - df['$PnE'] = df['$PnE'].apply(lambda x: x.split(',')) - if '$PnB' in column_names: - df['$PnB'] = df['$PnB'].apply(lambda x: int(x)) + if "$PnE" in column_names: + df["$PnE"] = df["$PnE"].apply(lambda x: x.split(",")) + if "$PnB" in column_names: + df["$PnB"] = df["$PnB"].apply(lambda x: int(x)) - df.index.name = 'Channel Number' + df.index.name = "Channel Number" return df @@ -125,26 +128,26 @@ def _fcsextract(filename, channel_naming="$PnS", reformat_meta=True): into a DataFrame and moved into the '_channels_' key """ meta = dict() - with open(filename, 'rb') as handle: + with open(filename, "rb") as handle: # Parse HEADER header = handle.read(58) - meta['__header__'] = dict() - meta['__header__']['FCS format'] = header[0:6].strip() - meta['__header__']['text start'] = int(header[10:18].strip()) - meta['__header__']['text end'] = int(header[18:26].strip()) - meta['__header__']['data start'] = data_start = int( - header[26:34].strip()) - meta['__header__']['data end'] = data_end = int(header[34:42].strip()) - meta['__header__']['analysis start'] = int(header[42:50].strip()) - meta['__header__']['analysis end'] = int(header[50:58].strip()) + meta["__header__"] = dict() + meta["__header__"]["FCS format"] = header[0:6].strip() + meta["__header__"]["text start"] = int(header[10:18].strip()) + meta["__header__"]["text end"] = int(header[18:26].strip()) + meta["__header__"]["data start"] = data_start = int(header[26:34].strip()) + meta["__header__"]["data end"] = data_end = int(header[34:42].strip()) + meta["__header__"]["analysis start"] = int(header[42:50].strip()) + meta["__header__"]["analysis end"] = int(header[50:58].strip()) # Parsing TEXT segment # read TEXT portion - handle.seek(meta['__header__']['text start']) + handle.seek(meta["__header__"]["text start"]) # First byte of the text portion defines the delimeter delimeter = handle.read(1) - text = handle.read(meta['__header__']['text end'] - - meta['__header__']['text start'] + 1) + text = handle.read( + meta["__header__"]["text end"] - meta["__header__"]["text start"] + 1 + ) # Variables in TEXT poriton are stored "key/value/key/value/key/value" keyvalarray = 
text.split(delimeter) @@ -152,26 +155,30 @@ def _fcsextract(filename, channel_naming="$PnS", reformat_meta=True): for k, v in zip(keyvalarray[::2], keyvalarray[1::2]): meta[k.decode()] = v.decode() - if meta['__header__']['data start'] == 0 and \ - meta['__header__']['data end'] == 0: - data_start = int(meta['$DATASTART']) - data_end = int(meta['$DATAEND']) + if ( + meta["__header__"]["data start"] == 0 + and meta["__header__"]["data end"] == 0 + ): + data_start = int(meta["$DATASTART"]) + data_end = int(meta["$DATAEND"]) - num_dims = meta['$PAR'] = int(meta['$PAR']) - num_events = meta['$TOT'] = int(meta['$TOT']) + num_dims = meta["$PAR"] = int(meta["$PAR"]) + num_events = meta["$TOT"] = int(meta["$TOT"]) # Read DATA portion handle.seek(data_start) data = handle.read(data_end - data_start + 1) # Determine data format - datatype = meta['$DATATYPE'].lower() - if datatype not in ['f', 'd']: - raise ValueError("Expected $DATATYPE in ['F', 'D']. " - "Got '{}'".format(meta['$DATATYPE'])) + datatype = meta["$DATATYPE"].lower() + if datatype not in ["f", "d"]: + raise ValueError( + "Expected $DATATYPE in ['F', 'D']. " + "Got '{}'".format(meta["$DATATYPE"]) + ) # Determine endianess - endian = meta['$BYTEORD'] + endian = meta["$BYTEORD"] if endian == "4,3,2,1": # Big endian data format endian = ">" @@ -179,8 +186,10 @@ def _fcsextract(filename, channel_naming="$PnS", reformat_meta=True): # Little endian data format endian = "<" else: - raise ValueError("Expected $BYTEORD in ['1,2,3,4', '4,3,2,1']. " - "Got '{}'".format(endian)) + raise ValueError( + "Expected $BYTEORD in ['1,2,3,4', '4,3,2,1']. " + "Got '{}'".format(endian) + ) # Put data in StringIO so we can read bytes like a file data = BytesIO(data) @@ -197,38 +206,50 @@ def _fcsextract(filename, channel_naming="$PnS", reformat_meta=True): # Number the channels - pars = meta['$PAR'] + pars = meta["$PAR"] # Checking whether channel number count starts from 0 or from 1 - if '$P0B' in meta: + if "$P0B" in meta: # Channel number count starts from 0 channel_numbers = range(0, pars) else: # Channel numbers start from 1 channel_numbers = range(1, pars + 1) - channel_names = _get_channel_names( - meta, channel_numbers, channel_naming) + channel_names = _get_channel_names(meta, channel_numbers, channel_naming) - events = pd.DataFrame(np.array(events), columns=channel_names, - index=np.arange(len(events))) + events = pd.DataFrame( + np.array(events), columns=channel_names, index=np.arange(len(events)) + ) if reformat_meta: try: - meta['_channels_'] = _reformat_meta(meta, channel_numbers) + meta["_channels_"] = _reformat_meta(meta, channel_numbers) except Exception as e: warnings.warn("Metadata reformatting failed: {}".format(str(e))) - meta['_channel_names_'] = channel_names + meta["_channel_names_"] = channel_names return meta, events @utils._with_pkg(pkg="fcsparser") -def load_fcs(filename, gene_names=True, cell_names=True, - sparse=None, - metadata_channels=['Time', 'Event_length', 'DNA1', 'DNA2', - 'Cisplatin', 'beadDist', 'bead1'], - channel_naming='$PnS', - reformat_meta=True, override=False, - **kwargs): +def load_fcs( + filename, + gene_names=True, + cell_names=True, + sparse=None, + metadata_channels=[ + "Time", + "Event_length", + "DNA1", + "DNA2", + "Cisplatin", + "beadDist", + "bead1", + ], + channel_naming="$PnS", + reformat_meta=True, + override=False, + **kwargs +): """Load a fcs file Parameters @@ -281,21 +302,28 @@ def load_fcs(filename, gene_names=True, cell_names=True, # Parse the fcs file if override: channel_metadata, data = 
_fcsextract( - filename, reformat_meta=reformat_meta, - channel_naming=channel_naming, **kwargs) + filename, + reformat_meta=reformat_meta, + channel_naming=channel_naming, + **kwargs + ) else: try: channel_metadata, data = fcsparser.api.parse( - filename, reformat_meta=reformat_meta, **kwargs) + filename, reformat_meta=reformat_meta, **kwargs + ) except (fcsparser.api.ParserFeatureNotImplementedError, ValueError): - raise RuntimeError("fcsparser failed to load {}, likely due to a " - "malformed header. You can try using " - "`override=True` to use scprep's built-in " - "experimental FCS parser.".format(filename)) + raise RuntimeError( + "fcsparser failed to load {}, likely due to a " + "malformed header. You can try using " + "`override=True` to use scprep's built-in " + "experimental FCS parser.".format(filename) + ) metadata_channels = data.columns.intersection(metadata_channels) data_channels = data.columns.difference(metadata_channels) cell_metadata = data[metadata_channels] data = data[data_channels] - data = _matrix_to_data_frame(data, gene_names=gene_names, - cell_names=cell_names, sparse=sparse) + data = _matrix_to_data_frame( + data, gene_names=gene_names, cell_names=cell_names, sparse=sparse + ) return channel_metadata, cell_metadata, data diff --git a/scprep/io/hdf5.py b/scprep/io/hdf5.py index 1e8d5c7c..2ec41f80 100644 --- a/scprep/io/hdf5.py +++ b/scprep/io/hdf5.py @@ -23,12 +23,13 @@ def with_HDF5(fun, *args, **kwargs): raise ModuleNotFoundError( "Found neither tables nor h5py. " "Please install one of them with e.g. " - "`pip install --user tables` or `pip install --user h5py`") + "`pip install --user tables` or `pip install --user h5py`" + ) return fun(*args, **kwargs) @with_HDF5 -def open_file(filename, mode='r', backend=None): +def open_file(filename, mode="r", backend=None): """Open an HDF5 file with either tables or h5py Gives a simple, unified interface for both tables and h5py @@ -52,16 +53,17 @@ def open_file(filename, mode='r', backend=None): if backend is None: try: tables - backend = 'tables' + backend = "tables" except NameError: - backend = 'h5py' - if backend == 'tables': + backend = "h5py" + if backend == "tables": return tables.open_file(filename, mode) - elif backend == 'h5py': + elif backend == "h5py": return h5py.File(filename, mode) else: raise ValueError( - "Expected backend in ['tables', 'h5py']. Got {}".format(backend)) + "Expected backend in ['tables', 'h5py']. Got {}".format(backend) + ) def _is_tables(obj, allow_file=True, allow_group=True, allow_dataset=True): @@ -113,7 +115,10 @@ def list_nodes(f): return [node._v_name for node in f.list_nodes(f.root)] else: raise TypeError( - "Expected h5py.File, tables.File, h5py.Group or tables.Group. Got {}".format(type(f))) + "Expected h5py.File, tables.File, h5py.Group or tables.Group. Got {}".format( + type(f) + ) + ) @with_HDF5 @@ -141,7 +146,10 @@ def get_node(f, node): return f[node] else: raise TypeError( - "Expected h5py.File, tables.File, h5py.Group or tables.Group. Got {}".format(type(f))) + "Expected h5py.File, tables.File, h5py.Group or tables.Group. Got {}".format( + type(f) + ) + ) @with_HDF5 @@ -163,4 +171,5 @@ def get_values(dataset): return dataset.read() else: raise TypeError( - "Expected h5py.Dataset or tables.CArray. Got {}".format(type(dataset))) + "Expected h5py.Dataset or tables.CArray. Got {}".format(type(dataset)) + ) diff --git a/scprep/io/mtx.py b/scprep/io/mtx.py index 6f1bd2a8..5921b9d1 100644 --- a/scprep/io/mtx.py +++ b/scprep/io/mtx.py @@ -10,8 +10,7 @@ from .. 
import utils -def load_mtx(mtx_file, cell_axis='row', - gene_names=None, cell_names=None, sparse=None): +def load_mtx(mtx_file, cell_axis="row", gene_names=None, cell_names=None, sparse=None): """Load a mtx file Parameters @@ -36,19 +35,20 @@ def load_mtx(mtx_file, cell_axis='row', pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray or scipy.sparse.spmatrix """ - if cell_axis not in ['row', 'column', 'col']: + if cell_axis not in ["row", "column", "col"]: raise ValueError( - "cell_axis {} not recognized. Expected 'row' or 'column'".format( - cell_axis)) + "cell_axis {} not recognized. Expected 'row' or 'column'".format(cell_axis) + ) # Read in mtx file data = sio.mmread(mtx_file) - if cell_axis in ['column', 'col']: + if cell_axis in ["column", "col"]: data = data.T data = _matrix_to_data_frame( - data, gene_names=gene_names, - cell_names=cell_names, sparse=sparse) + data, gene_names=gene_names, cell_names=cell_names, sparse=sparse + ) return data + def save_mtx(data, destination, cell_names=None, gene_names=None): """Save a mtx file @@ -87,11 +87,11 @@ def save_mtx(data, destination, cell_names=None, gene_names=None): if not os.path.isdir(destination): os.mkdir(destination) if cell_names is not None: - with open(os.path.join(destination, "cell_names.tsv"), 'w') as handle: + with open(os.path.join(destination, "cell_names.tsv"), "w") as handle: for name in cell_names: handle.write("{}\n".format(name)) if gene_names is not None: - with open(os.path.join(destination, "gene_names.tsv"), 'w') as handle: + with open(os.path.join(destination, "gene_names.tsv"), "w") as handle: for name in gene_names: handle.write("{}\n".format(name)) sio.mmwrite(os.path.join(destination, "matrix.mtx"), data) diff --git a/scprep/io/tenx.py b/scprep/io/tenx.py index 6e6f949f..4a960cdb 100644 --- a/scprep/io/tenx.py +++ b/scprep/io/tenx.py @@ -29,29 +29,28 @@ def _combine_gene_id(symbols, ids): pandas.Index with combined gene symbols and ids """ - columns = np.core.defchararray.add( - np.array(symbols, dtype=str), ' (') - columns = np.core.defchararray.add( - columns, np.array(ids, dtype=str)) - columns = np.core.defchararray.add(columns, ')') + columns = np.core.defchararray.add(np.array(symbols, dtype=str), " (") + columns = np.core.defchararray.add(columns, np.array(ids, dtype=str)) + columns = np.core.defchararray.add(columns, ")") return columns -def _parse_10x_genes(symbols, ids, gene_labels='symbol', - allow_duplicates=True): - assert gene_labels in ['symbol', 'id', 'both'] - if gene_labels == 'symbol': +def _parse_10x_genes(symbols, ids, gene_labels="symbol", allow_duplicates=True): + assert gene_labels in ["symbol", "id", "both"] + if gene_labels == "symbol": columns = symbols if not allow_duplicates and len(np.unique(columns)) < len(columns): warnings.warn( "Duplicate gene names detected! Forcing `gene_labels='both'`. 
" "Alternatively, try `gene_labels='id'`, " "`allow_duplicates=True`, or load the matrix" - " with `sparse=False`", RuntimeWarning) - gene_labels = 'both' - if gene_labels == 'both': + " with `sparse=False`", + RuntimeWarning, + ) + gene_labels = "both" + if gene_labels == "both": columns = _combine_gene_id(symbols, ids) - elif gene_labels == 'id': + elif gene_labels == "id": columns = ids return columns @@ -66,8 +65,7 @@ def _find_gz_file(*path): return path + ".gz" -def load_10X(data_dir, sparse=True, gene_labels='symbol', - allow_duplicates=None): +def load_10X(data_dir, sparse=True, gene_labels="symbol", allow_duplicates=None): """Basic IO for 10X data produced from the 10X Cellranger pipeline. A default run of the `cellranger count` command will generate gene-barcode @@ -99,53 +97,58 @@ def load_10X(data_dir, sparse=True, gene_labels='symbol', be a pd.DataFrame. """ - if gene_labels not in ['id', 'symbol', 'both']: + if gene_labels not in ["id", "symbol", "both"]: raise ValueError( "gene_labels='{}' not recognized. " - "Choose from ['symbol', 'id', 'both']".format(gene_labels)) + "Choose from ['symbol', 'id', 'both']".format(gene_labels) + ) if not os.path.isdir(data_dir): - raise FileNotFoundError( - "{} is not a directory".format(data_dir)) + raise FileNotFoundError("{} is not a directory".format(data_dir)) try: m = sio.mmread(_find_gz_file(data_dir, "matrix.mtx")) try: - genes = pd.read_csv(_find_gz_file(data_dir, "genes.tsv"), - delimiter='\t', header=None) + genes = pd.read_csv( + _find_gz_file(data_dir, "genes.tsv"), delimiter="\t", header=None + ) except FileNotFoundError: - genes = pd.read_csv(_find_gz_file(data_dir, "features.tsv"), - delimiter='\t', header=None) + genes = pd.read_csv( + _find_gz_file(data_dir, "features.tsv"), delimiter="\t", header=None + ) if genes.shape[1] == 2: # Cellranger < 3.0 - genes.columns = ['id', 'symbol'] + genes.columns = ["id", "symbol"] else: # Cellranger >= 3.0 - genes.columns = ['id', 'symbol', 'measurement'] - barcodes = pd.read_csv(_find_gz_file(data_dir, "barcodes.tsv"), - delimiter='\t', header=None) + genes.columns = ["id", "symbol", "measurement"] + barcodes = pd.read_csv( + _find_gz_file(data_dir, "barcodes.tsv"), delimiter="\t", header=None + ) except (FileNotFoundError, IOError): raise FileNotFoundError( "'matrix.mtx(.gz)', '[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)' must be present " - "in {}".format(data_dir)) + "in {}".format(data_dir) + ) cell_names = barcodes[0] if allow_duplicates is None: allow_duplicates = not sparse - gene_names = _parse_10x_genes(genes['symbol'].values.astype(str), - genes['id'].values.astype(str), - gene_labels=gene_labels, - allow_duplicates=allow_duplicates) - - data = _matrix_to_data_frame(m.T, cell_names=cell_names, - gene_names=gene_names, - sparse=sparse) + gene_names = _parse_10x_genes( + genes["symbol"].values.astype(str), + genes["id"].values.astype(str), + gene_labels=gene_labels, + allow_duplicates=allow_duplicates, + ) + + data = _matrix_to_data_frame( + m.T, cell_names=cell_names, gene_names=gene_names, sparse=sparse + ) return data -def load_10X_zip(filename, sparse=True, gene_labels='symbol', - allow_duplicates=None): +def load_10X_zip(filename, sparse=True, gene_labels="symbol", allow_duplicates=None): """Basic IO for zipped 10X data produced from the 10X Cellranger pipeline. Runs `load_10X` after unzipping the data contained in `filename` @@ -172,14 +175,15 @@ def load_10X_zip(filename, sparse=True, gene_labels='symbol', be a pd.DataFrame. 
""" - if gene_labels not in ['id', 'symbol', 'both']: + if gene_labels not in ["id", "symbol", "both"]: raise ValueError( "gene_labels='{}' not recognized. " - "Choose from ['symbol', 'id', 'both']".format(gene_labels)) + "Choose from ['symbol', 'id', 'both']".format(gene_labels) + ) if not os.path.isfile(filename): with tempfile.TemporaryDirectory() as download_dir: - zip_filename = os.path.join(download_dir, 'download.zip') + zip_filename = os.path.join(download_dir, "download.zip") try: with urllib.request.urlopen(filename) as url: with open(zip_filename, "wb") as handle: @@ -187,14 +191,16 @@ def load_10X_zip(filename, sparse=True, gene_labels='symbol', except ValueError as e: if str(e).startswith("unknown url type:"): # not actually a url - raise FileNotFoundError( - "No such file: '{}'".format(filename)) + raise FileNotFoundError("No such file: '{}'".format(filename)) else: raise else: - return load_10X_zip(zip_filename, sparse=sparse, - gene_labels=gene_labels, - allow_duplicates=allow_duplicates) + return load_10X_zip( + zip_filename, + sparse=sparse, + gene_labels=gene_labels, + allow_duplicates=allow_duplicates, + ) tmpdir = tempfile.mkdtemp() with zipfile.ZipFile(filename) as handle: @@ -204,14 +210,24 @@ def load_10X_zip(filename, sparse=True, gene_labels='symbol', else: dirname = files[0].strip("/") subdir_files = [f.split("/")[-1] for f in files] - valid = (("barcodes.tsv" in subdir_files or "barcodes.tsv.gz" in subdir_files) and - (("genes.tsv" in subdir_files or "genes.tsv.gz" in subdir_files) or - ("features.tsv" in subdir_files or "features.tsv.gz" in subdir_files)) and - ("matrix.mtx" in subdir_files or "matrix.mtx.gz" in subdir_files)) + valid = ( + ("barcodes.tsv" in subdir_files or "barcodes.tsv.gz" in subdir_files) + and ( + ("genes.tsv" in subdir_files or "genes.tsv.gz" in subdir_files) + or ( + "features.tsv" in subdir_files + or "features.tsv.gz" in subdir_files + ) + ) + and ("matrix.mtx" in subdir_files or "matrix.mtx.gz" in subdir_files) + ) if not valid: raise ValueError( "Expected a single zipped folder containing 'matrix.mtx(.gz)', " - "'[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)'. Got {}".format(files)) + "'[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)'. Got {}".format( + files + ) + ) handle.extractall(path=tmpdir) data = load_10X(os.path.join(tmpdir, dirname)) shutil.rmtree(tmpdir) @@ -219,8 +235,14 @@ def load_10X_zip(filename, sparse=True, gene_labels='symbol', @hdf5.with_HDF5 -def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol', - allow_duplicates=None, backend=None): +def load_10X_HDF5( + filename, + genome=None, + sparse=True, + gene_labels="symbol", + allow_duplicates=None, + backend=None, +): """Basic IO for HDF5 10X data produced from the 10X Cellranger pipeline. Equivalent to `load_10X` but for HDF5 format. @@ -252,69 +274,76 @@ def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol', be a pd.DataFrame. """ - if gene_labels not in ['id', 'symbol', 'both']: + if gene_labels not in ["id", "symbol", "both"]: raise ValueError( "gene_labels='{}' not recognized. 
" - "Choose from ['symbol', 'id', 'both']".format(gene_labels)) + "Choose from ['symbol', 'id', 'both']".format(gene_labels) + ) # default allow_duplicates if allow_duplicates is None: allow_duplicates = not sparse - with hdf5.open_file(filename, 'r', backend=backend) as f: + with hdf5.open_file(filename, "r", backend=backend) as f: # handle genome groups = hdf5.list_nodes(f) try: # Cellranger 3.0 - group = hdf5.get_node(f, 'matrix') + group = hdf5.get_node(f, "matrix") if genome is not None: raise NotImplementedError( "Selecting genomes for Cellranger 3.0 files is not " "currently supported. Please file an issue at " - "https://github.com/KrishnaswamyLab/scprep/issues") + "https://github.com/KrishnaswamyLab/scprep/issues" + ) except (AttributeError, KeyError): # Cellranger 2.0 if genome is None: print_genomes = ", ".join(groups) genome = groups[0] if len(groups) > 1: - print("Available genomes: {}. Selecting {} by default".format( - print_genomes, genome)) + print( + "Available genomes: {}. Selecting {} by default".format( + print_genomes, genome + ) + ) try: group = hdf5.get_node(f, genome) except (AttributeError, KeyError): print_genomes = ", ".join(groups) raise ValueError( "Genome {} not found in {}. " - "Available genomes: {}".format(genome, filename, - print_genomes)) + "Available genomes: {}".format(genome, filename, print_genomes) + ) try: # Cellranger 3.0 - features = hdf5.get_node(group, 'features') - gene_symbols = hdf5.get_node(features, 'name') - gene_ids = hdf5.get_node(features, 'id') + features = hdf5.get_node(group, "features") + gene_symbols = hdf5.get_node(features, "name") + gene_ids = hdf5.get_node(features, "id") except (KeyError, IndexError): # Cellranger 2.0 - gene_symbols = hdf5.get_node(group, 'gene_names') - gene_ids = hdf5.get_node(group, 'genes') + gene_symbols = hdf5.get_node(group, "gene_names") + gene_ids = hdf5.get_node(group, "genes") # convert to string column names gene_names = _parse_10x_genes( symbols=[g.decode() for g in hdf5.get_values(gene_symbols)], ids=[g.decode() for g in hdf5.get_values(gene_ids)], - gene_labels=gene_labels, allow_duplicates=allow_duplicates) - - cell_names = [b.decode() for b in hdf5.get_values( - hdf5.get_node(group, 'barcodes'))] - data = hdf5.get_values(hdf5.get_node(group, 'data')) - indices = hdf5.get_values(hdf5.get_node(group, 'indices')) - indptr = hdf5.get_values(hdf5.get_node(group, 'indptr')) - shape = hdf5.get_values(hdf5.get_node(group, 'shape')) + gene_labels=gene_labels, + allow_duplicates=allow_duplicates, + ) + + cell_names = [ + b.decode() for b in hdf5.get_values(hdf5.get_node(group, "barcodes")) + ] + data = hdf5.get_values(hdf5.get_node(group, "data")) + indices = hdf5.get_values(hdf5.get_node(group, "indices")) + indptr = hdf5.get_values(hdf5.get_node(group, "indptr")) + shape = hdf5.get_values(hdf5.get_node(group, "shape")) data = sp.csc_matrix((data, indices, indptr), shape=shape) - data = _matrix_to_data_frame(data.T, - gene_names=gene_names, - cell_names=cell_names, - sparse=sparse) + data = _matrix_to_data_frame( + data.T, gene_names=gene_names, cell_names=cell_names, sparse=sparse + ) return data diff --git a/scprep/io/utils.py b/scprep/io/utils.py index 53eb9f70..3506f662 100644 --- a/scprep/io/utils.py +++ b/scprep/io/utils.py @@ -33,28 +33,35 @@ def _parse_header(header, n_expected, header_type="gene_names"): delimiter = "\t" else: delimiter = "," - columns = pd.read_csv(header, delimiter=delimiter, - header=None).values.flatten().astype(str) + columns = ( + pd.read_csv(header, 
delimiter=delimiter, header=None) + .values.flatten() + .astype(str) + ) if not len(columns) == n_expected: - raise ValueError("Expected {} entries in {}. Got {}".format( - n_expected, header, len(columns))) + raise ValueError( + "Expected {} entries in {}. Got {}".format( + n_expected, header, len(columns) + ) + ) else: # treat as list columns = header if not len(columns) == n_expected: - raise ValueError("Expected {} entries in {}. Got {}".format( - n_expected, header_type, len(columns))) + raise ValueError( + "Expected {} entries in {}. Got {}".format( + n_expected, header_type, len(columns) + ) + ) return columns def _parse_gene_names(header, data): - return _parse_header(header, data.shape[1], - header_type="gene_names") + return _parse_header(header, data.shape[1], header_type="gene_names") def _parse_cell_names(header, data): - return _parse_header(header, data.shape[0], - header_type="cell_names") + return _parse_header(header, data.shape[0], header_type="cell_names") def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): @@ -74,8 +81,7 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): sparse : `bool` or `None` (default: None) If not `None`, overrides default sparsity of the data. """ - if gene_names is None and cell_names is None and \ - not isinstance(data, pd.DataFrame): + if gene_names is None and cell_names is None and not isinstance(data, pd.DataFrame): # just a matrix if sparse is not None: if sparse: @@ -96,11 +102,14 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): if sparse is None: # let the input data decide sparse = utils.is_sparse_dataframe(data) or sp.issparse(data) - if sparse and gene_names is not None and \ - len(np.unique(gene_names)) < len(gene_names): + if ( + sparse + and gene_names is not None + and len(np.unique(gene_names)) < len(gene_names) + ): warnings.warn( - "Duplicate gene names detected! Forcing dense matrix", - RuntimeWarning) + "Duplicate gene names detected! Forcing dense matrix", RuntimeWarning + ) sparse = False if sparse: # return pandas.DataFrame[SparseArray] @@ -112,7 +121,9 @@ def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None): if not utils.is_sparse_dataframe(data): data = utils.dataframe_to_sparse(data, fill_value=0.0) elif sp.issparse(data): - data = pd.DataFrame.sparse.from_spmatrix(data, index=cell_names, columns=gene_names) + data = pd.DataFrame.sparse.from_spmatrix( + data, index=cell_names, columns=gene_names + ) else: data = pd.DataFrame(data, index=cell_names, columns=gene_names) data = utils.dataframe_to_sparse(data, fill_value=0.0) diff --git a/scprep/measure.py b/scprep/measure.py index 971c83e5..bc488a5e 100644 --- a/scprep/measure.py +++ b/scprep/measure.py @@ -23,13 +23,19 @@ def library_size(data): """ library_size = utils.matrix_sum(data, axis=1) if isinstance(library_size, pd.Series): - library_size.name = 'library_size' + library_size.name = "library_size" return library_size -def gene_set_expression(data, genes=None, library_size_normalize=False, - starts_with=None, ends_with=None, - exact_word=None, regex=None): +def gene_set_expression( + data, + genes=None, + library_size_normalize=False, + starts_with=None, + ends_with=None, + exact_word=None, + regex=None, +): """Measure the expression of a set of genes in each cell. 
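
    Expression values are summed across the selected genes for each
    cell (a single selected gene is returned as-is), optionally after
    library size normalization of the full matrix.
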
Parameters @@ -56,16 +62,22 @@ def gene_set_expression(data, genes=None, library_size_normalize=False, """ if library_size_normalize: from .normalize import library_size_normalize + data = library_size_normalize(data) - gene_data = select.select_cols(data, idx=genes, starts_with=starts_with, - ends_with=ends_with, - exact_word=exact_word, regex=regex) + gene_data = select.select_cols( + data, + idx=genes, + starts_with=starts_with, + ends_with=ends_with, + exact_word=exact_word, + regex=regex, + ) if len(gene_data.shape) > 1: gene_set_expression = library_size(gene_data) else: gene_set_expression = gene_data if isinstance(gene_set_expression, pd.Series): - gene_set_expression.name = 'expression' + gene_set_expression.name = "expression" return gene_set_expression @@ -100,25 +112,31 @@ def gene_variability(data, kernel_size=0.005, smooth=5, return_means=False): data_mean = utils.toarray(np.mean(data, axis=0)).flatten() if kernel_size < 1: - kernel_size = 2*(int(kernel_size * len(data_std))//2)+1 + kernel_size = 2 * (int(kernel_size * len(data_std)) // 2) + 1 order = np.argsort(data_mean) data_std_med = np.empty_like(data_std) data_std_order = data_std[order] # handle overhang with reflection - data_std_order = np.r_[data_std_order[kernel_size::-1], data_std_order, data_std_order[:-kernel_size:-1]] - medfilt = scipy.signal.medfilt(data_std_order, kernel_size=kernel_size)[kernel_size:-kernel_size] + data_std_order = np.r_[ + data_std_order[kernel_size::-1], + data_std_order, + data_std_order[:-kernel_size:-1], + ] + medfilt = scipy.signal.medfilt(data_std_order, kernel_size=kernel_size)[ + kernel_size:-kernel_size + ] # apply a little smoothing for i in range(smooth): - medfilt = np.r_[(medfilt[1:] + medfilt[:-1])/2, medfilt[-1]] + medfilt = np.r_[(medfilt[1:] + medfilt[:-1]) / 2, medfilt[-1]] data_std_med[order] = medfilt result = data_std - data_std_med if columns is not None: - result = pd.Series(result, index=columns, name='variability') - data_mean = pd.Series(data_mean, index=columns, name='mean') + result = pd.Series(result, index=columns, name="variability") + data_mean = pd.Series(data_mean, index=columns, name="mean") if return_means: result = result, data_mean return result @@ -141,5 +159,5 @@ def gene_capture_count(data, cutoff=0): """ gene_sums = np.array(utils.matrix_sum(data > cutoff, axis=0)).reshape(-1) if isinstance(data, pd.DataFrame): - gene_sums = pd.Series(gene_sums, index=data.columns, name='capture_count') + gene_sums = pd.Series(gene_sums, index=data.columns, name="capture_count") return gene_sums diff --git a/scprep/normalize.py b/scprep/normalize.py index 1908e79c..69cb128d 100644 --- a/scprep/normalize.py +++ b/scprep/normalize.py @@ -11,31 +11,33 @@ def _get_scaled_libsize(data, rescale=10000, return_library_size=False): - if return_library_size or rescale in ['median', 'mean']: + if return_library_size or rescale in ["median", "mean"]: libsize = measure.library_size(data) else: libsize = None - if rescale == 'median': + if rescale == "median": rescale = np.median(utils.toarray(libsize)) if rescale == 0: - warnings.warn("Median library size is zero. " - "Rescaling to mean instead.", - UserWarning) + warnings.warn( + "Median library size is zero. 
" "Rescaling to mean instead.", + UserWarning, + ) rescale = np.mean(utils.toarray(libsize)) - elif rescale == 'mean': + elif rescale == "mean": rescale = np.mean(utils.toarray(libsize)) elif isinstance(rescale, numbers.Number): pass elif rescale is None: rescale = 1 else: - raise ValueError("Expected rescale in ['median', 'mean'], a number " - "or `None`. Got {}".format(rescale)) + raise ValueError( + "Expected rescale in ['median', 'mean'], a number " + "or `None`. Got {}".format(rescale) + ) return rescale, libsize -def library_size_normalize(data, rescale=10000, - return_library_size=False): +def library_size_normalize(data, rescale=10000, return_library_size=False): """Performs L1 normalization on input data Performs L1 normalization on input data such that the sum of expression values for each cell sums to 1 @@ -74,20 +76,19 @@ def library_size_normalize(data, rescale=10000, # dense data data = data.to_numpy() - calc_libsize = sparse.issparse(data) and (return_library_size or - data.nnz > 2**31) + calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2 ** 31) rescale, libsize = _get_scaled_libsize(data, rescale, calc_libsize) if libsize is not None: divisor = utils.toarray(libsize) data_norm = utils.matrix_vector_elementwise_multiply( - data, 1 / np.where(divisor == 0, 1, divisor), axis=0) + data, 1 / np.where(divisor == 0, 1, divisor), axis=0 + ) else: if return_library_size: - data_norm, libsize = normalize( - data, norm='l1', axis=1, return_norm=True) + data_norm, libsize = normalize(data, norm="l1", axis=1, return_norm=True) else: - data_norm = normalize(data, norm='l1', axis=1) + data_norm = normalize(data, norm="l1", axis=1) data_norm = data_norm * rescale if columns is not None: @@ -98,7 +99,7 @@ def library_size_normalize(data, rescale=10000, data_norm = pd.DataFrame(data_norm) data_norm.columns = columns data_norm.index = index - libsize = pd.Series(libsize, index=index, name='library_size') + libsize = pd.Series(libsize, index=index, name="library_size") if return_library_size: return data_norm, libsize else: @@ -123,9 +124,14 @@ def batch_mean_center(data, sample_idx=None): data : array-like, shape=[n_samples, n_features] Batch mean-centered output data. """ - if sparse.issparse(data) or isinstance(data, pd.SparseDataFrame) or utils.is_sparse_dataframe(data): - raise ValueError("Cannot mean center sparse data. " - "Convert to dense matrix first.") + if ( + sparse.issparse(data) + or isinstance(data, pd.SparseDataFrame) + or utils.is_sparse_dataframe(data) + ): + raise ValueError( + "Cannot mean center sparse data. " "Convert to dense matrix first." + ) if sample_idx is None: sample_idx = np.ones(len(data)) for sample in np.unique(sample_idx): diff --git a/scprep/plot/colors.py b/scprep/plot/colors.py index 4d688dd5..8d682f46 100644 --- a/scprep/plot/colors.py +++ b/scprep/plot/colors.py @@ -2,6 +2,7 @@ from . import tools from .._lazyload import matplotlib as mpl + plt = mpl.pyplot @@ -27,22 +28,24 @@ def tab10_continuous(n_colors=10, n_step=200, reverse=False): cmap : `matplotlib.colors.ListedColormap` """ if n_colors < 1 or n_colors > 10: - raise ValueError( - "Expected 0 < n_colors <= 10. Got {}".format(n_colors)) + raise ValueError("Expected 0 < n_colors <= 10. Got {}".format(n_colors)) if n_step < 2: - raise ValueError( - "Expected n_step >= 2. Got {}".format(n_step)) + raise ValueError("Expected n_step >= 2. 
Got {}".format(n_step)) base_color_idx = np.repeat(np.arange(n_colors), 2) * 2 if reverse: offset = np.tile([0, 1], n_colors) else: offset = np.tile([1, 0], n_colors) color_idx = base_color_idx + offset - full_cmap = tools.create_colormap( - np.array(plt.cm.tab20.colors)[color_idx]) + full_cmap = tools.create_colormap(np.array(plt.cm.tab20.colors)[color_idx]) linspace = np.linspace(0, 1 / (n_colors * 2 - 1), n_step) - restricted_cmap = mpl.colors.ListedColormap(full_cmap(np.concatenate([ - linspace + 2 * i / (n_colors * 2 - 1) for i in range(n_colors)]))) + restricted_cmap = mpl.colors.ListedColormap( + full_cmap( + np.concatenate( + [linspace + 2 * i / (n_colors * 2 - 1) for i in range(n_colors)] + ) + ) + ) return restricted_cmap @@ -56,8 +59,7 @@ def tab30(): ------- cmap : `matplotlib.colors.ListedColormap` """ - colors = np.vstack([mpl.cm.tab20c.colors, - mpl.cm.tab20b.colors]) + colors = np.vstack([mpl.cm.tab20c.colors, mpl.cm.tab20b.colors]) select_idx = np.repeat(np.arange(10), 3) * 4 + np.tile(np.arange(3), 10) return mpl.colors.ListedColormap(colors[select_idx]) @@ -71,8 +73,7 @@ def tab40(): ------- cmap : `matplotlib.colors.ListedColormap` """ - colors = np.vstack([mpl.cm.tab20c.colors, - mpl.cm.tab20b.colors]) + colors = np.vstack([mpl.cm.tab20c.colors, mpl.cm.tab20b.colors]) return mpl.colors.ListedColormap(colors) @@ -100,8 +101,7 @@ def tab(n=10): cmap : `matplotlib.colors.ListedColormap` """ if n < 1: - raise ValueError( - "Expected n >= 1. Got {}".format(n)) + raise ValueError("Expected n >= 1. Got {}".format(n)) n_shades = int(np.ceil(n / 10)) if n_shades == 1: cmap = mpl.cm.tab10 @@ -115,7 +115,8 @@ def tab(n=10): cmap = tab10_continuous(n_colors=10, n_step=n_shades) # restrict to n values if n > 1 and n < cmap.N: - select_idx = np.tile(np.arange(10), n_shades) * \ - n_shades + np.repeat(np.arange(n_shades), 10) + select_idx = np.tile(np.arange(10), n_shades) * n_shades + np.repeat( + np.arange(n_shades), 10 + ) cmap = mpl.colors.ListedColormap(np.array(cmap.colors)[select_idx[:n]]) return cmap diff --git a/scprep/plot/histogram.py b/scprep/plot/histogram.py index c3429214..02128785 100644 --- a/scprep/plot/histogram.py +++ b/scprep/plot/histogram.py @@ -3,11 +3,11 @@ import warnings from .. 
import measure, utils -from .utils import (_get_figure, show, - temp_fontsize, parse_fontsize) +from .utils import _get_figure, show, temp_fontsize, parse_fontsize from .tools import label_axis -_EPS = np.finfo('float').eps +_EPS = np.finfo("float").eps + def _log_bins(xmin, xmax, bins): if xmin > xmax: @@ -34,7 +34,9 @@ def _symlog_bins(xmin, xmax, abs_min, bins): pos_range = np.log(xmax) - np.log(abs_min) total_range = pos_range + neg_range if total_range > 0: - n_pos_bins = np.round((bins-1) * pos_range / (pos_range + neg_range)).astype(int) + n_pos_bins = np.round( + (bins - 1) * pos_range / (pos_range + neg_range) + ).astype(int) else: n_pos_bins = 1 n_neg_bins = max(bins - n_pos_bins - 1, 1) @@ -56,19 +58,24 @@ def _symlog_bins(xmin, xmax, abs_min, bins): @utils._with_pkg(pkg="matplotlib", min_version=3) -def histogram(data, - bins=100, log=False, - cutoff=None, percentile=None, - ax=None, figsize=None, - xlabel=None, - ylabel='Number of cells', - title=None, - fontsize=None, - histtype='stepfilled', - alpha=None, - filename=None, - dpi=None, - **kwargs): +def histogram( + data, + bins=100, + log=False, + cutoff=None, + percentile=None, + ax=None, + figsize=None, + xlabel=None, + ylabel="Number of cells", + title=None, + fontsize=None, + histtype="stepfilled", + alpha=None, + filename=None, + dpi=None, + **kwargs +): """Plot a histogram. Parameters @@ -133,33 +140,34 @@ def histogram(data, xmax = np.max(data) if alpha is None: alpha = 1 - if log == 'x' or log is True: + if log == "x" or log is True: d_flat = np.concatenate(data) if isinstance(data, list) else data - abs_min = np.min(np.where(d_flat != 0, np.abs(d_flat), np.max(np.abs(d_flat)))) + abs_min = np.min( + np.where(d_flat != 0, np.abs(d_flat), np.max(np.abs(d_flat))) + ) if abs_min == 0: abs_min = 0.1 bins = _symlog_bins(xmin, xmax, abs_min, bins=bins) ax.hist(data, bins=bins, histtype=histtype, alpha=alpha, **kwargs) - if log == 'x' or log is True: - ax.set_xscale('symlog', linthreshx=abs_min) - if log == 'y' or log is True: - ax.set_yscale('log') + if log == "x" or log is True: + ax.set_xscale("symlog", linthreshx=abs_min) + if log == "y" or log is True: + ax.set_yscale("log") label_axis(ax.xaxis, label=xlabel) label_axis(ax.yaxis, label=ylabel) if title is not None: - ax.set_title(title, fontsize=parse_fontsize(None, 'xx-large')) + ax.set_title(title, fontsize=parse_fontsize(None, "xx-large")) - cutoff = utils._get_percentile_cutoff( - data, cutoff, percentile, required=False) + cutoff = utils._get_percentile_cutoff(data, cutoff, percentile, required=False) if cutoff is not None: if isinstance(cutoff, numbers.Number): - ax.axvline(cutoff, color='red') + ax.axvline(cutoff, color="red") else: for c in cutoff: - ax.axvline(c, color='red') + ax.axvline(c, color="red") # save and show if show_fig: show(fig) @@ -169,16 +177,21 @@ def histogram(data, @utils._with_pkg(pkg="matplotlib", min_version=3) -def plot_library_size(data, - bins=100, log=True, - cutoff=None, percentile=None, - ax=None, figsize=None, - xlabel='Library size', - title=None, - fontsize=None, - filename=None, - dpi=None, - **kwargs): +def plot_library_size( + data, + bins=100, + log=True, + cutoff=None, + percentile=None, + ax=None, + figsize=None, + xlabel="Library size", + title=None, + fontsize=None, + filename=None, + dpi=None, + **kwargs +): """Plot the library size histogram. 
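A usage sketch of `histogram` exercising the symmetric-log binning built above (random values, illustrative):

    import numpy as np
    import scprep

    values = np.random.lognormal(0, 1, 1000)
    # log="x" switches to the symlog bins from _symlog_bins; `cutoff`
    # draws a red vertical line at the threshold
    scprep.plot.histogram(values, log="x", cutoff=5)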
Parameters @@ -223,31 +236,48 @@ def plot_library_size(data, data = utils.to_array_or_spmatrix(data) if len(data.shape) > 2 or data.dtype.type is np.object_: # top level must be list - libsize = [measure.library_size(d) - for d in data] + libsize = [measure.library_size(d) for d in data] else: libsize = measure.library_size(data) - return histogram(libsize, - cutoff=cutoff, percentile=percentile, - bins=bins, log=log, ax=ax, figsize=figsize, - xlabel=xlabel, title=title, fontsize=fontsize, - filename=filename, dpi=dpi, **kwargs) + return histogram( + libsize, + cutoff=cutoff, + percentile=percentile, + bins=bins, + log=log, + ax=ax, + figsize=figsize, + xlabel=xlabel, + title=title, + fontsize=fontsize, + filename=filename, + dpi=dpi, + **kwargs + ) @utils._with_pkg(pkg="matplotlib", min_version=3) -def plot_gene_set_expression(data, genes=None, - starts_with=None, ends_with=None, - exact_word=None, regex=None, - bins=100, log=False, - cutoff=None, percentile=None, - library_size_normalize=False, - ax=None, figsize=None, - xlabel='Gene expression', - title=None, - fontsize=None, - filename=None, - dpi=None, - **kwargs): +def plot_gene_set_expression( + data, + genes=None, + starts_with=None, + ends_with=None, + exact_word=None, + regex=None, + bins=100, + log=False, + cutoff=None, + percentile=None, + library_size_normalize=False, + ax=None, + figsize=None, + xlabel="Gene expression", + title=None, + fontsize=None, + filename=None, + dpi=None, + **kwargs +): """Plot the histogram of the expression of a gene set. Parameters @@ -301,27 +331,51 @@ def plot_gene_set_expression(data, genes=None, ax : `matplotlib.Axes` axis on which plot was drawn """ - if hasattr(data, 'shape') and len(data.shape) == 2: + if hasattr(data, "shape") and len(data.shape) == 2: expression = measure.gene_set_expression( - data, genes=genes, - starts_with=starts_with, ends_with=ends_with, - exact_word=exact_word, regex=regex, - library_size_normalize=library_size_normalize) + data, + genes=genes, + starts_with=starts_with, + ends_with=ends_with, + exact_word=exact_word, + regex=regex, + library_size_normalize=library_size_normalize, + ) else: data_array = utils.to_array_or_spmatrix(data) if len(data_array.shape) == 2 and data_array.dtype.type is not np.object_: expression = measure.gene_set_expression( - data, genes=genes, - starts_with=starts_with, ends_with=ends_with, regex=regex, - library_size_normalize=library_size_normalize) + data, + genes=genes, + starts_with=starts_with, + ends_with=ends_with, + regex=regex, + library_size_normalize=library_size_normalize, + ) else: - expression = [measure.gene_set_expression( - d, genes=genes, - starts_with=starts_with, ends_with=ends_with, regex=regex, - library_size_normalize=library_size_normalize) - for d in data] - return histogram(expression, - cutoff=cutoff, percentile=percentile, - bins=bins, log=log, ax=ax, figsize=figsize, - xlabel=xlabel, title=title, fontsize=fontsize, - filename=filename, dpi=dpi, **kwargs) + expression = [ + measure.gene_set_expression( + d, + genes=genes, + starts_with=starts_with, + ends_with=ends_with, + regex=regex, + library_size_normalize=library_size_normalize, + ) + for d in data + ] + return histogram( + expression, + cutoff=cutoff, + percentile=percentile, + bins=bins, + log=log, + ax=ax, + figsize=figsize, + xlabel=xlabel, + title=title, + fontsize=fontsize, + filename=filename, + dpi=dpi, + **kwargs + ) diff --git a/scprep/plot/jitter.py b/scprep/plot/jitter.py index 8a1469d9..35689d6f 100644 --- a/scprep/plot/jitter.py +++ 
b/scprep/plot/jitter.py @@ -2,15 +2,13 @@ import pandas as pd from .. import utils -from .utils import (_get_figure, show, - temp_fontsize, parse_fontsize, _with_default) +from .utils import _get_figure, show, temp_fontsize, parse_fontsize, _with_default from .tools import label_axis, generate_colorbar, generate_legend from .scatter import _ScatterParams class _JitterParams(_ScatterParams): - @property def x_labels(self): try: @@ -27,32 +25,43 @@ def x_coords(self): @utils._with_pkg(pkg="matplotlib", min_version=3) -def jitter(labels, values, sigma=0.1, - c=None, cmap=None, cmap_scale='linear', - s=None, mask=None, - plot_means=True, means_s=100, means_c='lightgrey', - discrete=None, - ax=None, - legend=None, colorbar=None, - shuffle=True, - figsize=None, - ticks=True, - xticks=None, - yticks=None, - ticklabels=True, - xticklabels=None, - yticklabels=None, - xlabel=None, - ylabel=None, - title=None, - fontsize=None, - legend_title=None, - legend_loc='best', - legend_anchor=None, - vmin=None, vmax=None, - filename=None, - dpi=None, - **plot_kwargs): +def jitter( + labels, + values, + sigma=0.1, + c=None, + cmap=None, + cmap_scale="linear", + s=None, + mask=None, + plot_means=True, + means_s=100, + means_c="lightgrey", + discrete=None, + ax=None, + legend=None, + colorbar=None, + shuffle=True, + figsize=None, + ticks=True, + xticks=None, + yticks=None, + ticklabels=True, + xticklabels=None, + yticklabels=None, + xlabel=None, + ylabel=None, + title=None, + fontsize=None, + legend_title=None, + legend_loc="best", + legend_anchor=None, + vmin=None, + vmax=None, + filename=None, + dpi=None, + **plot_kwargs +): """Creates a 2D scatterplot showing the distribution of `values` for points that have associated `labels`. @@ -151,33 +160,56 @@ def jitter(labels, values, sigma=0.1, """ with temp_fontsize(fontsize): params = _JitterParams( - labels, values, c=c, discrete=discrete, - cmap=cmap, cmap_scale=cmap_scale, - vmin=vmin, vmax=vmax, s=s, mask=mask, - legend=legend, colorbar=colorbar, - xlabel=xlabel, ylabel=ylabel) + labels, + values, + c=c, + discrete=discrete, + cmap=cmap, + cmap_scale=cmap_scale, + vmin=vmin, + vmax=vmax, + s=s, + mask=mask, + legend=legend, + colorbar=colorbar, + xlabel=xlabel, + ylabel=ylabel, + ) - fig, ax, show_fig = _get_figure( - ax, figsize, subplot_kw=params.subplot_kw) + fig, ax, show_fig = _get_figure(ax, figsize, subplot_kw=params.subplot_kw) # Plotting cells sc = ax.scatter( params.x_coords + np.random.normal(0, sigma, params.size)[params.plot_idx], - params.y, c=params.c, - cmap=params.cmap, norm=params.norm, s=params.s, - vmin=params.vmin, vmax=params.vmax, **plot_kwargs) + params.y, + c=params.c, + cmap=params.cmap, + norm=params.norm, + s=params.s, + vmin=params.vmin, + vmax=params.vmax, + **plot_kwargs + ) # Plotting means if plot_means: - ax.scatter(np.arange(len(params.x_labels)), - [np.nanmean(params.y[params.x_coords == i]) - for i in range(len(params.x_labels))], - c=means_c, edgecolors='black', lw=1.5, - marker='o', zorder=3, s=means_s) + ax.scatter( + np.arange(len(params.x_labels)), + [ + np.nanmean(params.y[params.x_coords == i]) + for i in range(len(params.x_labels)) + ], + c=means_c, + edgecolors="black", + lw=1.5, + marker="o", + zorder=3, + s=means_s, + ) # Plotting vetical lines for i in range(len(params.x_labels)): - ax.axvline(i, c='k', lw=.1, zorder=0) + ax.axvline(i, c="k", lw=0.1, zorder=0) # x axis labels xticks = _with_default(xticks, ticks) @@ -189,8 +221,12 @@ def jitter(labels, values, sigma=0.1, # label axes label_axis(ax.xaxis, xticks, 
xticklabels, params.xlabel) - label_axis(ax.yaxis, _with_default(yticks, ticks), - _with_default(yticklabels, ticklabels), params.ylabel) + label_axis( + ax.yaxis, + _with_default(yticks, ticks), + _with_default(yticklabels, ticklabels), + params.ylabel, + ) # manually set x limits xmin = np.min(params.x_coords) @@ -198,20 +234,31 @@ def jitter(labels, values, sigma=0.1, ax.set_xlim(xmin - 0.5, xmax + 0.5) if title is not None: - ax.set_title(title, fontsize=parse_fontsize(None, 'xx-large')) + ax.set_title(title, fontsize=parse_fontsize(None, "xx-large")) # generate legend if params.legend: if params.discrete: - generate_legend({params.labels[i]: sc.cmap(sc.norm(i)) - for i in range(len(params.labels))}, ax=ax, - loc=legend_loc, bbox_to_anchor=legend_anchor, - title=legend_title) + generate_legend( + { + params.labels[i]: sc.cmap(sc.norm(i)) + for i in range(len(params.labels)) + }, + ax=ax, + loc=legend_loc, + bbox_to_anchor=legend_anchor, + title=legend_title, + ) else: - generate_colorbar(params.cmap, ax=ax, - vmin=params.vmin, vmax=params.vmax, - title=legend_title, extend=params.extend, - scale=sc.norm) + generate_colorbar( + params.cmap, + ax=ax, + vmin=params.vmin, + vmax=params.vmax, + title=legend_title, + extend=params.extend, + scale=sc.norm, + ) # save and show if show_fig: diff --git a/scprep/plot/marker.py b/scprep/plot/marker.py index 33f8a418..5d71484f 100644 --- a/scprep/plot/marker.py +++ b/scprep/plot/marker.py @@ -3,15 +3,19 @@ from scipy.cluster import hierarchy from .. import utils, stats, select -from .utils import (_get_figure, show, - temp_fontsize, parse_fontsize, shift_ticklabels) +from .utils import _get_figure, show, temp_fontsize, parse_fontsize, shift_ticklabels from .tools import label_axis -def _make_scatter_arrays(data_clust, cluster_names, - tissues, markers, - gene_names, - normalize_emd, normalize_expression): +def _make_scatter_arrays( + data_clust, + cluster_names, + tissues, + markers, + gene_names, + normalize_emd, + normalize_expression, +): cluster_labels = [] marker_labels = [] tissue_labels = [] @@ -33,8 +37,7 @@ def _make_scatter_arrays(data_clust, cluster_names, tissue_labels.append(tissues[j]) gidx = np.where(gene_names == marker) marker_expr = in_cluster_expr[:, gidx] - s_row.append(stats.EMD(marker_expr, - out_cluster_expr[:, gidx])) + s_row.append(stats.EMD(marker_expr, out_cluster_expr[:, gidx])) c_row.append(np.mean(marker_expr)) # row normalize s_row = np.array(s_row) @@ -54,34 +57,30 @@ def _make_scatter_arrays(data_clust, cluster_names, return x, y, c, s, cluster_labels, tissue_labels, marker_labels -def _cluster_tissues(tissue_names, cluster_names, - tissue_labels, cluster_labels, - s, c): +def _cluster_tissues(tissue_names, cluster_names, tissue_labels, cluster_labels, s, c): # cluster tissues hierarchically using mean size and color tissue_features = [] for tissue in tissue_names: tissue_data = [] for cluster in cluster_names: tissue_cluster_idx = np.where( - (np.array(tissue_labels) == tissue) & ( - np.array(cluster_labels) == cluster)) - tissue_data.append(np.vstack( - [s[tissue_cluster_idx], - c[tissue_cluster_idx]]).mean(axis=1)) + (np.array(tissue_labels) == tissue) + & (np.array(cluster_labels) == cluster) + ) + tissue_data.append( + np.vstack([s[tissue_cluster_idx], c[tissue_cluster_idx]]).mean(axis=1) + ) tissue_features.append(np.concatenate(tissue_data)) tissue_features = np.array(tissue_features) # normalize - tissue_features = tissue_features / \ - np.sqrt(np.sum(tissue_features ** 2)) - tissues_order = 
hierarchy.leaves_list( - hierarchy.linkage(tissue_features)) + tissue_features = tissue_features / np.sqrt(np.sum(tissue_features ** 2)) + tissues_order = hierarchy.leaves_list(hierarchy.linkage(tissue_features)) return tissues_order -def _cluster_markers(markers, tissues, - marker_labels, tissue_labels, - marker_groups_order, - s, c): +def _cluster_markers( + markers, tissues, marker_labels, tissue_labels, marker_groups_order, s, c +): # cluster markers hierarchically using mean size and color markers_order = [] for marker_group in marker_groups_order: @@ -93,15 +92,15 @@ def _cluster_markers(markers, tissues, if tissues is not None: # check for markers that appear in multiple tissues marker_idx = marker_idx & ( - tissue_labels == tissues[marker_group[0]]) - marker_features.append(np.concatenate( - [s[marker_idx], c[marker_idx]])) + tissue_labels == tissues[marker_group[0]] + ) + marker_features.append(np.concatenate([s[marker_idx], c[marker_idx]])) marker_features = np.array(marker_features) # normalize - marker_features = marker_features / \ - np.sqrt(np.sum(marker_features ** 2)) + marker_features = marker_features / np.sqrt(np.sum(marker_features ** 2)) marker_group_order = hierarchy.leaves_list( - hierarchy.linkage(marker_features)) + hierarchy.linkage(marker_features) + ) markers_order.append(marker_group[marker_group_order]) else: markers_order.append(marker_group) @@ -110,13 +109,21 @@ def _cluster_markers(markers, tissues, @utils._with_pkg(pkg="matplotlib", min_version=3) -def marker_plot(data, clusters, markers, gene_names=None, - normalize_expression=True, normalize_emd=True, - reorder_tissues=True, - reorder_markers=True, - cmap='magma', - title=None, figsize=None, - ax=None, fontsize=None): +def marker_plot( + data, + clusters, + markers, + gene_names=None, + normalize_expression=True, + normalize_emd=True, + reorder_tissues=True, + reorder_markers=True, + cmap="magma", + title=None, + figsize=None, + ax=None, + fontsize=None, +): """Marker gene enrichment plot Generate a plot indicating the expression level and enrichment of @@ -175,12 +182,14 @@ def marker_plot(data, clusters, markers, gene_names=None, raise ValueError( "Either `data` must be a pd.DataFrame, or gene_names must " "be provided. " - "Got gene_names=None, data as a {}".format(type(data))) + "Got gene_names=None, data as a {}".format(type(data)) + ) gene_names = data.columns if isinstance(markers, dict): tissues, markers = tuple( - zip(*[([k] * len(v), v) for k, v in markers.items()])) + zip(*[([k] * len(v), v) for k, v in markers.items()]) + ) tissues, markers = np.concatenate(tissues), np.concatenate(markers) else: markers = utils.toarray(markers) @@ -188,8 +197,10 @@ def marker_plot(data, clusters, markers, gene_names=None, for gene in markers: if gene not in gene_names: - raise ValueError('All genes in `markers` must appear ' - 'in gene_names. Did not find: {}'.format(gene)) + raise ValueError( + "All genes in `markers` must appear " + "in gene_names. 
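A usage sketch of `marker_plot` matching the signature above; the gene symbols, cluster labels, and cell-type names are hypothetical, chosen only to exercise the dictionary form of `markers`:

    import numpy as np
    import pandas as pd
    import scprep

    genes = ["Cd19", "Ms4a1", "Cd3e", "Cd8a"]  # hypothetical marker genes
    data = pd.DataFrame(np.random.poisson(1, (300, 4)), columns=genes)
    clusters = np.random.choice(["c1", "c2", "c3"], 300)
    # dict keys become the tissue/type labels on the right-hand axis
    markers = {"B cell": ["Cd19", "Ms4a1"], "T cell": ["Cd3e", "Cd8a"]}
    scprep.plot.marker_plot(data, clusters, markers)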
Did not find: {}".format(gene) + ) data = utils.to_array_or_spmatrix(data) @@ -208,35 +219,52 @@ def marker_plot(data, clusters, markers, gene_names=None, out_cluster_expr = data[~in_cluster] data_clust[cluster] = (in_cluster_expr, out_cluster_expr) - (x, y, c, s, cluster_labels, - tissue_labels, marker_labels) = _make_scatter_arrays( - data_clust, cluster_names, - tissues, markers, + ( + x, + y, + c, + s, + cluster_labels, + tissue_labels, + marker_labels, + ) = _make_scatter_arrays( + data_clust, + cluster_names, + tissues, + markers, gene_names, - normalize_emd, normalize_expression) + normalize_emd, + normalize_expression, + ) # reorder y axis if tissues is not None and len(tissues) > 1: tissue_names = np.unique(tissues) if reorder_tissues: - tissues_order = _cluster_tissues(tissue_names, cluster_names, - tissue_labels, cluster_labels, - s, c) + tissues_order = _cluster_tissues( + tissue_names, cluster_names, tissue_labels, cluster_labels, s, c + ) else: # keep tissues in order tissues_order = np.arange(len(tissue_names)) marker_groups_order = [ np.arange(len(markers))[tissues == tissue_names[i]] - for i in tissues_order] + for i in tissues_order + ] else: # only one tissue marker_groups_order = [np.arange(len(markers))] if reorder_markers and len(markers) > 1: - markers_order = _cluster_markers(markers, tissues, - marker_labels, tissue_labels, - marker_groups_order, - s, c) + markers_order = _cluster_markers( + markers, + tissues, + marker_labels, + tissue_labels, + marker_groups_order, + s, + c, + ) else: # keep markers in order markers_order = np.concatenate(marker_groups_order) @@ -252,34 +280,42 @@ def marker_plot(data, clusters, markers, gene_names=None, # Vertical and Horizontal Grid Lines for h in np.unique(y): - ax.axhline(h, c='k', linewidth=0.1, zorder=0) + ax.axhline(h, c="k", linewidth=0.1, zorder=0) for v in np.unique(x): - ax.axvline(v, c='k', linewidth=0.1, zorder=0) + ax.axvline(v, c="k", linewidth=0.1, zorder=0) ax.set_ylim(-0.5, len(markers) - 0.5) # Title - title_fontsize = parse_fontsize(None, 'xx-large') - ax.set_title(title, fontsize=title_fontsize, fontweight='bold') + title_fontsize = parse_fontsize(None, "xx-large") + ax.set_title(title, fontsize=title_fontsize, fontweight="bold") # X axis decorators x_unique, x_unique_idx = np.unique(x, return_index=True) - label_axis(ax.xaxis, label='Cluster', ticks=x_unique, - ticklabels=np.array(cluster_labels)[x_unique_idx], - ticklabel_rotation=45, - ticklabel_horizontal_alignment='right') + label_axis( + ax.xaxis, + label="Cluster", + ticks=x_unique, + ticklabels=np.array(cluster_labels)[x_unique_idx], + ticklabel_rotation=45, + ticklabel_horizontal_alignment="right", + ) shift_ticklabels(ax.xaxis, dx=0.1) # Y axis decorators - label_axis(ax.yaxis, ticks=np.arange(len(markers)), - ticklabels=markers[markers_order]) + label_axis( + ax.yaxis, ticks=np.arange(len(markers)), ticklabels=markers[markers_order] + ) if tissues is not None: # Right Y axis decorators ax2 = ax.twinx() ax2.set_ylim(ax.get_ylim()) - label_axis(ax2.yaxis, ticks=np.arange(len(tissues)), - ticklabels=tissues[markers_order]) + label_axis( + ax2.yaxis, + ticks=np.arange(len(tissues)), + ticklabels=tissues[markers_order], + ) if show_fig: show(fig) diff --git a/scprep/plot/scatter.py b/scprep/plot/scatter.py index 254c76d5..812c900e 100644 --- a/scprep/plot/scatter.py +++ b/scprep/plot/scatter.py @@ -4,14 +4,26 @@ import warnings from .. 
import utils, select -from .utils import (_get_figure, _is_color_array, - show, _in_ipynb, parse_fontsize, temp_fontsize, - _with_default) -from .tools import (create_colormap, create_normalize, - label_axis, generate_colorbar, generate_legend) +from .utils import ( + _get_figure, + _is_color_array, + show, + _in_ipynb, + parse_fontsize, + temp_fontsize, + _with_default, +) +from .tools import ( + create_colormap, + create_normalize, + label_axis, + generate_colorbar, + generate_legend, +) from . import colors from .._lazyload import matplotlib as mpl + plt = mpl.pyplot @@ -25,13 +37,27 @@ def _squeeze_array(x): class _ScatterParams(object): - - def __init__(self, x, y, z=None, c=None, mask=None, - discrete=None, - cmap=None, cmap_scale=None, vmin=None, - vmax=None, s=None, legend=None, colorbar=None, - xlabel=None, ylabel=None, zlabel=None, - label_prefix=None, shuffle=True): + def __init__( + self, + x, + y, + z=None, + c=None, + mask=None, + discrete=None, + cmap=None, + cmap_scale=None, + vmin=None, + vmax=None, + s=None, + legend=None, + colorbar=None, + xlabel=None, + ylabel=None, + zlabel=None, + label_prefix=None, + shuffle=True, + ): self._x = x self._y = y self._z = z if z is not None else None @@ -141,8 +167,7 @@ def array_c(self): try: return self._array_c except AttributeError: - self._array_c = (not self.constant_c()) and _is_color_array( - self._c) + self._array_c = (not self.constant_c()) and _is_color_array(self._c) return self._array_c @property @@ -185,9 +210,9 @@ def discrete(self): if self.constant_c() or self.array_c(): return None else: - if isinstance(self._cmap, dict) or not \ - np.all([isinstance(x, numbers.Number) - for x in self._c_masked]): + if isinstance(self._cmap, dict) or not np.all( + [isinstance(x, numbers.Number) for x in self._c_masked] + ): # cmap dictionary or non-numeric values force discrete return True else: @@ -208,14 +233,16 @@ def c_discrete(self): if self._c_discrete is None: if isinstance(self._cmap, dict): self._labels = np.array( - [k for k in self._cmap.keys() if k in self.c_unique]) + [k for k in self._cmap.keys() if k in self.c_unique] + ) self._c_discrete = np.zeros_like(self._c, dtype=int) for i, label in enumerate(self._labels): self._c_discrete[self._c == label] = i else: self._c_discrete = np.zeros_like(self._c, dtype=int) self._c_discrete[self._mask], self._labels = pd.factorize( - self._c_masked, sort=True) + self._c_masked, sort=True + ) return self._c_discrete @property @@ -272,15 +299,15 @@ def vmax(self): def list_cmap(self): """Is the colormap a list?""" - return hasattr(self._cmap, '__len__') and \ - not isinstance(self._cmap, (str, dict)) + return hasattr(self._cmap, "__len__") and not isinstance( + self._cmap, (str, dict) + ) def process_string_cmap(self, cmap): """If necessary, subset a discrete colormap based on the number of colors""" cmap = mpl.cm.get_cmap(cmap) if self.discrete and cmap.N <= 20 and self.n_c_unique <= cmap.N: - return mpl.colors.ListedColormap( - cmap.colors[:self.n_c_unique]) + return mpl.colors.ListedColormap(cmap.colors[: self.n_c_unique]) else: return cmap @@ -289,7 +316,8 @@ def cmap(self): if self._cmap is not None: if isinstance(self._cmap, dict): return mpl.colors.ListedColormap( - [mpl.colors.to_rgba(self._cmap[l]) for l in self.labels]) + [mpl.colors.to_rgba(self._cmap[l]) for l in self.labels] + ) elif self.list_cmap(): return create_colormap(self._cmap) elif isinstance(self._cmap, str): @@ -302,7 +330,7 @@ def cmap(self): elif self.discrete: return colors.tab(n=self.n_c_unique) else: - 
return self.process_string_cmap('inferno') + return self.process_string_cmap("inferno") @property def cmap_scale(self): @@ -312,11 +340,11 @@ def cmap_scale(self): if self.discrete or not self.legend: return None else: - return 'linear' + return "linear" @property def norm(self): - if self.cmap_scale is not None and self.cmap_scale != 'linear': + if self.cmap_scale is not None and self.cmap_scale != "linear": return create_normalize(self.vmin, self.vmax, self.cmap_scale) else: return None @@ -328,16 +356,16 @@ def extend(self): extend_min = np.min(self.c) < self.vmin extend_max = np.max(self.c) > self.vmax if extend_min: - return 'both' if extend_max else 'min' + return "both" if extend_max else "min" else: - return 'max' if extend_max else 'neither' + return "max" if extend_max else "neither" else: return None @property def subplot_kw(self): if self.z is not None: - return {'projection': '3d'} + return {"projection": "3d"} else: return {} @@ -347,7 +375,8 @@ def check_vmin_vmax(self): warnings.warn( "Cannot set `vmin` or `vmax` with constant `c={}`. " "Setting `vmin = vmax = None`.".format(self.c), - UserWarning) + UserWarning, + ) self._vmin = None self._vmax = None elif self.discrete: @@ -355,7 +384,8 @@ def check_vmin_vmax(self): warnings.warn( "Cannot set `vmin` or `vmax` with discrete data. " "Setting to `None`.", - UserWarning) + UserWarning, + ) self._vmin = None self._vmax = None @@ -365,8 +395,8 @@ def check_legend(self): if self._legend is not None and self._legend != self._colorbar: raise ValueError( "Received conflicting values for synonyms " - "`legend={}` and `colorbar={}`".format( - self._legend, self._colorbar)) + "`legend={}` and `colorbar={}`".format(self._legend, self._colorbar) + ) else: self._legend = self._colorbar if self._legend: @@ -375,12 +405,14 @@ def check_legend(self): "`c` is a color array and cannot be used to create a " "legend. To interpret these values as labels instead, " "provide a `cmap` dictionary with label-color pairs.", - UserWarning) + UserWarning, + ) self._legend = False elif self.constant_c(): warnings.warn( - "Cannot create a legend with constant `c={}`".format( - self.c), UserWarning) + "Cannot create a legend with constant `c={}`".format(self.c), + UserWarning, + ) self._legend = False def check_size(self): @@ -389,76 +421,91 @@ def check_size(self): if len(d) != self.size: raise ValueError( "Expected all axes of data to have the same length" - ". Got {}".format([len(d) for d in self._data])) + ". Got {}".format([len(d) for d in self._data]) + ) def check_c(self): if not self.constant_c(): self._c = _squeeze_array(self._c) if not len(self._c) == self.size: - raise ValueError("Expected c of length {} or 1. Got {}".format( - self.size, len(self._c))) + raise ValueError( + "Expected c of length {} or 1. Got {}".format( + self.size, len(self._c) + ) + ) def check_mask(self): if self._mask is not None: self._mask = _squeeze_array(self._mask) if not len(self._mask) == self.size: - raise ValueError("Expected mask of length {}. Got {}".format( - self.size, len(self._mask))) + raise ValueError( + "Expected mask of length {}. Got {}".format( + self.size, len(self._mask) + ) + ) def check_s(self): if self._s is not None and not isinstance(self._s, numbers.Number): self._s = _squeeze_array(self._s) if not len(self._s) == self.size: - raise ValueError("Expected s of length {} or 1. Got {}".format( - self.size, len(self._s))) + raise ValueError( + "Expected s of length {} or 1. 
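A sketch of the dictionary-`cmap` path validated by `check_cmap` above (random points, illustrative labels):

    import numpy as np
    import scprep

    data = np.random.normal(0, 1, (200, 2))
    labels = np.where(data[:, 0] > 0, "pos", "neg")
    # a dictionary cmap must name a color for every unique label in `c`,
    # otherwise check_cmap raises the ValueError shown above
    scprep.plot.scatter2d(data, c=labels, cmap={"pos": "red", "neg": "#1f77b4"})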
Got {}".format( + self.size, len(self._s) + ) + ) def check_discrete(self): if self._discrete is False: if not np.all([isinstance(x, numbers.Number) for x in self._c]): - raise ValueError( - "Cannot treat non-numeric data as continuous.") + raise ValueError("Cannot treat non-numeric data as continuous.") def check_cmap(self): if isinstance(self._cmap, dict): # dictionary cmap if self.constant_c() or self.array_c(): - raise ValueError("Expected list-like `c` with dictionary cmap." - " Got {}".format(type(self._c))) + raise ValueError( + "Expected list-like `c` with dictionary cmap." + " Got {}".format(type(self._c)) + ) elif not self.discrete: - raise ValueError("Cannot use dictionary cmap with " - "continuous data.") + raise ValueError("Cannot use dictionary cmap with " "continuous data.") elif np.any([l not in self._cmap for l in np.unique(self._c)]): - missing = set(np.unique(self._c).tolist() - ).difference(self._cmap.keys()) + missing = set(np.unique(self._c).tolist()).difference(self._cmap.keys()) raise ValueError( "Dictionary cmap requires a color " "for every unique entry in `c`. " "Missing colors for [{}]".format( - ", ".join([str(l) for l in missing]))) + ", ".join([str(l) for l in missing]) + ) + ) elif self.list_cmap(): if self.constant_c() or self.array_c(): - raise ValueError("Expected list-like `c` with list cmap. " - "Got {}".format(type(self._c))) + raise ValueError( + "Expected list-like `c` with list cmap. " + "Got {}".format(type(self._c)) + ) def check_cmap_scale(self): - if self._cmap_scale is not None and self._cmap_scale != 'linear': + if self._cmap_scale is not None and self._cmap_scale != "linear": if self.array_c(): warnings.warn( - "Cannot use non-linear `cmap_scale` with " - "`c` as a color array.", - UserWarning) - self._cmap_scale = 'linear' + "Cannot use non-linear `cmap_scale` with " "`c` as a color array.", + UserWarning, + ) + self._cmap_scale = "linear" elif self.constant_c(): warnings.warn( "Cannot use non-linear `cmap_scale` with constant " "`c={}`.".format(self._c), - UserWarning) - self._cmap_scale = 'linear' + UserWarning, + ) + self._cmap_scale = "linear" elif self.discrete: warnings.warn( "Cannot use non-linear `cmap_scale` with discrete data.", - UserWarning) - self._cmap_scale = 'linear' + UserWarning, + ) + self._cmap_scale = "linear" @property def xlabel(self): @@ -497,36 +544,46 @@ def zlabel(self): @utils._with_pkg(pkg="matplotlib", min_version=3) -def scatter(x, y, z=None, - c=None, cmap=None, cmap_scale='linear', s=None, - mask=None, - discrete=None, - ax=None, - legend=None, colorbar=None, - shuffle=True, - figsize=None, - ticks=True, - xticks=None, - yticks=None, - zticks=None, - ticklabels=True, - xticklabels=None, - yticklabels=None, - zticklabels=None, - label_prefix=None, - xlabel=None, - ylabel=None, - zlabel=None, - title=None, - fontsize=None, - legend_title=None, - legend_loc='best', - legend_anchor=None, - vmin=None, vmax=None, - elev=None, azim=None, - filename=None, - dpi=None, - **plot_kwargs): +def scatter( + x, + y, + z=None, + c=None, + cmap=None, + cmap_scale="linear", + s=None, + mask=None, + discrete=None, + ax=None, + legend=None, + colorbar=None, + shuffle=True, + figsize=None, + ticks=True, + xticks=None, + yticks=None, + zticks=None, + ticklabels=True, + xticklabels=None, + yticklabels=None, + zticklabels=None, + label_prefix=None, + xlabel=None, + ylabel=None, + zlabel=None, + title=None, + fontsize=None, + legend_title=None, + legend_loc="best", + legend_anchor=None, + vmin=None, + vmax=None, + elev=None, + 
azim=None, + filename=None, + dpi=None, + **plot_kwargs +): """Create a scatter plot Builds upon `matplotlib.pyplot.scatter` with nice defaults @@ -645,46 +702,87 @@ def scatter(x, y, z=None, """ with temp_fontsize(fontsize): params = _ScatterParams( - x, y, z, c=c, mask=mask, discrete=discrete, - cmap=cmap, cmap_scale=cmap_scale, - vmin=vmin, vmax=vmax, s=s, - legend=legend, colorbar=colorbar, - xlabel=xlabel, ylabel=ylabel, zlabel=zlabel, - label_prefix=label_prefix, shuffle=shuffle) - - fig, ax, show_fig = _get_figure( - ax, figsize, subplot_kw=params.subplot_kw) + x, + y, + z, + c=c, + mask=mask, + discrete=discrete, + cmap=cmap, + cmap_scale=cmap_scale, + vmin=vmin, + vmax=vmax, + s=s, + legend=legend, + colorbar=colorbar, + xlabel=xlabel, + ylabel=ylabel, + zlabel=zlabel, + label_prefix=label_prefix, + shuffle=shuffle, + ) + + fig, ax, show_fig = _get_figure(ax, figsize, subplot_kw=params.subplot_kw) # plot! sc = ax.scatter( *(params.data), - c=params.c, cmap=params.cmap, norm=params.norm, s=params.s, - vmin=params.vmin, vmax=params.vmax, **plot_kwargs) + c=params.c, + cmap=params.cmap, + norm=params.norm, + s=params.s, + vmin=params.vmin, + vmax=params.vmax, + **plot_kwargs + ) # label axes - label_axis(ax.xaxis, _with_default(xticks, ticks), - _with_default(xticklabels, ticklabels), params.xlabel) - label_axis(ax.yaxis, _with_default(yticks, ticks), - _with_default(yticklabels, ticklabels), params.ylabel) + label_axis( + ax.xaxis, + _with_default(xticks, ticks), + _with_default(xticklabels, ticklabels), + params.xlabel, + ) + label_axis( + ax.yaxis, + _with_default(yticks, ticks), + _with_default(yticklabels, ticklabels), + params.ylabel, + ) if z is not None: - label_axis(ax.zaxis, _with_default(zticks, ticks), - _with_default(zticklabels, ticklabels), params.zlabel) + label_axis( + ax.zaxis, + _with_default(zticks, ticks), + _with_default(zticklabels, ticklabels), + params.zlabel, + ) if title is not None: - ax.set_title(title, fontsize=parse_fontsize(None, 'xx-large')) + ax.set_title(title, fontsize=parse_fontsize(None, "xx-large")) # generate legend if params.legend: if params.discrete: - generate_legend({params.labels[i]: sc.cmap(sc.norm(i)) - for i in range(len(params.labels))}, ax=ax, - loc=legend_loc, bbox_to_anchor=legend_anchor, - title=legend_title) + generate_legend( + { + params.labels[i]: sc.cmap(sc.norm(i)) + for i in range(len(params.labels)) + }, + ax=ax, + loc=legend_loc, + bbox_to_anchor=legend_anchor, + title=legend_title, + ) else: - generate_colorbar(params.cmap, ax=ax, - vmin=params.vmin, vmax=params.vmax, - title=legend_title, extend=params.extend, - scale=sc.norm) + generate_colorbar( + params.cmap, + ax=ax, + vmin=params.vmin, + vmax=params.vmax, + title=legend_title, + extend=params.extend, + scale=sc.norm, + ) # set viewpoint if z is not None: @@ -699,29 +797,37 @@ def scatter(x, y, z=None, @utils._with_pkg(pkg="matplotlib", min_version=3) -def scatter2d(data, - c=None, cmap=None, cmap_scale='linear', s=None, - mask=None, - discrete=None, - ax=None, legend=None, colorbar=None, - shuffle=True, figsize=None, - ticks=True, - xticks=None, - yticks=None, - ticklabels=True, - xticklabels=None, - yticklabels=None, - label_prefix=None, - xlabel=None, - ylabel=None, - title=None, - fontsize=None, - legend_title=None, - legend_loc='best', - legend_anchor=None, - filename=None, - dpi=None, - **plot_kwargs): +def scatter2d( + data, + c=None, + cmap=None, + cmap_scale="linear", + s=None, + mask=None, + discrete=None, + ax=None, + legend=None, + colorbar=None, + 
shuffle=True, + figsize=None, + ticks=True, + xticks=None, + yticks=None, + ticklabels=True, + xticklabels=None, + yticklabels=None, + label_prefix=None, + xlabel=None, + ylabel=None, + title=None, + fontsize=None, + legend_title=None, + legend_loc="best", + legend_anchor=None, + filename=None, + dpi=None, + **plot_kwargs +): """Create a 2D scatter plot Builds upon `matplotlib.pyplot.scatter` with nice defaults @@ -832,61 +938,77 @@ def scatter2d(data, """ if isinstance(data, list): data = utils.toarray(data) - return scatter(x=select.select_cols(data, idx=0), - y=select.select_cols(data, idx=1), - c=c, cmap=cmap, cmap_scale=cmap_scale, s=s, - mask=mask, - discrete=discrete, - ax=ax, legend=legend, colorbar=colorbar, - shuffle=shuffle, figsize=figsize, - ticks=ticks, - xticks=xticks, - yticks=yticks, - ticklabels=ticklabels, - xticklabels=xticklabels, - yticklabels=yticklabels, - label_prefix=label_prefix, - xlabel=xlabel, - ylabel=ylabel, - title=title, - fontsize=fontsize, - legend_title=legend_title, - legend_loc=legend_loc, - legend_anchor=legend_anchor, - filename=filename, - dpi=dpi, - **plot_kwargs) + return scatter( + x=select.select_cols(data, idx=0), + y=select.select_cols(data, idx=1), + c=c, + cmap=cmap, + cmap_scale=cmap_scale, + s=s, + mask=mask, + discrete=discrete, + ax=ax, + legend=legend, + colorbar=colorbar, + shuffle=shuffle, + figsize=figsize, + ticks=ticks, + xticks=xticks, + yticks=yticks, + ticklabels=ticklabels, + xticklabels=xticklabels, + yticklabels=yticklabels, + label_prefix=label_prefix, + xlabel=xlabel, + ylabel=ylabel, + title=title, + fontsize=fontsize, + legend_title=legend_title, + legend_loc=legend_loc, + legend_anchor=legend_anchor, + filename=filename, + dpi=dpi, + **plot_kwargs + ) @utils._with_pkg(pkg="matplotlib", min_version=3) -def scatter3d(data, - c=None, cmap=None, cmap_scale='linear', s=None, - mask=None, - discrete=None, - ax=None, legend=None, colorbar=None, - shuffle=True, - figsize=None, - ticks=True, - xticks=None, - yticks=None, - zticks=None, - ticklabels=True, - xticklabels=None, - yticklabels=None, - zticklabels=None, - label_prefix=None, - xlabel=None, - ylabel=None, - zlabel=None, - title=None, - fontsize=None, - legend_title=None, - legend_loc='best', - legend_anchor=None, - elev=None, azim=None, - filename=None, - dpi=None, - **plot_kwargs): +def scatter3d( + data, + c=None, + cmap=None, + cmap_scale="linear", + s=None, + mask=None, + discrete=None, + ax=None, + legend=None, + colorbar=None, + shuffle=True, + figsize=None, + ticks=True, + xticks=None, + yticks=None, + zticks=None, + ticklabels=True, + xticklabels=None, + yticklabels=None, + zticklabels=None, + label_prefix=None, + xlabel=None, + ylabel=None, + zlabel=None, + title=None, + fontsize=None, + legend_title=None, + legend_loc="best", + legend_anchor=None, + elev=None, + azim=None, + filename=None, + dpi=None, + **plot_kwargs +): """Create a 3D scatter plot Builds upon `matplotlib.pyplot.scatter` with nice defaults @@ -1007,45 +1129,58 @@ def scatter3d(data, z = select.select_cols(data, idx=2) except IndexError: raise ValueError("Expected data.shape[1] >= 3. 
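A usage sketch of `scatter3d` matching the signature above (random points, illustrative):

    import numpy as np
    import scprep

    data = np.random.normal(0, 1, (200, 3))
    # the first three columns become x, y, z; narrower input triggers the
    # ValueError raised here
    scprep.plot.scatter3d(data, c=data[:, 2], elev=20, azim=60)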
Got {}".format(data.shape[1])) - return scatter(x=x, y=y, z=z, - c=c, cmap=cmap, cmap_scale=cmap_scale, s=s, mask=mask, - discrete=discrete, - ax=ax, legend=legend, colorbar=colorbar, - shuffle=shuffle, figsize=figsize, - ticks=ticks, - xticks=xticks, - yticks=yticks, - zticks=zticks, - ticklabels=ticklabels, - xticklabels=xticklabels, - yticklabels=yticklabels, - zticklabels=zticklabels, - label_prefix=label_prefix, - xlabel=xlabel, - ylabel=ylabel, - zlabel=zlabel, - title=title, - fontsize=fontsize, - legend_title=legend_title, - legend_loc=legend_loc, - legend_anchor=legend_anchor, - elev=elev, - azim=azim, - filename=filename, - dpi=dpi, - **plot_kwargs) + return scatter( + x=x, + y=y, + z=z, + c=c, + cmap=cmap, + cmap_scale=cmap_scale, + s=s, + mask=mask, + discrete=discrete, + ax=ax, + legend=legend, + colorbar=colorbar, + shuffle=shuffle, + figsize=figsize, + ticks=ticks, + xticks=xticks, + yticks=yticks, + zticks=zticks, + ticklabels=ticklabels, + xticklabels=xticklabels, + yticklabels=yticklabels, + zticklabels=zticklabels, + label_prefix=label_prefix, + xlabel=xlabel, + ylabel=ylabel, + zlabel=zlabel, + title=title, + fontsize=fontsize, + legend_title=legend_title, + legend_loc=legend_loc, + legend_anchor=legend_anchor, + elev=elev, + azim=azim, + filename=filename, + dpi=dpi, + **plot_kwargs + ) @utils._with_pkg(pkg="matplotlib", min_version=3) -def rotate_scatter3d(data, - filename=None, - rotation_speed=30, - fps=10, - ax=None, - figsize=None, - ipython_html="jshtml", - dpi=None, - **kwargs): +def rotate_scatter3d( + data, + filename=None, + rotation_speed=30, + fps=10, + ax=None, + figsize=None, + ipython_html="jshtml", + dpi=None, + **kwargs +): """Create a rotating 3D scatter plot Builds upon `matplotlib.pyplot.scatter` with nice defaults @@ -1097,19 +1232,19 @@ def rotate_scatter3d(data, if _in_ipynb(): # credit to # http://tiao.io/posts/notebooks/save-matplotlib-animations-as-gifs/ - mpl.rc('animation', html=ipython_html) + mpl.rc("animation", html=ipython_html) if filename is not None: if filename.endswith(".gif"): - writer = 'imagemagick' + writer = "imagemagick" elif filename.endswith(".mp4"): writer = "ffmpeg" else: raise ValueError( - "filename must end in .gif or .mp4. Got {}".format(filename)) + "filename must end in .gif or .mp4. Got {}".format(filename) + ) - fig, ax, show_fig = _get_figure( - ax, figsize, subplot_kw={'projection': '3d'}) + fig, ax, show_fig = _get_figure(ax, figsize, subplot_kw={"projection": "3d"}) degrees_per_frame = rotation_speed / fps frames = int(round(360 / degrees_per_frame)) @@ -1129,8 +1264,13 @@ def animate(i): return ax ani = mpl.animation.FuncAnimation( - fig, animate, init_func=init, - frames=range(frames), interval=interval, blit=False) + fig, + animate, + init_func=init, + frames=range(frames), + interval=interval, + blit=False, + ) if filename is not None: ani.save(filename, writer=writer, dpi=dpi) diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py index fc693798..7114856b 100644 --- a/scprep/plot/scree.py +++ b/scprep/plot/scree.py @@ -3,16 +3,23 @@ from .. 
import utils from .._lazyload import matplotlib as mpl -from .utils import (_get_figure, show, - temp_fontsize) +from .utils import _get_figure, show, temp_fontsize from .tools import label_axis @utils._with_pkg(pkg="matplotlib", min_version=3) -def scree_plot(singular_values, cumulative=False, ax=None, figsize=None, - xlabel='Principal Component', ylabel='Explained Variance (%)', - fontsize=None, filename=None, dpi=None, - **kwargs): +def scree_plot( + singular_values, + cumulative=False, + ax=None, + figsize=None, + xlabel="Principal Component", + ylabel="Explained Variance (%)", + fontsize=None, + filename=None, + dpi=None, + **kwargs +): """Plot the explained variance of each principal component Parameters @@ -57,8 +64,7 @@ def scree_plot(singular_values, cumulative=False, ax=None, figsize=None, if cumulative: explained_variance = np.cumsum(explained_variance) fig, ax, show_fig = _get_figure(ax, figsize) - ax.bar(np.arange(len(explained_variance)) + 1, - explained_variance, **kwargs) + ax.bar(np.arange(len(explained_variance)) + 1, explained_variance, **kwargs) label_axis(ax.xaxis, label=xlabel) label_axis(ax.yaxis, label=ylabel) ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True)) diff --git a/scprep/plot/tools.py b/scprep/plot/tools.py index 89a9ac05..68da0357 100644 --- a/scprep/plot/tools.py +++ b/scprep/plot/tools.py @@ -5,6 +5,7 @@ from .utils import _get_figure, parse_fontsize, temp_fontsize from .._lazyload import matplotlib as mpl + plt = mpl.pyplot @@ -30,10 +31,10 @@ def create_colormap(colors, name="scprep_custom_cmap"): cdict = dict(red=[], green=[], blue=[], alpha=[]) for val, color in zip(vals, colors): r, g, b, a = mpl.colors.to_rgba(color) - cdict['red'].append((val, r, r)) - cdict['green'].append((val, g, g)) - cdict['blue'].append((val, b, b)) - cdict['alpha'].append((val, a, a)) + cdict["red"].append((val, r, r)) + cdict["green"].append((val, g, g)) + cdict["blue"].append((val, b, b)) + cdict["alpha"].append((val, a, a)) cmap = mpl.colors.LinearSegmentedColormap(name, cdict) return cmap @@ -54,32 +55,46 @@ def create_normalize(vmin, vmax, scale=None): """ if scale is None: scale = "linear" - if scale == 'linear': + if scale == "linear": norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax) - elif scale == 'log': + elif scale == "log": if vmin <= 0: raise ValueError( - "`vmin` must be positive for `cmap_scale='log'`. Got {}".format(vmin)) + "`vmin` must be positive for `cmap_scale='log'`. Got {}".format(vmin) + ) norm = mpl.colors.LogNorm(vmin=vmin, vmax=vmin) - elif scale == 'symlog': - norm = mpl.colors.SymLogNorm(linthresh=0.03, linscale=0.03, - vmin=vmin, vmax=vmax) - elif scale == 'sqrt': - norm = mpl.colors.PowerNorm(gamma=1. / 2.) + elif scale == "symlog": + norm = mpl.colors.SymLogNorm( + linthresh=0.03, linscale=0.03, vmin=vmin, vmax=vmax + ) + elif scale == "sqrt": + norm = mpl.colors.PowerNorm(gamma=1.0 / 2.0) elif isinstance(scale, mpl.colors.Normalize): norm = scale else: - raise ValueError("Expected norm in ['linear', 'log', 'symlog'," - "'sqrt'] or a matplotlib.colors.Normalize object." - " Got {}".format(scale)) + raise ValueError( + "Expected norm in ['linear', 'log', 'symlog'," + "'sqrt'] or a matplotlib.colors.Normalize object." 
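Usage sketches for `create_colormap` and `create_normalize` as reformatted above; the color choices are illustrative:

    from scprep.plot.tools import create_colormap, create_normalize

    # interpolate a continuous colormap between the listed colors
    cmap = create_colormap(["navy", "white", "crimson"])
    print(cmap(0.5))  # RGBA tuple near white
    # returns a SymLogNorm with the linthresh/linscale shown above
    norm = create_normalize(0, 10, scale="symlog")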
+ " Got {}".format(scale) + ) return norm @utils._with_pkg(pkg="matplotlib", min_version=3) -def generate_legend(cmap, ax, title=None, marker='o', markersize=10, - loc='best', bbox_to_anchor=None, - fontsize=None, title_fontsize=None, - max_rows=10, ncol=None, **kwargs): +def generate_legend( + cmap, + ax, + title=None, + marker="o", + markersize=10, + loc="best", + bbox_to_anchor=None, + fontsize=None, + title_fontsize=None, + max_rows=10, + ncol=None, + **kwargs +): """Generate a legend on an axis. Parameters @@ -117,25 +132,50 @@ def generate_legend(cmap, ax, title=None, marker='o', markersize=10, ------- legend : `matplotlib.legend.Legend` """ - fontsize = parse_fontsize(fontsize, 'large') - title_fontsize = parse_fontsize(title_fontsize, 'x-large') - handles = [mpl.lines.Line2D([], [], marker=marker, color=color, - linewidth=0, label=label, - markersize=markersize) - for label, color in cmap.items()] + fontsize = parse_fontsize(fontsize, "large") + title_fontsize = parse_fontsize(title_fontsize, "x-large") + handles = [ + mpl.lines.Line2D( + [], + [], + marker=marker, + color=color, + linewidth=0, + label=label, + markersize=markersize, + ) + for label, color in cmap.items() + ] if ncol is None: ncol = max(1, np.ceil(len(cmap) / max_rows).astype(int)) - legend = ax.legend(handles=handles, title=title, - loc=loc, bbox_to_anchor=bbox_to_anchor, - fontsize=fontsize, ncol=ncol, **kwargs) + legend = ax.legend( + handles=handles, + title=title, + loc=loc, + bbox_to_anchor=bbox_to_anchor, + fontsize=fontsize, + ncol=ncol, + **kwargs + ) plt.setp(legend.get_title(), fontsize=title_fontsize) return legend @utils._with_pkg(pkg="matplotlib", min_version=3) -def generate_colorbar(cmap=None, vmin=None, vmax=None, scale=None, ax=None, - title=None, title_rotation=270, fontsize=None, - n_ticks='auto', labelpad=10, mappable=None, **kwargs): +def generate_colorbar( + cmap=None, + vmin=None, + vmax=None, + scale=None, + ax=None, + title=None, + title_rotation=270, + fontsize=None, + n_ticks="auto", + labelpad=10, + mappable=None, + **kwargs +): """Generate a colorbar on an axis. Parameters @@ -184,14 +224,16 @@ def generate_colorbar(cmap=None, vmin=None, vmax=None, scale=None, ax=None, vmin = 0 remove_ticks = True norm = None - if n_ticks != 'auto': + if n_ticks != "auto": warnings.warn( "Cannot set `n_ticks` without setting `vmin` and `vmax`.", - UserWarning) + UserWarning, + ) elif vmax is None or vmin is None: raise ValueError( "Either both or neither of `vmax` and `vmin` should " - "be set. Got `vmax={}, vmin={}`".format(vmax, vmin)) + "be set. 
Got `vmax={}, vmin={}`".format(vmax, vmin) + ) else: remove_ticks = False norm = create_normalize(vmin, vmax, scale=scale) @@ -199,52 +241,67 @@ def generate_colorbar(cmap=None, vmin=None, vmax=None, scale=None, ax=None, ax = plot_axis xmin, xmax = plot_axis.get_xlim() ymin, ymax = plot_axis.get_ylim() - if hasattr(cmap, '__len__') and \ - not isinstance(cmap, (str, dict)): + if hasattr(cmap, "__len__") and not isinstance(cmap, (str, dict)): # list colormap cmap = create_colormap(cmap) mappable = plot_axis.imshow( np.linspace(vmin, vmax, 10).reshape(-1, 1), - vmin=vmin, vmax=vmax, cmap=cmap, norm=norm, - aspect='auto', origin='lower', - extent=[xmin, xmax, ymin, ymax]) + vmin=vmin, + vmax=vmax, + cmap=cmap, + norm=norm, + aspect="auto", + origin="lower", + extent=[xmin, xmax, ymin, ymax], + ) mappable.remove() else: if vmin is not None or vmax is not None: warnings.warn( - "Cannot set `vmin` or `vmax` when `mappable` is given.", - UserWarning) + "Cannot set `vmin` or `vmax` when `mappable` is given.", UserWarning + ) if cmap is not None: - warnings.warn("Cannot set `cmap` when `mappable` is given.", - UserWarning) + warnings.warn( + "Cannot set `cmap` when `mappable` is given.", UserWarning + ) if scale is not None: - warnings.warn("Cannot set `scale` when `mappable` is given.", - UserWarning) + warnings.warn( + "Cannot set `scale` when `mappable` is given.", UserWarning + ) remove_ticks = False colorbar = fig.colorbar(mappable, ax=ax, **kwargs) if remove_ticks or n_ticks == 0: colorbar.set_ticks([]) - labelpad += plt.rcParams['font.size'] + labelpad += plt.rcParams["font.size"] else: - if n_ticks != 'auto': - tick_locator = mpl.ticker.MaxNLocator( - nbins=n_ticks - 1) + if n_ticks != "auto": + tick_locator = mpl.ticker.MaxNLocator(nbins=n_ticks - 1) colorbar.locator = tick_locator colorbar.update_ticks() - colorbar.ax.tick_params(labelsize=parse_fontsize(None, 'large')) + colorbar.ax.tick_params(labelsize=parse_fontsize(None, "large")) if title is not None: - title_fontsize = parse_fontsize(None, 'x-large') - colorbar.set_label(title, rotation=title_rotation, - fontsize=title_fontsize, labelpad=labelpad) + title_fontsize = parse_fontsize(None, "x-large") + colorbar.set_label( + title, + rotation=title_rotation, + fontsize=title_fontsize, + labelpad=labelpad, + ) return colorbar -def label_axis(axis, ticks=True, ticklabels=True, label=None, - label_fontsize=None, tick_fontsize=None, - ticklabel_rotation=None, - ticklabel_horizontal_alignment=None, - ticklabel_vertical_alignment=None): +def label_axis( + axis, + ticks=True, + ticklabels=True, + label=None, + label_fontsize=None, + tick_fontsize=None, + ticklabel_rotation=None, + ticklabel_horizontal_alignment=None, + ticklabel_vertical_alignment=None, +): """Set axis ticks and labels Parameters @@ -281,7 +338,7 @@ def label_axis(axis, ticks=True, ticklabels=True, label=None, if ticklabels is False or ticklabels is None: axis.set_ticklabels([]) else: - tick_fontsize = parse_fontsize(tick_fontsize, 'large') + tick_fontsize = parse_fontsize(tick_fontsize, "large") if ticklabels is not True: axis.set_ticklabels(ticklabels) for tick in axis.get_ticklabels(): @@ -293,5 +350,5 @@ def label_axis(axis, ticks=True, ticklabels=True, label=None, tick.set_va(ticklabel_vertical_alignment) tick.set_fontsize(tick_fontsize) if label is not None: - label_fontsize = parse_fontsize(label_fontsize, 'x-large') + label_fontsize = parse_fontsize(label_fontsize, "x-large") axis.set_label_text(label, fontsize=label_fontsize) diff --git a/scprep/plot/utils.py 
b/scprep/plot/utils.py index f0d77e92..3f1d41ea 100644 --- a/scprep/plot/utils.py +++ b/scprep/plot/utils.py @@ -5,6 +5,7 @@ from .._lazyload import matplotlib as mpl from .._lazyload import mpl_toolkits + plt = mpl.pyplot @@ -14,7 +15,7 @@ def _with_default(param, default): def _mpl_is_gui_backend(): backend = mpl.get_backend() - if backend in ['module://ipykernel.pylab.backend_inline', 'agg']: + if backend in ["module://ipykernel.pylab.backend_inline", "agg"]: return False else: return True @@ -24,7 +25,7 @@ def _get_figure(ax=None, figsize=None, subplot_kw=None): if subplot_kw is None: subplot_kw = {} if ax is None: - if 'projection' in subplot_kw and subplot_kw['projection'] == '3d': + if "projection" in subplot_kw and subplot_kw["projection"] == "3d": # ensure mplot3d is loaded mpl_toolkits.mplot3d.Axes3D fig, ax = plt.subplots(figsize=figsize, subplot_kw=subplot_kw) @@ -34,15 +35,18 @@ def _get_figure(ax=None, figsize=None, subplot_kw=None): fig = ax.get_figure() except AttributeError as e: if not isinstance(ax, mpl.axes.Axes): - raise TypeError("Expected ax as a matplotlib.axes.Axes. " - "Got {}".format(type(ax))) + raise TypeError( + "Expected ax as a matplotlib.axes.Axes. " "Got {}".format(type(ax)) + ) else: raise e - if 'projection' in subplot_kw: - if subplot_kw['projection'] == '3d' and \ - not isinstance(ax, mpl_toolkits.mplot3d.Axes3D): - raise TypeError("Expected ax with projection='3d'. " - "Got 2D axis instead.") + if "projection" in subplot_kw: + if subplot_kw["projection"] == "3d" and not isinstance( + ax, mpl_toolkits.mplot3d.Axes3D + ): + raise TypeError( + "Expected ax with projection='3d'. " "Got 2D axis instead." + ) show_fig = False return fig, ax, show_fig @@ -62,8 +66,10 @@ def _in_ipynb(): Credit to https://stackoverflow.com/a/24937408/3996580 """ - __VALID_NOTEBOOKS = ["", - ""] + __VALID_NOTEBOOKS = [ + "", + "", + ] try: return str(type(get_ipython())) in __VALID_NOTEBOOKS except NameError: @@ -92,13 +98,13 @@ def show(fig): def _is_default_matplotlibrc(): __defaults = { - 'axes.labelsize': 'medium', - 'axes.titlesize': 'large', - 'figure.titlesize': 'large', - 'legend.fontsize': 'medium', - 'legend.title_fontsize': None, - 'xtick.labelsize': 'medium', - 'ytick.labelsize': 'medium' + "axes.labelsize": "medium", + "axes.titlesize": "large", + "figure.titlesize": "large", + "legend.fontsize": "medium", + "legend.title_fontsize": None, + "xtick.labelsize": "medium", + "ytick.labelsize": "medium", } for k, v in __defaults.items(): if plt.rcParams[k] != v: @@ -131,18 +137,17 @@ def parse_fontsize(size=None, default=None): class temp_fontsize(object): - def __init__(self, size=None): if size is None: - size = plt.rcParams['font.size'] + size = plt.rcParams["font.size"] self.size = size def __enter__(self): - self.old_size = plt.rcParams['font.size'] - plt.rcParams['font.size'] = self.size + self.old_size = plt.rcParams["font.size"] + plt.rcParams["font.size"] = self.size def __exit__(self, type, value, traceback): - plt.rcParams['font.size'] = self.old_size + plt.rcParams["font.size"] = self.old_size @utils._with_pkg(pkg="matplotlib", min_version=3) @@ -158,8 +163,7 @@ def shift_ticklabels(axis, dx=0, dy=0): dy : float, optional (default: 0) """ # Create offset transform by 5 points in x direction - offset = mpl.transforms.ScaledTranslation( - dx, dy, axis.get_figure().dpi_scale_trans) + offset = mpl.transforms.ScaledTranslation(dx, dy, axis.get_figure().dpi_scale_trans) # apply offset transform to all ticklabels. 
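A sketch of the `temp_fontsize` context manager shown above:

    import matplotlib.pyplot as plt
    from scprep.plot.utils import temp_fontsize

    with temp_fontsize(14):
        # inside the block rcParams["font.size"] is 14; the previous value
        # is restored on exit, per __enter__/__exit__ above
        assert plt.rcParams["font.size"] == 14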
diff --git a/scprep/plot/variable_genes.py b/scprep/plot/variable_genes.py
index f62554b8..31da3a39 100644
--- a/scprep/plot/variable_genes.py
+++ b/scprep/plot/variable_genes.py
@@ -3,15 +3,22 @@


 @utils._with_pkg(pkg="matplotlib", min_version=3)
-def plot_gene_variability(data, kernel_size=0.005, smooth=5,
-                          cutoff=None, percentile=90,
-                          ax=None, figsize=None,
-                          xlabel='Gene mean',
-                          ylabel='Standardized variance',
-                          title=None,
-                          fontsize=None,
-                          filename=None,
-                          dpi=None, **kwargs):
+def plot_gene_variability(
+    data,
+    kernel_size=0.005,
+    smooth=5,
+    cutoff=None,
+    percentile=90,
+    ax=None,
+    figsize=None,
+    xlabel="Gene mean",
+    ylabel="Standardized variance",
+    title=None,
+    fontsize=None,
+    filename=None,
+    dpi=None,
+    **kwargs
+):
     """Plot the histogram of gene variability

     Variability is computed as the deviation from a loess fit
@@ -55,13 +62,22 @@ def plot_gene_variability(data, kernel_size=0.005, smooth=5,
     ax : `matplotlib.Axes`
         axis on which plot was drawn
     """
-    variability, means = measure.gene_variability(data, kernel_size=kernel_size,
-                                                  smooth=smooth, return_means=True)
-    keep_cells_idx = utils._get_filter_idx(variability,
-                                           cutoff, percentile,
-                                           keep_cells='above')
-    return scatter(means, variability, c=keep_cells_idx,
-                   cmap={True : 'red', False : 'black'},
-                   xlabel=xlabel, ylabel=ylabel, title=title,
-                   fontsize=fontsize, filename=filename, dpi=dpi,
-                   **kwargs)
+    variability, means = measure.gene_variability(
+        data, kernel_size=kernel_size, smooth=smooth, return_means=True
+    )
+    keep_cells_idx = utils._get_filter_idx(
+        variability, cutoff, percentile, keep_cells="above"
+    )
+    return scatter(
+        means,
+        variability,
+        c=keep_cells_idx,
+        cmap={True: "red", False: "black"},
+        xlabel=xlabel,
+        ylabel=ylabel,
+        title=title,
+        fontsize=fontsize,
+        filename=filename,
+        dpi=dpi,
+        **kwargs
+    )
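A quick sketch of how the function above might be called (illustrative; assumes a cells-by-genes matrix `data` such as a pandas DataFrame, and that the function is exported from `scprep.plot`):

    import scprep

    # highlight genes above the 90th percentile of standardized variance in red
    ax = scprep.plot.plot_gene_variability(data, percentile=90)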
diff --git a/scprep/reduce.py b/scprep/reduce.py
index 089eaa33..cb515d8e 100644
--- a/scprep/reduce.py
+++ b/scprep/reduce.py
@@ -11,11 +11,11 @@
 class InvertibleRandomProjection(random_projection.GaussianRandomProjection):
     """Gaussian random projection with an inverse transform using the pseudoinverse."""

-    def __init__(self, n_components='auto', eps=0.3,
-                 orthogonalize=False, random_state=None):
+    def __init__(
+        self, n_components="auto", eps=0.3, orthogonalize=False, random_state=None
+    ):
         self.orthogonalize = orthogonalize
-        super().__init__(n_components=n_components, eps=eps,
-                         random_state=random_state)
+        super().__init__(n_components=n_components, eps=eps, random_state=random_state)

     @property
     def pseudoinverse(self):
@@ -32,8 +32,7 @@ def pseudoinverse(self):
             # orthogonal matrix: inverse is just its transpose
             self._pseudoinverse = self.components_
         else:
-            self._pseudoinverse = np.linalg.pinv(
-                self.components_.T)
+            self._pseudoinverse = np.linalg.pinv(self.components_.T)
         return self._pseudoinverse

     def fit(self, X):
@@ -50,20 +49,34 @@ def inverse_transform(self, X):
 class AutomaticDimensionSVD(decomposition.TruncatedSVD):
     """Truncated SVD with automatic dimensionality selected by the Johnson-Lindenstrauss lemma."""

-    def __init__(self, n_components='auto', eps=0.3, algorithm='randomized',
-                 n_iter=5, random_state=None, tol=0.0):
+    def __init__(
+        self,
+        n_components="auto",
+        eps=0.3,
+        algorithm="randomized",
+        n_iter=5,
+        random_state=None,
+        tol=0.0,
+    ):
         self.eps = eps
-        if n_components == 'auto':
+        if n_components == "auto":
             # just pass through -1 - we will change it later
             n_components = -1
-        super().__init__(n_components=n_components, algorithm=algorithm,
-                         n_iter=n_iter, random_state=random_state, tol=tol)
+        super().__init__(
+            n_components=n_components,
+            algorithm=algorithm,
+            n_iter=n_iter,
+            random_state=random_state,
+            tol=tol,
+        )

     def fit(self, X):
         if self.n_components == -1:
             super().set_params(
                 n_components=random_projection.johnson_lindenstrauss_min_dim(
-                    n_samples=X.shape[0], eps=self.eps))
+                    n_samples=X.shape[0], eps=self.eps
+                )
+            )
         try:
             return super().fit(X)
         except ValueError as e:
@@ -72,7 +85,9 @@ def fit(self, X):
                     "eps={} and n_samples={} lead to a target "
                     "dimension of {} which is larger than the "
                     "original space with n_features={}".format(
-                        self.eps, X.shape[0], self.n_components, X.shape[1]))
+                        self.eps, X.shape[0], self.n_components, X.shape[1]
+                    )
+                )
             else:
                 raise
@@ -105,26 +120,27 @@ class SparseInputPCA(sklearn.base.BaseEstimator):
         Additional keyword arguments for `sklearn.decomposition.PCA`
     """

-    def __init__(self, n_components=2, eps=0.3,
-                 random_state=None,
-                 method='svd',
-                 **kwargs):
-        self.pca_op = decomposition.PCA(n_components=n_components,
-                                        random_state=random_state)
-        if method == 'svd':
-            self.proj_op = AutomaticDimensionSVD(
-                eps=eps,
-                random_state=random_state)
-        elif method == 'orth_rproj':
+    def __init__(
+        self, n_components=2, eps=0.3, random_state=None, method="svd", **kwargs
+    ):
+        self.pca_op = decomposition.PCA(
+            n_components=n_components, random_state=random_state
+        )
+        if method == "svd":
+            self.proj_op = AutomaticDimensionSVD(eps=eps, random_state=random_state)
+        elif method == "orth_rproj":
             self.proj_op = InvertibleRandomProjection(
-                eps=eps, random_state=random_state, orthogonalize=True)
-        elif method == 'rproj':
+                eps=eps, random_state=random_state, orthogonalize=True
+            )
+        elif method == "rproj":
             self.proj_op = InvertibleRandomProjection(
-                eps=eps, random_state=random_state, orthogonalize=False)
+                eps=eps, random_state=random_state, orthogonalize=False
+            )
         else:
             raise ValueError(
                 "Expected `method` in ['svd', 'orth_rproj', 'rproj']. "
-                "Got '{}'".format(method))
+                "Got '{}'".format(method)
+            )

     @property
     def singular_values_(self):
@@ -219,9 +235,17 @@ def inverse_transform(self, X):
         return X_ambient


-def pca(data, n_components=100, eps=0.3,
-        method='svd', seed=None, return_singular_values=False,
-        n_pca=None, svd_offset=None, svd_multiples=None):
+def pca(
+    data,
+    n_components=100,
+    eps=0.3,
+    method="svd",
+    seed=None,
+    return_singular_values=False,
+    n_pca=None,
+    svd_offset=None,
+    svd_multiples=None,
+):
     """Calculate PCA using random projections to handle sparse matrices

     Uses the Johnson-Lindenstrauss Lemma to determine the number of
@@ -262,27 +286,30 @@ def pca(data, n_components=100, eps=0.3,
     """
     if n_pca is not None:
         warnings.warn(
-            "n_pca is deprecated. Setting n_components={}.".format(n_pca),
-            FutureWarning)
+            "n_pca is deprecated. Setting n_components={}.".format(n_pca), FutureWarning
+        )
         n_components = n_pca
     if svd_offset is not None:
-        warnings.warn("svd_offset is deprecated. Please use `eps` instead.",
-                      FutureWarning)
+        warnings.warn(
+            "svd_offset is deprecated. Please use `eps` instead.", FutureWarning
+        )
     if svd_multiples is not None:
-        warnings.warn("svd_multiples is deprecated. Please use `eps` instead.",
-                      FutureWarning)
+        warnings.warn(
+            "svd_multiples is deprecated. Please use `eps` instead.", FutureWarning
+        )
     if not 0 < n_components <= min(data.shape):
-        raise ValueError("n_components={} must be between 0 and "
-                         "min(n_samples, n_features)={}".format(
-                             n_components, min(data.shape)))
+        raise ValueError(
+            "n_components={} must be between 0 and "
+            "min(n_samples, n_features)={}".format(n_components, min(data.shape))
+        )
     # handle dataframes
     if isinstance(data, pd.DataFrame):
         index = data.index
     else:
         index = None
-    if method == 'dense':
+    if method == "dense":
         data = utils.toarray(data)
     else:
         data = utils.to_array_or_spmatrix(data)
@@ -291,22 +318,28 @@ def pca(data, n_components=100, eps=0.3,
     if sparse.issparse(data):
         try:
             pca_op = SparseInputPCA(
-                n_components=n_components, eps=eps, method=method,
-                random_state=seed)
+                n_components=n_components, eps=eps, method=method, random_state=seed
+            )
             data = pca_op.fit_transform(data)
         except RuntimeError as e:
             if "which is larger than the original space" in str(e):
                 # eps too small - the best we can do is make the data dense
                 return pca(
-                    utils.toarray(data), n_components=n_components,
-                    seed=seed, return_singular_values=return_singular_values)
+                    utils.toarray(data),
+                    n_components=n_components,
+                    seed=seed,
+                    return_singular_values=return_singular_values,
+                )
     else:
         pca_op = decomposition.PCA(n_components, random_state=seed)
         data = pca_op.fit_transform(data)

     if index is not None:
-        data = pd.DataFrame(data, index=index,
-                            columns=["PC{}".format(i+1) for i in range(n_components)])
+        data = pd.DataFrame(
+            data,
+            index=index,
+            columns=["PC{}".format(i + 1) for i in range(n_components)],
+        )

     if return_singular_values:
         data = (data, pca_op.singular_values_)
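For context, a minimal sketch of the sparse-aware pca entry point reformatted above (illustrative only):

    import scipy.sparse
    import scprep

    X = scipy.sparse.random(1000, 2000, density=0.1, format="csr")
    # sparse input is routed through SparseInputPCA; eps bounds the
    # Johnson-Lindenstrauss projection error before the dense PCA step
    Y = scprep.reduce.pca(X, n_components=100, method="svd", eps=0.3, seed=42)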
diff --git a/scprep/run/r_function.py b/scprep/run/r_function.py
index 12df8154..1ebddb3b 100644
--- a/scprep/run/r_function.py
+++ b/scprep/run/r_function.py
@@ -5,7 +5,6 @@


 class _ConsoleWarning(object):
-
     def __init__(self, verbose=1):
         if verbose is True:
             verbose = 1
@@ -16,17 +15,21 @@ def __init__(self, verbose=1):
     @staticmethod
     def warning(s: str) -> None:
         rpy2.rinterface_lib.callbacks.logger.warning(
-            rpy2.rinterface_lib.callbacks._WRITECONSOLE_EXCEPTION_LOG, s.strip())
+            rpy2.rinterface_lib.callbacks._WRITECONSOLE_EXCEPTION_LOG, s.strip()
+        )

     @staticmethod
     def debug(s: str) -> None:
         rpy2.rinterface_lib.callbacks.logger.debug(
-            rpy2.rinterface_lib.callbacks._WRITECONSOLE_EXCEPTION_LOG, s.strip())
+            rpy2.rinterface_lib.callbacks._WRITECONSOLE_EXCEPTION_LOG, s.strip()
+        )

     @staticmethod
     def set(fun):
         if not hasattr(_ConsoleWarning, "builtin_warning"):
-            _ConsoleWarning.builtin_warning = rpy2.rinterface_lib.callbacks.consolewrite_warnerror
+            _ConsoleWarning.builtin_warning = (
+                rpy2.rinterface_lib.callbacks.consolewrite_warnerror
+            )
         rpy2.rinterface_lib.callbacks.consolewrite_warnerror = fun

     @staticmethod
@@ -85,10 +88,10 @@ def _build(self):
         {name} <- function({args}) {{
         {body}
         }}
-        """.format(name=self.name,
-                   args=self.args, body=self.body)
-        fun = getattr(rpy2.robjects.packages.STAP(
-            function_text, self.name), self.name)
+        """.format(
+            name=self.name, args=self.args, body=self.body
+        )
+        fun = getattr(rpy2.robjects.packages.STAP(function_text, self.name), self.name)
         rpy2.robjects.numpy2ri.activate()
         return fun
@@ -108,14 +111,15 @@ def convert(self, robject):
         if self.is_r_object(robject):
             if isinstance(robject, rpy2.robjects.vectors.ListVector):
                 names = self.convert(robject.names)
-                if names is None or \
-                        len(names) > len(np.unique(names)):
+                if names is None or len(names) > len(np.unique(names)):
                     # list
                     robject = np.array([self.convert(obj)
                                         for obj in robject])
                 else:
                     # dictionary
-                    robject = {name: self.convert(
-                        obj) for name, obj in zip(robject.names, robject)}
+                    robject = {
+                        name: self.convert(obj)
+                        for name, obj in zip(robject.names, robject)
+                    }
             else:
                 # try numpy first
                 robject = rpy2.robjects.numpy2ri.rpy2py(robject)
@@ -156,9 +160,13 @@ def __call__(self, *args, rpy_verbose=None, **kwargs):
             }
         }
     }
-    """)
+    """,
+)
+

-def install_bioconductor(package = None, site_repository = None, update = False, version = None, verbose = True):
+def install_bioconductor(
+    package=None, site_repository=None, update=False, version=None, verbose=True
+):
     """Install a Bioconductor package

     Parameters
@@ -176,11 +184,11 @@ def install_bioconductor(package = None, site_repository = None, update = False,
     verbose : boolean, optional (default: True)
         Install script verbosity.
     """
-    kwargs = {'update': update, 'rpy_verbose': verbose}
+    kwargs = {"update": update, "rpy_verbose": verbose}
     if package is not None:
-        kwargs['package'] = package
+        kwargs["package"] = package
     if site_repository is not None:
-        kwargs['site_repository'] = site_repository
+        kwargs["site_repository"] = site_repository
     if version is not None:
-        kwargs['version'] = version
-    _install_bioconductor(**kwargs)
\ No newline at end of file
+        kwargs["version"] = version
+    _install_bioconductor(**kwargs)
diff --git a/scprep/run/slingshot.py b/scprep/run/slingshot.py
index f24a16e1..5b26af73 100644
--- a/scprep/run/slingshot.py
+++ b/scprep/run/slingshot.py
@@ -6,7 +6,7 @@
 from .. import utils


-def install(site_repository = None, update = False, version = None, verbose = True):
+def install(site_repository=None, update=False, version=None, verbose=True):
     """Install the required R packages to run Slingshot

     Parameters
@@ -25,8 +25,12 @@ def install(site_repository = None, update = False, version = None, verbose = Tr
         Install script verbosity.
     """
     r_function.install_bioconductor(
-        'slingshot', site_repository=site_repository,
-        update=update, version=version, verbose=verbose)
+        "slingshot",
+        site_repository=site_repository,
+        update=update,
+        version=version,
+        verbose=verbose,
+    )


 _Slingshot = r_function.RFunction(
@@ -57,18 +61,30 @@ def install(site_repository = None, update = False, version = None, verbose = Tr
                    allow.breaks = allow_breaks)
         list(pseudotime = slingPseudotime(sling),
              curves = lapply(sling@curves, function(curve) curve$s[curve$ord,]))
-        """)
+        """,
+)


 def Slingshot(
-    data, cluster_labels,
-    start_cluster = None, end_cluster = None,
-    distance = None, omega = None, shrink = True,
-    extend = "y", reweight = True, reassign = True, thresh = 0.001,
-    max_iter = 15, stretch = 2,
-    smoother = "smooth.spline", shrink_method = "cosine",
-    allow_breaks = True,
-    seed=None, verbose=1):
+    data,
+    cluster_labels,
+    start_cluster=None,
+    end_cluster=None,
+    distance=None,
+    omega=None,
+    shrink=True,
+    extend="y",
+    reweight=True,
+    reassign=True,
+    thresh=0.001,
+    max_iter=15,
+    stretch=2,
+    smoother="smooth.spline",
+    shrink_method="cosine",
+    allow_breaks=True,
+    seed=None,
+    verbose=1,
+):
     """Perform lineage inference with Slingshot

     Given a reduced-dimensional data matrix n by p and a vector of cluster labels
@@ -178,48 +194,64 @@ def Slingshot(
     ...     ax.plot(curve[:,0], curve[:,1], c='black')
     """
     if seed is None:
-        seed = np.random.randint(2**16 - 1)
+        seed = np.random.randint(2 ** 16 - 1)
     if distance is not None:
         raise NotImplementedError("distance argument not currently implemented")
     np.random.seed(seed)

     index = data.index if isinstance(data, pd.DataFrame) else None
-
+
     data = utils.toarray(data)
     if data.shape[1] > 3:
-        warnings.warn("Expected data to be low-dimensional. "
-                      "Got data.shape[1] = {}".format(data.shape[1]),
-                      UserWarning)
+        warnings.warn(
+            "Expected data to be low-dimensional. "
+            "Got data.shape[1] = {}".format(data.shape[1]),
+            UserWarning,
+        )
     cluster_labels = utils.toarray(cluster_labels).flatten()
     if not cluster_labels.shape[0] == data.shape[0]:
-        raise ValueError("Expected len(cluster_labels) ({}) to equal "
-                         "data.shape[0] ({})".format(cluster_labels.shape[0], data.shape[0]))
+        raise ValueError(
+            "Expected len(cluster_labels) ({}) to equal "
+            "data.shape[0] ({})".format(cluster_labels.shape[0], data.shape[0])
+        )

     kwargs = {}
     if start_cluster is not None:
-        kwargs['start_cluster'] = start_cluster
+        kwargs["start_cluster"] = start_cluster
     if end_cluster is not None:
-        kwargs['end_cluster'] = end_cluster
+        kwargs["end_cluster"] = end_cluster
     if omega is not None:
-        kwargs['omega'] = omega
+        kwargs["omega"] = omega

     slingshot = _Slingshot(
-        data=data, cluster_labels=cluster_labels,
-        shrink = shrink,
-        extend = extend, reweight = reweight, reassign = reassign, thresh = thresh,
-        max_iter = max_iter, stretch = stretch,
-        smoother = smoother, shrink_method = shrink_method,
-        allow_breaks = allow_breaks, **kwargs,
-        seed=seed, rpy_verbose=verbose)
-    slingshot['curves'] = np.array(list(slingshot['curves'].values()))
-
-    membership = (~np.isnan(slingshot['pseudotime'])).astype(int)
-    branch = np.sum(membership * (2**np.arange(membership.shape[1])), axis=1)
+        data=data,
+        cluster_labels=cluster_labels,
+        shrink=shrink,
+        extend=extend,
+        reweight=reweight,
+        reassign=reassign,
+        thresh=thresh,
+        max_iter=max_iter,
+        stretch=stretch,
+        smoother=smoother,
+        shrink_method=shrink_method,
+        allow_breaks=allow_breaks,
+        **kwargs,
+        seed=seed,
+        rpy_verbose=verbose
+    )
+    slingshot["curves"] = np.array(list(slingshot["curves"].values()))
+
+    membership = (~np.isnan(slingshot["pseudotime"])).astype(int)
+    branch = np.sum(membership * (2 ** np.arange(membership.shape[1])), axis=1)
     # reorder based on pseudotime
     branch_ids = np.unique(branch)
-    branch_means = [np.nanmean(slingshot['pseudotime'][branch==id])
-                    if not np.all(np.isnan(slingshot['pseudotime'][branch==id])) else np.nan
-                    for id in branch_ids]
+    branch_means = [
+        np.nanmean(slingshot["pseudotime"][branch == id])
+        if not np.all(np.isnan(slingshot["pseudotime"][branch == id]))
+        else np.nan
+        for id in branch_ids
+    ]
     branch_order = np.argsort(branch_means)
     branch_old = branch.copy()
     for i in range(len(branch_order)):
@@ -228,9 +260,9 @@ def Slingshot(
             branch[branch_old == branch_ids[j]] = -1
         else:
             branch[branch_old == branch_ids[j]] = i
-    slingshot['branch'] = branch
+    slingshot["branch"] = branch

     if index is not None:
-        slingshot['pseudotime'] = pd.DataFrame(slingshot['pseudotime'], index=index)
-        slingshot['branch'] = pd.Series(slingshot['branch'], name='branch', index=index)
+        slingshot["pseudotime"] = pd.DataFrame(slingshot["pseudotime"], index=index)
+        slingshot["branch"] = pd.Series(slingshot["branch"], name="branch", index=index)
     return slingshot
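A minimal sketch of calling the wrapper above (illustrative; assumes R plus the slingshot Bioconductor package, installable via scprep.run.slingshot.install(), a low-dimensional embedding `data` and per-cell `clusters`):

    import scprep

    results = scprep.run.Slingshot(data, clusters, verbose=0)
    pseudotime = results["pseudotime"]  # one column per lineage, NaN off-lineage
    branch = results["branch"]          # integer branch assignment per cell
    curves = results["curves"]          # principal curve coordinates per lineage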
diff --git a/scprep/run/splatter.py b/scprep/run/splatter.py
index 4bbc18cb..a1b58f38 100644
--- a/scprep/run/splatter.py
+++ b/scprep/run/splatter.py
@@ -3,7 +3,7 @@
 from . import r_function


-def install(site_repository = None, update = False, version = None, verbose = True):
+def install(site_repository=None, update=False, version=None, verbose=True):
     """Install the required R packages to run Splatter

     Parameters
@@ -22,8 +22,12 @@ def install(site_repository = None, update = False, version = None, verbose = Tr
         Install script verbosity.
     """
     r_function.install_bioconductor(
-        'splatter', site_repository=site_repository,
-        update=update, version=version, verbose=verbose)
+        "splatter",
+        site_repository=site_repository,
+        update=update,
+        version=version,
+        verbose=verbose,
+    )


 _SplatSimulate = r_function.RFunction(
@@ -110,26 +114,43 @@ def install(site_repository = None, update = False, version = None, verbose = Tr
             result <- c(result, sigma_fac)
         }
         result
-        """)
+        """,
+)


 def SplatSimulate(
-        method="paths",
-        batch_cells=100, n_genes=10000,
-        batch_fac_loc=0.1, batch_fac_scale=0.1,
-        mean_rate=0.3, mean_shape=0.6,
-        lib_loc=11, lib_scale=0.2, lib_norm=False,
-        out_prob=0.05,
-        out_fac_loc=4, out_fac_scale=0.5,
-        de_prob=0.1, de_down_prob=0.1,
-        de_fac_loc=0.1, de_fac_scale=0.4,
-        bcv_common=0.1, bcv_df=60,
-        dropout_type='none', dropout_prob=0.5,
-        dropout_mid=0, dropout_shape=-1,
-        group_prob=1,
-        path_from=0, path_length=100, path_skew=0.5,
-        path_nonlinear_prob=0.1, path_sigma_fac=0.8,
-        seed=None, verbose=1):
+    method="paths",
+    batch_cells=100,
+    n_genes=10000,
+    batch_fac_loc=0.1,
+    batch_fac_scale=0.1,
+    mean_rate=0.3,
+    mean_shape=0.6,
+    lib_loc=11,
+    lib_scale=0.2,
+    lib_norm=False,
+    out_prob=0.05,
+    out_fac_loc=4,
+    out_fac_scale=0.5,
+    de_prob=0.1,
+    de_down_prob=0.1,
+    de_fac_loc=0.1,
+    de_fac_scale=0.4,
+    bcv_common=0.1,
+    bcv_df=60,
+    dropout_type="none",
+    dropout_prob=0.5,
+    dropout_mid=0,
+    dropout_shape=-1,
+    group_prob=1,
+    path_from=0,
+    path_length=100,
+    path_skew=0.5,
+    path_nonlinear_prob=0.1,
+    path_sigma_fac=0.8,
+    seed=None,
+    verbose=1,
+):
     """Simulate count data from a fictional single-cell RNA-seq experiment using the Splat method.

     SplatSimulate is a Python wrapper for the R package Splatter. For more
@@ -237,8 +258,8 @@ def SplatSimulate(
     dropout : Logical matrix showing which values have been dropped in which cells.
     """
     if seed is None:
-        seed = np.random.randint(2**16 - 1)
-    if dropout_type == 'binomial':
+        seed = np.random.randint(2 ** 16 - 1)
+    if dropout_type == "binomial":
         dropout_type = "none"
     else:
         dropout_prob = None
@@ -246,22 +267,38 @@ def SplatSimulate(
     sim = _SplatSimulate(
         method=method,
-        batch_cells=batch_cells, n_genes=n_genes,
-        batch_fac_loc=batch_fac_loc, batch_fac_scale=batch_fac_scale,
-        mean_rate=mean_rate, mean_shape=mean_shape,
-        lib_loc=lib_loc, lib_scale=lib_scale, lib_norm=lib_norm,
+        batch_cells=batch_cells,
+        n_genes=n_genes,
+        batch_fac_loc=batch_fac_loc,
+        batch_fac_scale=batch_fac_scale,
+        mean_rate=mean_rate,
+        mean_shape=mean_shape,
+        lib_loc=lib_loc,
+        lib_scale=lib_scale,
+        lib_norm=lib_norm,
         out_prob=out_prob,
-        out_fac_loc=out_fac_loc, out_fac_scale=out_fac_scale,
-        de_prob=de_prob, de_down_prob=de_down_prob,
-        de_fac_loc=de_fac_loc, de_fac_scale=de_fac_scale,
-        bcv_common=bcv_common, bcv_df=bcv_df,
-        dropout_type=dropout_type, dropout_mid=dropout_mid,
+        out_fac_loc=out_fac_loc,
+        out_fac_scale=out_fac_scale,
+        de_prob=de_prob,
+        de_down_prob=de_down_prob,
+        de_fac_loc=de_fac_loc,
+        de_fac_scale=de_fac_scale,
+        bcv_common=bcv_common,
+        bcv_df=bcv_df,
+        dropout_type=dropout_type,
+        dropout_mid=dropout_mid,
         dropout_shape=dropout_shape,
         group_prob=group_prob,
-        path_from=path_from, path_length=path_length, path_skew=path_skew,
-        path_nonlinear_prob=path_nonlinear_prob, path_sigma_fac=path_sigma_fac,
-        seed=seed, rpy_verbose=verbose)
+        path_from=path_from,
+        path_length=path_length,
+        path_skew=path_skew,
+        path_nonlinear_prob=path_nonlinear_prob,
+        path_sigma_fac=path_sigma_fac,
+        seed=seed,
+        rpy_verbose=verbose,
+    )
     if dropout_prob is not None:
-        sim['counts'] = np.random.binomial(n=sim['counts'], p=1 - dropout_prob,
-                                           size=sim['counts'].shape)
+        sim["counts"] = np.random.binomial(
+            n=sim["counts"], p=1 - dropout_prob, size=sim["counts"].shape
+        )
     return sim
Got {}".format( - [_get_column_length(d) for d in data])) - if isinstance(d, (pd.DataFrame, pd.Series)) and \ - isinstance(data[0], (pd.DataFrame, pd.Series)): + "columns. Got {}".format([_get_column_length(d) for d in data]) + ) + if isinstance(d, (pd.DataFrame, pd.Series)) and isinstance( + data[0], (pd.DataFrame, pd.Series) + ): if not np.all(_get_columns(data[0]) == _get_columns(d)): raise ValueError( "Expected `data` and `extra_data` pandas inputs to have " "the same column names. Fix with " - "`scprep.select.select_cols(*extra_data, idx=data.columns)`") + "`scprep.select.select_cols(*extra_data, idx=data.columns)`" + ) def _check_rows_compatible(*data): @@ -66,15 +67,17 @@ def _check_rows_compatible(*data): if not _get_row_length(d) == _get_row_length(data[0]): raise ValueError( "Expected `data` and `extra_data` to have the same number of " - "rows. Got {}".format( - [d.shape[0] for d in data])) - if isinstance(d, (pd.DataFrame, pd.Series)) and \ - isinstance(data[0], (pd.DataFrame, pd.Series)): + "rows. Got {}".format([d.shape[0] for d in data]) + ) + if isinstance(d, (pd.DataFrame, pd.Series)) and isinstance( + data[0], (pd.DataFrame, pd.Series) + ): if not np.all(data[0].index == d.index): raise ValueError( "Expected `data` and `extra_data` pandas inputs to have " "the same index. Fix with " - "`scprep.select.select_rows(*extra_data, idx=data.index)`") + "`scprep.select.select_rows(*extra_data, idx=data.index)`" + ) def _convert_dataframe_1d(idx): @@ -105,19 +108,20 @@ def _string_vector_match(data, match, fun, dtype=str): fun = np.vectorize(fun) return fun(data, match) else: - return np.any([_string_vector_match(data, m, fun, dtype=dtype) - for m in match], axis=0) + return np.any( + [_string_vector_match(data, m, fun, dtype=dtype) for m in match], axis=0 + ) def _exact_word_regex(word): - allowed_chars = ['\\(', '\\)', '\\[', '\\]', '\\.', - ',', '!', '\\?', ' ', '^', '$'] + allowed_chars = ["\\(", "\\)", "\\[", "\\]", "\\.", ",", "!", "\\?", " ", "^", "$"] wildcard = "(" + "|".join(allowed_chars) + ")+" return "{wildcard}{word}{wildcard}".format(wildcard=wildcard, word=re.escape(word)) -def _get_string_subset_mask(data, starts_with=None, ends_with=None, - exact_word=None, regex=None): +def _get_string_subset_mask( + data, starts_with=None, ends_with=None, exact_word=None, regex=None +): """Get a subset from a string array Parameters @@ -141,11 +145,13 @@ def _get_string_subset_mask(data, starts_with=None, ends_with=None, mask = np.full_like(data, True, dtype=bool) if starts_with is not None: start_match = _string_vector_match( - data, starts_with, lambda x, match: x.startswith(match)) + data, starts_with, lambda x, match: x.startswith(match) + ) mask = np.logical_and(mask, start_match) if ends_with is not None: end_match = _string_vector_match( - data, ends_with, lambda x, match: x.endswith(match)) + data, ends_with, lambda x, match: x.endswith(match) + ) mask = np.logical_and(mask, end_match) if exact_word is not None: if not isinstance(exact_word, str): @@ -160,14 +166,15 @@ def _get_string_subset_mask(data, starts_with=None, ends_with=None, else: regex = re.compile(regex) regex_match = _string_vector_match( - data, regex, lambda x, match: bool(match.search(x)), - dtype=_re_pattern) + data, regex, lambda x, match: bool(match.search(x)), dtype=_re_pattern + ) mask = np.logical_and(mask, regex_match) return mask -def _get_string_subset(data, starts_with=None, ends_with=None, - exact_word=None, regex=None): +def _get_string_subset( + data, starts_with=None, ends_with=None, 
exact_word=None, regex=None +): """Get a subset from a string array Parameters @@ -190,13 +197,16 @@ def _get_string_subset(data, starts_with=None, ends_with=None, """ data = utils.toarray(data) mask = _get_string_subset_mask( - data, starts_with=starts_with, ends_with=ends_with, - exact_word=exact_word, regex=regex) + data, + starts_with=starts_with, + ends_with=ends_with, + exact_word=exact_word, + regex=regex, + ) return data[mask] -def get_gene_set(data, starts_with=None, ends_with=None, - exact_word=None, regex=None): +def get_gene_set(data, starts_with=None, ends_with=None, exact_word=None, regex=None): """Get a list of genes from data Parameters @@ -221,19 +231,29 @@ def get_gene_set(data, starts_with=None, ends_with=None, try: data = data.columns.to_numpy() except AttributeError: - raise TypeError("data must be a list of gene names or a pandas " - "DataFrame. Got {}".format(type(data).__name__)) - if starts_with is None and ends_with is None and \ - regex is None and exact_word is None: - warnings.warn("No selection conditions provided. " - "Returning all genes.", UserWarning) - return _get_string_subset(data, starts_with=starts_with, - ends_with=ends_with, - exact_word=exact_word, regex=regex) - - -def get_cell_set(data, starts_with=None, ends_with=None, - exact_word=None, regex=None): + raise TypeError( + "data must be a list of gene names or a pandas " + "DataFrame. Got {}".format(type(data).__name__) + ) + if ( + starts_with is None + and ends_with is None + and regex is None + and exact_word is None + ): + warnings.warn( + "No selection conditions provided. " "Returning all genes.", UserWarning + ) + return _get_string_subset( + data, + starts_with=starts_with, + ends_with=ends_with, + exact_word=exact_word, + regex=regex, + ) + + +def get_cell_set(data, starts_with=None, ends_with=None, exact_word=None, regex=None): """Get a list of cells from data Parameters @@ -258,20 +278,37 @@ def get_cell_set(data, starts_with=None, ends_with=None, try: data = data.index.to_numpy() except AttributeError: - raise TypeError("data must be a list of cell names or a pandas " - "DataFrame. Got {}".format(type(data).__name__)) - if starts_with is None and ends_with is None and \ - regex is None and exact_word is None: - warnings.warn("No selection conditions provided. " - "Returning all cells.", UserWarning) - return _get_string_subset(data, starts_with=starts_with, - ends_with=ends_with, - exact_word=exact_word, regex=regex) - - -def select_cols(data, *extra_data, idx=None, - starts_with=None, ends_with=None, - exact_word=None, regex=None): + raise TypeError( + "data must be a list of cell names or a pandas " + "DataFrame. Got {}".format(type(data).__name__) + ) + if ( + starts_with is None + and ends_with is None + and regex is None + and exact_word is None + ): + warnings.warn( + "No selection conditions provided. " "Returning all cells.", UserWarning + ) + return _get_string_subset( + data, + starts_with=starts_with, + ends_with=ends_with, + exact_word=exact_word, + regex=regex, + ) + + +def select_cols( + data, + *extra_data, + idx=None, + starts_with=None, + ends_with=None, + exact_word=None, + regex=None +): """Select columns from a data matrix Parameters @@ -309,19 +346,30 @@ def select_cols(data, *extra_data, idx=None, """ if len(extra_data) > 0: _check_columns_compatible(data, *extra_data) - if idx is None and starts_with is None and ends_with is None and \ - exact_word is None and regex is None: - warnings.warn("No selection conditions provided. 
" - "Returning all columns.", UserWarning) + if ( + idx is None + and starts_with is None + and ends_with is None + and exact_word is None + and regex is None + ): + warnings.warn( + "No selection conditions provided. " "Returning all columns.", UserWarning + ) return tuple([data] + list(extra_data)) if len(extra_data) > 0 else data if idx is None: if not isinstance(data, pd.DataFrame): raise ValueError( "Can only select based on column names with DataFrame input. " - "Please set `idx` to select specific columns.") - idx = get_gene_set(data, starts_with=starts_with, - ends_with=ends_with, - exact_word=exact_word, regex=regex) + "Please set `idx` to select specific columns." + ) + idx = get_gene_set( + data, + starts_with=starts_with, + ends_with=ends_with, + exact_word=exact_word, + regex=regex, + ) if isinstance(idx, pd.DataFrame): idx = _convert_dataframe_1d(idx) @@ -345,9 +393,11 @@ def select_cols(data, *extra_data, idx=None, except (KeyError, TypeError): if isinstance(idx, str): raise - if isinstance(idx, numbers.Integral) or \ - np.issubdtype(idx.dtype, np.dtype(int)) or \ - np.issubdtype(idx.dtype, np.dtype(bool)): + if ( + isinstance(idx, numbers.Integral) + or np.issubdtype(idx.dtype, np.dtype(int)) + or np.issubdtype(idx.dtype, np.dtype(bool)) + ): data = data.loc[:, np.array(data.columns)[idx]] else: raise @@ -358,9 +408,11 @@ def select_cols(data, *extra_data, idx=None, raise TypeError data = data.loc[idx] except (KeyError, TypeError): - if isinstance(idx, numbers.Integral) or \ - np.issubdtype(idx.dtype, np.dtype(int)) or \ - np.issubdtype(idx.dtype, np.dtype(bool)): + if ( + isinstance(idx, numbers.Integral) + or np.issubdtype(idx.dtype, np.dtype(int)) + or np.issubdtype(idx.dtype, np.dtype(bool)) + ): data = data.loc[np.array(data.index)[idx]] else: raise @@ -370,10 +422,15 @@ def select_cols(data, *extra_data, idx=None, data = np.array(data) data = data[idx] else: - if isinstance(data, (sparse.coo_matrix, - sparse.bsr_matrix, - sparse.lil_matrix, - sparse.dia_matrix)): + if isinstance( + data, + ( + sparse.coo_matrix, + sparse.bsr_matrix, + sparse.lil_matrix, + sparse.dia_matrix, + ), + ): data = data.tocsr() if isinstance(idx, pd.Series): idx = utils.toarray(idx) @@ -388,9 +445,15 @@ def select_cols(data, *extra_data, idx=None, return data -def select_rows(data, *extra_data, idx=None, - starts_with=None, ends_with=None, - exact_word=None, regex=None): +def select_rows( + data, + *extra_data, + idx=None, + starts_with=None, + ends_with=None, + exact_word=None, + regex=None +): """Select rows from a data matrix Parameters @@ -428,19 +491,30 @@ def select_rows(data, *extra_data, idx=None, """ if len(extra_data) > 0: _check_rows_compatible(data, *extra_data) - if idx is None and starts_with is None and ends_with is None and \ - exact_word is None and regex is None: - warnings.warn("No selection conditions provided. " - "Returning all rows.", UserWarning) + if ( + idx is None + and starts_with is None + and ends_with is None + and exact_word is None + and regex is None + ): + warnings.warn( + "No selection conditions provided. " "Returning all rows.", UserWarning + ) return tuple([data] + list(extra_data)) if len(extra_data) > 0 else data if idx is None: if not isinstance(data, pd.DataFrame): raise ValueError( "Can only select based on row names with DataFrame input. " - "Please set `idx` to select specific rows.") - idx = get_cell_set(data, starts_with=starts_with, - ends_with=ends_with, - exact_word=exact_word, regex=regex) + "Please set `idx` to select specific rows." 
+ ) + idx = get_cell_set( + data, + starts_with=starts_with, + ends_with=ends_with, + exact_word=exact_word, + regex=regex, + ) if isinstance(idx, pd.DataFrame): idx = _convert_dataframe_1d(idx) @@ -461,15 +535,16 @@ def select_rows(data, *extra_data, idx=None, # temporary workaround for pandas error raise TypeError with warnings.catch_warnings(): - warnings.filterwarnings( - "error", "Passing list-likes to .loc") + warnings.filterwarnings("error", "Passing list-likes to .loc") data = data.loc[idx] except (KeyError, TypeError, FutureWarning): if isinstance(idx, str): raise - if isinstance(idx, numbers.Integral) or \ - np.issubdtype(idx.dtype, np.dtype(int)) or \ - np.issubdtype(idx.dtype, np.dtype(bool)): + if ( + isinstance(idx, numbers.Integral) + or np.issubdtype(idx.dtype, np.dtype(int)) + or np.issubdtype(idx.dtype, np.dtype(bool)) + ): data = data.loc[np.array(data.index)[idx]] else: raise @@ -479,9 +554,7 @@ def select_rows(data, *extra_data, idx=None, data = np.array(data) data = data[idx] else: - if isinstance(data, (sparse.coo_matrix, - sparse.bsr_matrix, - sparse.dia_matrix)): + if isinstance(data, (sparse.coo_matrix, sparse.bsr_matrix, sparse.dia_matrix)): data = data.tocsr() if isinstance(idx, pd.Series): idx = utils.toarray(idx) @@ -528,8 +601,9 @@ def subsample(*data, n=10000, seed=None): return tuple(data) if len(data) > 1 else data[0] -def highly_variable_genes(data, *extra_data, kernel_size=0.05, smooth=5, - cutoff=None, percentile=80): +def highly_variable_genes( + data, *extra_data, kernel_size=0.05, smooth=5, cutoff=None, percentile=80 +): """Select genes with high variability Variability is computed as the deviation from a loess fit @@ -561,8 +635,9 @@ def highly_variable_genes(data, *extra_data, kernel_size=0.05, smooth=5, Filtered extra data, if passed. """ from . 
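For reference, a minimal sketch of the selection helpers reformatted here (illustrative; assumes a pandas DataFrame `data` with gene names as columns):

    import scprep

    # name-based selection requires DataFrame input
    mito_genes = scprep.select.get_gene_set(data, starts_with="MT-")
    mito_data = scprep.select.select_cols(data, idx=mito_genes)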
diff --git a/scprep/stats.py b/scprep/stats.py
index efe3f625..a300e446 100644
--- a/scprep/stats.py
+++ b/scprep/stats.py
@@ -11,6 +11,7 @@
 import warnings

 from ._lazyload import matplotlib
+
 plt = matplotlib.pyplot
@@ -79,13 +80,14 @@ def pairwise_correlation(X, Y):
     # one-by-one
     N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
     sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
-    var_x = N * utils.matrix_sum(utils.matrix_transform(X, np.power, 2),
-                                 axis=0) - (X_colsums**2)
-    var_y = N * utils.matrix_sum(utils.matrix_transform(Y, np.power, 2),
-                                 axis=0) - (Y_colsums**2)
+    var_x = N * utils.matrix_sum(utils.matrix_transform(X, np.power, 2), axis=0) - (
+        X_colsums ** 2
+    )
+    var_y = N * utils.matrix_sum(utils.matrix_transform(Y, np.power, 2), axis=0) - (
+        Y_colsums ** 2
+    )
     # Finally compute Pearson Correlation Coefficient as 2D array
-    cor = ((N_times_sum_xy - sum_x_times_sum_y) /
-           np.sqrt(var_x * var_y[:, None]))
+    cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
     return cor.T
@@ -123,8 +125,9 @@ def mutual_information(x, y, bins=8):
     return mi


-def knnDREMI(x, y, k=10, n_bins=20, n_mesh=3, n_jobs=1,
-             plot=False, return_drevi=False, **kwargs):
+def knnDREMI(
+    x, y, k=10, n_bins=20, n_mesh=3, n_jobs=1, plot=False, return_drevi=False, **kwargs
+):
     """kNN conditional Density Resampled Estimate of Mutual Information

     Calculates k-Nearest Neighbor conditional Density Resampled Estimate of
@@ -198,7 +201,8 @@ def knnDREMI(x, y, k=10, n_bins=20, n_mesh=3, n_jobs=1,
     if np.count_nonzero(x - x[0]) == 0 or np.count_nonzero(y - y[0]) == 0:
         warnings.warn(
             "Attempting to calculate kNN-DREMI on a constant array. Returning `0`",
-            UserWarning)
+            UserWarning,
+        )
         # constant input: mutual information is numerically zero
         if return_drevi:
             return 0, None
         return 0

     if not isinstance(k, numbers.Integral):
-        raise ValueError(
-            "Expected k as an integer. Got {}".format(type(k)))
+        raise ValueError("Expected k as an integer. Got {}".format(type(k)))
     if not isinstance(n_bins, numbers.Integral):
-        raise ValueError(
-            "Expected n_bins as an integer. Got {}".format(type(n_bins)))
+        raise ValueError("Expected n_bins as an integer. Got {}".format(type(n_bins)))
     if not isinstance(n_mesh, numbers.Integral):
-        raise ValueError(
-            "Expected n_mesh as an integer. Got {}".format(type(n_mesh)))
+        raise ValueError("Expected n_mesh as an integer. Got {}".format(type(n_mesh)))

     # 0. Z-score X and Y
     x = stats.zscore(x)
     y = stats.zscore(y)
@@ -226,12 +227,14 @@ def knnDREMI(x, y, k=10, n_bins=20, n_mesh=3, n_jobs=1,
     y_mesh = np.linspace(min(y), max(y), ((n_mesh + 1) * n_bins) + 1)

     # calculate the kNN density around the mesh points
-    mesh_points = np.vstack([np.tile(x_mesh, len(y_mesh)),
-                             np.repeat(y_mesh, len(x_mesh))]).T
+    mesh_points = np.vstack(
+        [np.tile(x_mesh, len(y_mesh)), np.repeat(y_mesh, len(x_mesh))]
+    ).T

     # Next, we find the nearest points in the data from the mesh
     knn = neighbors.NearestNeighbors(n_neighbors=k, n_jobs=n_jobs).fit(
-        np.vstack([x, y]).T)  # this is the data
+        np.vstack([x, y]).T
+    )  # this is the data

     # get dists of closests points in data to mesh
     dists, _ = knn.kneighbors(mesh_points)
@@ -240,27 +243,30 @@ def knnDREMI(x, y, k=10, n_bins=20, n_mesh=3, n_jobs=1,
     density = k / area

     # get list of all mesh points that are not bin intersections
-    mesh_mask = np.logical_or(np.isin(mesh_points[:, 0], x_bins),
-                              np.isin(mesh_points[:, 1], y_bins))
+    mesh_mask = np.logical_or(
+        np.isin(mesh_points[:, 0], x_bins), np.isin(mesh_points[:, 1], y_bins)
+    )

     # Sum the densities of each point over the bins
-    bin_density, _, _ = np.histogram2d(mesh_points[~mesh_mask, 0],
-                                       mesh_points[~mesh_mask, 1],
-                                       bins=[x_bins, y_bins],
-                                       weights=density[~mesh_mask])
+    bin_density, _, _ = np.histogram2d(
+        mesh_points[~mesh_mask, 0],
+        mesh_points[~mesh_mask, 1],
+        bins=[x_bins, y_bins],
+        weights=density[~mesh_mask],
+    )
     bin_density = bin_density.T
     # sum the whole grid should be 1
     bin_density = bin_density / np.sum(bin_density)

     # Calculate conditional entropy
     # NB: not using thresholding here; entr(M) calcs -x*log(x) elementwise
-    drevi = bin_density / \
-        np.sum(bin_density, axis=0)  # columns sum to 1
+    drevi = bin_density / np.sum(bin_density, axis=0)  # columns sum to 1
     # calc entropy of each column
     cond_entropies = stats.entropy(drevi, base=2)

     # Mutual information (not normalized)
     marginal_entropy = stats.entropy(
-        np.sum(bin_density, axis=1), base=2)  # entropy of Y
+        np.sum(bin_density, axis=1), base=2
+    )  # entropy of Y

     # Multiply the entropy of each column by the density of each column
     # Conditional entropy is the entropy in Y that isn't exmplained by X
@@ -269,17 +275,25 @@ def knnDREMI(x, y, k=10, n_bins=20, n_mesh=3, n_jobs=1,
     mutual_info = marginal_entropy - conditional_entropy

     # DREMI
-    marginal_entropy_norm = stats.entropy(np.sum(drevi, axis=1),
-                                          base=2)
+    marginal_entropy_norm = stats.entropy(np.sum(drevi, axis=1), base=2)
     cond_sums_norm = np.mean(drevi)
     conditional_entropy_norm = np.sum(cond_entropies * cond_sums_norm)

     dremi = marginal_entropy_norm - conditional_entropy_norm

     if plot:
-        plot_knnDREMI(dremi, mutual_info,
-                      x, y, n_bins, n_mesh,
-                      density, bin_density, drevi, **kwargs)
+        plot_knnDREMI(
+            dremi,
+            mutual_info,
+            x,
+            y,
+            n_bins,
+            n_mesh,
+            density,
+            bin_density,
+            drevi,
+            **kwargs
+        )
     if return_drevi:
         return dremi, drevi
     else:
         return dremi
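A minimal sketch of the estimator above (illustrative only):

    import numpy as np
    import scprep

    x = np.random.normal(size=500)
    y = x ** 2 + np.random.normal(size=500)
    # density-resampled mutual information on a 20x20 bin grid
    dremi = scprep.stats.knnDREMI(x, y, k=10, n_bins=20, n_mesh=3)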
@@ -287,12 +301,24 @@ def knnDREMI(x, y, k=10, n_bins=20, n_mesh=3, n_jobs=1,


 @utils._with_pkg(pkg="matplotlib", min_version=3)
-def plot_knnDREMI(dremi, mutual_info, x, y, n_bins, n_mesh,
-                  density, bin_density, drevi,
-                  figsize=(12, 3.5), filename=None,
-                  xlabel="Feature 1", ylabel="Feature 2",
-                  title_fontsize=18, label_fontsize=16,
-                  dpi=150):
+def plot_knnDREMI(
+    dremi,
+    mutual_info,
+    x,
+    y,
+    n_bins,
+    n_mesh,
+    density,
+    bin_density,
+    drevi,
+    figsize=(12, 3.5),
+    filename=None,
+    xlabel="Feature 1",
+    ylabel="Feature 2",
+    title_fontsize=18,
+    label_fontsize=16,
+    dpi=150,
+):
     """Plot results of DREMI

     Create plots of the data like those seen in
@@ -328,8 +354,9 @@ def plot_knnDREMI(dremi, mutual_info, x, y, n_bins, n_mesh,

     # Plot kNN density
     n = ((n_mesh + 1) * n_bins) + 1
-    axes[1].imshow(np.log(density.reshape(n, n)),
-                   cmap='inferno', origin="lower", aspect="auto")
+    axes[1].imshow(
+        np.log(density.reshape(n, n)), cmap="inferno", origin="lower", aspect="auto"
+    )
     for b in np.linspace(0, n, n_bins + 1):
         axes[1].axhline(b - 0.5, c="grey", linewidth=1)
@@ -342,21 +369,21 @@ def plot_knnDREMI(dremi, mutual_info, x, y, n_bins, n_mesh,
     axes[1].set_xlabel(xlabel, fontsize=label_fontsize)

     # Plot joint probability
-    axes[2].imshow(bin_density,
-                   cmap="inferno", origin="lower", aspect="auto")
+    axes[2].imshow(bin_density, cmap="inferno", origin="lower", aspect="auto")
     axes[2].set_xticks([])
     axes[2].set_yticks([])
-    axes[2].set_title("Joint Prob.\nMI={:.2f}".format(mutual_info),
-                      fontsize=title_fontsize)
+    axes[2].set_title(
+        "Joint Prob.\nMI={:.2f}".format(mutual_info), fontsize=title_fontsize
+    )
     axes[2].set_xlabel(xlabel, fontsize=label_fontsize)

     # Plot conditional probability
-    axes[3].imshow(drevi,
-                   cmap="inferno", origin="lower", aspect="auto")
+    axes[3].imshow(drevi, cmap="inferno", origin="lower", aspect="auto")
     axes[3].set_xticks([])
     axes[3].set_yticks([])
-    axes[3].set_title("Conditional Prob.\nDREMI={:.2f}".format(dremi),
-                      fontsize=title_fontsize)
+    axes[3].set_title(
+        "Conditional Prob.\nDREMI={:.2f}".format(dremi), fontsize=title_fontsize
+    )
     axes[3].set_xlabel(xlabel, fontsize=label_fontsize)

     fig.tight_layout()
@@ -369,8 +396,10 @@ def _preprocess_test_matrices(X, Y):
     X = utils.to_array_or_spmatrix(X)
     Y = utils.to_array_or_spmatrix(Y)
     if not X.shape[1] == Y.shape[1]:
-        raise ValueError("Expected X and Y to have the same number of columns. "
-                         "Got shapes {}, {}".format(X.shape, Y.shape))
+        raise ValueError(
+            "Expected X and Y to have the same number of columns. "
+            "Got shapes {}, {}".format(X.shape, Y.shape)
+        )
     return X, Y
@@ -412,7 +441,7 @@ def t_statistic(X, Y):
     X, Y = _preprocess_test_matrices(X, Y)
     X_std = utils.matrix_std(X, axis=0)
     Y_std = utils.matrix_std(Y, axis=0)
-    paired_std = np.sqrt(X_std**2 / X.shape[0] + Y_std**2 / Y.shape[0])
+    paired_std = np.sqrt(X_std ** 2 / X.shape[0] + Y_std ** 2 / Y.shape[0])
     return mean_difference(X, Y) / paired_std
@@ -433,19 +462,28 @@ def _rank(X, axis=0):
     X_sorted = X[sort_indices].reshape(X.shape)

     # check if an item in the sorted list is the first instance
-    first_obs = np.hstack([np.repeat(True, X.shape[0])[:,np.newaxis],
-                           X_sorted[:,1:] != X_sorted[:,:-1]])
-
-    sort_indices = (np.repeat(np.arange(X.shape[0]), X.shape[1]), rank_ordinal.flatten())
+    first_obs = np.hstack(
+        [
+            np.repeat(True, X.shape[0])[:, np.newaxis],
+            X_sorted[:, 1:] != X_sorted[:, :-1],
+        ]
+    )
+
+    sort_indices = (
+        np.repeat(np.arange(X.shape[0]), X.shape[1]),
+        rank_ordinal.flatten(),
+    )
     rank_dense = first_obs.cumsum(axis=1)[sort_indices].reshape(X.shape)

     offset = np.cumsum(first_obs.sum(axis=1))[:-1] + np.arange(1, first_obs.shape[0])
-    rank_dense = rank_dense + np.r_[0, offset][:,np.newaxis]
+    rank_dense = rank_dense + np.r_[0, offset][:, np.newaxis]

-    first_or_last_obs = np.hstack([first_obs, np.repeat(True, X.shape[0])[:,np.newaxis]])
+    first_or_last_obs = np.hstack(
+        [first_obs, np.repeat(True, X.shape[0])[:, np.newaxis]]
+    )
     rank_min_max = np.nonzero(first_or_last_obs)[1]
-    rank_ave = .5 * (rank_min_max[rank_dense] + rank_min_max[rank_dense - 1] + 1)
-
+    rank_ave = 0.5 * (rank_min_max[rank_dense] + rank_min_max[rank_dense - 1] + 1)
+
     if axis == 0:
         rank_ave = rank_ave.T
     return rank_ave
@@ -459,7 +497,7 @@ def _ranksum(X, sum_idx, axis=0):
         next_fn = X.getrow
     elif axis == 0:
         next_fn = X.getcol
-    for i in range(X.shape[(axis+1) % 2]):
+    for i in range(X.shape[(axis + 1) % 2]):
         coldata = X.getcol(i)
         colrank = _rank(coldata, axis=axis)
         ranksums.append(np.sum(colrank[sum_idx]))
@@ -482,17 +520,16 @@ def rank_sum_statistic(X, Y):
     rank_sum_statistic : list-like, shape=[n_genes]
     """
     X, Y = _preprocess_test_matrices(X, Y)
-    data, labels = utils.combine_batches([X, Y], ['x', 'y'])
-    X_rank_sum = _ranksum(data, labels=='x', axis=0)
+    data, labels = utils.combine_batches([X, Y], ["x", "y"])
+    X_rank_sum = _ranksum(data, labels == "x", axis=0)
     X_u_statistic = X_rank_sum - X.shape[0] * (X.shape[0] + 1) / 2
     Y_u_statistic = X.shape[0] * Y.shape[0] - X_u_statistic
     return np.minimum(X_u_statistic, Y_u_statistic)

-def differential_expression(X, Y,
-                            measure='difference',
-                            direction='up',
-                            gene_names=None,
-                            n_jobs=-2):
+
+def differential_expression(
+    X, Y, measure="difference", direction="up", gene_names=None, n_jobs=-2
+):
     """Calculate the most significant genes between two datasets

     Parameters
@@ -519,15 +556,21 @@ def differential_expression(X, Y,
     result : pd.DataFrame
         Ordered DataFrame with a column "gene" and a column named `measure`.
     """
-    if not direction in ['up', 'down', 'both']:
-        raise ValueError("Expected `direction` in ['up', 'down', 'both']. "
-                         "Got {}".format(direction))
-    if not measure in ['difference', 'emd', 'ttest', 'ranksum']:
-        raise ValueError("Expected `measure` in ['difference', 'emd', 'ttest', 'ranksum']. "
-                         "Got {}".format(measure))
+    if not direction in ["up", "down", "both"]:
+        raise ValueError(
+            "Expected `direction` in ['up', 'down', 'both']. "
+            "Got {}".format(direction)
+        )
+    if not measure in ["difference", "emd", "ttest", "ranksum"]:
+        raise ValueError(
+            "Expected `measure` in ['difference', 'emd', 'ttest', 'ranksum']. "
+            "Got {}".format(measure)
+        )
     if not (len(X.shape) == 2 and len(Y.shape) == 2):
-        raise ValueError("Expected `X` and `Y` to be matrices. "
-                         "Got shapes {}, {}".format(X.shape, Y.shape))
+        raise ValueError(
+            "Expected `X` and `Y` to be matrices. "
+            "Got shapes {}, {}".format(X.shape, Y.shape)
+        )
     [X, Y] = utils.check_consistent_columns([X, Y])
     if gene_names is not None:
         if isinstance(X, pd.DataFrame):
@@ -537,8 +580,10 @@ def differential_expression(X, Y,
             Y = select.select_cols(Y, idx=gene_names)
             gene_names = Y.columns
         if not len(gene_names) == X.shape[1]:
-            raise ValueError("Expected gene_names to have length {}. "
-                             "Got {}".format(X.shape[1], len(gene_names)))
+            raise ValueError(
+                "Expected gene_names to have length {}. "
+                "Got {}".format(X.shape[1], len(gene_names))
+            )
     else:
         if isinstance(X, pd.DataFrame) and isinstance(Y, pd.DataFrame):
             gene_names = X.columns
@@ -551,36 +596,36 @@ def differential_expression(X, Y,
         X = X.tocsr()
     if sparse.issparse(Y):
         Y = Y.tocsr()
-    if measure == 'difference':
+    if measure == "difference":
         difference = mean_difference(X, Y)
-    if measure == 'ttest':
+    if measure == "ttest":
         difference = t_statistic(X, Y)
-    if measure == 'ranksum':
+    if measure == "ranksum":
         difference = rank_sum_statistic(X, Y)
-    elif measure == 'emd':
-        difference = joblib.Parallel(n_jobs)(joblib.delayed(EMD)(
-            select.select_cols(X, idx=i),
-            select.select_cols(Y, idx=i))
-            for i in range(X.shape[1]))
+    elif measure == "emd":
+        difference = joblib.Parallel(n_jobs)(
+            joblib.delayed(EMD)(
+                select.select_cols(X, idx=i), select.select_cols(Y, idx=i)
+            )
+            for i in range(X.shape[1])
+        )
         difference = np.array(difference) * np.sign(mean_difference(X, Y))
-    result = pd.DataFrame({measure : difference}, index=gene_names)
-    if direction == 'up':
+    result = pd.DataFrame({measure: difference}, index=gene_names)
+    if direction == "up":
         result = result.sort_index().sort_values([measure], ascending=False)
-    elif direction == 'down':
+    elif direction == "down":
         result = result.sort_index().sort_values([measure], ascending=True)
-    elif direction == 'both':
-        result['measure_abs'] = np.abs(difference)
-        result = result.sort_index().sort_values(['measure_abs'], ascending=False)
-        del result['measure_abs']
-    result['rank'] = np.arange(result.shape[0])
+    elif direction == "both":
+        result["measure_abs"] = np.abs(difference)
+        result = result.sort_index().sort_values(["measure_abs"], ascending=False)
+        del result["measure_abs"]
+    result["rank"] = np.arange(result.shape[0])
     return result
" + "Got {}".format(data.shape[1], len(gene_names)) + ) data = utils.to_array_or_spmatrix(data) - result = {cluster : differential_expression( - select.select_rows(data, idx=clusters==cluster), - select.select_rows(data, idx=clusters!=cluster), - measure = measure, direction = direction, - gene_names = gene_names, n_jobs = n_jobs) - for cluster in np.unique(clusters)} + result = { + cluster: differential_expression( + select.select_rows(data, idx=clusters == cluster), + select.select_rows(data, idx=clusters != cluster), + measure=measure, + direction=direction, + gene_names=gene_names, + n_jobs=n_jobs, + ) + for cluster in np.unique(clusters) + } return result + def _vector_coerce_dense(x): x = utils.toarray(x) x_1d = x.flatten() if not len(x_1d) == x.shape[0]: - raise ValueError( - "x must be a 1D array. Got shape {}".format(x.shape)) + raise ValueError("x must be a 1D array. Got shape {}".format(x.shape)) return x_1d @@ -640,8 +692,10 @@ def _vector_coerce_two_dense(x, y): y = _vector_coerce_dense(y) except ValueError as e: if "x must be a 1D array. Got shape " in str(e): - raise ValueError("Expected x and y to be 1D arrays. " - "Got shapes x {}, y {}".format(x.shape, y.shape)) + raise ValueError( + "Expected x and y to be 1D arrays. " + "Got shapes x {}, y {}".format(x.shape, y.shape) + ) else: raise e return x, y diff --git a/scprep/transform.py b/scprep/transform.py index 179fd026..502fae01 100644 --- a/scprep/transform.py +++ b/scprep/transform.py @@ -56,23 +56,25 @@ def log(data, pseudocount=1, base=10): """ data_min = utils.matrix_min(data) if pseudocount + data_min <= 0: - raise ValueError("Required pseudocount + min(data) ({}) > 0. " - "Got pseudocount = {}".format(utils.matrix_min(data), - pseudocount)) - elif pseudocount != data_min + 1 and \ - (sparse.issparse(data) or - isinstance(data, pd.SparseDataFrame) or - utils.is_sparse_dataframe(data)): - req = "min(data) + 1 ({})".format(data_min + - 1) if data_min != 0 else "1" - warnings.warn("log transform on sparse data requires " - "pseudocount = {}. Got {}".format( - req, data_min + 1, pseudocount), - RuntimeWarning) + raise ValueError( + "Required pseudocount + min(data) ({}) > 0. " + "Got pseudocount = {}".format(utils.matrix_min(data), pseudocount) + ) + elif pseudocount != data_min + 1 and ( + sparse.issparse(data) + or isinstance(data, pd.SparseDataFrame) + or utils.is_sparse_dataframe(data) + ): + req = "min(data) + 1 ({})".format(data_min + 1) if data_min != 0 else "1" + warnings.warn( + "log transform on sparse data requires " + "pseudocount = {}. Got {}".format(req, data_min + 1, pseudocount), + RuntimeWarning, + ) pseudocount = data_min + 1 if base == 2: log = np.log2 - elif base == 'e': + elif base == "e": log = np.log elif base == 10: log = np.log10 @@ -101,26 +103,34 @@ def arcsinh(data, cofactor=5): ValueError : if cofactor <= 0 """ if cofactor <= 0: - raise ValueError("Expected cofactor > 0 or None. " - "Got {}".format(cofactor)) + raise ValueError("Expected cofactor > 0 or None. " "Got {}".format(cofactor)) if cofactor is not None: data = data / cofactor return utils.matrix_transform(data, np.arcsinh) def sqrt_transform(*args, **kwargs): - warnings.warn("scprep.transform.sqrt_transform is deprecated. Please use " - "scprep.transform.sqrt in future.", FutureWarning) + warnings.warn( + "scprep.transform.sqrt_transform is deprecated. 
Please use " + "scprep.transform.sqrt in future.", + FutureWarning, + ) return sqrt(*args, **kwargs) def log_transform(*args, **kwargs): - warnings.warn("scprep.transform.log_transform is deprecated. Please use " - "scprep.transform.log in future.", FutureWarning) + warnings.warn( + "scprep.transform.log_transform is deprecated. Please use " + "scprep.transform.log in future.", + FutureWarning, + ) return log(*args, **kwargs) def arcsinh_transform(*args, **kwargs): - warnings.warn("scprep.transform.arcsinh_transform is deprecated. Please " - "use scprep.transform.arcsinh in future.", FutureWarning) + warnings.warn( + "scprep.transform.arcsinh_transform is deprecated. Please " + "use scprep.transform.arcsinh in future.", + FutureWarning, + ) return arcsinh(*args, **kwargs) diff --git a/scprep/utils.py b/scprep/utils.py index c805f589..f5b105d3 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -32,8 +32,8 @@ def _version_check(version, min_version=None): # no requirement return True min_version = str(min_version) - min_version_split = re.split(r'[^0-9]+', min_version) - version_split = re.split(r'[^0-9]+', version) + min_version_split = re.split(r"[^0-9]+", min_version) + version_split = re.split(r"[^0-9]+", version) version_major = int(version_split[0]) min_major = int(min_version_split[0]) if min_major > version_major: @@ -62,13 +62,16 @@ def check_version(pkg, min_version=None): except ModuleNotFoundError: raise ModuleNotFoundError( "{0} not found. " - "Please install it with e.g. `pip install --user {0}`".format(pkg)) + "Please install it with e.g. `pip install --user {0}`".format(pkg) + ) if not _version_check(module.__version__, min_version): raise ImportError( "{0}>={1} is required (installed: {2}). " "Please upgrade it with e.g." " `pip install --user --upgrade {0}`".format( - pkg, min_version, module.__version__)) + pkg, min_version, module.__version__ + ) + ) @decorator @@ -104,27 +107,23 @@ def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False): if cutoff is not None: raise ValueError( "Only one of `cutoff` and `percentile` should be given." - "Got cutoff={}, percentile={}".format(cutoff, percentile)) + "Got cutoff={}, percentile={}".format(cutoff, percentile) + ) if not isinstance(percentile, numbers.Number): - return [_get_percentile_cutoff(data, percentile=p) - for p in percentile] + return [_get_percentile_cutoff(data, percentile=p) for p in percentile] if percentile < 1: warnings.warn( "`percentile` expects values between 0 and 100." - "Got {}. Did you mean {}?".format(percentile, - percentile * 100), - UserWarning) + "Got {}. 
Did you mean {}?".format(percentile, percentile * 100), + UserWarning, + ) cutoff = np.percentile(np.array(data).reshape(-1), percentile) elif cutoff is None and required: - raise ValueError( - "One of either `cutoff` or `percentile` must be given.") + raise ValueError("One of either `cutoff` or `percentile` must be given.") return cutoff - -def _get_filter_idx(values, - cutoff, percentile, - keep_cells): +def _get_filter_idx(values, cutoff, percentile, keep_cells): """Return a boolean array to index cells based on a filter Parameters @@ -148,31 +147,40 @@ def _get_filter_idx(values, keep_cells_idx : list-like Boolean retention array """ - cutoff = _get_percentile_cutoff( - values, cutoff, percentile, required=True) + cutoff = _get_percentile_cutoff(values, cutoff, percentile, required=True) if keep_cells is None: if isinstance(cutoff, numbers.Number): - keep_cells = 'above' + keep_cells = "above" else: - keep_cells = 'between' - if keep_cells == 'above': + keep_cells = "between" + if keep_cells == "above": if not isinstance(cutoff, numbers.Number): - raise ValueError("Expected a single cutoff with keep_cells='above'." - " Got {}".format(cutoff)) + raise ValueError( + "Expected a single cutoff with keep_cells='above'." + " Got {}".format(cutoff) + ) keep_cells_idx = values > cutoff - elif keep_cells == 'below': + elif keep_cells == "below": if not isinstance(cutoff, numbers.Number): - raise ValueError("Expected a single cutoff with keep_cells='below'." - " Got {}".format(cutoff)) + raise ValueError( + "Expected a single cutoff with keep_cells='below'." + " Got {}".format(cutoff) + ) keep_cells_idx = values < cutoff - elif keep_cells == 'between': + elif keep_cells == "between": if isinstance(cutoff, numbers.Number) or len(cutoff) != 2: - raise ValueError("Expected cutoff of length 2 with keep_cells='between'." - " Got {}".format(cutoff)) - keep_cells_idx = np.logical_and(values > np.min(cutoff), values < np.max(cutoff)) + raise ValueError( + "Expected cutoff of length 2 with keep_cells='between'." + " Got {}".format(cutoff) + ) + keep_cells_idx = np.logical_and( + values > np.min(cutoff), values < np.max(cutoff) + ) else: - raise ValueError("Expected `keep_cells` in ['above', 'below', 'between']. " - "Got {}".format(keep_cells)) + raise ValueError( + "Expected `keep_cells` in ['above', 'below', 'between']. 
" + "Got {}".format(keep_cells) + ) return keep_cells_idx @@ -237,8 +245,9 @@ def to_array_or_spmatrix(x): x = x.to_coo() elif is_sparse_dataframe(x) or is_sparse_series(x): x = x.sparse.to_coo() - elif isinstance(x, (sparse.spmatrix, np.ndarray, numbers.Number)) and \ - not isinstance(x, np.matrix): + elif isinstance( + x, (sparse.spmatrix, np.ndarray, numbers.Number) + ) and not isinstance(x, np.matrix): pass elif isinstance(x, list): x_out = [] @@ -350,15 +359,17 @@ def matrix_sum(data, axis=None): sums = data.to_coo().sum() else: index = data.index if axis == 1 else data.columns - sums = pd.Series(np.array(data.to_coo().sum(axis)).flatten(), - index=index) + sums = pd.Series( + np.array(data.to_coo().sum(axis)).flatten(), index=index + ) elif is_sparse_dataframe(data): if axis is None: sums = data.sparse.to_coo().sum() else: index = data.index if axis == 1 else data.columns - sums = pd.Series(np.array(data.sparse.to_coo().sum(axis)).flatten(), - index=index) + sums = pd.Series( + np.array(data.sparse.to_coo().sum(axis)).flatten(), index=index + ) elif axis is None: sums = data.to_numpy().sum() else: @@ -425,7 +436,7 @@ def matrix_std(data, axis=None): else: std = np.std(data, axis=axis) if index is not None: - std = pd.Series(std, index=index, name='std') + std = pd.Series(std, index=index, name="std") return std @@ -455,7 +466,8 @@ def matrix_vector_elementwise_multiply(data, multiplier, axis=None): raise RuntimeError( "`data` is square, cannot guess axis from input. " "Please provide `axis=0` to multiply along rows or " - "`axis=1` to multiply along columns.") + "`axis=1` to multiply along columns." + ) elif np.prod(multiplier.shape) == data.shape[0]: axis = 0 elif np.prod(multiplier.shape) == data.shape[1]: @@ -464,21 +476,23 @@ def matrix_vector_elementwise_multiply(data, multiplier, axis=None): raise ValueError( "Expected `multiplier` to be a vector of length " "`data.shape[0]` ({}) or `data.shape[1]` ({}). Got {}".format( - data.shape[0], data.shape[1], multiplier.shape)) + data.shape[0], data.shape[1], multiplier.shape + ) + ) multiplier = toarray(multiplier) if axis == 0: if not np.prod(multiplier.shape) == data.shape[0]: raise ValueError( "Expected `multiplier` to be a vector of length " - "`data.shape[0]` ({}). Got {}".format( - data.shape[0], multiplier.shape)) + "`data.shape[0]` ({}). Got {}".format(data.shape[0], multiplier.shape) + ) multiplier = multiplier.reshape(-1, 1) else: if not np.prod(multiplier.shape) == data.shape[1]: raise ValueError( "Expected `multiplier` to be a vector of length " - "`data.shape[1]` ({}). Got {}".format( - data.shape[1], multiplier.shape)) + "`data.shape[1]` ({}). 
Got {}".format(data.shape[1], multiplier.shape) + ) multiplier = multiplier.reshape(1, -1) if isinstance(data, pd.SparseDataFrame) or is_sparse_dataframe(data): @@ -491,17 +505,25 @@ def matrix_vector_elementwise_multiply(data, multiplier, axis=None): except AttributeError: mult_indices = data[col].values.sp_index.to_int_index().indices new_data = data[col].values.sp_values * multiplier[mult_indices] - data[col].values.sp_values.put(np.arange(data[col].sparse.npoints), - new_data) + data[col].values.sp_values.put( + np.arange(data[col].sparse.npoints), new_data + ) else: for col, mult in zip(data.columns, multiplier): data[col] = data[col] * mult elif isinstance(data, pd.DataFrame): data = data.mul(multiplier.flatten(), axis=axis) elif sparse.issparse(data): - if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix, - sparse.coo_matrix, sparse.bsr_matrix, - sparse.dia_matrix)): + if isinstance( + data, + ( + sparse.lil_matrix, + sparse.dok_matrix, + sparse.coo_matrix, + sparse.bsr_matrix, + sparse.dia_matrix, + ), + ): data = data.tocsr() data = data.multiply(multiplier) else: @@ -595,23 +617,28 @@ def check_consistent_columns(data): matrix_type = type(data[0]) matrix_shape = data[0].shape[1] if issubclass(matrix_type, pd.DataFrame): - if not (np.all([d.shape[1] == matrix_shape for d in data[1:]]) and - np.all([data[0].columns == d.columns for d in data])): + if not ( + np.all([d.shape[1] == matrix_shape for d in data[1:]]) + and np.all([data[0].columns == d.columns for d in data]) + ): common_genes = data[0].columns.values for d in data[1:]: - common_genes = common_genes[np.isin(common_genes, - d.columns.values)] + common_genes = common_genes[np.isin(common_genes, d.columns.values)] for i in range(len(data)): data[i] = data[i][common_genes] - warnings.warn("Input data has inconsistent column names. " - "Subsetting to {} common columns.".format( - len(common_genes)), UserWarning) + warnings.warn( + "Input data has inconsistent column names. " + "Subsetting to {} common columns.".format(len(common_genes)), + UserWarning, + ) else: for d in data[1:]: if not d.shape[1] == matrix_shape: shapes = ", ".join([str(d.shape[1]) for d in data]) - raise ValueError("Expected data all with the same number of " - "columns. Got {}".format(shapes)) + raise ValueError( + "Expected data all with the same number of " + "columns. Got {}".format(shapes) + ) return data @@ -639,31 +666,37 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): Batch labels corresponding to each sample """ if not len(data) == len(batch_labels): - raise ValueError("Expected data ({}) and batch_labels ({}) to be the " - "same length.".format(len(data), len(batch_labels))) + raise ValueError( + "Expected data ({}) and batch_labels ({}) to be the " + "same length.".format(len(data), len(batch_labels)) + ) # check consistent type matrix_type = type(data[0]) if matrix_type is pd.SparseDataFrame: matrix_type = pd.DataFrame - if not issubclass(matrix_type, (np.ndarray, - pd.DataFrame, - sparse.spmatrix)): - raise ValueError("Expected data to contain pandas DataFrames, " - "scipy sparse matrices or numpy arrays. " - "Got {}".format(matrix_type.__name__)) + if not issubclass(matrix_type, (np.ndarray, pd.DataFrame, sparse.spmatrix)): + raise ValueError( + "Expected data to contain pandas DataFrames, " + "scipy sparse matrices or numpy arrays. 
" + "Got {}".format(matrix_type.__name__) + ) for d in data[1:]: if not isinstance(d, matrix_type): types = ", ".join([type(d).__name__ for d in data]) - raise TypeError("Expected data all of the same class. " - "Got {}".format(types)) + raise TypeError( + "Expected data all of the same class. " "Got {}".format(types) + ) data = check_consistent_columns(data) # check append_to_cell_names if append_to_cell_names and not issubclass(matrix_type, pd.DataFrame): - warnings.warn("append_to_cell_names only valid for pd.DataFrame input." - " Got {}".format(matrix_type.__name__), UserWarning) + warnings.warn( + "append_to_cell_names only valid for pd.DataFrame input." + " Got {}".format(matrix_type.__name__), + UserWarning, + ) elif append_to_cell_names is None: if issubclass(matrix_type, pd.DataFrame): if all([isinstance(d.index, pd.RangeIndex) for d in data]): @@ -675,23 +708,29 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): append_to_cell_names = False # concatenate labels - sample_labels = np.concatenate([np.repeat(batch_labels[i], d.shape[0]) - for i, d in enumerate(data)]) + sample_labels = np.concatenate( + [np.repeat(batch_labels[i], d.shape[0]) for i, d in enumerate(data)] + ) # conatenate data if issubclass(matrix_type, pd.DataFrame): data_combined = pd.concat(data) if append_to_cell_names: index = np.concatenate( - [np.core.defchararray.add(np.array(d.index, dtype=str), - "_" + str(batch_labels[i])) - for i, d in enumerate(data)]) + [ + np.core.defchararray.add( + np.array(d.index, dtype=str), "_" + str(batch_labels[i]) + ) + for i, d in enumerate(data) + ] + ) data_combined.index = index elif all([isinstance(d.index, pd.RangeIndex) for d in data]): # rangeindex should still be a rangeindex data_combined = data_combined.reset_index(drop=True) - sample_labels = pd.Series(sample_labels, index=data_combined.index, - name='sample_labels') + sample_labels = pd.Series( + sample_labels, index=data_combined.index, name="sample_labels" + ) elif issubclass(matrix_type, sparse.spmatrix): data_combined = sparse.vstack(data) elif issubclass(matrix_type, np.ndarray): @@ -701,28 +740,39 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): def select_cols(data, idx): - raise RuntimeError("`scprep.utils.select_cols` is deprecated. Use " - "`scprep.select.select_cols` instead.") + raise RuntimeError( + "`scprep.utils.select_cols` is deprecated. Use " + "`scprep.select.select_cols` instead." + ) def select_rows(data, idx): - raise RuntimeError("`scprep.utils.select_rows` is deprecated. Use " - "`scprep.select.select_rows` instead.") + raise RuntimeError( + "`scprep.utils.select_rows` is deprecated. Use " + "`scprep.select.select_rows` instead." + ) def get_gene_set(data, starts_with=None, ends_with=None, regex=None): - raise RuntimeError("`scprep.utils.get_gene_set` is deprecated. Use " - "`scprep.select.get_gene_set` instead.") + raise RuntimeError( + "`scprep.utils.get_gene_set` is deprecated. Use " + "`scprep.select.get_gene_set` instead." + ) def get_cell_set(data, starts_with=None, ends_with=None, regex=None): - raise RuntimeError("`scprep.utils.get_cell_set` is deprecated. Use " - "`scprep.select.get_cell_set` instead.") + raise RuntimeError( + "`scprep.utils.get_cell_set` is deprecated. Use " + "`scprep.select.get_cell_set` instead." + ) def subsample(*data, n=10000, seed=None): - raise RuntimeError("`scprep.utils.subsample` is deprecated. Use " - "`scprep.select.subsample` instead.") + raise RuntimeError( + "`scprep.utils.subsample` is deprecated. 
Use " + "`scprep.select.subsample` instead." + ) + def sort_clusters_by_values(clusters, values): """Sorts `clusters` in increasing order of `values`. @@ -748,12 +798,15 @@ def sort_clusters_by_values(clusters, values): clusters = toarray(clusters) values = toarray(values) if not len(clusters) == len(values): - raise ValueError("Expected clusters ({}) and values ({}) to be the " - "same length.".format(len(clusters), len(values))) + raise ValueError( + "Expected clusters ({}) and values ({}) to be the " + "same length.".format(len(clusters), len(values)) + ) uniq_clusters = np.unique(clusters) means = np.array([np.mean(values[clusters == cl]) for cl in uniq_clusters]) - new_clust_map = {curr_cl: i for i, curr_cl in enumerate( - uniq_clusters[np.argsort(means)])} + new_clust_map = { + curr_cl: i for i, curr_cl in enumerate(uniq_clusters[np.argsort(means)]) + } return np.array([new_clust_map[cl] for cl in clusters]) diff --git a/setup.py b/setup.py index b11d1620..1e891c23 100644 --- a/setup.py +++ b/setup.py @@ -3,78 +3,76 @@ from setuptools import setup, find_packages install_requires = [ - 'numpy>=1.12.0', - 'scipy>=0.18.1', - 'scikit-learn>=0.19.1', - 'decorator>=4.3.0', - 'pandas>=0.25', + "numpy>=1.12.0", + "scipy>=0.18.1", + "scikit-learn>=0.19.1", + "decorator>=4.3.0", + "pandas>=0.25", ] test_requires = [ - 'nose', - 'nose2', - 'fcsparser', - 'tables', - 'h5py', - 'coverage', - 'coveralls', - 'parameterized', - 'requests', + "nose", + "nose2", + "fcsparser", + "tables", + "h5py", + "coverage", + "coveralls", + "parameterized", + "requests", + "black", ] doc_requires = [ - 'sphinx<=1.8.5', - 'sphinxcontrib-napoleon', - 'autodocsumm', - 'ipykernel', - 'nbsphinx', + "sphinx<=1.8.5", + "sphinxcontrib-napoleon", + "autodocsumm", + "ipykernel", + "nbsphinx", ] if sys.version_info[:2] < (3, 5): raise RuntimeError("Python version >=3.5 required.") elif sys.version_info[:2] < (3, 6): - test_requires += ['matplotlib>=3.0,<3.1', 'rpy2>=3.0,<3.1'] + test_requires += ["matplotlib>=3.0,<3.1", "rpy2>=3.0,<3.1"] else: - test_requires += ['matplotlib>=3.0', 'rpy2>=3.0'] + test_requires += ["matplotlib>=3.0", "rpy2>=3.0"] -version_py = os.path.join(os.path.dirname( - __file__), 'scprep', 'version.py') -version = open(version_py).read().strip().split( - '=')[-1].replace('"', '').strip() +version_py = os.path.join(os.path.dirname(__file__), "scprep", "version.py") +version = open(version_py).read().strip().split("=")[-1].replace('"', "").strip() -readme = open('README.rst').read() +readme = open("README.rst").read() -setup(name='scprep', - version=version, - description='scprep', - author='Scott Gigante, Daniel Burkhardt and Jay Stanley, Yale University', - author_email='krishnaswamylab@gmail.com', - packages=find_packages(), - license='GNU General Public License Version 2', - install_requires=install_requires, - extras_require={'test': test_requires, - 'doc': doc_requires}, - test_suite='nose2.collector.collector', - long_description=readme, - url='https://github.com/KrishnaswamyLab/scprep', - download_url="https://github.com/KrishnaswamyLab/scprep/archive/v{}.tar.gz".format( - version), - keywords=['big-data', - 'computational-biology', - ], - classifiers=[ - 'Development Status :: 4 - Beta', - 'Environment :: Console', - 'Framework :: Jupyter', - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'Natural Language :: English', - 'Operating System :: MacOS :: MacOS X', - 'Operating System :: Microsoft :: Windows', - 'Operating System :: POSIX :: Linux', - 'Programming 
Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - ] - ) +setup( + name="scprep", + version=version, + description="scprep", + author="Scott Gigante, Daniel Burkhardt and Jay Stanley, Yale University", + author_email="krishnaswamylab@gmail.com", + packages=find_packages(), + license="GNU General Public License Version 2", + install_requires=install_requires, + extras_require={"test": test_requires, "doc": doc_requires}, + test_suite="nose2.collector.collector", + long_description=readme, + url="https://github.com/KrishnaswamyLab/scprep", + download_url="https://github.com/KrishnaswamyLab/scprep/archive/v{}.tar.gz".format( + version + ), + keywords=["big-data", "computational-biology",], + classifiers=[ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Framework :: Jupyter", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Natural Language :: English", + "Operating System :: MacOS :: MacOS X", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], +) diff --git a/test/_test_lazyload.py b/test/_test_lazyload.py index a4de92a6..c1412cf4 100644 --- a/test/_test_lazyload.py +++ b/test/_test_lazyload.py @@ -5,16 +5,18 @@ def test_lazyload(): - preloaded_modules = set([m.split('.')[0] for m in sys.modules.keys()]) - assert 'scprep' not in preloaded_modules + preloaded_modules = set([m.split(".")[0] for m in sys.modules.keys()]) + assert "scprep" not in preloaded_modules import scprep - postloaded_modules = set([m.split('.')[0] for m in sys.modules.keys()]) + + postloaded_modules = set([m.split(".")[0] for m in sys.modules.keys()]) scprep_loaded = postloaded_modules.difference(preloaded_modules) for module in scprep._lazyload._importspec.keys(): if module in preloaded_modules: - assert getattr( - scprep._lazyload, module).__class__ is type(scprep) + assert getattr(scprep._lazyload, module).__class__ is type(scprep) else: - assert getattr( - scprep._lazyload, module).__class__ is scprep._lazyload.AliasModule + assert ( + getattr(scprep._lazyload, module).__class__ + is scprep._lazyload.AliasModule + ) assert module not in scprep_loaded, module diff --git a/test/test_filter.py b/test/test_filter.py index a14c0372..d43af317 100644 --- a/test/test_filter.py +++ b/test/test_filter.py @@ -9,7 +9,6 @@ class Test10X(unittest.TestCase): - @classmethod def setUpClass(self): self.X_dense = data.load_10X(sparse=False) @@ -21,27 +20,34 @@ def test_filter_empty_cells(self): assert X_filtered.shape[1] == self.X_dense.shape[1] assert not np.any(X_filtered.sum(1) == 0) matrix.test_all_matrix_types( - self.X_dense, utils.assert_transform_equals, - Y=X_filtered, transform=scprep.filter.filter_empty_cells) + self.X_dense, + utils.assert_transform_equals, + Y=X_filtered, + transform=scprep.filter.filter_empty_cells, + ) sample_labels = np.arange(self.X_dense.shape[0]) sample_labels_filt = sample_labels[self.X_dense.sum(1) > 0] X_filtered_2, sample_labels = scprep.filter.filter_empty_cells( - self.X_dense, sample_labels) + self.X_dense, sample_labels + ) assert X_filtered_2.shape[0] == len(sample_labels) assert np.all(sample_labels == sample_labels_filt) assert np.all(X_filtered_2 == X_filtered) def test_filter_duplicates(self): - 
unique_idx = np.sort( - np.unique(self.X_dense, axis=0, return_index=True)[1]) + unique_idx = np.sort(np.unique(self.X_dense, axis=0, return_index=True)[1]) X_filtered = np.array(self.X_dense)[unique_idx] matrix.test_all_matrix_types( - self.X_dense, utils.assert_transform_equals, - Y=X_filtered, transform=scprep.filter.filter_duplicates) + self.X_dense, + utils.assert_transform_equals, + Y=X_filtered, + transform=scprep.filter.filter_duplicates, + ) sample_labels = np.arange(self.X_dense.shape[0]) sample_labels_filt = sample_labels[unique_idx] X_filtered_2, sample_labels = scprep.filter.filter_duplicates( - self.X_dense, sample_labels) + self.X_dense, sample_labels + ) assert X_filtered_2.shape[0] == len(sample_labels) assert np.all(sample_labels == sample_labels_filt) assert np.all(X_filtered_2 == X_filtered) @@ -50,7 +56,8 @@ def test_filter_empty_cells_sample_label(self): sample_labels = np.arange(self.X_dense.shape[0]) sample_labels_filt = sample_labels[self.X_dense.sum(1) > 0] X_filtered, sample_labels = scprep.filter.filter_empty_cells( - self.X_dense, sample_labels) + self.X_dense, sample_labels + ) assert X_filtered.shape[0] == len(sample_labels) assert np.all(sample_labels == sample_labels_filt) @@ -59,65 +66,80 @@ def test_filter_empty_cells_sparse(self): assert X_filtered.shape[1] == self.X_sparse.shape[1] assert not np.any(X_filtered.sum(1) == 0) matrix.test_all_matrix_types( - self.X_sparse, utils.assert_transform_equals, - Y=X_filtered, transform=scprep.filter.filter_empty_cells) + self.X_sparse, + utils.assert_transform_equals, + Y=X_filtered, + transform=scprep.filter.filter_empty_cells, + ) def test_filter_empty_genes(self): X_filtered = scprep.filter.filter_empty_genes(self.X_dense) assert X_filtered.shape[0] == self.X_dense.shape[0] assert not np.any(X_filtered.sum(0) == 0) matrix.test_all_matrix_types( - self.X_dense, utils.assert_transform_equals, - Y=X_filtered, transform=scprep.filter.filter_empty_genes) + self.X_dense, + utils.assert_transform_equals, + Y=X_filtered, + transform=scprep.filter.filter_empty_genes, + ) def test_filter_empty_genes_sparse(self): X_filtered = scprep.filter.filter_empty_genes(self.X_sparse) assert X_filtered.shape[0] == self.X_sparse.shape[0] assert not np.any(X_filtered.sum(0) == 0) matrix.test_all_matrix_types( - self.X_sparse, utils.assert_transform_equals, - Y=X_filtered, transform=scprep.filter.filter_empty_genes) + self.X_sparse, + utils.assert_transform_equals, + Y=X_filtered, + transform=scprep.filter.filter_empty_genes, + ) def test_filter_rare_genes(self): X_filtered = scprep.filter.filter_rare_genes(self.X_dense) assert X_filtered.shape[0] == self.X_dense.shape[0] assert not np.any(X_filtered.sum(0) < 5) matrix.test_all_matrix_types( - self.X_dense, utils.assert_transform_equals, - Y=X_filtered, transform=scprep.filter.filter_rare_genes) - + self.X_dense, + utils.assert_transform_equals, + Y=X_filtered, + transform=scprep.filter.filter_rare_genes, + ) def test_library_size_filter(self): - X_filtered = scprep.filter.filter_library_size( - self.X_sparse, cutoff=100) + X_filtered = scprep.filter.filter_library_size(self.X_sparse, cutoff=100) assert X_filtered.shape[1] == self.X_sparse.shape[1] assert not np.any(X_filtered.sum(1) <= 100) X_filtered, libsize = scprep.filter.filter_library_size( - self.X_sparse, cutoff=100, return_library_size=True) + self.X_sparse, cutoff=100, return_library_size=True + ) assert np.all(scprep.measure.library_size(X_filtered) == libsize) matrix.test_all_matrix_types( - self.X_sparse, 
utils.assert_transform_equals, - Y=X_filtered, transform=partial( - scprep.filter.filter_library_size, cutoff=100)) + self.X_sparse, + utils.assert_transform_equals, + Y=X_filtered, + transform=partial(scprep.filter.filter_library_size, cutoff=100), + ) X_filtered = scprep.filter.filter_library_size( - self.X_sparse, cutoff=100, keep_cells='below') + self.X_sparse, cutoff=100, keep_cells="below" + ) assert X_filtered.shape[1] == self.X_sparse.shape[1] assert not np.any(X_filtered.sum(1) >= 100) def test_library_size_filter_below(self): X_filtered = scprep.filter.filter_library_size( - self.X_sparse, cutoff=100, keep_cells='below') + self.X_sparse, cutoff=100, keep_cells="below" + ) assert X_filtered.shape[1] == self.X_sparse.shape[1] assert not np.any(X_filtered.sum(1) >= 100) def test_library_size_filter_between(self): - X_filtered = scprep.filter.filter_library_size( - self.X_sparse, cutoff=(50, 100)) + X_filtered = scprep.filter.filter_library_size(self.X_sparse, cutoff=(50, 100)) assert X_filtered.shape[1] == self.X_sparse.shape[1] assert not np.any(X_filtered.sum(1) >= 100) assert not np.any(X_filtered.sum(1) <= 50) X_filtered = scprep.filter.filter_library_size( - self.X_sparse, percentile=(20, 80)) + self.X_sparse, percentile=(20, 80) + ) assert X_filtered.shape[1] == self.X_sparse.shape[1] assert not np.any(X_filtered.sum(1) >= np.percentile(self.libsize, 80)) assert not np.any(X_filtered.sum(1) <= np.percentile(self.libsize, 20)) @@ -127,181 +149,268 @@ def test_library_size_filter_error(self): ValueError, "Expected `keep_cells` in ['above', 'below', 'between']. Got invalid", scprep.filter.filter_library_size, - self.X_sparse, cutoff=100, keep_cells='invalid') + self.X_sparse, + cutoff=100, + keep_cells="invalid", + ) assert_raise_message( ValueError, "Expected cutoff of length 2 with keep_cells='between'. Got 100", scprep.filter.filter_library_size, - self.X_sparse, cutoff=100, keep_cells='between') + self.X_sparse, + cutoff=100, + keep_cells="between", + ) assert_raise_message( ValueError, "Expected a single cutoff with keep_cells='above'. Got (50, 100)", scprep.filter.filter_library_size, - self.X_sparse, cutoff=(50, 100), keep_cells='above') + self.X_sparse, + cutoff=(50, 100), + keep_cells="above", + ) assert_raise_message( ValueError, "Expected a single cutoff with keep_cells='below'. 
Got (50, 100)", scprep.filter.filter_library_size, - self.X_sparse, cutoff=(50, 100), keep_cells='below') + self.X_sparse, + cutoff=(50, 100), + keep_cells="below", + ) def test_library_size_filter_sample_label(self): - sample_labels = pd.DataFrame(np.random.choice([0, 1], self.X_dense.shape[0]), - index=self.X_dense.index) + sample_labels = pd.DataFrame( + np.random.choice([0, 1], self.X_dense.shape[0]), index=self.X_dense.index + ) sample_labels_filt = sample_labels.loc[self.X_dense.sum(1) > 100] X_filtered, sample_labels_filt2 = scprep.filter.filter_library_size( - self.X_dense, sample_labels, cutoff=100) + self.X_dense, sample_labels, cutoff=100 + ) assert X_filtered.shape[0] == len(sample_labels_filt2) assert np.all(np.all(sample_labels_filt2 == sample_labels_filt)) def test_gene_expression_filter_below(self): genes = np.arange(10) X_filtered = scprep.filter.filter_gene_set_expression( - self.X_sparse, genes=genes, percentile=90, - library_size_normalize=False) + self.X_sparse, genes=genes, percentile=90, library_size_normalize=False + ) gene_cols = np.array(self.X_sparse.columns)[genes] assert X_filtered.shape[1] == self.X_sparse.shape[1] assert np.max(np.sum(self.X_sparse[gene_cols], axis=1)) > np.max( - np.sum(X_filtered[gene_cols], axis=1)) + np.sum(X_filtered[gene_cols], axis=1) + ) matrix.test_all_matrix_types( - self.X_sparse, utils.assert_transform_equals, - Y=X_filtered, transform=partial( - scprep.filter.filter_gene_set_expression, genes=genes, - percentile=90, keep_cells='below', - library_size_normalize=False)) + self.X_sparse, + utils.assert_transform_equals, + Y=X_filtered, + transform=partial( + scprep.filter.filter_gene_set_expression, + genes=genes, + percentile=90, + keep_cells="below", + library_size_normalize=False, + ), + ) def test_gene_expression_filter_above(self): genes = np.arange(10) gene_cols = np.array(self.X_sparse.columns)[genes] X_filtered = scprep.filter.filter_gene_set_expression( - self.X_sparse, genes=genes, percentile=10, keep_cells='above', - library_size_normalize=False) + self.X_sparse, + genes=genes, + percentile=10, + keep_cells="above", + library_size_normalize=False, + ) assert X_filtered.shape[1] == self.X_sparse.shape[1] assert np.min(np.sum(self.X_sparse[gene_cols], axis=1)) < np.min( - np.sum(X_filtered[gene_cols], axis=1)) + np.sum(X_filtered[gene_cols], axis=1) + ) matrix.test_all_matrix_types( - self.X_sparse, utils.assert_transform_equals, - Y=X_filtered, transform=partial( - scprep.filter.filter_gene_set_expression, genes=genes, - percentile=10, keep_cells='above', - library_size_normalize=False)) + self.X_sparse, + utils.assert_transform_equals, + Y=X_filtered, + transform=partial( + scprep.filter.filter_gene_set_expression, + genes=genes, + percentile=10, + keep_cells="above", + library_size_normalize=False, + ), + ) def test_gene_expression_libsize(self): genes = np.arange(10) X_filtered = scprep.filter.filter_gene_set_expression( - self.X_sparse, genes=genes, percentile=10, keep_cells='above', - library_size_normalize=True) + self.X_sparse, + genes=genes, + percentile=10, + keep_cells="above", + library_size_normalize=True, + ) X_libsize = scprep.normalize.library_size_normalize(self.X_sparse) Y = scprep.filter.filter_gene_set_expression( - X_libsize, genes=genes, percentile=10, keep_cells='above', - library_size_normalize=False) + X_libsize, + genes=genes, + percentile=10, + keep_cells="above", + library_size_normalize=False, + ) assert X_filtered.shape == Y.shape assert np.all(X_filtered.index == Y.index) def 
test_gene_expression_filter_sample_label(self): genes = np.arange(10) - sample_labels = pd.DataFrame(np.arange(self.X_dense.shape[0]), - index=self.X_dense.index) + sample_labels = pd.DataFrame( + np.arange(self.X_dense.shape[0]), index=self.X_dense.index + ) X_filtered, sample_labels = scprep.filter.filter_gene_set_expression( - self.X_dense, sample_labels, genes=genes, percentile=90) + self.X_dense, sample_labels, genes=genes, percentile=90 + ) assert X_filtered.shape[0] == len(sample_labels) def test_gene_expression_filter_warning(self): genes = np.arange(10) - no_genes = 'not_a_gene' + no_genes = "not_a_gene" assert_warns_message( UserWarning, "`percentile` expects values between 0 and 100." "Got 0.9. Did you mean 90.0?", scprep.filter.filter_gene_set_expression, - self.X_sparse, genes=genes, percentile=0.90, keep_cells='below') + self.X_sparse, + genes=genes, + percentile=0.90, + keep_cells="below", + ) assert_raise_message( ValueError, "Only one of `cutoff` and `percentile` should be given.", scprep.filter.filter_gene_set_expression, - self.X_sparse, genes=genes, percentile=0.90, cutoff=50) + self.X_sparse, + genes=genes, + percentile=0.90, + cutoff=50, + ) assert_raise_message( ValueError, - "Expected `keep_cells` in ['above', 'below', 'between']. " - "Got neither", + "Expected `keep_cells` in ['above', 'below', 'between']. " "Got neither", scprep.filter.filter_gene_set_expression, - self.X_sparse, genes=genes, percentile=90.0, keep_cells='neither') + self.X_sparse, + genes=genes, + percentile=90.0, + keep_cells="neither", + ) assert_warns_message( UserWarning, "`percentile` expects values between 0 and 100." "Got 0.9. Did you mean 90.0?", scprep.filter.filter_gene_set_expression, - self.X_sparse, genes=genes, percentile=0.90, keep_cells='below') + self.X_sparse, + genes=genes, + percentile=0.90, + keep_cells="below", + ) assert_raise_message( ValueError, "One of either `cutoff` or `percentile` must be given.", scprep.filter.filter_gene_set_expression, - self.X_sparse, genes=genes, cutoff=None, percentile=None) + self.X_sparse, + genes=genes, + cutoff=None, + percentile=None, + ) assert_raise_message( KeyError, "not_a_gene", scprep.filter.filter_gene_set_expression, - self.X_sparse, genes=no_genes, percentile=90.0, keep_cells='below') + self.X_sparse, + genes=no_genes, + percentile=90.0, + keep_cells="below", + ) def filter_series(self): libsize = scprep.measure.library_size(self.X_sparse) - libsize_filt = scprep.filter.filter_values( - libsize, libsize, cutoff=100) + libsize_filt = scprep.filter.filter_values(libsize, libsize, cutoff=100) assert np.all(libsize_filt > 100) def test_deprecated_remove(self): - assert_warns_message(DeprecationWarning, - "`scprep.filter.remove_empty_genes` is deprecated. Use " - "`scprep.filter.filter_empty_genes` instead.", - scprep.filter.remove_empty_genes, - self.X_dense) - assert_warns_message(DeprecationWarning, - "`scprep.filter.remove_rare_genes` is deprecated. Use " - "`scprep.filter.filter_rare_genes` instead.", - scprep.filter.remove_rare_genes, - self.X_dense) - assert_warns_message(DeprecationWarning, - "`scprep.filter.remove_empty_cells` is deprecated. Use " - "`scprep.filter.filter_empty_cells` instead.", - scprep.filter.remove_empty_cells, - self.X_dense) - assert_warns_message(DeprecationWarning, - "`scprep.filter.remove_duplicates` is deprecated. 
Use " - "`scprep.filter.filter_duplicates` instead.", - scprep.filter.remove_duplicates, - self.X_dense) + assert_warns_message( + DeprecationWarning, + "`scprep.filter.remove_empty_genes` is deprecated. Use " + "`scprep.filter.filter_empty_genes` instead.", + scprep.filter.remove_empty_genes, + self.X_dense, + ) + assert_warns_message( + DeprecationWarning, + "`scprep.filter.remove_rare_genes` is deprecated. Use " + "`scprep.filter.filter_rare_genes` instead.", + scprep.filter.remove_rare_genes, + self.X_dense, + ) + assert_warns_message( + DeprecationWarning, + "`scprep.filter.remove_empty_cells` is deprecated. Use " + "`scprep.filter.filter_empty_cells` instead.", + scprep.filter.remove_empty_cells, + self.X_dense, + ) + assert_warns_message( + DeprecationWarning, + "`scprep.filter.remove_duplicates` is deprecated. Use " + "`scprep.filter.filter_duplicates` instead.", + scprep.filter.remove_duplicates, + self.X_dense, + ) def test_deprecated_sample_labels(self): sample_labels = np.arange(self.X_dense.shape[0]) - assert_warns_message(DeprecationWarning, - "`sample_labels` is deprecated. " - "Passing `sample_labels` as `extra_data`.", - scprep.filter.filter_empty_cells, - self.X_dense, sample_labels=sample_labels) - assert_warns_message(DeprecationWarning, - "`sample_labels` is deprecated. " - "Passing `sample_labels` as `extra_data`.", - scprep.filter.filter_duplicates, - self.X_dense, sample_labels=sample_labels) - assert_warns_message(DeprecationWarning, - "`sample_labels` is deprecated. " - "Passing `sample_labels` as `extra_data`.", - scprep.filter.filter_library_size, - self.X_dense, cutoff=10, sample_labels=sample_labels) - assert_warns_message(DeprecationWarning, - "`filter_per_sample` is deprecated. " - "Filtering as a single sample.", - scprep.filter.filter_library_size, - self.X_dense, cutoff=10, filter_per_sample=True) + assert_warns_message( + DeprecationWarning, + "`sample_labels` is deprecated. " + "Passing `sample_labels` as `extra_data`.", + scprep.filter.filter_empty_cells, + self.X_dense, + sample_labels=sample_labels, + ) + assert_warns_message( + DeprecationWarning, + "`sample_labels` is deprecated. " + "Passing `sample_labels` as `extra_data`.", + scprep.filter.filter_duplicates, + self.X_dense, + sample_labels=sample_labels, + ) + assert_warns_message( + DeprecationWarning, + "`sample_labels` is deprecated. " + "Passing `sample_labels` as `extra_data`.", + scprep.filter.filter_library_size, + self.X_dense, + cutoff=10, + sample_labels=sample_labels, + ) + assert_warns_message( + DeprecationWarning, + "`filter_per_sample` is deprecated. 
" "Filtering as a single sample.", + scprep.filter.filter_library_size, + self.X_dense, + cutoff=10, + filter_per_sample=True, + ) def test_large_sparse_dataframe_library_size(): matrix._ignore_pandas_sparse_warning() - X = pd.SparseDataFrame(sparse.coo_matrix((10**7, 2 * 10**4)), - default_fill_value=0.0) + X = pd.SparseDataFrame( + sparse.coo_matrix((10 ** 7, 2 * 10 ** 4)), default_fill_value=0.0 + ) cell_sums = scprep.measure.library_size(X) assert cell_sums.shape[0] == X.shape[0] matrix._reset_warnings() - X = matrix.SparseDataFrame(sparse.coo_matrix((10**7, 2 * 10**4)), - default_fill_value=0.0) + X = matrix.SparseDataFrame( + sparse.coo_matrix((10 ** 7, 2 * 10 ** 4)), default_fill_value=0.0 + ) cell_sums = scprep.measure.library_size(X) assert cell_sums.shape[0] == X.shape[0] diff --git a/test/test_hdf5.py b/test/test_hdf5.py index 78c3f319..65ca6b76 100644 --- a/test/test_hdf5.py +++ b/test/test_hdf5.py @@ -14,7 +14,7 @@ def test_failed_import_tables(): tables = scprep.io.hdf5.tables del scprep.io.hdf5.tables assert hdf5_available() is True - with tables.File(h5_file, 'r') as f: + with tables.File(h5_file, "r") as f: assert scprep.io.hdf5._is_tables(f) is False with scprep.io.hdf5.open_file(h5_file) as f: assert scprep.io.hdf5._is_h5py(f) @@ -26,7 +26,7 @@ def test_failed_import_h5py(): h5py = scprep.io.hdf5.h5py del scprep.io.hdf5.h5py assert hdf5_available() is True - with h5py.File(h5_file, 'r') as f: + with h5py.File(h5_file, "r") as f: assert scprep.io.hdf5._is_h5py(f) is False scprep.io.hdf5.h5py = h5py @@ -36,32 +36,43 @@ def test_failed_import_both(): del scprep.io.hdf5.tables h5py = scprep.io.hdf5.h5py del scprep.io.hdf5.h5py - assert_raise_message(ImportError, - "Found neither tables nor h5py. " - "Please install one of them with e.g. " - "`pip install --user tables` or " - "`pip install --user h5py`", - hdf5_available) + assert_raise_message( + ImportError, + "Found neither tables nor h5py. " + "Please install one of them with e.g. " + "`pip install --user tables` or " + "`pip install --user h5py`", + hdf5_available, + ) scprep.io.hdf5.tables = tables scprep.io.hdf5.h5py = h5py def test_list_nodes_invalid(): - assert_raise_message(TypeError, - "Expected h5py.File, tables.File, h5py.Group or " - "tables.Group. Got ", - scprep.io.hdf5.list_nodes, 'invalid') + assert_raise_message( + TypeError, + "Expected h5py.File, tables.File, h5py.Group or " + "tables.Group. Got ", + scprep.io.hdf5.list_nodes, + "invalid", + ) def test_get_node_invalid(): - assert_raise_message(TypeError, - "Expected h5py.File, tables.File, h5py.Group or " - "tables.Group. Got ", - scprep.io.hdf5.get_node, 'invalid', 'node') + assert_raise_message( + TypeError, + "Expected h5py.File, tables.File, h5py.Group or " + "tables.Group. Got ", + scprep.io.hdf5.get_node, + "invalid", + "node", + ) def test_get_values_invalid(): - assert_raise_message(TypeError, - "Expected h5py.Dataset or tables.CArray. " - "Got ", - scprep.io.hdf5.get_values, 'invalid') + assert_raise_message( + TypeError, + "Expected h5py.Dataset or tables.CArray. 
" "Got ", + scprep.io.hdf5.get_values, + "invalid", + ) diff --git a/test/test_io.py b/test/test_io.py index 7c34edf8..73a969a1 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -1,7 +1,11 @@ from tools import data, utils import scprep import scprep.io.utils -from sklearn.utils.testing import assert_warns_message, assert_raise_message, assert_raises +from sklearn.utils.testing import ( + assert_warns_message, + assert_raise_message, + assert_raises, +) import pandas as pd import numpy as np from scipy import sparse @@ -14,7 +18,6 @@ class TestMatrixToDataFrame(unittest.TestCase): - @classmethod def setUpClass(self): self.X_dense = data.load_10X(sparse=False) @@ -67,42 +70,66 @@ def test_matrix_to_dataframe_no_names_dataframe_dense(self): utils.assert_matrix_class_equivalent(Y, self.X_dense) def test_matrix_to_dataframe_names_sparse(self): - Y = scprep.io.utils._matrix_to_data_frame(self.X_dense, cell_names=self.cell_names, - gene_names=self.gene_names, sparse=True) + Y = scprep.io.utils._matrix_to_data_frame( + self.X_dense, + cell_names=self.cell_names, + gene_names=self.gene_names, + sparse=True, + ) assert scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) utils.assert_matrix_class_equivalent(Y, self.X_sparse) - Y = scprep.io.utils._matrix_to_data_frame(self.X_sparse, cell_names=self.cell_names, - gene_names=self.gene_names, sparse=True) + Y = scprep.io.utils._matrix_to_data_frame( + self.X_sparse, + cell_names=self.cell_names, + gene_names=self.gene_names, + sparse=True, + ) assert scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) utils.assert_matrix_class_equivalent(Y, self.X_sparse) - Y = scprep.io.utils._matrix_to_data_frame(self.X_numpy, cell_names=self.cell_names, - gene_names=self.gene_names, sparse=True) + Y = scprep.io.utils._matrix_to_data_frame( + self.X_numpy, + cell_names=self.cell_names, + gene_names=self.gene_names, + sparse=True, + ) assert scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) utils.assert_matrix_class_equivalent(Y, self.X_sparse) def test_matrix_to_dataframe_names_dense(self): - Y = scprep.io.utils._matrix_to_data_frame(self.X_dense, cell_names=self.cell_names, - gene_names=self.gene_names, sparse=False) + Y = scprep.io.utils._matrix_to_data_frame( + self.X_dense, + cell_names=self.cell_names, + gene_names=self.gene_names, + sparse=False, + ) assert isinstance(Y, pd.DataFrame) assert not scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) utils.assert_matrix_class_equivalent(Y, self.X_dense) - Y = scprep.io.utils._matrix_to_data_frame(self.X_sparse, cell_names=self.cell_names, - gene_names=self.gene_names, sparse=False) + Y = scprep.io.utils._matrix_to_data_frame( + self.X_sparse, + cell_names=self.cell_names, + gene_names=self.gene_names, + sparse=False, + ) assert isinstance(Y, pd.DataFrame) assert not scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) assert np.all(scprep.utils.toarray(Y) == self.X_numpy) utils.assert_matrix_class_equivalent(Y, self.X_dense) - Y = scprep.io.utils._matrix_to_data_frame(self.X_numpy, cell_names=self.cell_names, - gene_names=self.gene_names, sparse=False) + Y = scprep.io.utils._matrix_to_data_frame( + self.X_numpy, + cell_names=self.cell_names, + 
gene_names=self.gene_names, + sparse=False, + ) assert isinstance(Y, pd.DataFrame) assert not scprep.utils.is_sparse_dataframe(Y) assert not isinstance(Y, pd.SparseDataFrame) @@ -119,14 +146,16 @@ def test_10X_duplicate_gene_names(): scprep.io.load_10X, os.path.join(data.data_dir, "test_10X_duplicate_gene_names"), gene_labels="symbol", - sparse=True) + sparse=True, + ) assert_warns_message( RuntimeWarning, "Duplicate gene names detected! Forcing dense matrix", scprep.io.load_10X, os.path.join(data.data_dir, "test_10X_duplicate_gene_names"), allow_duplicates=True, - sparse=True) + sparse=True, + ) def test_10X(): @@ -134,71 +163,70 @@ def test_10X(): assert X.shape == (100, 100) assert scprep.utils.is_sparse_dataframe(X) assert X.columns[0] == "Arl8b" - X = data.load_10X(gene_labels='id', sparse=False) + X = data.load_10X(gene_labels="id", sparse=False) assert X.shape == (100, 100) assert isinstance(X, pd.DataFrame) assert not scprep.utils.is_sparse_dataframe(X) assert X.columns[0] == "ENSMUSG00000030105" - X = data.load_10X(gene_labels='both') + X = data.load_10X(gene_labels="both") assert X.shape == (100, 100) assert scprep.utils.is_sparse_dataframe(X) assert X.columns[0] == "Arl8b (ENSMUSG00000030105)" X_cellranger3 = scprep.io.load_10X( - os.path.join(data.data_dir, "test_10X_cellranger3"), - gene_labels="both") + os.path.join(data.data_dir, "test_10X_cellranger3"), gene_labels="both" + ) np.testing.assert_array_equal(X.index, X_cellranger3.index) np.testing.assert_array_equal(X.columns, X_cellranger3.columns) np.testing.assert_array_equal(X.index, X_cellranger3.index) assert_raise_message( ValueError, - "gene_labels='invalid' not recognized. " - "Choose from ['symbol', 'id', 'both']", + "gene_labels='invalid' not recognized. " "Choose from ['symbol', 'id', 'both']", data.load_10X, - gene_labels='invalid') + gene_labels="invalid", + ) assert_raise_message( FileNotFoundError, - "{} is not a directory".format( - os.path.join(data.data_dir, "test_10X.zip")), + "{} is not a directory".format(os.path.join(data.data_dir, "test_10X.zip")), scprep.io.load_10X, - os.path.join(data.data_dir, "test_10X.zip")) + os.path.join(data.data_dir, "test_10X.zip"), + ) assert_raise_message( FileNotFoundError, "'matrix.mtx(.gz)', '[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)' must be present " "in {}".format(data.data_dir), scprep.io.load_10X, - data.data_dir) + data.data_dir, + ) def test_10X_zip(): X = data.load_10X() filename = os.path.join(data.data_dir, "test_10X.zip") - X_zip = scprep.io.load_10X_zip( - filename) + X_zip = scprep.io.load_10X_zip(filename) assert scprep.utils.is_sparse_dataframe(X_zip) assert np.sum(np.sum(X != X_zip)) == 0 np.testing.assert_array_equal(X.columns, X_zip.columns) np.testing.assert_array_equal(X.index, X_zip.index) assert_raise_message( ValueError, - "gene_labels='invalid' not recognized. " - "Choose from ['symbol', 'id', 'both']", + "gene_labels='invalid' not recognized. " "Choose from ['symbol', 'id', 'both']", scprep.io.load_10X_zip, filename, - gene_labels='invalid') + gene_labels="invalid", + ) assert_raise_message( ValueError, "Expected a single zipped folder containing 'matrix.mtx(.gz)', " "'[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)'. 
Got ", scprep.io.load_10X_zip, - os.path.join(data.data_dir, "test_10X_invalid.zip") + os.path.join(data.data_dir, "test_10X_invalid.zip"), ) def test_10X_zip_url(): X = data.load_10X() filename = "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X.zip" - X_zip = scprep.io.load_10X_zip( - filename) + X_zip = scprep.io.load_10X_zip(filename) assert scprep.utils.is_sparse_dataframe(X_zip) assert np.sum(np.sum(X != X_zip)) == 0 np.testing.assert_array_equal(X.columns, X_zip.columns) @@ -210,14 +238,14 @@ def test_10X_zip_url_not_a_zip(): zipfile.BadZipFile, "File is not a zip file", scprep.io.load_10X_zip, - "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X") + "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X", + ) def test_10X_zip_url_not_a_real_website(): assert_raises( - urllib.error.URLError, - scprep.io.load_10X_zip, - 'http://invalid.not.a.url/scprep') + urllib.error.URLError, scprep.io.load_10X_zip, "http://invalid.not.a.url/scprep" + ) def test_10X_zip_url_404(): @@ -225,7 +253,8 @@ def test_10X_zip_url_404(): urllib.error.HTTPError, "HTTP Error 404: Not Found", scprep.io.load_10X_zip, - 'https://github.com/KrishnaswamyLab/scprep/invalid_url') + "https://github.com/KrishnaswamyLab/scprep/invalid_url", + ) def test_10X_zip_not_a_file(): @@ -233,7 +262,8 @@ def test_10X_zip_not_a_file(): FileNotFoundError, "No such file: 'not_a_file.zip'", scprep.io.load_10X_zip, - 'not_a_file.zip') + "not_a_file.zip", + ) def test_10X_HDF5(): @@ -246,13 +276,13 @@ def test_10X_HDF5(): np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit tables backend - X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='tables') + X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend="tables") assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit h5py backend - X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='h5py') + X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend="h5py") assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) @@ -278,13 +308,13 @@ def test_10X_HDF5_cellranger3(): np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit tables backend - X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='tables') + X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend="tables") assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) np.testing.assert_array_equal(X.index, X_hdf5.index) # explicit h5py backend - X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend='h5py') + X_hdf5 = scprep.io.load_10X_HDF5(h5_file, backend="h5py") assert scprep.utils.is_sparse_dataframe(X_hdf5) assert np.sum(np.sum(X != X_hdf5)) == 0 np.testing.assert_array_equal(X.columns, X_hdf5.columns) @@ -304,11 +334,11 @@ def test_10X_HDF5_invalid_genome(): h5_file = os.path.join(data.data_dir, "test_10X.h5") assert_raise_message( ValueError, - "Genome invalid not found in {}. " - "Available genomes: GRCh38".format(h5_file), + "Genome invalid not found in {}. 
" "Available genomes: GRCh38".format(h5_file), scprep.io.load_10X_HDF5, filename=h5_file, - genome="invalid") + genome="invalid", + ) def test_10X_HDF5_genome_cellranger3(): @@ -320,7 +350,8 @@ def test_10X_HDF5_genome_cellranger3(): "https://github.com/KrishnaswamyLab/scprep/issues", scprep.io.load_10X_HDF5, filename=h5_file, - genome="GRCh38") + genome="GRCh38", + ) def test_10X_HDF5_invalid_backend(): @@ -330,37 +361,46 @@ def test_10X_HDF5_invalid_backend(): "Expected backend in ['tables', 'h5py']. Got invalid", scprep.io.load_10X_HDF5, filename=h5_file, - backend="invalid") + backend="invalid", + ) def test_10X_HDF5_invalid_gene_labels(): h5_file = os.path.join(data.data_dir, "test_10X.h5") assert_raise_message( ValueError, - "gene_labels='invalid' not recognized. " - "Choose from ['symbol', 'id', 'both']", + "gene_labels='invalid' not recognized. " "Choose from ['symbol', 'id', 'both']", scprep.io.load_10X_HDF5, filename=h5_file, - gene_labels='invalid') + gene_labels="invalid", + ) def test_csv_and_tsv(): X = data.load_10X() filename = os.path.join(data.data_dir, "test_small.csv") X_csv = scprep.io.load_csv( - os.path.join(data.data_dir, "test_small.csv"), - gene_names=True, cell_names=True) + os.path.join(data.data_dir, "test_small.csv"), gene_names=True, cell_names=True + ) X_csv2 = scprep.io.load_csv( os.path.join(data.data_dir, "test_small.csv"), - gene_names=True, cell_names=None, index_col=0) + gene_names=True, + cell_names=None, + index_col=0, + ) X_csv3 = scprep.io.load_csv( os.path.join(data.data_dir, "test_small.csv"), - gene_names=None, cell_names=True, header=0) + gene_names=None, + cell_names=True, + header=0, + ) X_csv4 = scprep.io.load_csv( os.path.join(data.data_dir, "test_small.csv"), - gene_names=True, cell_names=True, cell_axis='col') - X_tsv = scprep.io.load_tsv( - os.path.join(data.data_dir, "test_small.tsv")) + gene_names=True, + cell_names=True, + cell_axis="col", + ) + X_tsv = scprep.io.load_tsv(os.path.join(data.data_dir, "test_small.tsv")) assert np.sum(np.sum(X != X_csv)) == 0 assert np.sum(np.sum(X_csv != X_csv2)) == 0 assert np.sum(np.sum(X_csv != X_csv3)) == 0 @@ -378,12 +418,11 @@ def test_csv_and_tsv(): assert not scprep.utils.is_sparse_dataframe(X_csv) X_csv = scprep.io.load_csv( os.path.join(data.data_dir, "test_small.csv"), - gene_names=os.path.join( - data.data_dir, "gene_symbols.csv"), - cell_names=os.path.join( - data.data_dir, "barcodes.tsv"), + gene_names=os.path.join(data.data_dir, "gene_symbols.csv"), + cell_names=os.path.join(data.data_dir, "barcodes.tsv"), skiprows=1, - usecols=range(1, 101)) + usecols=range(1, 101), + ) assert np.sum(np.sum(X != X_csv)) == 0 np.testing.assert_array_equal(X.columns, X_csv.columns) np.testing.assert_array_equal(X.index, X_csv.index) @@ -394,7 +433,8 @@ def test_csv_and_tsv(): gene_names=X.columns, cell_names=X.index, skiprows=1, - usecols=range(1, 101)) + usecols=range(1, 101), + ) assert np.sum(np.sum(X != X_csv)) == 0 np.testing.assert_array_equal(X.columns, X_csv.columns) np.testing.assert_array_equal(X.index, X_csv.index) @@ -406,20 +446,22 @@ def test_csv_and_tsv(): cell_names=None, sparse=True, skiprows=1, - usecols=range(1, 101)) + usecols=range(1, 101), + ) assert np.sum(np.sum(X.to_numpy() != X_csv.to_numpy())) == 0 assert scprep.utils.is_sparse_dataframe(X_csv) X_csv = scprep.io.load_csv( - os.path.join(data.data_dir, - "test_small_duplicate_gene_names.csv")) - assert 'DUPLICATE' in X_csv.columns - assert 'DUPLICATE.1' in X_csv.columns + os.path.join(data.data_dir, 
"test_small_duplicate_gene_names.csv") + ) + assert "DUPLICATE" in X_csv.columns + assert "DUPLICATE.1" in X_csv.columns assert_raise_message( ValueError, - "cell_axis neither not recognized. " - "Expected 'row' or 'column'", - scprep.io.load_csv, filename, - cell_axis='neither') + "cell_axis neither not recognized. " "Expected 'row' or 'column'", + scprep.io.load_csv, + filename, + cell_axis="neither", + ) def test_mtx(): @@ -427,42 +469,38 @@ def test_mtx(): filename = os.path.join(data.data_dir, "test_10X", "matrix.mtx.gz") X_mtx = scprep.io.load_mtx( filename, - gene_names=os.path.join( - data.data_dir, "gene_symbols.csv"), - cell_names=os.path.join( - data.data_dir, "barcodes.tsv"), - cell_axis="column") + gene_names=os.path.join(data.data_dir, "gene_symbols.csv"), + cell_names=os.path.join(data.data_dir, "barcodes.tsv"), + cell_axis="column", + ) assert np.sum(np.sum(X.to_numpy() != X_mtx.to_numpy())) == 0 np.testing.assert_array_equal(X.columns, X_mtx.columns) np.testing.assert_array_equal(X.index, X_mtx.index) assert scprep.utils.is_sparse_dataframe(X_mtx) X_mtx = scprep.io.load_mtx( - filename, - gene_names=X.columns, - cell_names=X.index, - cell_axis="column") + filename, gene_names=X.columns, cell_names=X.index, cell_axis="column" + ) assert np.sum(np.sum(X.to_numpy() != X_mtx.to_numpy())) == 0 np.testing.assert_array_equal(X.columns, X_mtx.columns) np.testing.assert_array_equal(X.index, X_mtx.index) assert scprep.utils.is_sparse_dataframe(X_mtx) X_mtx = scprep.io.load_mtx( - filename, - gene_names=None, - cell_names=None, - sparse=False, - cell_axis="column") + filename, gene_names=None, cell_names=None, sparse=False, cell_axis="column" + ) assert np.sum(np.sum(X.to_numpy() != X_mtx)) == 0 assert isinstance(X_mtx, np.ndarray) assert_raise_message( ValueError, - "cell_axis neither not recognized. " - "Expected 'row' or 'column'", - scprep.io.load_mtx, filename, - cell_axis='neither') + "cell_axis neither not recognized. 
" "Expected 'row' or 'column'", + scprep.io.load_mtx, + filename, + cell_axis="neither", + ) X_mtx = scprep.io.load_mtx( filename, - gene_names=np.arange(X.shape[1]).astype('str'), - cell_names=np.arange(X.shape[0])) + gene_names=np.arange(X.shape[1]).astype("str"), + cell_names=np.arange(X.shape[0]), + ) assert X_mtx.shape == (100, 100) assert scprep.utils.is_sparse_dataframe(X_mtx) assert X_mtx.columns[0] == "0" @@ -473,16 +511,16 @@ def test_save_mtx(): filename = os.path.join(data.data_dir, "test_10X", "matrix.mtx.gz") X = scprep.io.load_mtx( filename, - gene_names=os.path.join( - data.data_dir, "gene_symbols.csv"), - cell_names=os.path.join( - data.data_dir, "barcodes.tsv"), - cell_axis="column") + gene_names=os.path.join(data.data_dir, "gene_symbols.csv"), + cell_names=os.path.join(data.data_dir, "barcodes.tsv"), + cell_axis="column", + ) scprep.io.save_mtx(X, "test_mtx") Y = scprep.io.load_mtx( "test_mtx/matrix.mtx", gene_names="test_mtx/gene_names.tsv", - cell_names="test_mtx/cell_names.tsv") + cell_names="test_mtx/cell_names.tsv", + ) np.testing.assert_array_equal(X, Y) assert np.all(X.index == Y.index) assert np.all(X.columns == Y.columns) @@ -493,16 +531,17 @@ def test_fcs(): path = fcsparser.test_sample_path meta, data = fcsparser.parse(path) _, _, X = scprep.io.load_fcs(path) - assert 'Time' not in X.columns + assert "Time" not in X.columns assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) np.testing.assert_array_equal(X.to_numpy(), data[X.columns].to_numpy()) _, _, X = scprep.io.load_fcs(path, sparse=True) - assert 'Time' not in X.columns + assert "Time" not in X.columns assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) np.testing.assert_array_equal( - X.sparse.to_dense().to_numpy(), data[X.columns].to_numpy()) + X.sparse.to_dense().to_numpy(), data[X.columns].to_numpy() + ) X_meta, _, X = scprep.io.load_fcs(path, reformat_meta=False, override=True) assert set(meta.keys()) == set(X_meta.keys()) @@ -529,13 +568,13 @@ def test_fcs_reformat_meta(): np.testing.assert_array_equal(meta[key], int(X_meta[key]), key) elif key == "_channels_": for column in meta[key].columns: - X_column = X_meta[key][column].astype( - meta[key][column].dtype) + X_column = X_meta[key][column].astype(meta[key][column].dtype) np.testing.assert_array_equal( - meta[key][column], X_column, key + column) + meta[key][column], X_column, key + column + ) else: raise - assert 'Time' not in X.columns + assert "Time" not in X.columns assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) np.testing.assert_array_equal(X.values, data[X.columns].values) @@ -543,10 +582,10 @@ def test_fcs_reformat_meta(): def test_fcs_PnN(): path = fcsparser.test_sample_path - meta, data = fcsparser.parse(path, reformat_meta=True, - channel_naming='$PnN') - X_meta, _, X = scprep.io.load_fcs(path, reformat_meta=True, - channel_naming='$PnN', override=True) + meta, data = fcsparser.parse(path, reformat_meta=True, channel_naming="$PnN") + X_meta, _, X = scprep.io.load_fcs( + path, reformat_meta=True, channel_naming="$PnN", override=True + ) assert set(meta.keys()) == set(X_meta.keys()) for key in meta.keys(): try: @@ -556,13 +595,13 @@ def test_fcs_PnN(): np.testing.assert_array_equal(meta[key], int(X_meta[key]), key) elif key == "_channels_": for column in meta[key].columns: - X_column = X_meta[key][column].astype( - meta[key][column].dtype) + X_column = 
X_meta[key][column].astype(meta[key][column].dtype)
                     np.testing.assert_array_equal(
-                        meta[key][column], X_column, key + column)
+                        meta[key][column], X_column, key + column
+                    )
             else:
                 raise
-    assert 'Time' not in X.columns
+    assert "Time" not in X.columns
     assert len(set(X.columns).difference(data.columns)) == 0
     np.testing.assert_array_equal(X.index, data.index)
     np.testing.assert_array_equal(X.values, data[X.columns].values)
@@ -575,19 +614,23 @@ def test_fcs_file_error():
         " a malformed header. You can try using "
         "`override=True` to use scprep's built-in "
         "experimental FCS parser.".format(
-            os.path.join(data.data_dir, "test_small.csv")),
+            os.path.join(data.data_dir, "test_small.csv")
+        ),
         scprep.io.load_fcs,
-        os.path.join(data.data_dir, "test_small.csv"))
+        os.path.join(data.data_dir, "test_small.csv"),
+    )
 
 
 def test_fcs_naming_error():
     path = fcsparser.test_sample_path
     assert_raise_message(
         ValueError,
-        "Expected channel_naming in ['$PnS', '$PnN']. "
-        "Got 'invalid'",
-        scprep.io.load_fcs, path,
-        override=True, channel_naming="invalid")
+        "Expected channel_naming in ['$PnS', '$PnN']. " "Got 'invalid'",
+        scprep.io.load_fcs,
+        path,
+        override=True,
+        channel_naming="invalid",
+    )
 
 
 def test_parse_header():
@@ -596,44 +639,61 @@ def test_parse_header():
     assert_raise_message(
         ValueError,
         "Expected 5 entries in gene_names. Got 10",
-        scprep.io.utils._parse_header, header1, 5)
+        scprep.io.utils._parse_header,
+        header1,
+        5,
+    )
     assert_raise_message(
         ValueError,
         "Expected 50 entries in {}. Got 100".format(os.path.abspath(header2)),
-        scprep.io.utils._parse_header, header2, 50)
+        scprep.io.utils._parse_header,
+        header2,
+        50,
+    )
+
 
 def test_download_google_drive():
     id = "1_T5bRqbid5mtuDYnyusoGvujc6fW1UKv"
     dest = "test.txt"
     scprep.io.download.download_google_drive(id, dest)
     assert os.path.isfile(dest)
-    with open(dest, 'r') as f:
+    with open(dest, "r") as f:
         data = f.read()
-    assert data == 'test\n', data
+    assert data == "test\n", data
     os.remove(dest)
+
 
 def test_download_google_drive_large():
     id = "1FDDSWtSZcdQUVKpk-mPCZ8Ji1Fx8KSz9"
     response = scprep.io.download._GET_google_drive(id)
    assert response.status_code == 200
    response.close()
+
 
 def test_download_url():
     X = data.load_10X()
-    scprep.io.download.download_url("https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X/matrix.mtx.gz", "url_test.mtx.gz")
+    scprep.io.download.download_url(
+        "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X/matrix.mtx.gz",
+        "url_test.mtx.gz",
+    )
     Y = scprep.io.load_mtx("url_test.mtx.gz").T
     assert (X.sparse.to_coo() - Y).nnz == 0
     os.remove("url_test.mtx.gz")
+
 
 def test_download_zip():
     X = data.load_10X()
-    scprep.io.download.download_and_extract_zip("https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X.zip", "zip_test")
+    scprep.io.download.download_and_extract_zip(
+        "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_10X.zip",
+        "zip_test",
+    )
     Y = scprep.io.load_10X("zip_test/test_10X")
     assert np.all(X == Y)
     assert np.all(X.index == Y.index)
     assert np.all(X.columns == Y.columns)
     shutil.rmtree("zip_test")
+
 
 def test_unzip_no_destination():
     X = data.load_10X()
     filename = os.path.join(data.data_dir, "test_10X.zip")
@@ -648,6 +708,7 @@ def test_unzip_no_destination():
     assert np.all(X.columns == Y.columns)
     shutil.rmtree("zip_test")
+
 
 def test_unzip_destination():
     X = data.load_10X()
     filename = os.path.join(data.data_dir, "test_10X.zip")
@@ -659,4 +720,4 @@ def test_unzip_destination():
     assert np.all(X == Y)
     assert np.all(X.index == Y.index)
     assert np.all(X.columns == Y.columns)
-    shutil.rmtree("zip_test")
\ No newline at end of file
+    shutil.rmtree("zip_test")
diff --git a/test/test_lazyload.py b/test/test_lazyload.py
index 2e5e40f2..e7eeebab 100644
--- a/test/test_lazyload.py
+++ b/test/test_lazyload.py
@@ -7,13 +7,15 @@ def test_lazyload():
     proc = subprocess.Popen(
-        ['nose2', '--quiet', '_test_lazyload'],
-        cwd=os.path.join(data._get_root_dir(), "test"), stderr=subprocess.PIPE)
+        ["nose2", "--quiet", "_test_lazyload"],
+        cwd=os.path.join(data._get_root_dir(), "test"),
+        stderr=subprocess.PIPE,
+    )
     return_code = proc.wait()
     try:
         assert return_code == 0
     except AssertionError:
-        lines = proc.stderr.read().decode().split('\n')
+        lines = proc.stderr.read().decode().split("\n")
         lines = lines[4:-6]
         raise AssertionError("\n".join(lines))
     finally:
@@ -26,12 +28,11 @@ def test_builtins():
             del sys.modules[module]
         except KeyError:
             pass
-        assert getattr(
-            scprep._lazyload, module).__class__ is scprep._lazyload.AliasModule
+        assert (
+            getattr(scprep._lazyload, module).__class__ is scprep._lazyload.AliasModule
+        )
        try:
-            getattr(
-                scprep._lazyload, module).__version__
+            getattr(scprep._lazyload, module).__version__
         except AttributeError:
             pass
-        assert getattr(
-            scprep._lazyload, module).__class__ is type(scprep)
+        assert getattr(scprep._lazyload, module).__class__ is type(scprep)
diff --git a/test/test_measure.py b/test/test_measure.py
index e54e61f1..711d3356 100644
--- a/test/test_measure.py
+++ b/test/test_measure.py
@@ -9,69 +9,83 @@
 class TestGeneSetExpression(unittest.TestCase):
-
     @classmethod
     def setUpClass(self):
         self.X_dense = data.load_10X(sparse=False)
         self.X_sparse = data.load_10X(sparse=True)
-        self.Y = scprep.measure.gene_set_expression(self.X_dense,
-                                                    genes="Arl8b")
+        self.Y = scprep.measure.gene_set_expression(self.X_dense, genes="Arl8b")
 
     def test_setup(self):
         assert self.Y.shape[0] == self.X_dense.shape[0]
-        utils.assert_all_equal(self.Y, scprep.select.select_cols(
-            self.X_dense, idx="Arl8b"))
+        utils.assert_all_equal(
+            self.Y, scprep.select.select_cols(self.X_dense, idx="Arl8b")
+        )
 
     def test_single_pandas(self):
         matrix.test_pandas_matrix_types(
-            self.X_dense, utils.assert_transform_equals,
-            Y=self.Y, transform=scprep.measure.gene_set_expression,
-            genes="Arl8b")
+            self.X_dense,
+            utils.assert_transform_equals,
+            Y=self.Y,
+            transform=scprep.measure.gene_set_expression,
+            genes="Arl8b",
+        )
 
     def test_array_pandas(self):
         matrix.test_pandas_matrix_types(
-            self.X_dense, utils.assert_transform_equals,
-            Y=self.Y, transform=scprep.measure.gene_set_expression,
-            genes=["Arl8b"])
+            self.X_dense,
+            utils.assert_transform_equals,
+            Y=self.Y,
+            transform=scprep.measure.gene_set_expression,
+            genes=["Arl8b"],
+        )
 
     def test_starts_with_pandas(self):
         matrix.test_pandas_matrix_types(
-            self.X_dense, utils.assert_transform_equals,
-            Y=self.Y, transform=scprep.measure.gene_set_expression,
-            starts_with="Arl8b")
+            self.X_dense,
+            utils.assert_transform_equals,
+            Y=self.Y,
+            transform=scprep.measure.gene_set_expression,
+            starts_with="Arl8b",
+        )
 
     def test_single_all(self):
         matrix.test_all_matrix_types(
-            self.X_dense, utils.assert_transform_equals,
-            Y=self.Y, transform=scprep.measure.gene_set_expression,
-            genes=0)
+            self.X_dense,
+            utils.assert_transform_equals,
+            Y=self.Y,
+            transform=scprep.measure.gene_set_expression,
+            genes=0,
+        )
 
     def test_array_all(self):
         matrix.test_all_matrix_types(
-            self.X_dense, utils.assert_transform_equals,
-            Y=self.Y, transform=scprep.measure.gene_set_expression,
-
genes=[0])
+            self.X_dense,
+            utils.assert_transform_equals,
+            Y=self.Y,
+            transform=scprep.measure.gene_set_expression,
+            genes=[0],
+        )
 
     def test_library_size(self):
         def test_fun(X):
             x = scprep.measure.library_size(X)
-            assert x.name == 'library_size'
+            assert x.name == "library_size"
             assert np.all(x.index == self.X_dense.index)
-        matrix.test_pandas_matrix_types(
-            self.X_dense, test_fun)
+
+        matrix.test_pandas_matrix_types(self.X_dense, test_fun)
 
     def test_gene_set_expression(self):
         def test_fun(X):
             x = scprep.measure.gene_set_expression(X, genes=[0, 1])
-            assert x.name == 'expression'
+            assert x.name == "expression"
             assert np.all(x.index == self.X_dense.index)
-        matrix.test_pandas_matrix_types(
-            self.X_dense, test_fun)
+
+        matrix.test_pandas_matrix_types(self.X_dense, test_fun)
 
     def test_variable_genes(self):
         def test_fun(X):
             x = scprep.measure.gene_variability(X)
-            assert x.name == 'variability'
+            assert x.name == "variability"
             assert np.all(x.index == self.X_dense.columns)
-        matrix.test_pandas_matrix_types(
-            self.X_dense, test_fun)
+
+        matrix.test_pandas_matrix_types(self.X_dense, test_fun)
diff --git a/test/test_normalize.py b/test/test_normalize.py
index 29e24d42..9b582a88 100644
--- a/test/test_normalize.py
+++ b/test/test_normalize.py
@@ -8,69 +8,93 @@
 class TestNormalize(unittest.TestCase):
-
     @classmethod
     def setUpClass(self):
         self.X = data.generate_positive_sparse_matrix()
         self.libsize = self.X.sum(axis=1)
         self.median = np.median(self.libsize)
         self.mean = np.mean(self.X.sum(axis=1))
-        self.X_norm = normalize(self.X, 'l1')
-        self.sample_idx = np.random.choice(
-            [0, 1], self.X.shape[0], replace=True)
+        self.X_norm = normalize(self.X, "l1")
+        self.sample_idx = np.random.choice([0, 1], self.X.shape[0], replace=True)
 
     def test_libsize_norm_rescale_median(self):
         Y = self.X_norm * self.median
         utils.assert_all_close(Y.sum(1), np.median(np.sum(self.X, 1)))
         Y2, libsize2 = scprep.normalize.library_size_normalize(
-            self.X, rescale='median', return_library_size=True)
+            self.X, rescale="median", return_library_size=True
+        )
         np.testing.assert_allclose(Y, Y2)
         np.testing.assert_allclose(self.libsize, libsize2)
         matrix.test_all_matrix_types(
-            self.X, utils.assert_transform_equivalent, Y=Y,
+            self.X,
+            utils.assert_transform_equivalent,
+            Y=Y,
             transform=scprep.normalize.library_size_normalize,
-            rescale='median',
-            check=utils.assert_all_close)
+            rescale="median",
+            check=utils.assert_all_close,
+        )
 
     def test_libsize_norm_return_libsize(self):
         def test_fun(*args, **kwargs):
             return scprep.normalize.library_size_normalize(
-                *args, return_library_size=True, **kwargs)[1]
+                *args, return_library_size=True, **kwargs
+            )[1]
+
         matrix.test_all_matrix_types(
-            self.X, utils.assert_transform_equals, Y=self.libsize,
+            self.X,
+            utils.assert_transform_equals,
+            Y=self.libsize,
             transform=test_fun,
-            check=utils.assert_all_close)
+            check=utils.assert_all_close,
+        )
 
     def test_libsize_norm_return_libsize_rescale_constant(self):
         def test_fun(*args, **kwargs):
             return scprep.normalize.library_size_normalize(
-                *args, return_library_size=True, rescale=1, **kwargs)[1]
+                *args, return_library_size=True, rescale=1, **kwargs
+            )[1]
+
         matrix.test_all_matrix_types(
-            self.X, utils.assert_transform_equals, Y=self.libsize,
+            self.X,
+            utils.assert_transform_equals,
+            Y=self.libsize,
             transform=test_fun,
-            check=utils.assert_all_close)
+            check=utils.assert_all_close,
+        )
 
     def test_libsize_norm_rescale_mean(self):
         Y = self.X_norm * self.mean
         utils.assert_all_close(Y.sum(1), np.mean(np.sum(self.X, 1)))
         matrix.test_all_matrix_types(
-            self.X, utils.assert_transform_equivalent, Y=Y,
+            self.X,
+            utils.assert_transform_equivalent,
+            Y=Y,
             transform=scprep.normalize.library_size_normalize,
-            check=utils.assert_all_close, rescale='mean')
+            check=utils.assert_all_close,
+            rescale="mean",
+        )
 
     def test_libsize_norm_rescale_none(self):
         Y = self.X_norm
         matrix.test_all_matrix_types(
-            self.X, utils.assert_transform_equivalent, Y=Y,
+            self.X,
+            utils.assert_transform_equivalent,
+            Y=Y,
             transform=scprep.normalize.library_size_normalize,
-            check=utils.assert_all_close, rescale=None)
+            check=utils.assert_all_close,
+            rescale=None,
+        )
 
     def test_libsize_norm_rescale_integer(self):
         Y = self.X_norm
         matrix.test_all_matrix_types(
-            self.X, utils.assert_transform_equivalent, Y=Y,
+            self.X,
+            utils.assert_transform_equivalent,
+            Y=Y,
             transform=scprep.normalize.library_size_normalize,
-            check=utils.assert_all_close, rescale=1)
+            check=utils.assert_all_close,
+            rescale=1,
+        )
 
     def test_libsize_norm_rescale_invalid(self):
         assert_raise_message(
@@ -78,49 +102,60 @@ def test_libsize_norm_rescale_invalid(self):
             "Expected rescale in ['median', 'mean'], a number or `None`. "
             "Got invalid",
             scprep.normalize.library_size_normalize,
-            self.X, rescale='invalid')
+            self.X,
+            rescale="invalid",
+        )
 
     def test_libsize_norm_median_zero(self):
         X = self.X.copy()
-        X[:X.shape[0] // 2 + 1] = 0
+        X[: X.shape[0] // 2 + 1] = 0
         assert_warns_message(
             UserWarning,
-            "Median library size is zero. "
-            "Rescaling to mean instead.",
+            "Median library size is zero. " "Rescaling to mean instead.",
             scprep.normalize.library_size_normalize,
-            X, rescale='median')
+            X,
+            rescale="median",
+        )
 
     def test_batch_mean_center(self):
         X = self.X.copy()
         X[self.sample_idx == 1] += 1
         Y = X.copy()
-        Y[self.sample_idx == 0] -= np.mean(
-            Y[self.sample_idx == 0], axis=0)[None, :]
-        Y[self.sample_idx == 1] -= np.mean(
-            Y[self.sample_idx == 1], axis=0)[None, :]
+        Y[self.sample_idx == 0] -= np.mean(Y[self.sample_idx == 0], axis=0)[None, :]
+        Y[self.sample_idx == 1] -= np.mean(Y[self.sample_idx == 1], axis=0)[None, :]
         utils.assert_all_close(np.mean(Y[self.sample_idx == 0], axis=0), 0)
         utils.assert_all_close(np.mean(Y[self.sample_idx == 1], axis=0), 0)
         matrix.test_dense_matrix_types(
-            X, utils.assert_transform_equivalent, Y=Y,
+            X,
+            utils.assert_transform_equivalent,
+            Y=Y,
             transform=scprep.normalize.batch_mean_center,
-            sample_idx=self.sample_idx)
+            sample_idx=self.sample_idx,
+        )
 
     def test_batch_mean_center_sparse(self):
         matrix.test_sparse_matrix_types(
-            self.X, utils.assert_transform_raises,
+            self.X,
+            utils.assert_transform_raises,
             transform=scprep.normalize.batch_mean_center,
             sample_idx=self.sample_idx,
-            exception=ValueError)
+            exception=ValueError,
+        )
 
     def test_batch_mean_center_one_sample(self):
         Y = self.X.copy()
         Y -= np.mean(Y, axis=0)[None, :]
         matrix.test_dense_matrix_types(
-            self.X, utils.assert_transform_equivalent, Y=Y,
-            transform=scprep.normalize.batch_mean_center)
+            self.X,
+            utils.assert_transform_equivalent,
+            Y=Y,
+            transform=scprep.normalize.batch_mean_center,
+        )
 
     def test_batch_mean_center_sparse_one_sample(self):
         matrix.test_sparse_matrix_types(
-            self.X, utils.assert_transform_raises,
+            self.X,
+            utils.assert_transform_raises,
             transform=scprep.normalize.batch_mean_center,
-            exception=ValueError)
+            exception=ValueError,
+        )
diff --git a/test/test_patch.py b/test/test_patch.py
index 150a92c7..5e537530 100644
--- a/test/test_patch.py
+++ b/test/test_patch.py
@@ -11,14 +11,18 @@ def test_pandas_series_rmatmul():
     ser = pd.Series(arr)
     np.testing.assert_array_equal(mat @ ser,
(df @ ser).values) + def test_pandas_sparse_iloc(): - X = pd.DataFrame([[0,1,1], [0,0,1], [0,0,0]]).astype(pd.SparseDtype(float, fill_value=0.0)) - assert np.all(~np.isnan(X.iloc[[0,1]].to_numpy())) + X = pd.DataFrame([[0, 1, 1], [0, 0, 1], [0, 0, 0]]).astype( + pd.SparseDtype(float, fill_value=0.0) + ) + assert np.all(~np.isnan(X.iloc[[0, 1]].to_numpy())) class CustomBlock(ExtensionBlock): _holder = np.ndarray + def test_fill_value(): values = pd.Series(np.arange(3), dtype=pd.UInt16Dtype()) custom_block = CustomBlock(values, placement=slice(1, 2)) diff --git a/test/test_plot.py b/test/test_plot.py index 16bab573..03a5db08 100644 --- a/test/test_plot.py +++ b/test/test_plot.py @@ -22,105 +22,109 @@ def try_remove(filename): def test_default_matplotlibrc(): - for key in ['axes.labelsize', - 'axes.titlesize', - 'figure.titlesize', - 'legend.fontsize', - 'legend.title_fontsize', - 'xtick.labelsize', - 'ytick.labelsize']: + for key in [ + "axes.labelsize", + "axes.titlesize", + "figure.titlesize", + "legend.fontsize", + "legend.title_fontsize", + "xtick.labelsize", + "ytick.labelsize", + ]: assert scprep.plot.utils._is_default_matplotlibrc() is True default = plt.rcParams[key] - plt.rcParams[key] = 'xx-large' + plt.rcParams[key] = "xx-large" assert scprep.plot.utils._is_default_matplotlibrc() is False plt.rcParams[key] = default assert scprep.plot.utils._is_default_matplotlibrc() is True def test_parse_fontsize(): - for key in ['axes.labelsize', - 'axes.titlesize', - 'figure.titlesize', - 'legend.fontsize', - 'legend.title_fontsize', - 'xtick.labelsize', - 'ytick.labelsize']: - assert scprep.plot.utils.parse_fontsize( - 'x-large', 'large') == 'x-large' - assert scprep.plot.utils.parse_fontsize(None, 'large') == 'large' + for key in [ + "axes.labelsize", + "axes.titlesize", + "figure.titlesize", + "legend.fontsize", + "legend.title_fontsize", + "xtick.labelsize", + "ytick.labelsize", + ]: + assert scprep.plot.utils.parse_fontsize("x-large", "large") == "x-large" + assert scprep.plot.utils.parse_fontsize(None, "large") == "large" default = plt.rcParams[key] - plt.rcParams[key] = 'xx-large' - assert scprep.plot.utils.parse_fontsize( - 'x-large', 'large') == 'x-large' - assert scprep.plot.utils.parse_fontsize(None, 'large') is None + plt.rcParams[key] = "xx-large" + assert scprep.plot.utils.parse_fontsize("x-large", "large") == "x-large" + assert scprep.plot.utils.parse_fontsize(None, "large") is None plt.rcParams[key] = default - assert scprep.plot.utils.parse_fontsize('x-large', 'large') == 'x-large' - assert scprep.plot.utils.parse_fontsize(None, 'large') == 'large' + assert scprep.plot.utils.parse_fontsize("x-large", "large") == "x-large" + assert scprep.plot.utils.parse_fontsize(None, "large") == "large" def test_generate_colorbar_str(): - cb = scprep.plot.tools.generate_colorbar(cmap='viridis') - assert cb.cmap.name == 'viridis' + cb = scprep.plot.tools.generate_colorbar(cmap="viridis") + assert cb.cmap.name == "viridis" def test_generate_colorbar_colormap(): cb = scprep.plot.tools.generate_colorbar(cmap=plt.cm.viridis) - assert cb.cmap.name == 'viridis' + assert cb.cmap.name == "viridis" def test_generate_colorbar_list(): - cb = scprep.plot.tools.generate_colorbar(cmap=['red', 'blue']) - assert cb.cmap.name == 'scprep_custom_cmap' + cb = scprep.plot.tools.generate_colorbar(cmap=["red", "blue"]) + assert cb.cmap.name == "scprep_custom_cmap" def test_generate_colorbar_dict(): - assert_raise_message(TypeError, - "unhashable type: 'dict'", - scprep.plot.tools.generate_colorbar, - cmap={'+': 
'r', '-': 'b'}) + assert_raise_message( + TypeError, + "unhashable type: 'dict'", + scprep.plot.tools.generate_colorbar, + cmap={"+": "r", "-": "b"}, + ) def test_tab30(): cmap = scprep.plot.colors.tab30() np.testing.assert_array_equal( cmap.colors[:15], - np.array(matplotlib.cm.tab20c.colors)[[0, 1, 2, 4, 5, 6, 8, 9, - 10, 12, 13, 14, 16, 17, 18]]) + np.array(matplotlib.cm.tab20c.colors)[ + [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18] + ], + ) np.testing.assert_array_equal( cmap.colors[15:], - np.array(matplotlib.cm.tab20b.colors)[[0, 1, 2, 4, 5, 6, 8, 9, - 10, 12, 13, 14, 16, 17, 18]]) + np.array(matplotlib.cm.tab20b.colors)[ + [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18] + ], + ) def test_tab40(): cmap = scprep.plot.colors.tab40() - np.testing.assert_array_equal( - cmap.colors[:20], matplotlib.cm.tab20c.colors) - np.testing.assert_array_equal( - cmap.colors[20:], matplotlib.cm.tab20b.colors) + np.testing.assert_array_equal(cmap.colors[:20], matplotlib.cm.tab20c.colors) + np.testing.assert_array_equal(cmap.colors[20:], matplotlib.cm.tab20b.colors) def test_tab10_continuous(): - cmap = scprep.plot.colors.tab10_continuous( - n_colors=10, n_step=2, reverse=True) + cmap = scprep.plot.colors.tab10_continuous(n_colors=10, n_step=2, reverse=True) np.testing.assert_allclose( cmap.colors, np.hstack([matplotlib.cm.tab20.colors, np.ones((20, 1))]), - atol=0.06) + atol=0.06, + ) def test_tab10_continuous_no_reverse(): - cmap = scprep.plot.colors.tab10_continuous( - n_colors=10, n_step=2, reverse=False) + cmap = scprep.plot.colors.tab10_continuous(n_colors=10, n_step=2, reverse=False) colors = np.array(cmap.colors) for i in range(len(colors) // 2): tmp = np.array(colors[2 * i]) colors[2 * i] = colors[2 * i + 1] colors[2 * i + 1] = tmp np.testing.assert_allclose( - colors, - np.hstack([matplotlib.cm.tab20.colors, np.ones((20, 1))]), - atol=0.06) + colors, np.hstack([matplotlib.cm.tab20.colors, np.ones((20, 1))]), atol=0.06 + ) def test_tab10_continuous_invalid_n_colors(): @@ -128,87 +132,101 @@ def test_tab10_continuous_invalid_n_colors(): ValueError, "Expected 0 < n_colors <= 10. Got 0", scprep.plot.colors.tab10_continuous, - n_colors=0) + n_colors=0, + ) assert_raise_message( ValueError, "Expected 0 < n_colors <= 10. Got 11", scprep.plot.colors.tab10_continuous, - n_colors=11) + n_colors=11, + ) assert_raise_message( ValueError, "Expected n_step >= 2. 
Got 1", scprep.plot.colors.tab10_continuous, - n_step=1) + n_step=1, + ) def test_tab_exact(): assert scprep.plot.colors.tab(1) is plt.cm.tab10 np.testing.assert_array_equal( - scprep.plot.colors.tab(10).colors, plt.cm.tab10.colors) + scprep.plot.colors.tab(10).colors, plt.cm.tab10.colors + ) np.testing.assert_array_equal( - scprep.plot.colors.tab(20).colors, plt.cm.tab20.colors) + scprep.plot.colors.tab(20).colors, plt.cm.tab20.colors + ) np.testing.assert_array_equal( - scprep.plot.colors.tab(30).colors, scprep.plot.colors.tab30().colors) + scprep.plot.colors.tab(30).colors, scprep.plot.colors.tab30().colors + ) np.testing.assert_array_equal( - scprep.plot.colors.tab(40).colors, scprep.plot.colors.tab40().colors) + scprep.plot.colors.tab(40).colors, scprep.plot.colors.tab40().colors + ) np.testing.assert_array_equal( scprep.plot.colors.tab(50).colors, - scprep.plot.colors.tab10_continuous(n_colors=10, n_step=5).colors) + scprep.plot.colors.tab10_continuous(n_colors=10, n_step=5).colors, + ) def test_tab_first10(): np.testing.assert_array_equal( - scprep.plot.colors.tab(19).colors[:10], plt.cm.tab10.colors) + scprep.plot.colors.tab(19).colors[:10], plt.cm.tab10.colors + ) np.testing.assert_array_equal( - scprep.plot.colors.tab(29).colors[:10], - scprep.plot.colors.tab30().colors[::3]) + scprep.plot.colors.tab(29).colors[:10], scprep.plot.colors.tab30().colors[::3] + ) np.testing.assert_array_equal( - scprep.plot.colors.tab(39).colors[:10], - scprep.plot.colors.tab40().colors[::4]) + scprep.plot.colors.tab(39).colors[:10], scprep.plot.colors.tab40().colors[::4] + ) np.testing.assert_array_equal( scprep.plot.colors.tab(49).colors[:10], - scprep.plot.colors.tab10_continuous( - n_colors=10, n_step=5).colors[::5]) + scprep.plot.colors.tab10_continuous(n_colors=10, n_step=5).colors[::5], + ) def test_tab_first20(): np.testing.assert_array_equal( scprep.plot.colors.tab(29).colors[10:20], - scprep.plot.colors.tab30().colors[1::3]) + scprep.plot.colors.tab30().colors[1::3], + ) np.testing.assert_array_equal( scprep.plot.colors.tab(39).colors[10:20], - scprep.plot.colors.tab40().colors[1::4]) + scprep.plot.colors.tab40().colors[1::4], + ) def test_tab_first30(): np.testing.assert_array_equal( scprep.plot.colors.tab(39).colors[20:30], - scprep.plot.colors.tab40().colors[2::4]) + scprep.plot.colors.tab40().colors[2::4], + ) def test_tab_overhang(): np.testing.assert_array_equal( - scprep.plot.colors.tab(9).colors, plt.cm.tab10.colors[:9]) + scprep.plot.colors.tab(9).colors, plt.cm.tab10.colors[:9] + ) np.testing.assert_array_equal( - scprep.plot.colors.tab(19).colors[10:], plt.cm.tab20.colors[1:-1:2]) + scprep.plot.colors.tab(19).colors[10:], plt.cm.tab20.colors[1:-1:2] + ) np.testing.assert_array_equal( scprep.plot.colors.tab(29).colors[20:], - scprep.plot.colors.tab30().colors[2:-1:3]) + scprep.plot.colors.tab30().colors[2:-1:3], + ) np.testing.assert_array_equal( scprep.plot.colors.tab(39).colors[30:], - scprep.plot.colors.tab40().colors[3:-1:4]) + scprep.plot.colors.tab40().colors[3:-1:4], + ) np.testing.assert_array_equal( scprep.plot.colors.tab(49).colors[40:], - scprep.plot.colors.tab10_continuous( - n_colors=10, n_step=5).colors[4:-1:5]) + scprep.plot.colors.tab10_continuous(n_colors=10, n_step=5).colors[4:-1:5], + ) def test_tab_invalid(): assert_raise_message( - ValueError, - "Expected n >= 1. Got 0", - scprep.plot.colors.tab, - n=0) + ValueError, "Expected n >= 1. 
Got 0", scprep.plot.colors.tab, n=0 + ) def test_is_color_array_none(): @@ -231,7 +249,6 @@ def test_symlog_bins(): class TestScatterParams(unittest.TestCase): - @classmethod def setUpClass(self): self.X = np.random.normal(0, 1, [500, 4]) @@ -248,8 +265,9 @@ def test_size(self): assert params.size == len(self.x) def test_plot_idx_shuffle(self): - params = _ScatterParams(x=self.x, y=self.y, z=self.z, c=self.c, - s=np.abs(self.x)) + params = _ScatterParams( + x=self.x, y=self.y, z=self.z, c=self.c, s=np.abs(self.x) + ) assert not np.all(params.plot_idx == np.arange(params.size)) np.testing.assert_equal(params.x, self.x[params.plot_idx]) np.testing.assert_equal(params.y, self.y[params.plot_idx]) @@ -258,9 +276,9 @@ def test_plot_idx_shuffle(self): np.testing.assert_equal(params.s, np.abs(self.x)[params.plot_idx]) def test_plot_idx_no_shuffle(self): - params = _ScatterParams(x=self.x, y=self.y, - z=self.z, c=self.c, - s=np.abs(self.x), shuffle=False) + params = _ScatterParams( + x=self.x, y=self.y, z=self.z, c=self.c, s=np.abs(self.x), shuffle=False + ) np.testing.assert_equal(params.plot_idx, np.arange(params.size)) np.testing.assert_equal(params.x, self.x) np.testing.assert_equal(params.y, self.y) @@ -269,9 +287,9 @@ def test_plot_idx_no_shuffle(self): np.testing.assert_equal(params.s, np.abs(self.x)) def test_plot_idx_mask(self): - params = _ScatterParams(x=self.x, y=self.y, - z=self.z, c=self.c, - mask=self.x > 0, shuffle=False) + params = _ScatterParams( + x=self.x, y=self.y, z=self.z, c=self.c, mask=self.x > 0, shuffle=False + ) np.testing.assert_equal(params.plot_idx, np.arange(params.size)[self.x > 0]) np.testing.assert_equal(params.x, self.x[self.x > 0]) np.testing.assert_equal(params.y, self.y[self.x > 0]) @@ -279,9 +297,10 @@ def test_plot_idx_mask(self): np.testing.assert_equal(params.c, self.c[self.x > 0]) def test_plot_idx_mask_shuffle(self): - params = _ScatterParams(x=self.x, y=self.y, - mask=self.x > 0, shuffle=True) - np.testing.assert_equal(np.sort(params.plot_idx), np.arange(params.size)[self.x > 0]) + params = _ScatterParams(x=self.x, y=self.y, mask=self.x > 0, shuffle=True) + np.testing.assert_equal( + np.sort(params.plot_idx), np.arange(params.size)[self.x > 0] + ) assert np.all(params.x > 0) def test_data_int(self): @@ -291,21 +310,20 @@ def test_data_int(self): def test_data_2d(self): params = _ScatterParams(x=self.x, y=self.y) - np.testing.assert_equal(params._data, [self.x, - self.y]) - np.testing.assert_equal(params.data, [self.x[params.plot_idx], - self.y[params.plot_idx]]) + np.testing.assert_equal(params._data, [self.x, self.y]) + np.testing.assert_equal( + params.data, [self.x[params.plot_idx], self.y[params.plot_idx]] + ) assert params.subplot_kw == {} def test_data_3d(self): params = _ScatterParams(x=self.x, y=self.y, z=self.z) - np.testing.assert_equal(params._data, [self.x, - self.y, - self.z]) - np.testing.assert_equal(params.data, [self.x[params.plot_idx], - self.y[params.plot_idx], - self.z[params.plot_idx]]) - assert params.subplot_kw == {'projection': '3d'} + np.testing.assert_equal(params._data, [self.x, self.y, self.z]) + np.testing.assert_equal( + params.data, + [self.x[params.plot_idx], self.y[params.plot_idx], self.z[params.plot_idx]], + ) + assert params.subplot_kw == {"projection": "3d"} def test_s_default(self): params = _ScatterParams(x=self.x, y=self.y) @@ -328,7 +346,7 @@ def test_c_none(self): assert params.extend is None def test_constant_c(self): - params = _ScatterParams(x=self.x, y=self.y, c='blue') + params = _ScatterParams(x=self.x, 
y=self.y, c="blue") assert params.constant_c() assert not params.array_c() assert params.discrete is None @@ -341,8 +359,7 @@ def test_constant_c(self): assert params.labels is None def test_array_c(self): - params = _ScatterParams(x=self.x, y=self.y, - c=self.array_c) + params = _ScatterParams(x=self.x, y=self.y, c=self.array_c) assert params.array_c() assert not params.constant_c() np.testing.assert_array_equal(params.x, params._x[params.plot_idx]) @@ -366,10 +383,11 @@ def test_continuous(self): np.testing.assert_array_equal(params.c, params._c[params.plot_idx]) assert params.discrete is False assert params.legend is True - assert params.cmap_scale == 'linear' + assert params.cmap_scale == "linear" assert params.cmap is plt.cm.inferno - params = _ScatterParams(x=self.x, y=self.y, discrete=False, - c=np.round(self.c % 1, 1)) + params = _ScatterParams( + x=self.x, y=self.y, discrete=False, c=np.round(self.c % 1, 1) + ) assert not params.array_c() assert not params.constant_c() np.testing.assert_array_equal(params.x, params._x[params.plot_idx]) @@ -378,18 +396,16 @@ def test_continuous(self): assert params.discrete is False assert params.legend is True assert params.labels is None - assert params.cmap_scale == 'linear' + assert params.cmap_scale == "linear" assert params.cmap is plt.cm.inferno def test_discrete_tab10(self): - params = _ScatterParams(x=self.x, y=self.y, - c=np.where(self.c > 0, '+', '-')) + params = _ScatterParams(x=self.x, y=self.y, c=np.where(self.c > 0, "+", "-")) assert not params.array_c() assert not params.constant_c() np.testing.assert_array_equal(params.x, params._x[params.plot_idx]) np.testing.assert_array_equal(params.y, params._y[params.plot_idx]) - np.testing.assert_array_equal( - params.c, params.c_discrete[params.plot_idx]) + np.testing.assert_array_equal(params.c, params.c_discrete[params.plot_idx]) assert params.discrete is True assert params.legend is True assert params.vmin is None @@ -398,8 +414,7 @@ def test_discrete_tab10(self): np.testing.assert_equal(params.cmap.colors, plt.cm.tab10.colors[:2]) def test_discrete_tab20(self): - params = _ScatterParams(x=self.x, y=self.y, - c=10 * np.round(self.c % 1, 1)) + params = _ScatterParams(x=self.x, y=self.y, c=10 * np.round(self.c % 1, 1)) assert not params.array_c() assert not params.constant_c() assert params.discrete is True @@ -409,40 +424,48 @@ def test_discrete_tab20(self): assert params.cmap_scale is None assert params.extend is None assert isinstance(params.cmap, matplotlib.colors.ListedColormap) - np.testing.assert_equal( - params.cmap.colors[:10], - plt.cm.tab10.colors) + np.testing.assert_equal(params.cmap.colors[:10], plt.cm.tab10.colors) np.testing.assert_equal( params.cmap.colors[10:], - plt.cm.tab20.colors[1:1 + (len(params.cmap.colors) - 10) * 2:2]) + plt.cm.tab20.colors[1 : 1 + (len(params.cmap.colors) - 10) * 2 : 2], + ) def test_continuous_less_than_20(self): - params = _ScatterParams(x=self.x, y=self.y, - c=np.round(self.c % 1, 1)) + params = _ScatterParams(x=self.x, y=self.y, c=np.round(self.c % 1, 1)) assert not params.array_c() assert not params.constant_c() assert params.discrete is False assert params.legend is True assert params.vmin == 0 assert params.vmax == 1 - assert params.cmap_scale == 'linear' - assert params.extend == 'neither' + assert params.cmap_scale == "linear" + assert params.extend == "neither" assert params.cmap is matplotlib.cm.inferno def test_continuous_tab20_str(self): - params = _ScatterParams(x=self.x, y=self.y, discrete=False, - cmap='tab20', c=np.round(self.c % 
1, 1)) + params = _ScatterParams( + x=self.x, y=self.y, discrete=False, cmap="tab20", c=np.round(self.c % 1, 1) + ) assert params.cmap is plt.cm.tab20 def test_continuous_tab20_obj(self): - params = _ScatterParams(x=self.x, y=self.y, discrete=False, - cmap=plt.get_cmap('tab20'), c=np.round(self.c % 1, 1)) + params = _ScatterParams( + x=self.x, + y=self.y, + discrete=False, + cmap=plt.get_cmap("tab20"), + c=np.round(self.c % 1, 1), + ) assert params.cmap is plt.cm.tab20 def test_discrete_dark2(self): - params = _ScatterParams(x=self.x, y=self.y, discrete=True, - cmap='Dark2', - c=np.where(self.c > 0, '+', '-')) + params = _ScatterParams( + x=self.x, + y=self.y, + discrete=True, + cmap="Dark2", + c=np.where(self.c > 0, "+", "-"), + ) assert not params.array_c() assert not params.constant_c() assert params.discrete is True @@ -455,10 +478,10 @@ def test_discrete_dark2(self): np.testing.assert_equal(params.cmap.colors, plt.cm.Dark2.colors[:2]) def test_c_discrete(self): - c = np.where(self.c > 0, 'a', 'b') + c = np.where(self.c > 0, "a", "b") params = _ScatterParams(x=self.x, y=self.y, c=c) - np.testing.assert_equal(params.c_discrete, np.where(c == 'a', 0, 1)) - np.testing.assert_equal(params.labels, ['a', 'b']) + np.testing.assert_equal(params.c_discrete, np.where(c == "a", 0, 1)) + np.testing.assert_equal(params.labels, ["a", "b"]) def test_legend(self): params = _ScatterParams(x=self.x, y=self.y, c=self.c, legend=False) @@ -483,108 +506,122 @@ def test_vmax_default(self): assert params.vmax == np.max(self.c) def test_list_cmap(self): - params = _ScatterParams(x=self.x, y=self.y, c=self.c, - cmap=['red', 'black']) + params = _ScatterParams(x=self.x, y=self.y, c=self.c, cmap=["red", "black"]) assert params.list_cmap() - np.testing.assert_equal(params.cmap([0, 255]), - [[1, 0, 0, 1], [0, 0, 0, 1]]) + np.testing.assert_equal(params.cmap([0, 255]), [[1, 0, 0, 1], [0, 0, 0, 1]]) def test_dict_cmap_fwd(self): - params = _ScatterParams(x=self.x, y=self.y, - c=np.where(self.c > 0, '+', '-'), - cmap={'+': 'k', '-': 'r'}) + params = _ScatterParams( + x=self.x, + y=self.y, + c=np.where(self.c > 0, "+", "-"), + cmap={"+": "k", "-": "r"}, + ) assert not params.list_cmap() if sys.version_info[1] > 5: - np.testing.assert_equal(params.cmap.colors, - [[0, 0, 0, 1], [1, 0, 0, 1]]) - assert np.all(params._labels == np.array(['+', '-'])) + np.testing.assert_equal(params.cmap.colors, [[0, 0, 0, 1], [1, 0, 0, 1]]) + assert np.all(params._labels == np.array(["+", "-"])) else: try: - np.testing.assert_equal(params.cmap.colors, - [[0, 0, 0, 1], [1, 0, 0, 1]]) - assert np.all(params._labels == np.array(['+', '-'])) + np.testing.assert_equal( + params.cmap.colors, [[0, 0, 0, 1], [1, 0, 0, 1]] + ) + assert np.all(params._labels == np.array(["+", "-"])) except AssertionError: - np.testing.assert_equal(params.cmap.colors, - [[1, 0, 0, 1], [0, 0, 0, 1]]) - assert np.all(params._labels == np.array(['-', '+'])) + np.testing.assert_equal( + params.cmap.colors, [[1, 0, 0, 1], [0, 0, 0, 1]] + ) + assert np.all(params._labels == np.array(["-", "+"])) def test_dict_cmap_rev(self): - params = _ScatterParams(x=self.x, y=self.y, - c=np.where(self.c > 0, '+', '-'), - cmap={'-': 'k', '+': 'r'}) + params = _ScatterParams( + x=self.x, + y=self.y, + c=np.where(self.c > 0, "+", "-"), + cmap={"-": "k", "+": "r"}, + ) if sys.version_info[1] > 5: - np.testing.assert_equal(params.cmap.colors, - [[0, 0, 0, 1], [1, 0, 0, 1]]) - assert np.all(params._labels == np.array(['-', '+'])) + np.testing.assert_equal(params.cmap.colors, [[0, 0, 0, 1], 
[1, 0, 0, 1]]) + assert np.all(params._labels == np.array(["-", "+"])) else: try: - np.testing.assert_equal(params.cmap.colors, - [[0, 0, 0, 1], [1, 0, 0, 1]]) - assert np.all(params._labels == np.array(['-', '+'])) + np.testing.assert_equal( + params.cmap.colors, [[0, 0, 0, 1], [1, 0, 0, 1]] + ) + assert np.all(params._labels == np.array(["-", "+"])) except AssertionError: - np.testing.assert_equal(params.cmap.colors, - [[1, 0, 0, 1], [0, 0, 0, 1]]) - assert np.all(params._labels == np.array(['+', '-'])) + np.testing.assert_equal( + params.cmap.colors, [[1, 0, 0, 1], [0, 0, 0, 1]] + ) + assert np.all(params._labels == np.array(["+", "-"])) def test_dict_cmap_constant(self): - params = _ScatterParams(x=self.x, y=self.y, - c=np.full_like(self.c, '+', dtype=str), - cmap={'-': 'k', '+': 'r'}) - np.testing.assert_equal(params.cmap.colors, - [[1, 0, 0, 1]]) - assert np.all(params._labels == np.array(['+'])) + params = _ScatterParams( + x=self.x, + y=self.y, + c=np.full_like(self.c, "+", dtype=str), + cmap={"-": "k", "+": "r"}, + ) + np.testing.assert_equal(params.cmap.colors, [[1, 0, 0, 1]]) + assert np.all(params._labels == np.array(["+"])) def test_cmap_given(self): - params = _ScatterParams(x=self.x, y=self.y, c=self.c, cmap='viridis') + params = _ScatterParams(x=self.x, y=self.y, c=self.c, cmap="viridis") assert params.cmap is matplotlib.cm.viridis assert not params.list_cmap() def test_cmap_scale_symlog(self): - params = _ScatterParams(x=self.x, y=self.y, c=self.c, - cmap_scale='symlog') - assert params.cmap_scale == 'symlog' + params = _ScatterParams(x=self.x, y=self.y, c=self.c, cmap_scale="symlog") + assert params.cmap_scale == "symlog" assert isinstance(params.norm, matplotlib.colors.SymLogNorm) def test_cmap_scale_log(self): - params = _ScatterParams(x=self.x, y=self.y, c=np.abs(self.c) + 1, - cmap_scale='log') - assert params.cmap_scale == 'log' + params = _ScatterParams( + x=self.x, y=self.y, c=np.abs(self.c) + 1, cmap_scale="log" + ) + assert params.cmap_scale == "log" assert isinstance(params.norm, matplotlib.colors.LogNorm) def test_cmap_scale_sqrt(self): - params = _ScatterParams(x=self.x, y=self.y, c=self.c, - cmap_scale='sqrt') - assert params.cmap_scale == 'sqrt' + params = _ScatterParams(x=self.x, y=self.y, c=self.c, cmap_scale="sqrt") + assert params.cmap_scale == "sqrt" assert isinstance(params.norm, matplotlib.colors.PowerNorm) assert params.norm.gamma == 0.5 def test_extend(self): - params = _ScatterParams(x=self.x, y=self.y, c=self.c, - vmin=np.mean(self.c)) - assert params.extend == 'min' - params = _ScatterParams(x=self.x, y=self.y, c=self.c, - vmax=np.mean(self.c)) - assert params.extend == 'max' - params = _ScatterParams(x=self.x, y=self.y, c=self.c, - vmin=(np.min(self.c) + np.mean(self.c)) / 2, - vmax=(np.max(self.c) + np.mean(self.c)) / 2) - assert params.extend == 'both' + params = _ScatterParams(x=self.x, y=self.y, c=self.c, vmin=np.mean(self.c)) + assert params.extend == "min" + params = _ScatterParams(x=self.x, y=self.y, c=self.c, vmax=np.mean(self.c)) + assert params.extend == "max" + params = _ScatterParams( + x=self.x, + y=self.y, + c=self.c, + vmin=(np.min(self.c) + np.mean(self.c)) / 2, + vmax=(np.max(self.c) + np.mean(self.c)) / 2, + ) + assert params.extend == "both" params = _ScatterParams(x=self.x, y=self.y, c=self.c) - assert params.extend == 'neither' + assert params.extend == "neither" def test_check_vmin_vmax(self): assert_warns_message( UserWarning, "Cannot set `vmin` or `vmax` with constant `c=None`. 
" "Setting `vmin = vmax = None`.", - _ScatterParams, x=self.x, y=self.y, vmin=0 + _ScatterParams, + x=self.x, + y=self.y, + vmin=0, ) assert_warns_message( UserWarning, - "Cannot set `vmin` or `vmax` with discrete data. " - "Setting to `None`.", - _ScatterParams, x=self.x, y=self.y, - c=np.where(self.c > 0, '+', '-'), vmin=0 + "Cannot set `vmin` or `vmax` with discrete data. " "Setting to `None`.", + _ScatterParams, + x=self.x, + y=self.y, + c=np.where(self.c > 0, "+", "-"), + vmin=0, ) def test_check_legend(self): @@ -592,155 +629,199 @@ def test_check_legend(self): ValueError, "Received conflicting values for synonyms " "`legend=True` and `colorbar=False`", - _ScatterParams, x=self.x, y=self.y, - legend=True, colorbar=False + _ScatterParams, + x=self.x, + y=self.y, + legend=True, + colorbar=False, ) assert_warns_message( UserWarning, "`c` is a color array and cannot be used to create a " "legend. To interpret these values as labels instead, " "provide a `cmap` dictionary with label-color pairs.", - _ScatterParams, x=self.x, y=self.y, - c=self.array_c, legend=True + _ScatterParams, + x=self.x, + y=self.y, + c=self.array_c, + legend=True, ) assert_warns_message( UserWarning, "Cannot create a legend with constant `c=None`", - _ScatterParams, x=self.x, y=self.y, - c=None, legend=True + _ScatterParams, + x=self.x, + y=self.y, + c=None, + legend=True, ) def test_check_size(self): assert_raise_message( ValueError, - "Expected all axes of data to have the same length" - ". Got [500, 100]", - - _ScatterParams, x=self.x, y=self.y[:100] + "Expected all axes of data to have the same length" ". Got [500, 100]", + _ScatterParams, + x=self.x, + y=self.y[:100], ) assert_raise_message( ValueError, - "Expected all axes of data to have the same length" - ". Got [500, 500, 100]", - _ScatterParams, x=self.x, y=self.y, z=self.z[:100] + "Expected all axes of data to have the same length" ". Got [500, 500, 100]", + _ScatterParams, + x=self.x, + y=self.y, + z=self.z[:100], ) def test_check_c(self): assert_raise_message( ValueError, "Expected c of length 500 or 1. Got 100", - _ScatterParams, x=self.x, y=self.y, c=self.c[:100] + _ScatterParams, + x=self.x, + y=self.y, + c=self.c[:100], ) def test_check_discrete(self): assert_raise_message( ValueError, "Cannot treat non-numeric data as continuous.", - _ScatterParams, x=self.x, y=self.y, - c=np.where(self.c > 0, '+', '-'), discrete=False + _ScatterParams, + x=self.x, + y=self.y, + c=np.where(self.c > 0, "+", "-"), + discrete=False, ) def test_check_cmap(self): - assert_raise_message(ValueError, - "Expected list-like `c` with dictionary cmap." - " Got ", - _ScatterParams, x=self.x, y=self.y, - c='black', - cmap={'+': 'k', '-': 'r'}) assert_raise_message( ValueError, - "Cannot use dictionary cmap with " - "continuous data.", - _ScatterParams, x=self.x, y=self.y, - c=self.c, discrete=False, - cmap={'+': 'k', '-': 'r'}) + "Expected list-like `c` with dictionary cmap." " Got ", + _ScatterParams, + x=self.x, + y=self.y, + c="black", + cmap={"+": "k", "-": "r"}, + ) + assert_raise_message( + ValueError, + "Cannot use dictionary cmap with " "continuous data.", + _ScatterParams, + x=self.x, + y=self.y, + c=self.c, + discrete=False, + cmap={"+": "k", "-": "r"}, + ) assert_raise_message( ValueError, "Dictionary cmap requires a color " "for every unique entry in `c`. 
" "Missing colors for [+]", - _ScatterParams, x=self.x, y=self.y, - c=np.where(self.c > 0, '+', '-'), - cmap={'-': 'r'}) + _ScatterParams, + x=self.x, + y=self.y, + c=np.where(self.c > 0, "+", "-"), + cmap={"-": "r"}, + ) assert_raise_message( ValueError, - "Expected list-like `c` with list cmap. " - "Got ", - _ScatterParams, x=self.x, y=self.y, - c='black', - cmap=['k', 'r']) + "Expected list-like `c` with list cmap. " "Got ", + _ScatterParams, + x=self.x, + y=self.y, + c="black", + cmap=["k", "r"], + ) def test_check_cmap_scale(self): assert_warns_message( UserWarning, - "Cannot use non-linear `cmap_scale` with " - "`c` as a color array.", - _ScatterParams, x=self.x, y=self.y, - c=self.array_c, cmap_scale='log' + "Cannot use non-linear `cmap_scale` with " "`c` as a color array.", + _ScatterParams, + x=self.x, + y=self.y, + c=self.array_c, + cmap_scale="log", ) assert_warns_message( UserWarning, - "Cannot use non-linear `cmap_scale` with constant " - "`c=black`.", - _ScatterParams, x=self.x, y=self.y, - c='black', cmap_scale='log' + "Cannot use non-linear `cmap_scale` with constant " "`c=black`.", + _ScatterParams, + x=self.x, + y=self.y, + c="black", + cmap_scale="log", ) assert_warns_message( UserWarning, "Cannot use non-linear `cmap_scale` with discrete data.", - _ScatterParams, x=self.x, y=self.y, - cmap_scale='log', - c=np.where(self.c > 0, '+', '-'), + _ScatterParams, + x=self.x, + y=self.y, + cmap_scale="log", + c=np.where(self.c > 0, "+", "-"), ) def test_series_labels(self): - params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, c=self.c) - assert params.xlabel == 'x' + params = _ScatterParams(x=pd.Series(self.x, name="x"), y=self.y, c=self.c) + assert params.xlabel == "x" assert params.ylabel is None assert params.zlabel is None - params = _ScatterParams(x=self.x, y=pd.Series(self.y, name='y'), c=self.c) + params = _ScatterParams(x=self.x, y=pd.Series(self.y, name="y"), c=self.c) assert params.xlabel is None - assert params.ylabel == 'y' + assert params.ylabel == "y" assert params.zlabel is None - params = _ScatterParams(x=self.x, y=self.y, z=pd.Series(self.y, name='z'), c=self.c) + params = _ScatterParams( + x=self.x, y=self.y, z=pd.Series(self.y, name="z"), c=self.c + ) assert params.xlabel is None assert params.ylabel is None - assert params.zlabel == 'z' + assert params.zlabel == "z" # xlabel overrides series - params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, c=self.c, - xlabel='y') - assert params.xlabel == 'y' + params = _ScatterParams( + x=pd.Series(self.x, name="x"), y=self.y, c=self.c, xlabel="y" + ) + assert params.xlabel == "y" assert params.ylabel is None assert params.zlabel is None # label_prefix overrides series - params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, c=self.c, - label_prefix='y') - assert params.xlabel == 'y1' - assert params.ylabel == 'y2' + params = _ScatterParams( + x=pd.Series(self.x, name="x"), y=self.y, c=self.c, label_prefix="y" + ) + assert params.xlabel == "y1" + assert params.ylabel == "y2" assert params.zlabel is None # xlabel overrides label_prefix - params = _ScatterParams(x=pd.Series(self.x, name='x'), y=self.y, z=self.y, c=self.c, - label_prefix='y', xlabel='test') - assert params.xlabel == 'test' - assert params.ylabel == 'y2' - assert params.zlabel == 'y3' + params = _ScatterParams( + x=pd.Series(self.x, name="x"), + y=self.y, + z=self.y, + c=self.c, + label_prefix="y", + xlabel="test", + ) + assert params.xlabel == "test" + assert params.ylabel == "y2" + assert params.zlabel == "y3" def 
test_jitter_x(self): - params = _JitterParams(x=np.where(self.x > 0, '+', '-'), y=self.y) - np.testing.assert_array_equal(params.x_labels, ['+', '-']) + params = _JitterParams(x=np.where(self.x > 0, "+", "-"), y=self.y) + np.testing.assert_array_equal(params.x_labels, ["+", "-"]) np.testing.assert_array_equal( - params.x_coords, np.where(self.x > 0, 0, 1)[params.plot_idx]) + params.x_coords, np.where(self.x > 0, 0, 1)[params.plot_idx] + ) class Test10X(unittest.TestCase): - @classmethod def setUpClass(self): self.X = data.load_10X(sparse=False) self.X_filt = scprep.filter.filter_empty_cells(self.X) - self.X_pca, self.S = scprep.reduce.pca(scprep.utils.toarray(self.X), - n_components=10, - return_singular_values=True) + self.X_pca, self.S = scprep.reduce.pca( + scprep.utils.toarray(self.X), n_components=10, return_singular_values=True + ) @classmethod def tearDownClass(self): @@ -755,12 +836,13 @@ def tearDownClass(self): try_remove("test_scree.png") def tearDown(self): - plt.close('all') + plt.close("all") def test_histogram(self): scprep.plot.plot_library_size(self.X_filt, cutoff=1000, log=True) - scprep.plot.plot_library_size(self.X_filt, cutoff=1000, log=True, - xlabel="x label", ylabel="y label") + scprep.plot.plot_library_size( + self.X_filt, cutoff=1000, log=True, xlabel="x label", ylabel="y label" + ) def test_histogram_list_of_lists(self): scprep.plot.plot_library_size(scprep.utils.toarray(self.X_filt).tolist()) @@ -769,9 +851,10 @@ def test_histogram_array(self): scprep.plot.plot_library_size(scprep.utils.toarray(self.X_filt)) def test_histogram_multiple(self): - scprep.plot.histogram([scprep.select.select_rows(self.X, idx=0), - [1, 2, 2, 2, 3]], - color=['r', 'b']) + scprep.plot.histogram( + [scprep.select.select_rows(self.X, idx=0), [1, 2, 2, 2, 3]], + color=["r", "b"], + ) def test_histogram_multiple_cutoff(self): scprep.plot.plot_library_size(self.X_filt, cutoff=[500, 1000], log=True) @@ -780,100 +863,116 @@ def test_histogram_multiple_percentile(self): scprep.plot.plot_library_size(self.X_filt, percentile=[10, 90], log=True) def test_histogram_log_negative_min(self): - scprep.plot.histogram([-1, 1, 1, 1], log='x') + scprep.plot.histogram([-1, 1, 1, 1], log="x") scprep.plot.histogram([-1, 1, 1, 1], log=True) - scprep.plot.histogram([-1, -0.1, -0.1, 1], log='x') + scprep.plot.histogram([-1, -0.1, -0.1, 1], log="x") scprep.plot.histogram([-1, -0.1, -0.1, 1], log=True) def test_histogram_log_negative_max(self): - scprep.plot.histogram([-1, -1, -1, -1], log='x') + scprep.plot.histogram([-1, -1, -1, -1], log="x") scprep.plot.histogram([-1, -1, -1, -1], log=True) - scprep.plot.histogram([-1, -1, -1, -2], log='x') + scprep.plot.histogram([-1, -1, -1, -2], log="x") scprep.plot.histogram([-1, -1, -1, -2], log=True) def test_histogram_log_zero_min(self): - scprep.plot.histogram([0, 1, 1, 1], log='x') + scprep.plot.histogram([0, 1, 1, 1], log="x") scprep.plot.histogram([0, 1, 1, 1], log=True) - scprep.plot.histogram([0, 0, -0.1, 1], log='x') + scprep.plot.histogram([0, 0, -0.1, 1], log="x") scprep.plot.histogram([0, 0, -0.1, 1], log=True) def test_histogram_log_zero_max(self): - scprep.plot.histogram([-1, -1, 0, -1], log='x') + scprep.plot.histogram([-1, -1, 0, -1], log="x") scprep.plot.histogram([-1, -1, 0, -1], log=True) - scprep.plot.histogram([-1, -1, 0, -2], log='x') + scprep.plot.histogram([-1, -1, 0, -2], log="x") scprep.plot.histogram([-1, -1, 0, -2], log=True) def test_plot_library_size_multiple(self): - scprep.plot.plot_library_size([ - self.X_filt, scprep.select.select_rows( - 
self.X_filt, idx=np.arange(self.X_filt.shape[0] // 2))], - color=['r', 'b'], - filename="test_library_size.png") + scprep.plot.plot_library_size( + [ + self.X_filt, + scprep.select.select_rows( + self.X_filt, idx=np.arange(self.X_filt.shape[0] // 2) + ), + ], + color=["r", "b"], + filename="test_library_size.png", + ) assert os.path.exists("test_library_size.png") def test_plot_gene_set_expression_multiple(self): - scprep.plot.plot_gene_set_expression([ - self.X, scprep.select.select_rows( - self.X, idx=np.arange(self.X.shape[0] // 2))], + scprep.plot.plot_gene_set_expression( + [ + self.X, + scprep.select.select_rows(self.X, idx=np.arange(self.X.shape[0] // 2)), + ], starts_with="D", - color=['r', 'b']) + color=["r", "b"], + ) def test_gene_set_expression_list_of_lists(self): scprep.plot.plot_gene_set_expression( - scprep.utils.toarray(self.X).tolist(), genes=[0, 1]) + scprep.utils.toarray(self.X).tolist(), genes=[0, 1] + ) def test_gene_set_expression_array(self): - scprep.plot.plot_gene_set_expression(scprep.utils.toarray(self.X), - genes=[0, 1]) + scprep.plot.plot_gene_set_expression(scprep.utils.toarray(self.X), genes=[0, 1]) def test_plot_gene_set_expression_single_gene(self): scprep.plot.plot_gene_set_expression( - self.X, color=["red"], - genes="Arl8b", - filename="test_gene_expression.png") + self.X, color=["red"], genes="Arl8b", filename="test_gene_expression.png" + ) assert os.path.exists("test_gene_expression.png") def test_plot_variable_genes(self): - scprep.plot.plot_gene_variability( - self.X, - filename="test_variable_genes.png") + scprep.plot.plot_gene_variability(self.X, filename="test_variable_genes.png") assert os.path.exists("test_variable_genes.png") def test_variable_genes_list_of_lists(self): - scprep.plot.plot_gene_variability( - scprep.utils.toarray(self.X).tolist()) + scprep.plot.plot_gene_variability(scprep.utils.toarray(self.X).tolist()) def test_histogram_single_gene_dataframe(self): scprep.plot.histogram( - scprep.select.select_cols(self.X, idx=['Arl8b']), - color=["red"]) + scprep.select.select_cols(self.X, idx=["Arl8b"]), color=["red"] + ) def test_histogram_single_gene_series(self): scprep.plot.histogram( - scprep.select.select_cols(self.X, idx='Arl8b'), - color=["red"]) + scprep.select.select_cols(self.X, idx="Arl8b"), color=["red"] + ) def test_histogram_custom_axis(self): fig, ax = plt.subplots() scprep.plot.plot_gene_set_expression( - self.X, genes=scprep.select.get_gene_set(self.X, starts_with="D"), - percentile=90, log='y', ax=ax, title="histogram", - filename="test_histogram.png") + self.X, + genes=scprep.select.get_gene_set(self.X, starts_with="D"), + percentile=90, + log="y", + ax=ax, + title="histogram", + filename="test_histogram.png", + ) assert os.path.exists("test_histogram.png") - assert ax.get_title() == 'histogram' + assert ax.get_title() == "histogram" def test_histogram_invalid_axis(self): assert_raise_message( TypeError, "Expected ax as a matplotlib.axes.Axes. 
Got ", scprep.plot.plot_library_size, - self.X, ax="invalid") + self.X, + ax="invalid", + ) def test_scree(self): ax = scprep.plot.scree_plot(self.S) assert all([t == int(t) for t in ax.get_xticks()]), ax.get_xticks() - ax = scprep.plot.scree_plot(self.S, cumulative=True, - xlabel="x label", ylabel="y label", filename="test_scree.png") + ax = scprep.plot.scree_plot( + self.S, + cumulative=True, + xlabel="x label", + ylabel="y label", + filename="test_scree.png", + ) assert all([t == int(t) for t in ax.get_xticks()]), ax.get_xticks() assert os.path.isfile("test_scree.png") @@ -887,110 +986,143 @@ def test_scree_invalid_axis(self): TypeError, "Expected ax as a matplotlib.axes.Axes. Got ", scprep.plot.scree_plot, - self.S, ax="invalid") + self.S, + ax="invalid", + ) def test_scatter_continuous(self): - scprep.plot.scatter2d(self.X_pca, c=self.X_pca[:, 0], - legend_title="test", title="title test") + scprep.plot.scatter2d( + self.X_pca, c=self.X_pca[:, 0], legend_title="test", title="title test" + ) def test_scatter_discrete(self): - ax = scprep.plot.scatter2d(self.X_pca, c=np.random.choice( - ['hello', 'world'], self.X_pca.shape[0], replace=True), - legend_title="test", legend_loc='center left', - legend_anchor=(1.02, 0.5)) - assert ax.get_legend().get_title().get_text() == 'test' + ax = scprep.plot.scatter2d( + self.X_pca, + c=np.random.choice(["hello", "world"], self.X_pca.shape[0], replace=True), + legend_title="test", + legend_loc="center left", + legend_anchor=(1.02, 0.5), + ) + assert ax.get_legend().get_title().get_text() == "test" def test_jitter_discrete(self): - ax = scprep.plot.jitter(np.where(self.X_pca[:, 0] > 0, '+', '-'), - self.X_pca[:, 1], c=np.random.choice( - ['hello', 'world'], self.X_pca.shape[0], replace=True), - legend_title="test", title="jitter", filename="test_jitter.png") + ax = scprep.plot.jitter( + np.where(self.X_pca[:, 0] > 0, "+", "-"), + self.X_pca[:, 1], + c=np.random.choice(["hello", "world"], self.X_pca.shape[0], replace=True), + legend_title="test", + title="jitter", + filename="test_jitter.png", + ) assert os.path.exists("test_jitter.png") - assert ax.get_legend().get_title().get_text() == 'test' - assert ax.get_title() == 'jitter' + assert ax.get_legend().get_title().get_text() == "test" + assert ax.get_title() == "jitter" assert ax.get_xlim() == (-0.5, 1.5) - assert [t.get_text() for t in ax.get_xticklabels()] == ['+', '-'] + assert [t.get_text() for t in ax.get_xticklabels()] == ["+", "-"] def test_jitter_continuous(self): - ax = scprep.plot.jitter(np.where(self.X_pca[:, 0] > 0, '+', '-'), - self.X_pca[:, 1], c=self.X_pca[:, 1], - title="jitter", legend_title="test") - assert ax.get_figure().get_axes()[1].get_ylabel() == 'test' - assert ax.get_title() == 'jitter' + ax = scprep.plot.jitter( + np.where(self.X_pca[:, 0] > 0, "+", "-"), + self.X_pca[:, 1], + c=self.X_pca[:, 1], + title="jitter", + legend_title="test", + ) + assert ax.get_figure().get_axes()[1].get_ylabel() == "test" + assert ax.get_title() == "jitter" assert ax.get_xlim() == (-0.5, 1.5) - assert [t.get_text() for t in ax.get_xticklabels()] == ['+', '-'] + assert [t.get_text() for t in ax.get_xticklabels()] == ["+", "-"] def test_jitter_axis_labels(self): - ax = scprep.plot.jitter(np.where(self.X_pca[:, 0] > 0, '+', '-'), - self.X_pca[:, 1], - xlabel="test") + ax = scprep.plot.jitter( + np.where(self.X_pca[:, 0] > 0, "+", "-"), self.X_pca[:, 1], xlabel="test" + ) assert ax.get_xlabel() == "test" - assert ax.get_ylabel() == '' + assert ax.get_ylabel() == "" ax = scprep.plot.jitter( - 
pd.Series(np.where(self.X_pca[:, 0] > 0, '+', '-'), name='x'), - pd.Series(self.X_pca[:, 1], name='y'), ylabel="override") + pd.Series(np.where(self.X_pca[:, 0] > 0, "+", "-"), name="x"), + pd.Series(self.X_pca[:, 1], name="y"), + ylabel="override", + ) assert ax.get_xlabel() == "x" assert ax.get_ylabel() == "override" def test_scatter_dict(self): - scprep.plot.scatter2d(self.X_pca, c=np.random.choice( - ['hello', 'world'], self.X_pca.shape[0], replace=True), - cmap={'hello': 'red', 'world': 'green'}) + scprep.plot.scatter2d( + self.X_pca, + c=np.random.choice(["hello", "world"], self.X_pca.shape[0], replace=True), + cmap={"hello": "red", "world": "green"}, + ) def test_scatter_dict_c_none(self): assert_raise_message( ValueError, "Expected list-like `c` with dictionary cmap. Got ", - scprep.plot.scatter2d, self.X_pca, c=None, - cmap={'hello': 'red', 'world': 'green'}) + scprep.plot.scatter2d, + self.X_pca, + c=None, + cmap={"hello": "red", "world": "green"}, + ) def test_scatter_dict_continuous(self): assert_raise_message( ValueError, "Cannot use dictionary cmap with continuous data", - scprep.plot.scatter2d, self.X_pca, c=self.X_pca[:, 0], - discrete=False, cmap={'hello': 'red', 'world': 'green'}) + scprep.plot.scatter2d, + self.X_pca, + c=self.X_pca[:, 0], + discrete=False, + cmap={"hello": "red", "world": "green"}, + ) def test_scatter_dict_missing(self): assert_raise_message( ValueError, "Dictionary cmap requires a color for every unique entry in `c`. " "Missing colors for [world]", - scprep.plot.scatter2d, self.X_pca, c=np.random.choice( - ['hello', 'world'], self.X_pca.shape[0], replace=True), - cmap={'hello': 'red'}) + scprep.plot.scatter2d, + self.X_pca, + c=np.random.choice(["hello", "world"], self.X_pca.shape[0], replace=True), + cmap={"hello": "red"}, + ) def test_scatter_list_discrete(self): - scprep.plot.scatter2d(self.X_pca, c=np.random.choice( - ['hello', 'world'], self.X_pca.shape[0], replace=True), - cmap=['red', 'green']) + scprep.plot.scatter2d( + self.X_pca, + c=np.random.choice(["hello", "world"], self.X_pca.shape[0], replace=True), + cmap=["red", "green"], + ) def test_scatter_list_discrete_missing(self): - scprep.plot.scatter2d(self.X_pca, c=np.random.choice( - ['hello', 'great', 'world'], self.X_pca.shape[0], replace=True), - cmap=['red', 'green']) + scprep.plot.scatter2d( + self.X_pca, + c=np.random.choice( + ["hello", "great", "world"], self.X_pca.shape[0], replace=True + ), + cmap=["red", "green"], + ) def test_scatter_list_continuous(self): - scprep.plot.scatter2d(self.X_pca, c=self.X_pca[:, 0], - cmap=['red', 'green']) + scprep.plot.scatter2d(self.X_pca, c=self.X_pca[:, 0], cmap=["red", "green"]) def test_scatter_list_single(self): - scprep.plot.scatter2d(self.X_pca, c=self.X_pca[:, 0], - cmap=['red']) + scprep.plot.scatter2d(self.X_pca, c=self.X_pca[:, 0], cmap=["red"]) def test_scatter_list_c_none(self): assert_raise_message( ValueError, "Expected list-like `c` with list cmap. 
Got ", - scprep.plot.scatter2d, self.X_pca, c=None, - cmap=['red', 'green']) + scprep.plot.scatter2d, + self.X_pca, + c=None, + cmap=["red", "green"], + ) def test_scatter_discrete_greater_than_10(self): - scprep.plot.scatter2d( - self.X_pca, c=np.arange(self.X_pca.shape[0]) % 11) + scprep.plot.scatter2d(self.X_pca, c=np.arange(self.X_pca.shape[0]) % 11) def test_scatter_solid(self): - scprep.plot.scatter3d(self.X_pca, c='green') + scprep.plot.scatter3d(self.X_pca, c="green") def test_scatter_none(self): scprep.plot.scatter2d(self.X_pca, c=None) @@ -1001,7 +1133,7 @@ def test_scatter_no_ticks(self): def test_scatter_no_ticklabels(self): ax = scprep.plot.scatter3d(self.X_pca, zticklabels=False) - assert np.all([lab.get_text() == '' for lab in ax.get_zticklabels()]) + assert np.all([lab.get_text() == "" for lab in ax.get_zticklabels()]) def test_scatter_custom_ticks(self): ax = scprep.plot.scatter2d(self.X_pca, xticks=[0, 1, 2]) @@ -1010,50 +1142,49 @@ def test_scatter_custom_ticks(self): assert np.all(ax.get_zticks() == np.array([])) def test_scatter_custom_ticklabels(self): - ax = scprep.plot.scatter2d(self.X_pca, xticks=[0, 1, 2], - xticklabels=['a', 'b', 'c']) + ax = scprep.plot.scatter2d( + self.X_pca, xticks=[0, 1, 2], xticklabels=["a", "b", "c"] + ) assert np.all(ax.get_xticks() == np.array([0, 1, 2])) - xticklabels = np.array([lab.get_text() - for lab in ax.get_xticklabels()]) - assert np.all(xticklabels == np.array(['a', 'b', 'c'])) + xticklabels = np.array([lab.get_text() for lab in ax.get_xticklabels()]) + assert np.all(xticklabels == np.array(["a", "b", "c"])) def test_scatter_axis_labels(self): - ax = scprep.plot.scatter2d( - self.X_pca.tolist(), label_prefix="test") + ax = scprep.plot.scatter2d(self.X_pca.tolist(), label_prefix="test") assert ax.get_xlabel() == "test1" assert ax.get_ylabel() == "test2" - ax = scprep.plot.scatter3d( - self.X_pca.tolist(), label_prefix="test") + ax = scprep.plot.scatter3d(self.X_pca.tolist(), label_prefix="test") assert ax.get_xlabel() == "test1" assert ax.get_ylabel() == "test2" assert ax.get_zlabel() == "test3" - ax = scprep.plot.scatter2d( - self.X_pca, label_prefix="test", xlabel="override") + ax = scprep.plot.scatter2d(self.X_pca, label_prefix="test", xlabel="override") assert ax.get_xlabel() == "override" assert ax.get_ylabel() == "test2" ax = scprep.plot.scatter( - x=self.X_pca[:,0], y=pd.Series(self.X_pca[:,1], name='y'), - z=pd.Series(self.X_pca[:,2], name='z'), - ylabel='override') - assert ax.get_xlabel() == '' + x=self.X_pca[:, 0], + y=pd.Series(self.X_pca[:, 1], name="y"), + z=pd.Series(self.X_pca[:, 2], name="z"), + ylabel="override", + ) + assert ax.get_xlabel() == "" assert ax.get_ylabel() == "override" assert ax.get_zlabel() == "z" ax = scprep.plot.scatter( - x=self.X_pca[:,0], y=pd.Series(self.X_pca[:,1], name='y'), - z=pd.Series(self.X_pca[:,2], name='z'), - zlabel='override') - assert ax.get_xlabel() == '' + x=self.X_pca[:, 0], + y=pd.Series(self.X_pca[:, 1], name="y"), + z=pd.Series(self.X_pca[:, 2], name="z"), + zlabel="override", + ) + assert ax.get_xlabel() == "" assert ax.get_ylabel() == "y" assert ax.get_zlabel() == "override" def test_scatter_axis_savefig(self): - scprep.plot.scatter2d( - self.X_pca, filename="test.png") + scprep.plot.scatter2d(self.X_pca, filename="test.png") assert os.path.exists("test.png") def test_scatter_viewinit(self): - ax = scprep.plot.scatter3d( - self.X_pca, elev=80, azim=270) + ax = scprep.plot.scatter3d(self.X_pca, elev=80, azim=270) assert ax.elev == 80 assert ax.azim == 270 @@ -1062,23 
+1193,23 @@ def test_scatter3d_data_2d(self): ValueError, "Expected data.shape[1] >= 3. Got 2", scprep.plot.scatter3d, - self.X_pca[:,:2]) + self.X_pca[:, :2], + ) def test_scatter3d_data_2d_list(self): assert_raise_message( ValueError, "Expected data.shape[1] >= 3. Got 2", scprep.plot.scatter3d, - self.X_pca[:,:2].tolist()) + self.X_pca[:, :2].tolist(), + ) def test_scatter_rotate_gif(self): - scprep.plot.rotate_scatter3d(self.X_pca, fps=3, dpi=20, - filename="test.gif") + scprep.plot.rotate_scatter3d(self.X_pca, fps=3, dpi=20, filename="test.gif") assert os.path.exists("test.gif") def test_scatter_rotate_mp4(self): - scprep.plot.rotate_scatter3d(self.X_pca, fps=3, dpi=20, - filename="test.mp4") + scprep.plot.rotate_scatter3d(self.X_pca, fps=3, dpi=20, filename="test.mp4") assert os.path.exists("test.mp4") def test_scatter_rotate_invalid_filename(self): @@ -1086,156 +1217,232 @@ def test_scatter_rotate_invalid_filename(self): ValueError, "filename must end in .gif or .mp4. Got test.invalid", scprep.plot.rotate_scatter3d, - self.X_pca, fps=3, dpi=20, filename="test.invalid") + self.X_pca, + fps=3, + dpi=20, + filename="test.invalid", + ) def test_scatter_invalid_data(self): assert_raise_message( - ValueError, "Expected all axes of data to have the same length. " + ValueError, + "Expected all axes of data to have the same length. " "Got {}".format([self.X_pca.shape[0], self.X_pca.shape[1]]), - scprep.plot.scatter, x=self.X_pca[:, 0], y=self.X_pca[0, :]) + scprep.plot.scatter, + x=self.X_pca[:, 0], + y=self.X_pca[0, :], + ) assert_raise_message( - ValueError, "Expected all axes of data to have the same length. " - "Got {}".format([self.X_pca.shape[0], self.X_pca.shape[0], - self.X_pca.shape[1]]), - scprep.plot.scatter, x=self.X_pca[:, 0], y=self.X_pca[:, 0], - z=self.X_pca[0, :]) + ValueError, + "Expected all axes of data to have the same length. " + "Got {}".format( + [self.X_pca.shape[0], self.X_pca.shape[0], self.X_pca.shape[1]] + ), + scprep.plot.scatter, + x=self.X_pca[:, 0], + y=self.X_pca[:, 0], + z=self.X_pca[0, :], + ) def test_scatter_invalid_c(self): assert_raise_message( - ValueError, "Expected c of length {} or 1. Got {}".format( - self.X_pca.shape[0], self.X_pca.shape[1]), - scprep.plot.scatter2d, self.X_pca, - c=self.X_pca[0, :]) + ValueError, + "Expected c of length {} or 1. Got {}".format( + self.X_pca.shape[0], self.X_pca.shape[1] + ), + scprep.plot.scatter2d, + self.X_pca, + c=self.X_pca[0, :], + ) def test_scatter_invalid_s(self): assert_raise_message( - ValueError, "Expected s of length {} or 1. Got {}".format( - self.X_pca.shape[0], self.X_pca.shape[1]), - scprep.plot.scatter2d, self.X_pca, - s=self.X_pca[0, :]) + ValueError, + "Expected s of length {} or 1. Got {}".format( + self.X_pca.shape[0], self.X_pca.shape[1] + ), + scprep.plot.scatter2d, + self.X_pca, + s=self.X_pca[0, :], + ) def test_scatter_invalid_mask(self): assert_raise_message( - ValueError, "Expected mask of length {}. Got {}".format( - self.X_pca.shape[0], self.X_pca.shape[1]), - scprep.plot.scatter2d, self.X_pca, - mask=self.X_pca[0, :] > 0) + ValueError, + "Expected mask of length {}. 
Got {}".format( + self.X_pca.shape[0], self.X_pca.shape[1] + ), + scprep.plot.scatter2d, + self.X_pca, + mask=self.X_pca[0, :] > 0, + ) def test_scatter_invalid_discrete(self): assert_raise_message( - ValueError, "Cannot treat non-numeric data as continuous", - scprep.plot.scatter2d, self.X_pca, discrete=False, - c=np.random.choice( - ['hello', 'world'], self.X_pca.shape[0], replace=True)) + ValueError, + "Cannot treat non-numeric data as continuous", + scprep.plot.scatter2d, + self.X_pca, + discrete=False, + c=np.random.choice(["hello", "world"], self.X_pca.shape[0], replace=True), + ) def test_scatter_invalid_legend(self): assert_warns_message( - UserWarning, "`c` is a color array and cannot be used to create a " + UserWarning, + "`c` is a color array and cannot be used to create a " "legend. To interpret these values as labels instead, " "provide a `cmap` dictionary with label-color pairs.", - scprep.plot.scatter2d, self.X_pca, legend=True, - c=np.random.choice(['red', 'blue'], - self.X_pca.shape[0], replace=True)) + scprep.plot.scatter2d, + self.X_pca, + legend=True, + c=np.random.choice(["red", "blue"], self.X_pca.shape[0], replace=True), + ) assert_warns_message( - UserWarning, "Cannot create a legend with constant `c=red`", - scprep.plot.scatter2d, self.X_pca, legend=True, - c='red') + UserWarning, + "Cannot create a legend with constant `c=red`", + scprep.plot.scatter2d, + self.X_pca, + legend=True, + c="red", + ) assert_warns_message( - UserWarning, "Cannot create a legend with constant `c=None`", - scprep.plot.scatter2d, self.X_pca, legend=True, - c=None) + UserWarning, + "Cannot create a legend with constant `c=None`", + scprep.plot.scatter2d, + self.X_pca, + legend=True, + c=None, + ) def test_scatter_invalid_axis(self): fig, ax = plt.subplots() assert_raise_message( - TypeError, "Expected ax with projection='3d'. " - "Got 2D axis instead.", - scprep.plot.scatter3d, self.X_pca, ax=ax) + TypeError, + "Expected ax with projection='3d'. " "Got 2D axis instead.", + scprep.plot.scatter3d, + self.X_pca, + ax=ax, + ) def test_scatter_colorbar(self): scprep.plot.scatter3d(self.X_pca, c=self.X_pca[:, 0], colorbar=True) def test_scatter_colorbar_log(self): - scprep.plot.scatter2d(self.X_pca, c=np.abs(self.X_pca[:, 0]) + 1e-7, - colorbar=True, cmap_scale='log') + scprep.plot.scatter2d( + self.X_pca, + c=np.abs(self.X_pca[:, 0]) + 1e-7, + colorbar=True, + cmap_scale="log", + ) def test_scatter_colorbar_log_constant_c(self): assert_warns_message( UserWarning, "Cannot use non-linear `cmap_scale` with constant `c=blue`", - scprep.plot.scatter2d, self.X_pca, c='blue', - colorbar=True, cmap_scale='log') + scprep.plot.scatter2d, + self.X_pca, + c="blue", + colorbar=True, + cmap_scale="log", + ) def test_scatter_colorbar_log_discrete(self): assert_warns_message( UserWarning, "Cannot use non-linear `cmap_scale` with discrete data.", - scprep.plot.scatter2d, self.X_pca, - c=np.random.choice(['hello', 'world'], self.X_pca.shape[0]), - colorbar=True, cmap_scale='log') + scprep.plot.scatter2d, + self.X_pca, + c=np.random.choice(["hello", "world"], self.X_pca.shape[0]), + colorbar=True, + cmap_scale="log", + ) def test_scatter_colorbar_log_negative(self): assert_raise_message( - ValueError, "`vmin` must be positive for `cmap_scale='log'`. " + ValueError, + "`vmin` must be positive for `cmap_scale='log'`. 
" "Got {}".format(self.X_pca[:, 0].min()), - scprep.plot.scatter2d, self.X_pca, + scprep.plot.scatter2d, + self.X_pca, c=self.X_pca[:, 0], - colorbar=True, cmap_scale='log') + colorbar=True, + cmap_scale="log", + ) def test_scatter_colorbar_symlog(self): - scprep.plot.scatter2d(self.X_pca, c=self.X_pca[:, 0], - colorbar=True, cmap_scale='symlog') + scprep.plot.scatter2d( + self.X_pca, c=self.X_pca[:, 0], colorbar=True, cmap_scale="symlog" + ) def test_scatter_colorbar_sqrt(self): - scprep.plot.scatter2d(self.X_pca, c=self.X_pca[:, 0], - colorbar=True, cmap_scale='sqrt') + scprep.plot.scatter2d( + self.X_pca, c=self.X_pca[:, 0], colorbar=True, cmap_scale="sqrt" + ) def test_scatter_colorbar_invalid(self): assert_raise_message( - ValueError, "Expected norm in ['linear', 'log', 'symlog'," + ValueError, + "Expected norm in ['linear', 'log', 'symlog'," "'sqrt'] or a matplotlib.colors.Normalize object." " Got invalid", scprep.plot.scatter2d, - self.X_pca, c=self.X_pca[:, 0], - colorbar=True, cmap_scale='invalid') + self.X_pca, + c=self.X_pca[:, 0], + colorbar=True, + cmap_scale="invalid", + ) def test_scatter_legend_and_colorbar(self): assert_raise_message( - ValueError, "Received conflicting values for synonyms " + ValueError, + "Received conflicting values for synonyms " "`legend=True` and `colorbar=False`", - scprep.plot.scatter2d, self.X_pca, c=self.X_pca[:, 0], - legend=True, colorbar=False) + scprep.plot.scatter2d, + self.X_pca, + c=self.X_pca[:, 0], + legend=True, + colorbar=False, + ) def test_scatter_vmin_vmax(self): - scprep.plot.scatter2d( - self.X_pca, c=self.X_pca[:, 0], vmin=1, vmax=2) + scprep.plot.scatter2d(self.X_pca, c=self.X_pca[:, 0], vmin=1, vmax=2) def test_scatter_vmin_vmax_discrete(self): assert_warns_message( - UserWarning, "Cannot set `vmin` or `vmax` with discrete data. " - "Setting to `None`.", scprep.plot.scatter3d, - self.X_pca, c=np.random.choice( - ['hello', 'world'], self.X_pca.shape[0], replace=True), - vmin=1, vmax=2) + UserWarning, + "Cannot set `vmin` or `vmax` with discrete data. " "Setting to `None`.", + scprep.plot.scatter3d, + self.X_pca, + c=np.random.choice(["hello", "world"], self.X_pca.shape[0], replace=True), + vmin=1, + vmax=2, + ) def test_scatter_vmin_vmax_solid_color(self): assert_warns_message( - UserWarning, "Cannot set `vmin` or `vmax` with constant `c=red`. " - "Setting `vmin = vmax = None`.", scprep.plot.scatter3d, - self.X_pca, c='red', vmin=1, vmax=2) + UserWarning, + "Cannot set `vmin` or `vmax` with constant `c=red`. 
" + "Setting `vmin = vmax = None`.", + scprep.plot.scatter3d, + self.X_pca, + c="red", + vmin=1, + vmax=2, + ) def test_generate_colorbar_n_ticks(self): - cb = scprep.plot.tools.generate_colorbar('inferno', vmin=0, vmax=1, - n_ticks=4) + cb = scprep.plot.tools.generate_colorbar("inferno", vmin=0, vmax=1, n_ticks=4) assert len(cb.get_ticks()) == 4 def test_generate_colorbar_vmin_vmax_none(self): - cb = scprep.plot.tools.generate_colorbar('inferno') + cb = scprep.plot.tools.generate_colorbar("inferno") assert_warns_message( UserWarning, "Cannot set `n_ticks` without setting `vmin` and `vmax`.", scprep.plot.tools.generate_colorbar, - n_ticks=4) + n_ticks=4, + ) def test_generate_colorbar_mappable(self): im = plt.imshow([np.arange(10), np.arange(10)]) @@ -1244,81 +1451,103 @@ def test_generate_colorbar_mappable(self): UserWarning, "Cannot set `vmin` or `vmax` when `mappable` is given.", scprep.plot.tools.generate_colorbar, - mappable=im, vmin=10, vmax=20) + mappable=im, + vmin=10, + vmax=20, + ) assert_warns_message( UserWarning, "Cannot set `cmap` when `mappable` is given.", scprep.plot.tools.generate_colorbar, - mappable=im, cmap='inferno') + mappable=im, + cmap="inferno", + ) assert_warns_message( UserWarning, "Cannot set `scale` when `mappable` is given.", scprep.plot.tools.generate_colorbar, - mappable=im, scale='log') + mappable=im, + scale="log", + ) def test_generate_colorbar_vmin_none_vmax_given(self): assert_raise_message( ValueError, "Either both or neither of `vmax` and `vmin` should be set. " "Got `vmax=None, vmin=0`", - scprep.plot.tools.generate_colorbar, 'inferno', vmin=0) + scprep.plot.tools.generate_colorbar, + "inferno", + vmin=0, + ) def test_marker_plot_dict(self): scprep.plot.marker_plot( data=self.X, clusters=np.random.choice( - np.arange(10), replace=True, size=self.X.shape[0]), + np.arange(10), replace=True, size=self.X.shape[0] + ), gene_names=self.X.columns, - markers={'tissue': self.X.columns[:2], - 'other tissue': self.X.columns[2:4]}) + markers={"tissue": self.X.columns[:2], "other tissue": self.X.columns[2:4]}, + ) def test_marker_plot_single_marker(self): scprep.plot.marker_plot( data=self.X, clusters=np.random.choice( - np.arange(10), replace=True, size=self.X.shape[0]), + np.arange(10), replace=True, size=self.X.shape[0] + ), gene_names=self.X.columns, - markers={'tissue': [self.X.columns[0]], - 'other tissue': self.X.columns[2:4]}) + markers={ + "tissue": [self.X.columns[0]], + "other tissue": self.X.columns[2:4], + }, + ) def test_marker_plot_repeat_marker(self): scprep.plot.marker_plot( data=self.X, clusters=np.random.choice( - np.arange(10), replace=True, size=self.X.shape[0]), + np.arange(10), replace=True, size=self.X.shape[0] + ), gene_names=self.X.columns, - markers={'tissue': self.X.columns[:3], - 'other tissue': self.X.columns[2:4]}) + markers={"tissue": self.X.columns[:3], "other tissue": self.X.columns[2:4]}, + ) def test_marker_plot_list(self): scprep.plot.marker_plot( data=self.X, clusters=np.random.choice( - np.arange(10), replace=True, size=self.X.shape[0]), + np.arange(10), replace=True, size=self.X.shape[0] + ), markers=self.X.columns, - normalize_emd=False, normalize_expression=False) + normalize_emd=False, + normalize_expression=False, + ) def test_marker_plot_bad_gene_names(self): assert_raise_message( ValueError, - 'All genes in `markers` must appear ' - 'in gene_names. Did not find: {}'.format('z'), + "All genes in `markers` must appear " + "in gene_names. 
Did not find: {}".format("z"), scprep.plot.marker_plot, data=self.X, clusters=np.random.choice( - np.arange(10), replace=True, size=self.X.shape[0]), + np.arange(10), replace=True, size=self.X.shape[0] + ), gene_names=self.X.columns, - markers={'tissue': ['z']}) + markers={"tissue": ["z"]}, + ) def test_marker_plot_pandas_gene_names(self): scprep.plot.marker_plot( data=self.X, clusters=np.random.choice( - np.arange(10), replace=True, size=self.X.shape[0]), - markers={'tissue': self.X.columns[:2], - 'other tissue': self.X.columns[2:4]}, + np.arange(10), replace=True, size=self.X.shape[0] + ), + markers={"tissue": self.X.columns[:2], "other tissue": self.X.columns[2:4]}, reorder_tissues=False, - reorder_markers=False) + reorder_markers=False, + ) def test_marker_plot_no_gene_names(self): assert_raise_message( @@ -1329,38 +1558,34 @@ def test_marker_plot_no_gene_names(self): scprep.plot.marker_plot, data=self.X.to_numpy(), clusters=np.random.choice( - np.arange(10), replace=True, size=self.X.shape[0]), - markers={'tissue': ['z']}) + np.arange(10), replace=True, size=self.X.shape[0] + ), + markers={"tissue": ["z"]}, + ) def test_label_axis_va(self): ax = scprep.plot.scatter2d(self.X_pca) - scprep.plot.tools.label_axis( - ax.yaxis, ticklabel_vertical_alignment="top") + scprep.plot.tools.label_axis(ax.yaxis, ticklabel_vertical_alignment="top") for tick in ax.yaxis.get_ticklabels(): assert tick.get_va() == "top" - scprep.plot.tools.label_axis( - ax.yaxis, ticklabel_vertical_alignment="bottom") + scprep.plot.tools.label_axis(ax.yaxis, ticklabel_vertical_alignment="bottom") for tick in ax.yaxis.get_ticklabels(): assert tick.get_va() == "bottom" def test_label_axis_ha(self): ax = scprep.plot.scatter2d(self.X_pca) - scprep.plot.tools.label_axis( - ax.xaxis, ticklabel_horizontal_alignment="left") + scprep.plot.tools.label_axis(ax.xaxis, ticklabel_horizontal_alignment="left") for tick in ax.xaxis.get_ticklabels(): assert tick.get_ha() == "left" - scprep.plot.tools.label_axis( - ax.xaxis, ticklabel_horizontal_alignment="right") + scprep.plot.tools.label_axis(ax.xaxis, ticklabel_horizontal_alignment="right") for tick in ax.xaxis.get_ticklabels(): assert tick.get_ha() == "right" def test_label_axis_rotation(self): ax = scprep.plot.scatter2d(self.X_pca) - scprep.plot.tools.label_axis( - ax.xaxis, ticklabel_rotation=45) + scprep.plot.tools.label_axis(ax.xaxis, ticklabel_rotation=45) for tick in ax.xaxis.get_ticklabels(): assert tick.get_rotation() == 45 - scprep.plot.tools.label_axis( - ax.xaxis, ticklabel_rotation=90) + scprep.plot.tools.label_axis(ax.xaxis, ticklabel_rotation=90) for tick in ax.xaxis.get_ticklabels(): assert tick.get_rotation() == 90 diff --git a/test/test_reduce.py b/test/test_reduce.py index 94179d16..21e7de4a 100644 --- a/test/test_reduce.py +++ b/test/test_reduce.py @@ -10,7 +10,6 @@ class TestPCA(unittest.TestCase): - @classmethod def setUpClass(self): self.X = data.generate_positive_sparse_matrix(shape=[100, 3000]) @@ -18,110 +17,173 @@ def setUpClass(self): random_pca_op = decomposition.PCA(100, random_state=42) self.Y_random = random_pca_op.fit_transform(self.X) self.S_random = random_pca_op.singular_values_ - full_pca_op = decomposition.PCA(50, svd_solver='full') + full_pca_op = decomposition.PCA(50, svd_solver="full") self.Y_full = full_pca_op.fit_transform(self.X) self.S_full = full_pca_op.singular_values_ def test_dense(self): matrix.test_dense_matrix_types( - self.X, utils.assert_transform_equals, - Y=self.Y_random, transform=scprep.reduce.pca, - n_components=100, seed=42) + 
self.X, + utils.assert_transform_equals, + Y=self.Y_random, + transform=scprep.reduce.pca, + n_components=100, + seed=42, + ) matrix.test_all_matrix_types( - self.X, utils.assert_transform_equals, - Y=self.Y_random, transform=scprep.reduce.pca, - n_components=100, seed=42, method='dense', - check=partial(utils.assert_all_close, atol=1e-10)) + self.X, + utils.assert_transform_equals, + Y=self.Y_random, + transform=scprep.reduce.pca, + n_components=100, + seed=42, + method="dense", + check=partial(utils.assert_all_close, atol=1e-10), + ) def test_sparse_svd(self): matrix.test_sparse_matrix_types( - self.X, utils.assert_transform_equals, - Y=self.Y_full, transform=scprep.reduce.pca, + self.X, + utils.assert_transform_equals, + Y=self.Y_full, + transform=scprep.reduce.pca, check=partial(utils.assert_all_close, rtol=1e-3, atol=1e-5), - n_components=50, eps=0.3, seed=42, method='svd') + n_components=50, + eps=0.3, + seed=42, + method="svd", + ) def test_pandas(self): - X = pd.DataFrame(self.X, index=np.arange(self.X.shape[0]).astype(str), - columns=np.arange(self.X.shape[1]).astype(float)) + X = pd.DataFrame( + self.X, + index=np.arange(self.X.shape[0]).astype(str), + columns=np.arange(self.X.shape[1]).astype(float), + ) + def test_fun(X_pd): Y = scprep.reduce.pca(X_pd, n_components=100, seed=42) assert isinstance(Y, pd.DataFrame) assert np.all(Y.index == X.index) - assert np.all(Y.columns == np.array(['PC{}'.format(i+1) - for i in range(Y.shape[1])])) - matrix.test_pandas_matrix_types( - X, test_fun) + assert np.all( + Y.columns == np.array(["PC{}".format(i + 1) for i in range(Y.shape[1])]) + ) + + matrix.test_pandas_matrix_types(X, test_fun) def test_sparse_orth_rproj(self): def test_fn(*args, **kwargs): return scprep.utils.toarray(scprep.reduce.pca(*args, **kwargs)) + matrix.test_sparse_matrix_types( - self.X, utils.assert_transform_equals, + self.X, + utils.assert_transform_equals, check=utils.assert_matrix_class_equivalent, - Y=self.Y_full, transform=test_fn, - n_components=50, eps=0.3, seed=42, method='orth_rproj') + Y=self.Y_full, + transform=test_fn, + n_components=50, + eps=0.3, + seed=42, + method="orth_rproj", + ) def test_singular_values_dense(self): utils.assert_all_equal( - self.S_random, scprep.reduce.pca( - self.X, n_components=100, - seed=42, return_singular_values=True)[1]) + self.S_random, + scprep.reduce.pca( + self.X, n_components=100, seed=42, return_singular_values=True + )[1], + ) def test_singular_values_sparse(self): utils.assert_all_close( - self.S_full, scprep.reduce.pca( - self.X_sparse, n_components=50, - eps=0.3, seed=42, return_singular_values=True)[1], atol=1e-14) + self.S_full, + scprep.reduce.pca( + self.X_sparse, + n_components=50, + eps=0.3, + seed=42, + return_singular_values=True, + )[1], + atol=1e-14, + ) def test_sparse_rproj(self): def test_fn(*args, **kwargs): return scprep.utils.toarray(scprep.reduce.pca(*args, **kwargs)) + matrix.test_sparse_matrix_types( - self.X, utils.assert_transform_equals, + self.X, + utils.assert_transform_equals, check=utils.assert_matrix_class_equivalent, - Y=self.Y_full, transform=test_fn, - n_components=50, eps=0.3, seed=42, method='rproj') + Y=self.Y_full, + transform=test_fn, + n_components=50, + eps=0.3, + seed=42, + method="rproj", + ) def test_eps_too_low(self): utils.assert_all_close( - self.Y_random, scprep.reduce.pca(self.X_sparse, n_components=100, - eps=0.0001, seed=42)) + self.Y_random, + scprep.reduce.pca(self.X_sparse, n_components=100, eps=0.0001, seed=42), + ) def test_invalid_method(self): 
assert_raise_message( - ValueError, "Expected `method` in ['svd', 'orth_rproj', 'rproj']. " - "Got 'invalid'", scprep.reduce.pca, self.X_sparse, - method='invalid') + ValueError, + "Expected `method` in ['svd', 'orth_rproj', 'rproj']. " "Got 'invalid'", + scprep.reduce.pca, + self.X_sparse, + method="invalid", + ) def test_bad_n_components(self): assert_raise_message( ValueError, - "n_components=0 must be between 0 and " - "min(n_samples, n_features)=100", - scprep.reduce.pca, self.X, n_components=0) + "n_components=0 must be between 0 and " "min(n_samples, n_features)=100", + scprep.reduce.pca, + self.X, + n_components=0, + ) assert_raise_message( ValueError, - "n_components=101 must be between 0 and " - "min(n_samples, n_features)=100", - scprep.reduce.pca, self.X, n_components=101) + "n_components=101 must be between 0 and " "min(n_samples, n_features)=100", + scprep.reduce.pca, + self.X, + n_components=101, + ) def test_deprecated(self): assert_warns_message( FutureWarning, "n_pca is deprecated. Setting n_components=2", - scprep.reduce.pca, self.X, n_pca=2) + scprep.reduce.pca, + self.X, + n_pca=2, + ) assert_warns_message( FutureWarning, "svd_offset is deprecated. Please use `eps` instead.", - scprep.reduce.pca, self.X, n_components=2, svd_offset=100) + scprep.reduce.pca, + self.X, + n_components=2, + svd_offset=100, + ) assert_warns_message( FutureWarning, "svd_multiples is deprecated. Please use `eps` instead.", - scprep.reduce.pca, self.X, n_components=2, svd_multiples=100) + scprep.reduce.pca, + self.X, + n_components=2, + svd_multiples=100, + ) def test_rproj_operator(self): pca_op = scprep.reduce.SparseInputPCA( - n_components=50, eps=0.3, seed=42, method='rproj') + n_components=50, eps=0.3, seed=42, method="rproj" + ) assert pca_op.fit(self.X_sparse) == pca_op Y = pca_op.transform(self.X_sparse) assert Y.shape == (self.X_sparse.shape[0], 50) @@ -129,12 +191,15 @@ def test_rproj_operator(self): assert len(pca_op.explained_variance_) == 50 assert len(pca_op.explained_variance_ratio_) == 50 assert pca_op.components_.shape == (50, self.X_sparse.shape[1]) - assert pca_op.inverse_transform( - pca_op.components_[:, [0]].T).shape == (1, self.X_sparse.shape[1]) + assert pca_op.inverse_transform(pca_op.components_[:, [0]].T).shape == ( + 1, + self.X_sparse.shape[1], + ) def test_orth_operator(self): pca_op = scprep.reduce.SparseInputPCA( - n_components=50, eps=0.3, seed=42, method='orth_rproj') + n_components=50, eps=0.3, seed=42, method="orth_rproj" + ) assert pca_op.fit(self.X_sparse) == pca_op Y = pca_op.transform(self.X_sparse) assert Y.shape == (self.X_sparse.shape[0], 50) @@ -142,5 +207,7 @@ def test_orth_operator(self): assert len(pca_op.explained_variance_) == 50 assert len(pca_op.explained_variance_ratio_) == 50 assert pca_op.components_.shape == (50, self.X_sparse.shape[1]) - assert pca_op.inverse_transform( - pca_op.components_[:, [0]].T).shape == (1, self.X_sparse.shape[1]) + assert pca_op.inverse_transform(pca_op.components_[:, [0]].T).shape == ( + 1, + self.X_sparse.shape[1], + ) diff --git a/test/test_run.py b/test/test_run.py index f5cff38c..ab37ba37 100644 --- a/test/test_run.py +++ b/test/test_run.py @@ -1,5 +1,6 @@ import sys -if int(sys.version.split('.')[1]) < 6: + +if int(sys.version.split(".")[1]) < 6: # python 3.5 pass else: @@ -16,192 +17,211 @@ builtin_warning = rpy2.rinterface_lib.callbacks.consolewrite_warnerror - def test_verbose(): fun = scprep.run.RFunction( setup="message('This should not print')", - body="message('Verbose test\n\n'); list(1,2,3)", 
verbose=True) + body="message('Verbose test\n\n'); list(1,2,3)", + verbose=True, + ) assert np.all(fun() == np.array([[1], [2], [3]])) - def test_install_bioc(): assert_raise_message( rpy2.rinterface_lib.embedded.RRuntimeError, "Error: Bioconductor version '3.1' requires R version '3.2'; see", scprep.run.install_bioconductor, - version='3.1', site_repository='https://bioconductor.org/packages/3.1/bioc', - verbose=False) - + version="3.1", + site_repository="https://bioconductor.org/packages/3.1/bioc", + verbose=False, + ) class TestSplatter(unittest.TestCase): - @classmethod def setUpClass(self): scprep.run.splatter.install(verbose=False) def test_splatter_default(self): - sim = scprep.run.SplatSimulate( - batch_cells=10, n_genes=200, verbose=0) - assert sim['counts'].shape == (10, 200) - assert np.all(sim['batch'] == 'Batch1') - assert sim['batch_cell_means'].shape == (10, 200) - assert sim['base_cell_means'].shape == (10, 200) - assert sim['bcv'].shape == (10, 200) - assert sim['cell_means'].shape == (10, 200) - assert sim['true_counts'].shape == (10, 200) - assert sim['dropout'] is None - assert sim['step'].shape == (10,) - assert sim['group'].shape == (10,) - assert sim['exp_lib_size'].shape == (10,) - assert sim['base_gene_mean'].shape == (200,) - assert sim['outlier_factor'].shape == (200,) - assert sum(['batch_fac' in k for k in sim.keys()]) == 0 - assert sum(['de_fac' in k for k in sim.keys()]) == 1 - assert sim['de_fac_1'].shape == (200,) - assert sum(['sigma_fac' in k for k in sim.keys()]) == 1 - assert sim['sigma_fac_1'].shape == (200,) + sim = scprep.run.SplatSimulate(batch_cells=10, n_genes=200, verbose=0) + assert sim["counts"].shape == (10, 200) + assert np.all(sim["batch"] == "Batch1") + assert sim["batch_cell_means"].shape == (10, 200) + assert sim["base_cell_means"].shape == (10, 200) + assert sim["bcv"].shape == (10, 200) + assert sim["cell_means"].shape == (10, 200) + assert sim["true_counts"].shape == (10, 200) + assert sim["dropout"] is None + assert sim["step"].shape == (10,) + assert sim["group"].shape == (10,) + assert sim["exp_lib_size"].shape == (10,) + assert sim["base_gene_mean"].shape == (200,) + assert sim["outlier_factor"].shape == (200,) + assert sum(["batch_fac" in k for k in sim.keys()]) == 0 + assert sum(["de_fac" in k for k in sim.keys()]) == 1 + assert sim["de_fac_1"].shape == (200,) + assert sum(["sigma_fac" in k for k in sim.keys()]) == 1 + assert sim["sigma_fac_1"].shape == (200,) def test_splatter_batch(self): - sim = scprep.run.SplatSimulate( - batch_cells=[5, 5], n_genes=200, verbose=0) - assert sim['counts'].shape == (10, 200) - assert np.all(sim['batch'][:5] == 'Batch1') - assert np.all(sim['batch'][5:] == 'Batch2') - assert sim['batch_cell_means'].shape == (10, 200) - assert sim['base_cell_means'].shape == (10, 200) - assert sim['bcv'].shape == (10, 200) - assert sim['cell_means'].shape == (10, 200) - assert sim['true_counts'].shape == (10, 200) - assert sim['dropout'] is None - assert sim['step'].shape == (10,) - assert sim['group'].shape == (10,) - assert sim['exp_lib_size'].shape == (10,) - assert sim['base_gene_mean'].shape == (200,) - assert sim['outlier_factor'].shape == (200,) - assert sum(['batch_fac' in k for k in sim.keys()]) == 2 - assert sim['batch_fac_1'].shape == (200,) - assert sim['batch_fac_2'].shape == (200,) - assert sum(['de_fac' in k for k in sim.keys()]) == 1 - assert sim['de_fac_1'].shape == (200,) - assert sum(['sigma_fac' in k for k in sim.keys()]) == 1 - assert sim['sigma_fac_1'].shape == (200,) + sim = 
scprep.run.SplatSimulate(batch_cells=[5, 5], n_genes=200, verbose=0) + assert sim["counts"].shape == (10, 200) + assert np.all(sim["batch"][:5] == "Batch1") + assert np.all(sim["batch"][5:] == "Batch2") + assert sim["batch_cell_means"].shape == (10, 200) + assert sim["base_cell_means"].shape == (10, 200) + assert sim["bcv"].shape == (10, 200) + assert sim["cell_means"].shape == (10, 200) + assert sim["true_counts"].shape == (10, 200) + assert sim["dropout"] is None + assert sim["step"].shape == (10,) + assert sim["group"].shape == (10,) + assert sim["exp_lib_size"].shape == (10,) + assert sim["base_gene_mean"].shape == (200,) + assert sim["outlier_factor"].shape == (200,) + assert sum(["batch_fac" in k for k in sim.keys()]) == 2 + assert sim["batch_fac_1"].shape == (200,) + assert sim["batch_fac_2"].shape == (200,) + assert sum(["de_fac" in k for k in sim.keys()]) == 1 + assert sim["de_fac_1"].shape == (200,) + assert sum(["sigma_fac" in k for k in sim.keys()]) == 1 + assert sim["sigma_fac_1"].shape == (200,) def test_splatter_groups(self): - sim = scprep.run.SplatSimulate(method='groups', batch_cells=10, - group_prob=[0.5, 0.5], n_genes=200, - de_fac_loc=[0.1, 0.5], verbose=0) - assert sim['counts'].shape == (10, 200) - assert np.all(sim['batch'] == 'Batch1') - assert sim['batch_cell_means'].shape == (10, 200) - assert sim['base_cell_means'].shape == (10, 200) - assert sim['bcv'].shape == (10, 200) - assert sim['cell_means'].shape == (10, 200) - assert sim['true_counts'].shape == (10, 200) - assert sim['dropout'] is None - assert sim['step'] is None - assert sim['group'].shape == (10,) - assert sim['exp_lib_size'].shape == (10,) - assert sim['base_gene_mean'].shape == (200,) - assert sim['outlier_factor'].shape == (200,) - assert sum(['batch_fac' in k for k in sim.keys()]) == 0 - assert sum(['de_fac' in k for k in sim.keys()]) == 2 - assert sim['de_fac_1'].shape == (200,) - assert sim['de_fac_2'].shape == (200,) - assert sum(['sigma_fac' in k for k in sim.keys()]) == 0 + sim = scprep.run.SplatSimulate( + method="groups", + batch_cells=10, + group_prob=[0.5, 0.5], + n_genes=200, + de_fac_loc=[0.1, 0.5], + verbose=0, + ) + assert sim["counts"].shape == (10, 200) + assert np.all(sim["batch"] == "Batch1") + assert sim["batch_cell_means"].shape == (10, 200) + assert sim["base_cell_means"].shape == (10, 200) + assert sim["bcv"].shape == (10, 200) + assert sim["cell_means"].shape == (10, 200) + assert sim["true_counts"].shape == (10, 200) + assert sim["dropout"] is None + assert sim["step"] is None + assert sim["group"].shape == (10,) + assert sim["exp_lib_size"].shape == (10,) + assert sim["base_gene_mean"].shape == (200,) + assert sim["outlier_factor"].shape == (200,) + assert sum(["batch_fac" in k for k in sim.keys()]) == 0 + assert sum(["de_fac" in k for k in sim.keys()]) == 2 + assert sim["de_fac_1"].shape == (200,) + assert sim["de_fac_2"].shape == (200,) + assert sum(["sigma_fac" in k for k in sim.keys()]) == 0 def test_splatter_paths(self): - sim = scprep.run.SplatSimulate(method='paths', batch_cells=10, n_genes=200, - group_prob=[0.5, 0.5], path_from=[0, 0], - path_length=[100, 200], path_skew=[0.4, 0.6], - de_fac_loc=[0.1, 0.5], verbose=0) - assert sim['counts'].shape == (10, 200) - assert np.all(sim['batch'] == 'Batch1') - assert sim['batch_cell_means'].shape == (10, 200) - assert sim['base_cell_means'].shape == (10, 200) - assert sim['bcv'].shape == (10, 200) - assert sim['cell_means'].shape == (10, 200) - assert sim['true_counts'].shape == (10, 200) - assert sim['dropout'] is None - 
assert sim['step'].shape == (10,) - assert sim['group'].shape == (10,) - assert sim['exp_lib_size'].shape == (10,) - assert sim['base_gene_mean'].shape == (200,) - assert sim['outlier_factor'].shape == (200,) - assert sum(['batch_fac' in k for k in sim.keys()]) == 0 - assert sum(['de_fac' in k for k in sim.keys()]) == 2 - assert sim['de_fac_1'].shape == (200,) - assert sim['de_fac_2'].shape == (200,) - assert sum(['sigma_fac' in k for k in sim.keys()]) == 2 - assert sim['sigma_fac_1'].shape == (200,) - assert sim['sigma_fac_2'].shape == (200,) + sim = scprep.run.SplatSimulate( + method="paths", + batch_cells=10, + n_genes=200, + group_prob=[0.5, 0.5], + path_from=[0, 0], + path_length=[100, 200], + path_skew=[0.4, 0.6], + de_fac_loc=[0.1, 0.5], + verbose=0, + ) + assert sim["counts"].shape == (10, 200) + assert np.all(sim["batch"] == "Batch1") + assert sim["batch_cell_means"].shape == (10, 200) + assert sim["base_cell_means"].shape == (10, 200) + assert sim["bcv"].shape == (10, 200) + assert sim["cell_means"].shape == (10, 200) + assert sim["true_counts"].shape == (10, 200) + assert sim["dropout"] is None + assert sim["step"].shape == (10,) + assert sim["group"].shape == (10,) + assert sim["exp_lib_size"].shape == (10,) + assert sim["base_gene_mean"].shape == (200,) + assert sim["outlier_factor"].shape == (200,) + assert sum(["batch_fac" in k for k in sim.keys()]) == 0 + assert sum(["de_fac" in k for k in sim.keys()]) == 2 + assert sim["de_fac_1"].shape == (200,) + assert sim["de_fac_2"].shape == (200,) + assert sum(["sigma_fac" in k for k in sim.keys()]) == 2 + assert sim["sigma_fac_1"].shape == (200,) + assert sim["sigma_fac_2"].shape == (200,) def test_splatter_dropout(self): - sim = scprep.run.SplatSimulate(batch_cells=10, n_genes=200, - dropout_type='experiment', - verbose=0) - assert sim['counts'].shape == (10, 200) - assert np.all(sim['batch'] == 'Batch1') - assert sim['batch_cell_means'].shape == (10, 200) - assert sim['base_cell_means'].shape == (10, 200) - assert sim['bcv'].shape == (10, 200) - assert sim['cell_means'].shape == (10, 200) - assert sim['true_counts'].shape == (10, 200) - assert sim['dropout'].shape == (10, 200) - assert sim['step'].shape == (10,) - assert sim['group'].shape == (10,) - assert sim['exp_lib_size'].shape == (10,) - assert sim['base_gene_mean'].shape == (200,) - assert sim['outlier_factor'].shape == (200,) - assert sum(['batch_fac' in k for k in sim.keys()]) == 0 - assert sum(['de_fac' in k for k in sim.keys()]) == 1 - assert sim['de_fac_1'].shape == (200,) - assert sum(['sigma_fac' in k for k in sim.keys()]) == 1 - assert sim['sigma_fac_1'].shape == (200,) + sim = scprep.run.SplatSimulate( + batch_cells=10, n_genes=200, dropout_type="experiment", verbose=0 + ) + assert sim["counts"].shape == (10, 200) + assert np.all(sim["batch"] == "Batch1") + assert sim["batch_cell_means"].shape == (10, 200) + assert sim["base_cell_means"].shape == (10, 200) + assert sim["bcv"].shape == (10, 200) + assert sim["cell_means"].shape == (10, 200) + assert sim["true_counts"].shape == (10, 200) + assert sim["dropout"].shape == (10, 200) + assert sim["step"].shape == (10,) + assert sim["group"].shape == (10,) + assert sim["exp_lib_size"].shape == (10,) + assert sim["base_gene_mean"].shape == (200,) + assert sim["outlier_factor"].shape == (200,) + assert sum(["batch_fac" in k for k in sim.keys()]) == 0 + assert sum(["de_fac" in k for k in sim.keys()]) == 1 + assert sim["de_fac_1"].shape == (200,) + assert sum(["sigma_fac" in k for k in sim.keys()]) == 1 + assert 
sim["sigma_fac_1"].shape == (200,) def test_splatter_dropout_binomial(self): - sim = scprep.run.SplatSimulate(batch_cells=10, n_genes=200, - dropout_type='binomial', - dropout_prob=0.5, verbose=False) - assert sim['counts'].shape == (10, 200) - assert np.all(sim['batch'] == 'Batch1') - assert sim['batch_cell_means'].shape == (10, 200) - assert sim['base_cell_means'].shape == (10, 200) - assert sim['bcv'].shape == (10, 200) - assert sim['cell_means'].shape == (10, 200) - assert sim['true_counts'].shape == (10, 200) + sim = scprep.run.SplatSimulate( + batch_cells=10, + n_genes=200, + dropout_type="binomial", + dropout_prob=0.5, + verbose=False, + ) + assert sim["counts"].shape == (10, 200) + assert np.all(sim["batch"] == "Batch1") + assert sim["batch_cell_means"].shape == (10, 200) + assert sim["base_cell_means"].shape == (10, 200) + assert sim["bcv"].shape == (10, 200) + assert sim["cell_means"].shape == (10, 200) + assert sim["true_counts"].shape == (10, 200) dropout_proportion = np.mean( - sim['counts'][np.where(sim['true_counts'] > 0)] / - sim['true_counts'][np.where(sim['true_counts'] > 0)]) + sim["counts"][np.where(sim["true_counts"] > 0)] + / sim["true_counts"][np.where(sim["true_counts"] > 0)] + ) assert dropout_proportion < 0.55 assert dropout_proportion > 0.45 - assert sim['dropout'] is None - assert sim['step'].shape == (10,) - assert sim['group'].shape == (10,) - assert sim['exp_lib_size'].shape == (10,) - assert sim['base_gene_mean'].shape == (200,) - assert sim['outlier_factor'].shape == (200,) - assert sum(['batch_fac' in k for k in sim.keys()]) == 0 - assert sum(['de_fac' in k for k in sim.keys()]) == 1 - assert sim['de_fac_1'].shape == (200,) - assert sum(['sigma_fac' in k for k in sim.keys()]) == 1 - assert sim['sigma_fac_1'].shape == (200,) + assert sim["dropout"] is None + assert sim["step"].shape == (10,) + assert sim["group"].shape == (10,) + assert sim["exp_lib_size"].shape == (10,) + assert sim["base_gene_mean"].shape == (200,) + assert sim["outlier_factor"].shape == (200,) + assert sum(["batch_fac" in k for k in sim.keys()]) == 0 + assert sum(["de_fac" in k for k in sim.keys()]) == 1 + assert sim["de_fac_1"].shape == (200,) + assert sum(["sigma_fac" in k for k in sim.keys()]) == 1 + assert sim["sigma_fac_1"].shape == (200,) def test_splatter_warning(self): - assert rpy2.rinterface_lib.callbacks.consolewrite_warnerror is \ - builtin_warning + assert ( + rpy2.rinterface_lib.callbacks.consolewrite_warnerror is builtin_warning + ) scprep.run.r_function._ConsoleWarning.set_debug() - assert rpy2.rinterface_lib.callbacks.consolewrite_warnerror is \ - scprep.run.r_function._ConsoleWarning.debug + assert ( + rpy2.rinterface_lib.callbacks.consolewrite_warnerror + is scprep.run.r_function._ConsoleWarning.debug + ) scprep.run.r_function._ConsoleWarning.set_warning() - assert rpy2.rinterface_lib.callbacks.consolewrite_warnerror is \ - scprep.run.r_function._ConsoleWarning.warning + assert ( + rpy2.rinterface_lib.callbacks.consolewrite_warnerror + is scprep.run.r_function._ConsoleWarning.warning + ) scprep.run.r_function._ConsoleWarning.set_builtin() - assert rpy2.rinterface_lib.callbacks.consolewrite_warnerror is \ - builtin_warning - + assert ( + rpy2.rinterface_lib.callbacks.consolewrite_warnerror is builtin_warning + ) class TestSlingshot(unittest.TestCase): - @classmethod def setUpClass(self): scprep.run.slingshot.install(verbose=False) @@ -210,16 +230,22 @@ def setUpClass(self): self.clusters = sklearn.cluster.KMeans(6).fit_predict(self.X_pca) def test_slingshot(self): - 
slingshot = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, verbose=False) - pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] + slingshot = scprep.run.Slingshot( + self.X_pca[:, :2], self.clusters, verbose=False + ) + pseudotime, branch, curves = ( + slingshot["pseudotime"], + slingshot["branch"], + slingshot["curves"], + ) assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] assert branch.shape[0] == self.X_pca.shape[0] current_pseudotime = -1 for i in np.unique(branch): - branch_membership = np.isnan(pseudotime[branch==i]) + branch_membership = np.isnan(pseudotime[branch == i]) assert np.all(branch_membership == branch_membership[0]) - new_pseudotime = np.nanmean(pseudotime[branch==i]) + new_pseudotime = np.nanmean(pseudotime[branch == i]) assert new_pseudotime > current_pseudotime current_pseudotime = new_pseudotime assert curves.shape[1] == self.X_pca.shape[0] @@ -227,20 +253,27 @@ def test_slingshot(self): assert np.all(np.any(~np.isnan(pseudotime), axis=1)) def test_slingshot_pandas(self): - slingshot = scprep.run.Slingshot(pd.DataFrame(self.X_pca[:,:2], index=self.X.index), - self.clusters, verbose=False) - pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] + slingshot = scprep.run.Slingshot( + pd.DataFrame(self.X_pca[:, :2], index=self.X.index), + self.clusters, + verbose=False, + ) + pseudotime, branch, curves = ( + slingshot["pseudotime"], + slingshot["branch"], + slingshot["curves"], + ) assert np.all(pseudotime.index == self.X.index) assert np.all(branch.index == self.X.index) - assert branch.name == 'branch' + assert branch.name == "branch" assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] assert branch.shape[0] == self.X_pca.shape[0] current_pseudotime = -1 for i in np.unique(branch): - branch_membership = np.isnan(pseudotime.loc[branch==i]) + branch_membership = np.isnan(pseudotime.loc[branch == i]) assert np.all(branch_membership == branch_membership.iloc[0]) - new_pseudotime = np.nanmean(np.nanmean(pseudotime.loc[branch==i])) + new_pseudotime = np.nanmean(np.nanmean(pseudotime.loc[branch == i])) assert new_pseudotime > current_pseudotime current_pseudotime = new_pseudotime assert curves.shape[1] == self.X_pca.shape[0] @@ -252,38 +285,55 @@ def test_slingshot_distance(self): NotImplementedError, "distance argument not currently implemented", scprep.run.Slingshot, - self.X_pca, self.clusters, distance=lambda X, Y : np.sum(X-Y)) + self.X_pca, + self.clusters, + distance=lambda X, Y: np.sum(X - Y), + ) def test_slingshot_optional_args(self): - slingshot = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, - start_cluster=4, omega=0.1, verbose=False) - pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] + slingshot = scprep.run.Slingshot( + self.X_pca[:, :2], + self.clusters, + start_cluster=4, + omega=0.1, + verbose=False, + ) + pseudotime, branch, curves = ( + slingshot["pseudotime"], + slingshot["branch"], + slingshot["curves"], + ) assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] assert branch.shape[0] == self.X_pca.shape[0] current_pseudotime = -1 for i in np.unique(branch): - branch_membership = np.isnan(pseudotime[branch==i]) + branch_membership = np.isnan(pseudotime[branch == i]) assert np.all(branch_membership == branch_membership[0]) - if np.all(np.isnan(pseudotime[branch==i])): + if 
np.all(np.isnan(pseudotime[branch == i])): assert i == -1 else: - new_pseudotime = np.nanmean(pseudotime[branch==i]) + new_pseudotime = np.nanmean(pseudotime[branch == i]) assert new_pseudotime > current_pseudotime current_pseudotime = new_pseudotime assert curves.shape[1] == self.X_pca.shape[0] assert curves.shape[2] == 2 - slingshot = scprep.run.Slingshot(self.X_pca[:,:2], self.clusters, - end_cluster=0, verbose=False) - pseudotime, branch, curves = slingshot['pseudotime'], slingshot['branch'], slingshot['curves'] + slingshot = scprep.run.Slingshot( + self.X_pca[:, :2], self.clusters, end_cluster=0, verbose=False + ) + pseudotime, branch, curves = ( + slingshot["pseudotime"], + slingshot["branch"], + slingshot["curves"], + ) assert pseudotime.shape[0] == self.X_pca.shape[0] assert pseudotime.shape[1] == curves.shape[0] assert branch.shape[0] == self.X_pca.shape[0] current_pseudotime = -1 for i in np.unique(branch): - branch_membership = np.isnan(pseudotime[branch==i]) + branch_membership = np.isnan(pseudotime[branch == i]) assert np.all(branch_membership == branch_membership[0]) - new_pseudotime = np.nanmean(pseudotime[branch==i]) + new_pseudotime = np.nanmean(pseudotime[branch == i]) assert new_pseudotime > current_pseudotime current_pseudotime = new_pseudotime assert curves.shape[1] == self.X_pca.shape[0] @@ -293,14 +343,18 @@ def test_slingshot_optional_args(self): def test_slingshot_errors(self): assert_warns_message( UserWarning, - "Expected data to be low-dimensional. " - "Got data.shape[1] = 4", + "Expected data to be low-dimensional. " "Got data.shape[1] = 4", scprep.run.Slingshot, - self.X_pca[:, :4], self.clusters, verbose=False) + self.X_pca[:, :4], + self.clusters, + verbose=False, + ) assert_raise_message( ValueError, "Expected len(cluster_labels) ({}) to equal " - "data.shape[0] ({})".format( - self.X.shape[0]//2, self.X.shape[0]), + "data.shape[0] ({})".format(self.X.shape[0] // 2, self.X.shape[0]), scprep.run.Slingshot, - self.X_pca[:, :2], self.clusters[:self.X.shape[0]//2], verbose=False) + self.X_pca[:, :2], + self.clusters[: self.X.shape[0] // 2], + verbose=False, + ) diff --git a/test/test_sanitize.py b/test/test_sanitize.py index 9c46f611..9cd85193 100644 --- a/test/test_sanitize.py +++ b/test/test_sanitize.py @@ -10,7 +10,8 @@ def test_check_numeric_copy(): X, utils.assert_transform_unchanged, transform=scprep.sanitize.check_numeric, - copy=True) + copy=True, + ) def test_check_numeric_inplace(): @@ -18,31 +19,35 @@ def test_check_numeric_inplace(): matrix.test_matrix_types( X, utils.assert_transform_unchanged, - matrix._scipy_matrix_types + - matrix._numpy_matrix_types + - matrix._pandas_dense_matrix_types + - [matrix.SparseDataFrame], + matrix._scipy_matrix_types + + matrix._numpy_matrix_types + + matrix._pandas_dense_matrix_types + + [matrix.SparseDataFrame], transform=scprep.sanitize.check_numeric, - copy=False) + copy=False, + ) matrix._ignore_pandas_sparse_warning() assert_raise_message( TypeError, - "pd.SparseDataFrame does not support " - "copy=False. Please use copy=True.", + "pd.SparseDataFrame does not support " "copy=False. 
Please use copy=True.", scprep.sanitize.check_numeric, - data=matrix.SparseDataFrame_deprecated(X), copy=False) + data=matrix.SparseDataFrame_deprecated(X), + copy=False, + ) matrix._reset_warnings() class TypeErrorClass(object): - def astype(self, dtype): return + X = TypeErrorClass() assert_raise_message( TypeError, "astype() got an unexpected keyword argument 'copy'", scprep.sanitize.check_numeric, - data=X, copy=None) + data=X, + copy=None, + ) def test_check_numeric_bad_dtype(): @@ -50,4 +55,5 @@ def test_check_numeric_bad_dtype(): ValueError, "could not convert string to float: ", scprep.sanitize.check_numeric, - np.array(['hello', 'world'])) + np.array(["hello", "world"]), + ) diff --git a/test/test_select.py b/test/test_select.py index f59e479c..7ec3e04a 100644 --- a/test/test_select.py +++ b/test/test_select.py @@ -8,7 +8,6 @@ class Test10X(unittest.TestCase): - @classmethod def setUpClass(self): self.X = data.load_10X(sparse=False) @@ -16,300 +15,401 @@ def setUpClass(self): self.libsize = scprep.measure.library_size(self.X) def test_get_gene_set_starts_with(self): - gene_idx = np.argwhere([g.startswith("D") - for g in self.X.columns]).flatten() + gene_idx = np.argwhere([g.startswith("D") for g in self.X.columns]).flatten() gene_names = self.X.columns[gene_idx] - assert np.all(scprep.select.get_gene_set( - self.X, starts_with="D") == gene_names) - assert np.all(scprep.select.get_gene_set( - self.X, regex="^D") == gene_names) - assert np.all(scprep.select.get_gene_set( - self.X.columns, regex="^D") == gene_names) + assert np.all(scprep.select.get_gene_set(self.X, starts_with="D") == gene_names) + assert np.all(scprep.select.get_gene_set(self.X, regex="^D") == gene_names) + assert np.all( + scprep.select.get_gene_set(self.X.columns, regex="^D") == gene_names + ) def test_get_gene_set_ends_with(self): - gene_idx = np.argwhere([g.endswith("8") - for g in self.X.columns]).flatten() + gene_idx = np.argwhere([g.endswith("8") for g in self.X.columns]).flatten() gene_names = self.X.columns[gene_idx] - assert np.all(scprep.select.get_gene_set( - self.X, ends_with="8") == gene_names) - assert np.all(scprep.select.get_gene_set( - self.X, regex="8$") == gene_names) + assert np.all(scprep.select.get_gene_set(self.X, ends_with="8") == gene_names) + assert np.all(scprep.select.get_gene_set(self.X, regex="8$") == gene_names) def test_get_gene_set_ndarray(self): assert_raise_message( TypeError, - "data must be a list of gene names or a pandas " - "DataFrame. Got ndarray", + "data must be a list of gene names or a pandas " "DataFrame. Got ndarray", scprep.select.get_gene_set, - data=self.X.to_numpy(), regex="8$") + data=self.X.to_numpy(), + regex="8$", + ) def test_get_gene_set_no_condition(self): assert_warns_message( UserWarning, "No selection conditions provided. 
Returning all genes.", - scprep.select.get_gene_set, self.X) + scprep.select.get_gene_set, + self.X, + ) def test_get_cell_set_starts_with(self): - cell_idx = np.argwhere([g.startswith("A") - for g in self.X.index]).flatten() + cell_idx = np.argwhere([g.startswith("A") for g in self.X.index]).flatten() cell_names = self.X.index[cell_idx] - assert np.all(scprep.select.get_cell_set( - self.X, starts_with="A") == cell_names) - assert np.all(scprep.select.get_cell_set( - self.X, regex="^A") == cell_names) - assert np.all(scprep.select.get_cell_set( - self.X.index, regex="^A") == cell_names) + assert np.all(scprep.select.get_cell_set(self.X, starts_with="A") == cell_names) + assert np.all(scprep.select.get_cell_set(self.X, regex="^A") == cell_names) + assert np.all( + scprep.select.get_cell_set(self.X.index, regex="^A") == cell_names + ) def test_get_cell_set_ends_with(self): - cell_idx = np.argwhere([g.endswith("G-1") - for g in self.X.index]).flatten() + cell_idx = np.argwhere([g.endswith("G-1") for g in self.X.index]).flatten() cell_names = self.X.index[cell_idx] - assert np.all(scprep.select.get_cell_set( - self.X, ends_with="G-1") == cell_names) - assert np.all(scprep.select.get_cell_set( - self.X, regex="G\\-1$") == cell_names) + assert np.all(scprep.select.get_cell_set(self.X, ends_with="G-1") == cell_names) + assert np.all(scprep.select.get_cell_set(self.X, regex="G\\-1$") == cell_names) def test_get_cell_set_ndarray(self): assert_raise_message( TypeError, - "data must be a list of cell names or a pandas " - "DataFrame. Got ndarray", + "data must be a list of cell names or a pandas " "DataFrame. Got ndarray", scprep.select.get_cell_set, - data=self.X.to_numpy(), regex="G\\-1$") + data=self.X.to_numpy(), + regex="G\\-1$", + ) def test_get_cell_set_no_condition(self): assert_warns_message( UserWarning, "No selection conditions provided. 
Returning all cells.", - scprep.select.get_cell_set, self.X) + scprep.select.get_cell_set, + self.X, + ) def test_select_rows_boolean_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_rows, - idx=np.random.choice([True, False], [self.X.shape[0]])) + self.X, + scprep.select.select_rows, + idx=np.random.choice([True, False], [self.X.shape[0]]), + ) def test_select_rows_integer_array_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_rows, - idx=np.random.choice(self.X.shape[0], self.X.shape[0] // 2)) + self.X, + scprep.select.select_rows, + idx=np.random.choice(self.X.shape[0], self.X.shape[0] // 2), + ) def test_select_rows_integer_list_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_rows, - idx=np.random.choice(self.X.shape[0], self.X.shape[0] // 2).tolist()) + self.X, + scprep.select.select_rows, + idx=np.random.choice(self.X.shape[0], self.X.shape[0] // 2).tolist(), + ) def test_select_rows_integer_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_rows, - idx=np.random.choice(self.X.shape[0])) + self.X, scprep.select.select_rows, idx=np.random.choice(self.X.shape[0]) + ) def test_select_rows_string_array_index(self): matrix.test_pandas_matrix_types( - self.X, scprep.select.select_rows, - idx=np.random.choice(self.X.index.to_numpy(), self.X.shape[0] // 2)) + self.X, + scprep.select.select_rows, + idx=np.random.choice(self.X.index.to_numpy(), self.X.shape[0] // 2), + ) def test_select_rows_pandas_index_index(self): matrix.test_pandas_matrix_types( - self.X, scprep.select.select_rows, - idx=self.X.index[np.random.choice([True, False], [self.X.shape[0]])]) + self.X, + scprep.select.select_rows, + idx=self.X.index[np.random.choice([True, False], [self.X.shape[0]])], + ) def test_select_rows_series_index(self): matrix.test_pandas_matrix_types( - self.X, scprep.select.select_rows, - idx=pd.Series(self.X.index[np.random.choice([True, False], [self.X.shape[0]])])) + self.X, + scprep.select.select_rows, + idx=pd.Series( + self.X.index[np.random.choice([True, False], [self.X.shape[0]])] + ), + ) def test_select_rows_dataframe_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_rows, - idx=pd.DataFrame(np.random.choice([True, False], [self.X.shape[0], 1]), - index=self.X.index)) + self.X, + scprep.select.select_rows, + idx=pd.DataFrame( + np.random.choice([True, False], [self.X.shape[0], 1]), + index=self.X.index, + ), + ) def test_select_rows_series_data_boolean_index(self): scprep.select.select_rows( - self.X, self.X.iloc[:, 0], idx=np.random.choice([True, False], [self.X.shape[0]])) + self.X, + self.X.iloc[:, 0], + idx=np.random.choice([True, False], [self.X.shape[0]]), + ) def test_select_rows_sparse_series_data_boolean_index(self): scprep.select.select_rows( - self.X, self.X_sparse.iloc[:, 0], idx=np.random.choice([True, False], [self.X.shape[0]])) + self.X, + self.X_sparse.iloc[:, 0], + idx=np.random.choice([True, False], [self.X.shape[0]]), + ) def test_select_rows_series_data_integer_index(self): scprep.select.select_rows( - self.X, self.X.iloc[:, 0], idx=np.random.choice(self.X.shape[1], self.X.shape[0] // 2)) + self.X, + self.X.iloc[:, 0], + idx=np.random.choice(self.X.shape[1], self.X.shape[0] // 2), + ) def test_select_rows_sparse_series_data_integer_index(self): scprep.select.select_rows( - self.X, self.X_sparse.iloc[:, 0], idx=np.random.choice(self.X.shape[1], self.X.shape[0] // 2)) + self.X, + self.X_sparse.iloc[:, 0], + idx=np.random.choice(self.X.shape[1], 
self.X.shape[0] // 2), + ) def test_select_rows_1d_array_data(self): scprep.select.select_rows( - self.X, self.X.to_numpy()[:, 0], idx=np.random.choice([True, False], [self.X.shape[0]])) + self.X, + self.X.to_numpy()[:, 0], + idx=np.random.choice([True, False], [self.X.shape[0]]), + ) def test_select_rows_list_data(self): scprep.select.select_rows( - self.X, self.X.to_numpy()[:, 0].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, + self.X.to_numpy()[:, 0].tolist(), + idx=np.random.choice([True, False], [self.X.shape[1]]), + ) def test_select_rows_get_cell_set(self): matrix.test_pandas_matrix_types( - self.X, scprep.select.select_rows, self.X.iloc[:, 0], - starts_with="A") + self.X, scprep.select.select_rows, self.X.iloc[:, 0], starts_with="A" + ) def test_select_rows_zero_rows(self): assert_warns_message( UserWarning, "Selecting 0 rows", - scprep.select.select_rows, self.X, - idx=(self.X.sum(axis=1) < 0)) + scprep.select.select_rows, + self.X, + idx=(self.X.sum(axis=1) < 0), + ) def test_select_rows_no_condition(self): assert_warns_message( UserWarning, "No selection conditions provided. Returning all rows.", - scprep.select.select_rows, self.X) + scprep.select.select_rows, + self.X, + ) def test_select_cols_boolean_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_cols, - idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, + scprep.select.select_cols, + idx=np.random.choice([True, False], [self.X.shape[1]]), + ) def test_select_cols_integer_array_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_cols, - idx=np.random.choice(self.X.shape[1], self.X.shape[1] // 2)) + self.X, + scprep.select.select_cols, + idx=np.random.choice(self.X.shape[1], self.X.shape[1] // 2), + ) def test_select_cols_integer_list_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_cols, - idx=np.random.choice(self.X.shape[1], self.X.shape[1] // 2).tolist()) + self.X, + scprep.select.select_cols, + idx=np.random.choice(self.X.shape[1], self.X.shape[1] // 2).tolist(), + ) def test_select_cols_integer_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_cols, - idx=np.random.choice(self.X.shape[1])) + self.X, scprep.select.select_cols, idx=np.random.choice(self.X.shape[1]) + ) def test_select_cols_string_array_index(self): matrix.test_pandas_matrix_types( - self.X, scprep.select.select_cols, - idx=np.random.choice(self.X.columns.to_numpy(), self.X.shape[1] // 2)) + self.X, + scprep.select.select_cols, + idx=np.random.choice(self.X.columns.to_numpy(), self.X.shape[1] // 2), + ) def test_select_cols_pandas_index_index(self): matrix.test_pandas_matrix_types( - self.X, scprep.select.select_cols, - idx=self.X.columns[np.random.choice([True, False], [self.X.shape[1]])]) + self.X, + scprep.select.select_cols, + idx=self.X.columns[np.random.choice([True, False], [self.X.shape[1]])], + ) def test_select_cols_series_index(self): matrix.test_pandas_matrix_types( - self.X, scprep.select.select_cols, - idx=pd.Series(self.X.columns[np.random.choice([True, False], [self.X.shape[1]])])) + self.X, + scprep.select.select_cols, + idx=pd.Series( + self.X.columns[np.random.choice([True, False], [self.X.shape[1]])] + ), + ) def test_select_cols_dataframe_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_cols, - idx=pd.DataFrame(np.random.choice([True, False], [1, self.X.shape[1]]), - index=[1], columns=self.X.columns)) + self.X, + scprep.select.select_cols, + idx=pd.DataFrame( + 
np.random.choice([True, False], [1, self.X.shape[1]]), + index=[1], + columns=self.X.columns, + ), + ) def test_select_cols_sparse_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_cols, - idx=sparse.coo_matrix(np.random.choice([True, False], [1, self.X.shape[1]]))) + self.X, + scprep.select.select_cols, + idx=sparse.coo_matrix( + np.random.choice([True, False], [1, self.X.shape[1]]) + ), + ) matrix.test_all_matrix_types( - self.X, scprep.select.select_cols, - idx=sparse.coo_matrix(np.random.choice([True, False], [self.X.shape[1], 1]))) + self.X, + scprep.select.select_cols, + idx=sparse.coo_matrix( + np.random.choice([True, False], [self.X.shape[1], 1]) + ), + ) def test_select_rows_sparse_index(self): matrix.test_all_matrix_types( - self.X, scprep.select.select_rows, - idx=sparse.coo_matrix(np.random.choice([True, False], [1, self.X.shape[0]]))) + self.X, + scprep.select.select_rows, + idx=sparse.coo_matrix( + np.random.choice([True, False], [1, self.X.shape[0]]) + ), + ) matrix.test_all_matrix_types( - self.X, scprep.select.select_rows, - idx=sparse.coo_matrix(np.random.choice([True, False], [self.X.shape[0], 1]))) + self.X, + scprep.select.select_rows, + idx=sparse.coo_matrix( + np.random.choice([True, False], [self.X.shape[0], 1]) + ), + ) def test_select_cols_series_data_boolean_index(self): scprep.select.select_cols( - self.X, self.X.iloc[0, :], idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, + self.X.iloc[0, :], + idx=np.random.choice([True, False], [self.X.shape[1]]), + ) def test_select_cols_sparse_series_data_boolean_index(self): scprep.select.select_cols( - self.X, self.X_sparse.iloc[0, :], idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, + self.X_sparse.iloc[0, :], + idx=np.random.choice([True, False], [self.X.shape[1]]), + ) def test_select_cols_series_data_integer_index(self): scprep.select.select_cols( - self.X, self.X.iloc[0, :], idx=np.random.choice(self.X.shape[1], self.X.shape[1] // 2)) + self.X, + self.X.iloc[0, :], + idx=np.random.choice(self.X.shape[1], self.X.shape[1] // 2), + ) def test_select_cols_sparse_series_data_integer_index(self): scprep.select.select_cols( - self.X, self.X_sparse.iloc[0, :], idx=np.random.choice(self.X.shape[1], self.X.shape[1] // 2)) + self.X, + self.X_sparse.iloc[0, :], + idx=np.random.choice(self.X.shape[1], self.X.shape[1] // 2), + ) def test_select_cols_1d_array_data(self): scprep.select.select_cols( - self.X, self.X.to_numpy()[0, :], idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, + self.X.to_numpy()[0, :], + idx=np.random.choice([True, False], [self.X.shape[1]]), + ) def test_select_cols_list_data(self): scprep.select.select_cols( - self.X, self.X.to_numpy()[0, :].tolist(), idx=np.random.choice([True, False], [self.X.shape[1]])) + self.X, + self.X.to_numpy()[0, :].tolist(), + idx=np.random.choice([True, False], [self.X.shape[1]]), + ) def test_select_cols_get_gene_set(self): matrix.test_pandas_matrix_types( - self.X, scprep.select.select_cols, self.X.iloc[0, :], - starts_with="D") + self.X, scprep.select.select_cols, self.X.iloc[0, :], starts_with="D" + ) def test_select_cols_zero_columns(self): assert_warns_message( UserWarning, "Selecting 0 columns", - scprep.select.select_cols, self.X, - idx=(self.X.sum(axis=0) < 0)) + scprep.select.select_cols, + self.X, + idx=(self.X.sum(axis=0) < 0), + ) def test_select_cols_no_condition(self): assert_warns_message( UserWarning, "No selection conditions provided. 
Returning all columns.", - scprep.select.select_cols, self.X) + scprep.select.select_cols, + self.X, + ) def test_select_rows_invalid_index(self): - assert_raise_message(KeyError, - "'not_a_cell'", - scprep.select.select_rows, - self.X, - idx='not_a_cell') + assert_raise_message( + KeyError, + "'not_a_cell'", + scprep.select.select_rows, + self.X, + idx="not_a_cell", + ) def test_select_cols_invalid_index(self): - assert_raise_message(KeyError, - "'not_a_gene'", - scprep.select.select_cols, - self.X, - idx='not_a_gene') + assert_raise_message( + KeyError, + "'not_a_gene'", + scprep.select.select_cols, + self.X, + idx="not_a_gene", + ) def test_select_rows_2d_dataframe_index(self): - assert_raise_message(ValueError, - "Expected idx to be 1D. " - "Got shape (2, {})".format(self.X.shape[0]), - scprep.select.select_rows, - self.X, - idx=pd.DataFrame([self.X.index, self.X.index])) + assert_raise_message( + ValueError, + "Expected idx to be 1D. " "Got shape (2, {})".format(self.X.shape[0]), + scprep.select.select_rows, + self.X, + idx=pd.DataFrame([self.X.index, self.X.index]), + ) def test_select_rows_2d_list_index(self): - assert_raise_message(ValueError, - "Expected idx to be 1D. " - "Got shape (2, {})".format(self.X.shape[0]), - scprep.select.select_rows, - self.X, - idx=[self.X.index, self.X.index]) + assert_raise_message( + ValueError, + "Expected idx to be 1D. " "Got shape (2, {})".format(self.X.shape[0]), + scprep.select.select_rows, + self.X, + idx=[self.X.index, self.X.index], + ) def test_select_cols_2d_dataframe_index(self): - assert_raise_message(ValueError, - "Expected idx to be 1D. " - "Got shape (2, {})".format(self.X.shape[1]), - scprep.select.select_cols, - self.X, - idx=pd.DataFrame([self.X.columns, self.X.columns])) + assert_raise_message( + ValueError, + "Expected idx to be 1D. " "Got shape (2, {})".format(self.X.shape[1]), + scprep.select.select_cols, + self.X, + idx=pd.DataFrame([self.X.columns, self.X.columns]), + ) def test_select_cols_2d_list_index(self): - assert_raise_message(ValueError, - "Expected idx to be 1D. " - "Got shape (2, {})".format(self.X.shape[1]), - scprep.select.select_cols, - self.X, - idx=[self.X.columns, self.X.columns]) + assert_raise_message( + ValueError, + "Expected idx to be 1D. " "Got shape (2, {})".format(self.X.shape[1]), + scprep.select.select_cols, + self.X, + idx=[self.X.columns, self.X.columns], + ) def test_select_cols_unequal_columns(self): assert_raise_message( @@ -318,7 +418,8 @@ def test_select_cols_unequal_columns(self): "columns. Got [100, 50]", scprep.select.select_cols, self.X, - self.X.to_numpy()[:, :50]) + self.X.to_numpy()[:, :50], + ) def test_select_rows_unequal_rows(self): assert_raise_message( @@ -327,7 +428,8 @@ def test_select_rows_unequal_rows(self): "rows. 
Got [100, 50]", scprep.select.select_rows, self.X, - self.X.to_numpy()[:50, :]) + self.X.to_numpy()[:50, :], + ) def test_select_cols_conflicting_data(self): assert_raise_message( @@ -337,7 +439,8 @@ def test_select_cols_conflicting_data(self): "`scprep.select.select_cols(*extra_data, idx=data.columns)`", scprep.select.select_cols, self.X, - self.X.iloc[:,::-1]) + self.X.iloc[:, ::-1], + ) def test_select_rows_conflicting_data(self): assert_raise_message( @@ -347,14 +450,17 @@ def test_select_rows_conflicting_data(self): "`scprep.select.select_rows(*extra_data, idx=data.index)`", scprep.select.select_rows, self.X, - self.X.iloc[::-1]) + self.X.iloc[::-1], + ) def test_select_cols_get_gene_set_ndarray_data(self): assert_raise_message( ValueError, "Can only select based on column names with DataFrame input. " "Please set `idx` to select specific columns.", - scprep.select.select_cols, self.X.to_numpy(), starts_with="A" + scprep.select.select_cols, + self.X.to_numpy(), + starts_with="A", ) def test_select_rows_get_cell_set_ndarray_data(self): @@ -362,36 +468,54 @@ def test_select_rows_get_cell_set_ndarray_data(self): ValueError, "Can only select based on row names with DataFrame input. " "Please set `idx` to select specific rows.", - scprep.select.select_rows, self.X.to_numpy(), starts_with="A" + scprep.select.select_rows, + self.X.to_numpy(), + starts_with="A", ) def test_subsample(self): self.X = data.generate_positive_sparse_matrix(shape=(50, 100)) Y = scprep.select.subsample(self.X, n=20, seed=42) matrix.test_all_matrix_types( - self.X, utils.assert_transform_equals, Y=Y, + self.X, + utils.assert_transform_equals, + Y=Y, transform=scprep.select.subsample, - check=utils.assert_all_equal, n=20, seed=42) + check=utils.assert_all_equal, + n=20, + seed=42, + ) def test_subsample_multiple(self): - Y, libsize_sub = scprep.select.subsample( - self.X, self.libsize, n=20, seed=42) + Y, libsize_sub = scprep.select.subsample(self.X, self.libsize, n=20, seed=42) def test_fun(X, **kwargs): libsize = scprep.measure.library_size(X) return scprep.select.subsample(X, libsize, **kwargs)[0] + matrix.test_all_matrix_types( - self.X, utils.assert_transform_equals, Y=Y, + self.X, + utils.assert_transform_equals, + Y=Y, transform=test_fun, - check=utils.assert_all_equal, n=20, seed=42) + check=utils.assert_all_equal, + n=20, + seed=42, + ) def test_fun(X, **kwargs): libsize = scprep.measure.library_size(X) return scprep.select.subsample(X, libsize, **kwargs)[1] + matrix.test_all_matrix_types( - self.X, utils.assert_transform_equals, Y=libsize_sub, + self.X, + utils.assert_transform_equals, + Y=libsize_sub, transform=test_fun, - check=utils.assert_all_close, n=20, seed=42) + check=utils.assert_all_close, + n=20, + seed=42, + ) def test_subsample_mismatch_size(self): libsize = self.libsize[:25] @@ -399,72 +523,151 @@ def test_subsample_mismatch_size(self): ValueError, "Expected `data` and `extra_data` to have the same number of " "rows. 
Got [100, 25]", - scprep.select.subsample, self.X, libsize, n=20) + scprep.select.subsample, + self.X, + libsize, + n=20, + ) def test_subsample_n_too_large(self): assert_raise_message( ValueError, "Expected n (101) <= n_samples (100)", - scprep.select.subsample, self.X, n=self.X.shape[0] + 1) + scprep.select.subsample, + self.X, + n=self.X.shape[0] + 1, + ) def test_sparse_dataframe_fill_value(self): def test_fun(X): - Y = scprep.select.select_rows(X, idx=np.arange(X.shape[0]//2)) + Y = scprep.select.select_rows(X, idx=np.arange(X.shape[0] // 2)) for col in Y.columns: assert X[col].dtype == Y[col].dtype, (X[col].dtype, Y[col].dtype) - Y = scprep.select.select_cols(X, idx=np.arange(X.shape[1]//2)) + Y = scprep.select.select_cols(X, idx=np.arange(X.shape[1] // 2)) for col in Y.columns: assert X[col].dtype == Y[col].dtype, (X[col].dtype, Y[col].dtype) + matrix.test_matrix_types( - self.X.astype(float), test_fun, matrix._pandas_sparse_matrix_types) + self.X.astype(float), test_fun, matrix._pandas_sparse_matrix_types + ) def test_select_variable_genes(self): X_filtered = scprep.select.highly_variable_genes(self.X, percentile=90) assert X_filtered.shape[0] == self.X.shape[0] assert X_filtered.shape[1] == 10 - assert self.X.columns[np.argmax(self.X.values.std(axis=0))] in X_filtered.columns + assert ( + self.X.columns[np.argmax(self.X.values.std(axis=0))] in X_filtered.columns + ) matrix.test_all_matrix_types( - self.X, utils.assert_transform_equals, - Y=X_filtered, transform=scprep.select.highly_variable_genes, percentile=90) + self.X, + utils.assert_transform_equals, + Y=X_filtered, + transform=scprep.select.highly_variable_genes, + percentile=90, + ) def test_string_subset_exact_word(): - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['hello', 'world'], exact_word='hello'), [True, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - [' hello ', 'world'], exact_word='hello'), [True, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['(hello)', 'world'], exact_word='hello'), [True, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['[hello]', 'world'], exact_word='hello'), [True, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['hello...?', 'world'], exact_word='hello'), [True, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['hello world', 'world'], exact_word='hello'), [True, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['(hello) world', 'world'], exact_word='hello'), [True, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['World, hello!', 'world'], exact_word='hello'), [True, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['helloooo!', 'world'], exact_word='hello'), [False, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['(hello) world', 'world'], exact_word='(hello) world'), [True, False]) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask(["hello", "world"], exact_word="hello"), + [True, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask([" hello ", "world"], exact_word="hello"), + [True, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask(["(hello)", "world"], exact_word="hello"), + [True, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask(["[hello]", 
"world"], exact_word="hello"), + [True, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["hello...?", "world"], exact_word="hello" + ), + [True, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["hello world", "world"], exact_word="hello" + ), + [True, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["(hello) world", "world"], exact_word="hello" + ), + [True, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["World, hello!", "world"], exact_word="hello" + ), + [True, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["helloooo!", "world"], exact_word="hello" + ), + [False, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["(hello) world", "world"], exact_word="(hello) world" + ), + [True, False], + ) def test_string_subset_list(): - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['hello', 'world'], exact_word=['hello', 'world']), [True, True]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['hello', 'world'], exact_word=['hello', 'earth']), [True, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['hello', 'world'], starts_with=['hell', 'w']), [True, True]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['hello', 'world'], starts_with=['hell', 'e']), [True, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['hello', 'world'], ends_with=['ello', 'ld']), [True, True]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['hello', 'world'], ends_with=['ello', 'h']), [True, False]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['hello', 'world'], regex=['^hell.', '^.or.*']), [True, True]) - np.testing.assert_array_equal(scprep.select._get_string_subset_mask( - ['hello', 'world'], regex=['^hell', '^earth']), [True, False]) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["hello", "world"], exact_word=["hello", "world"] + ), + [True, True], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["hello", "world"], exact_word=["hello", "earth"] + ), + [True, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["hello", "world"], starts_with=["hell", "w"] + ), + [True, True], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["hello", "world"], starts_with=["hell", "e"] + ), + [True, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["hello", "world"], ends_with=["ello", "ld"] + ), + [True, True], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["hello", "world"], ends_with=["ello", "h"] + ), + [True, False], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["hello", "world"], regex=["^hell.", "^.or.*"] + ), + [True, True], + ) + np.testing.assert_array_equal( + scprep.select._get_string_subset_mask( + ["hello", "world"], regex=["^hell", "^earth"] + ), + [True, False], + ) diff --git a/test/test_stats.py b/test/test_stats.py index 166d9c9a..b19c1fcb 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -11,83 +11,118 @@ def _test_fun_2d(X, fun, **kwargs): - return fun(scprep.select.select_cols(X, idx=0), scprep.select.select_cols(X, idx=1), **kwargs) + return fun( 
+ scprep.select.select_cols(X, idx=0), + scprep.select.select_cols(X, idx=1), + **kwargs + ) def test_EMD(): - X = data.generate_positive_sparse_matrix( - shape=(500, 2), seed=42, poisson_mean=5) + X = data.generate_positive_sparse_matrix(shape=(500, 2), seed=42, poisson_mean=5) Y = scprep.stats.EMD(X[:, 0], X[:, 1]) assert isinstance(Y, float) np.testing.assert_allclose(Y, 0.5537161) matrix.test_all_matrix_types( - X, utils.assert_transform_equals, Y=Y, + X, + utils.assert_transform_equals, + Y=Y, transform=partial(_test_fun_2d, fun=scprep.stats.EMD), - check=utils.assert_all_close) + check=utils.assert_all_close, + ) assert_raise_message( - ValueError, "Expected x and y to be 1D arrays. " + ValueError, + "Expected x and y to be 1D arrays. " "Got shapes x {}, y {}".format(X.shape, X[:, 1].shape), - scprep.stats.EMD, X, X[:, 1]) + scprep.stats.EMD, + X, + X[:, 1], + ) + def test_u_statistic(): - X = data.generate_positive_sparse_matrix( - shape=(500, 3), seed=42, poisson_mean=0.2) - Y = data.generate_positive_sparse_matrix( - shape=(500, 3), seed=42, poisson_mean=0.3) - u_stat = [stats.mannwhitneyu(X[:,i], Y[:,i], alternative='two-sided')[0] for i in range(X.shape[1])] + X = data.generate_positive_sparse_matrix(shape=(500, 3), seed=42, poisson_mean=0.2) + Y = data.generate_positive_sparse_matrix(shape=(500, 3), seed=42, poisson_mean=0.3) + u_stat = [ + stats.mannwhitneyu(X[:, i], Y[:, i], alternative="two-sided")[0] + for i in range(X.shape[1]) + ] + def test_fun(X): - return scprep.stats.rank_sum_statistic(scprep.select.select_rows(X, idx=np.arange(500)), - scprep.select.select_rows(X, idx=np.arange(500,1000))) + return scprep.stats.rank_sum_statistic( + scprep.select.select_rows(X, idx=np.arange(500)), + scprep.select.select_rows(X, idx=np.arange(500, 1000)), + ) + matrix.test_all_matrix_types( - np.vstack([X, Y]), utils.assert_transform_equals, Y=u_stat, + np.vstack([X, Y]), + utils.assert_transform_equals, + Y=u_stat, transform=test_fun, - check=utils.assert_all_close) + check=utils.assert_all_close, + ) + def test_t_statistic(): - X = data.generate_positive_sparse_matrix( - shape=(500, 3), seed=42, poisson_mean=0.2) - Y = data.generate_positive_sparse_matrix( - shape=(500, 3), seed=42, poisson_mean=0.3) - u_stat = [stats.ttest_ind(X[:,i], Y[:,i], equal_var=False)[0] for i in range(X.shape[1])] + X = data.generate_positive_sparse_matrix(shape=(500, 3), seed=42, poisson_mean=0.2) + Y = data.generate_positive_sparse_matrix(shape=(500, 3), seed=42, poisson_mean=0.3) + u_stat = [ + stats.ttest_ind(X[:, i], Y[:, i], equal_var=False)[0] for i in range(X.shape[1]) + ] + def test_fun(X): - return scprep.stats.t_statistic(scprep.select.select_rows(X, idx=np.arange(500)), - scprep.select.select_rows(X, idx=np.arange(500,1000))) + return scprep.stats.t_statistic( + scprep.select.select_rows(X, idx=np.arange(500)), + scprep.select.select_rows(X, idx=np.arange(500, 1000)), + ) + matrix.test_all_matrix_types( - np.vstack([X, Y]), utils.assert_transform_equals, Y=u_stat, + np.vstack([X, Y]), + utils.assert_transform_equals, + Y=u_stat, transform=test_fun, - check=partial(utils.assert_all_close, rtol=2e-3)) + check=partial(utils.assert_all_close, rtol=2e-3), + ) def test_pairwise_correlation(): def test_fun(X, *args, **kwargs): return scprep.stats.pairwise_correlation( - X, - scprep.select.select_cols(X, idx=np.arange(10)), - *args, **kwargs) - D = data.generate_positive_sparse_matrix( - shape=(500, 100), seed=42, poisson_mean=5) + X, scprep.select.select_cols(X, idx=np.arange(10)), *args, **kwargs + 
) + + D = data.generate_positive_sparse_matrix(shape=(500, 100), seed=42, poisson_mean=5) Y = test_fun(D) assert Y.shape == (D.shape[1], 10) assert np.allclose(Y[(np.arange(10), np.arange(10))], 1, atol=0) matrix.test_all_matrix_types( - D, utils.assert_transform_equals, Y=Y, + D, + utils.assert_transform_equals, + Y=Y, transform=test_fun, - check=utils.assert_all_close) + check=utils.assert_all_close, + ) matrix.test_all_matrix_types( - D, utils.assert_transform_equals, Y=Y, - transform=partial(scprep.stats.pairwise_correlation, - Y=scprep.select.select_cols(D, idx=np.arange(10))), - check=utils.assert_all_close) + D, + utils.assert_transform_equals, + Y=Y, + transform=partial( + scprep.stats.pairwise_correlation, + Y=scprep.select.select_cols(D, idx=np.arange(10)), + ), + check=utils.assert_all_close, + ) def test_fun(X, *args, **kwargs): - return scprep.stats.pairwise_correlation( - X=D, - Y=X, - *args, **kwargs) + return scprep.stats.pairwise_correlation(X=D, Y=X, *args, **kwargs) + matrix.test_all_matrix_types( scprep.select.select_cols(D, idx=np.arange(10)), - utils.assert_transform_equals, Y=Y, - transform=test_fun, check=utils.assert_all_close) + utils.assert_transform_equals, + Y=Y, + transform=test_fun, + check=utils.assert_all_close, + ) def shan_entropy(c): @@ -107,52 +142,75 @@ def calc_MI(X, Y, bins): def test_mutual_information(): - X = data.generate_positive_sparse_matrix( - shape=(500, 2), seed=42, poisson_mean=5) + X = data.generate_positive_sparse_matrix(shape=(500, 2), seed=42, poisson_mean=5) Y = scprep.stats.mutual_information(X[:, 0], X[:, 1], bins=20) assert isinstance(Y, float) np.testing.assert_allclose(Y, calc_MI(X[:, 0], X[:, 1], bins=20)) matrix.test_all_matrix_types( - X, utils.assert_transform_equals, Y=Y, + X, + utils.assert_transform_equals, + Y=Y, transform=partial(_test_fun_2d, fun=scprep.stats.mutual_information), - check=utils.assert_all_close, bins=20) + check=utils.assert_all_close, + bins=20, + ) def test_knnDREMI(): - X = data.generate_positive_sparse_matrix( - shape=(500, 2), seed=42, poisson_mean=5) + X = data.generate_positive_sparse_matrix(shape=(500, 2), seed=42, poisson_mean=5) Y = scprep.stats.knnDREMI(X[:, 0], X[:, 1]) assert isinstance(Y, float) np.testing.assert_allclose(Y, 0.16238906) - Y2, drevi = scprep.stats.knnDREMI(X[:, 0], X[:, 1], - plot=True, filename="test.png", - return_drevi=True) + Y2, drevi = scprep.stats.knnDREMI( + X[:, 0], X[:, 1], plot=True, filename="test.png", return_drevi=True + ) assert os.path.isfile("test.png") os.remove("test.png") assert Y2 == Y assert drevi.shape == (20, 20) matrix.test_all_matrix_types( - X, utils.assert_transform_equals, Y=Y, + X, + utils.assert_transform_equals, + Y=Y, transform=partial(_test_fun_2d, fun=scprep.stats.knnDREMI), - check=utils.assert_all_close) + check=utils.assert_all_close, + ) with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=UserWarning) - assert scprep.stats.knnDREMI(X[:, 0], np.repeat(X[0, 1], X.shape[0]), - return_drevi=True) == (0, None) + assert scprep.stats.knnDREMI( + X[:, 0], np.repeat(X[0, 1], X.shape[0]), return_drevi=True + ) == (0, None) assert_raise_message( - ValueError, "Expected k as an integer. Got ", - scprep.stats.knnDREMI, X[:, 0], X[:, 1], k="invalid") + ValueError, + "Expected k as an integer. Got ", + scprep.stats.knnDREMI, + X[:, 0], + X[:, 1], + k="invalid", + ) assert_raise_message( - ValueError, "Expected n_bins as an integer. 
Got ", - scprep.stats.knnDREMI, X[:, 0], X[:, 1], n_bins="invalid") + ValueError, + "Expected n_bins as an integer. Got ", + scprep.stats.knnDREMI, + X[:, 0], + X[:, 1], + n_bins="invalid", + ) assert_raise_message( - ValueError, "Expected n_mesh as an integer. Got ", - scprep.stats.knnDREMI, X[:, 0], X[:, 1], n_mesh="invalid") + ValueError, + "Expected n_mesh as an integer. Got ", + scprep.stats.knnDREMI, + X[:, 0], + X[:, 1], + n_mesh="invalid", + ) assert_warns_message( UserWarning, - "Attempting to calculate kNN-DREMI on a constant array. " - "Returning `0`", scprep.stats.knnDREMI, X[:, 0], - np.zeros_like(X[:, 1])) + "Attempting to calculate kNN-DREMI on a constant array. " "Returning `0`", + scprep.stats.knnDREMI, + X[:, 0], + np.zeros_like(X[:, 1]), + ) def test_mean_difference(): @@ -161,64 +219,96 @@ def test_mean_difference(): Y = scprep.stats.mean_difference(X.iloc[:20], X.iloc[20:100]) assert np.allclose(np.max(Y), 16.8125) assert np.allclose(np.min(Y), -0.5625) + def test_fun(X, **kwargs): return scprep.stats.mean_difference( scprep.select.select_rows(X, idx=np.arange(20)), scprep.select.select_rows(X, idx=np.arange(20, 100)), - **kwargs) + **kwargs + ) + matrix.test_all_matrix_types( - X, utils.assert_transform_equals, Y=Y, + X, + utils.assert_transform_equals, + Y=Y, transform=test_fun, - check=utils.assert_all_close) + check=utils.assert_all_close, + ) assert_raise_message( ValueError, "Expected X and Y to have the same number of columns. " - "Got shapes {}, {}".format(X.shape, X.iloc[:,:10].shape), + "Got shapes {}, {}".format(X.shape, X.iloc[:, :10].shape), scprep.stats.mean_difference, - X, X.iloc[:,:10]) + X, + X.iloc[:, :10], + ) -@parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'), - ('emd', 'up'), ('emd', 'down'), ('emd', 'both'), - ('ttest', 'up'), ('ttest', 'down'), ('ttest', 'both'), - ('ranksum', 'up'), ('ranksum', 'down'), ('ranksum', 'both')]) +@parameterized( + [ + ("difference", "up"), + ("difference", "down"), + ("difference", "both"), + ("emd", "up"), + ("emd", "down"), + ("emd", "both"), + ("ttest", "up"), + ("ttest", "down"), + ("ttest", "both"), + ("ranksum", "up"), + ("ranksum", "down"), + ("ranksum", "both"), + ] +) def test_differential_expression(measure, direction): X = data.load_10X() X = scprep.filter.filter_empty_genes(X) - result = scprep.stats.differential_expression(X.iloc[:20], X.iloc[20:100], - measure=measure, direction=direction) - expected_results = {('difference', 'up') : ('Gstm5', 16.8125), - ('difference', 'down') : ('Slc2a3', -0.5625), - ('difference', 'both') : ('Gstm5', 16.8125), - ('emd', 'up') : ('Gstm5', 17.5625), - ('emd', 'down') : ('Slc2a3', -0.6875), - ('emd', 'both') : ('Gstm5', 17.5625), - ('ttest', 'up') : ('Trmt1', 2.6335), - ('ttest', 'down') : ('Dhfr', -1.93347), - ('ttest', 'both') : ('Trmt1', 2.6335), - ('ranksum', 'up') : ('Adam30', 796), - ('ranksum', 'down') : ('Gstm5', 339), - ('ranksum', 'both') : ('Adam30', 796)} + result = scprep.stats.differential_expression( + X.iloc[:20], X.iloc[20:100], measure=measure, direction=direction + ) + expected_results = { + ("difference", "up"): ("Gstm5", 16.8125), + ("difference", "down"): ("Slc2a3", -0.5625), + ("difference", "both"): ("Gstm5", 16.8125), + ("emd", "up"): ("Gstm5", 17.5625), + ("emd", "down"): ("Slc2a3", -0.6875), + ("emd", "both"): ("Gstm5", 17.5625), + ("ttest", "up"): ("Trmt1", 2.6335), + ("ttest", "down"): ("Dhfr", -1.93347), + ("ttest", "both"): ("Trmt1", 2.6335), + ("ranksum", "up"): ("Adam30", 796), + ("ranksum", "down"): 
("Gstm5", 339), + ("ranksum", "both"): ("Adam30", 796), + } assert result.index[0] == expected_results[(measure, direction)][0], result.index[0] - assert np.allclose(result[measure][0], - expected_results[(measure, direction)][1]), result[measure][0] - result_unnamed = scprep.stats.differential_expression(X.iloc[:20].sparse.to_coo(), X.iloc[20:100].sparse.to_coo(), - measure=measure, direction=direction) - if direction != 'both': + assert np.allclose( + result[measure][0], expected_results[(measure, direction)][1] + ), result[measure][0] + result_unnamed = scprep.stats.differential_expression( + X.iloc[:20].sparse.to_coo(), + X.iloc[20:100].sparse.to_coo(), + measure=measure, + direction=direction, + ) + if direction != "both": values = result[measure] else: values = np.abs(result[measure]) unique_values = ~np.isin(values, values[values.duplicated()]) - assert np.all(X.columns[result_unnamed.index][unique_values] == result.index[unique_values]) + assert np.all( + X.columns[result_unnamed.index][unique_values] == result.index[unique_values] + ) + def test_fun(X, **kwargs): return scprep.stats.differential_expression( scprep.select.select_rows(X, idx=np.arange(20)), scprep.select.select_rows(X, idx=np.arange(20, 100)), - **kwargs) + **kwargs + ) def check_fun(Y1, Y2): - if direction == 'both': + if direction == "both": Y1[measure] = np.abs(Y1[measure]) Y2[measure] = np.abs(Y2[measure]) np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4) @@ -227,76 +317,109 @@ def check_fun(Y1, Y2): np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4) matrix.test_all_matrix_types( - X, utils.assert_transform_equals, Y=result, + X, + utils.assert_transform_equals, + Y=result, transform=test_fun, check=check_fun, gene_names=X.columns, - measure=measure, direction=direction) + measure=measure, + direction=direction, + ) def test_differential_expression_error(): X = data.load_10X() assert_raise_message( - ValueError, "Expected `direction` in ['up', 'down', 'both']. " - "Got invalid", scprep.stats.differential_expression, - X, X, direction='invalid') + ValueError, + "Expected `direction` in ['up', 'down', 'both']. " "Got invalid", + scprep.stats.differential_expression, + X, + X, + direction="invalid", + ) assert_raise_message( - ValueError, "Expected `measure` in ['difference', 'emd', 'ttest', 'ranksum']. " - "Got invalid", scprep.stats.differential_expression, - X, X, measure='invalid') + ValueError, + "Expected `measure` in ['difference', 'emd', 'ttest', 'ranksum']. " + "Got invalid", + scprep.stats.differential_expression, + X, + X, + measure="invalid", + ) assert_raise_message( - ValueError, "Expected `X` and `Y` to be matrices. " + ValueError, + "Expected `X` and `Y` to be matrices. " "Got shapes {}, {}".format(X.shape, X.iloc[0].shape), scprep.stats.differential_expression, - X, X.iloc[0]) + X, + X.iloc[0], + ) assert_raise_message( - ValueError, "Expected gene_names to have length {}. " - "Got {}".format(X.shape[0], X.shape[0]//2), + ValueError, + "Expected gene_names to have length {}. " + "Got {}".format(X.shape[0], X.shape[0] // 2), scprep.stats.differential_expression, - X.sparse.to_coo(), X.sparse.to_coo(), gene_names=np.arange(X.shape[0]//2)) + X.sparse.to_coo(), + X.sparse.to_coo(), + gene_names=np.arange(X.shape[0] // 2), + ) assert_raise_message( - ValueError, "Expected gene_names to have length {}. " - "Got {}".format(X.shape[0], X.shape[0]//2), + ValueError, + "Expected gene_names to have length {}. 
" + "Got {}".format(X.shape[0], X.shape[0] // 2), scprep.stats.differential_expression_by_cluster, - X.sparse.to_coo(), np.random.choice(2, X.shape[0], replace=True), - gene_names=np.arange(X.shape[0]//2)) + X.sparse.to_coo(), + np.random.choice(2, X.shape[0], replace=True), + gene_names=np.arange(X.shape[0] // 2), + ) assert_warns_message( - UserWarning, "Input data has inconsistent column names. " - "Subsetting to 20 common columns.", + UserWarning, + "Input data has inconsistent column names. " "Subsetting to 20 common columns.", scprep.stats.differential_expression, - X, X.iloc[:,:20]) + X, + X.iloc[:, :20], + ) def test_differential_expression_by_cluster(): - measure = 'difference' - direction = 'up' + measure = "difference" + direction = "up" X = data.load_10X() np.random.seed(42) clusters = np.random.choice(4, X.shape[0], replace=True) result = scprep.stats.differential_expression_by_cluster( - X, clusters, - measure=measure, direction=direction) + X, clusters, measure=measure, direction=direction + ) for cluster in range(4): r = scprep.stats.differential_expression( - scprep.select.select_rows(X, idx=clusters==cluster), - scprep.select.select_rows(X, idx=clusters!=cluster), - measure=measure, direction=direction) + scprep.select.select_rows(X, idx=clusters == cluster), + scprep.select.select_rows(X, idx=clusters != cluster), + measure=measure, + direction=direction, + ) assert np.all(result[cluster] == r) def test_differential_expression_by_cluster_subset(): - measure = 'difference' - direction = 'up' + measure = "difference" + direction = "up" X = data.load_10X() np.random.seed(42) clusters = np.random.choice(4, X.shape[0], replace=True) result = scprep.stats.differential_expression_by_cluster( - X, clusters, - measure=measure, direction=direction, gene_names=X.columns[:X.shape[0]//2]) + X, + clusters, + measure=measure, + direction=direction, + gene_names=X.columns[: X.shape[0] // 2], + ) for cluster in range(4): r = scprep.stats.differential_expression( - scprep.select.select_rows(X, idx=clusters==cluster), - scprep.select.select_rows(X, idx=clusters!=cluster), - measure=measure, direction=direction, - gene_names=X.columns[:X.shape[0]//2]) + scprep.select.select_rows(X, idx=clusters == cluster), + scprep.select.select_rows(X, idx=clusters != cluster), + measure=measure, + direction=direction, + gene_names=X.columns[: X.shape[0] // 2], + ) assert np.all(result[cluster] == r) diff --git a/test/test_transform.py b/test/test_transform.py index 58168cbd..675b6a5b 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -11,53 +11,76 @@ def test_sqrt_transform(): X = data.generate_positive_sparse_matrix() Y = np.sqrt(X) matrix.test_all_matrix_types( - X, utils.assert_transform_equivalent, - Y=Y, transform=scprep.transform.sqrt) + X, utils.assert_transform_equivalent, Y=Y, transform=scprep.transform.sqrt + ) def test_log_transform(): X = data.generate_positive_sparse_matrix() Y = np.log10(X + 1) matrix.test_all_matrix_types( - X, utils.assert_transform_equivalent, - Y=Y, transform=scprep.transform.log, - base=10) + X, + utils.assert_transform_equivalent, + Y=Y, + transform=scprep.transform.log, + base=10, + ) Y = np.log(X + 1) matrix.test_all_matrix_types( - X, utils.assert_transform_equivalent, - Y=Y, transform=scprep.transform.log, - base='e') + X, + utils.assert_transform_equivalent, + Y=Y, + transform=scprep.transform.log, + base="e", + ) Y = np.log2(X + 1) matrix.test_all_matrix_types( - X, utils.assert_transform_equivalent, - Y=Y, transform=scprep.transform.log, - 
diff --git a/test/test_transform.py b/test/test_transform.py
index 58168cbd..675b6a5b 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -11,53 +11,76 @@ def test_sqrt_transform():
     X = data.generate_positive_sparse_matrix()
     Y = np.sqrt(X)
     matrix.test_all_matrix_types(
-        X, utils.assert_transform_equivalent,
-        Y=Y, transform=scprep.transform.sqrt)
+        X, utils.assert_transform_equivalent, Y=Y, transform=scprep.transform.sqrt
+    )
 
 
 def test_log_transform():
     X = data.generate_positive_sparse_matrix()
     Y = np.log10(X + 1)
     matrix.test_all_matrix_types(
-        X, utils.assert_transform_equivalent,
-        Y=Y, transform=scprep.transform.log,
-        base=10)
+        X,
+        utils.assert_transform_equivalent,
+        Y=Y,
+        transform=scprep.transform.log,
+        base=10,
+    )
     Y = np.log(X + 1)
     matrix.test_all_matrix_types(
-        X, utils.assert_transform_equivalent,
-        Y=Y, transform=scprep.transform.log,
-        base='e')
+        X,
+        utils.assert_transform_equivalent,
+        Y=Y,
+        transform=scprep.transform.log,
+        base="e",
+    )
     Y = np.log2(X + 1)
     matrix.test_all_matrix_types(
-        X, utils.assert_transform_equivalent,
-        Y=Y, transform=scprep.transform.log,
-        base=2)
+        X,
+        utils.assert_transform_equivalent,
+        Y=Y,
+        transform=scprep.transform.log,
+        base=2,
+    )
     Y = np.log2(X + 5)
+
     def test_fun(X):
         assert_warns_message(
             RuntimeWarning,
             "log transform on sparse data requires pseudocount = 1",
             scprep.transform.log,
-            data=X, base=2, pseudocount=5)
+            data=X,
+            base=2,
+            pseudocount=5,
+        )
+
     matrix.test_sparse_matrix_types(X, test_fun)
     matrix.test_dense_matrix_types(
-        X, utils.assert_transform_equivalent,
-        Y=Y, transform=scprep.transform.log,
-        base=2, pseudocount=5)
+        X,
+        utils.assert_transform_equivalent,
+        Y=Y,
+        transform=scprep.transform.log,
+        base=2,
+        pseudocount=5,
+    )
 
 
 def test_arcsinh_transform():
     X = data.generate_positive_sparse_matrix()
     Y = np.arcsinh(X / 5)
     matrix.test_all_matrix_types(
-        X, utils.assert_transform_equivalent,
-        Y=Y, transform=scprep.transform.arcsinh,
-        check=utils.assert_all_close)
+        X,
+        utils.assert_transform_equivalent,
+        Y=Y,
+        transform=scprep.transform.arcsinh,
+        check=utils.assert_all_close,
+    )
     assert_raise_message(
         ValueError,
-        "Expected cofactor > 0 or None. "
-        "Got 0",
-        scprep.transform.arcsinh, data=X, cofactor=0)
+        "Expected cofactor > 0 or None. " "Got 0",
+        scprep.transform.arcsinh,
+        data=X,
+        cofactor=0,
+    )
 
 
 def test_deprecated():
@@ -66,34 +89,41 @@ def test_deprecated():
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore", category=FutureWarning)
         utils.assert_transform_equivalent(
-            X, Y=Y, transform=scprep.transform.sqrt_transform)
+            X, Y=Y, transform=scprep.transform.sqrt_transform
+        )
     assert_warns_message(
         FutureWarning,
         "scprep.transform.sqrt_transform is deprecated. Please use "
         "scprep.transform.sqrt in future.",
         scprep.transform.sqrt_transform,
-        data=X)
+        data=X,
+    )
     Y = scprep.transform.log(X)
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore", category=FutureWarning)
         utils.assert_transform_equivalent(
-            X, Y=Y, transform=scprep.transform.log_transform)
+            X, Y=Y, transform=scprep.transform.log_transform
+        )
     assert_warns_message(
         FutureWarning,
         "scprep.transform.log_transform is deprecated. Please use "
         "scprep.transform.log in future.",
         scprep.transform.log_transform,
-        data=X)
+        data=X,
+    )
     Y = scprep.transform.arcsinh(X)
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore", category=FutureWarning)
         utils.assert_transform_equivalent(
-            X, Y=Y, transform=scprep.transform.arcsinh_transform)
+            X, Y=Y, transform=scprep.transform.arcsinh_transform
+        )
     assert_warns_message(
         FutureWarning,
         "scprep.transform.arcsinh_transform is deprecated. Please use "
         "scprep.transform.arcsinh in future.",
-        scprep.transform.arcsinh_transform, data=X)
+        scprep.transform.arcsinh_transform,
+        data=X,
+    )
 
 
 def test_sqrt_negative_value():
@@ -101,7 +131,9 @@ def test_sqrt_negative_value():
     assert_raise_message(
         ValueError,
         "Cannot square root transform negative values",
-        scprep.transform.sqrt, data=X)
+        scprep.transform.sqrt,
+        data=X,
+    )
 
 
 def test_log_error():
@@ -109,12 +141,20 @@ def test_log_error():
     assert_raise_message(
         ValueError,
         "Required pseudocount + min(data) (-9) > 0. Got pseudocount = 1",
-        scprep.transform.log, data=X * -1)
+        scprep.transform.log,
+        data=X * -1,
+    )
     assert_raise_message(
         ValueError,
         "Expected base in [2, 'e', 10]. Got 0",
-        scprep.transform.log, data=X, base=0)
+        scprep.transform.log,
+        data=X,
+        base=0,
+    )
     assert_raise_message(
         ValueError,
         "Expected base in [2, 'e', 10]. Got none",
Got none", - scprep.transform.log, data=X, base='none') + scprep.transform.log, + data=X, + base="none", + ) diff --git a/test/test_utils.py b/test/test_utils.py index 1a939c87..a87d52d4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -10,16 +10,20 @@ def test_with_pkg(): @scprep.utils._with_pkg(pkg="invalid") def invalid(): pass - assert_raise_message(ImportError, - "invalid not found. Please install it with e.g. " - "`pip install --user invalid`", - invalid) + + assert_raise_message( + ImportError, + "invalid not found. Please install it with e.g. " + "`pip install --user invalid`", + invalid, + ) def test_with_pkg_version_none(): @scprep.utils._with_pkg(pkg="numpy") def test(): return True + assert test() @@ -29,6 +33,7 @@ def test_with_pkg_version_exact(): @scprep.utils._with_pkg(pkg="numpy", min_version="{}.{}".format(major, minor)) def test(): return True + assert test() @@ -38,6 +43,7 @@ def test_with_pkg_version_exact_no_minor(): @scprep.utils._with_pkg(pkg="numpy", min_version=major) def test(): return True + assert test() @@ -47,6 +53,7 @@ def test_with_pkg_version_pass_major(): @scprep.utils._with_pkg(pkg="numpy", min_version=major - 1) def test(): return True + assert test() @@ -56,6 +63,7 @@ def test_with_pkg_version_pass_minor(): @scprep.utils._with_pkg(pkg="numpy", min_version="{}.{}".format(major, minor - 1)) def test(): return True + assert test() @@ -65,12 +73,14 @@ def test_with_pkg_version_fail_major(): @scprep.utils._with_pkg(pkg="numpy", min_version=major + 1) def test(): return True - assert_raise_message(ImportError, - "numpy>={0} is required (installed: {1}). " - "Please upgrade it with e.g." - " `pip install --user --upgrade numpy".format( - major + 1, np.__version__), - test) + + assert_raise_message( + ImportError, + "numpy>={0} is required (installed: {1}). " + "Please upgrade it with e.g." + " `pip install --user --upgrade numpy".format(major + 1, np.__version__), + test, + ) def test_with_pkg_version_fail_minor(): @@ -79,12 +89,14 @@ def test_with_pkg_version_fail_minor(): @scprep.utils._with_pkg(pkg="numpy", min_version="{}.{}".format(major, minor + 1)) def test(): return True - assert_raise_message(ImportError, - "numpy>={0}.{1} is required (installed: {2}). " - "Please upgrade it with e.g." - " `pip install --user --upgrade numpy".format( - major, minor + 1, np.__version__), - test) + + assert_raise_message( + ImportError, + "numpy>={0}.{1} is required (installed: {2}). " + "Please upgrade it with e.g." 
+ " `pip install --user --upgrade numpy".format(major, minor + 1, np.__version__), + test, + ) def test_with_pkg_version_memoize(): @@ -94,6 +106,7 @@ def test_with_pkg_version_memoize(): @scprep.utils._with_pkg(pkg="numpy", min_version=min_version) def test(): return True + true_version = np.__version__ np.__version__ = min_version # should pass @@ -110,110 +123,124 @@ def test_try_import(): def test_combine_batches(): X = data.load_10X() - Y = pd.concat([X, scprep.select.select_rows( - X, idx=np.arange(X.shape[0] // 2))]) + Y = pd.concat([X, scprep.select.select_rows(X, idx=np.arange(X.shape[0] // 2))]) Y2, sample_labels = scprep.utils.combine_batches( - [X, scprep.select.select_rows( - X, idx=np.arange(X.shape[0] // 2))], + [X, scprep.select.select_rows(X, idx=np.arange(X.shape[0] // 2))], batch_labels=[0, 1], - append_to_cell_names=False) + append_to_cell_names=False, + ) assert utils.assert_matrix_class_equivalent(Y, Y2) utils.assert_all_equal(Y, Y2) assert np.all(Y.index == Y2.index) - assert np.all(sample_labels == np.concatenate( - [np.repeat(0, X.shape[0]), np.repeat(1, X.shape[0] // 2)])) + assert np.all( + sample_labels + == np.concatenate([np.repeat(0, X.shape[0]), np.repeat(1, X.shape[0] // 2)]) + ) assert np.all(sample_labels.index == Y2.index) - assert sample_labels.name == 'sample_labels' + assert sample_labels.name == "sample_labels" Y2, sample_labels = scprep.utils.combine_batches( - [X, scprep.select.select_rows( - X, idx=np.arange(X.shape[0] // 2))], + [X, scprep.select.select_rows(X, idx=np.arange(X.shape[0] // 2))], batch_labels=[0, 1], - append_to_cell_names=True) + append_to_cell_names=True, + ) assert np.all(Y.index == np.array([i[:-2] for i in Y2.index])) - assert np.all(np.core.defchararray.add( - "_", sample_labels.astype(str)) == np.array( - [i[-2:] for i in Y2.index], dtype=str)) + assert np.all( + np.core.defchararray.add("_", sample_labels.astype(str)) + == np.array([i[-2:] for i in Y2.index], dtype=str) + ) assert np.all(sample_labels.index == Y2.index) - assert sample_labels.name == 'sample_labels' + assert sample_labels.name == "sample_labels" transform = lambda X: scprep.utils.combine_batches( [X, scprep.select.select_rows(X, idx=np.arange(X.shape[0] // 2))], - batch_labels=[0, 1])[0] + batch_labels=[0, 1], + )[0] matrix.test_matrix_types( X, utils.assert_transform_equals, matrix._indexable_matrix_types, Y=Y, transform=transform, - check=utils.assert_all_equal) + check=utils.assert_all_equal, + ) + def test_fun(X): Y, sample_labels = scprep.utils.combine_batches( [X, scprep.select.select_rows(X, idx=np.arange(X.shape[0] // 2))], - batch_labels=[0, 1]) + batch_labels=[0, 1], + ) assert np.all(sample_labels.index == Y.index) - assert sample_labels.name == 'sample_labels' - matrix.test_pandas_matrix_types( - X, - test_fun) + assert sample_labels.name == "sample_labels" + + matrix.test_pandas_matrix_types(X, test_fun) def test_combine_batches_rangeindex(): X = data.load_10X() X = X.reset_index(drop=True) - Y = X.iloc[:X.shape[0] // 2] - data_combined, labels = scprep.utils.combine_batches( - [X, Y], ['x', 'y']) + Y = X.iloc[: X.shape[0] // 2] + data_combined, labels = scprep.utils.combine_batches([X, Y], ["x", "y"]) assert isinstance(data_combined.index, pd.RangeIndex) assert np.all(data_combined.columns == X.columns) def test_combine_batches_uncommon_genes(): X = data.load_10X() - Y = X.iloc[:, :X.shape[1] // 2] + Y = X.iloc[:, : X.shape[1] // 2] assert_warns_message( UserWarning, "Input data has inconsistent column names. 
" "Subsetting to {} common columns.".format(Y.shape[1]), scprep.utils.combine_batches, - [X, Y], ['x', 'y']) + [X, Y], + ["x", "y"], + ) def test_combine_batches_errors(): X = data.load_10X() assert_warns_message( UserWarning, - "append_to_cell_names only valid for pd.DataFrame input. " - "Got coo_matrix", + "append_to_cell_names only valid for pd.DataFrame input. " "Got coo_matrix", scprep.utils.combine_batches, - [X.sparse.to_coo(), X.iloc[:X.shape[0] // 2].sparse.to_coo()], + [X.sparse.to_coo(), X.iloc[: X.shape[0] // 2].sparse.to_coo()], batch_labels=[0, 1], - append_to_cell_names=True) + append_to_cell_names=True, + ) assert_raise_message( TypeError, "Expected data all of the same class. Got DataFrame, coo_matrix", scprep.utils.combine_batches, - [X, X.iloc[:X.shape[0] // 2].sparse.to_coo()], - batch_labels=[0, 1]) + [X, X.iloc[: X.shape[0] // 2].sparse.to_coo()], + batch_labels=[0, 1], + ) assert_raise_message( ValueError, "Expected data all with the same number of columns. " "Got {}, {}".format(X.shape[1], X.shape[1] // 2), scprep.utils.combine_batches, - [scprep.utils.toarray(X), scprep.select.select_cols( - scprep.utils.toarray(X), idx=np.arange(X.shape[1] // 2))], - batch_labels=[0, 1]) + [ + scprep.utils.toarray(X), + scprep.select.select_cols( + scprep.utils.toarray(X), idx=np.arange(X.shape[1] // 2) + ), + ], + batch_labels=[0, 1], + ) assert_raise_message( ValueError, "Expected data (2) and batch_labels (1) to be the same length.", scprep.utils.combine_batches, [X, scprep.select.select_rows(X, idx=np.arange(X.shape[0] // 2))], - batch_labels=[0]) + batch_labels=[0], + ) assert_raise_message( ValueError, "Expected data to contain pandas DataFrames, " "scipy sparse matrices or numpy arrays. Got str", scprep.utils.combine_batches, ["hello", "world"], - batch_labels=[0, 1]) + batch_labels=[0, 1], + ) def test_matrix_any(): @@ -222,14 +249,14 @@ def test_matrix_any(): def test_fun(X): assert not scprep.utils.matrix_any(X == 500000) - matrix.test_all_matrix_types(X, - test_fun) + + matrix.test_all_matrix_types(X, test_fun) def test_fun(X): assert scprep.utils.matrix_any(X == 500000) + X[0, 0] = 500000 - matrix.test_all_matrix_types(X, - test_fun) + matrix.test_all_matrix_types(X, test_fun) def test_toarray(): @@ -237,16 +264,15 @@ def test_toarray(): def test_fun(X): assert isinstance(scprep.utils.toarray(X), np.ndarray) - matrix.test_all_matrix_types(X, - test_fun) + + matrix.test_all_matrix_types(X, test_fun) test_fun([X, np.matrix(X)]) def test_toarray_string_error(): - assert_raise_message(TypeError, - "Expected array-like. Got ", - scprep.utils.toarray, - "hello") + assert_raise_message( + TypeError, "Expected array-like. 
Got ", scprep.utils.toarray, "hello" + ) def test_toarray_vector(): @@ -254,13 +280,12 @@ def test_toarray_vector(): def test_fun(X): assert isinstance(scprep.utils.toarray(X), np.ndarray) - matrix.test_matrix_types(X, - test_fun, - matrix._pandas_vector_types) + + matrix.test_matrix_types(X, test_fun, matrix._pandas_vector_types) def test_toarray_list_of_strings(): - X = ['hello', 'world', [1, 2, 3]] + X = ["hello", "world", [1, 2, 3]] X = scprep.utils.toarray(X) assert isinstance(X[2], np.ndarray) @@ -268,7 +293,8 @@ def test_toarray_list_of_strings(): def test_to_array_or_spmatrix_list_of_strings(): X = data.generate_positive_sparse_matrix(shape=(50, 50)) X = scprep.utils.to_array_or_spmatrix( - [X, sparse.csr_matrix(X), 'hello', 'world', [1, 2, 3]]) + [X, sparse.csr_matrix(X), "hello", "world", [1, 2, 3]] + ) assert isinstance(X[0], np.ndarray) assert isinstance(X[1], sparse.csr_matrix) assert isinstance(X[4], np.ndarray) @@ -277,78 +303,146 @@ def test_to_array_or_spmatrix_list_of_strings(): def test_matrix_sum(): X = data.generate_positive_sparse_matrix(shape=(50, 100)) sums = np.array(X.sum(0)).flatten() - matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=sums, - transform=scprep.utils.matrix_sum, axis=0, - check=utils.assert_all_close) - matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=sums, - transform=scprep.utils.matrix_sum, axis=0, - check=utils.assert_all_close) + matrix.test_all_matrix_types( + X, + utils.assert_transform_equals, + Y=sums, + transform=scprep.utils.matrix_sum, + axis=0, + check=utils.assert_all_close, + ) + matrix.test_numpy_matrix( + X, + utils.assert_transform_equals, + Y=sums, + transform=scprep.utils.matrix_sum, + axis=0, + check=utils.assert_all_close, + ) sums = np.array(X.sum(1)).flatten() - matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=sums, - transform=scprep.utils.matrix_sum, axis=1, - check=utils.assert_all_close) - matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=sums, - transform=scprep.utils.matrix_sum, axis=1, - check=utils.assert_all_close) + matrix.test_all_matrix_types( + X, + utils.assert_transform_equals, + Y=sums, + transform=scprep.utils.matrix_sum, + axis=1, + check=utils.assert_all_close, + ) + matrix.test_numpy_matrix( + X, + utils.assert_transform_equals, + Y=sums, + transform=scprep.utils.matrix_sum, + axis=1, + check=utils.assert_all_close, + ) sums = np.array(X.sum(None)).flatten() - matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=sums, - transform=scprep.utils.matrix_sum, axis=None, - check=utils.assert_all_close) - matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=sums, - transform=scprep.utils.matrix_sum, axis=None, - check=utils.assert_all_close) + matrix.test_all_matrix_types( + X, + utils.assert_transform_equals, + Y=sums, + transform=scprep.utils.matrix_sum, + axis=None, + check=utils.assert_all_close, + ) + matrix.test_numpy_matrix( + X, + utils.assert_transform_equals, + Y=sums, + transform=scprep.utils.matrix_sum, + axis=None, + check=utils.assert_all_close, + ) - assert_raise_message(ValueError, - "Expected axis in [0, 1, None]. Got 5", - scprep.utils.matrix_sum, - data, - 5) + assert_raise_message( + ValueError, + "Expected axis in [0, 1, None]. 
Got 5", + scprep.utils.matrix_sum, + data, + 5, + ) def test_matrix_std(): X = data.generate_positive_sparse_matrix(shape=(50, 100)) stds = np.array(X.std(0)).flatten() - matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=stds, - transform=scprep.utils.matrix_std, axis=0, - check=utils.assert_all_close) - matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=stds, - transform=scprep.utils.matrix_std, axis=0, - check=utils.assert_all_close) + matrix.test_all_matrix_types( + X, + utils.assert_transform_equals, + Y=stds, + transform=scprep.utils.matrix_std, + axis=0, + check=utils.assert_all_close, + ) + matrix.test_numpy_matrix( + X, + utils.assert_transform_equals, + Y=stds, + transform=scprep.utils.matrix_std, + axis=0, + check=utils.assert_all_close, + ) stds = np.array(X.std(1)).flatten() - matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=stds, - transform=scprep.utils.matrix_std, axis=1, - check=utils.assert_all_close) - matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=stds, - transform=scprep.utils.matrix_std, axis=1, - check=utils.assert_all_close) + matrix.test_all_matrix_types( + X, + utils.assert_transform_equals, + Y=stds, + transform=scprep.utils.matrix_std, + axis=1, + check=utils.assert_all_close, + ) + matrix.test_numpy_matrix( + X, + utils.assert_transform_equals, + Y=stds, + transform=scprep.utils.matrix_std, + axis=1, + check=utils.assert_all_close, + ) stds = np.array(X.std(None)).flatten() - matrix.test_all_matrix_types(X, utils.assert_transform_equals, Y=stds, - transform=scprep.utils.matrix_std, axis=None, - check=utils.assert_all_close) - matrix.test_numpy_matrix(X, utils.assert_transform_equals, Y=stds, - transform=scprep.utils.matrix_std, axis=None, - check=utils.assert_all_close) - - X_df = pd.DataFrame(X, index=np.arange(X.shape[0]).astype(str), - columns=np.arange(X.shape[1]).astype(str)) + matrix.test_all_matrix_types( + X, + utils.assert_transform_equals, + Y=stds, + transform=scprep.utils.matrix_std, + axis=None, + check=utils.assert_all_close, + ) + matrix.test_numpy_matrix( + X, + utils.assert_transform_equals, + Y=stds, + transform=scprep.utils.matrix_std, + axis=None, + check=utils.assert_all_close, + ) + + X_df = pd.DataFrame( + X, + index=np.arange(X.shape[0]).astype(str), + columns=np.arange(X.shape[1]).astype(str), + ) + def test_fun(X): x = scprep.utils.matrix_std(X, axis=0) - assert x.name == 'std' + assert x.name == "std" assert np.all(x.index == X_df.columns) x = scprep.utils.matrix_std(X, axis=1) - assert x.name == 'std' + assert x.name == "std" assert np.all(x.index == X_df.index) - matrix.test_pandas_matrix_types( - X_df, test_fun) - assert_raise_message(ValueError, - "Expected axis in [0, 1, None]. Got 5", - scprep.utils.matrix_std, - data, - 5) + + matrix.test_pandas_matrix_types(X_df, test_fun) + assert_raise_message( + ValueError, + "Expected axis in [0, 1, None]. 
Got 5", + scprep.utils.matrix_std, + data, + 5, + ) def test_matrix_elementwise_multiply_row(): @@ -356,10 +450,14 @@ def test_matrix_elementwise_multiply_row(): x = X[:, 0] + 1 Y = pd.DataFrame(X).mul(x, axis=0) matrix.test_all_matrix_types( - X, utils.assert_transform_equivalent, Y=Y, + X, + utils.assert_transform_equivalent, + Y=Y, transform=scprep.utils.matrix_vector_elementwise_multiply, check=utils.assert_all_close, - axis=0, multiplier=x) + axis=0, + multiplier=x, + ) def test_matrix_elementwise_multiply_col(): @@ -367,10 +465,14 @@ def test_matrix_elementwise_multiply_col(): x = X[0] + 1 Y = pd.DataFrame(X).mul(x, axis=1) matrix.test_all_matrix_types( - X, utils.assert_transform_equivalent, Y=Y, + X, + utils.assert_transform_equivalent, + Y=Y, transform=scprep.utils.matrix_vector_elementwise_multiply, check=utils.assert_all_close, - axis=1, multiplier=x) + axis=1, + multiplier=x, + ) def test_matrix_elementwise_multiply_guess_row(): @@ -378,10 +480,14 @@ def test_matrix_elementwise_multiply_guess_row(): x = X[:, 0] + 1 Y = pd.DataFrame(X).mul(x, axis=0) matrix.test_all_matrix_types( - X, utils.assert_transform_equivalent, Y=Y, + X, + utils.assert_transform_equivalent, + Y=Y, transform=scprep.utils.matrix_vector_elementwise_multiply, check=utils.assert_all_close, - axis=None, multiplier=x) + axis=None, + multiplier=x, + ) def test_matrix_elementwise_multiply_guess_col(): @@ -389,10 +495,14 @@ def test_matrix_elementwise_multiply_guess_col(): x = X[0] + 1 Y = pd.DataFrame(X).mul(x, axis=1) matrix.test_all_matrix_types( - X, utils.assert_transform_equivalent, Y=Y, + X, + utils.assert_transform_equivalent, + Y=Y, transform=scprep.utils.matrix_vector_elementwise_multiply, check=utils.assert_all_close, - axis=None, multiplier=x) + axis=None, + multiplier=x, + ) def test_matrix_elementwise_multiply_square_guess(): @@ -403,7 +513,9 @@ def test_matrix_elementwise_multiply_square_guess(): "`axis=0` to multiply along rows or " "`axis=1` to multiply along columns.", scprep.utils.matrix_vector_elementwise_multiply, - X, X[0]) + X, + X[0], + ) def test_matrix_elementwise_multiply_row_wrong_size(): @@ -413,7 +525,10 @@ def test_matrix_elementwise_multiply_row_wrong_size(): "Expected `multiplier` to be a vector of length `data.shape[0]` (50)." " Got (100,)", scprep.utils.matrix_vector_elementwise_multiply, - X, X[0], axis=0) + X, + X[0], + axis=0, + ) def test_matrix_elementwise_multiply_col_wrong_size(): @@ -423,7 +538,10 @@ def test_matrix_elementwise_multiply_col_wrong_size(): "Expected `multiplier` to be a vector of length `data.shape[1]` (100)." " Got (50,)", scprep.utils.matrix_vector_elementwise_multiply, - X, X[:, 0], axis=1) + X, + X[:, 0], + axis=1, + ) def test_matrix_elementwise_multiply_guess_wrong_size(): @@ -433,7 +551,9 @@ def test_matrix_elementwise_multiply_guess_wrong_size(): "Expected `multiplier` to be a vector of length `data.shape[0]` (50) " "or `data.shape[1]` (100). Got (10,)", scprep.utils.matrix_vector_elementwise_multiply, - X, X[0, :10]) + X, + X[0, :10], + ) def test_matrix_elementwise_multiply_invalid_axis(): @@ -442,56 +562,71 @@ def test_matrix_elementwise_multiply_invalid_axis(): ValueError, "Expected axis in [0, 1, None]. Got 5", scprep.utils.matrix_vector_elementwise_multiply, - X, X[0], axis=5) + X, + X[0], + axis=5, + ) def test_deprecated(): X = data.load_10X() - assert_raise_message(RuntimeError, - "`scprep.utils.select_cols` is deprecated. 
Use " - "`scprep.select.select_cols` instead.", - scprep.utils.select_cols, - X, - [1, 2, 3]) - assert_raise_message(RuntimeError, - "`scprep.utils.select_rows` is deprecated. Use " - "`scprep.select.select_rows` instead.", - scprep.utils.select_rows, - X, - [1, 2, 3]) - assert_raise_message(RuntimeError, - "`scprep.utils.get_gene_set` is deprecated. Use " - "`scprep.select.get_gene_set` instead.", - scprep.utils.get_gene_set, - X, - starts_with="D") - assert_raise_message(RuntimeError, - "`scprep.utils.get_cell_set` is deprecated. Use " - "`scprep.select.get_cell_set` instead.", - scprep.utils.get_cell_set, - X, - starts_with="A") - assert_raise_message(RuntimeError, - "`scprep.utils.subsample` is deprecated. Use " - "`scprep.select.subsample` instead.", - scprep.utils.subsample, - X, - n=10) + assert_raise_message( + RuntimeError, + "`scprep.utils.select_cols` is deprecated. Use " + "`scprep.select.select_cols` instead.", + scprep.utils.select_cols, + X, + [1, 2, 3], + ) + assert_raise_message( + RuntimeError, + "`scprep.utils.select_rows` is deprecated. Use " + "`scprep.select.select_rows` instead.", + scprep.utils.select_rows, + X, + [1, 2, 3], + ) + assert_raise_message( + RuntimeError, + "`scprep.utils.get_gene_set` is deprecated. Use " + "`scprep.select.get_gene_set` instead.", + scprep.utils.get_gene_set, + X, + starts_with="D", + ) + assert_raise_message( + RuntimeError, + "`scprep.utils.get_cell_set` is deprecated. Use " + "`scprep.select.get_cell_set` instead.", + scprep.utils.get_cell_set, + X, + starts_with="A", + ) + assert_raise_message( + RuntimeError, + "`scprep.utils.subsample` is deprecated. Use " + "`scprep.select.subsample` instead.", + scprep.utils.subsample, + X, + n=10, + ) def test_is_sparse_dataframe(): X = data.load_10X(sparse=False) Y = X.astype(pd.SparseDtype(float, fill_value=0.0)) assert scprep.utils.is_sparse_dataframe(Y) + def test_fun(X): assert not scprep.utils.is_sparse_dataframe(X) + matrix.test_matrix_types( X, test_fun, - matrix._scipy_matrix_types + - matrix._numpy_matrix_types + - matrix._pandas_dense_matrix_types + - [matrix.SparseDataFrame_deprecated] + matrix._scipy_matrix_types + + matrix._numpy_matrix_types + + matrix._pandas_dense_matrix_types + + [matrix.SparseDataFrame_deprecated], ) @@ -500,53 +635,57 @@ def test_SparseDataFrame(): Y = X.astype(pd.SparseDtype(float, fill_value=0.0)) index = X.index columns = X.columns + def test_fun(X): X = scprep.utils.SparseDataFrame(X, index=index, columns=columns) utils.assert_matrix_class_equivalent(X, Y) - matrix.test_all_matrix_types( - X, - test_fun - ) + + matrix.test_all_matrix_types(X, test_fun) matrix.test_pandas_matrix_types( X, utils.assert_transform_equivalent, Y=Y, - transform=scprep.utils.SparseDataFrame + transform=scprep.utils.SparseDataFrame, ) def test_is_sparse_series(): X = data.load_10X(sparse=True) assert scprep.utils.is_sparse_series(X[X.columns[0]]) + def test_fun(X): if isinstance(X, pd.SparseDataFrame): x = X[X.columns[0]] else: x = scprep.select.select_cols(X, idx=0) assert not scprep.utils.is_sparse_series(x) + matrix.test_matrix_types( X.to_numpy(), test_fun, - matrix._scipy_matrix_types + - matrix._numpy_matrix_types + - matrix._pandas_dense_matrix_types + - [matrix.SparseDataFrame_deprecated] + matrix._scipy_matrix_types + + matrix._numpy_matrix_types + + matrix._pandas_dense_matrix_types + + [matrix.SparseDataFrame_deprecated], ) def test_sort_clusters_by_values_accurate(): - clusters = [0,0,1,1,2,2] - values = [5,5,1,1,2,2] + clusters = [0, 0, 1, 1, 2, 2] + values = [5, 
5, 1, 1, 2, 2] new_clusters = scprep.utils.sort_clusters_by_values(clusters, values) - test_array = scprep.utils.toarray([2,2,0,0,1,1]) + test_array = scprep.utils.toarray([2, 2, 0, 0, 1, 1]) np.testing.assert_array_equal(new_clusters, test_array) + def test_sort_clusters_by_values_wrong_len(): - clusters = [0,0,1,1,2,2] - values = [5,5,1,1,2] - assert_raise_message(ValueError, - "Expected clusters ({}) and values ({}) to be the " - "same length.".format(len(clusters), len(values)), - scprep.utils.sort_clusters_by_values, - clusters, - values) + clusters = [0, 0, 1, 1, 2, 2] + values = [5, 5, 1, 1, 2] + assert_raise_message( + ValueError, + "Expected clusters ({}) and values ({}) to be the " + "same length.".format(len(clusters), len(values)), + scprep.utils.sort_clusters_by_values, + clusters, + values, + ) diff --git a/test/tools/__init__.py b/test/tools/__init__.py index 610d72cb..fa270b41 100644 --- a/test/tools/__init__.py +++ b/test/tools/__init__.py @@ -1,2 +1,3 @@ import matplotlib as mpl + mpl.use("agg") diff --git a/test/tools/data.py b/test/tools/data.py index 52086388..8df4ff77 100644 --- a/test/tools/data.py +++ b/test/tools/data.py @@ -56,12 +56,10 @@ def load_10X(**kwargs): ------- data : array-like 10X data """ - return scprep.io.load_10X( - os.path.join(data_dir, "test_10X"), **kwargs) + return scprep.io.load_10X(os.path.join(data_dir, "test_10X"), **kwargs) -def generate_positive_sparse_matrix(shape=[200, 500], seed=42, - poisson_mean=0.1): +def generate_positive_sparse_matrix(shape=[200, 500], seed=42, poisson_mean=0.1): """Returns an ndarray of shape=shape filled mostly with zeros Creates a matrix with np.random.normal and multiplies the result @@ -78,7 +76,6 @@ def generate_positive_sparse_matrix(shape=[200, 500], seed=42, np.ndarray """ np.random.seed(seed) - X = np.random.normal(0, 1, shape) * \ - np.random.poisson(poisson_mean, shape) + X = np.random.normal(0, 1, shape) * np.random.poisson(poisson_mean, shape) X = np.abs(X) return X diff --git a/test/tools/matrix.py b/test/tools/matrix.py index e2251f58..caef81f7 100644 --- a/test/tools/matrix.py +++ b/test/tools/matrix.py @@ -6,31 +6,15 @@ def _ignore_pandas_sparse_warning(): - warnings.filterwarnings( - "ignore", - category=FutureWarning, - message="SparseSeries") - warnings.filterwarnings( - "ignore", - category=FutureWarning, - message="SparseDataFrame") - warnings.filterwarnings( - "error", - category=pd.errors.PerformanceWarning) + warnings.filterwarnings("ignore", category=FutureWarning, message="SparseSeries") + warnings.filterwarnings("ignore", category=FutureWarning, message="SparseDataFrame") + warnings.filterwarnings("error", category=pd.errors.PerformanceWarning) def _reset_warnings(): - warnings.filterwarnings( - "error", - category=FutureWarning, - message="SparseSeries") - warnings.filterwarnings( - "error", - category=FutureWarning, - message="SparseDataFrame") - warnings.filterwarnings( - "error", - category=pd.errors.PerformanceWarning) + warnings.filterwarnings("error", category=FutureWarning, message="SparseSeries") + warnings.filterwarnings("error", category=FutureWarning, message="SparseDataFrame") + warnings.filterwarnings("error", category=pd.errors.PerformanceWarning) _reset_warnings() @@ -42,16 +26,19 @@ def _no_warning_dia_matrix(*args, **kwargs): warnings.filterwarnings( "ignore", category=sparse.SparseEfficiencyWarning, - message="Constructing a DIA matrix with [0-9]*" - " diagonals is inefficient") + message="Constructing a DIA matrix with [0-9]*" " diagonals is inefficient", + ) 
return sparse.dia_matrix(*args, **kwargs) + def SparseDataFrame_deprecated(X, default_fill_value=0.0): return pd.SparseDataFrame(X, default_fill_value=default_fill_value) + def SparseSeries(X, default_fill_value=0.0): return pd.Series(X).astype(pd.SparseDtype(float, fill_value=default_fill_value)) + def SparseSeries_deprecated(X, default_fill_value=0.0): return pd.SparseSeries(X, fill_value=default_fill_value) @@ -87,24 +74,23 @@ def SparseDataFrame(X, default_fill_value=0.0): SparseDataFrame_deprecated, ] -_pandas_vector_types = [ - pd.Series, - SparseSeries, - SparseSeries_deprecated -] +_pandas_vector_types = [pd.Series, SparseSeries, SparseSeries_deprecated] _pandas_matrix_types = _pandas_dense_matrix_types + _pandas_sparse_matrix_types -_indexable_matrix_types = [ - sparse.csr_matrix, - sparse.csc_matrix, - sparse.lil_matrix, - sparse.dok_matrix -] + _numpy_matrix_types + _pandas_matrix_types +_indexable_matrix_types = ( + [sparse.csr_matrix, sparse.csc_matrix, sparse.lil_matrix, sparse.dok_matrix] + + _numpy_matrix_types + + _pandas_matrix_types +) def _typename(X): - if isinstance(X, pd.DataFrame) and not isinstance(X, pd.SparseDataFrame) and hasattr(X, "sparse"): + if ( + isinstance(X, pd.DataFrame) + and not isinstance(X, pd.SparseDataFrame) + and hasattr(X, "sparse") + ): return "DataFrame[SparseArray]" else: return type(X).__name__ @@ -128,9 +114,11 @@ def test_matrix_types(X, test_fun, matrix_types, *args, **kwargs): try: test_fun(Y, *args, **kwargs) except Exception as e: - raise RuntimeError("{} with {} input to {}\n{}".format( - type(e).__name__, _typename(Y), test_fun.__name__, - str(e))) + raise RuntimeError( + "{} with {} input to {}\n{}".format( + type(e).__name__, _typename(Y), test_fun.__name__, str(e) + ) + ) finally: _reset_warnings() @@ -146,8 +134,8 @@ def test_dense_matrix_types(X, test_fun, *args, **kwargs): **kwargs : keyword arguments for test_fun """ test_matrix_types( - X, test_fun, _numpy_matrix_types + _pandas_dense_matrix_types, - *args, **kwargs) + X, test_fun, _numpy_matrix_types + _pandas_dense_matrix_types, *args, **kwargs + ) def test_sparse_matrix_types(X, test_fun, *args, **kwargs): @@ -161,8 +149,8 @@ def test_sparse_matrix_types(X, test_fun, *args, **kwargs): **kwargs : keyword arguments for test_fun """ test_matrix_types( - X, test_fun, _scipy_matrix_types + _pandas_sparse_matrix_types, - *args, **kwargs) + X, test_fun, _scipy_matrix_types + _pandas_sparse_matrix_types, *args, **kwargs + ) def test_all_matrix_types(X, test_fun, *args, **kwargs): @@ -189,9 +177,7 @@ def test_pandas_matrix_types(X, test_fun, *args, **kwargs): *args : positional arguments for test_fun **kwargs : keyword arguments for test_fun """ - test_matrix_types( - X, test_fun, _pandas_matrix_types, - *args, **kwargs) + test_matrix_types(X, test_fun, _pandas_matrix_types, *args, **kwargs) def test_numpy_matrix(X, test_fun, *args, **kwargs): @@ -204,6 +190,4 @@ def test_numpy_matrix(X, test_fun, *args, **kwargs): *args : positional arguments for test_fun **kwargs : keyword arguments for test_fun """ - test_matrix_types( - X, test_fun, [np.matrix], - *args, **kwargs) + test_matrix_types(X, test_fun, [np.matrix], *args, **kwargs) diff --git a/test/tools/utils.py b/test/tools/utils.py index 25671c2e..02d6afb7 100644 --- a/test/tools/utils.py +++ b/test/tools/utils.py @@ -42,9 +42,7 @@ def assert_transform_equals(X, Y, transform, check=assert_all_equal, **kwargs): Y2 : returned value of transform(X, **kwargs) """ Y2 = transform(X, **kwargs) - check(Y, Y2), "{} failed on {}".format( - 
transform, - matrix._typename(X)) + check(Y, Y2), "{} failed on {}".format(transform, matrix._typename(X)) return Y2 @@ -65,8 +63,7 @@ def assert_transform_unchanged(X, transform, check=assert_all_equal, **kwargs): assert_transform_equals(X, X, transform, check=check, **kwargs) -def assert_transform_equivalent(X, Y, transform, check=assert_all_equal, - **kwargs): +def assert_transform_equivalent(X, Y, transform, check=assert_all_equal, **kwargs): """Check the output of transform(X, **kwargs) == Y and transform(X, **kwargs) gives the same kind of matrix as X Parameters @@ -83,9 +80,9 @@ def assert_transform_equivalent(X, Y, transform, check=assert_all_equal, Y2 : returned value of transform(X, **kwargs) """ Y2 = assert_transform_equals(X, Y, transform, check=check, **kwargs) - assert assert_matrix_class_equivalent(X, Y2), \ - "{} produced inconsistent matrix output".format( - _typename(X)) + assert assert_matrix_class_equivalent( + X, Y2 + ), "{} produced inconsistent matrix output".format(_typename(X)) def assert_transform_raises(X, transform, exception=ValueError, **kwargs): @@ -101,8 +98,9 @@ def assert_transform_raises(X, transform, exception=ValueError, **kwargs): def _is_sparse_dataframe(X): - return isinstance(X, pd.SparseDataFrame) or \ - (isinstance(X, pd.DataFrame) and hasattr(X, "sparse")) + return isinstance(X, pd.SparseDataFrame) or ( + isinstance(X, pd.DataFrame) and hasattr(X, "sparse") + ) def _sparse_dataframe_density(X): From 314017f22dc9b5df78f8340b960b80bd734b2b46 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 29 Oct 2019 12:46:32 -0400 Subject: [PATCH 03/10] add autoblack --- CONTRIBUTING.md | 9 ++++++++- autoblack.sh | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 autoblack.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4341d1ad..a6b9b910 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,6 +20,13 @@ up" on issues that others reported and that are relevant to you. It also helps us if you spread the word: reference the project from your blog and articles, link to it from your website, or simply star it in GitHub to say "I use it". +Code Style and Testing +---------------------- + +`scprep` is maintained at close to 100% code coverage. Contributors are encouraged to write tests for their code, but if you do not know how to do so, please do not feel discouraged from contributing code! Others can always help you test your contribution. + +Code style is dictated by [`black`](https://pypi.org/project/black/#installation-and-usage). To automatically reformat your code when you run `git commit`, you can run `./autoblack.sh` in the root directory of this project to add a hook to your `git` repository. + Code of Conduct --------------- @@ -29,4 +36,4 @@ of the Python Software Foundation: https://www.python.org/psf/codeofconduct/. Attribution --------------- -This `CONTRIBUTING.md` was adapted from [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md). \ No newline at end of file +This `CONTRIBUTING.md` was adapted from [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md). 
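Note: the `autoblack.sh` script added below installs a git pre-commit hook that runs `black` over every staged Python file and re-stages the result. As a minimal sketch of the same behavior in Python (illustrative only, not part of the patch: the `format_staged_files` name is invented here, while the `git diff` and `black` invocations mirror the hook's own commands):

```python
# Sketch of the autoblack pre-commit behavior: reformat staged .py files
# with black, then re-stage them. Assumes git and black are on the PATH.
import subprocess


def format_staged_files():
    # Same file listing the hook uses: staged, non-deleted Python files
    staged = subprocess.run(
        ["git", "diff", "--staged", "--name-only", "--diff-filter=d", "--", "*.py"],
        capture_output=True,
        text=True,
        check=True,
    ).stdout.splitlines()
    for path in staged:
        subprocess.run(["black", "-q", path], check=True)  # reformat in place
        subprocess.run(["git", "add", path], check=True)  # re-stage the result


if __name__ == "__main__":
    format_staged_files()
```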
diff --git a/autoblack.sh b/autoblack.sh
new file mode 100644
index 00000000..c420d199
--- /dev/null
+++ b/autoblack.sh
@@ -0,0 +1,14 @@
+cat <<EOF >> .git/hooks/pre-commit
+#!/bin/sh
+
+set -e
+
+files=`git diff --staged --name-only --diff-filter=d -- "*.py"`
+
+for file in $files; do
+    black $file
+    git add $file
+done
+EOF
+chmod +x .git/hooks/pre-commit
+

From 9ec65ba9e1a32c20ee045f2b49df7eef93cc4ce0 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Tue, 29 Oct 2019 12:48:58 -0400
Subject: [PATCH 04/10] skip py3.5 for black

---
 .travis.yml | 2 +-
 setup.py    | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index bdf123fa..51293ed5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -42,7 +42,7 @@ script:
 
   - pip install -U .[test]
 
-  - black . --check --diff
+  - if [ "$TRAVIS_PYTHON_VERSION" != "3.5" ]; then black . --check --diff; fi
   - python setup.py test
   - pip install -U .[doc]
   - cd doc; make html
diff --git a/setup.py b/setup.py
index 1e891c23..88920c0f 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,6 @@
     "coveralls",
     "parameterized",
     "requests",
-    "black",
 ]
 
 doc_requires = [
@@ -36,7 +35,7 @@
 elif sys.version_info[:2] < (3, 6):
     test_requires += ["matplotlib>=3.0,<3.1", "rpy2>=3.0,<3.1"]
 else:
-    test_requires += ["matplotlib>=3.0", "rpy2>=3.0"]
+    test_requires += ["matplotlib>=3.0", "rpy2>=3.0", "black"]
 
 version_py = os.path.join(os.path.dirname(__file__), "scprep", "version.py")
 version = open(version_py).read().strip().split("=")[-1].replace('"', "").strip()

From 953ac47aac5d85ed3be878dc1dc9dc5b3847bc53 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Tue, 29 Oct 2019 12:55:30 -0400
Subject: [PATCH 05/10] bump version

---
 scprep/version.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scprep/version.py b/scprep/version.py
index 940f0424..5cc2267a 100644
--- a/scprep/version.py
+++ b/scprep/version.py
@@ -1,4 +1,5 @@
 # author: Scott Gigante
 # (C) 2018 Krishnaswamy Lab GPLv2
 
-__version__ = "1.0.2"
+__version__ = "1.0.3"
+

From f166cba705c4c31cca21872bf9fe611a9cf90f32 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Tue, 29 Oct 2019 14:30:51 -0400
Subject: [PATCH 06/10] lint

---
 scprep/version.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scprep/version.py b/scprep/version.py
index 5cc2267a..ae8f795d 100644
--- a/scprep/version.py
+++ b/scprep/version.py
@@ -2,4 +2,3 @@
 # (C) 2018 Krishnaswamy Lab GPLv2
 
 __version__ = "1.0.3"
-

From dd12258a3ec78c3a0febd7e956c33df1782c0e9f Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Tue, 29 Oct 2019 15:32:15 -0400
Subject: [PATCH 07/10] improve coverage of fcs

---
 scprep/io/fcs.py | 132 ++++++++++++++++++++++++++---------------------
 test/test_io.py  |  93 +++++++++++++++++++--------------
 2 files changed, 126 insertions(+), 99 deletions(-)

diff --git a/scprep/io/fcs.py b/scprep/io/fcs.py
index 7ffdf023..fed84712 100644
--- a/scprep/io/fcs.py
+++ b/scprep/io/fcs.py
@@ -103,30 +103,7 @@ def _reformat_meta(meta, channel_numbers):
     return df
 
 
-def _fcsextract(filename, channel_naming="$PnS", reformat_meta=True):
-    """Experimental FCS parser
-
-    Some files fail to load with `fcsparser.parse`. For these, we provide an
-    alternative parser. It is not guaranteed to work in all cases.
-
-    Code copied from https://github.com/pontikos/fcstools/blob/master/fcs.extract.py
-
-    Paramseters
-    -----------
-    channel_naming: '$PnS' | '$PnN'
-        Determines which meta data field is used for naming the channels.
- The default should be $PnS (even though it is not guaranteed to be unique) - $PnN stands for the short name (guaranteed to be unique). - Will look like 'FL1-H' - $PnS stands for the actual name (not guaranteed to be unique). - Will look like 'FSC-H' (Forward scatter) - The chosen field will be used to population self.channels - Note: These names are not flipped in the implementation. - It looks like they were swapped for some reason in the official FCS specification. - reformat_meta: bool - If true, the meta data is reformatted with the channel information organized - into a DataFrame and moved into the '_channels_' key - """ +def _read_fcs_header(filename): meta = dict() with open(filename, "rb") as handle: # Parse HEADER @@ -135,8 +112,8 @@ def _fcsextract(filename, channel_naming="$PnS", reformat_meta=True): meta["__header__"]["FCS format"] = header[0:6].strip() meta["__header__"]["text start"] = int(header[10:18].strip()) meta["__header__"]["text end"] = int(header[18:26].strip()) - meta["__header__"]["data start"] = data_start = int(header[26:34].strip()) - meta["__header__"]["data end"] = data_end = int(header[34:42].strip()) + meta["__header__"]["data start"] = int(header[26:34].strip()) + meta["__header__"]["data end"] = int(header[34:42].strip()) meta["__header__"]["analysis start"] = int(header[42:50].strip()) meta["__header__"]["analysis end"] = int(header[50:58].strip()) @@ -154,53 +131,88 @@ def _fcsextract(filename, channel_naming="$PnS", reformat_meta=True): # Iterate over every 2 consecutive elements of the array for k, v in zip(keyvalarray[::2], keyvalarray[1::2]): meta[k.decode()] = v.decode() + return meta - if ( - meta["__header__"]["data start"] == 0 - and meta["__header__"]["data end"] == 0 - ): - data_start = int(meta["$DATASTART"]) - data_end = int(meta["$DATAEND"]) - num_dims = meta["$PAR"] = int(meta["$PAR"]) - num_events = meta["$TOT"] = int(meta["$TOT"]) +def _parse_fcs_header(meta): + if ( + meta["__header__"]["data start"] == 0 + and meta["__header__"]["data end"] == 0 + ): + meta["$DATASTART"] = int(meta["$DATASTART"]) + meta["$DATAEND"] = int(meta["$DATAEND"]) + else: + meta["$DATASTART"] = meta["__header__"]["data start"] + meta["$DATAEND"] = meta["__header__"]["data end"] - # Read DATA portion - handle.seek(data_start) - data = handle.read(data_end - data_start + 1) - - # Determine data format - datatype = meta["$DATATYPE"].lower() - if datatype not in ["f", "d"]: - raise ValueError( - "Expected $DATATYPE in ['F', 'D']. " - "Got '{}'".format(meta["$DATATYPE"]) - ) + meta["$PAR"] = int(meta["$PAR"]) + meta["$TOT"] = int(meta["$TOT"]) - # Determine endianess - endian = meta["$BYTEORD"] - if endian == "4,3,2,1": - # Big endian data format - endian = ">" - elif endian == "1,2,3,4": - # Little endian data format - endian = "<" - else: - raise ValueError( - "Expected $BYTEORD in ['1,2,3,4', '4,3,2,1']. " - "Got '{}'".format(endian) - ) + # Determine data format + meta["$DATATYPE"] = meta["$DATATYPE"].lower() + if meta["$DATATYPE"] not in ["f", "d"]: + raise ValueError( + "Expected $DATATYPE in ['F', 'D']. " + "Got '{}'".format(meta["$DATATYPE"]) + ) + + # Determine endianess + endian = meta["$BYTEORD"] + if endian == "4,3,2,1": + # Big endian data format + meta['$ENDIAN'] = ">" + elif endian == "1,2,3,4": + # Little endian data format + meta['$ENDIAN'] = "<" + else: + raise ValueError( + "Expected $BYTEORD in ['1,2,3,4', '4,3,2,1']. 
" + "Got '{}'".format(endian) + ) + return meta + + +def _fcsextract(filename, channel_naming="$PnS", reformat_meta=True): + """Experimental FCS parser + + Some files fail to load with `fcsparser.parse`. For these, we provide an + alternative parser. It is not guaranteed to work in all cases. + + Code copied from https://github.com/pontikos/fcstools/blob/master/fcs.extract.py + + Paramseters + ----------- + channel_naming: '$PnS' | '$PnN' + Determines which meta data field is used for naming the channels. + The default should be $PnS (even though it is not guaranteed to be unique) + $PnN stands for the short name (guaranteed to be unique). + Will look like 'FL1-H' + $PnS stands for the actual name (not guaranteed to be unique). + Will look like 'FSC-H' (Forward scatter) + The chosen field will be used to population self.channels + Note: These names are not flipped in the implementation. + It looks like they were swapped for some reason in the official FCS specification. + reformat_meta: bool + If true, the meta data is reformatted with the channel information organized + into a DataFrame and moved into the '_channels_' key + """ + meta = _read_fcs_header(filename) + meta = _parse_fcs_header(meta) + with open(filename, "rb") as handle: + # Read DATA portion + handle.seek(meta["$DATASTART"]) + data = handle.read(meta["$DATAEND"] - meta["$DATASTART"] + 1) # Put data in StringIO so we can read bytes like a file data = BytesIO(data) # Parsing DATA segment # Create format string based on endianeness and the specified data type - fmt = endian + str(num_dims) + datatype + fmt = meta['$ENDIAN'] + str(meta["$PAR"]) + meta["$DATATYPE"] datasize = struct.calcsize(fmt) events = [] # Read and unpack all the events from the data - for e in range(num_events): + for e in range(meta["$TOT"]): event = struct.unpack(fmt, data.read(datasize)) events.append(event) diff --git a/test/test_io.py b/test/test_io.py index 73a969a1..3cc73c22 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -10,6 +10,7 @@ import numpy as np from scipy import sparse import os +import copy import shutil import fcsparser import zipfile @@ -527,6 +528,26 @@ def test_save_mtx(): shutil.rmtree("test_mtx") +def _assert_fcs_meta_equal(fcsparser_meta, scprep_meta, reformat_meta=True): + assert set(scprep_meta.keys()).difference(set(fcsparser_meta.keys())) == {'$DATAEND', '$DATASTART', '$ENDIAN'} + for key in fcsparser_meta.keys(): + try: + np.testing.assert_array_equal(fcsparser_meta[key], scprep_meta[key], key) + except AssertionError: + if key == "$NEXTDATA" or (key.startswith("$P") and key.endswith("B")): + np.testing.assert_array_equal(fcsparser_meta[key], int(scprep_meta[key]), key) + elif key == "_channels_": + for column in fcsparser_meta[key].columns: + scprep_column = scprep_meta[key][column].astype(fcsparser_meta[key][column].dtype) + np.testing.assert_array_equal( + fcsparser_meta[key][column], scprep_column, key + column + ) + elif key == "$DATATYPE": + assert fcsparser_meta[key].lower() == scprep_meta[key].lower() + else: + raise + + def test_fcs(): path = fcsparser.test_sample_path meta, data = fcsparser.parse(path) @@ -544,36 +565,14 @@ def test_fcs(): ) X_meta, _, X = scprep.io.load_fcs(path, reformat_meta=False, override=True) - assert set(meta.keys()) == set(X_meta.keys()) - for key in meta.keys(): - try: - np.testing.assert_array_equal(meta[key], X_meta[key], key) - except AssertionError: - if key == "$NEXTDATA" or (key.startswith("$P") and key.endswith("B")): - np.testing.assert_array_equal(meta[key], int(X_meta[key]), 
key) - else: - raise + _assert_fcs_meta_equal(meta, X_meta, reformat_meta=False) def test_fcs_reformat_meta(): path = fcsparser.test_sample_path meta, data = fcsparser.parse(path, reformat_meta=True) X_meta, _, X = scprep.io.load_fcs(path, reformat_meta=True, override=True) - assert set(meta.keys()) == set(X_meta.keys()) - for key in meta.keys(): - try: - np.testing.assert_array_equal(meta[key], X_meta[key], key) - except AssertionError: - if key == "$NEXTDATA" or (key.startswith("$P") and key.endswith("B")): - np.testing.assert_array_equal(meta[key], int(X_meta[key]), key) - elif key == "_channels_": - for column in meta[key].columns: - X_column = X_meta[key][column].astype(meta[key][column].dtype) - np.testing.assert_array_equal( - meta[key][column], X_column, key + column - ) - else: - raise + _assert_fcs_meta_equal(meta, X_meta) assert "Time" not in X.columns assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) @@ -586,21 +585,7 @@ def test_fcs_PnN(): X_meta, _, X = scprep.io.load_fcs( path, reformat_meta=True, channel_naming="$PnN", override=True ) - assert set(meta.keys()) == set(X_meta.keys()) - for key in meta.keys(): - try: - np.testing.assert_array_equal(meta[key], X_meta[key], key) - except AssertionError: - if key == "$NEXTDATA" or (key.startswith("$P") and key.endswith("B")): - np.testing.assert_array_equal(meta[key], int(X_meta[key]), key) - elif key == "_channels_": - for column in meta[key].columns: - X_column = X_meta[key][column].astype(meta[key][column].dtype) - np.testing.assert_array_equal( - meta[key][column], X_column, key + column - ) - else: - raise + _assert_fcs_meta_equal(meta, X_meta) assert "Time" not in X.columns assert len(set(X.columns).difference(data.columns)) == 0 np.testing.assert_array_equal(X.index, data.index) @@ -633,6 +618,36 @@ def test_fcs_naming_error(): ) +def test_fcs_header_error(): + path = fcsparser.test_sample_path + meta, data = fcsparser.parse(path, reformat_meta=True, + channel_naming='$PnN') + meta_bad = copy.deepcopy(meta) + meta_bad['__header__']['data start'] = 0 + meta_bad['__header__']['data end'] = 0 + assert scprep.io.fcs._parse_fcs_header(meta_bad)['$DATASTART'] == scprep.io.fcs._parse_fcs_header(meta)['$DATASTART'] + assert scprep.io.fcs._parse_fcs_header(meta_bad)['$DATAEND'] == scprep.io.fcs._parse_fcs_header(meta)['$DATAEND'] + + meta_bad = copy.deepcopy(meta) + meta_bad['$DATATYPE'] = 'invalid' + assert_raise_message( + ValueError, + "Expected $DATATYPE in ['F', 'D']. " + "Got 'invalid'", + scprep.io.fcs._parse_fcs_header, meta_bad) + + meta_bad = copy.deepcopy(meta) + for byteord, endian in zip(["4,3,2,1", "1,2,3,4"], [">", "<"]): + meta_bad['$BYTEORD'] = byteord + assert scprep.io.fcs._parse_fcs_header(meta_bad)['$ENDIAN'] == endian + meta_bad['$BYTEORD'] = "invalid" + assert_raise_message( + ValueError, + "Expected $BYTEORD in ['1,2,3,4', '4,3,2,1']. 
" + "Got 'invalid'", + scprep.io.fcs._parse_fcs_header, meta_bad) + + def test_parse_header(): header1 = np.arange(10) header2 = os.path.join(data.data_dir, "gene_symbols.csv") From 1c756dff860f0591c8c6b5d24a625ce2491522db Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 29 Oct 2019 16:20:28 -0400 Subject: [PATCH 08/10] fix test --- test/test_io.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_io.py b/test/test_io.py index 3cc73c22..c480c649 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -623,6 +623,8 @@ def test_fcs_header_error(): meta, data = fcsparser.parse(path, reformat_meta=True, channel_naming='$PnN') meta_bad = copy.deepcopy(meta) + meta_bad['$DATASTART'] = meta_bad['__header__']['data start'] + meta_bad['$DATAEND'] = meta_bad['__header__']['data end'] meta_bad['__header__']['data start'] = 0 meta_bad['__header__']['data end'] = 0 assert scprep.io.fcs._parse_fcs_header(meta_bad)['$DATASTART'] == scprep.io.fcs._parse_fcs_header(meta)['$DATASTART'] From 051062792be39e028897b6d1a82de2df934c0b85 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 29 Oct 2019 16:30:12 -0400 Subject: [PATCH 09/10] fix autoblack --- autoblack.sh | 8 +++---- scprep/io/fcs.py | 17 +++++--------- test/test_io.py | 59 ++++++++++++++++++++++++++++++------------------ 3 files changed, 47 insertions(+), 37 deletions(-) diff --git a/autoblack.sh b/autoblack.sh index c420d199..2e26e4db 100644 --- a/autoblack.sh +++ b/autoblack.sh @@ -3,11 +3,11 @@ cat <> .git/hooks/pre-commit set -e -files=`git diff --staged --name-only --diff-filter=d -- "*.py"` +files=\$(git diff --staged --name-only --diff-filter=d -- "*.py") -for file in $files; do - black $file - git add $file +for file in \$files; do + black \$file + git add \$file done EOF chmod +x .git/hooks/pre-commit diff --git a/scprep/io/fcs.py b/scprep/io/fcs.py index fed84712..02f876e4 100644 --- a/scprep/io/fcs.py +++ b/scprep/io/fcs.py @@ -135,10 +135,7 @@ def _read_fcs_header(filename): def _parse_fcs_header(meta): - if ( - meta["__header__"]["data start"] == 0 - and meta["__header__"]["data end"] == 0 - ): + if meta["__header__"]["data start"] == 0 and meta["__header__"]["data end"] == 0: meta["$DATASTART"] = int(meta["$DATASTART"]) meta["$DATAEND"] = int(meta["$DATAEND"]) else: @@ -152,22 +149,20 @@ def _parse_fcs_header(meta): meta["$DATATYPE"] = meta["$DATATYPE"].lower() if meta["$DATATYPE"] not in ["f", "d"]: raise ValueError( - "Expected $DATATYPE in ['F', 'D']. " - "Got '{}'".format(meta["$DATATYPE"]) + "Expected $DATATYPE in ['F', 'D']. " "Got '{}'".format(meta["$DATATYPE"]) ) # Determine endianess endian = meta["$BYTEORD"] if endian == "4,3,2,1": # Big endian data format - meta['$ENDIAN'] = ">" + meta["$ENDIAN"] = ">" elif endian == "1,2,3,4": # Little endian data format - meta['$ENDIAN'] = "<" + meta["$ENDIAN"] = "<" else: raise ValueError( - "Expected $BYTEORD in ['1,2,3,4', '4,3,2,1']. " - "Got '{}'".format(endian) + "Expected $BYTEORD in ['1,2,3,4', '4,3,2,1']. 
" "Got '{}'".format(endian) ) return meta @@ -208,7 +203,7 @@ def _fcsextract(filename, channel_naming="$PnS", reformat_meta=True): # Parsing DATA segment # Create format string based on endianeness and the specified data type - fmt = meta['$ENDIAN'] + str(meta["$PAR"]) + meta["$DATATYPE"] + fmt = meta["$ENDIAN"] + str(meta["$PAR"]) + meta["$DATATYPE"] datasize = struct.calcsize(fmt) events = [] # Read and unpack all the events from the data diff --git a/test/test_io.py b/test/test_io.py index c480c649..a12fdc15 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -529,16 +529,24 @@ def test_save_mtx(): def _assert_fcs_meta_equal(fcsparser_meta, scprep_meta, reformat_meta=True): - assert set(scprep_meta.keys()).difference(set(fcsparser_meta.keys())) == {'$DATAEND', '$DATASTART', '$ENDIAN'} + assert set(scprep_meta.keys()).difference(set(fcsparser_meta.keys())) == { + "$DATAEND", + "$DATASTART", + "$ENDIAN", + } for key in fcsparser_meta.keys(): try: np.testing.assert_array_equal(fcsparser_meta[key], scprep_meta[key], key) except AssertionError: if key == "$NEXTDATA" or (key.startswith("$P") and key.endswith("B")): - np.testing.assert_array_equal(fcsparser_meta[key], int(scprep_meta[key]), key) + np.testing.assert_array_equal( + fcsparser_meta[key], int(scprep_meta[key]), key + ) elif key == "_channels_": for column in fcsparser_meta[key].columns: - scprep_column = scprep_meta[key][column].astype(fcsparser_meta[key][column].dtype) + scprep_column = scprep_meta[key][column].astype( + fcsparser_meta[key][column].dtype + ) np.testing.assert_array_equal( fcsparser_meta[key][column], scprep_column, key + column ) @@ -620,34 +628,41 @@ def test_fcs_naming_error(): def test_fcs_header_error(): path = fcsparser.test_sample_path - meta, data = fcsparser.parse(path, reformat_meta=True, - channel_naming='$PnN') + meta, data = fcsparser.parse(path, reformat_meta=True, channel_naming="$PnN") meta_bad = copy.deepcopy(meta) - meta_bad['$DATASTART'] = meta_bad['__header__']['data start'] - meta_bad['$DATAEND'] = meta_bad['__header__']['data end'] - meta_bad['__header__']['data start'] = 0 - meta_bad['__header__']['data end'] = 0 - assert scprep.io.fcs._parse_fcs_header(meta_bad)['$DATASTART'] == scprep.io.fcs._parse_fcs_header(meta)['$DATASTART'] - assert scprep.io.fcs._parse_fcs_header(meta_bad)['$DATAEND'] == scprep.io.fcs._parse_fcs_header(meta)['$DATAEND'] - + meta_bad["$DATASTART"] = meta_bad["__header__"]["data start"] + meta_bad["$DATAEND"] = meta_bad["__header__"]["data end"] + meta_bad["__header__"]["data start"] = 0 + meta_bad["__header__"]["data end"] = 0 + assert ( + scprep.io.fcs._parse_fcs_header(meta_bad)["$DATASTART"] + == scprep.io.fcs._parse_fcs_header(meta)["$DATASTART"] + ) + assert ( + scprep.io.fcs._parse_fcs_header(meta_bad)["$DATAEND"] + == scprep.io.fcs._parse_fcs_header(meta)["$DATAEND"] + ) + meta_bad = copy.deepcopy(meta) - meta_bad['$DATATYPE'] = 'invalid' + meta_bad["$DATATYPE"] = "invalid" assert_raise_message( ValueError, - "Expected $DATATYPE in ['F', 'D']. " - "Got 'invalid'", - scprep.io.fcs._parse_fcs_header, meta_bad) + "Expected $DATATYPE in ['F', 'D']. 
" "Got 'invalid'", + scprep.io.fcs._parse_fcs_header, + meta_bad, + ) meta_bad = copy.deepcopy(meta) for byteord, endian in zip(["4,3,2,1", "1,2,3,4"], [">", "<"]): - meta_bad['$BYTEORD'] = byteord - assert scprep.io.fcs._parse_fcs_header(meta_bad)['$ENDIAN'] == endian - meta_bad['$BYTEORD'] = "invalid" + meta_bad["$BYTEORD"] = byteord + assert scprep.io.fcs._parse_fcs_header(meta_bad)["$ENDIAN"] == endian + meta_bad["$BYTEORD"] = "invalid" assert_raise_message( ValueError, - "Expected $BYTEORD in ['1,2,3,4', '4,3,2,1']. " - "Got 'invalid'", - scprep.io.fcs._parse_fcs_header, meta_bad) + "Expected $BYTEORD in ['1,2,3,4', '4,3,2,1']. " "Got 'invalid'", + scprep.io.fcs._parse_fcs_header, + meta_bad, + ) def test_parse_header(): From e037c1a5e23a2a7efd0e774e2c75938c2a6f6f30 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 29 Oct 2019 16:31:25 -0400 Subject: [PATCH 10/10] make autoblack quiet --- autoblack.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autoblack.sh b/autoblack.sh index 2e26e4db..cfbaf2b4 100644 --- a/autoblack.sh +++ b/autoblack.sh @@ -6,8 +6,8 @@ set -e files=\$(git diff --staged --name-only --diff-filter=d -- "*.py") for file in \$files; do - black \$file - git add \$file + black -q \$file + git add \$file done EOF chmod +x .git/hooks/pre-commit
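
To make the header handling in PATCH 07 concrete: `_parse_fcs_header` normalizes `$ENDIAN`, `$PAR`, and `$DATATYPE`, and `_fcsextract` joins them into a single `struct` format string that is unpacked once per event, `$TOT` times in total. Below is a self-contained sketch of that unpacking step with fabricated metadata (in scprep these values come from the FCS file's TEXT segment):

```python
# Toy illustration of the struct-based event unpacking in scprep.io.fcs.
# The metadata here is made up; scprep reads it from the FCS header.
import struct

meta = {"$ENDIAN": "<", "$PAR": 3, "$DATATYPE": "f", "$TOT": 2}
fmt = meta["$ENDIAN"] + str(meta["$PAR"]) + meta["$DATATYPE"]  # "<3f"
datasize = struct.calcsize(fmt)  # three little-endian floats: 12 bytes per event

# Fake DATA segment: two events of three channels each
raw = struct.pack(fmt, 1.0, 2.0, 3.0) + struct.pack(fmt, 4.0, 5.0, 6.0)
events = [
    struct.unpack(fmt, raw[i * datasize : (i + 1) * datasize])
    for i in range(meta["$TOT"])
]
print(events)  # [(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)]
```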