In [1]:
import scanpy as sc
import pandas as pd
import os
import fast_matrix_market
import polars as pl
import scipy as sp


In [2]:
# Get folders in droplet
folders = os.listdir("droplet")
folders = [f for f in folders if os.path.isdir("droplet/"+f)]



path = "droplet/Bladder-10X_P4_3/"
adata = sc.read_10x_mtx(path, var_names='gene_symbols', cache=True)
adata



AnnData object with n_obs × n_vars = 149 × 23433
    var: 'gene_ids'

In [14]:
def read_scrnaseq_faster(
   path: str
   )-> sc.AnnData:
    """
    Read a sparse matrix in Matrix Market format and two CSV files with gene and cell metadata
    into an AnnData object.
    
    Args:
        path: Path to the directory containing the matrix.mtx, genes.tsv, and barcodes.tsv files.
        
    Returns:
        An AnnData object with the matrix, gene metadata, and cell metadata.

    """
    mtx_file = os.path.join(path, "matrix.mtx")
    gene_info = os.path.join(path, "genes.tsv")
    cell_metadata = os.path.join(path, "barcodes.tsv")
    
    
    # Read the .mtx file into a sparse matrix using the fast_matrix_market package (master than scanpy, uses multiprocessing)
    mtx = fast_matrix_market.mmread(mtx_file)

    # Convert the sparse matrix to a CSR matrix
    # Otherwise you will not be able to use it with scanpy
    if isinstance(mtx, sp.sparse.coo.coo_matrix):
        mtx = mtx.tocsr()
    
    # Create an AnnData object
    adata = sc.AnnData(X=mtx.T)

    # Polars is faster than pandas for reading test files
    # Read the gene names and cell names into the AnnData object
    
    adata.var = pl.read_csv(gene_info, separator= '\t', has_header=False).to_pandas()
    
    # Read the cell names and cell metadata into the AnnData object
    adata.obs = pl.read_csv(cell_metadata, separator= '\t', has_header=False).to_pandas()
    return adata


  if isinstance(mtx, sp.sparse.coo.coo_matrix):


AnnData object with n_obs × n_vars = 149 × 23433
    obs: 'column_1'
    var: 'column_1', 'column_2'

In [15]:
# Time the function
import time

time_start = time.time()
sc.read_10x_mtx(path, var_names='gene_symbols', cache=True)
time_end = time.time()
print("Time for scanpy: ", time_end - time_start)

time_start = time.time()
adata = read_scrnaseq_faster(path)
time_end = time.time()
print("Time for fast_matrix_market: ", time_end - time_start)



Time for scanpy:  0.03070998191833496
Time for fast_matrix_market:  0.024509191513061523


  if isinstance(mtx, sp.sparse.coo.coo_matrix):
