In [1]:
import gzip
import os
import shutil
import time

import anndata
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sparse
from scipy.io import mmwrite

In [2]:
FILE = "datasets/10XGenomics/Targeted_SC3v3_Human_Glioblastoma_Neuroscience_filtered_feature_bc_matrix.h5"

In [3]:
def get_sizes(filename):
    """Gzip ``filename`` alongside itself and report both on-disk sizes.

    A compressed copy is written to ``"<filename>.gz"`` (opened in ``'wb'``
    mode, so an existing file at that path is overwritten). The original
    file is left in place.

    Parameters
    ----------
    filename : str
        Path of an existing file to measure and compress.

    Returns
    -------
    tuple[int, int]
        ``(size, gzipped_size)`` in bytes, as reported by ``os.path.getsize``.
    """
    gzipped_filename = f"{filename}.gz"
    size = os.path.getsize(filename)

    # Stream the copy in chunks instead of loading the whole file with
    # f_in.read(), so memory use stays flat for large matrix files.
    with open(filename, 'rb') as f_in, gzip.open(gzipped_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    gzipped_size = os.path.getsize(gzipped_filename)
    return size, gzipped_size

In [4]:
def CSR(matrix, base_filename):
    """Save ``matrix`` as a CSR ``.npz`` file and measure its size.

    The matrix is written to ``"<base_filename>.csr.npz"`` with
    ``sparse.save_npz`` and then passed to ``get_sizes``, which also
    produces a ``.gz`` copy of that file.

    Parameters
    ----------
    matrix
        Input matrix. Used as-is when it is already a ``sparse.csr_matrix``;
        otherwise it is converted with ``sparse.csr_matrix(matrix)``.
    base_filename : str
        Path prefix for the output file (no extension).

    Returns
    -------
    dict
        Format name, file names, raw and gzipped sizes (bytes and
        human-readable MB strings) and the compression ratio string.
    """
    # Only convert when the input is not already in CSR layout.
    as_csr = matrix if isinstance(matrix, sparse.csr_matrix) else sparse.csr_matrix(matrix)

    # NOTE(review): save_npz is called with its default options; if the
    # default is compressed output, a gzip ratio near 1.0x is expected.
    out_path = f"{base_filename}.csr.npz"
    sparse.save_npz(out_path, as_csr)

    raw_bytes, gz_bytes = get_sizes(out_path)

    report = {'format': 'CSR', 'filename': out_path}
    report['size'] = raw_bytes
    report['size_human'] = f"{raw_bytes / (1024 * 1024):.2f} MB"
    report['gzipped_filename'] = f"{out_path}.gz"
    report['gzipped_size'] = gz_bytes
    report['gzipped_size_human'] = f"{gz_bytes / (1024 * 1024):.2f} MB"
    report['compression_ratio'] = f"{raw_bytes / gz_bytes:.2f}x"
    return report

In [5]:
def CSC(matrix, base_filename):
    """Save ``matrix`` as a CSC ``.npz`` file and measure its size.

    The matrix is written to ``"<base_filename>.csc.npz"`` with
    ``sparse.save_npz`` and then passed to ``get_sizes``, which also
    produces a ``.gz`` copy of that file.

    Parameters
    ----------
    matrix
        Input matrix. Sparse inputs are converted with ``.tocsc()``;
        anything else (e.g. a dense array) is converted with
        ``sparse.csc_matrix(matrix)``, mirroring how ``CSR`` accepts
        non-sparse input.
    base_filename : str
        Path prefix for the output file (no extension).

    Returns
    -------
    dict
        Format name, file names, raw and gzipped sizes (bytes and
        human-readable MB strings) and the compression ratio string.
    """
    # Dense inputs have no .tocsc(); build the CSC matrix explicitly so this
    # function accepts the same kinds of input as CSR().
    if sparse.issparse(matrix):
        csc_matrix = matrix.tocsc()
    else:
        csc_matrix = sparse.csc_matrix(matrix)

    csc_filename = f"{base_filename}.csc.npz"
    sparse.save_npz(csc_filename, csc_matrix)

    csc_size, csc_gzipped_size = get_sizes(csc_filename)

    return {
        'format': 'CSC',
        'filename': csc_filename,
        'size': csc_size,
        'size_human': f"{csc_size / (1024 * 1024):.2f} MB",
        'gzipped_filename': f"{csc_filename}.gz",
        'gzipped_size': csc_gzipped_size,
        'gzipped_size_human': f"{csc_gzipped_size / (1024 * 1024):.2f} MB",
        'compression_ratio': f"{csc_size / csc_gzipped_size:.2f}x"
    }

In [6]:
def MTX(matrix, base_filename):
    """Save ``matrix`` with ``mmwrite`` and measure its size.

    The matrix is written to ``"<base_filename>.mtx"`` and then passed to
    ``get_sizes``, which also produces a ``.gz`` copy of that file.

    Parameters
    ----------
    matrix
        Matrix handed directly to ``mmwrite`` (no conversion is done here).
    base_filename : str
        Path prefix for the output file (no extension).

    Returns
    -------
    dict
        Format name, file names, raw and gzipped sizes (bytes and
        human-readable MB strings) and the compression ratio string.
    """
    out_path = f"{base_filename}.mtx"
    mmwrite(out_path, matrix)

    raw_bytes, gz_bytes = get_sizes(out_path)

    report = {'format': 'MTX', 'filename': out_path}
    report['size'] = raw_bytes
    report['size_human'] = f"{raw_bytes / (1024 * 1024):.2f} MB"
    report['gzipped_filename'] = f"{out_path}.gz"
    report['gzipped_size'] = gz_bytes
    report['gzipped_size_human'] = f"{gz_bytes / (1024 * 1024):.2f} MB"
    report['compression_ratio'] = f"{raw_bytes / gz_bytes:.2f}x"
    return report

In [10]:
def LOOM(matrix, base_filename, genes, cells):
    """Save ``matrix`` as a ``.loom`` file via AnnData and measure its size.

    An ``AnnData`` object is built from ``matrix`` with ``genes`` stored in
    ``var['gene_ids']`` and ``cells`` in ``obs['cell_ids']``, written to
    ``"<base_filename>.loom"`` with ``write_loom``, and then passed to
    ``get_sizes``, which also produces a ``.gz`` copy of that file.

    Parameters
    ----------
    matrix
        Expression matrix used as ``X``.
    base_filename : str
        Path prefix for the output file (no extension).
    genes
        Values for the ``gene_ids`` column of ``var``.
    cells
        Values for the ``cell_ids`` column of ``obs``.

    Returns
    -------
    dict
        Format name, file names, raw and gzipped sizes (bytes and
        human-readable MB strings) and the compression ratio string.
    """
    var_annotations = {'gene_ids': genes}
    obs_annotations = {'cell_ids': cells}
    container = anndata.AnnData(X=matrix, var=var_annotations, obs=obs_annotations)

    out_path = f"{base_filename}.loom"
    container.write_loom(out_path)

    raw_bytes, gz_bytes = get_sizes(out_path)

    report = {'format': 'LOOM', 'filename': out_path}
    report['size'] = raw_bytes
    report['size_human'] = f"{raw_bytes / (1024 * 1024):.2f} MB"
    report['gzipped_filename'] = f"{out_path}.gz"
    report['gzipped_size'] = gz_bytes
    report['gzipped_size_human'] = f"{gz_bytes / (1024 * 1024):.2f} MB"
    report['compression_ratio'] = f"{raw_bytes / gz_bytes:.2f}x"
    return report

In [11]:
def convert_save_measure_matrix(matrix, base_filename, genes, cells):
    """Write ``matrix`` in every supported format and print a size table.

    Runs the ``CSR``, ``CSC``, ``MTX`` and ``LOOM`` writers in that order,
    then prints one row per format with its raw size, gzipped size and
    compression ratio.

    Parameters
    ----------
    matrix
        Matrix passed unchanged to each writer.
    base_filename : str
        Path prefix shared by all output files.
    genes, cells
        Gene and cell identifiers; only the LOOM writer uses them.

    Returns
    -------
    dict
        Maps format name (``'CSR'``, ``'CSC'``, ``'MTX'``, ``'LOOM'``) to
        the report dict returned by the corresponding writer.
    """
    # Dict literals evaluate entries top to bottom, so the writers run in
    # the same order as the table rows are printed.
    results = {
        'CSR': CSR(matrix, base_filename),
        'CSC': CSC(matrix, base_filename),
        'MTX': MTX(matrix, base_filename),
        'LOOM': LOOM(matrix, base_filename, genes, cells),
    }

    header_row = f"{'Format':<10} {'Size':<15} {'Gzipped Size':<15} {'Compression Ratio':<20}"
    print("\nFormat comparison:")
    print(header_row)
    print("-" * 60)

    for format_name in results:
        info = results[format_name]
        print(f"{format_name:<10} {info['size_human']:<15} {info['gzipped_size_human']:<15} {info['compression_ratio']:<20}")

    return results

In [12]:
# Load the 10x Genomics HDF5 matrix referenced by FILE.
adata = sc.read_10x_h5(FILE)

# Output files are named after the input: directory and final extension removed.
base_name = os.path.splitext(os.path.basename(FILE))[0]

# Write every format into the current working directory and print the size table.
results = convert_save_measure_matrix(
    adata.X,
    base_name,
    adata.var_names.tolist(),
    adata.obs_names.tolist(),
)

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Converting to MTX format...


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.



Format comparison:
Format     Size            Gzipped Size    Compression Ratio   
------------------------------------------------------------
CSR        2.76 MB         2.75 MB         1.00x               
CSC        3.64 MB         3.62 MB         1.00x               
MTX        40.40 MB        5.67 MB         7.12x               
LOOM       3.99 MB         3.55 MB         1.12x               
