In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy import stats


def sc_simulation(bulk_sample, sample, num_cells, libsize):
    """
    TPM-based simulation of single-cell RNA-seq counts from one bulk sample.

    For every simulated cell a library size is drawn from
    Normal(mean=libsize, sd=libsize / 2), rounded, made non-negative with
    abs() and incremented by 1 (so it is always >= 1). That many reads are
    then sampled with replacement from the rows of `bulk_sample`, each row
    being picked with probability tpm / sum(tpm).

    Reads are attributed to row *positions*, not gene names, so a gene name
    that occurs on several rows keeps a separate count on each of its rows.

    Random numbers come from the module-level `np.random` functions, so
    `np.random.seed(...)` controls reproducibility.

    Parameters:
    -----------
    bulk_sample : pd.DataFrame
        DataFrame with columns 'gene' and 'tpm'. It is not modified.
    sample : str
        Sample identifier/name; used as the prefix of every cell name
    num_cells : int
        Number of cells to simulate
    libsize : int
        Mean library size for the normal distribution

    Returns:
    --------
    tuple of (scipy.sparse.csr_matrix, pd.DataFrame)
        The same count matrix in two forms, genes as rows (in the row order
        of `bulk_sample`) and cells as columns. The DataFrame is indexed by
        the 'gene' values and its columns are named
        '<sample>_cell_<n>', with n zero-padded to the number of digits of
        `num_cells`.

    Raises:
    -------
    ValueError
        If the 'tpm' column is empty or does not normalise to finite,
        non-negative probabilities (e.g. it sums to zero or contains NaN).
    """
    genes = bulk_sample['gene'].values

    # Convert TPM to a probability vector
    tpm = bulk_sample['tpm']
    prob = (tpm / tpm.sum()).to_numpy(dtype=float)
    if prob.size == 0 or not np.isfinite(prob).all() or (prob < 0).any():
        raise ValueError(
            "bulk_sample['tpm'] must be non-empty and normalise to finite, "
            "non-negative probabilities"
        )

    n_genes = len(genes)

    # Dense genes x cells matrix, filled one column per simulated cell
    count_matrix = np.zeros((n_genes, num_cells), dtype=int)

    # Cell names are zero-padded so they all have the same length
    padding_width = len(str(num_cells))
    cell_names = []

    for cell_idx in range(num_cells):
        # abs() folds negative draws back to positive; +1 guarantees at least one read
        sample_size = int(abs(round(np.random.normal(libsize, libsize / 2)))) + 1

        # Weighted sampling with replacement over row positions
        sampled_rows = np.random.choice(
            n_genes,
            size=sample_size,
            replace=True,
            p=prob
        )

        # Tally reads per row; minlength keeps rows that were never drawn at zero
        count_matrix[:, cell_idx] = np.bincount(sampled_rows, minlength=n_genes)

        cell_names.append(f"{sample}_cell_{str(cell_idx + 1).zfill(padding_width)}")

    # Sparse view of the counts
    sparse_matrix = csr_matrix(count_matrix)

    # DataFrame with gene names as index and cell names as columns
    # (for compatibility with R version)
    result_df = pd.DataFrame(
        count_matrix,
        index=genes,
        columns=cell_names
    )

    return sparse_matrix, result_df


# Alternative version that returns just the sparse matrix (more memory efficient)
def sc_simulation_sparse(bulk_sample, sample, num_cells, libsize):
    """
    TPM-based simulation of single-cell RNA-seq data from bulk sample.
    Returns only the sparse matrix; no dense matrix or DataFrame is built.

    For every simulated cell a library size is drawn from
    Normal(mean=libsize, sd=libsize / 2), rounded, made non-negative with
    abs() and incremented by 1. That many reads are sampled with replacement
    from the rows of `bulk_sample` with probability tpm / sum(tpm). Reads are
    attributed to row positions, so repeated gene names keep separate rows.

    Random numbers come from the module-level `np.random` functions.

    Parameters:
    -----------
    bulk_sample : pd.DataFrame
        DataFrame with a 'tpm' column (one row per gene). It is not modified.
    sample : str
        Accepted so the signature matches sc_simulation; not read here
        because no cell names are produced.
    num_cells : int
        Number of cells to simulate
    libsize : int
        Mean library size for the normal distribution

    Returns:
    --------
    scipy.sparse.csr_matrix
        Count matrix with one row per row of `bulk_sample` (same order) and
        one column per simulated cell.

    Raises:
    -------
    ValueError
        If the 'tpm' column is empty or does not normalise to finite,
        non-negative probabilities (e.g. it sums to zero or contains NaN).
    """
    # Convert TPM to a probability vector
    tpm = bulk_sample['tpm']
    prob = (tpm / tpm.sum()).to_numpy(dtype=float)
    if prob.size == 0 or not np.isfinite(prob).all() or (prob < 0).any():
        raise ValueError(
            "bulk_sample['tpm'] must be non-empty and normalise to finite, "
            "non-negative probabilities"
        )

    n_genes = prob.size

    # Coordinate-format triplets, accumulated cell by cell
    row_indices = []
    col_indices = []
    data = []

    for cell_idx in range(num_cells):
        # abs() folds negative draws back to positive; +1 guarantees at least one read
        sample_size = int(abs(round(np.random.normal(libsize, libsize / 2)))) + 1

        # Weighted sampling with replacement over row positions
        sampled_rows = np.random.choice(
            n_genes,
            size=sample_size,
            replace=True,
            p=prob
        )

        # Only rows that were actually drawn become stored entries
        hit_rows, counts = np.unique(sampled_rows, return_counts=True)
        row_indices.extend(hit_rows)
        col_indices.extend([cell_idx] * len(hit_rows))
        data.extend(counts)

    # Create sparse matrix directly from the triplets
    sparse_matrix = csr_matrix(
        (data, (row_indices, col_indices)),
        shape=(n_genes, num_cells)
    )

    return sparse_matrix


# Example usage function
def simulate_from_bulk_tpm(bulk_tpm_df, sample_name, num_cells, libsize):
    """
    Convenience wrapper around sc_simulation that hands back only the
    DataFrame form of the simulated counts (intended to match the R
    interface more closely).

    Parameters:
    -----------
    bulk_tpm_df : pd.DataFrame
        DataFrame with columns 'gene' and 'tpm'
    sample_name : str
        Sample identifier
    num_cells : int
        Number of cells to simulate
    libsize : int
        Mean library size

    Returns:
    --------
    pd.DataFrame
        Count matrix with genes as index and cells as columns
    """
    # sc_simulation yields (sparse matrix, DataFrame); the sparse half is discarded.
    _, counts_df = sc_simulation(bulk_tpm_df, sample_name, num_cells, libsize)
    return counts_df


# Batch simulation function (similar to R's map_df approach)
def batch_sc_simulation(bulk_samples_list, sample_names, num_cells, libsize):
    """
    Simulate multiple samples and merge into single matrix.

    Each bulk sample is passed to sc_simulation together with the name at the
    same position in `sample_names`; the per-sample count DataFrames are then
    joined column-wise on their gene index.

    Parameters:
    -----------
    bulk_samples_list : list of pd.DataFrame
        List of bulk TPM DataFrames
    sample_names : list of str
        List of sample names, one per entry of `bulk_samples_list`
    num_cells : int
        Number of cells per sample
    libsize : int
        Mean library size

    Returns:
    --------
    pd.DataFrame
        Combined integer count matrix with all cells. Genes missing from a
        sample's bulk table get a count of 0 for that sample's cells.

    Raises:
    -------
    ValueError
        If the two lists differ in length, or if they are empty.
    """
    bulk_samples = list(bulk_samples_list)
    names = list(sample_names)

    # zip() would silently drop the surplus entries of the longer list
    if len(bulk_samples) != len(names):
        raise ValueError(
            f"bulk_samples_list has {len(bulk_samples)} entries but "
            f"sample_names has {len(names)}; they must match one-to-one"
        )
    if not bulk_samples:
        raise ValueError("No samples to simulate: bulk_samples_list is empty")

    all_results = [
        sc_simulation(bulk_sample, sample_name, num_cells, libsize)[1]
        for bulk_sample, sample_name in zip(bulk_samples, names)
    ]

    # Combine all results. The column-wise concat aligns on the gene index and
    # leaves NaN (forcing float dtype) where a sample lacks a gene, so those
    # cells are set to 0 and the matrix is cast back to integer counts.
    combined_df = pd.concat(all_results, axis=1)
    return combined_df.fillna(0).astype(int)

In [6]:
import pandas as pd
import numpy as np

# Seed NumPy's module-level generator (the one sc_simulation draws from) so
# this example produces the same counts on every Restart & Run All.
np.random.seed(42)

# Example: Create a bulk TPM sample
bulk_sample = pd.DataFrame({
    'gene': ['GENE1', 'GENE2', 'GENE3', 'GENE4', 'GENE5', 'GENE6'],
    'tpm': [100, 200, 150, 300, 250, 50]
})

# Simulate 10 cells with mean library size of 5000
sparse_matrix, count_df = sc_simulation(
    bulk_sample=bulk_sample,
    sample='sample1',
    num_cells=10,
    libsize=5000
)

print("Count matrix shape:", count_df.shape)
print("\nFirst few rows and columns:")
print(count_df.iloc[:5, :5])
print("\nSparse matrix info:")
print(f"Shape: {sparse_matrix.shape}")
# Density = stored (non-zero) entries divided by the total number of matrix cells
print(f"Density: {sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1]):.4f}")

Count matrix shape: (6, 10)

First few rows and columns:
       sample1_cell_01  sample1_cell_02  sample1_cell_03  sample1_cell_04  \
GENE1              799              262              341              977   
GENE2             1577              471              623             1848   
GENE3             1217              373              447             1425   
GENE4             2407              709              936             2835   
GENE5             2028              576              719             2326   

       sample1_cell_05  
GENE1              173  
GENE2              318  
GENE3              246  
GENE4              519  
GENE5              404  

Sparse matrix info:
Shape: (6, 10)
Density: 1.0000


In [7]:
# Show the complete genes-by-cells count table returned by sc_simulation.
count_df

Unnamed: 0,sample1_cell_01,sample1_cell_02,sample1_cell_03,sample1_cell_04,sample1_cell_05,sample1_cell_06,sample1_cell_07,sample1_cell_08,sample1_cell_09,sample1_cell_10
GENE1,799,262,341,977,173,520,559,471,386,450
GENE2,1577,471,623,1848,318,1161,1108,910,794,909
GENE3,1217,373,447,1425,246,828,807,660,620,681
GENE4,2407,709,936,2835,519,1675,1617,1397,1135,1282
GENE5,2028,576,719,2326,404,1328,1378,1158,984,1115
GENE6,400,106,161,498,77,278,285,233,194,212


In [5]:
# Show the repr of the csr_matrix returned by sc_simulation (dtype, stored entries, shape).
sparse_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 60 stored elements and shape (6, 10)>