In [4]:
import numpy as np
import pandas as pd
import scanpy as sc
import os
from scipy.sparse import issparse

In [14]:
# Perturbation datasets (.h5ad) to summarize. Every file sits in the same
# folder, so only the dataset names are listed; commented-out names are
# datasets that are deliberately left out.
h5ad_files = [
    "data/perturbData/" + dataset_name + ".h5ad"
    for dataset_name in (
        "FrangiehIzar2021_RNA",
        "TianKampmann2021_CRISPRi",
        "ReplogleWeissman2022_K562_essential",
        # "FrangiehIzar2021_protein",  # Note: This contains protein data
        "ReplogleWeissman2022_rpe1",
        "TianKampmann2021_CRISPRa",
        # "ReplogleWeissman2022_K562_gwps",  # This dataset is too large
    )
]


# --- Main Analysis Function ---
def _nonzero_counts(X):
    """Count truly non-zero entries of a 2-D matrix without densifying it.

    Args:
        X: A scipy sparse matrix/array or anything ``np.asarray`` turns into a
            2-D array.

    Returns:
        Tuple ``(total, per_row, per_col)``: the overall number of non-zero
        entries and 1-D integer arrays with the counts per row and per column.
    """
    if issparse(X):
        # Compare against zero instead of relying on ``.nnz``: ``.nnz`` counts
        # *stored* entries, which includes explicitly stored zeros.
        mask = X != 0
        per_row = np.asarray(mask.sum(axis=1)).ravel()
        per_col = np.asarray(mask.sum(axis=0)).ravel()
    else:
        dense = np.asarray(X)
        per_row = np.count_nonzero(dense, axis=1)
        per_col = np.count_nonzero(dense, axis=0)
    return int(per_row.sum()), per_row, per_col


def analyze_h5ad_file(filepath, pert_col='perturbation', control_label='control',
                      output_dir="data/data_summary"):
    """Summarize one ``.h5ad`` dataset and write the report files.

    Prints dimension and sparsity statistics and the control-cell check, and
    writes four files into ``output_dir``, each named after the input file:
    ``*_summary.txt``, ``*_perturbation_counts.tsv``,
    ``*_GENES_nonzero_counts.tsv`` and ``*_CELLS_nonzero_counts.tsv``.

    All statistics are computed from ``adata.X`` directly (sparse matrices are
    never converted to a dense array), and each count is computed only once.

    Args:
        filepath: Path to the ``.h5ad`` file.
        pert_col: Column of ``adata.obs`` holding the perturbation label.
        control_label: Value of ``pert_col`` that marks control cells.
        output_dir: Folder for the report files; created if it is missing.

    Returns:
        None. A missing input file is reported on stdout and skipped.
    """
    if not os.path.exists(filepath):
        print(f"  ERROR: File not found at {filepath}")
        return None
    print(f"Processing file: {filepath}")

    # The output folder may not exist yet (e.g. on a fresh checkout).
    os.makedirs(output_dir, exist_ok=True)
    out_prefix = f"{output_dir}/{os.path.basename(filepath)}"

    summary_parts = []

    def report(line):
        # Echo the line to the notebook and keep it for the summary file.
        print(line)
        summary_parts.append(line + "\n")

    adata = sc.read_h5ad(filepath)

    # 1. Cell Count and Gene/Feature Count
    n_cells, n_genes = adata.shape
    report(f"  1. Dimensions: {n_cells} cells x {n_genes} genes")

    # 2-7. Sparsity, all derived from a single pass over the matrix
    total_elements = n_cells * n_genes
    non_zeros, nonzero_per_cell, nonzero_per_gene = _nonzero_counts(adata.X)
    total_zeros = total_elements - non_zeros
    # Guard both ratios so an empty matrix reports 0 instead of raising.
    sparsity = (total_zeros / total_elements) * 100 if total_elements > 0 else 0
    nonzero_ratio = non_zeros / total_elements if total_elements > 0 else 0.0
    mean_per_cell = nonzero_per_cell.mean() if nonzero_per_cell.size else 0.0
    mean_per_gene = nonzero_per_gene.mean() if nonzero_per_gene.size else 0.0

    report(f"  2. Total elements: {total_elements}")
    report(f"  3. Total non-zero elements: {non_zeros}")
    report(f"  4. Non-zero elements ratio: {nonzero_ratio:.2%}")
    report(f"  5. Sparsity: {sparsity:.2f}%")
    report(f"  6. Average non-zero elements per cell: {mean_per_cell:.2f}")
    report(f"  7. Average non-zero elements per gene: {mean_per_gene:.2f}")
    summary_parts.append("\n")

    # 8-11. Perturbation information
    if pert_col in adata.obs.columns:
        labels = adata.obs[pert_col].astype(str)  # Use strings for comparison
        perturbation_counts = adata.obs[pert_col].value_counts()

        # Items 8 and 9 go to the summary file only (item 8 is a multi-line table).
        summary_parts.append(
            f"  8.  Perturbation counts (top 10): \n{perturbation_counts.head(10)}\n\n"
        )
        perturbation_counts.to_csv(f"{out_prefix}_perturbation_counts.tsv", sep="\t")
        summary_parts.append(f"  9. Total unique perturbations: {len(perturbation_counts)}\n")

        if control_label in set(labels.unique()):
            report(f"  10. Found control label: '{control_label}'")
            # Count on the string-cast labels so it agrees with the check above.
            control_count = int((labels == control_label).sum())
            report(f"  11. Control count: {control_count}")
        else:
            report(f"  10. WARNING: Control label '{control_label}' not found in perturbations.")
    else:
        # Keep going so one unusual dataset does not abort a whole batch.
        report(f"  8. WARNING: Column '{pert_col}' not found in adata.obs; "
               f"perturbation statistics skipped.")

    # Create and save the summary file
    summary_file = f"{out_prefix}_summary.txt"
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write("".join(summary_parts))
    print(f"Summary saved to {summary_file}")

    # Non-zero counts per gene and per cell. The tables are built straight
    # from the index so duplicated gene/cell names keep one row each.
    genes_count_df = pd.DataFrame({
        'Genes': list(adata.var.index),
        'NonZero_Count': nonzero_per_gene,
    })
    cells_count_df = pd.DataFrame({
        'Cells': [f'Row_{cell}' for cell in adata.obs.index],
        'NonZero_Count': nonzero_per_cell,
    })
    genes_count_df.to_csv(f"{out_prefix}_GENES_nonzero_counts.tsv", sep="\t",
                          index=False)
    cells_count_df.to_csv(f"{out_prefix}_CELLS_nonzero_counts.tsv", sep="\t",
                          index=False)


# Summarize every configured dataset in turn; the blank lines keep the
# per-dataset reports visually separated in the cell output.
for h5ad_path in h5ad_files:
    analyze_h5ad_file(h5ad_path)
    print("\n\n")


Processing file: data/perturbData/FrangiehIzar2021_RNA.h5ad
  1. Dimensions: 218331 cells x 23712 genes
  2. Total elements: 5177064672
  3. Total non-zero elements: 740736244
  4. Non-zero elements ratio: 14.31%
  5. Sparsity: 85.69%
  6. Average non-zero elements per cell: 3392.72
  7. Average non-zero elements per gene: 31238.88
  10. Found control label: 'control'
  11. Control count: 57605
Summary saved to data/data_summary/FrangiehIzar2021_RNA.h5ad_summary.txt



Processing file: data/perturbData/TianKampmann2021_CRISPRi.h5ad
  1. Dimensions: 32300 cells x 33538 genes
  2. Total elements: 1083277400
  3. Total non-zero elements: 143124189
  4. Non-zero elements ratio: 13.21%
  5. Sparsity: 86.79%
  6. Average non-zero elements per cell: 4431.09
  7. Average non-zero elements per gene: 4267.52
  10. Found control label: 'control'
  11. Control count: 437
Summary saved to data/data_summary/TianKampmann2021_CRISPRi.h5ad_summary.txt



Processing file: data/perturbData/ReplogleWeissm

### scImpute method implementation

In [None]:
# Datasets whose expression matrices are exported as CSV.
h5ad_files = [
    "data/perturbData/FrangiehIzar2021_RNA.h5ad",
    "data/perturbData/TianKampmann2021_CRISPRi.h5ad",
    "data/perturbData/ReplogleWeissman2022_K562_essential.h5ad",
    "data/perturbData/ReplogleWeissman2022_rpe1.h5ad",
    "data/perturbData/TianKampmann2021_CRISPRa.h5ad",
    # "data/perturbData/ReplogleWeissman2022_K562_gwps.h5ad",  # This dataset is too large
]

# Folder prefix (with trailing slash) for the exported CSV files; this is the
# same folder the inputs are read from.
file_path = "data/perturbData/"

for file in h5ad_files:
    # Skip a missing dataset instead of aborting the whole batch, matching the
    # existence check done by analyze_h5ad_file.
    if not os.path.exists(file):
        print(f"  ERROR: File not found at {file}")
        continue
    print(f"Processing file: {file}")
    adata = sc.read_h5ad(file)

    # Build a genes × cells DataFrame from .X.
    # NOTE(review): .X is exported as-is; confirm it really holds raw counts
    # (and not normalized values) for each dataset before using the output.
    # The matrix is densified here, which needs memory for cells × genes values.
    counts_df = pd.DataFrame(
        adata.X.toarray() if issparse(adata.X) else adata.X,
        index=adata.obs.index,
        columns=adata.var.index
    ).T

    # Write a comma-separated file (genes × cells). Only the final extension is
    # stripped, so dots elsewhere in a file name do not truncate the output name.
    dataset_name = os.path.splitext(os.path.basename(file))[0]
    counts_df.to_csv(f"{file_path}{dataset_name}_raw_counts.csv")
