In [3]:
import scanpy as sc
import sys
import os 
sys.path.append(os.path.expanduser(f"~/SSS_mount/insituCNV/InSituCNV"))
import insitucnv as icv

# Load AnnData object

In [5]:
# Location of the simulated-CNV lung organoid dataset (.h5ad).
# NOTE(review): absolute, user-specific path; the import cell resolves the same
# repository via "~/SSS_mount/..." — consider unifying the two.
adata_path = (
    "/home/augusta/SSS_mount/insituCNV/InSituCNV/Figure2/01_Simulate_CNVs/"
    "lung_organoids_cnvclust_simulatedCNVs_310125_simulation3_simulationv2_rho6.h5ad"
)

# Read the file into memory as an AnnData object.
adata = sc.read_h5ad(adata_path)

In [6]:
# Display the AnnData summary (dimensions and the obs/var annotation names) to confirm the load.
adata

AnnData object with n_obs × n_vars = 1268 × 25691
    obs: 'organism_ontology_term_id', 'tissue_ontology_term_id', 'tissue_type', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'suspension_type', 'model_id', 'sample_id', 'Phase', 'level_1', 'level_2', 'level_3', 'CountUMIs', 'CountGenes', 'X.Mitochondrial', 'NoveltyScore', 'nCount_SCT', 'nFeature_SCT', 'orig.ident', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'cnv_leiden', 'simulated_subclone'
    var: 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_referenc

# Subsampling counts

In [8]:
# Make the CNV-simulated counts the main matrix of the object.
# A copy of the layer is assigned, so adata.X is not the layer object itself.
simulated_counts = adata.layers['CNV_simulated'].copy()
adata.X = simulated_counts

In [7]:
# Directory that receives one .h5ad file per count-subsampling condition.
destination = '/home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions'

# Fraction of counts passed to the subsampling routine for each condition (1 = 100%).
fractions = [1, 0.7, 0.5, 0.2, 0.1, 0.05, 0.03, 0.02, 0.01]

# Simulation identifier embedded in every output file name.
# Change '03' (10-20Mb) to '01' (1-5Mb), or '02' (5-10 Mb), depending on dataset
simulation_id = '03'

# One output name per fraction, e.g. 0.7 -> 'CNV_simulation_03_subsampled_counts_70'.
# Deriving the names from `fractions` keeps both lists aligned and means the
# simulation id only has to be edited in one place. round() guards against
# floating-point noise in `fraction * 100`.
data_names = [
    f"CNV_simulation_{simulation_id}_subsampled_counts_{round(fraction * 100)}"
    for fraction in fractions
]

# Write one subsampled dataset per (fraction, name) pair.
for fraction, data_name in zip(fractions, data_names):
    icv.tl.subsample_counts(adata=adata, fraction=fraction, destination=destination, data_name=data_name)

subsampled data (100% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_03_subsampled_counts_100.h5ad
subsampled data (70% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_03_subsampled_counts_70.h5ad
subsampled data (50% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_03_subsampled_counts_50.h5ad
subsampled data (20% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_03_subsampled_counts_20.h5ad
subsampled data (10% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_03_subsampled_counts_10.h5ad
subsampled data (5% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_03_subsa

# Subsampling gene panel size for each count condition

In [8]:
# Directory holding the count-subsampled inputs; the gene-subsampled outputs are written here too.
destination = '/home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions'

# Simulation identifier embedded in every file name.
# Change '03' (10-20Mb) to '01' (1-5Mb), or '02' (5-10 Mb), depending on dataset
simulation_id = '03'

# Percentage suffixes of the count-subsampled datasets to read back in.
count_percentages = [100, 70, 50, 20, 10, 5, 3, 2, 1]

# Input dataset names, built from a single simulation id so that switching
# dataset requires one edit instead of nine.
data_names_counts = [
    f"CNV_simulation_{simulation_id}_subsampled_counts_{percentage}"
    for percentage in count_percentages
]

# Gene panel sizes to generate for every count condition ('all' is passed through as-is).
gene_panel_size = ['all', 20000, 15000, 10000, 5000, 1000, 500]

# Running index of the (count condition, panel size) combination, used only for progress output.
n = 0
for data_name in data_names_counts:

    # Read in the subsampled counts dataset
    adata_path = f"{destination}/{data_name}.h5ad"
    adata = sc.read_h5ad(adata_path)  # same reader as the initial load cell

    # Subsample the gene panel sizes
    for panel_size in gene_panel_size:
        combined_data_name = f"{data_name}_genes_{panel_size}"
        n += 1
        print(n, combined_data_name, ':')
        icv.tl.subsample_genes(adata=adata, gene_panel_size=panel_size, destination=destination, data_name=combined_data_name)

1 CNV_simulation_03_subsampled_counts_100_genes_all :
subsampled data (with panel size:25691 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_03_subsampled_counts_100_genes_all.h5ad
2 CNV_simulation_03_subsampled_counts_100_genes_20000 :
subsampled data (with panel size:20000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_03_subsampled_counts_100_genes_20000.h5ad
3 CNV_simulation_03_subsampled_counts_100_genes_15000 :
subsampled data (with panel size:15000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_03_subsampled_counts_100_genes_15000.h5ad
4 CNV_simulation_03_subsampled_counts_100_genes_10000 :
subsampled data (with panel size:10000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_03_subsampled_counts_1