In [89]:
import scanpy as sc
import os
import pandas as pd

# Load AnnData object

In [90]:
# Path to the .h5ad file holding the simulated-CNV dataset (split over several
# literals for readability; the resulting string is unchanged).
adata_path = (
    "/home/augusta/SSS_mount/insituCNV/InSituCNV/Figure2/"
    "01_Simulate_CNVs_in_spatial_data/Simulate_CNVs_vascular_normal/"
    "lung_organoids_cnvclust_simulatedCNVs_121224_simulationv2_rho6.h5ad"
)

# Read the AnnData object that the subsampling cells below work on.
adata = sc.read_h5ad(adata_path)

# Functions to subsample, normalize and save the data

In [91]:
import scanpy as sc
import numpy as np

def subsample_counts(adata, fraction, destination, data_name):
    """
    Downsample the total counts of an AnnData object, normalize it and save it.

    The count matrix in `adata.X` is downsampled to `fraction` of its total counts
    with `sc.pp.downsample_counts` (fixed `random_state=42`). The downsampled counts
    are stored in the layer 'CNV_simulated_raw'; a total-count normalized and
    log1p-transformed version is stored in the layer 'CNV_simulated' and in `.X`.
    All work is done on a copy, so the input object is left untouched.

    Parameters:
        adata (AnnData object): AnnData object that should be subsampled. Its `.X`
            is used as the count matrix, so the caller must make sure `.X` holds
            the counts to downsample.
        fraction (nbr): Fraction within [0, 1] of the total counts to keep.
            With `fraction == 1` no downsampling is performed.
        destination (str): Path to the directory where the AnnData object should be saved
        data_name (str): Name of the AnnData object (file name without '.h5ad').

    Returns:
        None

    Raises:
        ValueError: If `fraction` is not within [0, 1] (this includes NaN).
    """

    # Validate first so an invalid request fails before the (potentially large)
    # AnnData object is copied. The chained comparison also rejects NaN.
    if not 0 <= fraction <= 1:
        raise ValueError(f"`fraction` needs to be within [0, 1], not {fraction}")

    layer_name = 'CNV_simulated'
    new_adata = adata.copy()

    if fraction != 1:
        total_counts = int(fraction * new_adata.X.sum())
        # Downsample the working copy in place: `copy=True` on the original would
        # create a second full copy of the object only to read its `.X`.
        sc.pp.downsample_counts(
            new_adata,
            total_counts=total_counts,
            random_state=42,
        )

    # `.X` now holds the (possibly downsampled) counts.
    new_adata.layers[layer_name] = new_adata.X.copy()

    # Keep the un-normalized counts, then normalize and log transform the data
    new_adata.layers[f"{layer_name}_raw"] = new_adata.layers[layer_name].copy()
    sc.pp.normalize_total(new_adata, layer=layer_name)
    sc.pp.log1p(new_adata, layer=layer_name)

    new_adata.X = new_adata.layers[layer_name].copy()

    # Save the anndata object
    new_adata.write(destination + '/' + data_name + '.h5ad', compression='gzip')
    print(f'subsampled data ({fraction*100:.0f}% counts) saved as {destination}/{data_name}.h5ad')


In [92]:
import numpy as np 

def subsample_genes(adata, gene_panel_size, destination, data_name):
    """
    Randomly subsample the genes of an AnnData object, normalize it and save it.

    A gene panel of the requested size is drawn without replacement (fixed seed 42).
    The un-normalized counts of the panel are stored in the layer 'CNV_simulated_raw';
    a total-count normalized and log1p-transformed version is stored in the layer
    'CNV_simulated' and in `.X`.

    The counts are taken from the layer 'CNV_simulated_raw' when the input has one
    (e.g. a file written by `subsample_counts`, whose 'CNV_simulated' layer is
    already normalized and log-transformed). Otherwise 'CNV_simulated' is assumed to
    hold the counts. This keeps the raw layer raw and prevents the data from being
    normalized and log-transformed twice.

    Parameters:
        adata (AnnData object): AnnData object that should be subsampled
        gene_panel_size (nbr or 'all'): the size of the gene_panel; 'all' keeps
            every gene
        destination (str): Path to the directory where the AnnData object should be saved
        data_name (str): Name of the AnnData object (file name without '.h5ad').

    Returns:
        None

    Raises:
        ValueError: If `gene_panel_size` is larger than the number of genes in `adata`.
    """

    layer_name = 'CNV_simulated'
    raw_layer_name = f"{layer_name}_raw"
    random_state = 42

    # Calculate the number of genes to retain
    old_n_vars = adata.n_vars
    if gene_panel_size == 'all':
        gene_panel_size = old_n_vars
    new_n_vars = gene_panel_size

    if new_n_vars > old_n_vars:
        raise ValueError(
            f"`gene_panel_size` ({new_n_vars}) cannot exceed the number of genes in the data ({old_n_vars})"
        )

    # A private legacy RandomState yields the same draws as
    # `np.random.seed(42); np.random.choice(...)` without re-seeding the global RNG.
    rng = np.random.RandomState(random_state)
    var_indices = rng.choice(old_n_vars, size=new_n_vars, replace=False)

    # Subsample the data (genes end up in the order in which they were drawn)
    subsampled_adata = adata[:, var_indices].copy()

    # Prefer the preserved raw counts over an already normalized layer.
    if raw_layer_name in subsampled_adata.layers:
        source_layer = raw_layer_name
    else:
        source_layer = layer_name
    raw_counts = subsampled_adata.layers[source_layer].copy()

    # Normalize and log transform the data, starting from the raw counts
    subsampled_adata.layers[raw_layer_name] = raw_counts
    subsampled_adata.layers[layer_name] = raw_counts.copy()
    sc.pp.normalize_total(subsampled_adata, layer=layer_name)
    sc.pp.log1p(subsampled_adata, layer=layer_name)

    # Same convention as `subsample_counts`: `.X` mirrors the normalized layer.
    subsampled_adata.X = subsampled_adata.layers[layer_name].copy()

    # Save the anndata object
    subsampled_adata.write(destination + '/' + data_name + '.h5ad', compression='gzip')
    print(f'subsampled data (with panel size:{new_n_vars} genes) saved as {destination}/{data_name}.h5ad')


# Subsampling counts

In [93]:
# subsample_counts() downsamples whatever is stored in `adata.X`, so expose the
# simulated-CNV layer as the main matrix before running the count subsampling.
cnv_layer = 'CNV_simulated'
adata.X = adata.layers[cnv_layer].copy()

In [95]:
destination = '/home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions'

# Fractions of the total counts to keep; one .h5ad file is written per fraction.
fractions = [1, 0.7, 0.5, 0.2, 0.1, 0.05, 0.03, 0.02, 0.01]

# Each file name carries the kept percentage (1 -> '..._100', 0.05 -> '..._5').
# Deriving the names keeps them in step with `fractions`.
data_names = [
    f'CNV_simulation_01_subsampled_counts_{round(fraction * 100)}'
    for fraction in fractions
]

for fraction, data_name in zip(fractions, data_names):
    subsample_counts(
        adata=adata,
        fraction=fraction,
        destination=destination,
        data_name=data_name,
    )

subsampled data (100% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_100.h5ad
subsampled data (70% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_70.h5ad
subsampled data (50% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_50.h5ad
subsampled data (20% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_20.h5ad
subsampled data (10% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_10.h5ad
subsampled data (5% counts) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsa

# Subsampling gene panel size

In [84]:
destination = '/home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions'

# Candidate gene panel sizes ('all' keeps every gene) and the matching file names.
gene_panel_size = ['all', 20000, 15000, 10000, 5000, 1000, 500]
data_names = [f'CNV_simulated_subsampled_genes_{size}' for size in gene_panel_size]

# Only the first entry ('all') is processed in this cell.
first_sizes = gene_panel_size[:1]
first_names = data_names[:1]
for panel_size, data_name in zip(first_sizes, first_names):
    subsample_genes(
        adata=adata,
        gene_panel_size=panel_size,
        destination=destination,
        data_name=data_name,
    )

subsampled data (with panel size:25691 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulated_subsampled_genes_all.h5ad


# Subsampling gene panel size for each count condition

In [113]:
destination = '/home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions'

# Files written by the count-subsampling step (one per count fraction).
data_names_counts = ['CNV_simulation_01_subsampled_counts_100',
              'CNV_simulation_01_subsampled_counts_70', 
              'CNV_simulation_01_subsampled_counts_50',
              'CNV_simulation_01_subsampled_counts_20', 
              'CNV_simulation_01_subsampled_counts_10',
              'CNV_simulation_01_subsampled_counts_5', 
              'CNV_simulation_01_subsampled_counts_3', 
              'CNV_simulation_01_subsampled_counts_2', 
              'CNV_simulation_01_subsampled_counts_1']


# Gene panel sizes applied to every count condition ('all' keeps every gene).
gene_panel_size = ['all', 20000,15000,10000,5000,1000,500]


# Running number of the count/panel combination, used for progress output only.
n=0
for data_name in data_names_counts:

    # Read in the subsampled counts dataset.
    # Loop-local names are used on purpose: rebinding the notebook-level `adata` /
    # `adata_path` would leave `adata` pointing at the last subsampled dataset, so
    # re-running the count-subsampling cell would silently use the wrong input.
    counts_adata_path = f"{destination}/{data_name}.h5ad"
    counts_adata = sc.read(counts_adata_path)
    
    # Subsample the gene panel sizes
    for panel_size in gene_panel_size:
        combined_data_name = f"{data_name}_genes_{panel_size}"
        n += 1
        print(n, combined_data_name, ':')
        subsample_genes(adata=counts_adata, gene_panel_size=panel_size, destination=destination, data_name=combined_data_name)

1 CNV_simulation_01_subsampled_counts_100_genes_all :
subsampled data (with panel size:25691 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_100_genes_all.h5ad
2 CNV_simulation_01_subsampled_counts_100_genes_20000 :
subsampled data (with panel size:20000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_100_genes_20000.h5ad
3 CNV_simulation_01_subsampled_counts_100_genes_15000 :
subsampled data (with panel size:15000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_100_genes_15000.h5ad
4 CNV_simulation_01_subsampled_counts_100_genes_10000 :
subsampled data (with panel size:10000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_1



subsampled data (with panel size:500 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_5_genes_500.h5ad
43 CNV_simulation_01_subsampled_counts_3_genes_all :
subsampled data (with panel size:25691 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_3_genes_all.h5ad
44 CNV_simulation_01_subsampled_counts_3_genes_20000 :
subsampled data (with panel size:20000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_3_genes_20000.h5ad
45 CNV_simulation_01_subsampled_counts_3_genes_15000 :
subsampled data (with panel size:15000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_3_genes_15000.h5ad
46 CNV_simulation_01_subsampled_counts_3_genes_1000



subsampled data (with panel size:500 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_3_genes_500.h5ad
50 CNV_simulation_01_subsampled_counts_2_genes_all :
subsampled data (with panel size:25691 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_2_genes_all.h5ad
51 CNV_simulation_01_subsampled_counts_2_genes_20000 :
subsampled data (with panel size:20000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_2_genes_20000.h5ad
52 CNV_simulation_01_subsampled_counts_2_genes_15000 :
subsampled data (with panel size:15000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_2_genes_15000.h5ad
53 CNV_simulation_01_subsampled_counts_2_genes_1000



subsampled data (with panel size:500 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_2_genes_500.h5ad
57 CNV_simulation_01_subsampled_counts_1_genes_all :
subsampled data (with panel size:25691 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_1_genes_all.h5ad
58 CNV_simulation_01_subsampled_counts_1_genes_20000 :
subsampled data (with panel size:20000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_1_genes_20000.h5ad
59 CNV_simulation_01_subsampled_counts_1_genes_15000 :
subsampled data (with panel size:15000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_1_genes_15000.h5ad
60 CNV_simulation_01_subsampled_counts_1_genes_1000



subsampled data (with panel size:1000 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_1_genes_1000.h5ad
63 CNV_simulation_01_subsampled_counts_1_genes_500 :




subsampled data (with panel size:500 genes) saved as /home/augusta/SSS_mount/insituCNV/data/simulated_CNV_data/technical_limiting_conditions/CNV_simulation_01_subsampled_counts_1_genes_500.h5ad
