In [2]:
import os
import sys

# scanpy supplies `sc.read_h5ad`, which the data-loading cell below relies on;
# without this import the notebook fails on a fresh kernel.
import scanpy as sc

# Make the local InSituCNV checkout importable before importing the package from it.
sys.path.append(os.path.expanduser("~/SSS_mount/insituCNV/InSituCNV"))
import insitucnv as icv

# Load AnnData

In [3]:
# Resolve the dataset location from the user's home directory (the same way the
# InSituCNV checkout is added to sys.path) rather than hard-coding one user's home.
adata_path = os.path.expanduser(
    "~/SSS_mount/insituCNV/data/simulated_CNV_data/lung_organoids_cnvclust.h5ad"
)
# Load the lung-organoid AnnData object that the rest of the notebook modifies.
adata = sc.read_h5ad(adata_path)

In [4]:
# Per-cell annotation column used later to pick which cells receive simulated CNVs.
adata.obs['cell_type']

N3_O1_AAACCCAAGCGTCAAG-1    secretory cell
N3_O1_AAAGAACAGCGTATGG-1    secretory cell
N3_O1_AAAGAACCACTATCGA-1    secretory cell
N3_O1_AAAGGATTCGAGTCCG-1    secretory cell
N3_O1_AAAGGGCGTTCTTAGG-1        basal cell
                                 ...      
N3_O2_GGTAGAGCAAGCACCC-1    secretory cell
N3_O2_TTCAATCAGGGCAACT-1        basal cell
N3_O2_TTGTGTTAGAACTGAT-1        basal cell
N3_EX_CATGGTACAATTGCGT-1    secretory cell
N3_EX_GGTAACTAGCGTGAGT-1    secretory cell
Name: cell_type, Length: 1268, dtype: category
Categories (3, object): ['ciliated cell', 'secretory cell', 'basal cell']

In [5]:
# Report how many cells carry each annotated cell-type label.
for label in ('ciliated cell', 'secretory cell', 'basal cell'):
    print(adata[adata.obs.cell_type == label].n_obs, label)

3 ciliated cell
1031 secretory cell
234 basal cell


# Simulate the data to contain CNVs

---

## Step 1: Generate CNVs

`icv.pp.generate_cnvs` generates copy number variations (CNVs) from a dictionary (**CNV_dict**) that maps each gene (key) to the type of CNV it should receive: **gain** or **loss** (value).

Each CNV is centered around the gene and spans a **random size** between **min_size** and **max_size**.  
The output is a **DataFrame** containing the CNV details:  

- **Gene Name**
- **Chromosome**
- **CNV Size (bp)**
- **CNV Type (gain/loss)**
- **Start Position (bp)**
- **End Position (bp)**

**Parameters:**
- **CNV_dict** *(dict)*: Dictionary mapping genes to CNV type ('gain' or 'loss').  
- **min_size** *(int)*: Minimum CNV size in base pairs (bp).  
- **max_size** *(int)*: Maximum CNV size in base pairs (bp).  
- **gene_info** *(str)*: Path to a CSV file containing gene information.   
  - Expected columns: `Gene name`, `Chromosome/scaffold name`, `Gene start (bp)`, `Gene end (bp)`.  
- **save_csv** *(str, optional)*: If provided, saves the output DataFrame to this file.

**Returns:**
- **CNV_df** *(DataFrame)*: Contains the generated CNVs.


In [6]:
# Peek at the gene annotation table; it carries the chromosome/start/end columns
# alongside the gene symbols.
adata.var.head(n=5)

Unnamed: 0_level_0,gene_symbols,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length,feature_type,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,chromosome,start,end
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ENSG00000238009,AL627309.1,False,ENSG00000238009.6,NCBITaxon:9606,gene,629,lncRNA,486,0.008812,0.008774,94.534413,78.360051,4.373995,chr1,89295.0,133566.0
ENSG00000241860,AL627309.5,False,ENSG00000241860.7,NCBITaxon:9606,gene,1025,lncRNA,581,0.010553,0.010498,93.466037,93.837299,4.552163,chr1,141474.0,173862.0
ENSG00000241599,AL627309.4,False,ENSG00000241599.1,NCBITaxon:9606,gene,457,lncRNA,7,0.000246,0.000246,99.921278,2.190479,1.160171,chr1,160446.0,161525.0
ENSG00000237491,LINC01409,False,LINC01409,NCBITaxon:9606,gene,1059,lncRNA,1591,0.035527,0.03491,82.107512,315.902356,5.758594,chr1,778747.0,810065.0
ENSG00000228794,LINC01128,False,LINC01128,NCBITaxon:9606,gene,1627,lncRNA,2011,0.042585,0.041704,77.384166,378.669894,5.939302,chr1,825138.0,859446.0


In [6]:
# Genes to place a simulated CNV around, mapped to the type of event
# ('gain' or 'loss'). Insertion order is the row order of the generated CNV table.
CNV_dict = {
    'DIS3': 'loss',
    'MECOM': 'loss',
    'ERBB2': 'gain',
    'CHD7': 'gain',
    'HCK': 'gain',
    'KEAP1': 'loss',
    'MYD88': 'gain',
    'TBX3': 'gain'
}

# Gene annotation table handed to generate_cnvs. Resolved from the user's home
# directory (like the InSituCNV sys.path entry) instead of hard-coding one user's home.
gene_file = os.path.expanduser(
    "~/SSS_mount/insituCNV/InSituCNV/Ensmbl_BioMart_gene_info.txt"
)

In [None]:
# Earlier runs of this cell used smaller CNV size ranges:
#   simulation 1:  1-5 Mb  -> saved as 'CNVs_121224_simulation1.csv'
#   simulation 2:  5-10 Mb -> saved as 'CNVs_310125_simulation2.csv'
# Simulation 3 (current): one CNV of 10-20 Mb per gene in CNV_dict.
# NOTE(review): CNV sizes are described as random and no seed is set in this
# notebook, so re-running presumably yields different CNVs and overwrites the CSV
# — confirm whether generate_cnvs can be seeded.
CNV_df = icv.pp.generate_cnvs(
    CNV_dict,
    min_size=10_000_000,
    max_size=20_000_000,
    gene_info=gene_file,
    save_csv='CNVs_310125_simulation3.csv',
)

In [8]:
# Display the generated CNV table (one row per gene in CNV_dict).
CNV_df

Unnamed: 0,Gene name,Chromosome,Size (bp),Type,Start (bp),End (bp)
0,DIS3,13,19739671,loss,62897297,82636967
1,MECOM,3,11795339,loss,163475968,175271306
2,ERBB2,17,14562701,gain,32427820,46990520
3,CHD7,8,14661851,gain,53442459,68104309
4,HCK,20,17670343,gain,23241855,40912197
5,KEAP1,19,13300526,loss,3844578,17145104
6,MYD88,3,12730315,gain,31775631,44505945
7,TBX3,12,11391694,gain,108981368,120373062


---

## Step 2: Create a CNV Template for AnnData
This step maps the CNVs onto the genes stored in an **AnnData object (adata)** to create a CNV template.  

The **CNV template** is a matrix that marks whether each gene is:  
🔹 **Gained** *(+1)*  
🔹 **Lost** *(-1)*  
🔹 **Unchanged** *(0)*  

**Process Overview**
- Sort genes in **adata.var** by **chromosome** and **start position**.
- Initialize an **empty CNV template matrix**.
- Identify genes **within** CNV regions.
- Map genes to their corresponding **CNV effect**.

**Returns:**
- A **CNV template DataFrame**, where rows correspond to CNVs and columns correspond to genes.


In [8]:
# Map the CNV regions in CNV_df onto the genes of adata. Per the Step 2 description,
# the result has one row per CNV and one column per gene, with +1 for gain,
# -1 for loss and 0 for unchanged.
cnv_template_df = icv.pp.create_cnv_template(adata, CNV_df)
# Display the template for a visual sanity check.
cnv_template_df

gene_ids,ENSG00000238009,ENSG00000241860,ENSG00000241599,ENSG00000286448,ENSG00000237491,ENSG00000225880,ENSG00000228794,ENSG00000230368,ENSG00000272438,ENSG00000223764,...,ENSG00000287171,ENSG00000196664,ENSG00000228933,ENSG00000224294,ENSG00000122824,ENSG00000158639,ENSG00000269437,ENSG00000204025,ENSG00000165509,ENSG00000126895
DIS3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MECOM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERBB2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CHD7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HCK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KEAP1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MYD88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TBX3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---

## Step 3: Simulate CNVs in an AnnData Object
This step modifies the **gene expression matrix** in **adata** to simulate CNV effects.  
The CNVs are applied to selected **subclones**, scaling expression values accordingly.

**Process Overview**
- **Initialize** CNV simulation layers in `adata.layers`.
- **Assign subclone labels** to cells based on predefined groups.
- **Modify expression values**:
  - **Amplifications (gains)** → Increase gene expression.
  - **Deletions (losses)** → Decrease gene expression.

**Returns:**
- Modified **AnnData object** (`adata`) with CNV-simulated expression values stored in:  
  - `adata.layers['CNV_simulated']` → Scaled expression matrix.
  - `adata.layers['CNV_GT']` → CNV ground truth matrix.


In [9]:
# CNVs carried by every simulated subclone (A, B and C all start from these).
_shared_cnvs = ['DIS3', 'MECOM', 'ERBB2']

# Subclone label -> genes whose CNVs that subclone carries.
# Full CNV set: 'DIS3','MECOM','ERBB2','CHD7','HCK','KEAP1','MYD88','TBX3'
subclone_dict = {
    'N': [],                                         # normal cells, no simulated CNVs
    'A': list(_shared_cnvs),                         # original genetic subclone
    'B': _shared_cnvs + ['CHD7', 'HCK', 'KEAP1'],
    'C': _shared_cnvs + ['CHD7', 'MYD88', 'TBX3'],
}

In [11]:
# Duplicate the 'raw' layer under the name 'counts'; .copy() stores a separate object
# rather than a second reference to the 'raw' layer.
# NOTE(review): presumably simulate_cnvs reads the 'counts' layer — confirm against
# its implementation.
adata.layers['counts'] = adata.layers['raw'].copy()

In [18]:
# Apply the CNV template to the cells annotated as 'secretory cell' in
# adata.obs['cell_type'], using the subclone definitions above.
# The call is the cell's last expression, so the returned AnnData is displayed.
icv.pp.simulate_cnvs(
    adata,
    cnv_template_df,
    subclone_dict,
    cell_type_reference='cell_type',
    cell_type_cnv='secretory cell',
    alpha=6,
)

AnnData object with n_obs × n_vars = 1268 × 25691
    obs: 'organism_ontology_term_id', 'tissue_ontology_term_id', 'tissue_type', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'suspension_type', 'model_id', 'sample_id', 'Phase', 'level_1', 'level_2', 'level_3', 'CountUMIs', 'CountGenes', 'X.Mitochondrial', 'NoveltyScore', 'nCount_SCT', 'nFeature_SCT', 'orig.ident', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'cnv_leiden', 'simulated_subclone'
    var: 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_referenc

In [19]:
# Persist the CNV-simulated AnnData (gzip-compressed) to the working directory.
# NOTE(review): 'rho6' in the filename presumably refers to the alpha=6 passed to
# simulate_cnvs — confirm the naming.
adata.write(
    "lung_organoids_cnvclust_simulatedCNVs_310125_simulation3_simulationv2_rho6.h5ad",
    compression='gzip',
)