In [1]:
import scanpy as sc
import squidpy as sq
import os

In [2]:
# import sys
# sys.path.append('/home/augusta/SSS_mount/insituCNV/InSituCNV/modules')
# sys.path

['/home/augusta/SSS_mount/insituCNV/InSituCNV/Figure2/01_Simulate_CNVs_in_spatial_data/Simulate_CNVs',
 '/home/augusta/anaconda3/envs/scanpy_env/lib/python311.zip',
 '/home/augusta/anaconda3/envs/scanpy_env/lib/python3.11',
 '/home/augusta/anaconda3/envs/scanpy_env/lib/python3.11/lib-dynload',
 '',
 '/home/augusta/anaconda3/envs/scanpy_env/lib/python3.11/site-packages',
 '/home/augusta/anaconda3/envs/scanpy_env/lib/python3.11/site-packages/setuptools/_vendor',
 '/home/augusta/SSS_mount/insituCNV/InSituCNV/modules']

# Functions

In [2]:
import random
import pandas as pd
import numpy as np
import anndata

## generate_cnvs

In [3]:
def generate_cnvs(CNV_dict, min_size, max_size, gene_info, save_csv = None):
    """ 
    This function generates copy number variations (CNVs) based on a dictionary (CNV_dict) of genes (keys) and whether they should be gain or loss (value). The size (bp) of the CNV is a randomly chosen between min_size and max_size. The gene will be in the center of the CNV, so half of the size (bp) is subtracted (start) and added (end) from the center of the gene position ('Gene end (bp)'- 'Gene start (bp)')/2), according to the gene_list. The function returns the CNV information as a DataFrame, specifying the gene name, chromosome, size (bp), type (gain/loss), start (bp), end (bp).

    Parameters:
        - CNV_dict (dictionary): Gene name (key) and whether they should be 'gain' or 'loss'.
        - min_size (nbr): minimum size (in bp) of the CNV.
        - max_size (nbr): maximum size (in bp) of the CNV.
        - gene_info (str): The path to the gene information from the *Ensmbl_BioMart_gene_info.txt* file, including  'Gene stable ID', 'Chromosome/scaffold name', 'Gene start (bp)', 'Gene end (bp)', 'Gene name' of the human genome GRCh38.

    Returns:
        - cnv_df (DataFrame): Compiling the gene name, chromosome, size, type, start, end
    """

    gene_info = pd.read_csv(gene_info)
    cnv_list = []

    for gene in CNV_dict:
        # Check if the gene is present in gene_info DataFrame
        if gene not in gene_info['Gene name'].values:
            print(f"Gene '{gene}' not found in gene_info DataFrame.")
            continue
        
        # Find gene details in the gene_info DataFrame
        gene_row = gene_info.loc[gene_info['Gene name'] == gene].iloc[0]
        
        # Calculate gene center
        gene_center = (gene_row['Gene end (bp)'] + gene_row['Gene start (bp)']) // 2
        
        # Randomly generate CNV size
        cnv_size = random.randint(min_size, max_size)
        
        # Calculate CNV start and end positions
        cnv_start = gene_center - cnv_size // 2
        cnv_end = gene_center + cnv_size // 2
        
        # Create CNV entry
        cnv = {
            'Gene name': gene,
            'Chromosome': gene_row['Chromosome/scaffold name'],
            'Size (bp)': cnv_size,
            'Type': CNV_dict[gene],
            'Start (bp)': cnv_start,
            'End (bp)': cnv_end
        }
        
        cnv_list.append(cnv)
    
    # Convert the list of CNV dictionaries to a DataFrame
    cnv_df = pd.DataFrame(cnv_list)

    if save_csv:
        cnv_df.to_csv(save_csv, index=None)
    
    return cnv_df

## create_cnv_template

In [4]:
import numpy as np
import pandas as pd

def create_cnv_template(adata, CNV_df):
    """ 
    This function generates a template for CNVs to an AnnData object.

    Parameters:
        - adata (AnnData): The AnnData object to which the CNVs should fit.
        - CNV_df (DataFrame): CNVs generated by the generate_cnvs function. 

    Returns:
        - cnv_template_df (DataFrame): A CNV template where each CNV (gene name) has a -1/0/1 value for each gene in the adata.var.
    """
    
    # Step 1: Order adata.var by chromosome and start position
    adata.var.sort_values(by=['chromosome', 'start'], inplace=True)
    
    # Initialize the CNV template matrix with zeros
    cnv_template = np.zeros((len(CNV_df), len(adata.var_names)))

    # Create a mapping of gene names to indices in the sorted AnnData object
    gene_to_index = {gene: idx for idx, gene in enumerate(adata.var_names)}

    # Step 2: Loop through each CNV in the CNV_df
    for i, row in CNV_df.iterrows():
        chromosome = row['Chromosome']
        cnv_start = row['Start (bp)']
        cnv_end = row['End (bp)']
        cnv_type = row['Type']
        
        # Find the genes in the corresponding chromosome that fall within the CNV region
        selected_genes = adata.var[
            (adata.var['chromosome'] == f"chr{chromosome}") &
            (adata.var['start'] >= cnv_start) &
            (adata.var['start'] <= cnv_end)
        ]
        
        # Determine the CNV effect: -1 for loss, +1 for gain
        cnv_effect = -1 if cnv_type == 'loss' else 1
        
        # Step 3: Set the corresponding entries in the CNV template matrix
        for gene in selected_genes.index:
            gene_index = gene_to_index[gene]
            cnv_template[i, gene_index] = cnv_effect
    
    # Convert the CNV template matrix to a DataFrame for easier interpretation
    cnv_template_df = pd.DataFrame(cnv_template, columns=adata.var_names)
    
    # Optionally, add the name of the CNV (gene name) as rownames in the DataFrame for reference
    if 'Gene name' in CNV_df.columns:
        cnv_template_df.index = CNV_df['Gene name'].values
    else:
        print("Warning: 'Gene name' column missing from CNV_df.")

    return cnv_template_df


## simulate_cnvs

In [5]:
import numpy as np
import pandas as pd

def simulate_cnvs(adata, cnv_template_df, subclone_dict, cell_type_reference, cell_type_cnv, alpha=2):
    """ 
    This function creates a layer of the adata.X with simulated gains and losses according to the subclone dictionary.
    The cells are randomly distributed between subclones, with amplification or deletion determined by a uniformly drawn random value.

    Parameters:
        - adata (AnnData): The AnnData object where the CNVs should be simulated.
        - cnv_template_df (DataFrame): Templates for each CNV generated for the adata.
        - subclone_dict (dictionary): Specifies the subclones and their CNVs.
        - cell_type_reference (str): Column name in adata.obs indicating cell types.
        - cell_type_cnv (str or list): Specific cell types in which CNVs should be simulated.
        - alpha (float): The maximum amplitude for CNV change (default = 2).

    Returns:
        - adata (AnnData): The simulated AnnData object containing adata.obs['simulated_subclone'] 
                           and the simulated count matrix in an 'CNV_simulated' layer.
    """
    
    # Step 1: Create a new layer in adata to store the simulated CNV data
    adata.layers['CNV_simulated'] = adata.layers['counts'].copy()
    adata.layers['CNV_GT'] = np.zeros(adata.X.shape)

    # Step 2: Randomly assign cells to subclones based on subclone_dict
    if isinstance(cell_type_cnv, str):
        cell_type_cnv = [cell_type_cnv]  # Convert string to a list
    
    num_cells = adata[adata.obs[cell_type_reference].isin(cell_type_cnv)].shape[0]
    subclone_names = list(subclone_dict.keys())

    # Assign random subclone to cells of the desired cell type
    adata.obs.loc[adata.obs[cell_type_reference].isin(cell_type_cnv), 'simulated_subclone'] = np.random.choice(subclone_names, size=num_cells, replace=True)
    # Assign 'N' to cells that are not in the selected cell type
    adata.obs.loc[~adata.obs[cell_type_reference].isin(cell_type_cnv), 'simulated_subclone'] = "N"

    # Step 3: Apply CNVs to the expression data based on the assigned subclone
    for subclone, cnvs in subclone_dict.items():
        if not cnvs:
            # Skip if the CNV list is empty (as for normal subclone)
            continue
        
        # Get cells assigned to the current subclone
        subclone_cells = adata.obs['simulated_subclone'] == subclone
        
        for cnv in cnvs:
            # Extract the CNV effect for this gene from cnv_template_df
            cnv_row = cnv_template_df[cnv_template_df.index == cnv]
            cnv_effects = cnv_row.values.flatten()
            
            for gene_idx, effect in enumerate(cnv_effects):
                if effect == -1:
                    # Simulate deletion by drawing a random value rho in (0, alpha)
                    rho = np.random.uniform(0, alpha)
                    # Decrease counts by dividing by (1 + rho)
                    adata.layers['CNV_simulated'][subclone_cells, gene_idx] /= (1 + rho)
                    adata.layers['CNV_GT'][subclone_cells, gene_idx] = -1
                elif effect == 1:
                    # Simulate amplification by drawing a random value rho in (0, alpha)
                    rho = np.random.uniform(0, alpha)
                    # Increase counts by multiplying by (1 + rho)
                    adata.layers['CNV_simulated'][subclone_cells, gene_idx] *= (1 + rho)
                    adata.layers['CNV_GT'][subclone_cells, gene_idx] = 1

    return adata

In [6]:
import numpy as np

def simulate_cnvs(adata, cnv_template_df, subclone_dict, cell_type_reference, cell_type_cnv, alpha):
    
    """ 
    This function creates a layer of the adata.X with simulated gains and losses according to the subclone dictionary.
    The cells are randomly distributed between subclones, with amplification or deletion determined by a uniformly drawn random value.

    Parameters:
        - adata (AnnData): The AnnData object where the CNVs should be simulated.
        - cnv_template_df (DataFrame): Templates for each CNV generated for the adata.
        - subclone_dict (dictionary): Specifies the subclones and their CNVs.
        - cell_type_reference (str): Column name in adata.obs indicating cell types.
        - cell_type_cnv (str or list): Specific cell types in which CNVs should be simulated.
        - alpha (float): The maximum amplitude for CNV change (default = 2).

    Returns:
        - adata (AnnData): The simulated AnnData object containing adata.obs['simulated_subclone'] 
                           and the simulated count matrix in an 'CNV_simulated' layer.
    """

    # Step 1: Create a new layer in adata to store the simulated CNV data
    adata.layers['CNV_simulated'] = adata.layers['counts'].copy()
    adata.layers['CNV_GT'] = np.zeros(adata.X.shape)

    # Step 2: Randomly assign cells to subclones based on subclone_dict
    if isinstance(cell_type_cnv, str):
        cell_type_cnv = [cell_type_cnv]  # Convert string to a list

    num_cells = adata[adata.obs[cell_type_reference].isin(cell_type_cnv)].shape[0]
    subclone_names = list(subclone_dict.keys())

    # Assign random subclone to cells of the desired cell type
    adata.obs.loc[adata.obs[cell_type_reference].isin(cell_type_cnv), 'simulated_subclone'] = np.random.choice(subclone_names, size=num_cells, replace=True)
    # Assign 'N' to cells that are not in the selected cell type
    adata.obs.loc[~adata.obs[cell_type_reference].isin(cell_type_cnv), 'simulated_subclone'] = "N"

    # Step 3: Apply CNVs to the expression data based on the assigned subclone
    for subclone, cnvs in subclone_dict.items():
        if not cnvs:
            continue
        
        # Get cells assigned to the current subclone
        subclone_cells = adata.obs['simulated_subclone'] == subclone
        
        for cnv in cnvs:
            cnv_row = cnv_template_df[cnv_template_df.index == cnv]
            cnv_effects = cnv_row.values.flatten()
            
            for gene_idx, effect in enumerate(cnv_effects):
                if effect == -1:
                    # Simulate deletion
                    rho = np.random.uniform(0, alpha)
                    values_to_modify = adata.layers['CNV_simulated'][subclone_cells, gene_idx].todense().A1  # Flatten dense array
                    modified_values = values_to_modify / (1 + rho)
                    adata.layers['CNV_simulated'][subclone_cells, gene_idx] = modified_values[:, None]  # Reshape for compatibility
                    adata.layers['CNV_GT'][subclone_cells, gene_idx] = -1
                elif effect == 1:
                    # Simulate amplification
                    rho = np.random.uniform(0, alpha)
                    values_to_modify = adata.layers['CNV_simulated'][subclone_cells, gene_idx].todense().A1  # Flatten dense array
                    modified_values = values_to_modify * (1 + rho)
                    adata.layers['CNV_simulated'][subclone_cells, gene_idx] = modified_values[:, None]  # Reshape for compatibility
                    adata.layers['CNV_GT'][subclone_cells, gene_idx] = 1

    return adata


# Load AnnData

In [7]:
adata_path = "/home/augusta/SSS_mount/insituCNV/InSituCNV/Load_datasets/Lymph_Node/Xenium_HumanLymphNode_5K.h5ad"
adata = sc.read_h5ad(adata_path)

In [8]:
sc.pp.subsample(adata, fraction =0.1)

In [9]:
adata

AnnData object with n_obs × n_vars = 70898 × 4624
    obs: 'x_centroid', 'y_centroid', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'group', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_10_genes', 'pct_counts_in_top_20_genes', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_150_genes', 'leiden'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'gene_names', 'chromosome', 'start', 'end'
    uns: 'group_colors', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [10]:
for cell_type in adata.obs.group.unique():
    print(cell_type, ':', adata[adata.obs.group ==  cell_type].n_obs)

Naive Thymus Derived CD4+ αβ T Cell : 6070
CD141+ Myeloid Dendritic Cell : 1324
Hematopoietic Stem Cell : 2585
CD8+ αβ Memory T Cell : 1920
Macrophage : 3963
T Cell : 1073
Endothelial Cell : 12095
Effector CD4+ αβ T Cell : 3139
CD1c+ Myeloid Dendritic Cell : 1090
Plasmacytoid Dendritic Cell : 1540
B Cell : 3668
Effector CD8+ αβ T Cell : 465
Memory B Cell : 8949
Stromal Cell : 5823
Innate Lymphoid Cell : 3248
Plasma Cell : 2499
Naive B Cell : 1258
Mature NK T Cell : 415
Regulatory T Cell : 4692
CD4+ αβ Memory T Cell : 2770
Intermediate Monocyte : 533
Neutrophil : 110
Non Classical Monocyte : 407
Mast Cell : 301
Mature Conventional Dendritic Cell : 593
Classical Monocyte : 108
Erythrocyte : 80
Type I NK T Cell : 180


In [11]:
adata_selected = adata[adata.obs.group.isin(['Macrophage', 'Endothelial Cell'])]

In [12]:
adata = adata_selected.copy()

# Illustrate the genome coverage of the data

In [19]:
import os
import pandas as pd
import plotly.express as px
import plotly.io as pio

def plot_gene_coverage(adata, gene_identifyer, gene_info, count_max = 200000, save_html=None):
    """
    This function generates an interactive gene coverage plot for the Human Lymph Node based on gene information and an AnnData object.

    Parameters:
    - adata (AnnData): An AnnData object containing gene IDs and total counts in `adata.var`.
    - gene_identifyer (str): The variable name where the adata.var gene names are located. 
    - gene_info (str): The path to the gene information from the *Ensmbl_BioMart_gene_info.txt* file, including  'Gene stable ID', 'Chromosome/scaffold name', 'Gene start (bp)', 'Gene end (bp)', 'Gene name' of the human genome GRCh38.
    - count_max (nbr): The maximum number of counts to plot in the y-axis.
    - save_html (str): The file path to save the plot as an HTML file. 

    Returns:
    - plot
    """

    # Load and preprocess gene data
    gene_data = pd.read_csv(gene_info, sep=',', header=0, names=['Gene ID', 'Chromosome', 'Start', 'End', 'Gene name'])
    gene_data_sorted = gene_data.sort_values(by=['Chromosome', 'Start'])

    # Initialize a matrix with zeros
    matrix = pd.DataFrame(0, index=gene_data_sorted['Gene ID'], columns=['data'])

    
    # Convert adata.var to a DataFrame for merging
     
    var_data = pd.DataFrame({
        'Gene ID': adata.var[gene_identifyer],
        'Total Counts': adata.var['total_counts']
    })

    # Merge gene data with total counts
    merged_data = gene_data_sorted.merge(var_data, on='Gene ID', how='left')
    merged_data.index = merged_data['Gene ID']

    # Fill the matrix with total counts
    matrix.loc[merged_data['Gene ID'], 'data'] = merged_data['Total Counts']

    # Create a column for combined chromosome and gene name
    gene_data_sorted['chrom_gene'] = gene_data_sorted['Chromosome'].astype(str) + ':' + gene_data_sorted['Gene name']

    # Create the Plotly bar plot
    fig = px.bar(
        matrix, 
        x=gene_data_sorted['chrom_gene'], 
        y='data', 
        title='Gene Coverage',
        labels={'x': 'Gene name', 'data': 'Total Counts'},
        color_discrete_sequence=['blue']
    )

    # Update layout for better visualization
    fig.update_layout(
        yaxis_range=[0, count_max],  # Set y-axis limit
        width=1200,  # Set plot width
        height=400,  # Set plot height
        plot_bgcolor='white', 
        paper_bgcolor='white'
    )


    # Save the plot as an HTML file
    if save_html:
        pio.write_html(fig, file=save_html, auto_open=True)

    return fig

In [34]:
import os
import pandas as pd
import plotly.express as px
import plotly.io as pio

def plot_gene_coverage(adata, gene_identifier, gene_info, count_max=200000, save_html=None):
    """
    This function generates an interactive gene coverage plot for the Human Lymph Node based on gene information and an AnnData object.

    Parameters:
    - adata (AnnData): An AnnData object containing gene IDs and total counts in `adata.var`.
    - gene_identifier (str): The variable name where the `adata.var` gene names are located. 
    - gene_info (str): The path to the gene information from the *Ensmbl_BioMart_gene_info.txt* file, which should include 'Gene stable ID', 'Chromosome/scaffold name', 'Gene start (bp)', 'Gene end (bp)', 'Gene name' of the human genome GRCh38.
    - count_max (int): The maximum number of counts to plot on the y-axis.
    - save_html (str): The file path to save the plot as an HTML file. 

    Returns:
    - fig: The Plotly figure object.
    """
    # Load and preprocess gene data
    gene_data = pd.read_csv(gene_info, sep=',', header=0, names=['Gene ID', 'Chromosome', 'Start', 'End', 'Gene name'])
    gene_data_sorted = gene_data.sort_values(by=['Chromosome', 'Start'])

    # Extract gene count data from adata.var and merge it with gene_data
    var_data = pd.DataFrame({
        'Gene ID': adata.var[gene_identifier],
        'Total Counts': adata.var['total_counts']
    }).reset_index(drop=True)  # Ensure index alignment if needed

    # Merge gene data with total counts from adata.var
    merged_data = gene_data_sorted.merge(var_data, on='Gene ID', how='left')
    merged_data['Total Counts'].fillna(0, inplace=True)  # Fill missing values with 0

    # Add combined chromosome and gene name for x-axis labels
    merged_data['chrom_gene'] = merged_data['Chromosome'].astype(str) + ':' + merged_data['Gene name']

    # Create the Plotly bar plot
    fig = px.bar(
        merged_data,
        x='chrom_gene', 
        y='Total Counts', 
        title='Gene Coverage',
        labels={'chrom_gene': 'Gene (Chromosome:Gene Name)', 'Total Counts': 'Total Counts'},
        color_discrete_sequence=['blue']
    )

    # Update layout for better visualization
    fig.update_layout(
        yaxis_range=[0, count_max],  # Set y-axis limit
        width=1200,  # Set plot width
        height=400,  # Set plot height
        plot_bgcolor='white', 
        paper_bgcolor='white'
    )

    # Save the plot as an HTML file
    if save_html:
        pio.write_html(fig, file=save_html, auto_open=True)

    return fig


In [30]:
gene_file = os.path.expanduser("~/SSS_mount/insituCNV/InSituCNV/Ensmbl_BioMart_gene_info.txt")

In [36]:
plot_gene_coverage(adata, count_max=400000, gene_identifier= 'gene_ids', gene_info=gene_file)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





# Simulate the data to contain CNVs

Divide the dataset into four simulated subclones

1.   Normal (unaltered)
2.   Subclone A (fewer CNVs)
3.   Subclone B (same as A but added)
4.   Subclone C (same as A but added)


Choose chromosomal regions to be duplicated or deleted

If we generate several CNVs throughout the genome, it might be possible to compare the detection efficiency as to how many of the CNVs are detected. Or else, it's works/not works.


Things to consider: 
- how the size (nbr of cells) of the subclone affect the outcome - or make same size populations and avoid adressing this
- choose the size of the CNVs
    - literature to find an appropriate size for CNVs (50bp - several Mbs, ref: https://doi.org/10.1016%2Fj.bj.2021.02.003)
    - different sizes to see how that affects the outcome - or make every CNV the same size


## Module to simulate CNVs in adata (simulate_CNVs.py)

In [4]:
from simulate_CNVs import *

### Function: Generate CNVs (generate_cnvs)

This function generates copy number variations (CNVs) based on a dictionary (**CNV_dict**) of genes (keys) and whether they should be gain or loss (value). The size (bp) of the CNV is a randomly chosen size between **min_size** and **max_size**. The gene will be in the center of the CNV, so half of the size (bp) is subtracted (start) and added (end) from the center of the gene position ('Gene end (bp)'- 'Gene start (bp)')/2), taken from the **gene_list**. The output will be a list of these CNVs, specifying the gene name, chromosome, size (bp), type (gain/loss), start (bp), end (bp).

Parameters:

- **CNV_dict** - dict. Gene name (key) and whether they should be 'gain' or 'loss'. Could for example be gain of known oncogenes or loss of tumor supressors. Gene selection inspo: https://doi.org/10.1080%2F07853890.2023.2280708
- **min_size** - nbr in bp.
- **max_size** - nbr in bp. A size of the CNV is generated as a random number between the min and max
- **gene_info** - the *Ensmbl_BioMart_gene_info.txt* containing 'Gene stable ID', 'Chromosome/scaffold name', 'Gene start (bp)', 'Gene end (bp)', 'Gene name'

Returns:

- **CNV_df** (DataFrame): Compiling the gene name, chromosome, size, type, start, end

In [13]:
def gene_exists(adata, gene_list):
    for gene in gene_list:
        if gene in adata.var_names:
            print(gene, 'exists! \n')
        else:
            print(gene, 'does not exist.. \n')

In [14]:
gene_exists(adata, gene_list=['DIS3','MECOM','ERBB2','CHD7','HCK','KEAP1','MYD88','TBX3']) #'EPHB1','FLT1',

# CNVs, including ABCC5, AGO2, ARID5B, CHD7, FAM58A, FOXA1, HEY1, HLA-C, HLA-DQB1, MCL1, MECOM, MSN, NFKBIA, PRSS1, RAD21,

DIS3 exists! 

MECOM exists! 

ERBB2 exists! 

CHD7 exists! 

HCK exists! 

KEAP1 exists! 

MYD88 exists! 

TBX3 exists! 



In [15]:
CNV_dict = { 
    'DIS3': 'loss',
    'MECOM': 'loss',
    'ERBB2': 'gain',
    'CHD7': 'gain',
    'HCK': 'gain',
    'KEAP1': 'loss',
    'MYD88': 'gain',
    'TBX3': 'gain'
}

min_size = 10000
max_size = 1000000


gene_file = ("/home/augusta/SSS_mount/insituCNV/InSituCNV/Ensmbl_BioMart_gene_info.txt")

In [16]:
CNV_df = generate_cnvs(CNV_dict, min_size, max_size, gene_info=gene_file, save_csv='CNVs_301024.csv') 
CNV_df

Unnamed: 0,Gene name,Chromosome,Size (bp),Type,Start (bp),End (bp)
0,DIS3,13,562238,loss,72486013,73048251
1,MECOM,3,15767,loss,169365754,169381520
2,ERBB2,17,803723,gain,39307309,40111031
3,CHD7,8,523984,gain,60511392,61035376
4,HCK,20,954807,gain,31599623,32554429
5,KEAP1,19,875978,loss,10056852,10932830
6,MYD88,3,865647,gain,37707965,38573611
7,TBX3,12,11895,gain,114671268,114683162


### Function: Create CNV template (create_cnv_template)

This function generates a CNV template for an AnnData object based on a predifined number of subclones, and their CNVs, generated by the generate_cnvs function.  

Parameters:

- **adata** - the AnnData object
- **CNV_df** (DataFrame): from the *'Generate CNVs'* module. the gene name, chromosome, size, type, start, end - . The type should generate either -1/+1 depending on if it's 'gain' or 'loss'.

Returns:

- **cnv_template_df** (DataFrame). For each CNV (gene name) - generate an array with -1/0/1 values for each gene in the adata.var_names

In [17]:
cnv_template_df = create_cnv_template(adata, CNV_df)
cnv_template_df

gene_names,HES4,TNFRSF18,PUSL1,DVL1,ATAD3A,PRKCZ,TNFRSF14,MMEL1,WRAP73,TP73,...,VEGFD,VSIG4,WAS,XIAP,ZBTB33,ZC4H2,ZFX,ZMYM3,ZNF280C,ZNF449
DIS3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MECOM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERBB2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CHD7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HCK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KEAP1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MYD88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TBX3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Function: Simualte CNVs in the data (simulate_cnvs)

This function applies poission distribution / binomial probability to simulate one arm gains and losses (increase/keep/decrease the counts (adata.X) depending on the -1/0/1 value of the cnv_template_df: if -1, the probability is 0.5 and if 1 it is 1.5). This is done according to a subclone_dict, where each key is a subclone, and the values state which CNVs to add to these cells.  

Parameters:

- **adata** - the AnnData object to add the CNVs on
- **cnv_template_df** matrix. from the *'Create CNV template'* module
- **subclone_dict** (dictionary) For each subclone (key), a list of which CNVs to assign (values). randomly assigning each cell to one of these subclones (adata.obs.simulated_subclone)

Returns:

- **adata** - the AnnData object with simulated CNVs as a layer to the adata.X, (as well as the CNV template as a 'CNV_template' layer?)

In [18]:
adata

AnnData object with n_obs × n_vars = 16058 × 4624
    obs: 'x_centroid', 'y_centroid', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'group', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_10_genes', 'pct_counts_in_top_20_genes', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_150_genes', 'leiden'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'gene_names', 'chromosome', 'start', 'end'
    uns: 'group_colors', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [19]:
subclone_dict = {
    'N': [], # Normal cells without CNV simulations
    'A': ['DIS3', 'MECOM', 'ERBB2', 'CHD7'], # Original genetic subclone
    'B': ['DIS3', 'MECOM', 'ERBB2', 'CHD7', 'HCK', 'KEAP1'], 
    'C': ['DIS3', 'MECOM', 'ERBB2', 'CHD7', 'MYD88', 'TBX3'],
} # 'DIS3','MECOM','ERBB2','CHD7','HCK','KEAP1','MYD88','TBX3'

In [21]:
simulate_cnvs(adata, cnv_template_df, subclone_dict, cell_type_reference='group', cell_type_cnv='Endothelial Cell', alpha=6)

  self._set_arrayXarray(i, j, x)


AnnData object with n_obs × n_vars = 16058 × 4624
    obs: 'x_centroid', 'y_centroid', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'group', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_10_genes', 'pct_counts_in_top_20_genes', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_150_genes', 'leiden', 'simulated_subclone'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'gene_names', 'chromosome', 'start', 'end'
    uns: 'group_colors', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial'
    varm: 'PCs'
    layers: 'counts', 'CNV_simulated', 'CNV_GT'
    obsp: 'connectivities', 'distance

In [22]:
adata.obs

Unnamed: 0,x_centroid,y_centroid,transcript_counts,control_probe_counts,genomic_control_counts,control_codeword_counts,unassigned_codeword_counts,deprecated_codeword_counts,total_counts,cell_area,...,group,n_genes_by_counts,log1p_n_genes_by_counts,log1p_total_counts,pct_counts_in_top_10_genes,pct_counts_in_top_20_genes,pct_counts_in_top_50_genes,pct_counts_in_top_150_genes,leiden,simulated_subclone
ohepcfjp-1,2218.934082,3177.298584,13,0,0,0,0,0,13.0,24.429532,...,Macrophage,13,2.639057,2.639057,76.923077,100.000000,100.000000,100.000000,3,N
holeoipi-1,1285.019775,3309.815430,405,0,0,0,0,1,405.0,54.097189,...,Endothelial Cell,315,5.755742,6.006353,13.827160,19.012346,33.827160,59.259259,3,B
abidieei-1,3964.308350,659.619202,245,0,0,0,0,0,245.0,26.461563,...,Macrophage,208,5.342334,5.505332,13.061224,21.224490,35.510204,76.326531,3,N
eppicjhd-1,4588.578125,5139.946777,126,0,0,0,0,0,126.0,36.260470,...,Endothelial Cell,115,4.753590,4.844187,16.666667,24.603175,48.412698,100.000000,2,C
beblakni-1,4815.437012,1462.386719,227,0,0,0,0,0,227.0,34.860626,...,Endothelial Cell,196,5.283204,5.429346,13.656388,22.466960,35.682819,79.735683,0,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
obifnkjo-1,6484.874512,3343.062256,352,0,0,0,0,0,352.0,30.435314,...,Endothelial Cell,308,5.733341,5.866468,8.806818,14.488636,26.704545,55.113636,1,C
odkjpagl-1,6068.149902,3596.380859,120,0,0,0,0,0,120.0,18.785001,...,Endothelial Cell,113,4.736198,4.795791,14.166667,22.500000,47.500000,100.000000,3,B
iiiaiadg-1,5744.152344,786.925842,71,0,0,0,0,0,71.0,30.661095,...,Endothelial Cell,68,4.234107,4.276666,18.309859,32.394366,74.647887,100.000000,2,B
jcifchgm-1,2743.707031,4063.177734,214,0,0,0,0,0,214.0,58.025783,...,Macrophage,177,5.181784,5.370638,15.420561,24.766355,40.654206,87.383178,6,N


In [23]:
adata.write("Xenium_HumanLymphNode_5K_simulatedCNVs_301024.h5ad", compression = 'gzip')