In [None]:
import os
import sys

# Move the working directory two levels up so that relative paths used later
# (e.g. "data/BATCH.h5ad") are looked up from there.
os.chdir("../..")
# Add a relative entry to the module search path.
# NOTE(review): a relative sys.path entry is resolved against the current
# working directory, which has already been changed by the chdir above, so
# this points two levels above the *new* working directory — confirm that is
# the intended location of the package.
sys.path.append("../../")

# Simulate datasets with batch effect

## Introduction

In this example, we will show how to use pyscDesign3 to simulate data that retains the original batch effects and how to remove those batch effects. We will also demonstrate how to add artificial batch effects.

## Import packages and Read in data

### Import packages

In [None]:
import copy
import anndata as ad
import numpy as np
import pyscDesign3

### Read in data

The raw data is from the `SeuratData` package, where it is called `pbmcsca`; it is the PBMC Systematic Comparative Analysis dataset from the Broad Institute. The raw data was converted to an `.h5ad` file using the R package `sceasy`.

To save time, we only choose the top 30 genes.

In [None]:
# Load the reference dataset; the path is relative to the working directory.
data = ad.read_h5ad("data/BATCH.h5ad")
# Keep only the first 30 genes (columns) to make the example run quickly.
# Slicing an AnnData object returns a view; .copy() turns it into a real
# object so that adding a layer below does not trigger an implicit
# copy-on-write (and the accompanying ImplicitModificationWarning).
data = data[:, 0:30].copy()
# Store a log1p-transformed version of X; the plotting step reads this layer.
data.layers["log"] = np.log1p(data.X)
data

The column `batch` in this example dataset’s obs contains the batch information.

In [None]:
# Show the first few entries of the per-cell batch label.
data.obs["batch"].head()

## Simulation

We can simulate a new dataset that preserves the batch effect information.

In [None]:
# Fit the model on the reference data and simulate new counts in one call.
# `batch` is passed as an extra covariate and appears in the mean formula,
# so the fitted model (and hence the simulated data) includes a batch term.
test = pyscDesign3.scDesign3(n_cores=6)
# Fix the seed (presumably R's RNG, going by the method name) for reproducibility.
test.set_r_random_seed(123)
simu_res = test.scdesign3(
    anndata=data,
    default_assay_name="counts",
    celltype="cell_type",
    other_covariates="batch",
    mu_formula="cell_type + batch",
    sigma_formula="1",
    family_use="nb",
    usebam=True,
    corr_formula="1",
    copula="gaussian",
)
# The simulated count matrix is returned under the "new_count" key.
simu_count = simu_res["new_count"]

We can also remove the batch effect and generate new data.

In [None]:
# create instance
# Instance used for the step-by-step workflow below
# (construct_data -> fit_marginal -> fit_copula).
batch = pyscDesign3.scDesign3(n_cores=6)

In [None]:
# construct data
# Build the model input from the reference AnnData, keeping both the cell
# type and the batch label as covariates. The result is reused later for its
# "newCovariate" entry when the simulated AnnData objects are assembled.
batch_data = batch.construct_data(
    anndata=data,
    default_assay_name="counts",
    celltype="cell_type",
    other_covariates="batch",
    corr_formula="1",
)

In [None]:
# List the entries produced by construct_data.
batch_data.keys()

In [None]:
# fit marginal
# Fit the per-gene marginal models. The mean depends on cell type and batch,
# the dispersion formula is intercept-only, and "nb" selects the family
# (presumably negative binomial — confirm against the pyscDesign3 docs).
# The result is deep-copied and edited later to remove/alter the batch term.
batch_marginal = batch.fit_marginal(
    mu_formula="cell_type + batch",
    sigma_formula="1",
    family_use="nb",
    usebam=True,
)

In [None]:
# fit copula
# Fit the copula on the `batch` instance. Its "copula_list" and
# "important_feature" entries are reused by the other instances' simu_new calls.
batch_copula = batch.fit_copula()

Here, we remove the batch effect by setting its coefficient to zero in every gene’s marginal fit. Then, we use the new set of coefficients to generate the parameters for all genes across all cells.

In [None]:
# Separate instance for the "batch effect removed" simulation. It is built
# from the same reference data and covariates as the `batch` instance.
batch_null = pyscDesign3.scDesign3(n_cores=6)
batch_data_null = batch_null.construct_data(
    anndata=data,
    default_assay_name="counts",
    celltype="cell_type",
    other_covariates="batch",
    corr_formula="1",
)

In [None]:
# Work on a deep copy rather than on `batch_marginal` itself, which is also
# the starting point for the altered-batch variant.
batch_marginal_null = copy.deepcopy(batch_marginal)
for gene_name, _ in batch_marginal_null.items():
    gene_coefs = batch_marginal_null.rx2(gene_name).rx2("fit").rx2("coefficients")
    # Zero out the last coefficient of this gene's fit.
    # Assumes the batch term is the final (and only) batch coefficient, since
    # `batch` is listed last in mu_formula — TODO confirm if batch has more
    # than two levels.
    gene_coefs[-1] = 0

In [None]:
# Derive the per-gene, per-cell parameters from the marginals whose batch
# coefficient was set to zero.
batch_para_null = batch_null.extract_para(
    marginal_dict=batch_marginal_null,
    family_use="nb",
)

In [None]:
# Simulate counts without the batch effect.
batch_null.set_r_random_seed(123)
# The copula below was fitted by a different instance (`batch`), so the
# copula type has to be set explicitly on this instance before simulating.
batch_null.copula = "gaussian"
batch_new_count_null = batch_null.simu_new(
    copula_dict=batch_copula["copula_list"],
    family_use="nb",
    important_feature=batch_copula["important_feature"],
)

```{eval-rst}
.. Note::
    Here, because we directly reuse the copula fitted by another model instance, the class property `copula` must be set so that pyscDesign3 knows how to convert the copula dict back to an R list.
```

Additionally, we can alter the batch effect information by manually changing the estimated batch-effect coefficient in each gene’s marginal model. Then, we can simulate a new dataset with the altered batch effect information.

In [None]:
# Separate instance for the "artificial batch effect" simulation. It is built
# from the same reference data and covariates as the `batch` instance.
batch_alter = pyscDesign3.scDesign3(n_cores=6)
batch_data_alter = batch_alter.construct_data(
    anndata=data,
    default_assay_name="counts",
    celltype="cell_type",
    other_covariates="batch",
    corr_formula="1",
)

In [None]:
# Work on a deep copy of the fitted marginals so `batch_marginal` is not edited.
batch_marginal_alter = copy.deepcopy(batch_marginal)
# Seeded generator so the artificial batch effects are the same on every run.
batch_effect_rng = np.random.default_rng(123)
for gene_name, _ in batch_marginal_alter.items():
    # Fix: write into the *alter* copy. The original loop modified
    # `batch_marginal_null`, leaving `batch_marginal_alter` unchanged and
    # overwriting the zeroed coefficients of the null model.
    gene_coefs = batch_marginal_alter.rx2(gene_name).rx2("fit").rx2("coefficients")
    # Replace the last coefficient with a draw from N(mean=1, sd=2).
    # Assumes the batch term is the final (and only) batch coefficient, since
    # `batch` is listed last in mu_formula — TODO confirm if batch has more
    # than two levels.
    gene_coefs[-1] = float(batch_effect_rng.normal(loc=1, scale=2))

In [None]:
# Derive the per-gene, per-cell parameters from the marginals prepared for
# the altered-batch simulation.
batch_para_alter = batch_alter.extract_para(
    marginal_dict=batch_marginal_alter,
    family_use="nb",
)

In [None]:
# Simulate counts with the altered batch effect.
batch_alter.set_r_random_seed(123)
# The copula below was fitted by a different instance (`batch`), so the
# copula type has to be set explicitly on this instance before simulating.
batch_alter.copula = "gaussian"
batch_new_count_alter = batch_alter.simu_new(
    copula_dict=batch_copula["copula_list"],
    family_use="nb",
    important_feature=batch_copula["important_feature"],
)

We then create the corresponding `anndata.AnnData` object.

In [None]:
# Wrap each simulated count matrix in an AnnData object. The cell-level
# covariates come from the construct_data result, and a log1p-transformed
# layer is added because the plotting step reads the "log" layer.
simu_anndata_list = []
for simulated_counts in (simu_count, batch_new_count_null, batch_new_count_alter):
    simulated = ad.AnnData(X=simulated_counts, obs=batch_data["newCovariate"])
    simulated.layers["log"] = np.log1p(simulated.X)
    simu_anndata_list.append(simulated)

## Visualization

In [None]:
# Compare the reference data with the three simulated datasets in reduced
# dimensions, using the log1p layer; points are coloured by cell type and
# shaped by batch.
plot = pyscDesign3.plot_reduceddim(
    ref_anndata=data,
    anndata_list=simu_anndata_list,
    # Presumably the first label names the reference and the remaining three
    # follow the order of simu_anndata_list.
    # NOTE(review): the last label is misspelled ("Aritifical"); it is kept
    # unchanged here because it is shown as-is in the figure.
    name_list=["Reference", "w/ Batch", "w/o Batch", "Aritifical Batch"],
    assay_use="log",
    color_by="cell_type",
    shape_by="batch",
    n_pc=20,
    point_size=5,
)

### UMAP

In [None]:
# Display the UMAP panel from the plotting result.
plot["p_umap"]