In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import scanpy as sc 
from scipy.sparse import csr_matrix

In [None]:
# Run configuration: all inputs and outputs live below a directory named after the run id.
run_id = "run_02"
perturbed_path = Path(f"{run_id}/perturbed_data/")
unperturbed_path = Path(f"{run_id}/unperturbed_data/")
figures_path = Path(run_id)/"figures"
# Create the figures directory (and any missing parents) if it is not there yet.
if not figures_path.exists():
    figures_path.mkdir(exist_ok=True, parents=True)

# Point scanpy's figure directory at the same folder used for the manually saved figures.
sc.settings.figdir = figures_path
# Seed NumPy's global RNG; the np.random.choice sampling further down relies on this state.
np.random.seed(0)

FileNotFoundError: [Errno 2] No such file or directory: 'run_02/figures'

### Check whether the GRNs reflect the perturbation

In [None]:
# Load the unperturbed GRN and, for every perturbation directory named after a TF,
# the element-wise ratio of the perturbed GRN to the unperturbed one.
# A small pseudocount (1e-6) is added to both tables so the division never hits an exact zero.
base_grn = pd.read_csv(unperturbed_path/"geff.csv", index_col=0)
base_grn += 0.000001
TFs = base_grn.columns
tf_names = set(TFs)  # built once instead of rebuilding a list on every iteration
pert_grns = dict()
# Sorted because Path.iterdir() yields entries in arbitrary order; this keeps the
# dict (and therefore the panel order of the plot built from it) deterministic.
# The loop variable is not called `dir`, which would shadow the builtin for the rest of the session.
for pert_dir in sorted(perturbed_path.iterdir()):
    if pert_dir.is_dir() and pert_dir.name in tf_names:
        pert_grn = pd.read_csv(pert_dir/"geff.csv", index_col=0)+0.000001
        pert_grns[str(pert_dir.name)]=pert_grn.div(base_grn)

In [None]:
# One bar chart per perturbed GRN: for each TF column, the column sum of the
# perturbed/unperturbed ratio table.
# The grid is sized from the number of perturbed GRNs actually loaded, so the saved
# figure has no empty leading panel; any leftover panel in the last row is hidden.
n_cols = 2
n_rows = max(1, -(-len(pert_grns) // n_cols))  # ceiling division, at least one row
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, squeeze=False)
ax = ax.flatten()
for panel, (tf, grn) in zip(ax, pert_grns.items()):
    panel.bar(x = TFs.to_list(),
              height = grn.sum(axis=0).to_list())
    panel.set_title(f"GRN with {tf} perturbed")
for unused_panel in ax[len(pert_grns):]:
    unused_panel.set_visible(False)

fig.tight_layout()
fig.savefig(figures_path/"perturbation_grn_bar.pdf")

### A look at the RNA data

In [None]:
# Load the unperturbed count table, transpose it and add a small pseudocount (1e-5) to every entry.
# NOTE(review): presumably the CSV is stored genes x cells, so the transpose gives cells x genes — confirm.
df = pd.read_csv(unperturbed_path/"counts_obs.csv", index_col=0).T + 0.00001 # add pseudocounts
# Display the column labels of the transposed table.
df.columns

In [None]:
# Wrap the unperturbed table in an AnnData object and tag every cell as unperturbed.
# The original also built a float32 csr_matrix from `df` that was never used
# (AnnData is constructed from the DataFrame itself), so that dead computation is dropped.
unp_adata = sc.AnnData(df)
unp_adata.var_names = df.columns.to_numpy()
# Per-cell annotations that mirror the ones set on the perturbed samples.
unp_adata.obs["perturbed"] = False
unp_adata.obs["Perturbation"] = "Not perturbed"
unp_adata.obs["target"] = np.nan  # no perturbation target for these cells

In [None]:
# Build one small AnnData per perturbation directory by sampling a fixed number of cells
# from its count table, then add the unperturbed AnnData under the key "unperturbed".
n_cells_per_perturbation = 5  # cells drawn (without replacement) from each perturbed dataset
pert_adatas = dict()
# Sorted because Path.iterdir() yields entries in arbitrary order: the draws below consume the
# global NumPy RNG (seeded in the configuration cell), so the directory order decides which
# cells each perturbation gets. Sorting makes the sample reproducible across machines.
# NOTE: re-running only this cell without re-seeding still draws a different sample.
for pert_dir in sorted(perturbed_path.iterdir()):
    if not pert_dir.is_dir():
        continue
    pert_name = str(pert_dir.name)
    # Same preprocessing as for the unperturbed table: transpose and add a 1e-5 pseudocount.
    df = pd.read_csv(pert_dir/"counts_obs.csv", index_col=0).T + 0.00001
    indexes = np.random.choice(df.index.to_numpy(), size=n_cells_per_perturbation, replace=False)
    df = df.loc[indexes, :]
    adata = sc.AnnData(df)
    # Suffix cell names with the perturbation name so they stay distinct after concatenation.
    adata.obs_names = df.index + f"_{pert_name}"
    adata.var_names = df.columns.to_numpy()
    adata.obs["perturbed"] = True
    adata.obs["Perturbation"] = "Perturbed"
    adata.obs["target"] = pert_name
    pert_adatas[pert_name] = adata
pert_adatas["unperturbed"] = unp_adata

In [None]:
# Stack the per-condition AnnData objects (perturbed samples plus the "unperturbed" entry) into one;
# label="dataset" asks sc.concat to record each cell's source key in an obs column of that name.
adata = sc.concat(pert_adatas, label="dataset")

In [None]:
# Annotate genes and cells with TF-related flags, using vectorized membership tests instead of
# rebuilding TFs.to_list() for every gene and every row.
tf_names = TFs.to_list()
# var["TF"]: True for genes whose (string) name is one of the TF names.
adata.var["TF"] = adata.var_names.astype(str).isin(tf_names)
# obs["TF_perturbed"]: True for perturbed cells whose perturbation target is a TF.
# Unperturbed cells carry NaN as target, which isin() treats as "not a TF".
adata.obs["TF_perturbed"] = (
    adata.obs["perturbed"].astype(bool) & adata.obs["target"].isin(tf_names)
).to_numpy()
# obs["TF_pert_cate"]: the same flag as a string label, used as a grouping key in the plots.
adata.obs["TF_pert_cate"] = adata.obs["TF_perturbed"].map(
    {True: "TF_perturbed", False: "no_TF_perturbed"}
)

In [None]:
# Plot scanpy's highest-expressed-genes overview for the combined data (before normalization).
sc.pl.highest_expr_genes(adata)

In [None]:
# Sanity check: every gene has a strictly positive value in exactly 5500 cells (column-wise count)
# and every cell has a strictly positive value in exactly 110 genes (row-wise count).
# NOTE(review): 5500 and 110 are hard-coded for this run's data — they must be updated when the
# run or the number of cells sampled per perturbation changes.
assert ((adata.X>0).sum(axis=0)==5500).all()
assert ((adata.X>0).sum(axis=1)==110).all()

In [None]:
# Histogram (20 bins) of the column sums of adata.X, i.e. the summed expression per gene.
plt.hist(adata.X.sum(axis=0), bins=20)


In [None]:
# Filtering is not expected to remove anything: the thresholds equal the per-cell gene count (110)
# and per-gene cell count (5500) that the preceding assertions verify for every cell and gene.
# NOTE(review): presumably kept because these calls annotate obs["n_genes"] / var["n_cells"],
# which the violin plots further down read — confirm.
sc.pp.filter_cells(adata, min_genes=110)
sc.pp.filter_genes(adata, min_cells=5500)

In [None]:
# Display the AnnData summary after filtering.
adata

In [None]:
# Violin plot of the per-cell "n_genes" annotation.
sc.pl.violin(
    adata,
    ["n_genes"],
    jitter=0.4,
    multi_panel=True,
)
# Violin plot of the per-gene "n_cells" annotation; the AnnData is transposed so that
# the gene annotations are looked up as observations.
sc.pl.violin(
    adata.T,
    ["n_cells"],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Normalize with scanpy's normalize_total using a target sum of 1e4 (adata is modified in place).
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
# Apply scanpy's log1p transform to the normalized data (in place).
sc.pp.log1p(adata)

In [None]:
# Flag the 50 most variable genes (stored in adata.var, e.g. the `highly_variable` column used below).
# Only n_top_genes is passed: scanpy ignores the mean/dispersion cutoffs (min_mean, max_mean,
# min_disp) whenever n_top_genes is set, so supplying both was misleading and has no effect
# on the selection.
sc.pp.highly_variable_genes(adata, n_top_genes=50)

In [None]:
# Plot scanpy's highly-variable-genes diagnostic for the full gene set.
sc.pl.highly_variable_genes(adata)

In [None]:
# Display the AnnData summary after the highly-variable-gene annotation.
adata

In [None]:
# Keep only genes that are highly variable or flagged as TFs.
# .copy() turns the subset into an independent AnnData rather than a view of the full object;
# later steps (scaling, PCA, neighbors, ...) modify `adata` in place, which on a view forces an
# implicit copy with a warning.
adata = adata[:, adata.var.highly_variable | adata.var.TF].copy()

In [None]:
# Same highly-variable-genes diagnostic, now for the reduced gene set.
sc.pl.highly_variable_genes(adata)

In [None]:
# Highest-expressed-genes overview for the reduced, normalized and log-transformed data.
sc.pl.highest_expr_genes(adata)

In [None]:
# Display the TF names (the column labels of the unperturbed GRN table).
TFs

In [None]:
# Display the per-cell "perturbed" flag.
adata.obs.perturbed

In [None]:
# Display the AnnData summary of the reduced dataset.
adata

In [None]:
# Violin plots of every TF's expression, grouped by whether the cell had a TF perturbed
# ("TF_pert_cate"); the figure is also saved via scanpy with the suffix "_TF_expression.pdf".
sc.pl.violin(adata,
             keys=TFs,
             groupby="TF_pert_cate",
             save="_TF_expression.pdf"
             )

In [None]:
# For each TF, compare its expression between the cells in which that TF was the
# perturbation target and all unperturbed cells.
for tf in TFs:
    targets_this_tf = pd.Series(adata.obs.target == tf)
    is_unperturbed = ~adata.obs.perturbed
    cells_to_plot = adata[targets_this_tf | is_unperturbed]
    sc.pl.violin(cells_to_plot, keys=tf, groupby="Perturbation")

In [None]:
# Scale the data with scanpy's scale, passing max_value=10 (in place), then display the summary.
sc.pp.scale(adata, max_value=10)
adata

In [None]:
# Run PCA on the scaled data using the "arpack" SVD solver.
sc.pp.pca(adata,svd_solver="arpack")

In [None]:
# PCA scatter colored by perturbed / not perturbed; saved via scanpy with the suffix "_perturbation.pdf".
sc.pl.pca(adata, color="Perturbation",
          save="_perturbation.pdf")

In [None]:
# PCA scatter colored by perturbation target (not saved).
sc.pl.pca(adata, color="target")

In [None]:
# Variance ratio per principal component, drawn on a log scale.
sc.pl.pca_variance_ratio(adata, log=True)

In [None]:
# Build the neighborhood graph with 10 neighbors on the first 40 principal components.
# NOTE(review): n_pcs=40 assumes the PCA above produced at least 40 components for the
# reduced gene set — confirm.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

In [None]:
# Compute the UMAP embedding (uses the neighborhood graph computed in the previous step).
sc.tl.umap(adata)

In [None]:
# UMAP colored by perturbation target; saved via scanpy with the suffix "_perturbation.pdf".
sc.pl.umap(adata, color="target",
           save="_perturbation.pdf")

In [None]:
# UMAP panels colored by the TF-perturbation category and by perturbed / not perturbed;
# saved via scanpy with the suffix "_TF_pert.pdf".
sc.pl.umap(adata, color=["TF_pert_cate", "Perturbation"],
           save="_TF_pert.pdf")

In [None]:
# UMAP panels: one per gene flagged as TF (colored by its expression), followed by the
# "Perturbation" and "TF_pert_cate" annotations; saved via scanpy with the suffix "_TFs.pdf".
sc.pl.umap(adata, color=adata.var_names[adata.var.TF].append(pd.Index(["Perturbation", "TF_pert_cate"])),
           save="_TFs.pdf")

In [None]:
# Leiden clustering on the neighborhood graph, using the igraph flavor with a fixed random
# state; resolution 0.9, two iterations, undirected graph.
sc.tl.leiden(
    adata,
    resolution=0.9,
    random_state=0,
    flavor="igraph",
    n_iterations=2,
    directed=False,
)

In [None]:
# UMAP panels colored by Leiden cluster and by the two perturbation annotations;
# saved via scanpy with the suffix "_leiden.pdf".
sc.pl.umap(adata, color=["leiden","Perturbation", "TF_pert_cate"],
           save="_leiden.pdf")

In [None]:
# Rank genes per Leiden cluster with a t-test, then plot the top 25 genes of each cluster
# (independent y-axes); saved via scanpy with the suffix "_leiden.pdf".
sc.tl.rank_genes_groups(adata, "leiden", method="t-test")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False,
                        save="_leiden.pdf")

In [None]:
# Show the five top-ranked gene names per group as a table.
pd.DataFrame(adata.uns["rank_genes_groups"]["names"]).head(5)

In [None]:
# Tabulate, for every group of the gene ranking, the ranked gene names ("<group>_n") next to
# their p-values ("<group>_p") and show the first five rows.
result = adata.uns["rank_genes_groups"]
groups = result["names"].dtype.names
ranking_columns = {}
for group in groups:
    for key in ("names", "pvals"):
        # key[:1] is "n" for names and "p" for pvals
        ranking_columns[f"{group}_{key[:1]}"] = result[key][group]
pd.DataFrame(ranking_columns).head(5)

In [None]:
# Dot plot of TF expression per Leiden cluster.
sc.pl.dotplot(adata, TFs, groupby="leiden")


In [None]:
# Stacked violin plot of TF expression per Leiden cluster.
sc.pl.stacked_violin(adata, TFs, groupby="leiden")

In [None]:
# Persist the processed AnnData below the run directory, with gzip compression.
# NOTE(review): despite the ".gz" suffix this is presumably written in AnnData's own on-disk
# format (the compression applies inside the file), not as a gzip archive — confirm how
# downstream code reads it before renaming.
adata.write(Path(run_id)/"processed_adata.gz", compression="gzip")

### Check whether other genes are differentially expressed in the perturbed scenario
1. Check whether the underlying GRNs differ

In [None]:
# Remove the unperturbed entry so pert_adatas only holds perturbed conditions.
# Guarded so the cell can be re-run: a bare `del` raises KeyError once the key is gone.
if "unperturbed" in pert_adatas:
    del pert_adatas["unperturbed"]

In [None]:
# Load the unperturbed GRN again and, for every perturbation directory (TF or not), the
# element-wise difference between the perturbed GRN and the unperturbed one.
# Note that this rebinds base_grn, TFs and pert_grns; pert_grns now holds differences
# rather than the ratios computed earlier in the notebook.
base_grn = pd.read_csv(unperturbed_path/"geff.csv", index_col=0)
base_grn += 0.000001
TFs = base_grn.columns
pert_grns = dict()
# Sorted because Path.iterdir() yields entries in arbitrary order; the loop variable is not
# called `dir`, which would shadow the builtin for the rest of the session.
for pert_dir in sorted(perturbed_path.iterdir()):
    if pert_dir.is_dir():
        # The same pseudocount is added to both tables, so it cancels in the subtraction.
        df = pd.read_csv(pert_dir/"geff.csv", index_col=0)+0.000001
        pert_grns[str(pert_dir.name)]=df.sub(base_grn)

In [None]:
# Check that the GRN is unchanged for every perturbation whose target is not a TF.
# Each difference table is tested entry by entry instead of via its grand total: a total of
# zero can also arise from positive and negative changes cancelling, and from NaNs (produced
# when row/column labels do not line up in the subtraction), which sum() silently skips.
# Here a NaN compares unequal to 0 and therefore counts as a difference.
tf_names = set(TFs)
changed_non_tf_targets = [
    target
    for target, grn_diff in pert_grns.items()
    if target not in tf_names and not (grn_diff.to_numpy() == 0).all()
]
assert not changed_non_tf_targets, (
    f"GRN differs from the unperturbed GRN for non-TF targets: {changed_non_tf_targets}"
)