In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import scanpy as sc 
from scipy.sparse import csr_matrix

In [12]:
# Identifier of the simulation run to analyse.
run_id = "run_04"
# NOTE(review): absolute, machine-specific location; adjust when running elsewhere.
data_path = Path("/data/toulouse/bicycle/notebooks/experiments/masking/data/scMultiSim_data")

figures_path = data_path/run_id/"figures"
# mkdir(exist_ok=True, parents=True) is already a no-op for an existing directory,
# so no separate exists() check is needed.
figures_path.mkdir(exist_ok=True, parents=True)
# From here on `run_id` is the run *directory* (a Path), not the string id;
# the following cells build file paths from it with the `/` operator.
run_id = data_path/run_id
# Figures saved through scanpy's `save=` argument go to the run's figures folder.
sc.settings.figdir = figures_path
# Seed NumPy's global random number generator.
np.random.seed(0)

In [13]:
# Read geff.csv from the run directory, using its first column as the row index.
# presumably this is the ground-truth gene-regulatory effect matrix — TODO confirm
grn = pd.read_csv(run_id / "geff.csv", index_col=0)
# The column labels of that table are used below as the transcription factor (TF) ids.
TFs = grn.columns

In [14]:
# Display the TF labels (the column labels of `grn`).
TFs

Index(['2', '6', '10', '19', '80', '91'], dtype='object')

### A look at the RNA data

In [15]:
# Load counts_obs.csv and transpose it; the index shown below (V1 … V5000) then labels the rows.
# A constant 0.00001 is added to every entry, so (assuming non-negative counts) no value is
# exactly zero afterwards.
df = pd.read_csv(run_id / "counts_obs.csv", index_col=0).T + 0.00001 # add pseudocounts
# Display the row labels of the transposed table.
df.index

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       ...
       'V4991', 'V4992', 'V4993', 'V4994', 'V4995', 'V4996', 'V4997', 'V4998',
       'V4999', 'V5000'],
      dtype='object', length=5000)

In [16]:
# NOTE(review): `counts` is not used by any visible cell; the AnnData below is built from the dense frame.
counts = csr_matrix(df.to_numpy(dtype=np.float32))
adata = sc.AnnData(df)
adata.var_names = df.columns.to_numpy(dtype=str)

# Row layout used by this notebook: the first 4400 rows are unperturbed, the remaining rows are perturbed.
n_cells = len(df)
n_unperturbed = 4400
n_perturbed = n_cells - n_unperturbed
is_perturbed = np.arange(n_cells) >= n_unperturbed

# "perturbed" for format_data function in evaluate
adata.obs["perturbed"] = is_perturbed
adata.obs["Perturbation"] = ["perturbed" if flag else "not_perturbed" for flag in is_perturbed]

# One block of consecutive perturbed rows per TF. Integer division keeps the repeat count
# an int (a float count is not a valid repeat argument in general).
targets = TFs.to_numpy().repeat(n_perturbed // len(TFs))
# Fail loudly instead of mis-assigning targets when the perturbed rows do not split evenly.
assert len(targets) == n_perturbed, "perturbed cells must divide evenly across the TFs"
print(len(targets))
# "target_genes" for format_data function in evaluate
# Unperturbed rows get NaN; perturbed rows get their TF in order (explicit, no negative-index trick).
adata.obs["target"] = [np.nan] * n_unperturbed + list(targets)



600




In [21]:
# write full adata for full bicycle run
# Give every cell a one-element list: [""] for the first 4400 rows, otherwise the
# matching entry of `targets` (indexed from the end, as the perturbed rows come last).
n_rows = len(df)
target_gene_lists = []
for n in range(n_rows):
    if n < 4400:
        target_gene_lists.append([""])
    else:
        target_gene_lists.append([targets[n - n_rows]])
adata.obs["target_genes"] = target_gene_lists
# Store the lists as their string representation (e.g. "['2']").
adata.obs["target_genes"] = adata.obs["target_genes"].map(str)
# Write the full AnnData for the full bicycle run.
adata.write_h5ad(run_id/"ready_full_rna.h5ad")

In [22]:
# List the distinct stringified target-gene labels held in adata.obs.
adata.obs.target_genes.unique()

['['']', '['2']', '['6']', '['10']', '['19']', '['80']', '['91']']
Categories (7, object): ['['2']', '['6']', '['10']', '['19']', '['80']', '['91']', '['']']

In [None]:
def string_to_list(string:str, to_type=int):
    """Parse the string form of a flat list back into a Python list.

    Accepts both unquoted (``"[1,2,3]"``) and quoted (``"['2']"``) items, i.e. the
    format produced by ``str(list_of_strings)``.

    Parameters
    ----------
    string : str
        Text that starts and ends with one bracket character and holds
        comma-separated items in between.
    to_type : callable, default int
        Conversion applied to every non-empty item.

    Returns
    -------
    list
        The converted items. Empty items are skipped, so ``"[]"`` and ``"['']"``
        both give ``[]``.

    Raises
    ------
    Whatever ``to_type`` raises for an item it cannot convert
    (e.g. ``ValueError`` for ``int("abc")``).
    """
    # Drop the surrounding brackets.
    inner = string.strip()[1:-1]
    items = []
    for token in inner.split(","):
        # Remove padding and the quotes that str(list_of_str) puts around each item.
        token = token.strip().strip("'\"")
        if token:
            items.append(to_type(token))
    return items

In [None]:
# Flag the genes whose name is one of the TF labels (vectorised membership test
# instead of rebuilding TFs.to_list() for every gene).
adata.var["TF"] = adata.var_names.astype(str).isin(TFs)
# A cell counts as TF-perturbed when it is perturbed and its target is one of the TFs.
# NaN targets are never members, so unperturbed cells stay False.
adata.obs["TF_perturbed"] = (
    adata.obs["perturbed"].to_numpy(dtype=bool) & adata.obs["target"].isin(TFs).to_numpy()
)
# Categorical-style label of the flag, used as a grouping/colour key in the plots.
adata.obs["TF_pert_cate"] = adata.obs["TF_perturbed"].map(lambda x: "TF_perturbed" if x else "no_TF_perturbed")

### Preprocessing

In [None]:

# Plot the most highly expressed genes with scanpy's default settings.
sc.pl.highest_expr_genes(adata)

In [None]:
# Sanity check: every gene has a positive value in 5000 cells and every cell has
# a positive value in 110 genes.
is_positive = adata.X > 0
cells_per_gene = is_positive.sum(axis=0)
genes_per_cell = is_positive.sum(axis=1)
assert (cells_per_gene == 5000).all()
assert (genes_per_cell == 110).all()

In [None]:
# Histogram (20 bins) of the column sums of adata.X, i.e. one total per gene.
plt.hist(adata.X.sum(axis=0), bins=20)


In [None]:
# Filtering is not expected to remove anything here, as all genes and cells have the
# same number of detected entries (110 genes per cell, 5000 cells per gene).
# NOTE(review): presumably the calls are kept because they add the `n_genes` / `n_cells`
# annotations that the violin plots further down read — confirm.
sc.pp.filter_cells(adata, min_genes=110)
sc.pp.filter_genes(adata, min_cells=5000)

In [None]:
# Display the AnnData summary after the filtering calls.
adata

In [None]:
# Violin plot of the per-cell `n_genes` annotation.
sc.pl.violin(
    adata,
    ["n_genes"],
    jitter=0.4,
    multi_panel=True,
)
# Same plot for the per-gene `n_cells` annotation; the AnnData is transposed so that
# genes become the observations being plotted.
sc.pl.violin(
    adata.T,
    ["n_cells"],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Scale every cell to a total of 1e4 (the return value is unused; adata is updated).
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
# Apply the log(1 + x) transform to the normalised values.
sc.pp.log1p(adata)

In [None]:
# Select the 50 most variable genes. With `n_top_genes` set, scanpy ignores the
# mean/dispersion cutoffs (min_mean, max_mean, min_disp), so they are not passed.
sc.pp.highly_variable_genes(adata, n_top_genes=50)

In [None]:
# Plot the highly-variable-gene selection computed above.
sc.pl.highly_variable_genes(adata)

In [None]:
# Display the AnnData summary, now including the highly-variable-gene annotations.
adata

In [None]:
# Keep only genes that are highly variable or flagged as TF.
# .copy() turns the sliced view into an independent AnnData, so the in-place steps
# that follow (scaling, PCA, ...) do not operate on a view of the full object.
adata = adata[:, adata.var.highly_variable | adata.var.TF].copy()

In [None]:
# Display the names of the genes that remain after subsetting.
adata.var_names

In [None]:
# Re-plot the highly-variable-gene statistics for the retained genes only.
sc.pl.highly_variable_genes(adata)

In [None]:
# Plot the most highly expressed genes among the retained genes.
sc.pl.highest_expr_genes(adata)

In [None]:
# Violin plots of every TF's expression, split by whether the cell had a TF perturbed.
# `save` writes the figure into sc.settings.figdir using this filename suffix.
sc.pl.violin(adata,
             keys=TFs,
             groupby="TF_pert_cate",
             save="_TF_expression.pdf"
             )

In [None]:
# For each TF, compare its expression between the cells targeting that TF and all
# unperturbed cells.
not_perturbed = ~adata.obs.perturbed
for tf in TFs:
    selected_cells = (adata.obs.target == tf) | not_perturbed
    sc.pl.violin(adata[selected_cells], keys=tf, groupby="Perturbation")

In [None]:
# Scale the genes, with values capped at max_value=10.
sc.pp.scale(adata, max_value=10)
# Display the AnnData summary after scaling.
adata

In [None]:
# Principal component analysis using the ARPACK SVD solver.
sc.pp.pca(adata,svd_solver="arpack")

In [None]:
# PCA scatter coloured by perturbation status; saved into sc.settings.figdir with this suffix.
sc.pl.pca(adata, color="Perturbation",
          save="_perturbation.pdf")

In [None]:
# PCA scatter coloured by the perturbed target (NaN for unperturbed cells).
sc.pl.pca(adata, color="target")

In [None]:
# Variance ratio explained by each principal component, on a log scale.
sc.pl.pca_variance_ratio(adata, log=True)

In [None]:
# Build the neighbourhood graph from the first 40 principal components, 10 neighbours per cell.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

In [None]:
# Compute the UMAP embedding used by the plots below.
sc.tl.umap(adata)

In [None]:
# UMAP coloured by perturbed target; saved into sc.settings.figdir with this suffix.
sc.pl.umap(adata, color="target",
           save="_perturbation.pdf")

In [None]:
# UMAP panels coloured by the TF-perturbation label and by perturbation status.
sc.pl.umap(adata, color=["TF_pert_cate", "Perturbation"],
           save="_TF_pert.pdf")

In [None]:
# One UMAP panel per TF gene (coloured by its expression), followed by the two
# perturbation annotations.
tf_gene_names = list(adata.var_names[adata.var.TF])
umap_colors = tf_gene_names + ["Perturbation", "TF_pert_cate"]
sc.pl.umap(adata, color=umap_colors, save="_TFs.pdf")

In [None]:
# Leiden clustering (igraph implementation) with a fixed seed, 2 iterations, on the
# graph treated as undirected. The cluster labels are read back below as "leiden".
sc.tl.leiden(
    adata,
    resolution=0.9,
    random_state=0,
    flavor="igraph",
    n_iterations=2,
    directed=False,
)

In [None]:
# UMAP panels comparing the Leiden clusters with the two perturbation annotations.
sc.pl.umap(adata, color=["leiden","Perturbation", "TF_pert_cate"],
           save="_leiden.pdf")

In [None]:
# Rank genes for each Leiden cluster with a t-test, then plot the top 25 genes per
# cluster (independent y-axes) and save the figure.
sc.tl.rank_genes_groups(adata, "leiden", method="t-test")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False,
                        save="_leiden.pdf")

In [None]:
# Top 5 ranked gene names per cluster, one column per cluster.
pd.DataFrame(adata.uns["rank_genes_groups"]["names"]).head(5)

In [None]:
# Ranking results stored in adata.uns under "rank_genes_groups".
result = adata.uns["rank_genes_groups"]
# The field names of the structured "names" array are the cluster labels.
groups = result["names"].dtype.names
# Table with two columns per cluster: "<cluster>_n" (gene names) and "<cluster>_p"
# (p-values); key[:1] is the first letter of "names" / "pvals".
pd.DataFrame(
    {
        f"{group}_{key[:1]}": result[key][group]
        for group in groups
        for key in ["names", "pvals"]
    }
).head(5)

In [None]:
# Dot plot of TF expression per Leiden cluster.
sc.pl.dotplot(adata, TFs, groupby="leiden")


In [None]:
# Stacked violin plot of TF expression per Leiden cluster.
sc.pl.stacked_violin(adata, TFs, groupby="leiden")

In [None]:
# Persist the processed AnnData, gzip-compressed, in the run directory
# (`run_id` already is a Path, so it is used directly).
adata.write(run_id/"processed_rna.gz", compression="gzip")
# Also dump the bare expression matrix as CSV.
# NOTE(review): the frame is built from adata.X alone, so the CSV carries default
# integer row/column labels rather than cell and gene names.
df = pd.DataFrame(adata.X)
df.to_csv(run_id/"processed_rna.csv")

In [None]:
# Display the final AnnData summary.
adata

In [None]:
# Integer ids of the highly variable genes.
hvgs = adata.var_names[adata.var.highly_variable].astype(int)
# Genes to keep: highly variable or TF. The ids are taken in adata.var_names order
# rather than from a set, whose iteration order is an implementation detail, so they
# line up with the gene columns of the processed expression matrix.
keep_mask = (adata.var.highly_variable | adata.var.TF).to_numpy(dtype=bool)
filtered_genes = adata.var_names[keep_mask].astype(int).to_numpy()
# Restrict the GRN rows to the kept genes, in that same order.
grn = grn.loc[filtered_genes, :]

In [None]:
# Store the row-filtered GRN and the ids of the retained genes in the run directory.
grn.to_csv(run_id/"filtered_grn.csv")
np.save(run_id/"filtered_genes.npy", filtered_genes)