In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import snapatac2 as sn
from pathlib import Path
import numpy as np
import scipy.stats as stats
import pickle
import time
import timeit
from scipy.sparse import csr_matrix

# Seed NumPy's legacy global RNG so np.random-based steps are repeatable.
# NOTE(review): steps that take their own `random_state` (e.g. the Leiden
# call further down) are presumably not governed by this seed - confirm.
np.random.seed(1)

In [2]:
# Root folder of the masking experiment; the simulated run lives below it.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
data_root = Path("/data/toulouse/bicycle/notebooks/experiments/masking/data")

# Folder of the scMultiSim run analysed in this notebook.
run_id = data_root / "scMultiSim_data" / "run_04"
# General output folder (same as the experiment root).
out_path = data_root

In [3]:
# ATAC counts are transposed on load: the rows of `atac_base` become the
# observations of the AnnData object built from it later on.
atac_base = pd.read_csv(run_id / "atac_counts.csv", index_col=0).transpose()

# The remaining tables are read as-is, with their first column as the index.
grn, region_to_gene, region_to_tf = (
    pd.read_csv(run_id / file_name, index_col=0)
    for file_name in ("geff.csv", "region_to_gene.csv", "region_to_tf.csv")
)

In [4]:
# Layout of the simulated cells: the first N_CONTROL_CELLS rows are
# unperturbed, the last N_PERTURBED_CELLS rows are perturbed, with the
# perturbed block split evenly across the TFs (in TF order).
N_CONTROL_CELLS = 4400
N_PERTURBED_CELLS = 600

TFs = grn.columns  # TF identifiers are the column labels of geff.csv
counts = csr_matrix(atac_base.to_numpy(dtype=np.float32))  # sparse float32 copy of the counts

adata = sc.AnnData(atac_base)
adata.var_names = atac_base.columns.to_numpy(dtype=str)

# Fail loudly if the data does not match the assumed layout. The labels
# below are assigned purely by row position, so a mismatch would otherwise
# mislabel cells silently or fail with an obscure IndexError.
if adata.n_obs != N_CONTROL_CELLS + N_PERTURBED_CELLS:
    raise ValueError(
        f"expected {N_CONTROL_CELLS + N_PERTURBED_CELLS} cells, found {adata.n_obs}"
    )
if len(TFs) == 0 or N_PERTURBED_CELLS % len(TFs) != 0:
    raise ValueError(
        f"{N_PERTURBED_CELLS} perturbed cells cannot be split evenly across {len(TFs)} TFs"
    )

is_perturbed = np.arange(adata.n_obs) >= N_CONTROL_CELLS
adata.obs["perturbed"] = is_perturbed
adata.obs["Perturbation"] = ["perturbed" if flag else "not_perturbed" for flag in is_perturbed]

# Integer repeat count: the original passed the float 600/len(TFs) and
# relied on an implicit float -> int conversion.
targets = TFs.to_numpy().repeat(N_PERTURBED_CELLS // len(TFs))
print(len(targets))

# Control cells have no target (NaN); perturbed cells take the targets in order.
adata.obs["target"] = [np.nan] * N_CONTROL_CELLS + targets.tolist()

600




In [5]:
# write full adata for full bicycle run
# `target_genes` holds the *string form* of a one-element list per cell:
# "['']" for the first 4400 (unperturbed) rows, "['<TF>']" for the rest.
n_cells = len(atac_base)
target_gene_labels = []
for n in range(n_cells):
    # Negative index: the last len(targets) rows map onto `targets` in order.
    genes = [""] if n < 4400 else [targets[n - n_cells]]
    target_gene_labels.append(str(genes))
adata.obs["target_genes"] = target_gene_labels

adata.write_h5ad(run_id / "ready_full_atac.h5ad")

... storing 'Perturbation' as categorical
... storing 'target' as categorical
... storing 'target_genes' as categorical


In [9]:
# Sanity check: (number of observations, number of variables).
adata.shape

(5000, 330)

In [None]:
# A cell is "TF-perturbed" when it is perturbed AND its target is one of the
# TFs. Vectorised replacement for a per-row iterrows() loop that rebuilt the
# TF list for every cell; NaN targets (unperturbed cells) never match.
adata.obs["TF_perturbed"] = (
    adata.obs["perturbed"].astype(bool) & adata.obs["target"].isin(TFs.to_list())
)

# Human-readable label of the same flag, e.g. for colouring plots.
adata.obs["TF_pert_cate"] = adata.obs["TF_perturbed"].map(
    {True: "TF_perturbed", False: "no_TF_perturbed"}
)

In [None]:
# scanpy's "highest expressed" plot; the variables of `adata` are ATAC
# regions here, so this shows the regions carrying the most counts.
sc.pl.highest_expr_genes(adata)

In [None]:
# Distribution of the total count per variable (column sums of X).
# asarray + ravel keeps the input 1-D even when the sum comes back as a
# 2-D matrix (as it does for sparse X).
region_totals = np.asarray(adata.X.sum(axis=0)).ravel()

fig, ax = plt.subplots()
ax.hist(region_totals, bins=50)
ax.set_xticks(np.arange(0, 160000, 5000))
ax.tick_params(axis="x", labelrotation=90)  # 32 tick labels overlap when horizontal
ax.set_xlabel("total counts per region")
ax.set_ylabel("number of regions")
ax.set_title("Total counts per region")
plt.show()

In [None]:
# filtering not necessary as all genes and cells have the same amount
# With thresholds of 0 nothing is removed; the calls are presumably kept
# because they add the `n_genes` (obs) and `n_cells` (var) columns that the
# violin plots below read.
sc.pp.filter_cells(adata, min_genes=0)
sc.pp.filter_genes(adata, min_cells=0)

In [None]:
# Display the AnnData summary to inspect the obs/var annotations added so far.
adata

In [None]:
# Shared styling for both QC violin plots.
violin_style = dict(jitter=0.4, multi_panel=True)

# Per cell: value of the `n_genes` obs column.
sc.pl.violin(adata, ["n_genes"], **violin_style)

# Per variable: `n_cells` lives in `adata.var`, so plot on the transposed
# object, where var annotations become obs annotations.
sc.pl.violin(adata.T, ["n_cells"], **violin_style)

In [None]:
# Minimum value of `n_genes` a cell needs in order to be kept.
# NOTE(review): this removes rows, so any later logic that relies on fixed
# row positions (e.g. the 4400/600 split) no longer applies after this cell.
MIN_FEATURES_PER_CELL = 170

sc.pp.filter_cells(adata, min_genes=MIN_FEATURES_PER_CELL)

# Re-plot the per-cell distribution after filtering.
violin_options = {"jitter": 0.4, "multi_panel": True}
sc.pl.violin(adata, ["n_genes"], **violin_options)

In [None]:
# Normalise every cell to the same total count (target_sum=1e4), in place.
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
# Log-transform the normalised values (log(1 + x)), in place.
sc.pp.log1p(adata)

In [None]:
# Flag the 50 most variable variables (ATAC regions) in adata.var.highly_variable.
# Only `n_top_genes` is passed: per the scanpy documentation the
# min_mean / max_mean / min_disp cutoffs are ignored whenever n_top_genes is
# set, so supplying them was misleading and had no effect on the selection.
sc.pp.highly_variable_genes(adata, n_top_genes=50)

In [None]:
# Visual check of the highly-variable selection across all variables.
sc.pl.highly_variable_genes(adata)

In [None]:
# Display the AnnData summary, now including the highly-variable annotations.
adata

In [None]:
# Keep only the highly variable regions. `.copy()` materialises the subset:
# plain slicing returns a view, and the in-place steps that follow (scale,
# pca, neighbors, ...) should operate on a real object rather than on a view.
# NOTE: this rebinds `adata`; re-run the notebook from the top to get the
# full object back.
adata = adata[:, adata.var.highly_variable].copy()

In [None]:
# Names of the variables (regions) that survived the highly-variable subset.
adata.var_names

In [None]:
# Same diagnostic plot, now restricted to the retained variables.
sc.pl.highly_variable_genes(adata)

In [None]:
# Highest-expressed plot on the normalised, log-transformed, subset data.
sc.pl.highest_expr_genes(adata)

In [None]:
#for tf in TFs:
#    sc.pl.violin(
#        adata[pd.Series(adata.obs.target == tf) | ~adata.obs.perturbed],
#        groupby="Perturbation",
#        
#    )

In [None]:
# Scale each variable in place; max_value=10 caps the scaled values at 10.
sc.pp.scale(adata, max_value=10)
# Show the object to see what the scaling step added.
adata

In [None]:
# Principal component analysis on the scaled matrix, using the ARPACK solver.
sc.pp.pca(adata,svd_solver="arpack")

In [None]:
# PCA scatter coloured by perturbation status. `save` is a filename suffix;
# scanpy presumably prepends the plot type and writes into its figure
# directory - confirm the resulting path.
sc.pl.pca(adata, color="Perturbation",
          save="_atac_perturbation.pdf")

In [None]:
# PCA scatter coloured by the perturbed TF (cells without a target have NaN).
sc.pl.pca(adata, color="target")

In [None]:
# Variance explained per principal component, on a log scale.
sc.pl.pca_variance_ratio(adata, log=True)

In [None]:
# Neighbourhood graph: 10 neighbours per cell, computed on the first 10 PCs.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=10)

In [None]:
# UMAP embedding; requires the neighbourhood graph from sc.pp.neighbors.
sc.tl.umap(adata)

In [None]:
# UMAP coloured by the perturbed TF.
# NOTE(review): the plot is coloured by "target" but saved with the
# "_atac_perturbation" suffix - confirm the filename is intended.
sc.pl.umap(adata, color="target",
           save="_atac_perturbation.pdf")

In [None]:
# Leiden clustering on the neighbourhood graph; the labels are read back
# from the "leiden" obs column by the cells below. random_state is fixed so
# the clustering is repeatable.
sc.tl.leiden(
    adata,
    resolution=0.9,
    random_state=0,
    flavor="igraph",
    n_iterations=2,
    directed=False,
)

In [None]:
# Three UMAP panels: Leiden clusters, perturbation status and the
# TF-perturbation label, to compare the clustering with the known labels.
sc.pl.umap(adata, color=["leiden","Perturbation", "TF_pert_cate"],
           save="_atac_leiden.pdf")

In [None]:
# Rank variables (regions) per Leiden cluster with a t-test; the results are
# stored in adata.uns["rank_genes_groups"], which the next cells read.
# NOTE(review): adata.X has been through sc.pp.scale by this point, so the
# test runs on scaled values - confirm this is intended.
sc.tl.rank_genes_groups(adata, "leiden", method="t-test")
# Top 25 ranked variables per cluster, each panel with its own y-axis.
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False,
                        save="_atac_leiden.pdf")

In [None]:
# Top 5 ranked variable names, one column per group.
pd.DataFrame(adata.uns["rank_genes_groups"]["names"]).head(5)

In [None]:
# Side-by-side table of ranked names and p-values for every group.
result = adata.uns["rank_genes_groups"]
groups = result["names"].dtype.names  # group labels are the record-array field names

# Column "<group>_n" holds the names, "<group>_p" the p-values
# (suffix = first letter of the result key).
ranking_columns = {}
for group in groups:
    for key in ("names", "pvals"):
        ranking_columns[f"{group}_{key[:1]}"] = result[key][group]

pd.DataFrame(ranking_columns).head(5)

In [None]:
# Observation names of the cells flagged as perturbed that are still present.
adata.obs[adata.obs.perturbed].index

In [None]:
# Persist the processed AnnData object.
# NOTE(review): AnnData.write presumably produces an HDF5 (h5ad) file with
# gzip-compressed datasets, not a gzip archive, despite the ".gz" suffix -
# confirm that downstream readers expect this.
processed_path = run_id / "processed_atac.gz"
adata.write(processed_path, compression="gzip")

# Also export the processed matrix as CSV, transposed relative to adata.X.
# The frame is built from the bare matrix, so its row and column labels are
# positional integers, not cell or region names.
df = pd.DataFrame(adata.X).transpose()
df.to_csv(run_id / "processed_atac.csv")

In [None]:
# Array read from filtered_genes.npy in the run folder (not produced in the
# cells shown here).
# SECURITY: allow_pickle=True can execute arbitrary code while loading; only
# use it on files you produced yourself.
filtered_genes = np.load(run_id / "filtered_genes.npy", allow_pickle=True)


In [None]:
# Region identifiers retained in `adata`, converted to int so they can be
# used as row labels of the region maps.
hv_mask = adata.var["highly_variable"]
filtered_regions = adata.var_names[hv_mask].astype(int)

# Restrict region -> gene to the retained regions (rows) and the filtered
# genes (columns). `filtered_genes - 1` presumably converts 1-based gene
# numbers to 0-based column positions - TODO confirm.
# NOTE: `region_to_gene` is rebound here, so re-running this cell without
# reloading the CSV would subset the columns a second time.
region_to_gene = region_to_gene.loc[filtered_regions, :].iloc[:, filtered_genes - 1]
region_to_gene.to_csv(run_id / "filtered_region_to_gene.csv")

# Restrict region -> TF to the retained regions; all TF columns are kept.
region_to_tf = region_to_tf.loc[filtered_regions, :]
region_to_tf.to_csv(run_id / "filtered_region_to_tf.csv")