# SCENIC gene regulatory network (GRN) inference on Tabula Muris (TM) Smart-seq2 (SS2) data

## Imports

In [3]:
import os
import glob
import pickle
import pandas as pd
import numpy as np

from dask.diagnostics import ProgressBar

from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies, load_motifs
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

import seaborn as sns
import scanpy
import scanpy.api as sc

## Load all pseudobulk mouse atlases

In [4]:
# Read the three pseudobulk AnnData files (two under TM/, one under MCA/),
# in the same order as the names they are bound to.
droplet, facs, mca = (
    scanpy.read_h5ad(h5ad_path)
    for h5ad_path in (
        '/work/sduknn/Andreas/TM_MCA/TM/droplet_pseudobulk.h5ad',
        '/work/sduknn/Andreas/TM_MCA/TM/facs_pseudobulk.h5ad',
        '/work/sduknn/Andreas/TM_MCA/MCA/mca_pseudobulk.h5ad',
    )
)

In [5]:
# Only use genes that are present in all three datasets.
def _shared_gene_set(*adatas):
    """Return the set of gene names (``var_names``) common to every dataset."""
    return set.intersection(*(set(adata.var_names.values) for adata in adatas))


def _subset_to_genes(genes, *adatas):
    """Return a copy of each dataset restricted to ``genes``, in that order.

    Real copies (not views) are returned so that later in-place operations
    act on the objects themselves instead of triggering an implicit copy.
    """
    return tuple(adata[:, genes].copy() for adata in adatas)


def _drop_rarely_detected_genes(*adatas):
    """Filter each dataset in place, with ``min_cells`` set to 10% of its observations."""
    for adata in adatas:
        sc.pp.filter_genes(adata, min_cells=(len(adata.obs_names) / 10))


overlap = _shared_gene_set(droplet, facs, mca)
# Iteration order of a set of strings is arbitrary and can differ between
# interpreter runs, so sort the genes to get a reproducible column order.
droplet, facs, mca = _subset_to_genes(sorted(overlap), droplet, facs, mca)

_drop_rarely_detected_genes(droplet, facs, mca)

# The filter is applied per dataset and may drop different genes from each
# one, so intersect again to keep the three gene sets identical.
overlap = _shared_gene_set(droplet, facs, mca)
droplet, facs, mca = _subset_to_genes(sorted(overlap), droplet, facs, mca)

In [6]:
# Library-size normalisation, applied to the FACS dataset only
# (droplet and mca are not touched in this cell).
sc.pp.normalize_per_cell(facs, counts_per_cell_after=1e4)

# Log-transform the normalised FACS values.
sc.pp.log1p(facs)

## Settings for SCENIC 

In [12]:
# Settings
DATA_FOLDER = "/work/sduknn/Andreas/TM_MCA/database_SCENIC/tmp/facs"
RESOURCES_FOLDER = "/work/sduknn/Andreas/TM_MCA/database_SCENIC/resources"
DATABASE_FOLDER = "/work/sduknn/Andreas/TM_MCA/database_SCENIC/databases/"
# For clusters; probably won't have to be used.
SCHEDULER = "123.122.8.24:8786"

DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "mm9-*.feather")
MOTIF_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDER, "motifs-v9-nr.mgi-m0.001-o0.0.tbl")
# Written as a literal absolute path: the TF list lives outside
# RESOURCES_FOLDER, and os.path.join() discards every earlier component when
# a later one is absolute, so joining it onto RESOURCES_FOLDER was misleading.
MM_TFS_FNAME = "/work/sduknn/Andreas/TM_MCA/database_SCENIC/making_TF_file/mm_tfs.txt"
#SC_EXP_FNAME = os.path.join(RESOURCES_FOLDER, "PATH.txt")
REGULONS_FNAME = os.path.join(DATA_FOLDER, "regulons_10_percent.p")
MOTIFS_FNAME = os.path.join(DATA_FOLDER, "motifs_10_percent.csv")

## Expression matrix as a pandas DataFrame

In [13]:
# Take the FACS expression matrix as stored (no transpose is applied here)
# and display its shape as a sanity check.
ex_matrix_facs = facs.X
ex_matrix_facs.shape

(981, 11245)

In [14]:
# Build the expression DataFrame: rows are labelled with the observation
# names of `facs`, columns with its gene names.
# The matrix is read from facs.X directly so this cell can be re-run safely
# (ex_matrix_facs is overwritten below), and it is densified only when it
# actually is a sparse matrix.
facs_values = facs.X
if hasattr(facs_values, "toarray"):
    facs_values = facs_values.toarray()
ex_matrix_facs = pd.DataFrame(
    data=np.asarray(facs_values),
    # A plain 1-D array of labels; wrapping it in a DataFrame is not a valid index.
    index=facs.obs.index.values,
    columns=facs.var.index.values,
)

## Load databases

In [16]:
# Load the transcription factor (TF) names from the file at MM_TFS_FNAME.
tf_names = load_tf_names(MM_TFS_FNAME)

In [17]:
# Load the ranking databases.
# glob.glob() returns paths in arbitrary order, so sort them to keep the
# contents of `dbs` reproducible between runs and machines.
db_fnames = sorted(glob.glob(DATABASES_GLOB))


def name(fname):
    """Return the database name for a file: its basename up to the first dot."""
    return os.path.basename(fname).split(".")[0]


dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
dbs

# Select the databases used for pruning by name instead of by list position,
# which silently depends on the order in which the files were listed.
SELECTED_DB_NAMES = ("mm9-500bp-upstream-7species", "mm9-tss-centered-10kb-7species")
db_by_name = {}
for db_fname, db in zip(db_fnames, dbs):
    # Keep the first file found for each name.
    db_by_name.setdefault(name(db_fname), db)

missing_db_names = [db_name for db_name in SELECTED_DB_NAMES if db_name not in db_by_name]
if missing_db_names:
    raise FileNotFoundError(
        "Ranking database(s) not found via {}: {}".format(DATABASES_GLOB, ", ".join(missing_db_names))
    )

dbs2 = [db_by_name[db_name] for db_name in SELECTED_DB_NAMES]
dbs2

[FeatherRankingDatabase(name="mm9-500bp-upstream-7species"),
 FeatherRankingDatabase(name="mm9-tss-centered-10kb-7species")]

## Set up a Dask cluster on Slurm

In [18]:
from distributed import LocalCluster, Client
from dask_jobqueue import SLURMCluster

In [19]:
# Describe the SLURM jobs that will run the dask workers.
# NOTE: despite its name, `custom_client` is the SLURMCluster object, not a
# dask Client. The resource arguments (cores, memory, processes, walltime)
# are presumably per submitted job - confirm against the dask_jobqueue docs.
custom_client = SLURMCluster(project='xxxxx', cores=24, walltime='02:00:00', memory='50GB',processes=12)

In [20]:
# Request a scale of 120 from the SLURM cluster (presumably 120 workers -
# the unit depends on the installed dask_jobqueue version; confirm), then
# connect a dask Client to that cluster.
custom_client.scale(120)
client = Client(custom_client)

## Co-expression inference using GRNBoost2

In [29]:
%%time
# Run GRNBoost2 on the FACS expression DataFrame, restricted to the loaded TF
# names, using the externally created dask client.
adjacencies_facs = grnboost2(ex_matrix_facs, tf_names=tf_names, verbose=True,  client_or_address=client)

# Derive modules from the adjacencies; list() collects them so they can be
# passed on to the pruning step as a list.
modules_facs = list(modules_from_adjacencies(adjacencies_facs, ex_matrix_facs))

preparing dask client
parsing input
creating dask graph


  expression_matrix = expression_data.as_matrix()


120 partitions
computing dask graph
not shutting down client, client was created externally
finished
CPU times: user 6min 2s, sys: 11.4 s, total: 6min 13s
Wall time: 3min 18s


## Pruning regulons using RCisTarget

In [None]:
# Prune the modules against the selected ranking databases (dbs2) with the
# motif annotation file, running on the same dask client.
df_facs = prune2df(dbs2, modules_facs, MOTIF_ANNOTATIONS_FNAME,  client_or_address=client)

In [None]:
# Create regulons from the table of enriched motifs (df_facs) returned by prune2df.
regulons_facs = df2regulons(df_facs)

In [None]:
# Save the enriched motifs and the discovered regulons to disk.
# Create the output folder(s) first so that a missing directory cannot throw
# away the results of the long-running steps that produced them.
for out_fname in (MOTIFS_FNAME, REGULONS_FNAME):
    out_dir = os.path.dirname(out_fname)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

df_facs.to_csv(MOTIFS_FNAME)
with open(REGULONS_FNAME, "wb") as f:
    pickle.dump(regulons_facs, f)

In [None]:
# Disconnect the dask client before shutting down the SLURM cluster it is
# attached to, so the client is not left pointing at a scheduler that is gone.
client.close()
custom_client.close()