# SCENIC GRN inference on MCA

## Imports

In [None]:
import os
import glob
import pickle
import pandas as pd
import numpy as np

from dask.diagnostics import ProgressBar

from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies, load_motifs
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

import seaborn as sns
import scanpy
import scanpy.api as sc

## Load all pseudobulk mouse atlases

In [None]:
# Read the droplet, FACS and MCA pseudobulk AnnData files, in that order.
droplet, facs, mca = (
    scanpy.read_h5ad(h5ad_path)
    for h5ad_path in (
        '/work/sduknn/Andreas/TM_MCA/TM/droplet_pseudobulk.h5ad',
        '/work/sduknn/Andreas/TM_MCA/TM/facs_pseudobulk.h5ad',
        '/work/sduknn/Andreas/TM_MCA/MCA/mca_pseudobulk.h5ad',
    )
)

In [None]:
# Restrict the three atlases to the genes they share, drop genes seen in fewer
# than 10% of each atlas' observations, then intersect again so that all three
# objects end up with the same gene set.
#
# `overlap` is kept as a set, but it is sorted before being used as an index:
# the iteration order of a set of strings is not stable between interpreter
# sessions, so indexing with list(overlap) made the gene (column) order differ
# from run to run.
overlap = set(droplet.var_names.values) & set(facs.var_names.values) & set(mca.var_names.values)

# .copy() materialises each subset so that the in-place filter_genes calls
# below operate on standalone objects rather than on views of the originals.
droplet = droplet[:, sorted(overlap)].copy()
facs = facs[:, sorted(overlap)].copy()
mca = mca[:, sorted(overlap)].copy()

# Keep genes detected in at least a tenth of the observations of each atlas.
sc.pp.filter_genes(droplet, min_cells=(len(droplet.obs_names) / 10))
sc.pp.filter_genes(facs, min_cells=(len(facs.obs_names) / 10))
sc.pp.filter_genes(mca, min_cells=(len(mca.obs_names) / 10))

# Each atlas may have dropped different genes, so intersect once more.
overlap = set(droplet.var_names.values) & set(facs.var_names.values) & set(mca.var_names.values)

droplet = droplet[:, sorted(overlap)].copy()
facs = facs[:, sorted(overlap)].copy()
mca = mca[:, sorted(overlap)].copy()

In [None]:
# Library-size normalization of the MCA atlas, with counts_per_cell_after=1e4
# as the target total per observation.
# Only `mca` is normalized in this section; `droplet` and `facs` are not.
sc.pp.normalize_per_cell(mca, counts_per_cell_after=1e4)

# Log-transform the normalized values in place (log1p).
sc.pp.log1p(mca)

## Settings for SCENIC 

In [None]:
# Settings: filesystem locations used by the SCENIC steps in this notebook.
DATA_FOLDER = "/work/sduknn/Andreas/TM_MCA/database_SCENIC/tmp/mca"
RESOURCES_FOLDER = "/work/sduknn/Andreas/TM_MCA/database_SCENIC/resources"
DATABASE_FOLDER = "/work/sduknn/Andreas/TM_MCA/database_SCENIC/databases/"
# Address of an external dask scheduler for cluster runs; probably not needed.
SCHEDULER = "123.122.8.24:8786"

# Glob pattern matching the mm9 feather ranking databases.
DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "mm9-*.feather")
MOTIF_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDER, "motifs-v9-nr.mgi-m0.001-o0.0.tbl")
# The TF list does not live under RESOURCES_FOLDER. It used to be written as
# os.path.join(RESOURCES_FOLDER, "/absolute/path"), but os.path.join restarts
# at an absolute component and silently drops RESOURCES_FOLDER, so the path is
# spelled out directly to avoid suggesting otherwise.
MM_TFS_FNAME = '/work/sduknn/Andreas/TM_MCA/database_SCENIC/making_TF_file/mm_tfs.txt'
# Output files: pickled regulons and the enriched-motif table.
REGULONS_FNAME = os.path.join(DATA_FOLDER, "regulons_10_percent.p")
MOTIFS_FNAME = os.path.join(DATA_FOLDER, "motifs_10_percent.csv")

## Expression matrix as a pandas DataFrame

In [None]:
# Take the expression matrix from the AnnData object and display its shape.
# NOTE(review): the previous comment mentioned a transpose, but no transpose
# is performed here; mca.X is used as stored.
ex_matrix_mca = mca.X
ex_matrix_mca.shape

In [None]:
# Build the expression DataFrame: rows are mca.obs_names, columns are
# mca.var_names.
#
# The matrix is read from mca.X directly (not from the previous value of
# ex_matrix_mca) so this cell can be re-run after ex_matrix_mca has already
# been replaced by the DataFrame.
_mca_x = mca.X
# Sparse matrices expose .toarray(); an already-dense matrix is used as is.
_mca_dense = _mca_x.toarray() if hasattr(_mca_x, "toarray") else np.asarray(_mca_x)

# The row index must be 1-D. Previously a one-column pd.DataFrame was passed
# as `index`, which is not a valid row index; the observation names are used
# directly instead.
ex_matrix_mca = pd.DataFrame(
    data=_mca_dense,
    index=mca.obs_names,
    columns=mca.var_names,
)

## Load databases

In [None]:
# Load the transcription-factor names from the file at MM_TFS_FNAME.
# These are later passed to grnboost2 as `tf_names`.
tf_names = load_tf_names(MM_TFS_FNAME)

In [17]:
# Collect the ranking database files matching DATABASES_GLOB.
# NOTE(review): glob.glob returns matches in arbitrary, filesystem-dependent
# order, and the cell below picks databases from the resulting list by
# position, so that selection depends on this order.
db_fnames = glob.glob(DATABASES_GLOB)
def name(fname):
    """Return the database name for *fname*: its base name up to the first dot."""
    base = os.path.basename(fname)
    stem, _, _ = base.partition(".")
    return stem
# One ranking database object per file found by the glob.
dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]

# Select the two databases used for pruning by NAME rather than by position.
# glob.glob gives no ordering guarantee, so the former dbs[2] / dbs[4] could
# point at different databases on another filesystem or after files are
# added. The names below are the ones recorded in this cell's output.
_selected_db_names = (
    "mm9-500bp-upstream-7species",
    "mm9-tss-centered-10kb-7species",
)
_dbs_by_name = {name(fname): db for fname, db in zip(db_fnames, dbs)}
_missing_db_names = [n for n in _selected_db_names if n not in _dbs_by_name]
if _missing_db_names:
    # Fail early with a clear message instead of an IndexError or, worse,
    # silently pruning against the wrong databases.
    raise FileNotFoundError(
        "Ranking database(s) not found via {!r}: {}".format(
            DATABASES_GLOB, ", ".join(_missing_db_names)
        )
    )

dbs2 = [_dbs_by_name[n] for n in _selected_db_names]
dbs2

[FeatherRankingDatabase(name="mm9-500bp-upstream-7species"),
 FeatherRankingDatabase(name="mm9-tss-centered-10kb-7species")]

## Set up Dask cluster on SLURM

In [None]:
from distributed import LocalCluster, Client
from dask_jobqueue import SLURMCluster

In [None]:
# Create the dask-jobqueue SLURM cluster object.
# Despite its name, `custom_client` is a SLURMCluster, not a Client; the
# Client that connects to it is created separately.
# NOTE(review): project='xxxxx' looks like a placeholder; set the real SLURM
# project/account before running.
custom_client = SLURMCluster(project='xxxxx', cores=24, walltime='02:00:00', memory='50GB',processes=12)

In [None]:
# Scale the SLURM cluster to 120.
# NOTE(review): whether this counts workers or jobs depends on the
# dask_jobqueue version; confirm against the installed release.
custom_client.scale(120)
# Connect a dask Client to the cluster; it is passed to grnboost2 and prune2df.
client = Client(custom_client)

## Coexpression inference using GRNBoost

In [None]:
%%time
# Run GRNBoost2 on the expression DataFrame, restricted to the loaded TF
# names, using the dask client created above.
adjacencies_mca = grnboost2(ex_matrix_mca, tf_names=tf_names, verbose=True,  client_or_address=client)

# Derive modules from the adjacencies and the expression matrix; list() is
# used so the result is a list (it is passed to prune2df afterwards).
modules_mca = list(modules_from_adjacencies(adjacencies_mca, ex_matrix_mca))

## Pruning regulons using RCisTarget

In [None]:
# Prune the modules against the two selected ranking databases (dbs2) with
# the motif annotation table, running on the dask client. The result is the
# table of enriched motifs that df2regulons consumes below.
df_mca = prune2df(dbs2, modules_mca, MOTIF_ANNOTATIONS_FNAME,  client_or_address=client)

In [None]:
# Create regulons from the table of enriched motifs produced by prune2df.
# The result is pickled to REGULONS_FNAME in the next cell.
regulons_mca = df2regulons(df_mca)

In [None]:
# Save the enriched motifs and the discovered regulons to disk.
# Create the output directories first: the writes happen only after the long
# GRNBoost2 / pruning steps, so a missing folder would otherwise fail at the
# very end of the run.
for _out_path in (MOTIFS_FNAME, REGULONS_FNAME):
    os.makedirs(os.path.dirname(_out_path) or ".", exist_ok=True)

df_mca.to_csv(MOTIFS_FNAME)
with open(REGULONS_FNAME, "wb") as f:
    pickle.dump(regulons_mca, f)

In [None]:
# Disconnect the dask Client first, then shut down the SLURM cluster object
# (`custom_client` is the SLURMCluster). Previously only the cluster was
# closed and the Client was left open.
client.close()
custom_client.close()