# Inferring enhancer-driven Gene Regulatory Networks (eGRNs) using SCENIC+

In [1]:
# Set up Environment
import dill
import scanpy as sc
import os
import warnings
warnings.filterwarnings("ignore")
import pandas
import pyranges
# Set stderr to null to avoid strange messages from ray
import sys
_stderr = sys.stderr
null = open(os.devnull,'wb')

# set working directory
work_dir = '/g/scb/zaugg/deuner/SCENIC+/'
# set tmp directory
tmp_dir = '/g/scb/zaugg/deuner/SCENIC+/tmp/combined/'
# set the figures directory
fig_dir = '/g/scb/zaugg/deuner/SCENIC+/figures/'
# set the output data directory
out_dir = '/g/scb/zaugg/deuner/SCENIC+/outputdata/'

# Load the AnnData object containing the scRNA-seq side of the analysis
adata = sc.read_h5ad(os.path.join(tmp_dir, 'combined.nomicro.adata.h5ad'))

# Load the cisTopic object containing the scATAC-seq side of the analysis.
cistopic_obj = dill.load(open(os.path.join(tmp_dir, 'scATAC/cistopic_obj.pkl'), 'rb'))

# Load the motif enrichment dictionary containing the motif enrichment results.
menr = dill.load(open(os.path.join(tmp_dir, 'motifs/menr.pkl'), 'rb'))

In [3]:
cistopic_obj.cell_data

Unnamed: 0,Dupl_rate,cisTopic_log_nr_acc,Total_nr_frag_in_regions,Log_total_nr_frag,Total_nr_frag,cisTopic_nr_frag,Unique_nr_frag_in_regions,Log_unique_nr_frag,cisTopic_nr_acc,cisTopic_log_nr_frag,...,wsnn_res.9,wsnn_res.10,wsnn_res.12,wsnn_res.14,wsnn_res.16,wsnn_res.18,wsnn_res.20,celltype_wnn,barcode,sample_id
TCAATTGCTCCTGAGT_timecourse,0.992421,3.625621,470578,5.885086,767513,4379,3350,3.764699,4223,3.641375,...,47,49,50,47,34,147,151,hiPSC-2,TCAATTGCTCCTGAGT,timecourse
TGAACCGACTATCCTG_timecourse,0.992839,3.804957,781384,6.076014,1191281,6794,5182,3.931000,6382,3.832126,...,34,32,26,23,14,62,50,diff-neuron,TGAACCGACTATCCTG,timecourse
GGATTAGTGGCTTTAA_timecourse,0.992924,4.093352,1642543,6.460567,2887798,13338,11197,4.310375,12398,4.125091,...,55,40,8,61,50,36,21,hiPSC-1,GGATTAGTGGCTTTAA,timecourse
AAGCCTAACTAATCAG_timecourse,0.992291,4.273441,2496161,6.590419,3894208,20864,19777,4.477411,18769,4.319398,...,50,97,105,113,114,125,115,hiPSC-2,AAGCCTAACTAATCAG,timecourse
GGCGGTTACCTGGATG_timecourse,0.992954,3.723948,655078,6.012438,1029054,5564,4457,3.860398,5296,3.745387,...,8,37,22,70,48,143,156,diff-NPC,GGCGGTTACCTGGATG,timecourse
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GATGAAGGAGGCCCTT_cocultured28,0.993454,3.123525,190524,5.496168,313450,1379,1164,3.312177,1329,3.139564,...,104,108,127,132,148,157,164,neuron-3,GATGAAGGAGGCCCTT,cocultured28
CTGTATTGATCCTAAT_cocultured28,0.992931,3.394802,336729,5.666258,463722,2584,2166,3.515609,2482,3.412293,...,33,43,29,21,45,73,48,neuron-3,CTGTATTGATCCTAAT,cocultured28
CCTAAGCCTTGAGGTA_cocultured28,0.993037,3.474216,413668,5.762289,578481,3171,2706,3.605089,2980,3.501196,...,22,14,55,49,94,80,90,neuron-2,CCTAAGCCTTGAGGTA,cocultured28
TCCACAATGCGTGATT_cocultured28,0.993336,3.218273,220009,5.499695,316006,1693,1345,3.323458,1653,3.228657,...,15,61,62,57,134,129,136,neuron-3,TCCACAATGCGTGATT,cocultured28


In [2]:
# adapt barcodes of cistopic object
new_bcs = []
old_bcs = cistopic_obj.cell_names
for i in range(len(old_bcs)):
    split_bc = str.split(old_bcs[i], "_")
    new_bc = split_bc[1] + "_" + split_bc[0]
    new_bcs.append(new_bc)
    cistopic_obj.selected_model.cell_topic.columns.values[i] = new_bc

cistopic_obj.cell_names = new_bcs

In [3]:
cistopic_obj.cell_data.index = new_bcs

In [6]:
cistopic_obj.selected_model.cell_topic

Unnamed: 0,timecourse_TCAATTGCTCCTGAGT,timecourse_TGAACCGACTATCCTG,timecourse_GGATTAGTGGCTTTAA,timecourse_AAGCCTAACTAATCAG,timecourse_GGCGGTTACCTGGATG,timecourse_CGTTAAGGAGTCAATA,timecourse_GATAGCCGACCTTTGG,timecourse_CACCACACTATCCTCC,timecourse_GATTTGGACCCACATT,timecourse_CCAACCTGACCGTTGT,...,cocultured28_AGCAATACTACATAAG,cocultured28_AGGAAGCACAATTGGC,cocultured28_TGAATTGACTTCATCG,cocultured28_CCACTATACCTAGTCT,cocultured28_ATGGCTAACGCTTGCT,cocultured28_GATGAAGGAGGCCCTT,cocultured28_CTGTATTGATCCTAAT,cocultured28_CCTAAGCCTTGAGGTA,cocultured28_TCCACAATGCGTGATT,cocultured28_GTGCGAGTGCAAGCTT
Topic1,0.012872,0.448228,0.022895,0.016632,0.39263,0.026084,0.136395,0.428571,0.427873,0.222501,...,0.00913,0.034632,0.055914,0.020237,0.050934,0.094996,0.041469,0.011221,0.03993,0.019521
Topic2,0.014978,0.040112,0.035749,0.016526,0.285073,0.021246,0.023561,0.248934,0.359294,0.453219,...,0.021144,0.010462,0.012289,0.011455,0.026599,0.019579,0.019747,0.012871,0.03993,0.034635
Topic3,0.008893,0.00824,0.0241,0.029332,0.025065,0.015081,0.011223,0.025586,0.014977,0.009376,...,0.013455,0.008658,0.019969,0.0126,0.016412,0.035533,0.009084,0.009241,0.012331,0.015743
Topic4,0.62813,0.006063,0.539444,0.622403,0.034979,0.01565,0.011138,0.031983,0.008671,0.056157,...,0.024027,0.009019,0.017512,0.019855,0.019808,0.042059,0.026856,0.017822,0.018203,0.018262
Topic5,0.078399,0.091573,0.194489,0.182209,0.139731,0.139144,0.110093,0.086887,0.091597,0.164315,...,0.060067,0.040043,0.022427,0.112257,0.020374,0.108049,0.032385,0.024092,0.056371,0.032746
Topic6,0.065996,0.02472,0.11287,0.075774,0.041152,0.06279,0.036155,0.049574,0.041621,0.021748,...,0.037963,0.008658,0.003687,0.019855,0.021505,0.060914,0.025276,0.013531,0.032296,0.005668
Topic7,0.051252,0.004509,0.040167,0.036612,0.008792,0.008821,0.009853,0.015458,0.01277,0.018848,...,0.01826,0.027417,0.028571,0.009164,0.022071,0.020305,0.015403,0.010891,0.028773,0.030227
Topic8,0.121226,0.168532,0.009399,0.006908,0.066218,0.349331,0.258482,0.098081,0.030427,0.043882,...,0.676117,0.691919,0.686022,0.639939,0.441426,0.459028,0.42575,0.728053,0.410452,0.518892
Topic9,0.010765,0.165578,0.010925,0.009299,0.003928,0.01233,0.037611,0.007996,0.004572,0.004833,...,0.095147,0.123737,0.132719,0.086292,0.015846,0.038434,0.006319,0.077228,0.019378,0.013224
Topic10,0.007489,0.042444,0.009961,0.004304,0.002432,0.349521,0.36549,0.00693,0.008198,0.005123,...,0.04469,0.045455,0.020891,0.068347,0.365025,0.121102,0.397709,0.09505,0.342337,0.311083


In [7]:
adata.obs
adata.obs_names.copy(deep=True)

Index(['timecourse_AAACAGCCAGCCAGTT', 'timecourse_AAACAGCCAGGCGAGT',
       'timecourse_AAACAGCCAGTAAAGC', 'timecourse_AAACAGCCATAAGTCT',
       'timecourse_AAACAGCCATAGGCGA', 'timecourse_AAACAGCCATCACAGC',
       'timecourse_AAACATGCAACTAGCC', 'timecourse_AAACATGCAAGACTCC',
       'timecourse_AAACATGCAGCACCAT', 'timecourse_AAACATGCAGCATTAT',
       ...
       'cocultured28_TTTGTCTAGGCGCTTA', 'cocultured28_TTTGTCTAGTGCACGC',
       'cocultured28_TTTGTGAAGCTTAGTA', 'cocultured28_TTTGTGAAGGGTGAAC',
       'cocultured28_TTTGTGAAGGGTGGAT', 'cocultured28_TTTGTGAAGTCTATGA',
       'cocultured28_TTTGTGGCAGCAAGTG', 'cocultured28_TTTGTGTTCTCGCCTG',
       'cocultured28_TTTGTGTTCTTAGCCC', 'cocultured28_TTTGTGTTCTTGCAGG'],
      dtype='object', length=22367)

In [4]:
len(set(adata.obs_names)) == len(adata.obs_names)

True

In [5]:
# check if there are common barcodes
list(set(adata.obs_names.copy(deep=True)) & set(list(cistopic_obj.cell_names.copy())))


[]

In [6]:
len(set(list(cistopic_obj.cell_names.copy()))) > 0

True

In [7]:
# maybe select the atac barcodes as defaults for adata
list(set(adata.obs["barcode"]) & set(list(cistopic_obj.cell_names.copy())))

l_bcs = []

for i in range(len(adata.obs_names)):
     l_bc = adata.obs["orig.ident"][i] + "_" + adata.obs["barcode"][i]
     l_bcs.append(l_bc)
     
adata.obs["long_barcode"] = l_bcs
adata.obs_names = list(adata.obs["long_barcode"])

In [12]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.ribo,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,pANN_0.25_0.005_794,...,wsnn_res.9,wsnn_res.10,wsnn_res.12,wsnn_res.14,wsnn_res.16,wsnn_res.18,wsnn_res.20,celltype_wnn,barcode,long_barcode
timecourse_AAACCGTACCCGCTGT,timecourse,5020.0,2676,6.155378,3.705179,6037.0,2676,7,25,0.084507,...,48,44,31,27,51,35,25,diff-NPC,AAACCGTACCCGCTGT,timecourse_AAACCGTACCCGCTGT
timecourse_GTAGGTTACCCGCTGT,timecourse,4719.0,2179,0.190718,0.466200,6032.0,2178,14,117,0.239437,...,96,96,109,114,120,115,117,neuron-1,GTAGGTTACCCGCTGT,timecourse_GTAGGTTACCCGCTGT
timecourse_CTAGTAAACCCGCTGT,timecourse,3832.0,2004,0.287056,0.730689,5979.0,2008,12,50,0.183099,...,34,32,26,23,14,62,50,diff-neuron,CTAGTAAACCCGCTGT,timecourse_CTAGTAAACCCGCTGT
timecourse_TGGCATGACCCGCTGT,timecourse,7329.0,3535,1.159776,1.132487,7247.0,3535,8,151,0.154930,...,47,49,50,47,34,147,151,hiPSC-2,TGGCATGACCCGCTGT,timecourse_TGGCATGACCCGCTGT
timecourse_TTTGTGCACCCGCTGT,timecourse,4511.0,2253,7.337619,1.906451,6061.0,2253,7,51,0.140845,...,11,12,41,36,70,52,51,diff-NPC,TTTGTGCACCCGCTGT,timecourse_TTTGTGCACCCGCTGT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cocultured28_AGCAATTGACACCTAC,cocultured28,11717.0,3893,0.162158,0.187761,7918.0,3746,5,72,,...,22,14,55,49,107,105,72,neuron-2,AGCAATTGACACCTAC,cocultured28_AGCAATTGACACCTAC
cocultured28_TGATTAGACACCAAAC,cocultured28,6845.0,2998,0.613587,0.555150,6845.0,2998,14,85,,...,88,89,100,103,105,93,85,neuron-4,TGATTAGACACCAAAC,cocultured28_TGATTAGACACCAAAC
cocultured28_GTTAAAGCTTGAGAAC,cocultured28,9557.0,3435,0.292979,0.282515,7697.0,3433,6,39,,...,15,70,79,78,73,50,39,neuron-3,GTTAAAGCTTGAGAAC,cocultured28_GTTAAAGCTTGAGAAC
cocultured28_ACTAAGACTTGAGAAC,cocultured28,1732.0,1112,2.078522,0.577367,5364.0,1614,6,123,,...,33,61,29,21,45,73,123,neuron-3,ACTAAGACTTGAGAAC,cocultured28_ACTAAGACTTGAGAAC


In [8]:
len(set(adata.obs_names)) == len(adata.obs_names)

True

In [9]:
# do the same for the cistopic object
cistopic_obj.cell_data["long_barcode"] = cistopic_obj.cell_names

In [10]:

# check if there are common barcodes
len(list(set(adata.obs_names.copy(deep=True)) & set(list(cistopic_obj.cell_names.copy())))) #128
#len(list(set(adata.obs_names.copy(deep=True)))) #757
#len(set(list(cistopic_obj.cell_names.copy())))  #128


19622

In [16]:
cistopic_obj.selected_model.cell_topic.columns.values

array(['timecourse_TCAATTGCTCCTGAGT', 'timecourse_TGAACCGACTATCCTG',
       'timecourse_GGATTAGTGGCTTTAA', ...,
       'cocultured28_CCTAAGCCTTGAGGTA', 'cocultured28_TCCACAATGCGTGATT',
       'cocultured28_GTGCGAGTGCAAGCTT'], dtype=object)

In [17]:
#from pycisTopic.cistopic_class import *
#from pycisTopic.diff_features import *
#common_cells = list(set(adata.obs_names.copy(deep=True)) & set(list(cistopic_obj.cell_names.copy())))
#impute_accessibility(cistopic_obj, selected_cells=common_cells)


In [11]:
print(len(adata.obs_names.copy(deep=True)) == len(set(adata.obs_names.copy(deep=True))))
print(len(adata.obs_names.copy(deep=True).drop_duplicates(keep='first')))
print(len(set(adata.obs_names.copy(deep=True))))
print(len(list(cistopic_obj.cell_names.copy())) == len(set(list(cistopic_obj.cell_names.copy()))))
print(len(list(cistopic_obj.cell_names.copy())))
print(len(set(list(cistopic_obj.cell_names.copy()))))

True
22367
22367
True
19827
19827


## Create SCENIC+ object

In [12]:
# Create the Scenic+ object
from scenicplus.scenicplus_class import create_SCENICPLUS_object
import numpy as np
scplus_obj = create_SCENICPLUS_object(
    GEX_anndata = adata,
    cisTopic_obj = cistopic_obj,
    menr = menr,
    gene_metadata = adata.var.copy(deep=True),
    bc_transform_func = None, #lambda x: f'{x}_timecourse' #None, #function to convert scATAC-seq barcodes to scRNA-seq ones
)
scplus_obj.X_EXP = np.array(scplus_obj.X_EXP.todense())
scplus_obj

2023-05-28 10:48:23,288 cisTopic     INFO     Imputing drop-outs
2023-05-28 10:48:56,811 cisTopic     INFO     Scaling
2023-05-28 10:49:36,280 cisTopic     INFO     Keep non zero rows
2023-05-28 10:50:13,752 cisTopic     INFO     Imputed accessibility sparsity: 0.6699118462550895
2023-05-28 10:50:13,754 cisTopic     INFO     Create CistopicImputedFeatures object
2023-05-28 10:50:13,755 cisTopic     INFO     Done!


SCENIC+ object with n_cells x n_genes = 19622 x 30768 and n_cells x n_regions = 19622 x 538878
	metadata_regions:'Chromosome', 'Start', 'End', 'Width', 'cisTopic_nr_frag', 'cisTopic_log_nr_frag', 'cisTopic_nr_acc', 'cisTopic_log_nr_acc'
	metadata_genes:'features'
	metadata_cell:'GEX_orig.ident', 'GEX_nCount_RNA', 'GEX_nFeature_RNA', 'GEX_percent.mt', 'GEX_percent.ribo', 'GEX_nCount_SCT', 'GEX_nFeature_SCT', 'GEX_SCT_snn_res.0.5', 'GEX_seurat_clusters', 'GEX_pANN_0.25_0.005_794', 'GEX_DF.classifications_0.25_0.005_794', 'GEX_doubletClass', 'GEX_bc', 'GEX_nCount_ATAC', 'GEX_nFeature_ATAC', 'GEX_SCT.weight', 'GEX_ATAC.weight', 'GEX_pANN_0.25_0.005_398', 'GEX_DF.classifications_0.25_0.005_398', 'GEX_pANN_0.25_0.005_1306', 'GEX_DF.classifications_0.25_0.005_1306', 'GEX_pANN_0.25_0.005_395', 'GEX_DF.classifications_0.25_0.005_395', 'GEX_ident', 'GEX_scDblFinder.sample', 'GEX_scDblFinder.class', 'GEX_scDblFinder.score', 'GEX_scDblFinder.weighted', 'GEX_scDblFinder.cxds_score', 'GEX_res.0.5', 

In [20]:
#scplus_obj.add_gene_data(adata.var.copy(deep=True))

In [21]:
scplus_obj.gene_names

Index(['MIR1302-2HG', 'AL627309.1', 'AL627309.5', 'AL627309.4', 'AP006222.2',
       'AC114498.1', 'AL669831.2', 'LINC01409', 'FAM87B', 'LINC01128',
       ...
       'HSFX3', 'HSFX4', 'MAGEA4', 'U82671.1', 'AVPR2', 'HCFC1-AS1',
       'AC009494.2', 'AC136616.3', 'AC023491.2', 'AC007325.1'],
      dtype='object', length=30768)

In [22]:
adata.var
adata.var.copy(deep=True)
adata.var.index

Index(['MIR1302-2HG', 'AL627309.1', 'AL627309.5', 'AL627309.4', 'AP006222.2',
       'AC114498.1', 'AL669831.2', 'LINC01409', 'FAM87B', 'LINC01128',
       ...
       'HSFX3', 'HSFX4', 'MAGEA4', 'U82671.1', 'AVPR2', 'HCFC1-AS1',
       'AC009494.2', 'AC136616.3', 'AC023491.2', 'AC007325.1'],
      dtype='object', length=30768)

In [23]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.ribo,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,pANN_0.25_0.005_794,...,wsnn_res.9,wsnn_res.10,wsnn_res.12,wsnn_res.14,wsnn_res.16,wsnn_res.18,wsnn_res.20,celltype_wnn,barcode,long_barcode
timecourse_AAACCGTACCCGCTGT,timecourse,5020.0,2676,6.155378,3.705179,6037.0,2676,7,25,0.084507,...,48,44,31,27,51,35,25,diff-NPC,AAACCGTACCCGCTGT,timecourse_AAACCGTACCCGCTGT
timecourse_GTAGGTTACCCGCTGT,timecourse,4719.0,2179,0.190718,0.466200,6032.0,2178,14,117,0.239437,...,96,96,109,114,120,115,117,neuron-1,GTAGGTTACCCGCTGT,timecourse_GTAGGTTACCCGCTGT
timecourse_CTAGTAAACCCGCTGT,timecourse,3832.0,2004,0.287056,0.730689,5979.0,2008,12,50,0.183099,...,34,32,26,23,14,62,50,diff-neuron,CTAGTAAACCCGCTGT,timecourse_CTAGTAAACCCGCTGT
timecourse_TGGCATGACCCGCTGT,timecourse,7329.0,3535,1.159776,1.132487,7247.0,3535,8,151,0.154930,...,47,49,50,47,34,147,151,hiPSC-2,TGGCATGACCCGCTGT,timecourse_TGGCATGACCCGCTGT
timecourse_TTTGTGCACCCGCTGT,timecourse,4511.0,2253,7.337619,1.906451,6061.0,2253,7,51,0.140845,...,11,12,41,36,70,52,51,diff-NPC,TTTGTGCACCCGCTGT,timecourse_TTTGTGCACCCGCTGT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cocultured28_AGCAATTGACACCTAC,cocultured28,11717.0,3893,0.162158,0.187761,7918.0,3746,5,72,,...,22,14,55,49,107,105,72,neuron-2,AGCAATTGACACCTAC,cocultured28_AGCAATTGACACCTAC
cocultured28_TGATTAGACACCAAAC,cocultured28,6845.0,2998,0.613587,0.555150,6845.0,2998,14,85,,...,88,89,100,103,105,93,85,neuron-4,TGATTAGACACCAAAC,cocultured28_TGATTAGACACCAAAC
cocultured28_GTTAAAGCTTGAGAAC,cocultured28,9557.0,3435,0.292979,0.282515,7697.0,3433,6,39,,...,15,70,79,78,73,50,39,neuron-3,GTTAAAGCTTGAGAAC,cocultured28_GTTAAAGCTTGAGAAC
cocultured28_ACTAAGACTTGAGAAC,cocultured28,1732.0,1112,2.078522,0.577367,5364.0,1614,6,123,,...,33,61,29,21,45,73,123,neuron-3,ACTAAGACTTGAGAAC,cocultured28_ACTAAGACTTGAGAAC


In [24]:
cistopic_obj.cell_data

Unnamed: 0,Dupl_rate,cisTopic_log_nr_acc,Total_nr_frag_in_regions,Log_total_nr_frag,Total_nr_frag,cisTopic_nr_frag,Unique_nr_frag_in_regions,Log_unique_nr_frag,cisTopic_nr_acc,cisTopic_log_nr_frag,...,wsnn_res.10,wsnn_res.12,wsnn_res.14,wsnn_res.16,wsnn_res.18,wsnn_res.20,celltype_wnn,barcode,sample_id,long_barcode
timecourse_TCAATTGCTCCTGAGT,0.992421,3.625621,470578,5.885086,767513,4379,3350,3.764699,4223,3.641375,...,49,50,47,34,147,151,hiPSC-2,TCAATTGCTCCTGAGT,timecourse,timecourse_TCAATTGCTCCTGAGT
timecourse_TGAACCGACTATCCTG,0.992839,3.804957,781384,6.076014,1191281,6794,5182,3.931000,6382,3.832126,...,32,26,23,14,62,50,diff-neuron,TGAACCGACTATCCTG,timecourse,timecourse_TGAACCGACTATCCTG
timecourse_GGATTAGTGGCTTTAA,0.992924,4.093352,1642543,6.460567,2887798,13338,11197,4.310375,12398,4.125091,...,40,8,61,50,36,21,hiPSC-1,GGATTAGTGGCTTTAA,timecourse,timecourse_GGATTAGTGGCTTTAA
timecourse_AAGCCTAACTAATCAG,0.992291,4.273441,2496161,6.590419,3894208,20864,19777,4.477411,18769,4.319398,...,97,105,113,114,125,115,hiPSC-2,AAGCCTAACTAATCAG,timecourse,timecourse_AAGCCTAACTAATCAG
timecourse_GGCGGTTACCTGGATG,0.992954,3.723948,655078,6.012438,1029054,5564,4457,3.860398,5296,3.745387,...,37,22,70,48,143,156,diff-NPC,GGCGGTTACCTGGATG,timecourse,timecourse_GGCGGTTACCTGGATG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cocultured28_GATGAAGGAGGCCCTT,0.993454,3.123525,190524,5.496168,313450,1379,1164,3.312177,1329,3.139564,...,108,127,132,148,157,164,neuron-3,GATGAAGGAGGCCCTT,cocultured28,cocultured28_GATGAAGGAGGCCCTT
cocultured28_CTGTATTGATCCTAAT,0.992931,3.394802,336729,5.666258,463722,2584,2166,3.515609,2482,3.412293,...,43,29,21,45,73,48,neuron-3,CTGTATTGATCCTAAT,cocultured28,cocultured28_CTGTATTGATCCTAAT
cocultured28_CCTAAGCCTTGAGGTA,0.993037,3.474216,413668,5.762289,578481,3171,2706,3.605089,2980,3.501196,...,14,55,49,94,80,90,neuron-2,CCTAAGCCTTGAGGTA,cocultured28,cocultured28_CCTAAGCCTTGAGGTA
cocultured28_TCCACAATGCGTGATT,0.993336,3.218273,220009,5.499695,316006,1693,1345,3.323458,1653,3.228657,...,61,62,57,134,129,136,neuron-3,TCCACAATGCGTGATT,cocultured28,cocultured28_TCCACAATGCGTGATT


In [13]:
# Select the optimal gene names host
ensembl_version_dict = {'105': 'http://www.ensembl.org',
                        '104': 'http://may2021.archive.ensembl.org/',
                        '103': 'http://feb2021.archive.ensembl.org/',
                        '102': 'http://nov2020.archive.ensembl.org/',
                        '101': 'http://aug2020.archive.ensembl.org/',
                        '100': 'http://apr2020.archive.ensembl.org/',
                        '99': 'http://jan2020.archive.ensembl.org/',
                        '98': 'http://sep2019.archive.ensembl.org/',
                        '97': 'http://jul2019.archive.ensembl.org/',
                        '96': 'http://apr2019.archive.ensembl.org/',
                        '95': 'http://jan2019.archive.ensembl.org/',
                        '94': 'http://oct2018.archive.ensembl.org/',
                        '93': 'http://jul2018.archive.ensembl.org/',
                        '92': 'http://apr2018.archive.ensembl.org/',
                        '91': 'http://dec2017.archive.ensembl.org/',
                        '90': 'http://aug2017.archive.ensembl.org/',
                        '89': 'http://may2017.archive.ensembl.org/',
                        '88': 'http://mar2017.archive.ensembl.org/',
                        '87': 'http://dec2016.archive.ensembl.org/',
                        '86': 'http://oct2016.archive.ensembl.org/',
                        '80': 'http://may2015.archive.ensembl.org/',
                        '77': 'http://oct2014.archive.ensembl.org/',
                        '75': 'http://feb2014.archive.ensembl.org/',
                        '54': 'http://may2009.archive.ensembl.org/'}

import pybiomart as pbm
def test_ensembl_host(scplus_obj, host, species):
    dataset = pbm.Dataset(name=species+'_gene_ensembl',  host=host)
    annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
    annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
    annot['Chromosome'] = annot['Chromosome'].astype('str')
    filter = annot['Chromosome'].str.contains('CHR|GL|JH|MT')
    annot = annot[~filter]
    annot.columns=['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
    gene_names_release = set(annot['Gene'].tolist())
    #print(gene_names_release)[1:5]
    #print(scplus_obj.gene_names)[1:5]
    print(len(list(set(gene_names_release) & set(scplus_obj.gene_names))) > 0)
    ov=len([x for x in scplus_obj.gene_names if x in gene_names_release])
    print('Genes recovered: ' + str(ov) + ' out of ' + str(len(scplus_obj.gene_names)))
    return ov

n_overlap = {}
for version in ensembl_version_dict.keys():
    print(f'host: {version}')
    try:
        n_overlap[version] =  test_ensembl_host(scplus_obj, ensembl_version_dict[version], 'hsapiens')
    except:
        print('Host not reachable')
v = sorted(n_overlap.items(), key=lambda item: item[1], reverse=True)[0][0]
print(f"version: {v} has the largest overlap, use {ensembl_version_dict[v]} as biomart host")

host: 105
True
Genes recovered: 20796 out of 30768
host: 104
True
Genes recovered: 21035 out of 30768
host: 103
True
Genes recovered: 30007 out of 30768
host: 102
True
Genes recovered: 30078 out of 30768
host: 101
True
Genes recovered: 30194 out of 30768
host: 100
True
Genes recovered: 30378 out of 30768
host: 99
True
Genes recovered: 30461 out of 30768
host: 98
True
Genes recovered: 30745 out of 30768
host: 97
True
Genes recovered: 30345 out of 30768
host: 96
True
Genes recovered: 28859 out of 30768
host: 95
True
Genes recovered: 28591 out of 30768
host: 94
True
Genes recovered: 28508 out of 30768
host: 93
True
Genes recovered: 28203 out of 30768
host: 92
True
Genes recovered: 28094 out of 30768
host: 91
Host not reachable
host: 90
Host not reachable
host: 89
Host not reachable
host: 88
Host not reachable
host: 87
Host not reachable
host: 86
Host not reachable
host: 80
True
Genes recovered: 20176 out of 30768
host: 77
True
Genes recovered: 19803 out of 30768
host: 75
Host not reachabl

In [14]:
# Choose the best host
biomart_host = "http://sep2019.archive.ensembl.org/"

In [15]:
# Before running  also download a list of known human TFs from the human transcription factors database
!wget -O /g/scb/zaugg/deuner/SCENIC+/inputdata/utoronto_human_tfs_v_1.01.txt  http://humantfs.ccbr.utoronto.ca/download/v_1.01/TF_names_v_1.01.txt


--2023-05-28 10:50:58--  http://humantfs.ccbr.utoronto.ca/download/v_1.01/TF_names_v_1.01.txt
Resolving humantfs.ccbr.utoronto.ca (humantfs.ccbr.utoronto.ca)... 142.150.52.218
Connecting to humantfs.ccbr.utoronto.ca (humantfs.ccbr.utoronto.ca)|142.150.52.218|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11838 (12K) [text/plain]
Saving to: ‘/g/scb/zaugg/deuner/SCENIC+/inputdata/utoronto_human_tfs_v_1.01.txt’


2023-05-28 10:50:59 (107 KB/s) - ‘/g/scb/zaugg/deuner/SCENIC+/inputdata/utoronto_human_tfs_v_1.01.txt’ saved [11838/11838]



In [16]:
# Also download a the program bedToBigBed this will be used to generate files which can be uploaded to the UCSC genome browser
!wget -O /g/scb/zaugg/deuner/SCENIC+/inputdata/bedToBigBed http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedToBigBed
!chmod +x /g/scb/zaugg/deuner/SCENIC+/inputdata/bedToBigBed

--2023-05-28 10:51:01--  http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedToBigBed
Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163
Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9631544 (9.2M)
Saving to: ‘/g/scb/zaugg/deuner/SCENIC+/inputdata/bedToBigBed’


2023-05-28 10:51:03 (5.09 MB/s) - ‘/g/scb/zaugg/deuner/SCENIC+/inputdata/bedToBigBed’ saved [9631544/9631544]



In [29]:
#only keep the first two columns of the PCA embedding in order to be able to visualize this in SCope
#scplus_obj.dr_cell['GEX_X_pca'] = scplus_obj.dr_cell['GEX_X_pca'].iloc[:, 0:2]
#scplus_obj.dr_cell['GEX_rep'] = scplus_obj.dr_cell['GEX_rep'].iloc[:, 0:2]

In [30]:
# import ray
# ray.shutdown()
# ray.init()

In [17]:
# Run the analysis
from scenicplus.wrappers.run_scenicplus import run_scenicplus
# try:
run_scenicplus(
        scplus_obj = scplus_obj,
        variable = ['GEX_celltype'],
        species = 'hsapiens',
        assembly = 'hg38',
        tf_file = '/g/scb/zaugg/deuner/SCENIC+/inputdata/utoronto_human_tfs_v_1.01.txt',
        save_path = os.path.join(tmp_dir, 'scenicplus'),
        biomart_host = biomart_host,
        upstream = [1000, 150000],
        downstream = [1000, 150000],
        calculate_TF_eGRN_correlation = True,
        calculate_DEGs_DARs = True,
        export_to_loom_file = True,
        export_to_UCSC_file = True,
        path_bedToBigBed = '/g/scb/zaugg/deuner/SCENIC+/inputdata',
        n_cpu = 24,
        _temp_dir = None)#'/g/scb/zaugg/deuner/ray_spill')
# except Exception as e:
#     #in case of failure, still save the object
#     dill.dump(scplus_obj, open(os.path.join(out_dir, '/scplus_obj.pkl'), 'wb'), protocol=-1)
#     raise(e)

2023-05-28 10:52:02,246 SCENIC+_wrapper INFO     /g/scb/zaugg/deuner/SCENIC+/tmp/combined/scenicplus folder already exists.
2023-05-28 10:52:02,247 SCENIC+_wrapper INFO     Merging cistromes
2023-05-28 11:02:06,754 SCENIC+_wrapper INFO     Getting search space
2023-05-28 11:02:08,809 R2G          INFO     Downloading gene annotation from biomart dataset: hsapiens_gene_ensembl
2023-05-28 11:02:21,881 R2G          INFO     Downloading chromosome sizes from: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes
2023-05-28 11:02:23,324 R2G          INFO     Extending promoter annotation to 10 bp upstream and 10 downstream
2023-05-28 11:02:42,457 R2G          INFO     Extending search space to:
            						150000 bp downstream of the end of the gene.
            						150000 bp upstream of the start of the gene.
2023-05-28 11:03:11,011 R2G          INFO     Intersecting with regions.


join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.


2023-05-28 11:03:12,322 R2G          INFO     Calculating distances from region to gene
2023-05-28 11:08:23,454 R2G          INFO     Imploding multiple entries per region and gene
2023-05-28 11:17:36,574 R2G          INFO     Done!
2023-05-28 11:17:37,334 SCENIC+_wrapper INFO     Inferring region to gene relationships
2023-05-28 11:17:37,976 R2G          INFO     Calculating region to gene importances, using GBM method


2023-05-28 11:17:55,147	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
initializing: 100%|███████████████████████| 17522/17522 [38:30<00:00,  7.58it/s]
Running using 24 cores: 100%|█████████████| 17522/17522 [10:29<00:00, 27.82it/s]


2023-05-28 12:07:17,334 R2G          INFO     Took 2979.356848716736 seconds
2023-05-28 12:07:17,338 R2G          INFO     Calculating region to gene correlation, using SR method


2023-05-28 12:07:35,686	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
initializing: 100%|███████████████████████| 17522/17522 [37:20<00:00,  7.82it/s]
Running using 24 cores: 100%|████████████| 17522/17522 [01:08<00:00, 257.56it/s]


2023-05-28 12:46:23,040 R2G          INFO     Took 2345.7002811431885 seconds
2023-05-28 12:46:38,601 R2G          INFO     Done!
2023-05-28 12:46:39,030 SCENIC+_wrapper INFO     Inferring TF to gene relationships


2023-05-28 12:46:59,828	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


2023-05-28 12:47:13,358 TF2G         INFO     Calculating TF to gene correlation, using GBM method


initializing:   1%|▏                      | 213/30768 [03:24<4:42:22,  1.80it/s][2m[36m(raylet)[0m Spilled 4837 MiB, 2 objects, write throughput 474 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
[2m[36m(raylet)[0m Spilled 9675 MiB, 5 objects, write throughput 858 MiB/s.
[2m[36m(raylet)[0m Spilled 19351 MiB, 11 objects, write throughput 1327 MiB/s.
[2m[36m(raylet)[0m Spilled 24188 MiB, 14 objects, write throughput 1536 MiB/s.
initializing:   1%|▏                      | 235/30768 [03:50<2:44:39,  3.09it/s][2m[36m(raylet)[0m Spilled 53216 MiB, 33 objects, write throughput 1728 MiB/s.
[2m[33m(raylet)[0m [2023-05-28 12:51:06,022 E 1973851 1973874] (raylet) file_system_monitor.cc:105: /tmp/ray/session_2023-05-28_12-46-41_694128_1511726 is over 95% full, available space: 5491822592; capacity: 129866661888. Object creation will fail if spilling is required.
initializing:   1%|▏                      | 243/30768 [03:55<3:10:28,  2.67it/s][2m[33m(raylet)[0m [20

Local disk is full
The object cannot be created because the local object store is full and the local disk's utilization is over capacity (95% by default).Tip: Use `df` on this node to check disk usage and `ray memory` to check object store memory usage.
2023-05-28 12:52:47,860 TF2G         INFO     Took 334.49985218048096 seconds
2023-05-28 12:52:47,861 TF2G         INFO     Adding correlation coefficients to adjacencies.


UnboundLocalError: local variable 'tfs_to_genes' referenced before assignment

## Downstream Analysis

### Simplifying and filtering SCENIC+ output

In [None]:
from scenicplus.preprocessing.filtering import apply_std_filtering_to_eRegulons
apply_std_filtering_to_eRegulons(scplus_obj)

In [None]:
scplus_obj.uns['eRegulon_metadata_filtered'].head()

### eRegulon enrichment scores

In [None]:
from scenicplus.eregulon_enrichment import score_eRegulons
region_ranking = dill.load(open(os.path.join(out_dir, 'scenicplus/region_ranking.pkl'), 'rb')) #load ranking calculated using the wrapper function
gene_ranking = dill.load(open(os.path.join(out_dir, 'scenicplus/gene_ranking.pkl'), 'rb')) #load ranking calculated using the wrapper function
score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures_filtered',
                key_added = 'eRegulon_AUC_filtered',
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = 5)
score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures_filtered',
                key_added = 'eRegulon_AUC_filtered',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = 5)

### eRegulon dimensionality reduction

In [None]:
from scenicplus.dimensionality_reduction import run_eRegulons_tsne, run_eRegulons_umap
run_eRegulons_umap(
    scplus_obj = scplus_obj,
    auc_key = 'eRegulon_AUC_filtered',
    reduction_name = 'eRegulons_UMAP', #overwrite previously calculated UMAP
)
run_eRegulons_tsne(
    scplus_obj = scplus_obj,
    auc_key = 'eRegulon_AUC_filtered',
    reduction_name = 'eRegulons_tSNE', #overwrite previously calculated tSNE
)

In [None]:
# Visualize it
from scenicplus.dimensionality_reduction import plot_metadata_given_ax
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#specify color_dictionary

color_dict = {
    'neuron': "#065143",
    'hiPSC': "#70B77E",
    'microglia': "#E0A890",
    'diff.state': "#053C5E" 
}

fig, axs = plt.subplots(ncols=2, figsize = (16, 8))
plot_metadata_given_ax(
    scplus_obj=scplus_obj,
    ax = axs[0],
    reduction_name = 'eRegulons_UMAP',
    variable = 'GEX_celltype', #note the GEX_ prefix, this metadata originated from the gene expression metadata (on which we did the cell type annotation before)
    color_dictionary={'GEX_celltype': color_dict}
)
plot_metadata_given_ax(
    scplus_obj=scplus_obj,
    ax = axs[1],
    reduction_name = 'eRegulons_tSNE',
    variable = 'GEX_celltype', #note the GEX_ prefix, this metadata originated from the gene expression metadata (on which we did the cell type annotation before)
    color_dictionary={'GEX_celltype': color_dict}
)
fig.tight_layout()
sns.despine(ax = axs[0]) #remove top and right edge of axis border
sns.despine(ax = axs[1]) #remove top and right edge of axis border
plt.show()

### plot the activity / expression of an eRegulon on the dimensionality reduction

In [None]:
from scenicplus.dimensionality_reduction import plot_eRegulon
plot_eRegulon(
    scplus_obj = scplus_obj,
    reduction_name = 'eRegulons_tSNE',
    selected_regulons = ['POU4F3', 'KLF12', 'POU4F1', 'CUX2', 'ONECUT3'],
    scale = True,
    auc_key = 'eRegulon_AUC_filtered')

### dotplot-heatmap


In [None]:
# We first generate pseudobulk gene expression and region accessibility data, per celltype, to limit the amount of noise for the correlation calculation.
from scenicplus.cistromes import TF_cistrome_correlation, generate_pseudobulks

generate_pseudobulks(
        scplus_obj = scplus_obj,
        variable = 'GEX_celltype',
        auc_key = 'eRegulon_AUC_filtered',
        signature_key = 'Gene_based')
generate_pseudobulks(
        scplus_obj = scplus_obj,
        variable = 'GEX_celltype',
        auc_key = 'eRegulon_AUC_filtered',
        signature_key = 'Region_based')

TF_cistrome_correlation(
            scplus_obj,
            use_pseudobulk = True,
            variable = 'GEX_celltype',
            auc_key = 'eRegulon_AUC_filtered',
            signature_key = 'Gene_based',
            out_key = 'filtered_gene_based')
TF_cistrome_correlation(
            scplus_obj,
            use_pseudobulk = True,
            variable = 'GEX_celltype',
            auc_key = 'eRegulon_AUC_filtered',
            signature_key = 'Region_based',
            out_key = 'filtered_region_based')

In [None]:
scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based'].head()

In [None]:
# Let's visualize these correlations in a scatter plot and select eRegulons for which the correlaiton coefficient is above 0.70 or below -0.75
import numpy as np
n_targets = [int(x.split('(')[1].replace('r)', '')) for x in scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']['Cistrome']]
rho = scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']['Rho'].to_list()
adj_pval = scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']['Adjusted_p-value'].to_list()

thresholds = {
        'rho': [-0.75, 0.70],
        'n_targets': 0
}
import seaborn as sns
fig, ax = plt.subplots(figsize = (10, 5))
sc = ax.scatter(rho, n_targets, c = -np.log10(adj_pval), s = 5)
ax.set_xlabel('Correlation coefficient')
ax.set_ylabel('nr. target regions')
#ax.hlines(y = thresholds['n_targets'], xmin = min(rho), xmax = max(rho), color = 'black', ls = 'dashed', lw = 1)
ax.vlines(x = thresholds['rho'], ymin = 0, ymax = max(n_targets), color = 'black', ls = 'dashed', lw = 1)
ax.text(x = thresholds['rho'][0], y = max(n_targets), s = str(thresholds['rho'][0]))
ax.text(x = thresholds['rho'][1], y = max(n_targets), s = str(thresholds['rho'][1]))
sns.despine(ax = ax)
fig.colorbar(sc, label = '-log10(adjusted_pvalue)', ax = ax)
plt.show()

In [None]:
selected_cistromes = scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based'].loc[
        np.logical_or(
                scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']['Rho'] > thresholds['rho'][1],
                scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']['Rho'] < thresholds['rho'][0]
        )]['Cistrome'].to_list()
selected_eRegulons = [x.split('_(')[0] for x in selected_cistromes]
selected_eRegulons_gene_sig = [
        x for x in scplus_obj.uns['eRegulon_signatures_filtered']['Gene_based'].keys()
        if x.split('_(')[0] in selected_eRegulons]
selected_eRegulons_region_sig = [
        x for x in scplus_obj.uns['eRegulon_signatures_filtered']['Region_based'].keys()
        if x.split('_(')[0] in selected_eRegulons]
#save the results in the scenicplus object
scplus_obj.uns['selected_eRegulon'] = {'Gene_based': selected_eRegulons_gene_sig, 'Region_based': selected_eRegulons_region_sig}
print(f'selected: {len(selected_eRegulons_gene_sig)} eRegulons')

In [None]:
# Save these changes we have made to the scenicplus_obj
dill.dump(scplus_obj, open(os.path.join(out_dir, 'scenicplus/scplus_obj.pkl'), 'wb'), protocol=-1)

In [None]:
# Plot the heatmap-dotplot
from scenicplus.plotting.dotplot import heatmap_dotplot
heatmap_dotplot(
        scplus_obj = scplus_obj,
        size_matrix = scplus_obj.uns['eRegulon_AUC_filtered']['Region_based'], #specify what to plot as dot sizes, target region enrichment in this case
        color_matrix = scplus_obj.to_df('EXP'), #specify  what to plot as colors, TF expression in this case
        scale_size_matrix = True,
        scale_color_matrix = True,
        group_variable = 'GEX_celltype',
        subset_eRegulons = scplus_obj.uns['selected_eRegulon']['Gene_based'],
        figsize = (5, 20),
        orientation = 'vertical')

### Overlap of predicted target regions

In [None]:
# calculate the RSS for the target regions of the selected eRegulons.
from scenicplus.RSS import *
regulon_specificity_scores(
        scplus_obj,
        variable = 'GEX_celltype',
        auc_key = 'eRegulon_AUC_filtered',
        signature_keys = ['Region_based'],
        selected_regulons = [x for x in scplus_obj.uns['selected_eRegulon']['Region_based'] if '-' not in x],
        out_key_suffix = '_filtered')

In [None]:
# visualize the RSS values using a scatter plot
plot_rss(scplus_obj, 'GEX_celltype_filtered', num_columns=2, top_n=10, figsize = (5, 10))

In [None]:
# select the top 10 eRegulons per cell type
flat_list = lambda t: [item for sublist in t for item in sublist]
selected_markers = list(set(flat_list(
    [scplus_obj.uns['RSS']['GEX_celltype_filtered'].loc[celltype].sort_values(ascending = False).head(10).index.to_list()
    for celltype in scplus_obj.uns['RSS']['GEX_celltype_filtered'].index])))

In [None]:
from scenicplus.plotting.correlation_plot import *

region_intersetc_data, Z = jaccard_heatmap(
        scplus_obj,
        method = 'intersect',
        gene_or_region_based = 'Region_based',
        use_plotly = False,
        selected_regulons = selected_markers,
        signature_key = 'eRegulon_signatures_filtered',
        figsize = (10, 10), return_data = True, vmax = 0.5, cmap = 'plasma')

### Plotting a Network

In [None]:
from pycisTopic.diff_features import find_highly_variable_features
hvr = find_highly_variable_features(scplus_obj.to_df('ACC').loc[list(set(scplus_obj.uns['eRegulon_metadata_filtered']['Region']))], n_top_features=1000, plot = False)
hvg = find_highly_variable_features(scplus_obj.to_df('EXP')[list(set(scplus_obj.uns['eRegulon_metadata_filtered']['Gene']))].T, n_top_features=1000, plot = False)

In [None]:
from scenicplus.networks import create_nx_tables, create_nx_graph, plot_networkx, export_to_cytoscape
nx_tables = create_nx_tables(
    scplus_obj = scplus_obj,
    eRegulon_metadata_key ='eRegulon_metadata_filtered',
    subset_eRegulons = ['PAX5', 'EBF1', 'POU2AF1'],
    subset_regions = hvr,
    subset_genes = hvg,
    add_differential_gene_expression = True,
    add_differential_region_accessibility = True,
    differential_variable = ['GEX_celltype'])

In [None]:
G, pos, edge_tables, node_tables = create_nx_graph(nx_tables,
                   use_edge_tables = ['TF2R','R2G'],
                   color_edge_by = {'TF2R': {'variable' : 'TF', 'category_color' : {'PAX5': 'Orange', 'EBF1': 'Purple', 'POU2AF1': 'Red'}},
                                    'R2G': {'variable' : 'R2G_rho', 'continuous_color' : 'viridis', 'v_min': -1, 'v_max': 1}},
                   transparency_edge_by =  {'R2G': {'variable' : 'R2G_importance', 'min_alpha': 0.1, 'v_min': 0}},
                   width_edge_by = {'R2G': {'variable' : 'R2G_importance', 'max_size' :  1.5, 'min_size' : 1}},
                   color_node_by = {'TF': {'variable': 'TF', 'category_color' : {'PAX5': 'Orange', 'EBF1': 'Purple', 'POU2AF1': 'Red'}},
                                    'Gene': {'variable': 'GEX_celltype_Log2FC_B_cells_1', 'continuous_color' : 'bwr'},
                                    'Region': {'variable': 'GEX_celltype_Log2FC_B_cells_1', 'continuous_color' : 'viridis'}},
                   transparency_node_by =  {'Region': {'variable' : 'GEX_celltype_Log2FC_B_cells_1', 'min_alpha': 0.1},
                                    'Gene': {'variable' : 'GEX_celltype_Log2FC_B_cells_1', 'min_alpha': 0.1}},
                   size_node_by = {'TF': {'variable': 'fixed_size', 'fixed_size': 30},
                                    'Gene': {'variable': 'fixed_size', 'fixed_size': 15},
                                    'Region': {'variable': 'fixed_size', 'fixed_size': 10}},
                   shape_node_by = {'TF': {'variable': 'fixed_shape', 'fixed_shape': 'ellipse'},
                                    'Gene': {'variable': 'fixed_shape', 'fixed_shape': 'ellipse'},
                                    'Region': {'variable': 'fixed_shape', 'fixed_shape': 'diamond'}},
                   label_size_by = {'TF': {'variable': 'fixed_label_size', 'fixed_label_size': 20.0},
                                    'Gene': {'variable': 'fixed_label_size', 'fixed_label_size': 10.0},
                                    'Region': {'variable': 'fixed_label_size', 'fixed_label_size': 0.0}},
                   layout='kamada_kawai_layout',
                   scale_position_by=250)

In [None]:
plt.figure(figsize=(10,10))
plot_networkx(G, pos)

In [None]:
export_to_cytoscape(G, pos, out_file = os.path.join(out_dir, 'scenicplus/network_combined.cys'))