# SCENIC+

# 01 Database preparation

All steps up to `RNA preparation` were run on the command line; they are copied here for posterity.

The code is based on SCENIC+ tutorial; see: https://scenicplus.readthedocs.io/en/latest/tutorials.html

To create the custom cisTarget database, I created a conda environment for https://github.com/aertslab/create_cisTarget_databases

with the following packages:
```
Package         Version
--------------- -----------
flatbuffers     25.2.10
llvmlite        0.40.1
numba           0.57.1
numpy           1.24.4
pandas          2.3.1
pip             25.1.1
pyarrow         20.0.0
python-dateutil 2.9.0.post0
pytz            2025.2
setuptools      80.9.0
six             1.17.0
tzdata          2025.2
wheel           0.45.1
```

In [None]:
git clone https://github.com/aertslab/create_cisTarget_databases

## Cluster buster download

In [None]:
# Download the Cluster-Buster binary and make it executable.
# -O pins the output name: without it, a repeated download is saved as
# "cbust.1" and the chmod below would only touch the older copy.
wget -O cbust https://resources.aertslab.org/cistarget/programs/cbust
chmod a+x cbust

## Motif collection download

In [None]:
# Create the target folder (no error if it already exists) and download the
# v10nr_clust_public motif collection archive into it under a fixed file name.
mkdir -p aertslab_motif_colleciton
wget -O aertslab_motif_colleciton/v10nr_clust_public.zip https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/v10nr_clust_public.zip

In [None]:
# Extract the archive into its own folder without changing directory.
# The previous "cd dir; unzip ...; cd .." chain would unzip into the wrong
# place and then move one level up if the first cd failed.
unzip -q aertslab_motif_colleciton/v10nr_clust_public.zip -d aertslab_motif_colleciton

In [None]:
!ls aertslab_motif_colleciton/v10nr_clust_public/snapshots/


In [None]:
!ls -l aertslab_motif_colleciton/v10nr_clust_public/singletons | head

## Fasta from consensus regions

In [None]:
# Build a FASTA file for the consensus ATAC regions using the helper script
# from create_cisTarget_databases.
# BEDTools is loaded first (presumably the helper script calls bedtools --
# confirm against the create_cisTarget_databases README).
module load BEDTools/2.31.1-GCC-10.3.0

REGION_BED="/preprocessing/atac/outs/consensus_peak_calling/consensus_regions.bed"
GENOME_FASTA="/genomes/homo_sapiens/hg38_ucsc/fasta/hg38.fa"
CHROMSIZES="/genomes/homo_sapiens/hg38_ucsc/fasta/hg38.chrom.sizes"
# Not used by the command in this cell; kept so the shell session still defines it.
DATABASE_PREFIX="10x_ONT_BALL"
SCRIPT_DIR="/preprocessing/grn/create_cisTarget_databases"

# Positional arguments, in order: genome FASTA, chromosome sizes, region BED,
# output FASTA, 1000 (the "1kb" background padding named in the output file)
# and a final "yes" flag (see the script's usage message for its meaning).
# All expansions are quoted so paths containing spaces or glob characters are
# passed through as single arguments.
"${SCRIPT_DIR}/create_fasta_with_padded_bg_from_bed.sh" \
        "${GENOME_FASTA}" \
        "${CHROMSIZES}" \
        "${REGION_BED}" \
        hg38.BALL.with_1kb_bg_padding.fa \
        1000 \
        yes
        

## Create cistarget database

In [None]:
mamba activate create_cistarget_databases

In [1]:
ls aertslab_motif_colleciton/v10nr_clust_public/singletons > motifs.txt

In [None]:
# Score every motif listed in motifs.txt against the padded region FASTA and
# write the cisTarget database files under ${OUT_DIR} with the given prefix.
OUT_DIR="/preprocessing/grn/"
CBDIR="${OUT_DIR}/aertslab_motif_colleciton/v10nr_clust_public/singletons"
FASTA_FILE="${OUT_DIR}/hg38.BALL.with_1kb_bg_padding.fa"
MOTIF_LIST="${OUT_DIR}/motifs.txt"
DATABASE_PREFIX="10x_ONT_BALL"

# --bgpadding matches the 1000 bp padding used when the FASTA was created;
# -t 20 is the thread count.
# NOTE(review): no Cluster-Buster path is passed here, so this presumably
# relies on `cbust` being found on PATH -- confirm.
# Expansions are quoted so paths with spaces or glob characters stay intact.
"${OUT_DIR}/create_cisTarget_databases/create_cistarget_motif_databases.py" \
    -f "${FASTA_FILE}" \
    -M "${CBDIR}" \
    -m "${MOTIF_LIST}" \
    -o "${OUT_DIR}/${DATABASE_PREFIX}" \
    --bgpadding 1000 \
    -t 20

# 02 RNA preparation

Save the raw RNA counts in `adata.raw` and replace the RNA cell barcodes with the matching ATAC barcodes.

From this step on, everything is done in the Jupyter notebook.

In [1]:
import scanpy as sc

In [2]:
rna = sc.read_h5ad("/lustre1/project/stg_00096/home/projects/2023_Cools_B-ALL/multiome_adata_04072025.h5ad")

In [3]:
rna.layers

Layers with keys: counts, log_counts

In [4]:
rna.obs

Unnamed: 0,sample,dna_total_counts,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,pct_counts_mt,pct_counts_ribo,pct_counts_hb,...,leiden_1.25,leiden_1.5,leiden_2.0,leiden_5.0,leiden_10.0,leiden_20.0,celltypist_cell_label_coarse,celltypist_conf_score_coarse,celltypist_cell_label_fine,celltypist_conf_score_fine
AAACAGCCAGTTTGTG-D0,D0,8056.0,1891,7.545390,7641.0,8.941415,21.823060,13.584609,7.642979,0.0,...,4,4,4,1,9,37,B-cell lineage,0.978114,Pro-B cells,0.950723
AAACAGCCATGTTTGG-D0,D0,23901.0,2319,7.749322,10986.0,9.304468,21.354451,13.899508,9.111597,0.0,...,8,8,7,13,63,36,B-cell lineage,0.603815,Pro-B cells,0.370262
AAACAGCCATTTAAGC-D0,D0,31317.0,1667,7.419381,6636.0,8.800415,21.503918,13.803495,6.268836,0.0,...,4,4,4,1,26,134,B-cell lineage,0.951335,Pro-B cells,0.903238
AAACATGCATTGGGAG-D0,D0,1039.0,1599,7.377759,6888.0,8.837681,24.811266,18.626598,5.894309,0.0,...,4,4,4,10,19,81,B-cell lineage,0.121671,Pro-B cells,0.183371
AAACCAACAATAACCT-D0,D0,25718.0,1403,7.247081,5495.0,8.611775,24.913558,18.817106,5.150136,0.0,...,4,4,4,10,8,46,B-cell lineage,0.991872,Pro-B cells,0.996793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGAAGCATGAAG-Q3,Q3,13995.0,2061,7.631432,8523.0,9.050641,21.858501,14.009152,6.112871,0.0,...,5,7,18,38,30,9,B-cell lineage,0.930010,Pro-B cells,0.984357
TTTGTGAAGCGAGTAA-Q3,Q3,34490.0,1361,7.216709,5059.0,8.529121,23.739870,17.019173,8.381103,0.0,...,10,11,8,23,14,5,B-cell lineage,0.403587,Pro-B cells,0.973642
TTTGTGGCAGCCTGCA-Q3,Q3,3673.0,1536,7.337588,6272.0,8.744010,29.432398,23.852041,6.505102,0.0,...,1,0,0,16,22,58,B-cell lineage,0.484664,Pro-B cells,0.177973
TTTGTTGGTAAACAAG-Q3,Q3,9679.0,1279,7.154615,5301.0,8.575839,35.653650,28.730429,2.886248,0.0,...,9,10,11,45,50,31,Monocytes,0.745200,Non-classical monocytes,0.041574


In [None]:
import pandas as pd

# Load the ATAC and RNA barcode whitelists (one barcode per line, no header)
# and name the single column directly instead of renaming in place afterwards.
atac_CB = pd.read_csv('/barcodes/cellranger_arc_atac.737K-arc-v1.txt.gz', header=None, names=['atac_CB'])
rna_CB = pd.read_csv('/barcodes/cellranger_arc_rna.737K-arc-v1.txt.gz', header=None, names=['rna_CB'])

# The two lists are paired purely by row position below, so a length mismatch
# would silently pad one column with NaN and corrupt the RNA -> ATAC mapping.
if len(rna_CB) != len(atac_CB):
    raise ValueError(
        f"Barcode whitelists differ in length: {len(rna_CB)} RNA vs {len(atac_CB)} ATAC"
    )

# Side-by-side table: row i holds the RNA barcode and its ATAC counterpart.
df_merged = pd.concat([rna_CB, atac_CB], axis=1)

In [7]:
# Re-index the RNA cells by their ATAC barcode, formatted as
# "<atac_barcode>-<sample>___<sample>".

# Barcode without the "-<suffix>" part of the current obs index.
rna.obs['base_index'] = rna.obs.index.str.split('-').str[0]

# RNA barcode -> ATAC barcode lookup built from the paired whitelists.
mapping_dict = pd.Series(df_merged['atac_CB'].values, index=df_merged['rna_CB']).to_dict()
atac_barcodes = rna.obs['base_index'].map(mapping_dict)

# Fail loudly instead of producing "nan-<sample>___<sample>" names. This also
# catches an accidental re-run of this cell after the index has been replaced.
unmapped = atac_barcodes.isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} RNA barcodes have no ATAC counterpart, e.g. "
        f"{rna.obs.loc[unmapped, 'base_index'].head().tolist()}"
    )

# Keep only the part of the sample name before the first underscore.
rna.obs['sample'] = rna.obs['sample'].str.split('_').str[0]
rna.obs['atac_CB'] = atac_barcodes.astype(str)+'-'+rna.obs['sample'].astype(str) + '___' + rna.obs['sample'].astype(str)

# The new names become the obs index, so they must be unique.
if not rna.obs['atac_CB'].is_unique:
    raise ValueError("Remapped ATAC barcodes are not unique; cannot use them as the obs index")

rna.obs.set_index('atac_CB', inplace=True)
rna.obs

Unnamed: 0_level_0,sample,dna_total_counts,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,pct_counts_mt,pct_counts_ribo,pct_counts_hb,...,leiden_1.5,leiden_2.0,leiden_5.0,leiden_10.0,leiden_20.0,celltypist_cell_label_coarse,celltypist_conf_score_coarse,celltypist_cell_label_fine,celltypist_conf_score_fine,base_index
atac_CB,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACAGCGGGTGTTACTA-D0___D0,D0,8056.0,1891,7.545390,7641.0,8.941415,21.823060,13.584609,7.642979,0.0,...,4,4,1,9,37,B-cell lineage,0.978114,Pro-B cells,0.950723,AAACAGCCAGTTTGTG
ACAGCGGGTTTATCCT-D0___D0,D0,23901.0,2319,7.749322,10986.0,9.304468,21.354451,13.899508,9.111597,0.0,...,8,7,13,63,36,B-cell lineage,0.603815,Pro-B cells,0.370262,AAACAGCCATGTTTGG
ACAGCGGGTTCACTTT-D0___D0,D0,31317.0,1667,7.419381,6636.0,8.800415,21.503918,13.803495,6.268836,0.0,...,4,4,1,26,134,B-cell lineage,0.951335,Pro-B cells,0.903238,AAACAGCCATTTAAGC
CATTTAGGTGTCCACA-D0___D0,D0,1039.0,1599,7.377759,6888.0,8.837681,24.811266,18.626598,5.894309,0.0,...,4,4,10,19,81,B-cell lineage,0.121671,Pro-B cells,0.183371,AAACATGCATTGGGAG
CTTTATCGTCGAGGCA-D0___D0,D0,25718.0,1403,7.247081,5495.0,8.611775,24.913558,18.817106,5.150136,0.0,...,4,4,10,8,46,B-cell lineage,0.991872,Pro-B cells,0.996793,AAACCAACAATAACCT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTAGGTGTCAGTGAAC-Q3___Q3,Q3,13995.0,2061,7.631432,8523.0,9.050641,21.858501,14.009152,6.112871,0.0,...,7,18,38,30,9,B-cell lineage,0.930010,Pro-B cells,0.984357,TTTGTGAAGCATGAAG
GTAGGTGTCGGTCCAT-Q3___Q3,Q3,34490.0,1361,7.216709,5059.0,8.529121,23.739870,17.019173,8.381103,0.0,...,11,8,23,14,5,B-cell lineage,0.403587,Pro-B cells,0.973642,TTTGTGAAGCGAGTAA
GTTTGGTGTTTCTAAG-Q3___Q3,Q3,3673.0,1536,7.337588,6272.0,8.744010,29.432398,23.852041,6.505102,0.0,...,0,0,16,22,58,B-cell lineage,0.484664,Pro-B cells,0.177973,TTTGTGGCAGCCTGCA
CGTACTTCAAATATGC-Q3___Q3,Q3,9679.0,1279,7.154615,5301.0,8.575839,35.653650,28.730429,2.886248,0.0,...,10,11,45,50,31,Monocytes,0.745200,Non-classical monocytes,0.041574,TTTGTTGGTAAACAAG


In [8]:
# Overwrite X with a copy of the 'counts' layer, then snapshot the object into
# .raw so that .raw holds those counts. Order matters: X must be replaced
# before .raw is assigned.
rna.X = rna.layers['counts'].copy()
rna.raw = rna

In [None]:
rna.write_h5ad("/grn/multiome_adata_04072025.raw.BALL.h5ad")

# 03 Run scenicplus

In [10]:
! scenicplus


   ____   ____ _____ _   _ ___ ____      
  / ___| / ___| ____| \ | |_ _/ ___|[31;1m _ [0m
  \___ \| |   |  _| |  \| || | |  [31;1m _|.|_[0m
   ___) | |___| |___| |\  || | |__[31;1m|_..._|[0m
  |____/ \____|_____|_| \_|___\____|[31;1m|_|[0m 


scenicplus verions: 1.0a1
usage: scenicplus [-h] {init_snakemake,prepare_data,grn_inference} ...

Single-Cell Enhancer-driven gene regulatory Network Inference and Clustering

positional arguments:
  {init_snakemake,prepare_data,grn_inference}

options:
  -h, --help            show this help message and exit


In [12]:
!scenicplus init_snakemake --out_dir scplus_pipeline

2025-07-10 21:41:47,855 SCENIC+      INFO     Creating snakemake folder in: scplus_pipeline
[0m

In [13]:
!tree scplus_pipeline/

/bin/bash: line 1: tree: command not found


```
tree -d scplus_pipeline/
scplus_pipeline/
└── Snakemake
    ├── config
    └── workflow
```

In [14]:
!mkdir -p outs
!mkdir -p tmp

In [15]:
! ls

01_SCENIC_plus.ipynb
02_scenic_output-Copy1.ipynb
02_scenic_output.ipynb
10x_ONT_BALL.motifs_vs_regions.scores.feather
10x_ONT_BALL.regions_vs_motifs.rankings.feather
10x_ONT_BALL.regions_vs_motifs.scores.feather
aertslab_motif_colleciton
cbust
cbust.1
create_cisTarget_databases
figures
hg38.BALL.with_1kb_bg_padding.fa
misc
motif_cna_overlaps.ipynb
motifs.txt
multiome_adata_04072025.raw.BALL.h5ad
old
outs
pseudobulk_exp_BALL.ipynb
regulons
scplus_pipeline
subset
tmp
xx_visualisations.ipynb


In [16]:
# NOTE(review): `!cd` runs in a throwaway subshell, so the kernel's working
# directory is unchanged after this cell. A persistent change needs `%cd`.
!cd scplus_pipeline/Snakemake/

In [None]:
!cat scplus_pipeline/Snakemake/config/config.yaml

Adjust `config.yaml` so it points to the pycisTopic outputs and the RNA output file;
for example:

```
input_data:
  cisTopic_obj_fname: "/preprocessing/atac/outs/cistopic_obj.pkl"
  GEX_anndata_fname: "/preprocessing/grn/multiome_adata_04072025.raw.BALL.h5ad"
  region_set_folder: "/preprocessing/atac/outs/region_sets"
  ctx_db_fname: "/preprocessing/grn/10x_ONT_BALL.regions_vs_motifs.rankings.feather"
  dem_db_fname: "/preprocessing/grn/10x_ONT_BALL.regions_vs_motifs.scores.feather"
  path_to_motif_annotations: "/preprocessing/grn/aertslab_motif_colleciton/v10nr_clust_public/snapshots/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl"

output_data:
  # output for prepare_GEX_ACC .h5mu
  combined_GEX_ACC_mudata: "/preprocessing/grn/outs/ACC_GEX.h5mu"
  # output for motif enrichment results .hdf5
  dem_result_fname: "/preprocessing/grn/outs/dem_results.hdf5"
  ctx_result_fname: "/preprocessing/grn/outs/ctx_results.hdf5"
  # output html for motif enrichment results .html
  output_fname_dem_html: "/preprocessing/grn/outs/dem_results.html"
  output_fname_ctx_html: "/preprocessing/grn/outs/ctx_results.html"
  # output for prepare_menr .h5ad
  cistromes_direct: "/preprocessing/grn/outs/cistromes_direct.h5ad"
  cistromes_extended: "/preprocessing/grn/outs/cistromes_extended.h5ad"
  # output tf names .txt
  tf_names: "/preprocessing/grn/outs/tf_names.txt"
  # output for download_genome_annotations .tsv
  genome_annotation: "/preprocessing/grn/outs/genome_annotation.tsv"
  chromsizes: "/preprocessing/grn/outs/chromsizes.tsv"
  # output for search_space .tsv
  search_space: "/preprocessing/grn/outs/search_space.tsv"
  # output tf_to_gene .tsv
  tf_to_gene_adjacencies: "/preprocessing/grn/outs/tf_to_gene_adj.tsv"
  # output region_to_gene .tsv
  region_to_gene_adjacencies: "/preprocessing/grn/outs/region_to_gene_adj.tsv"
  # output eGRN .tsv
  eRegulons_direct: "/preprocessing/grn/outs/eRegulon_direct.tsv"
  eRegulons_extended: "/preprocessing/grn/outs/eRegulons_extended.tsv"
  # output AUCell .h5mu
  AUCell_direct: "/preprocessing/grn/outs/AUCell_direct.h5mu"
  AUCell_extended: "/preprocessing/grn/outs/AUCell_extended.h5mu"
  # output scplus mudata .h5mu
  scplus_mdata: "/preprocessing/grn/outs/scplusmdata.h5mu"

params_general:
  temp_dir: "/preprocessing/grn/tmp"
  n_cpu: 40
  seed: 666

params_data_preparation:
  # Params for prepare_GEX_ACC
  bc_transform_func: "\"lambda x: f'{x}'\""
  is_multiome: True
  key_to_group_by: ""
  nr_cells_per_metacells: 10
  # Params for prepare_menr
  direct_annotation: "Direct_annot"
  extended_annotation: "Orthology_annot"
  # Params for download_genome_annotations
  species: "hsapiens"
  biomart_host: "http://www.ensembl.org"
  # Params for search_space
  search_space_upstream: "1000 150000"
  search_space_downstream: "1000 150000"
  search_space_extend_tss: "10 10"

params_motif_enrichment:
  species: "homo_sapiens"
  annotation_version: "v10nr_clust"
  motif_similarity_fdr: 0.001
  orthologous_identity_threshold: 0.0
  annotations_to_use: "Direct_annot Orthology_annot"
  fraction_overlap_w_dem_database: 0.4
  dem_max_bg_regions: 500
  dem_balance_number_of_promoters: True
  dem_promoter_space: 1_000
  dem_adj_pval_thr: 0.05
  dem_log2fc_thr: 1.0
  dem_mean_fg_thr: 0.0
  dem_motif_hit_thr: 3.0
  fraction_overlap_w_ctx_database: 0.4
  ctx_auc_threshold: 0.005
  ctx_nes_threshold: 3.0
  ctx_rank_threshold: 0.05




params_inference:
  # Params for tf_to_gene
  tf_to_gene_importance_method: "GBM"
  # Params regions_to_gene
  region_to_gene_importance_method: "GBM"
  region_to_gene_correlation_method: "SR"
  # Params for eGRN inference
  order_regions_to_genes_by: "importance"
  order_TFs_to_genes_by: "importance"
  gsea_n_perm: 1000
  quantile_thresholds_region_to_gene: "0.85 0.90 0.95"
  top_n_regionTogenes_per_gene: "5 10 15"
  top_n_regionTogenes_per_region: ""
  min_regions_per_gene: 0
  rho_threshold: 0.05
  min_target_genes: 10

```

In [None]:
!pwd

In [None]:
cd /processing/grn/scplus_pipeline/Snakemake/

In [29]:
! snakemake --cores 20

[33mAssuming unrestricted shared filesystem usage for local execution.[0m
[33mBuilding DAG of jobs...[0m
[33mUsing shell: /usr/bin/bash[0m
[33mProvided cores: 20[0m
[33mRules claiming more threads will be scaled down.[0m
[33mJob stats:
job                           count
--------------------------  -------
AUCell_direct                     1
AUCell_extended                   1
all                               1
eGRN_direct                       1
eGRN_extended                     1
get_search_space                  1
motif_enrichment_cistarget        1
motif_enrichment_dem              1
prepare_menr                      1
region_to_gene                    1
scplus_mudata                     1
tf_to_gene                        1
total                            12
[0m
[33mSelect jobs to execute...[0m
[33mExecute 1 jobs...[0m
[32m[0m
[32m[Thu Jul 10 22:49:28 2025][0m
[32mlocalrule get_search_space:
    input: /staging/leuven/stg_00096/home/projects/2023_Cools_B-ALL

In [30]:
! cat .snakemake/log/2025-07-10T224927.855785.snakemake.log

Building DAG of jobs...
Using shell: /usr/bin/bash
Provided cores: 20
Rules claiming more threads will be scaled down.
Job stats:
job                           count
--------------------------  -------
AUCell_direct                     1
AUCell_extended                   1
all                               1
eGRN_direct                       1
eGRN_extended                     1
get_search_space                  1
motif_enrichment_cistarget        1
motif_enrichment_dem              1
prepare_menr                      1
region_to_gene                    1
scplus_mudata                     1
tf_to_gene                        1
total                            12

Select jobs to execute...
Execute 1 jobs...

[Thu Jul 10 22:49:28 2025]
localrule get_search_space:
    input: /staging/leuven/stg_00096/home/projects/2023_Cools_B-ALL/scenic_plus/outs/ACC_GEX.h5mu, /staging/leuven/stg_00096/home/projects/2023_Cools_B-ALL/scenic_plus/outs/genome_annotation.tsv, /staging/leuven/stg_00096/home/pr

In [31]:
pip list

Package                              Version
------------------------------------ -----------------
adjustText                           1.0.4
aiohttp                              3.9.3
aiosignal                            1.3.1
anndata                              0.10.5.post1
annoy                                1.17.3
appdirs                              1.4.4
arboreto                             0.1.6
argparse-dataclass                   2.0.0
array_api_compat                     1.5.1
asttokens                            2.4.1
attr                                 0.3.2
attrs                                23.2.0
bbknn                                1.6.0
beautifulsoup4                       4.12.3
bidict                               0.23.1
bioservices                          1.11.2
blosc2                               2.5.1
bokeh                                3.4.0
boltons                              23.1.1
bs4                                  0.0.2
cattrs                     