#### Summary:

This notebook contains code for running Cicero on a 10X Multiome object. It assumes you have merged all your samples into one Seurat object (called `adata` here) and have also made a new assay from the final cell type peak calls. You can use other peak calls instead; just create the `atac.counts` object from a different assay.

Required inputs:
- Seurat object with ATAC counts, or just a sparse matrix with ATAC counts
- Per cell type bed files of accessible peaks

In [1]:
suppressMessages(library(hdf5r))
suppressMessages(library(Seurat))
suppressMessages(library(Signac))
suppressMessages(library(dplyr))
suppressMessages(library(stringr))
suppressMessages(library(tictoc))
suppressMessages(library(cicero))

In [6]:
# Fix the random number generator seed for this session
set.seed(8)

In [None]:
# Directory holding one '<celltype>.merged_peaks.anno.mergedOverlap.bed' file per cell type
# (read when the per-celltype peak list is built)
peaks_dir <- '/dir/with/per/celltype/accessible/cres/beds'

# Read in Relevant Data

In [8]:
# Read in the adata object that includes the celltype ATAC peaks
indir <- "/path/to/dir/with/seurat/object"
rds_fp <- file.path(indir,"final_object.rds")
# tic()/toc() report how long the load takes
tic()
adata <- readRDS(rds_fp)
toc()
# Echo the object so its summary is shown in the cell output
adata

694.275 sec elapsed


An object of class Seurat 
543177 features across 174819 samples within 4 assays 
Active assay: ATAC (210485 features, 210485 variable features)
 3 other assays present: RNA, SCT, ATAC_CTpeaks
 7 dimensional reductions calculated: pca, harmony.rna, umap.rna, lsi, harmony.atac, umap.atac, umap.wnn

In [9]:
# Make the cell type peak assay the default one, then pull its counts slot.
# The rest of this notebook assumes peak names are written as chr-start-end.
DefaultAssay(adata) <- 'ATAC_CTpeaks'
atac.counts <- GetAssayData(adata, slot = 'counts')
head(atac.counts)

   [[ suppressing 34 column names 'R207_AAACAGCCAAACGGGC-1', 'R207_AAACAGCCACAAAGAC-1', 'R207_AAACAGCCAGCAAGTG-1' ... ]]



6 x 174819 sparse Matrix of class "dgCMatrix"
                                                                              
chr1-100036775-100037124 . . . . . . . . . . . . . . . . . . . . . . . . . . .
chr1-100037591-100039086 . . . . . . 2 2 2 . . . . . . . . . . . . 2 . . 4 4 .
chr1-100046104-100046480 . . . . . . . . . . . . . . . . . . . . . . . . . 2 .
chr1-100046883-100047599 . . . . . . . . . . . . . . . . . . . . . . 2 . . . .
chr1-100050187-100050403 . . . . . . . . . . . . . . . . . . . . . . . . . . .
chr1-100053889-100054089 . . . . . . . . . . . . . . . . . . . . . . . . . . .
                                             
chr1-100036775-100037124 . . . . . . . ......
chr1-100037591-100039086 . 4 . . . . . ......
chr1-100046104-100046480 . . . . . . . ......
chr1-100046883-100047599 . . . . . . . ......
chr1-100050187-100050403 . . . . . . . ......
chr1-100053889-100054089 . . . . . . . ......

 .....suppressing 174785 columns in show(); maybe adjust 'options(max.print= *

In [10]:
# Build the per-celltype peak reference:
# a named list mapping each celltype to the vector of peak names (chr_start_end) called in it
celltypes <- unique(adata@meta.data[['major_celltypes']]) # celltypes you want to run Cicero on

ct_peaks_list <- list()
for (celltype in celltypes){
    # One bed file per celltype, named after the celltype, inside peaks_dir
    bed_name <- sprintf('%s.merged_peaks.anno.mergedOverlap.bed', celltype)
    bed_df <- read.table(file.path(peaks_dir, bed_name), sep='\t')
    # First three bed columns (chrom, start, end) joined with underscores
    ct_peaks_list[[celltype]] <- paste(bed_df$V1, bed_df$V2, bed_df$V3, sep="_")
}

# Functions

### Prepare inputs to make a CellDataSet Object

In [14]:
# Function to extract the ATAC counts data for a specific celltype
#
# Arguments:
#   adata       - object whose @meta.data data frame has one row per cell (row names = barcodes)
#   atac.counts - peaks x cells counts matrix; row names 'chr-start-end', column names = barcodes
#   celltype    - label to select cells by
#   ct_col      - name of the meta.data column that holds the celltype labels
#   peaks_list  - named list, celltype -> vector of 'chr_start_end' peak names
#                 (defaults to the notebook-level ct_peaks_list)
# Returns:
#   The submatrix of atac.counts restricted to the celltype's peaks (rows) and cells (columns).
#   Always two-dimensional, even when only one peak or one cell is selected.
extract_CT_ATAC_mat <- function(adata, atac.counts, celltype, ct_col, peaks_list = ct_peaks_list){
    # which() ignores cells with an NA label; a plain logical index would turn them into
    # NA barcodes and make the matrix subsetting below fail
    celltype_bcs <- row.names(adata@meta.data)[which(adata@meta.data[[ct_col]] == celltype)]

    celltype_peaks <- peaks_list[[celltype]]
    if (is.null(celltype_peaks)){
        stop(sprintf('No peak list found for cell type "%s"', celltype), call. = FALSE)
    }
    celltype_peaks <- unique(gsub('_','-',celltype_peaks)) #reformat peaks to match atac.counts, make sure no repeats

    # Report unknown peaks explicitly instead of a bare "subscript out of bounds"
    missing_peaks <- setdiff(celltype_peaks, rownames(atac.counts))
    if (length(missing_peaks) > 0){
        stop(sprintf('%d peak(s) for cell type "%s" are not rows of atac.counts (e.g. %s)',
                     length(missing_peaks), celltype, missing_peaks[1]), call. = FALSE)
    }

    # drop = FALSE keeps a matrix when a single peak or a single cell is selected
    sc.data.subset <- atac.counts[celltype_peaks, celltype_bcs, drop = FALSE]
    print(paste('Cell type specific peaks matrix dimensions: ', dim(sc.data.subset)[1], dim(sc.data.subset)[2]))
    return (sc.data.subset)
}


# Build the CellDataSet (CDS) that Cicero consumes from a celltype specific ATAC counts matrix
#
# Arguments:
#   sc.data.subset - peaks x cells counts matrix; row names 'chr-start-end', column names = barcodes
# Returns:
#   A CellDataSet restricted to peaks with num_cells_expressed > 0
prepare_cic_CDS <- function(sc.data.subset){
    # Per-cell annotation: one row per barcode, keyed by the barcode itself
    cell_meta <- data.frame(cells=colnames(sc.data.subset))
    row.names(cell_meta) <- cell_meta$cells

    # Per-peak annotation: site name plus its chromosome / start / end pieces
    peak_meta <- data.frame(site_name=rownames(sc.data.subset))
    row.names(peak_meta) <- peak_meta$site_name
    coord_parts <- stringr::str_split_fixed(peak_meta$site_name, "-", 3)
    peak_meta <- cbind(peak_meta, coord_parts)
    names(peak_meta) <- c('site_name','chr','bp1','bp2')
    # Chromosome is stored without the 'chr' text; start and end become numeric
    peak_meta$chr <- gsub('chr','', peak_meta$chr)
    for (coord_col in c('bp1','bp2')){
        peak_meta[[coord_col]] <- as.numeric(as.character(peak_meta[[coord_col]]))
    }

    # Assemble the CellDataSet; the expression family set at construction is replaced right after
    pheno_adf   <- methods::new('AnnotatedDataFrame', data = cell_meta)
    feature_adf <- methods::new('AnnotatedDataFrame', data = peak_meta)
    input_cds <- suppressWarnings(newCellDataSet(as(sc.data.subset, 'dgCMatrix'),
                                                 phenoData = pheno_adf,
                                                 featureData = feature_adf,
                                                 expressionFamily=negbinomial.size(),
                                                 lowerDetectionLimit=0))
    input_cds@expressionFamily <- binomialff()
    input_cds@expressionFamily@vfamily <- 'binomialff'

    input_cds <- detectGenes(input_cds)
    input_cds <- estimateSizeFactors(input_cds)

    # Keep only peaks detected in at least one cell
    detected <- fData(input_cds)$num_cells_expressed > 0
    input_cds <- input_cds[detected,]
    return(input_cds)
}


# Function to prepare the UMAP coordinate datastructure
#
# Arguments:
#   sc.data.subset - matrix whose column names are the cell barcodes to keep
#   object         - object holding the dimensional reductions (defaults to the notebook-level adata)
#   reduction      - name of the reduction whose embeddings are used (default 'umap.wnn')
# Returns:
#   Matrix of embedding coordinates, one row per barcode in colnames(sc.data.subset)
#   (row names = barcodes, column names removed)
prepare_UMAP_coords <- function(sc.data.subset, object = adata, reduction = 'umap.wnn'){
    # drop = FALSE keeps a matrix when only one barcode is requested; without it the result
    # collapses to a vector and the colnames<- call below fails
    umap_coords <- Embeddings(object[[reduction]])[colnames(sc.data.subset), , drop = FALSE]
    colnames(umap_coords) <- NULL
    return(umap_coords)
}

### Run Cicero

In [15]:
# Function to do everything necessary to run Cicero and output a connections object
#
# Arguments:
#   input_cds           - CellDataSet passed to make_cicero_cds
#   umap_coords         - reduced coordinates passed to make_cicero_cds
#   celltype            - label used only in the timing message
#   chromsizes          - value passed as genomic_coords to estimate_distance_parameter and
#                         generate_cicero_models; when it is a single file path it must exist
#   window              - window passed to estimate_distance_parameter and generate_cicero_models
#   k                   - k passed to make_cicero_cds
#   sample_num, maxit, distance_constraint - passed to estimate_distance_parameter
# All optional arguments default to the values this notebook has always used.
# Returns:
#   The output of assemble_connections()
run_cicero <- function(input_cds, umap_coords, celltype,
                       chromsizes = 'non-diabetic-islet-multiomics/references/hg38.chrom.sizes',
                       window = 1e6, k = 30, sample_num = 100, maxit = 100,
                       distance_constraint = 500000){
    # Fail before the expensive steps if the chromosome sizes file cannot be found
    # (the default is a relative path, so it depends on the working directory)
    if (is.character(chromsizes) && length(chromsizes) == 1 && !file.exists(chromsizes)){
        stop(sprintf('Chromosome sizes file not found: %s', chromsizes), call. = FALSE)
    }

    #run cicero with our standard settings
    tic()
    cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = umap_coords, k=k)
    distance_parameters <- estimate_distance_parameter(cicero_cds, window=window, maxit=maxit,
                                                       sample_num=sample_num,
                                                       distance_constraint=distance_constraint,
                                                       genomic_coords=chromsizes)
    # A single distance parameter (the mean of the estimates) is used for all models
    mean_distance_parameter <- mean(unlist(distance_parameters))
    cicero_out <- generate_cicero_models(cicero_cds, distance_parameter=mean_distance_parameter,
                                         window=window, genomic_coords=chromsizes)
    conns <- assemble_connections(cicero_out, silent=FALSE)
    elapsed <- toc()
    print(sprintf('Time to run cicero on %s cells: %s', celltype, elapsed$callback_msg))
    return(conns)
}

### Output Results

In [27]:
# Output all connection results to a file
#
# Arguments:
#   conns    - connections data frame with columns Peak1, Peak2 ('chr-start-end') and coaccess
#   outdir   - directory to write into (created if it does not exist)
#   celltype - label used in the output file name
# Side effect:
#   Writes <outdir>/Cicero_links.<celltype>.all.bedpe (tab separated, no header, 7 columns)
# Returns:
#   The 7-column data frame that was written (peak 1 chr/start/end, peak 2 chr/start/end, score)
output_raw_results <- function(conns, outdir, celltype){
    # Make sure the destination exists so the write cannot fail after a long Cicero run
    if (!dir.exists(outdir)){
        dir.create(outdir, recursive = TRUE, showWarnings = FALSE)
    }

    # Make bedpe format df with the peaks and coaccess scores
    out_df <- cbind(as.data.frame(str_split_fixed(conns$Peak1, '-', 3)), 
                    as.data.frame(str_split_fixed(conns$Peak2, '-', 3)), 
                    conns$coaccess)
    
    # Remove links with no score (if they exist); column 7 is the coaccess score
    out_df <- out_df[!is.na(out_df[,7]),]
    
    # Output the total df
    out_fp <- file.path(outdir,sprintf('Cicero_links.%s.all.bedpe',celltype))
    write.table(out_df, out_fp, sep='\t', col.names=FALSE, row.names=FALSE, quote=FALSE)
    return(out_df)
}


# Remove duplicate connections, threshold connections and output as filtered results
#
# Arguments:
#   out_df    - 7-column data frame: peak 1 chr/start/end, peak 2 chr/start/end, score
#   threshold - only links with score strictly greater than this are kept
#   outdir    - directory to write into (created if it does not exist)
#   celltype  - label used in the output file name
# Side effect:
#   Writes <outdir>/Cicero_links.<celltype>.above<threshold>.dedup.bedpe (tab separated, no header).
#   The file has the 7 input columns plus an 8th column holding the order-independent
#   'peakA_peakB' key used for de-duplication.
# Returns (invisibly):
#   The data frame that was written, or NULL when no link passes the threshold.
output_filt_results <- function(out_df, threshold, outdir, celltype){
    # Subset connections df to scores > threshold before removing duplicates (takes too long otherwise)
    colnames(out_df) <- paste0('V', seq_len(7))
    passes <- !is.na(out_df$V7) & out_df$V7 > threshold
    out_df_cut <- out_df[passes, , drop = FALSE]

    # Check if any links pass the threshold
    if (nrow(out_df_cut) < 1){
        print(paste('No results pass threshold ', threshold, sep=''))
        return(invisible(NULL))
    }

    # Remove duplicated links (same peaks and score, diff order).
    # Each link gets a key with the lower-start peak first, so A-B and B-A share one key.
    peak_a <- paste(out_df_cut$V1, out_df_cut$V2, out_df_cut$V3, sep='-')
    peak_b <- paste(out_df_cut$V4, out_df_cut$V5, out_df_cut$V6, sep='-')

    # Compare starts as numbers: comparing them as text would rank "9999" after "10000"
    start_a <- as.numeric(as.character(out_df_cut$V2))
    start_b <- as.numeric(as.character(out_df_cut$V5))

    # Equal starts are broken by the peak name so both orientations still agree on one key
    swap <- start_a > start_b | (start_a == start_b & peak_a > peak_b)
    # Fall back to comparing peak names where a start could not be parsed as a number
    unparsed <- is.na(swap)
    swap[unparsed] <- (peak_a > peak_b)[unparsed]

    out_df_cut$ordered_peaks <- ifelse(swap,
                                       paste(peak_b, peak_a, sep='_'),
                                       paste(peak_a, peak_b, sep='_'))
    out_df_fin <- out_df_cut[!duplicated(out_df_cut$ordered_peaks), , drop = FALSE]

    # Output the thresholded and dedup df
    if (!dir.exists(outdir)){
        dir.create(outdir, recursive = TRUE, showWarnings = FALSE)
    }
    out_fp2 <- file.path(outdir,sprintf('Cicero_links.%s.above%s.dedup.bedpe',celltype, threshold))
    write.table(out_df_fin, out_fp2, sep='\t', col.names=FALSE, row.names=FALSE, quote=FALSE)
    invisible(out_df_fin)
}

# Use Functions to Run Cicero
Note that Cicero can take quite a lot of memory to run on larger groups of cells. For example, running it on an 80k-cell cluster took about 300 GB.

In [12]:
# Final necessary inputs
ct_col <- "major_celltypes" #name of whichever column in adata@meta.data corresponds to your desired celltypes
# Directory the .bedpe result files are written to
outdir <- '/dir/to/save/outputs/to'
# Co-accessibility cutoff: only links scoring strictly above this go into the filtered output
threshold <- 0.05

In [28]:
# Trial run of the whole pipeline on a single cell type;
# add more names to celltypes_cut (or loop over celltypes) to process several

celltypes_cut <- c('immune')
for (celltype in celltypes_cut){
    tic()

    # Step 1: build the counts matrix, the CellDataSet and the reduced coordinates
    sc.data.subset <- extract_CT_ATAC_mat(adata, atac.counts, celltype, ct_col)
    input_cds <- prepare_cic_CDS(sc.data.subset)
    umap_coords <- prepare_UMAP_coords(sc.data.subset)

    # Step 2: run Cicero
    conns <- run_cicero(input_cds, umap_coords, celltype)

    # Step 3: write the full link table, then the thresholded + de-duplicated one
    out_df <- output_raw_results(conns, outdir, celltype)
    output_filt_results(out_df, threshold, outdir, celltype)

    # Report the wall-clock time for this cell type, followed by a blank separator line
    total_time <- toc()
    print(sprintf('Total time to run all Cicero functions on %s: %s', celltype, total_time$callback_msg))
    print('')
}

[1] "Cell type specific peaks matrix dimensions:  61472 633"


Overlap QC metrics:
Cells per bin: 30
Maximum shared cells bin-bin: 26
Mean shared cells bin-bin: 1.6385035729298
Median shared cells bin-bin: 0

"the condition has length > 1 and only the first element will be used"


[1] "Successful cicero models:  4877"
[1] "Other models: "

Zero or one element in range 
                        1866 
[1] "Models with errors:  0"
177.449 sec elapsed
[1] "Time to run cicero on immune cells: 177.449 sec elapsed"
400.727 sec elapsed
[1] "Total time to run all Cicero functions on immune: 400.727 sec elapsed"
[1] ""


In [None]:
# Run on the rest of the cell types
# Cell types already processed in the single-celltype test cell (celltypes_cut) are excluded
# by name. The previous positional index (celltypes[-6]) silently depended on the order in
# which unique() happened to return the labels.
# NOTE(review): this assumes position 6 was the cell type listed in celltypes_cut -- confirm.
for (celltype in setdiff(celltypes, celltypes_cut)){
    tic()
    # Prepare inputs
    sc.data.subset <- extract_CT_ATAC_mat(adata, atac.counts, celltype, ct_col)
    input_cds      <- prepare_cic_CDS(sc.data.subset)
    umap_coords    <- prepare_UMAP_coords(sc.data.subset)

    # Run Cicero
    conns <- run_cicero(input_cds, umap_coords, celltype)

    # Output results
    out_df <- output_raw_results(conns, outdir, celltype)
    output_filt_results(out_df, threshold, outdir, celltype)
    total_time <- toc()
    print(sprintf('Total time to run all Cicero functions on %s: %s',celltype,total_time$callback_msg))
    print('')
}

In [2]:
# Record the R version and attached package versions used for this run
sessionInfo()

R version 4.1.1 (2021-08-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.2 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
 [1] grid      splines   stats4    stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] cicero_1.12.0        Gviz_1.38.4          GenomicRanges_1.46.1
 [4] GenomeInfoDb_1.30.1  IRanges_2.28.0       S4Vectors_0.32.4    
 [7] monocle_2.22.0       DDRTree_0.1.5        irlba_2.3.5.1       
[10] VGAM_1.1-7           gg