#### Summary:
This notebook prepares the inputs for Scrublet by exporting each sample's RNA counts as Matrix Market (MM) files, then reads in the per-sample Scrublet output files and merges them into a single file for later use.

In [12]:
suppressMessages(library(hdf5r))
suppressMessages(library(Seurat))
suppressMessages(library(Signac))
suppressMessages(library(EnsDb.Hsapiens.v86))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(Matrix))
suppressMessages(library(harmony))
suppressMessages(library(data.table))
suppressMessages(library(ggpubr))
suppressMessages(library(future))

In [13]:
suppressMessages(library(tictoc))
suppressMessages(library(plyr))
suppressMessages(library(enrichR))
suppressMessages(library(stringr))
suppressMessages(library(ggrepel))

In [22]:
### UPDATE THESE PATHS
# rds_indir: parent directory holding one sub-directory per sample
rds_indir <- "/path/to/multiome/processing/pipeline/outputs/dir"
# outdir: directory the combined doublets list is saved to
outdir <- "/dir/to/save/doublets/list/to"

# Exporting RDS Objects as MM files for Scrublet

In [4]:
# IDs of every sample to process; each one names a sub-directory of `rds_indir`
samples <- c(
    "R207", "R217", "R218", "R221", "R223",
    "R226", "R228", "R234", "R237", "R238",
    "R246", "R247", "R275", "R277", "R284",
    "R290", "R292", "R316", "R317", "R319",
    "R325", "R326", "R327", "R332", "R343",
    "R349", "R353", "R354", "R362", "R363",
    "R364"
)

In [8]:
# For every sample, load its filtered object and export the RNA counts as
# <rds_indir>/<sample>/matrix_market/{matrix.mtx, genes.tsv, barcodes.tsv}
for (sample in samples){
    print(paste(sample, Sys.time()))
    #set up necessary file structures -- this dir should be the sample's output dir from the multiome processing pipeline scripts
    sample_dir <- file.path(rds_indir, sample)
    mm_outdir <- file.path(sample_dir,'matrix_market')

    #fail early, naming the sample, instead of a generic connection error from readRDS
    rds_fp <- file.path(sample_dir,"final_filtered.rds")
    if (!file.exists(rds_fp)){
        stop("No final_filtered.rds found for sample ", sample, ": ", rds_fp, call. = FALSE)
    }

    #showWarnings = FALSE keeps re-runs quiet when the directory already exists
    dir.create(mm_outdir, showWarnings = FALSE)
    
    #read in RDS
    adata <- readRDS(file = rds_fp)
    
    #pull out RNA counts to a separate object and writeMM
    DefaultAssay(adata) <- 'RNA'
    rna.counts <- GetAssayData(adata,slot='counts')
    mm.fp <- file.path(mm_outdir,'matrix.mtx')
    writeMM(rna.counts,mm.fp)
    
    #also export the gene list (matrix row names) and barcodes list (matrix column names), one per line
    genes.fp <- file.path(mm_outdir,'genes.tsv')
    writeLines(row.names(rna.counts),genes.fp)

    barcodes.fp <- file.path(mm_outdir,'barcodes.tsv')
    writeLines(colnames(rna.counts),barcodes.fp)
}

[1] "R207 2022-08-25 11:59:34"
[1] "R217 2022-08-25 12:02:55"
[1] "R218 2022-08-25 12:06:10"
[1] "R221 2022-08-25 12:08:01"
[1] "R223 2022-08-25 12:09:46"
[1] "R226 2022-08-25 12:13:02"
[1] "R228 2022-08-25 12:16:00"
[1] "R234 2022-08-25 12:18:36"
[1] "R237 2022-08-25 12:21:33"
[1] "R238 2022-08-25 12:24:23"
[1] "R246 2022-08-25 12:27:13"
[1] "R247 2022-08-25 12:28:09"
[1] "R275 2022-08-25 12:30:32"
[1] "R277 2022-08-25 12:32:15"
[1] "R284 2022-08-25 12:34:51"
[1] "R290 2022-08-25 12:37:05"
[1] "R292 2022-08-25 12:39:55"
[1] "R316 2022-08-25 12:42:40"
[1] "R317 2022-08-25 12:46:24"
[1] "R319 2022-08-25 12:50:43"
[1] "R325 2022-08-25 12:54:43"
[1] "R326 2022-08-25 12:55:32"
[1] "R327 2022-08-25 12:59:36"
[1] "R332 2022-08-25 13:01:56"
[1] "R343 2022-08-25 13:04:40"
[1] "R349 2022-08-25 13:06:15"
[1] "R353 2022-08-25 13:07:39"
[1] "R354 2022-08-25 13:09:55"
[1] "R362 2022-08-25 13:12:06"
[1] "R363 2022-08-25 13:15:12"
[1] "R364 2022-08-25 13:18:28"


# Read in Scrublet outputs and merge into one file

In [3]:
# Samples whose Scrublet threshold was set by hand to 0.25
redo_samples <- c(
    "R218", "R221", "R238", "R275", "R277",
    "R317", "R319", "R343", "R349", "R364"
)

In [6]:
# Gather every sample's Scrublet doublet predictions into one table and write
# it to the top-level `outdir`.
#
# Reads : <rds_indir>/<sample>/matrix_market/scrublet_predicted_doublets_cutoff<cutoff>.txt
# Writes: <outdir>/indiv_samples_combined.scrublet_predicted_doublets.txt

#file name pattern of a Scrublet output; the capture group is the cutoff value
scrub_file_regex <- '^scrublet_predicted_doublets_cutoff(.+)\\.txt$'

#collect per-sample tables in a list and bind once at the end (avoids re-copying on every rbind)
scrub_dfs <- vector('list', length(samples))

for (i in seq_along(samples)){
    sample <- samples[[i]]
    print(paste(sample, Sys.time()))

    #per-sample Scrublet directory -- deliberately NOT stored in `outdir`, which
    #is the user-configured destination for the combined file written below
    scrub_dir <- file.path(rds_indir, sample, 'matrix_market')

    #get the cutoff number
    if (sample %in% redo_samples){
        #manually chosen threshold for these samples
        cutoff <- '0.25'
    } else {
        #otherwise recover the cutoff from the name of the single Scrublet output file
        files <- list.files(scrub_dir, pattern = scrub_file_regex)
        if (length(files) != 1){
            stop(sprintf('Expected exactly one Scrublet output file in %s, found %d',
                         scrub_dir, length(files)), call. = FALSE)
        }
        cutoff <- sub(scrub_file_regex, '\\1', files)
    }
    print(cutoff)

    #read in scrublet file
    scrub_fp <- file.path(scrub_dir, sprintf('scrublet_predicted_doublets_cutoff%s.txt', cutoff))
    scrub_df <- read.table(scrub_fp, sep='\t', header=FALSE)

    #add on sample BC prefix so barcodes stay unique across samples
    scrub_df$V1 <- paste0(sample, '_', scrub_df$V1)

    scrub_dfs[[i]] <- scrub_df
}

#combine all samples into the overall dataframe
fin_scrub_df <- do.call(rbind, scrub_dfs)

#only the first three columns are exported
out_fp <- file.path(outdir,'indiv_samples_combined.scrublet_predicted_doublets.txt')
write.table(fin_scrub_df[,c(1,2,3)],out_fp, sep='\t', row.names=FALSE, col.names=FALSE, quote=FALSE)

[1] "R207"             "1662063138.78032"
[1] "0.2557159026533267"
[1] "R217"             "1662063138.81173"
[1] "0.17592480680131617"
[1] "R218"             "1662063138.86208"
[1] "0.25"
[1] "R221"             "1662063138.88525"
[1] "0.25"
[1] "R223"             "1662063138.91489"
[1] "0.20057109796338418"
[1] "R226"             "1662063138.95066"
[1] "0.21798500551758931"
[1] "R228"             "1662063138.98634"
[1] "0.18229496603578663"
[1] "R234"             "1662063139.05762"
[1] "0.20616746937851066"
[1] "R237"             "1662063139.10211"
[1] "0.21591387236966297"
[1] "R238"             "1662063139.15696"
[1] "0.25"
[1] "R246"             "1662063139.19769"
[1] "0.21450430034488194"
[1] "R247"             "1662063139.24848"
[1] "0.206563350297425"
[1] "R275"             "1662063139.33675"
[1] "0.25"
[1] "R277"            "1662063139.3739"
[1] "0.25"
[1] "R284"             "1662063139.41574"
[1] "0.17801217813100262"
[1] "R290"             "1662063139.46352"
[1] "0.17703093285