In [1]:
# Remove all (non-hidden) objects from the current workspace so the notebook
# starts from a clean state.
rm(list=ls())
# Work from the kallisto-bus output directory. The relative "../../" paths used
# by the read.table()/read.csv() calls further down are resolved against it.
# NOTE(review): presumably copilot() also looks for the sample directory here
# (its sample.name argument is documented as a directory name) - confirm.
setwd("/scratch/AG_Ohler/CheWei/kallisto-bus/output")

In [2]:
suppressMessages(library(Matrix))
suppressMessages(library(DropletUtils))
suppressMessages(library(ggplot2))
suppressMessages(library(scales))
suppressMessages(library(rjson))
suppressMessages(library(R2HTML))
suppressMessages(library(Seurat))
suppressMessages(library(SeuratWrappers))
suppressMessages(library(tidyverse))
suppressMessages(library(DoubletFinder))
suppressMessages(library(COPILOT))

In [3]:
# Load unwanted genes: read the header-less gene list and keep its first
# column (V1) as a character vector. The result is passed to copilot() as
# `unwanted.genes`.
pp.genes <- as.character(
  read.table(
    "../../proj_sc/cbpsc/data/Root_sc/Protoplasting_DEgene_FC2_list.txt",
    # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
    header = FALSE
  )$V1
)

In [4]:
# Assemble the metadata table for sample pp1 from the sample sheet.
sample.name <- "pp1"

# Read the sample sheet; empty cells and the literal "NA" both become NA.
bscs <- read.csv(
  "../../proj_sc/cbpsc/Benfey_single_cell-Samples.csv",
  na.strings = c("", "NA")
)

# Keep only the columns that are reported, in reporting order ...
bscs <- select(
  bscs,
  c(
    'sample', 'name', 'source', 'genotype', 'transgene', 'treatment',
    'age', 'timepoint', 'rep', 'target_cells', 'date', 'seq_run'
  )
)
# ... and only the row(s) belonging to the sample of interest.
bscs <- filter(bscs, sample == sample.name)

# Insert hyphens into the date: four digits, two digits, then the remaining
# digits are joined as "\1-\2-\3". Values not matching the pattern are left as is.
bscs$date <- gsub(
  pattern = '^([0-9]{4})([0-9]{2})([0-9]+)$',
  replacement = '\\1-\\2-\\3',
  x = bscs$date
)

# Format the target cell count with a thousands separator.
bscs$target_cells <- prettyNum(bscs$target_cells, big.mark = ',')

# Transpose so that every metadata field becomes a row.
bscs <- t(bscs)

# Pair each field's display label with the value from the first matching sample.
sample.stats <- data.frame(
  stat = c(
    'Sample', 'Name', 'Source', 'Genotype', 'Transgene', 'Treatment',
    'Age', 'Timepoint', 'Rep', 'Target Cells', 'Date', 'Seq Run'
  ),
  value = bscs[, 1]
)

In [5]:
# Display the assembled sample metadata table (one row per field, with its
# display label in `stat` and its value in `value`).
sample.stats

Unnamed: 0_level_0,stat,value
Unnamed: 0_level_1,<fct>,<fct>
sample,Sample,pp1
name,Name,WT Plant Physiology 1
source,Source,"Ryu et al. 2019, Plant Physiology"
genotype,Genotype,WT Col-0
transgene,Transgene,
treatment,Treatment,Untreated
age,Age,5_day
timepoint,Timepoint,
rep,Rep,1
target_cells,Target Cells,


In [None]:
# Run copilot. Note that do.annotation only supports root of Arabidopsis
# thaliana and Oryza sativa.
#
# The two UMI cut-offs are separate arguments: min.UMI.low.quality is the
# minimum UMI count for a barcode to be considered a cell, and
# min.UMI.high.quality is the minimum for a cell to be considered high quality.
# Supplying the same argument name twice makes R abort the call, so each
# cut-off must be passed under its own name.
copilot(
  sample.name = "pp1",
  species.name = "Arabidopsis thaliana",
  transcriptome.name = "TAIR10",
  sample.stats = sample.stats,
  mt.pattern = "ATMG",
  mt.threshold = 5,
  cp.pattern = "ATCG",
  do.seurat = TRUE,
  do.annotation = TRUE,
  unwanted.genes = pp.genes,
  dir_to_bulk = "../../cellranger/Root_bulk_arabidopsis_curated.RD",
  min.UMI.low.quality = 100,
  min.UMI.high.quality = 300
)

# copilot

# Description

Single cell RNA-seq preprocessing tool for gene-by-cell matrices of UMI counts. It is recommended to use the raw spliced and unspliced counts matrices produced by the scKB pipeline as the input of copilot.

# Usage

copilot(
  sample.name,
  spliced.mtx = NULL,
  unspliced.mtx = NULL,
  total.mtx = NULL,
  filtered.mtx.output.dir = NULL,
  species.name = "Not Provided",
  transcriptome.name = "Not Provided",
  sample.stats = NULL,
  mt.pattern = NULL,
  mt.threshold = 5,
  cp.pattern = NULL,
  top.percent = 1,
  filtering.ratio = 1,
  estimate.doublet.rate = TRUE,
  doublet.rate = NULL,
  remove.doublet = TRUE,
  do.seurat = TRUE,
  do.annotation = FALSE,
  unwanted.genes = NULL,
  HVG = FALSE,
  HVGN = 200,
  dir_to_bulk = NULL,
  clustering_alg = 3,
  res = 0.5,
  min.UMI.low.quality = 100,
  min.UMI.high.quality = 300,
  legend.position = c(0.8, 0.8)
)

# Arguments

sample.name:
User defined sample name (character), which should be the same as the name of the directory that contains the spliced and unspliced matrices if you are following the scKB pipeline to produce raw counts matrices.

spliced.mtx:
Gene by cell matrix of spliced counts, which should have column and row names. Default is NULL.

unspliced.mtx:
Gene by cell matrix of unspliced counts, which should have column and row names. Default is NULL.

total.mtx:
Gene by cell matrix of total counts, which should have column and row names. Default is NULL.

filtered.mtx.output.dir:
Output directory for quality filtered matrices. Default is NULL.

species.name:
Species name (character). Default is "Not Provided".

transcriptome.name:
Name of transcriptome annotation file. (e.g. TAIR10 for Arabidopsis). Default is "Not Provided".
                                        
sample.stats:
Meta data of the sample in data.frame format. Default is NULL.
                                        
mt.pattern:
Pattern of mitochondrial gene names/ids (character; e.g. "ATMG") or list of mitochondrial genes (character vector). Default is NULL, however this argument is required to run copilot.
                                        
mt.threshold:
Threshold of mitochondrial expression percentage. A cell is treated as a dying cell if its mitochondrial expression percentage is higher than this threshold (numeric). Default is 5.
                                        
cp.pattern:
Pattern of chloroplast gene names/ids (character; e.g. "ATCG") or list of chloroplast genes (character vector). Default is NULL.
                                        
top.percent:
Percentage of cells containing a high number of UMIs that are filtered out (numeric). Default is 1.
                                        
filtering.ratio:
Metric that controls the stringency of cell filtering (lenient: 1; strict:0; moderate: 0 < filtering.ratio < 1; numeric). Default is 1.
                                        
estimate.doublet.rate:
Whether or not to estimate doublet rate according to 10X Genomics' estimation (boolean). Default is TRUE.

doublet.rate:
User specified doublet rate (numeric). Default is NULL.

remove.doublet:
Whether or not to remove doublets after quality filtering of gene and cell (boolean). Default is TRUE.

do.seurat:
Whether or not to perform normalization, PCA, UMAP and clustering using Seurat and output a Seurat object (boolean). Default is TRUE.
                                        
do.annotation:
Whether or not to do annotation (boolean). COPILOT only supports annotation on root of Arabidopsis thaliana and Oryza sativa. Default is FALSE.
                                        
unwanted.genes:
Gene IDs/names of unwanted genes (character vector, e.g. cell cycle related genes, organelle genes ... etc). Default is NULL.
                                        
HVG:
Whether or not to select highly variable genes (boolean). Default is FALSE.
                                        
HVGN:
Number of highly variable genes selected (numeric). Default is 200.
                                        
dir_to_bulk:
Directory to reference expression profile for annotation. Default is NULL.
                                        
clustering_alg:
Algorithm for clustering (1 = original Louvain algorithm; 2 = Louvain algorithm with multilevel refinement; 3 = SLM algorithm; 4 = Leiden algorithm, which requires the leidenalg Python package; numeric). Default is 3.

res:
Resolution used for clustering (numeric). Default is 0.5.

min.UMI.low.quality:
Minimum UMIs for a barcode to be considered as cell (numeric). Default is 100.

min.UMI.high.quality:
Minimum UMIs for a cell to be considered as high quality cell (numeric). Default is 300.

legend.position:
x y position of the legend on UMI histogram plot (numeric vector of length 2). Default is c(0.8,0.8).