/
Best Practices in RNA-seq.Rmd
417 lines (347 loc) · 19.9 KB
/
Best Practices in RNA-seq.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
---
title: "Seurat Version 3 Walkthrough"
author: "Single Cell Users Group"
date: "12/18/2019"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Background on Seurat and Alternatives
Seurat was initially developed by Rahul Satija. The package is currently developed and maintained by the Satija lab. The latest sstable release is version 3.1.1; a pre-release beta version of 3.2 is available, which enables the analysis of spatial datasets. Ancillary packages to Seurat (also developed by the Satija lab) include sctransform for normalizing data and SeuratData, a collection of datasets.
Although this is a walkthrough of Seurat, we do not take a firm position on its merits relative to the other freely available analysis packages. Alternative integrated analysis packages include scanpy (written in python) and scater (written in R).
This R markdown starts from count matrices. Pre-processing steps were surveyed in some lectures we gave earlier this year and are nicely covered by Lun et al, F1000Res5 (2016):2122.
```{r Preliminaries, echo = FALSE, warning = FALSE}
# Basic definitions - ONLY CHANGE THESE
base_dir <- "/Users/cordessf/projects/SCUG"
# Derived definitions
# ... cache directory
cache_dir <- file.path(base_dir, "cache")
if(!dir.exists(cache_dir)) {
dir.create(cache_dir)
}
# ... data directory
data_dir <- file.path(base_dir, "data")
# ... reference directory
ref_dir <- file.path(base_dir, "ref")
# ... results directory
results_dir <- file.path(base_dir, "results")
if(!dir.exists(results_dir)) {
dir.create(results_dir)
}
# Load required packages
suppressPackageStartupMessages(require(cowplot))
suppressPackageStartupMessages(require(ggplot2))
suppressPackageStartupMessages(require(sctransform))
suppressPackageStartupMessages(require(Seurat))
suppressPackageStartupMessages(require(SeuratData))
# Set options to permit aalysis of larger datasets
options(future.globals.maxSize= 2147483648)
```
## Loading the data
We will be working with three different datasets that are freely available from 10X genomics: 3'-end and 5'-end sequenced expression data with cell surface protein quantification. For each dataset, we label the cells sequenced by the method with which they were sequenced (either '3-prime expression' or '5-prime expression'). We also pre-compute the fraction of mitochondrial reads for each cell.
Note: the joint gene expression levels combined with the abundance of cell surface proteins is an example of multi-modal single cell data.
```{r Load_Data, echo = FALSE, warning = FALSE}
# Load data
# ... 3' expression data wtih CITE-seq
pbmnc_3_prime_cite_seq <- Seurat::Read10X_h5(filename = file.path(data_dir, "5k_pbmnc_3_cite_seq", "5k_pbmc_protein_v3_nextgem_filtered_feature_bc_matrix.h5"))
# ... ... separate out gene expression data and create the Seurat Object
pbmnc_3_expression_data <- pbmnc_3_prime_cite_seq[["Gene Expression"]]
pbmnc_3_so <- Seurat::CreateSeuratObject(counts = pbmnc_3_expression_data)
# ... ... label as '3-prime expression'
pbmnc_3_so <- Seurat::AddMetaData(object = pbmnc_3_so,
metadata = rep('3-prime expression', ncol(pbmnc_3_so)),
col.name = 'Method')
# ... ... compute fraction of mitochondrial reads for each cell
pbmnc_3_so <- Seurat::PercentageFeatureSet(object = pbmnc_3_so, pattern = '^MT-', col.name = "percent.mt")
# ... ... separate out CITE-seq data and create Seurat::AssayObject for protein data
pbmnc_3_cite_seq_data <- pbmnc_3_prime_cite_seq[["Antibody Capture"]]
pbmnc_3_so[["ADT"]] <- Seurat::CreateAssayObject(counts = pbmnc_3_cite_seq_data)
pbmnc_3_so <- Seurat::NormalizeData(object = pbmnc_3_so,
assay = "ADT",
normalization.method = "CLR")
# ... 5' expression data with TCR-seq and CITE-seq
pbmnc_5_prime_vdj_cite_seq <- Seurat::Read10X_h5(filename = file.path(data_dir, "5k_pbmnc_5_vdj_cite_seq", "vdj_nextgem_hs_pbmc3_filtered_feature_bc_matrix.h5"))
# ... ... separate out gene expression data and create the Seurat Object
pbmnc_5_data <- pbmnc_5_prime_vdj_cite_seq[["Gene Expression"]]
pbmnc_5_so <- Seurat::CreateSeuratObject(counts = pbmnc_5_data)
# ... ... compute fraction of mitochondrial reads for each cell
pbmnc_5_so <- Seurat::PercentageFeatureSet(object = pbmnc_5_so, pattern = '^MT-', col.name = "percent.mt")
# ... ... label as '5-prime expression'
pbmnc_5_so <- Seurat::AddMetaData(object = pbmnc_5_so,
metadata = rep('5-prime expression', ncol(pbmnc_5_so)),
col.name = 'Method')
# ... ... separate out CITE-seq data and create Seurat::AssayObject for protein data
pbmnc_5_cite_seq_data <- pbmnc_5_prime_vdj_cite_seq[["Antibody Capture"]]
pbmnc_5_so[["ADT"]] <- Seurat::CreateAssayObject(counts = pbmnc_5_cite_seq_data)
pbmnc_5_so <- Seurat::NormalizeData(object = pbmnc_5_so,
assay = "ADT",
normalization.method = "CLR")
# Create a list of Seurat objects
# Create Seurat object list
pbmnc_sol <- list(pbmnc_3_so, pbmnc_5_so)
names(pbmnc_sol) <- c("3-prime", "5-prime")
# Cache the Seurat object list
save(pbmnc_sol, file = file.path(cache_dir, "PBMNC_SOL.RDS"))
```
## Merge and normalize data
To begin with we'll simply merge the two raw datasets. We'll compare the resultant merged dataset to methods that mitigate technical differences in the datasets a bit later.
```{r Merge, echo = FALSE, warning = FALSE}
for(n in names(pbmnc_sol)) {
cat("Processing library: ", n, " ... ")
pbmnc_sol[[n]] <- Seurat::RenameCells(pbmnc_sol[[n]], add.cell.id = n)
if(n == names(pbmnc_sol[1])) {
# First Seurat object is simply assigned to the merged list
pbmnc_merged_so <- pbmnc_sol[[1]]
} else {
# Subsequent Seurat objects are added to the merged Seurat object
pbmnc_merged_so <- merge(x = pbmnc_merged_so,
y = pbmnc_sol[[n]],
merge.data = TRUE,
project = '3-prime and 5-prime expression data')
}
cat("Done.\n")
}
# Cache the merged Seurat object
save(pbmnc_merged_so, file = file.path(cache_dir, "MERGED_PBMNC.RDS"))
```
## Normalization and Variance Stabilization via Regularized Negative Binomial Regression - scTransform
Single cell RNA-seq data exhibits significant cell-to-cell variation due to technical factors. Observed sequencing depth (number of genes or molecules detected per cell) can vary significantly with variations of an order of magnitude even for the same cell type. Causes include variability in cell lysis, reverse transcription efficiency and stochastic molecular sampling during sequencing.
```{r, echo = FALSE, warning = FALSE}
# Normalize via SCTransform
pbmnc_merged_so <- Seurat::SCTransform(object = pbmnc_merged_so,
assay = "RNA",
new.assay.name = "SCT",
do.correct.umi = TRUE,
vars.to.regress = "percent.mt",
verbose = FALSE)
# ... Standard visulaization
pbmnc_merged_so <- Seurat::RunPCA(object = pbmnc_merged_so, verbose = FALSE)
pbmnc_merged_so <- Seurat::RunUMAP(object = pbmnc_merged_so, dims = 1:30, verbose = FALSE)
pbmnc_merged_so <- Seurat::FindNeighbors(object = pbmnc_merged_so, dims = 1:30, verbose = FALSE)
pbmnc_merged_so <- Seurat::FindClusters(object = pbmnc_merged_so, verbose = FALSE)
# ... Plot by cluster
p_merged_cluster <- Seurat::DimPlot(pbmnc_merged_so, label = TRUE) + NoLegend()
plot(p_merged_cluster)
# ... Plot by sequencing method
p_merged_method <- Seurat::DimPlot(pbmnc_merged_so, group.by = c('Method'), label = TRUE) + NoLegend()
plot(p_merged_method)
# Cache merged Seurat object
save(pbmnc_merged_so, file = file.path(cache_dir, 'PBMNC_MERGED_SO.RDS'))
```
## Load reference data
During the course of analysis one often wants to compare one's data with previously analyzed datasets. Seurat v. 3 permits comparison between the cells in a query dataset with those of a reference dataset with transfer of annotations from the reference to the query dataset.
```{r Load_reference, echo = FALSE, warning = FALSE}
# Load a some reference datasets - these are datasets from 8 different technologies
if(!exists("pbmcsca")) {
SeuratData::InstallData("pbmcsca")
data("pbmcsca")
}
# Separate the data into a list of Seurat objects by method that the data was obtained
reference_sol <- Seurat::SplitObject(object = pbmcsca, split.by = "Method")
# Pare down the list to "Smart-seq2", "10x Chromium (v2)", "10x Chromium (v2) A" and "10x Chromium (v2) B", which are distict from our datasets (which we shall henceforth refer to as the query datasets)
reference_dataset_names <- c("Smart-seq2", "10x Chromium (v2)", "10x Chromium (v2) A", "10x Chromium (v2) B")
reference_sol <- reference_sol[reference_dataset_names]
# Separately perform scTransform for each dataset
for(n in names(reference_sol)) {
cat("Performing sctransform on data set", n, "...")
reference_sol[[n]] <- Seurat::SCTransform(object = reference_sol[[n]], verbose = FALSE)
cat("Done.\n")
}
# Integrate the reference data
reference_features <- Seurat::SelectIntegrationFeatures(object.list = reference_sol,
nfeatures = 3000)
reference_sol <- Seurat::PrepSCTIntegration(object.list = reference_sol,
anchor.features = reference_features)
# Integrate the reference data list into a single reference data set
reference_anchors <- Seurat::FindIntegrationAnchors(object.list = reference_sol,
normalization.method = 'SCT',
anchor.features = reference_features,
verbose = FALSE)
reference_integrated_so <- Seurat::IntegrateData(anchorset = reference_anchors,
normalization.method = 'SCT',
verbose = FALSE)
# Cache integrated reference data
save(reference_integrated_so, file = file.path(cache_dir, 'REFERENCE_INTEGRATED_SO.RDS'))
# Compute transfer anchors
for(n in names(pbmnc_sol)) {
cat("Transferring labels to", n, "...\n")
query_anchors <- Seurat::FindTransferAnchors(reference = reference_integrated_so,
query = pbmnc_sol[[n]],
reference.assay = 'integrated',
query.assay = 'RNA')
cell_types <- Seurat::TransferData(anchorset = query_anchors, refdata = reference_integrated_so$CellType, dims = 1:30)
pbmnc_sol[[n]] <- Seurat::AddMetaData(object = pbmnc_sol[[n]], metadata = cell_types$predicted.id, col.name = "CellType")
cat("Done.\n")
}
save(pbmnc_sol, file = file.path(cache_dir, 'PBMNC_SOL_LABELS_TRANSFERRED.RDS'))
# N.B. Should filter out low prediction scores - Have not done this here
```
## Dataset Integration
In this step we integrate the two expression datasets.
```{r Merged_And_Integrated, echo = FALSE, warning = FALSE}
# Separately perform scTransform for each dataset
for(n in names(pbmnc_sol)) {
cat("Performing sctransform on data set", n, "...")
pbmnc_sol[[n]] <- Seurat::SCTransform(object = pbmnc_sol[[n]], verbose = FALSE)
cat("Done.\n")
}
# Select features for integration
pbmnc_features <- Seurat::SelectIntegrationFeatures(object.list = pbmnc_sol, nfeatures = 3000)
pbmnc_sol <- PrepSCTIntegration(object.list = pbmnc_sol,
anchor.features = pbmnc_features,
verbose = FALSE)
# ... Integration
pbmnc_integration_anchors <- Seurat::FindIntegrationAnchors(object.list = pbmnc_sol,
normalization.method = 'SCT',
anchor.features = pbmnc_features,
verbose = FALSE)
pbmnc_integrated_so <- Seurat::IntegrateData(anchorset = pbmnc_integration_anchors,
normalization.method = 'SCT',
verbose = FALSE)
# Cache intermediate results
save(pbmnc_integrated_so, file = file.path(cache_dir, 'PBMNC_INTEGRATED_SO.RDS'))
```
## Visualization of merged and integrated datasets
```{r Visualization, echo = FALSE, warning = FALSE}
# ... merged data
pbmnc_merged_so <- Seurat::RunPCA(object = pbmnc_merged_so, verbose = FALSE)
pbmnc_merged_so <- Seurat::RunUMAP(object = pbmnc_merged_so, dims = 1:30)
pbmnc_merged_so <- Seurat::FindNeighbors(object = pbmnc_merged_so, dims = 1:30, verbose = FALSE)
pbmnc_merged_so <- Seurat::FindClusters(object = pbmnc_merged_so, verbose = FALSE)
# ... ... plot by sequencing method
merged_plots <- Seurat::DimPlot(object = pbmnc_merged_so,
group.by = c("Method"),
label = TRUE) + NoLegend()
plot(merged_plots)
# ... integrated data
pbmnc_integrated_so <- Seurat::RunPCA(object = pbmnc_integrated_so, verbose = FALSE)
pbmnc_integrated_so <- Seurat::RunUMAP(object = pbmnc_integrated_so, dims = 1:30)
pbmnc_integrated_so <- Seurat::FindNeighbors(object = pbmnc_integrated_so, dims = 1:30, verbose = FALSE)
pbmnc_integrated_so <- Seurat::FindClusters(object = pbmnc_integrated_so, verbose = FALSE)
# ,,, .... plot by sequencing methof
integrated_plots_method <- Seurat::DimPlot(object = pbmnc_integrated_so,
group.by = c("Method"),
label = TRUE) + NoLegend()
plot(integrated_plots_method)
# ... ... plot by cluster
integrated_plots_cluster <- Seurat::DimPlot(object = pbmnc_integrated_so,
label = TRUE) + NoLegend()
plot(integrated_plots_cluster)
# ... ... plot by cell type
integrated_plots_celltype <- Seurat::DimPlot(object = pbmnc_integrated_so,
group.by = c("CellType"),
label = TRUE) + NoLegend()
plot(integrated_plots_celltype)
```
## Multimodal Analysis - Viualization of the CITE-seq data
```{r, echo = FALSE, warning = FALSE}
features <- paste(c('CD3',
# 'CD4',
'CD8a',
'CD16',
'CD45RA',
'CD45RO',
# 'CD56',
'CD62L'), 'TotalSeqB', sep = '-')
for(feature in features) {
# ... Feature Plots
p_features <- Seurat::FeaturePlot(object = pbmnc_integrated_so,
features = feature, label = FALSE) + NoLegend()
plot(p_features)
# ... Ridge Plots
p_ridge <- Seurat::RidgePlot(object = pbmnc_integrated_so,
features = feature,
group.by = 'CellType',
legend = 'none') + NoLegend()
plot(p_ridge)
}
```
## scATAC-seq and scRNA-seq integration
```{r, echo = FALSE, warning = FALSE}
# Read in ATAC-seq data
peaks <- Seurat::Read10X_h5(file.path(data_dir, "5k_pbmnc_atac_seq","atac_pbmc_5k_nextgem_filtered_peak_bc_matrix.h5"))
activity_matrix <- Seurat::CreateGeneActivityMatrix(peak.matrix = peaks,
annotation.file = file.path(ref_dir, "Homo_sapiens.GRCh38.98.gtf.gz"),
seq.levels = c(1:22, "X", "Y"),
upstream = 2000, verbose = FALSE)
# Convert to Seurat object
pbmnc_atac <- Seurat::CreateSeuratObject(counts = peaks, assay = "ATAC", project = "10X_ATAC")
pbmnc_atac[["ACTIVITY"]] <- Seurat::CreateAssayObject(counts = activity_matrix)
# Read in metadata and add to the Seurat object
meta_data <- read.table(file = file.path(data_dir, "5k_pbmnc_atac_seq", "atac_pbmc_5k_nextgem_peak_annotation.tsv"),
sep = "\t", header = TRUE, row.names = 1, stringsAsFactors = FALSE)
meta_data <- meta_data[colnames(pbmnc_atac), ]
pbmnc_atac <- Seurat::AddMetaData(pbmnc_atac, metadata = meta_data)
# Subset the data
pbmnc_atac <- subset(pbmnc_atac, subset = nCount_ATAC > 5000)
pbmnc_atac$Method <- 'ATAC-seq'
# Preprocessing
Seurat::DefaultAssay(pbmnc_atac) <- 'ACTIVITY'
pbmnc_atac <- Seurat::FindVariableFeatures(pbmnc_atac)
pbmnc_atac <- Seurat::NormalizeData(pbmnc_atac)
pbmnc_atac <- Seurat::ScaleData(pbmnc_atac)
#
Seurat::DefaultAssay(pbmnc_atac) <- 'ATAC'
Seurat::VariableFeatures(pbmnc_atac) <- names(which(Matrix::rowSums(pbmnc_atac) > 100))
pbmnc_atac <- Seurat::RunLSI(pbmnc_atac, n = 50, scale.max = NULL)
pbmnc_atac <- Seurat::RunUMAP(pbmnc_atac, reduction = 'lsi', dims = 1:50)
# Cache results
save(pbmnc_atac, file = file.path(cache_dir, "PBMNC_ATAC.RDS"))
# Visualization
p1 <- Seurat::DimPlot(pbmnc_atac, reduction = 'umap') + NoLegend() + ggtitle('scATAC-seq')
plot(p1)
p2 <- Seurat::DimPlot(pbmnc_integrated_so, group.by = 'CellType', label = TRUE, repel = TRUE) + NoLegend() + ggtitle('scRNA-seq')
plot(p2)
```
## Transfer labels to the ATAC-seq data
We have labeled expression data with cell types and would now like to transfer to the cell type labels to the ATAC-seq data
```{r, integrate_atac, echo = FALSE, warning = FALSE}
# Compute transfer anchors
expression_features <- Seurat::VariableFeatures(object = pbmnc_integrated_so)
transfer_anchors <- Seurat::FindTransferAnchors(reference = pbmnc_integrated_so,
reference.assay = 'integrated',
query = pbmnc_atac,
query.assay = 'ACTIVITY',
features = expression_features,
reduction = 'cca')
# Transfer data
cell_types <- Seurat::TransferData(anchorset = transfer_anchors,
refdata = pbmnc_integrated_so$CellType,
weight.reduction = pbmnc_atac[['lsi']])
pbmnc_atac <- Seurat::AddMetaData(pbmnc_atac, metadata = cell_types$predicted.id, col.name = "CellType")
# Should filter out low prediction scores
# Cache
save(pbmnc_atac, file = file.path(cache_dir, "PBMNC_ATAC_ANNOTATED.RDS"))
```
## Co-embed scRNA-seq and scRNA-seq cells in same low dimensional space
Expression values for the scATAC-seq data will be imputed from those of of the integrated scRNA-seq data
```{r, Co_embed, echo = FALSE, warning = FALSE}
# Determine variable features and subset the expression data to those
genes.use <- Seurat::VariableFeatures(object = pbmnc_integrated_so)
refdata <- Seurat::GetAssayData(pbmnc_integrated_so, assay = 'integrated', slot = 'data')[genes.use, ]
# Impute expression levels for the scATAC-seq dataset
imputation <- Seurat::TransferData(anchorset = transfer_anchors,
refdata = refdata,
weight.reduction = pbmnc_atac[['lsi']])
pbmnc_atac[['integrated']] <- imputation
# Coembed actual and imputed RNA-seq expression data
pbmnc_coembedded <- merge(x = pbmnc_integrated_so, pbmnc_atac)
Seurat::DefaultAssay(pbmnc_coembedded) <- 'integrated'
# plot the coembedded data
pbmnc_coembedded <- Seurat::ScaleData(object = pbmnc_coembedded,
features = genes.use,
do.scale = FALSE)
pbmnc_coembedded <- Seurat::RunPCA(object = pbmnc_coembedded,
features = genes.use,
verbose = FALSE)
pbmnc_coembedded <- Seurat::RunUMAP(object = pbmnc_coembedded,
dims = 1:30)
# Cache
save(pbmnc_coembedded, file = file.path(cache_dir, "PBMNC_COEMBEDDED.RDS"))
# Visualization
p3 <- Seurat::DimPlot(pbmnc_coembedded, group.by = 'Method')
plot(p3)
p4 <- Seurat::DimPlot(pbmnc_coembedded, group.by = 'CellType', label = TRUE, repel = TRUE)
plot(p4)
```