-
Notifications
You must be signed in to change notification settings - Fork 0
/
16S_DataProcessing.R
210 lines (159 loc) · 9.65 KB
/
16S_DataProcessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
Code used for processing raw microbiome fastq files to generate an ASV file for (Raina et al. 2022) Chemotaxis shapes the microscale organisation of the ocean's microbiome. This DADA2 pipeline is based on Callahan, B.J. et al. 2016. In this pipeline R1 and R2 were trimmed to remove low quality terminal ends (trunc(R1= 260; R2= 255)), this was found to be the trunc lengths that produced the most quality merged reads for this dataset.
#librarys for this work must be installed and loaded in R
library(R.utils);
library(dada2);
library(ShortRead); packageVersion("ShortRead") # dada2 depends on this
library(tidyverse); packageVersion("dplyr") # for manipulating data
library(Biostrings); # for creating the final graph at the end of the pipeline
library(Hmisc); packageVersion("Hmisc") # for creating the final graph at the end of the pipeline
library(plotly); packageVersion("plotly") # enables creation of interactive graphs, especially helpful for quality plots
library(here); packageVersion("here");
here();
library(readr);
library(Biostrings);
library(DECIPHER); packageVersion("DECIPHER");
#STEP1: Set Directories and make subfolders for your files
home.dir <- ("/home/anna/jb_isca_2022/"); #BPA dataset location
setwd(home.dir);
plates<-read_csv('plates');
args = commandArgs(trailingOnly=TRUE);
p=args[1];
p=1
print(p);
plates[p,1];
home.dir <- ("/home/anna/jb_isca_2022/");
base.dir <- (paste0(home.dir, plates[p,1], "/"));
dir.create(paste0(plates[p,1], ".2021_exports_260_255"));
export_dir <-(paste0(plates[p,1], ".2021_exports_260_255/"));
dir.create(paste0(export_dir, "final"));
trimmed_dir <-(paste0(plates[p,1], ".trimmed/"));
dir.create(paste0(trimmed_dir));
trunc_dir <-(paste0(export_dir, "final"));
trimLeng <-(paste0(trimmed_dir, "final"));
dir.create(paste0(trimLeng));
files.fp.gz<-list.files(path=base.dir, pattern=".fastq.gz");
data.fp.gz <- paste0(plates[p,1], "/");
print(data.fp.gz);
for (i in seq_along (files.fp.gz)){
gunzip(filename=paste0(data.fp.gz,files.fp.gz[i]), overwrite=T)};
files.fp.gz<-list.files(path=base.dir, pattern=".fastq");
data.fp.gz <- paste0(plates[p,1], "/");
print(data.fp.gz);
fnFs <- sort(list.files(data.fp.gz, pattern="_R1.fastq", full.names = TRUE));
fnRs <- sort(list.files(data.fp.gz, pattern="_R2.fastq", full.names = TRUE));
sample.names <- sapply(strsplit(basename(fnFs), "-"), `[`, 1);
fwd.plot.quals <- plotQualityProfile(fnFs[1:6])
ggsave(file = paste0(export_dir,"fwd.qualplot.pdf"), fwd.plot.quals, device="pdf");
rev.plot.quals <- plotQualityProfile(fnRs[1:6])
ggsave(file = paste0(export_dir,"rev.qualplot.pdf"), rev.plot.quals, device="pdf");
FWD <- "AGAGTTTGATCMTGGCTCAG"; ## CHANGE: Bacteria 16S: 27f V1_V3
REV <- "GWATTACCGCGGCKGCTG"; ## CHANGE: Bacteria 16S: 519r V1_V3
allOrients <- function(primer) {
# Create all orientations of the input sequence
require(Biostrings)
dna <- DNAString(primer) # The Biostrings works w/ DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna),
RevComp = reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
FWD.orients <- allOrients(FWD);
REV.orients <- allOrients(REV);
data.fp <- trimmed_dir;
fnFs.filtN <- file.path(data.fp, "filtN", basename(fnFs)); # Put N-filterd files in filtN/ subdirectory
fnRs.filtN <- file.path(data.fp, "filtN", basename(fnRs));
filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN = 0, multithread = TRUE, compress=F, matchIDs=TRUE);
passed.filtN <- file.exists(fnFs.filtN) # TRUE/FALSE vector of which samples passed the filter
fnFs.filtN <- fnFs.filtN[passed.filtN] # Keep only those samples that passed the filter
fnFs <- fnFs[passed.filtN] # Keep only those samples that passed the filter
fnRs.filtN <- fnRs.filtN[passed.filtN] # Keep only those samples that passed the filter
fnRs <- fnRs[passed.filtN] # Keep only those samples that passed the filter
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0));
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]]));
#cutadapt was used to remove FWD and REV primers
cutadapt <- "/usr/bin/cutadapt";
data.fp <- paste0(trimmed_dir, "filtN");
#system2(cutadapt, args = "--version"); # Run shell commands from R
path.cut <- file.path(trimmed_dir, "cutadapt");
if(!dir.exists(path.cut)) dir.create(path.cut);
fnFs.cut <- file.path(path.cut, basename(fnFs));
fnRs.cut <- file.path(path.cut, basename(fnRs));
FWD.RC <- dada2:::rc(FWD);
REV.RC <- dada2:::rc(REV);
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV.RC);
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV, "-A", FWD.RC);
# Run Cutadapt
for(i in seq_along(fnFs)) {
system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
"-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
fnFs.filtN[i], fnRs.filtN[i])) # input files Default error rate: 0.1
}
data.fp <- paste0(trimmed_dir, "cutadapt");
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0));
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]));
data.fp <- paste0(trimmed_dir, "cutadapt");
passed.fnFs.cut <- file.exists(fnFs.cut) # TRUE/FALSE vector of which samples passed the filter
fnFs.cut <- fnFs.cut[passed.fnFs.cut] # Keep only those samples that passed the filter
fnFs <- fnFs[passed.fnFs.cut] # Keep only those samples that passed the filter
fnRs.cut <- fnRs.cut[passed.fnFs.cut] # Keep only those samples that passed the filter
fnRs <- fnRs[passed.fnFs.cut] # Keep only those samples that passed the filter
sample.names <- sapply(strsplit(basename(fnFs), "-"), `[`, 1);
#STEP5: Dada2 trim step (CHANGE TRIM lengths in this step!)
#here you are renaming your new files to be sample name_R1_trim.fastq, no need to change
trimFs <- file.path(trimmed_dir, paste0(sample.names, "-R1_trim.fastq"));
trimRs <- file.path(trimmed_dir, paste0(sample.names, "-R2_trim.fastq"));
names(trimFs) <- sample.names;
names(trimRs) <- sample.names;
head(sample.names);
#In this pipeline R1 and R2 were trimmed to remove low quality terminal ends (trunc(R1= 260; R2= 255)), this was #found to be the trunc lengths that produced the most quality merged reads for this dataset.
out <- filterAndTrim(fnFs.cut, trimFs, fnRs.cut, trimRs, truncLen=c(260,255),
maxN=0, maxEE=c(4,6), truncQ=6, rm.phix=TRUE,
compress=FALSE, multithread=10, minLen = 50, matchIDs=TRUE); # On Windows set multithread=FALSE
head(out);
passed.trim <- file.exists(trimFs); # TRUE/FALSE vector of which samples passed the filter
trimFs <- trimFs[passed.trim]; # Keep only those samples that passed the filter
trimRs <- trimRs[passed.trim]; # Keep only those samples that passed the filter
data.fp <- paste0(trimLeng);
errF <- learnErrors(trimFs, nbases =1e8, verbose=TRUE, multithread=10, MAX_CONSIST=20);
errR <- learnErrors(trimRs, nbases =1e8, verbose=TRUE, multithread=10, MAX_CONSIST=20);
fwd.plot.errors <- plotErrors(errF, nominalQ=TRUE);
ggsave(file = paste0(trunc_dir,"/","fwd.errors.pdf"), fwd.plot.errors, width = 10, height = 10, device="pdf");
rev.plot.errors <- plotErrors(errR, nominalQ=TRUE);
ggsave(file = paste0(trunc_dir,"/","rev.errors.pdf"), rev.plot.errors, width = 10, height = 10, device="pdf");
#STEP7: DEREPLICATION, DADA2 Step, MERGE, and COLLAPSE if the same! SAVE RDS
derepFs<-derepFastq(trimFs);
derepRs<-derepFastq(trimRs);
dadaFs <- dada(derepFs, err=errF, multithread=10, pool="pseudo"); #"pseudo" # or for pooled go: TRUE no ""
dadaRs <- dada(derepRs, err=errR, multithread=10, pool="pseudo");
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, minOverlap = 12, maxMismatch = 1, verbose=TRUE);
# Inspect the merger data.frame from the first sample
head(mergers[[1]]);
seqtab <- makeSequenceTable(mergers);
dim(seqtab);
saveRDS(seqtab, file = paste0(trunc_dir,"/",plates[p,1],"seqtab_260_255_final_mm1.RDS"), ascii = FALSE, version = NULL, compress = TRUE, refhook = NULL)
#chimera removal
getN <- function(x) sum(getUniques(x));
table(nchar(getSequences(seqtab)));
seqtab.nochim.1 <- removeBimeraDenovo(seqtab, method="consensus", multithread=10, verbose=TRUE, minFoldParentOverAbundance=1); #consensus #pooled
dim(seqtab.nochim.1);
sum(seqtab.nochim.1)/sum(seqtab);
#collapse identical sequences
seqtab.nochim.1 <- collapseNoMismatch(seqtab.nochim.1, minOverlap = 200, verbose = T)
#save RDS files of the seqtab files of importance:
saveRDS(seqtab.nochim.1, file = paste0(trunc_dir,"/",plates[p,1],"seqtab.1_260_255_final_mm1.RDS"), ascii = FALSE, version = NULL, compress = TRUE, refhook = NULL)
#seqtab.1_260_255_final_mm1.RDS is used for assigning taxonomy with silva and for subsequent data analysis.