<!-- RADseq.html -->

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />


<title>RADseq</title>

<script src="site_libs/header-attrs-2.28/header-attrs.js"></script>
<script src="site_libs/jquery-3.6.0/jquery-3.6.0.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/cosmo.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<style>
  /* Font sizes for the six heading levels and the document title. */
  h1 { font-size: 34px; }
  h1.title { font-size: 38px; }
  h2 { font-size: 30px; }
  h3 { font-size: 24px; }
  h4 { font-size: 18px; }
  h5 { font-size: 16px; }
  h6 { font-size: 12px; }

  /* Code elements keep the surrounding text colour on a faint tint. */
  code {
    color: inherit;
    background-color: rgba(0, 0, 0, 0.04);
  }

  /* <pre> blocks that carry no class attribute get a white background. */
  pre:not([class]) {
    background-color: white;
  }
</style>
<script src="site_libs/jqueryui-1.13.2/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/default.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>

<style type="text/css">
  /* Utility rules: wrapped code, small caps, underline, two-column
     layout, hanging indents and task lists. */
  code {
    white-space: pre-wrap;
  }
  span.smallcaps {
    font-variant: small-caps;
  }
  span.underline {
    text-decoration: underline;
  }
  div.column {
    display: inline-block;
    vertical-align: top;
    width: 50%;
  }
  div.hanging-indent {
    margin-left: 1.5em;
    text-indent: -1.5em;
  }
  ul.task-list {
    list-style: none;
  }
</style>

<style type="text/css">
  /* Keep whitespace in code elements exactly as written (no wrapping). */
  code {
    white-space: pre;
  }
</style>
<script type="text/javascript">
// Syntax-highlighting setup; does nothing unless highlight.js is present.
(function () {
  if (!window.hljs) {
    return;
  }

  // Configure with an empty `languages` list, then register the on-load hook.
  hljs.configure({languages: []});
  hljs.initHighlightingOnLoad();

  // If the document has already finished loading, also queue an explicit
  // highlighting pass on a zero-delay timer.
  if (document.readyState === "complete") {
    var highlightNow = function () {
      hljs.initHighlighting();
    };
    window.setTimeout(highlightNow, 0);
  }
})();
</script>


<style type="text/css">
  /* Bibliography layout for pandoc --citeproc (2.11 and later). */
  div.csl-bib-body { }

  /* Each entry starts on its own line with no extra gap below it. */
  div.csl-entry {
    margin-bottom: 0em;
    clear: both;
  }

  /* Hanging-indent style: first line flush left, later lines indented. */
  .hanging div.csl-entry {
    text-indent: -2em;
    margin-left: 2em;
  }

  /* Left-margin label column and the inline text placed beside it. */
  div.csl-left-margin {
    float: left;
    min-width: 2em;
  }
  div.csl-right-inline {
    padding-left: 1em;
    margin-left: 2em;
  }

  div.csl-indent {
    margin-left: 2em;
  }
</style>

<link rel="stylesheet" href="tutorial.css" type="text/css" />


<style type="text/css">
/* Centre the main content column and cap its width. */
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
/* Images never overflow their container. */
img {
  max-width: 100%;
}
.tabbed-pane {
  padding-top: 12px;
}
.html-widget {
  margin-bottom: 20px;
}
/* Suppress the focus ring only when focus did not come from the keyboard.
   A bare `:focus { outline: none; }` would leave keyboard users with no
   visible focus indicator on the code-folding button. */
button.code-folding-btn:focus:not(:focus-visible) {
  outline: none;
}
summary {
  display: list-item;
}
/* A lone paragraph inside <summary> stays on the same line as the marker. */
details > summary > p:only-child {
  display: inline;
}
pre code {
  padding: 0;
}
</style>


<style type="text/css">
  /* Nested (fly-out) dropdown menus for the navbar. */
  .dropdown-submenu {
    position: relative;
  }

  /* The nested menu sits to the right of its parent item... */
  .dropdown-submenu > .dropdown-menu {
    top: 0;
    left: 100%;
    margin-top: -6px;
    margin-left: -1px;
    border-radius: 0 6px 6px 6px;
  }

  /* ...and is displayed while the parent item is hovered. */
  .dropdown-submenu:hover > .dropdown-menu {
    display: block;
  }

  /* Arrow after the parent link, drawn from the borders of a zero-size box.
     border-left-color must follow border-color so it is not reset. */
  .dropdown-submenu > a:after {
    content: " ";
    display: block;
    float: right;
    width: 0;
    height: 0;
    margin-top: 5px;
    margin-right: -10px;
    border-style: solid;
    border-width: 5px 0 5px 5px;
    border-color: transparent;
    border-left-color: #cccccc;
  }
  .dropdown-submenu:hover > a:after {
    border-left-color: #adb5bd;
  }

  /* Left-opening variant. */
  .dropdown-submenu.pull-left {
    float: none;
  }
  .dropdown-submenu.pull-left > .dropdown-menu {
    left: -100%;
    margin-left: 10px;
    border-radius: 6px 0 6px 6px;
  }
</style>

<script type="text/javascript">
// On DOM ready: (1) mark the navbar entry for the current page as active and
// (2) inject CSS that offsets the page and its headings by the navbar height.
$(document).ready(function () {
  // File name of the current page: everything after the last "/" of the path.
  // An empty result (the path ends in "/") is treated as index.html.
  // `href` is declared locally; assigning it without `var` would create an
  // implicit global (and throw in strict mode).
  var pathname = window.location.pathname;
  var href = pathname.slice(pathname.lastIndexOf('/') + 1);
  if (href === "") {
    href = "index.html";
  }
  var menuAnchor = $('a[href="' + href + '"]');

  // mark the anchor link active (and if it's in a dropdown, also mark that active)
  var dropdown = menuAnchor.closest('li.dropdown');
  if (window.bootstrap) { // Bootstrap 4+
    menuAnchor.addClass('active');
    dropdown.find('> .dropdown-toggle').addClass('active');
  } else { // Bootstrap 3
    menuAnchor.parent().addClass('active');
    dropdown.addClass('active');
  }

  // Navbar adjustments: height of the first .navbar plus 15px of spacing.
  var navHeight = $(".navbar").first().height() + 15;
  var style = document.createElement('style');
  var pt = "padding-top: " + navHeight + "px; ";
  var mt = "margin-top: -" + navHeight + "px; ";
  var css = "";
  // offset scroll position for anchor links (for fixed navbar): each heading
  // gets padding-top and an equal negative margin-top.
  for (var i = 1; i <= 6; i++) {
    css += ".section h" + i + "{ " + pt + mt + "}\n";
  }
  style.innerHTML = "body {" + pt + "padding-bottom: 40px; }\n" + css;
  document.head.appendChild(style);
});
</script>

<!-- tabsets -->

<style type="text/css">
  /* Tabsets rendered as a dropdown: the tab list becomes a bordered box. */
  .tabset-dropdown > .nav-tabs {
    display: inline-table;
    min-height: 44px;
    max-height: 500px;
    overflow-y: auto;
    border: 1px solid #ddd;
    border-radius: 4px;
  }

  /* Glyph shown before the active tab (and before the list while open). */
  .tabset-dropdown > .nav-tabs > li.active:before,
  .tabset-dropdown > .nav-tabs.nav-tabs-open:before {
    font-family: 'Glyphicons Halflings';
    content: "\e259";
    display: inline-block;
    padding: 10px;
    border-right: 1px solid #ddd;
  }

  /* While the list is open, the active tab uses a different glyph, unbordered. */
  .tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
    font-family: 'Glyphicons Halflings';
    content: "\e258";
    border: none;
  }

  .tabset-dropdown > .nav-tabs > li.active {
    display: block;
  }

  /* Tab links: no border and a transparent background in every state. */
  .tabset-dropdown > .nav-tabs > li > a,
  .tabset-dropdown > .nav-tabs > li > a:focus,
  .tabset-dropdown > .nav-tabs > li > a:hover {
    display: inline-block;
    border: none;
    border-radius: 4px;
    background-color: transparent;
  }

  /* Open list: every tab is stacked as a block. */
  .tabset-dropdown > .nav-tabs.nav-tabs-open > li {
    display: block;
    float: none;
  }

  /* Default: tabs are hidden (the li.active rule above is more specific). */
  .tabset-dropdown > .nav-tabs > li {
    display: none;
  }
</style>

<!-- code folding -->


<style type="text/css">

/* Table-of-contents container spacing. */
#TOC {
  margin: 25px 0px 20px 0px;
}
/* Narrow viewports: the TOC is positioned in normal flow at full width. */
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}

@media print {
.toc-content {
  /* see https://github.com/w3c/csswg-drafts/issues/4434 */
  float: right;
}
}

.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

div.main-container {
  max-width: 1200px;
}

/* TOC width: 20% by default, 25% from 768px to 991px, full width below that. */
div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

.tocify .list-group-item {
  border-radius: 0px;
}

/* Sub-header entries. This selector used to be declared twice (0.90em, then
   0.95em); only the later 0.95em value ever applied, so the dead
   declaration has been dropped. */
.tocify-subheader {
  display: inline;
}
.tocify-subheader .tocify-item {
  font-size: 0.95em;
}

</style>


</head>

<body>


<div class="container-fluid main-container">


<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">


<div class="navbar navbar-inverse  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <!-- Collapse toggle for the #navbar menu. The button contains only
           decorative bars, so aria-label supplies its accessible name. -->
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-bs-toggle="collapse" data-target="#navbar" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">MarineOmics</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="ADMIN_04_best_principles.html">Best Principles</a>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Contributions
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="ADMIN_01_submissions_instructions.html">Guide for Building a Page</a>
    </li>
    <li>
      <a href="ADMIN_02_contributions.html">Past and Current Contributors</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Population Genomics
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="POP_01_choosing_population_genetics.html">Choosing a Population Genomics Approach</a>
    </li>
    <li>
      <a href="POP_04_WGS_intro.html">Whole Genome Resequencing</a>
    </li>
    <li>
      <a href="RADseq.html">Reduced Representation Sequencing</a>
    </li>
    <li>
      <a href="POP_03_poolseq.html">Poolseq</a>
    </li>
    <li>
      <a href="RDAtraitPredictionTutorial.html">Redundancy Analysis (RDA) Trait Prediction</a>
    </li>
    <li>
      <a href="POP_08_PCA.html">PCA</a>
    </li>
    <li>
      <a href="POP_09_aDNA.html">Ancient &amp; Degraded DNA</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Functional Genomics
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="DGE_comparison_v2.html">Multifactorial RNAseq</a>
    </li>
    <li>
      <a href="FUN_02_DNA_methylation.html">DNA Methylation Assessment</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Genome-Phenome
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li class="dropdown-header">coming soon!</li>
  </ul>
</li>
<li>
  <a href="ADMIN_03_panels.html">Panel Seminars</a>
</li>
<li>
  <a href="https://github.com/MarineOmics/marineomics.github.io/discussions">Discussion Forum</a>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->
<!-- Google Analytics 4 via gtag.js. The "G-" measurement ID below is a GA4
     ID; the legacy analytics.js loader and its ga('create', ...) call only
     accept Universal Analytics ("UA-") IDs, so this page uses the Google tag. -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-53GH9PV49T"></script>
<script>
  // Command queue that gtag.js reads once it has loaded.
  window.dataLayer = window.dataLayer || [];
  function gtag() { dataLayer.push(arguments); }

  gtag('js', new Date());
  // Registers the property; per the Google tag documentation this also
  // records the page view by default.
  gtag('config', 'G-53GH9PV49T');
</script>

<div id="header">


<h1 class="title toc-ignore">RADseq</h1>
<h3 class="subtitle"><em>Katherine Silliman, Danielle
Davenport</em></h3>

</div>


<p>Initial publication year: 2022 <br> <a
href="https://marineomics.github.io/#How_to_Cite">How to cite</a></p>
<div id="setup-for-running-code" class="section level1">
<h1>Setup for running code</h1>
<p>If you would like to run the R code examples that are scattered
throughout the guide (recommended but not required!), you will need to
install some R packages. You only need to run this code once:</p>
<pre class="r"><code>install.packages(&quot;tidyverse&quot;)
if (!requireNamespace(&quot;BiocManager&quot;, quietly = TRUE))
    install.packages(&quot;BiocManager&quot;)

BiocManager::install(&quot;SeqArray&quot;)
BiocManager::install(&quot;SNPRelate&quot;)</code></pre>
<p>Now load those packages, if using:</p>
<pre class="r"><code>library(SeqArray) # efficient storage and filtering of genomic data</code></pre>
<pre><code>## Loading required package: gdsfmt</code></pre>
<pre><code>## 
## Attaching package: &#39;SeqArray&#39;</code></pre>
<pre><code>## The following object is masked from &#39;package:stringr&#39;:
## 
##     fixed</code></pre>
<pre class="r"><code>library(tidyverse) # plotting data formatting and manipulation
library(SNPRelate) # PCA and other popgen analyses</code></pre>
<pre><code>## SNPRelate</code></pre>
<p>We also provide optional alternative coding examples that are based
only on the command line, requiring these software packages:</p>
<ul>
<li><a
href="https://vcftools.github.io/man_latest.html">vcftools</a></li>
<li><a
href="https://riptutorial.com/gnuplot/example/11275/installation-or-setup">gnuplot</a></li>
</ul>
</div>
<div id="reduced-representation-sequencing-radseqgbs"
class="section level1">
<h1>Reduced Representation Sequencing (RADseq/GBS)</h1>
<p>“Restriction-site Associated DNA sequencing” - RADseq - combines
restriction enzymes with next-gen, massively parallel, short-read
sequencing. RADseq uses restriction enzymes to cut (digest) DNA at
enzyme-specific cut sites. RADseq comes in
different flavors. Double-digest RADseq (ddRAD; <span
class="citation">Peterson et al. (2012)</span>) selects markers with two
restriction enzymes with different cut frequencies.
Genotype-by-Sequencing (GBS; <span class="citation">Elshire et al.
(2011)</span>) uses a frequent-cutting restriction enzyme with PCR size
selection. There are a number of reviews comparing different RADseq and
GBS methods (e.g., <span class="citation">Andrews et al. (2016)</span>).
In this guide we use the term “RADseq” to refer to any of these
protocols, including those that don’t involve random shearing of DNA.
When recommendations are specific to a certain type of RAD/GBS, we will
explicitly say so.</p>
<!--Some aspects of RADseq data that are unique compared to whole genome resequencing (more):-->
<p>Using RADseq to generate single nucleotide polymorphisms (SNPs)
involves:</p>
<ol style="list-style-type: decimal">
<li>library preparation in the lab</li>
<li>bioinformatic processing through assembly and/or mapping to a
reference, then</li>
<li>filtering of SNPs and individuals for quality.</li>
</ol>
<p>All of these steps can (and will) introduce some error, so the goal
is to minimize this error through mitigation steps at all three parts of
the process. Every dataset is different!<br />
<!--add more about philosophy over best practices--></p>
<p>Much of this guide is directly inspired by <a
href="https://onlinelibrary.wiley.com/doi/10.1111/mec.14792">this
excellent review paper</a> <span class="citation">(O’Leary et al.
2018)</span>, especially the section on minimizing errors due to library
prep. We recommend reading and cross-referencing with this paper, and
citing it if you follow its suggestions. Table 1 from this paper
summarizes the various potential issues that can arise from RAD datasets
and some mitigation steps. The goal of this guide is to expand on the
O’Leary paper and provide some example code to help implement quality
control and mitigation steps.</p>
<div class="float">
<img src="POP_02_RADseq_files/oleary_table1.png"
alt="Table 1 from (O’Leary et al. 2018)." />
<div class="figcaption">Table 1 from <span class="citation">(O’Leary et
al. 2018)</span>.</div>
</div>
</div>
<div id="labwork" class="section level1">
<h1>Considerations During Lab Work</h1>
<p>There are steps you can take before you even sequence RAD libraries
that can help minimize issues downstream. Here, we use “library” to
refer to a set of RADseq fragments from a group of individuals that are
barcoded and sequenced together on a single lane or group of lanes.
While specific RAD library prep methods have their own nuances for
minimizing error, there are some steps you can take that are common
across methods.</p>
<ul>
<li><p>If this RAD/GBS method has not been done in your species or in
your molecular lab setup before, spend some time optimizing the protocol
using a representative subset of individuals. Then try to keep
everything about the library prep as consistent as possible across
samples (eg, DNA extraction kit, PCR cycles, sequencing platform). This
isn’t always possible, especially if you need to optimize the protocol
for certain tricky samples. Just make sure to keep track of
everything!</p></li>
<li><p>Randomize samples across library prep batches and sequencing
lanes! For example, if you are sequencing two different groups of
samples on two different sequencing lanes, make sure they are randomized
with respect to sample location or whatever your groups of interest are.
If you are preparing groups of individuals in different batches to be
pooled later, randomize across these batches. <a
href="https://mastrettayanes-lab.org/">Alicia Mastretta-Yanes</a> even
recommends randomizing across DNA extractions, especially if the person
doing the extractions is new to molecular work. This is to allow you to
control for potential <a href="#batch">batch effects</a> that are often
observed with RAD data.</p></li>
</ul>
<!--ADW: add figure showing random plate design-->
<ul>
<li>Keep track of all potential batch effect sources in a Metadata file
(eg, storage conditions of tissue/DNA, date/method of DNA extraction,
date/method library prep batch, sequencing lane).<br />
</li>
<li>Have a core set of 2-4 technical sample replicates across all
libraries. Ideally these are true technical replicates, meaning the same
tissue/DNA sample is processed multiple times all the way from library
preparation to sequencing. Sequencing replicates, where you sequence the
same sample library preparation multiple times, can also be useful for
downstream quality control.</li>
</ul>
<!--ADW: add figure or text to define sequencing replicates vs technical replicates-->
</div>
<div id="principles-for-analyzing-your-data" class="section level1">
<h1>Principles for Analyzing Your Data</h1>
<div id="steps-for-a-robust-rad-analysis" class="section level2">
<h2>Steps for a robust RAD analysis</h2>
<p>This is just one approach for working through your data. Some people
will prefer to run just a subset of samples through a pipeline at first
and evaluate parameters, then run all the samples through. Either way,
be prepared to make MULTIPLE assemblies and go through this process
iteratively, especially if this is a new-to-you study system.</p>
<!-- 1) [Look at your raw data.](#fastqc)
2) [Run an assembly pipeline](#assembly) through to a SNP dataset, using either all samples or a representative subset. Parameters can come from those used in a similar study, or default parameters.
3) Filter your data minimally, then [evaluate for potential sources of error.](#error)
4) Subset or remove individuals based on initial evaluation.  
5) Using a representative subset of samples, [test key parameters](#test) to optimize your assembly.
6) Run your optimized assembly on all non-removed samples.
7) Evaluate the difference between multiple filtering schemes for your analyses of interest. *Popgen analysis guide coming soon*
8) Repeat as needed. -->
<ol style="list-style-type: decimal">
<li><a href="#fastqc">Look at your raw data.</a></li>
<li><a href="#assembly">Run an assembly pipeline</a>, using either all
samples or a representative subset.</li>
<li><a href="#error">Evaluate potential sources of error.</a></li>
<li><a href="#bad">Remove individuals based on initial
evaluation</a>.<br />
</li>
<li><a href="#test">Test a range of key parameters</a> to optimize your
assembly.</li>
<li>Run your optimized assembly on all non-removed samples.</li>
<li><a href="#filter">Filter your SNPs</a></li>
<li><a href="#repeat">Repeat as needed!</a></li>
</ol>
</div>
<div id="fastqc" class="section level2">
<h2>First, look at the raw data!</h2>
<p>Always look at your data with <a
href="https://www.bioinformatics.babraham.ac.uk/projects/fastqc/">FastQC</a>
before starting an assembly. First, this is a good check to just make
sure the sequencing worked. If you have demultiplexed data, you can use
<a href="https://multiqc.info/">MultiQC</a> to generate FastQC plots for
all individuals and quickly identify ones that did not sequence
well.</p>
<p>Check out <a
href="https://datacarpentry.org/wrangling-genomics/02-quality-control/index.html">this
informative tutorial</a> on running FastQC and interpreting the
output.</p>
<p><strong>Questions to ask:</strong> Do you have a lot of adapter
sequences? Are the ends of your reads very low quality? If so, you
should expect a fair amount of trimming and read filtering to occur
prior to assembly. If that doesn’t occur or if too many reads are being
filtered so as to only recover a small number of SNPs, something might
need to be tweaked with your trimming and filtering step.
<!--ADW: elaborating what you mean by 'tweaked' or possibly providing a link to some resources for how someone might thoughtfully change the parameters for their trimming and filtering steps.--></p>
<p>You should also look at the top few reads in the Terminal.</p>
<pre><code>## zcat: unZip and conCATenate the file to the screen
## head -n 20: Just take the first 20 lines of input

$ zcat raw-fastqs/BC2_10_C5.fastq.gz | head -n 20
@BC2_10_C5.1 1 length=96
CAGCGTGTAATAGTCACCGGCGGCTCCCTCTGGAGAATAGCACAAGTGATCATTTTGCTCATCTTCCGTCCACTGGTGATTGTGGACCAGCCTCAC
+BC2_10_C5.1 1 length=96
&lt;GGGGGA&lt;GGGGIIIGIGGIGGGIIIIIGGGGGGGGGGGGGGGGGIIIIIIIIGGIIIGGGGIGIIGIGGGIIIIIAGGGGIIIGIIGGGGGAAGG
@BC2_10_C5.2 2 length=96
CTGCTACATGCAGTGTTCTGTATTACTTTTATTGTACGTTGATATGAATGAATGAGTGTTTTGTATACTTAGAGTACAAGTTTGTCAGTCATATCG
+BC2_10_C5.2 2 length=96
GIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIG
@BC2_10_C5.3 3 length=96
CAGCACATGTTCCTGTGTAGAAAGCTTGTTAGTAGAATAAATAACACATGGCTGGTCAAACACAACACATGAAGAAACAACTTTCTGAACAGTTTT
+BC2_10_C5.3 3 length=96
GIIGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGIIIIIIIIIIIIIIIIIIIIIIIIGGGGIIGIIIGGGII
@BC2_10_C5.4 4 length=96
CAGCGATTCGGCCCAAATTTGCACCACATCAGGCCCTTGACAGGGCGCTTCGATGGTGCAAATTTGGTGCGATTCGCTGCGCACCTAGCATATATG
+BC2_10_C5.4 4 length=96
AGGGIGIIIGIGGGIIGIGGGIIIGGGGAGGGGGAGGIGIGGIGGGGAGGGGGGGGGGGGGGGIIIII.G&lt;GGIGGGIIIGGGGGIGGIIIGAGGI
@BC2_10_C5.5 5 length=96
CAGCAGTTTGGTGGAGTTCTGCAACCTTCCATTTCCAAAGAATTACCCAGGAGCTCTTCCCAGTGAATTTCTTCGGCACTTTTCATTGACCTTTTA
+BC2_10_C5.5 5 length=96
GGAGAAA.&lt;AGAA.G.GGAGA&lt;.GGA.&lt;GAAGGAAGGIA&lt;...&lt;GA&lt;..&lt;G.&lt;.&lt;&lt;.&lt;AAAGGGG..&lt;GGGGG&lt;G.A.&lt;GGGII.AG..&lt;.GGGGG
</code></pre>
<p>If the sequencing center gave you one fastq file with all your
samples, you should expect to see a barcode sequence, followed by the
cutsite at the start of the read. If the data are already demultiplexed
(as the example is), you should only see the cutsite overhang (in this
case, CWGC). Sometimes you can look at your fastq data files and see
that there was a problem with the sequencing such that the cut site is
either offset by one or more bases, or contains many errors. If this is
not being addressed by the default filtering steps in your assembly
pipeline, you can trim off N bases from the beginning or end of R1 and
R2 reads in <a
href="https://ipyrad.readthedocs.io/en/latest/index.html">ipyrad</a>
(<code>trim_reads</code> param), or with cutadapt before using Stacks,
or customize the Trimmomatic step for <a
href="https://www.ddocent.com/">dDocent</a>.</p>
</div>
<div id="assembly" class="section level2">
<h2>Run an assembly pipeline</h2>
<p>There are a number of freely available pipelines for processing
RADseq data, with the most popular being <a
href="https://www.ddocent.com/">dDocent</a>, <a
href="https://catchenlab.life.illinois.edu/stacks/">Stacks</a>, and <a
href="https://ipyrad.readthedocs.io/en/latest/index.html">ipyrad</a>.
These pipelines vary slightly in their underlying methodologies,
customization options, and additional included analyses.</p>
<table>
<colgroup>
<col width="25%" />
<col width="25%" />
<col width="25%" />
<col width="25%" />
</colgroup>
<thead>
<tr class="header">
<th></th>
<th><a href="https://www.ddocent.com/">dDocent</a></th>
<th><a
href="https://ipyrad.readthedocs.io/en/latest/index.html">ipyrad</a></th>
<th><a
href="https://catchenlab.life.illinois.edu/stacks/">Stacks2</a></th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Supported datatypes</td>
<td>Paired-end: ddRAD, ezRAD, RAD (random shearing), data with large
overlap between forward and reverse reads; Single-end: any RADseq method
de novo and reference-based. If doing de novo assembly, reads cannot be
trimmed outside of dDocent</td>
<td>Wide range of RAD/GBS methods with paired/single end data, see <a
href="https://ipyrad.readthedocs.io/en/master/4-data.html">ipyrad
documentation</a></td>
<td>Paired-end and single-end data for single and double digest RAD and
DART; GBS with single-end sequencing. NOT suitable for paired-end
GBS.</td>
</tr>
<tr class="even">
<td>Unique aspects</td>
<td>Novel data reduction approach used to inform coverage cutoffs</td>
<td>Python API for popgen and phylogenetic analyses written specifically
for RAD data</td>
<td><code>populations</code> module calculates sliding window and
site-specific popgen metrics</td>
</tr>
<tr class="odd">
<td>Documentation</td>
<td>Very good (esp. tutorials), active community support on Google
Groups</td>
<td>Excellent (esp. installation, parameter explanations, and
tutorials), active community support on Gitter</td>
<td>Very good, in depth tutorials as published manuscripts, active
community support on Google Groups</td>
</tr>
<tr class="even">
<td>Speed/Accuracy (based on discussion in <a
href="https://www.youtube.com/watch?v=C74GBESeIq4">panel
seminar</a>)</td>
<td>fastest and most accurate</td>
<td>close to dDocent accuracy, more over-splitting is possible if
parameters are not tuned</td>
<td>produces some untrue genome fragments (esp. with higher levels of
indel polymorphism), but can be addressed with downstream filtering</td>
</tr>
<tr class="odd">
<td>Open source/development</td>
<td>Open source, primarily combines existing software</td>
<td>Open source</td>
<td>No</td>
</tr>
<tr class="even">
<td>Filtering options</td>
<td>Minimal default filtering as implemented in VCFtools. Ideal for
those who want full freedom in filtering their SNPs.</td>
<td>Wide range of filtering options, some hard-coded filtering to deal
with paralogs</td>
<td>Moderate range of filtering options</td>
</tr>
<tr class="odd">
<td>Output options</td>
<td>Only produces a VCF, fasta of the de novo assembly, and individual
BAM files from mapping to the reference</td>
<td>Lots of output formats (see <a
href="https://ipyrad.readthedocs.io/en/master/output_formats.html">documentation</a>)</td>
<td>Lots of Stacks-specific output and log files, as well as various
inputs for popgen programs (see <a
href="https://catchenlab.life.illinois.edu/stacks/comp/populations.php">documentation</a>)</td>
</tr>
<tr class="even">
<td>Popgen analyses</td>
<td>No additional popgen analyses included</td>
<td>Unique, flexible Python API implementing range of popgen and
phylogenetic methods (see <a
href="https://ipyrad.readthedocs.io/en/master/API-analysis/index.html">documentation</a>)</td>
<td>Popgen summary statistics (F-statistics, pi), including
kernel-smoothing along reference genome. Underlying equations and
assumptions for these are not very clear.</td>
</tr>
<tr class="odd">
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
</tbody>
</table>
<p>Some groups have also developed pipelines for specific flavors of RAD
(e.g., <a
href="https://github.com/z0on/2bRAD_denovo/blob/master/2bRAD_README.sh">Matz
lab for 2b-RAD</a>) or proprietary software (eg, <a
href="http://georges.biomatix.org/storage/app/media/uploaded-files/dartR_Workbook.pdf">DARTseq</a>).
<strong>In most cases, it is recommended that you use a pipeline
developed and tested for RAD data, especially if you are making a de
novo assembly.</strong> <span class="citation">LaCava et al.
(2020)</span> have an excellent study in which they review various <em>de
novo</em> assemblers that are used in these pipelines. Assemblers that
were not explicitly developed for short reads (eg, Velvet, ABySS)
performed very poorly, while CD-HIT (the assembler in dDocent) performed
the best. Stacks/Stacks2 and vsearch (the assembler in ipyrad) both
performed worse when analyzing simulated data with indels. It should be
noted that all of these RAD pipelines have optimized the specific
parameters of these de novo assemblers to work with RAD data of various
flavors, which may not be fully reflected in the LaCava study.</p>
<p>Based on the MarineOmics RAD panel, the general consensus is that all
the popular pipelines can produce adequately accurate datasets (with
appropriate parameter optimization and data filtering).<br />
<img src="POP_02_RADseq_files/lacavaTable3.png"
alt="Table 3 from LaCava et al. (2020). CD-HIT is used in dDocent, VSEARCH is used in ipyrad." /></p>
</div>
<div id="error" class="section level2">
<h2>Evaluate potential sources of error</h2>
<p>Once you have processed your samples (or a subset of samples) through
a genotyping pipeline, you will have a bunch of different output file
options for genetic data. The VCF file is one of the most popular file
formats, and is the most versatile for initial data exploration as many
programs exist to filter and accept VCF files. Below is a list of
potential confounding factors that may exist in your data, and how to
tease them out.</p>
<p><strong>Using SeqArray in R</strong> If you are following along with
the R code examples, we first need to read in our data (skip this section
if not running R code).</p>
<p>For the code examples in this section, we primarily make use of the R
package SeqArray <span class="citation">(Zheng et al. 2017)</span>,
which can read and manipulate VCFs. If you’re familiar with R, SeqArray
is simple to use. The package can efficiently store sequence variant
calls with hundreds of thousands of individuals, variants (e.g., SNPs,
indels and structural variation calls) and annotations. It employs a
data container called a CoreArray Genomic Data Structure (GDS). It’s
super-fast (5X faster than PLINK v1.9; 16X faster than vcftools) and it
integrates well with other R packages you might use in your analysis
pipeline (e.g., SNPRelate, SeqVarTools). We also like it because you can
filter your data before running certain analyses without first generating
a separate filtered VCF file.</p>
<p>First, we need to convert our VCF file into the GDS format. We will
do this once here, then use the GDS file for subsequent code
examples.</p>
<pre class="r"><code>filename = &quot;OL_subset&quot; #replace with your file name 
filename.gds = paste0(&quot;POP_02_RADseq_files/&quot;, paste0(filename, &quot;.gds&quot;))
filename.vcf = paste0(&quot;POP_02_RADseq_files/&quot;, paste0(filename, &quot;.vcf&quot;))
  # 1 . Convert VCF to GDS
SeqArray::seqVCF2GDS(vcf.fn = filename.vcf, out.fn = filename.gds, storage.option=&quot;ZIP_RA&quot;)</code></pre>
<pre><code>## Sat Mar 18 13:56:29 2023
## Variant Call Format (VCF) Import:
##     file(s):
##         OL_subset.vcf (22.3M)
##     file format: VCFv4.0
##     genome reference: pseudo-reference (most common base at site)
##     the number of sets of chromosomes (ploidy): 2
##     the number of samples: 18
##     genotype storage: bit2
##     compression method: ZIP_RA
##     # of samples: 18
## Output:
##     POP_02_RADseq_files/OL_subset.gds
## Parsing &#39;OL_subset.vcf&#39;:
## + genotype/data   { Bit2 2x18x70005 ZIP_ra(34.4%), 216.0K }
## Digests:
##     sample.id  [md5: dd28e5c928ffd0a817743a0e9447a808]
##     variant.id  [md5: 03df6156357de104368e6ed4694ebf92]
##     position  [md5: 690fe39440c87b7bfb5eeb07d7c0a310]
##     chromosome  [md5: 9f967c382b54d12060ab45d8c293652b]
##     allele  [md5: fb5bdfa95fb2448dd960bb41c96f7bff]
##     genotype  [md5: 25350d469a1102cc70b367663332282f]
##     phase  [md5: cd66242aebb89cfc3f2082c9413847dc]
##     annotation/id  [md5: c95d7c12f4fdae536da42bfec73942c9]
##     annotation/qual  [md5: 2864e4ded2a2cdc9dccfeb45c4fb3465]
##     annotation/filter  [md5: 2134c67ca1fdd51b7d3bf17ca1ca2c9e]
##     annotation/info/NS  [md5: 4ad6db26b594a567be24b0bff7f1f909]
##     annotation/info/DP  [md5: c0e5e2a0856c1f43788d2fa9842d2e7a]
##     annotation/format/DP  [md5: ae6991de5adbf082a733198faf0f11ca]
##     annotation/format/CATG  [md5: 71d62ffa848999a375e8bb5f329be275]
## Done.
## Sat Mar 18 13:56:30 2023
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
##     open the file &#39;POP_02_RADseq_files/OL_subset.gds&#39; (2.8M)
##     # of fragments: 214
##     save to &#39;POP_02_RADseq_files/OL_subset.gds.tmp&#39;
##     rename &#39;POP_02_RADseq_files/OL_subset.gds.tmp&#39; (2.8M, reduced: 1.8K)
##     # of fragments: 64
## Sat Mar 18 13:56:30 2023</code></pre>
<pre class="r"><code>gdsin = SeqArray::seqOpen(filename.gds)
print(paste0(&quot;The number of SAMPLES in data: &quot;, length(c(SeqArray::seqGetData(gdsin, &quot;sample.id&quot;)))))</code></pre>
<pre><code>## [1] &quot;The number of SAMPLES in data: 18&quot;</code></pre>
<pre class="r"><code>print(paste0(&quot;The number of SNPs in data: &quot;,  length(c(SeqArray::seqGetData(gdsin, &quot;variant.id&quot;)))))</code></pre>
<pre><code>## [1] &quot;The number of SNPs in data: 69989&quot;</code></pre>
<p>It is always helpful to have a metadata file with information for
each sample, such as sampling site, sequencing library, etc. In our
example, our metadata file (OL.popmap) is tab-delimited and has the
column headers:<br />
ID: sample ID<br />
STRATA: sampling location/population<br />
PLATE: sequencing batch<br />
Next we read in our metadata file, and make sure the samples are in the
same order as your VCF file:</p>
<pre class="r"><code>metafile = &quot;POP_02_RADseq_files/OL.popmap&quot;
sample.ids = seqGetData(gdsin, &quot;sample.id&quot;)
sample.strata =  read.table(metafile, header = T, sep = &quot;\t&quot;) %&gt;%
                  dplyr::select(ID, STRATA, PLATE)</code></pre>
<p>Now, on to evaluating our data!</p>
<div id="bad" class="section level3">
<h3>“Bad” samples</h3>
<p>Sometimes a sample doesn’t sequence well (few sequencing reads,
higher error rate). This can be due to DNA quality, an issue during
library prep, or not enough sequencing depth (average # of reads per
sample). Generally, it will lead to a sample with fewer sequencing
reads, higher missing data in a SNP dataset, and fewer shared loci with
other samples. Identifying and then removing these samples
<em>prior</em> to the final RADseq assembly analysis can help minimize
mis-assembled loci, genotyping errors, and excessive filtering of
acceptable loci.</p>
<p>The process of identifying low quality individuals is usually
iterative, as the way you initially filter your SNPs will influence the
amount of missing data and locus sharing among samples. This is why we
recommend minimally filtering your SNPs for sample coverage (the # of
individuals a locus is called in) when initially exploring your data.
Some ways to identify bad samples:</p>
<ol style="list-style-type: decimal">
<li>For every SNP dataset you generate, it is a good idea to always
evaluate the missingness per sample (and report this distribution in
your manuscript!). Identify samples with way more missingness than the
rest, and observe how they look in a PCA and locus sharing plot. If they
stick out or all cluster together in the middle, then try removing them
from the assembly and seeing if it changes downstream analyses. If so,
you may want to specify a missingness cutoff for including samples in
the final analysis.</li>
</ol>
<p><strong>Missingness in R with SeqArray</strong></p>
<pre class="r"><code>#using previously loaded gdsin object
print(&quot;Per variant: &quot;)</code></pre>
<pre><code>## [1] &quot;Per variant: &quot;</code></pre>
<pre class="r"><code>summary(m1 &lt;- SeqArray::seqMissing(gdsin, per.variant=TRUE))</code></pre>
<pre><code>##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2778  0.3889  0.3855  0.4444  0.9444</code></pre>
<pre class="r"><code>print(&quot;Per sample: &quot;)</code></pre>
<pre><code>## [1] &quot;Per sample: &quot;</code></pre>
<pre class="r"><code>summary(m2 &lt;- SeqArray::seqMissing(gdsin, per.variant=FALSE))</code></pre>
<pre><code>##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1218  0.1574  0.2287  0.3855  0.6213  0.9177</code></pre>
<pre class="r"><code>samples &lt;- SeqArray::seqGetData(gdsin, &quot;sample.id&quot;)
cbind(samples,m2)[order(-m2),]</code></pre>
<pre><code>##       samples     m2                 
##  [1,] &quot;OR3_13_C3&quot; &quot;0.917672777150695&quot;
##  [2,] &quot;CA4_12_C1&quot; &quot;0.886539313320665&quot;
##  [3,] &quot;OR3_3_C7&quot;  &quot;0.873308662789867&quot;
##  [4,] &quot;CA4_13_C1&quot; &quot;0.692623126491306&quot;
##  [5,] &quot;BC2_6_C2&quot;  &quot;0.634999785680607&quot;
##  [6,] &quot;BC2_9_C4&quot;  &quot;0.580248324736744&quot;
##  [7,] &quot;CA4_8_C9&quot;  &quot;0.365486004943634&quot;
##  [8,] &quot;CA4_1_C4&quot;  &quot;0.292274500278615&quot;
##  [9,] &quot;OR3_1_C3&quot;  &quot;0.245538584634728&quot;
## [10,] &quot;CA4_2_C3&quot;  &quot;0.211790424209519&quot;
## [11,] &quot;CA4_14_C8&quot; &quot;0.201074454557145&quot;
## [12,] &quot;OR3_20_C4&quot; &quot;0.180756976096244&quot;
## [13,] &quot;BC2_10_C5&quot; &quot;0.165683178785238&quot;
## [14,] &quot;OR3_9_C8&quot;  &quot;0.154638586063524&quot;
## [15,] &quot;OR3_5b_C6&quot; &quot;0.15163811456086&quot; 
## [16,] &quot;BC2_17_C7&quot; &quot;0.132277929388904&quot;
## [17,] &quot;BC2_12_C6&quot; &quot;0.130006143822601&quot;
## [18,] &quot;BC2_16_C6&quot; &quot;0.121776279129578&quot;</code></pre>
<pre class="r"><code>#plot histogram 
hist(m2,breaks=50)</code></pre>
<p><img src="POP_02_RADseq_files/figure-html/unnamed-chunk-7-1.png" alt="Histogram of the proportion of missing genotypes per sample" width="960" /></p>
<p>Another common method of filtering and evaluating your data without
ever using R is <a
href="https://vcftools.github.io/man_latest.html">VCFtools</a>.</p>
<p><strong>Missingness with vcftools, on the command line</strong></p>
<pre class="bash"><code>vcftools --vcf POP_02_RADseq_files/OL_subset.vcf --missing-indv --out POP_02_RADseq_files/OL_subset
# sort the samples from most to least missing data, keeping the header line first
cat POP_02_RADseq_files/OL_subset.imiss | (read h; echo &quot;$h&quot;; sort -k5 -r) </code></pre>
<pre><code>## /Users/jason/.bashrc: line 1: /Users/jason/perl5/perlbrew/etc/bashrc: No such file or directory
## 
## VCFtools - 0.1.17
## (C) Adam Auton and Anthony Marcketta 2009
## 
## Parameters as interpreted:
##  --vcf POP_02_RADseq_files/OL_subset.vcf
##  --missing-indv
##  --out POP_02_RADseq_files/OL_subset
## 
## After filtering, kept 18 out of 18 Individuals
## Outputting Individual Missingness
## After filtering, kept 69989 out of a possible 69989 Sites
## Run Time = 0.00 seconds
## INDV N_DATA  N_GENOTYPES_FILTERED    N_MISS  F_MISS
## OR3_13_C3    69989   0   64227   0.917673
## CA4_12_C1    69989   0   62048   0.886539
## OR3_3_C7 69989   0   61122   0.873309
## CA4_13_C1    69989   0   48476   0.692623
## BC2_6_C2 69989   0   44443   0.635
## BC2_9_C4 69989   0   40611   0.580248
## CA4_8_C9 69989   0   25580   0.365486
## CA4_1_C4 69989   0   20456   0.292275
## OR3_1_C3 69989   0   17185   0.245539
## CA4_2_C3 69989   0   14823   0.21179
## CA4_14_C8    69989   0   14073   0.201074
## OR3_20_C4    69989   0   12651   0.180757
## BC2_10_C5    69989   0   11596   0.165683
## OR3_9_C8 69989   0   10823   0.154639
## OR3_5b_C6    69989   0   10613   0.151638
## BC2_17_C7    69989   0   9258    0.132278
## BC2_12_C6    69989   0   9099    0.130006
## BC2_16_C6    69989   0   8523    0.121776</code></pre>
<p>You can open the file POP_02_RADseq_files/OL_subset.imiss in any text
editor and look at the missingness values by eye.</p>
<p>To plot the missingness on the command line, you can use gnuplot:</p>
<pre class="bash"><code>#code from Jon Puritz
mawk &#39;!/IN/&#39; POP_02_RADseq_files/OL_subset.imiss | cut -f5 &gt; totalmissing
gnuplot &lt;&lt; \EOF
set terminal dumb size 120, 30
set autoscale
unset label
set title &quot;Histogram of % missing data per individual&quot;
set ylabel &quot;Number of Occurrences&quot;
set xlabel &quot;% of missing data&quot;
#set yr [0:100000]
binwidth=0.01
bin(x,width)=width*floor(x/width) + binwidth/2.0
plot &#39;totalmissing&#39; using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
EOF</code></pre>
<pre><code>## /Users/jason/.bashrc: line 1: /Users/jason/perl5/perlbrew/etc/bashrc: No such file or directory
##                                                                                                                         
##                                                                                                                         
##                                          Histogram of % missing data per individual                                     
##        2 +----------------------------------------------------------------------------------------------------------+   
##          |   ** *    +           +           +           +          +           +           +           +           |   
##          |   ** *                                             &#39;totalmissing&#39; using (bin($1,binwidth)):(1.0) ******* |   
##          |   ** *                                                                                                   |   
##          |   ** *                                                                                                   |   
##          |   ** *                                                                                                   |   
##      1.5 |-+ ** *                                                                                                 +-|   
##          |   ** *                                                                                                   |   
##          |   ** *                                                                                                   |   
##          |   ** *                                                                                                   |   
##          |   ** *                                                                                                   |   
##        1 |-**** *******************************************************************************************       +-|   
##          | * ** * * * * *    *      *                 *               *     *             *           * * *         |   
##          | * ** * * * * *    *      *                 *               *     *             *           * * *         |   
##          | * ** * * * * *    *      *                 *               *     *             *           * * *         |   
##          | * ** * * * * *    *      *                 *               *     *             *           * * *         |   
##          | * ** * * * * *    *      *                 *               *     *             *           * * *         |   
##      0.5 |-* ** * * * * *    *      *                 *               *     *             *           * * *       +-|   
##          | * ** * * * * *    *      *                 *               *     *             *           * * *         |   
##          | * ** * * * * *    *      *                 *               *     *             *           * * *         |   
##          | * ** * * * * *    *      *                 *               *     *             *           * * *         |   
##          | * ** * * * * *    *      *                 *               *     *             *           * * *         |   
##          | * ** * * *+* *    *   +  *        +        *  +          + *     *   +         * +         * * *         |   
##        0 +----------------------------------------------------------------------------------------------------------+   
##         0.1         0.2         0.3         0.4         0.5        0.6         0.7         0.8         0.9          1   
##                                                       % of missing data                                                 
## </code></pre>
<p>It looks like samples OR3_13_C3, CA4_12_C1, OR3_3_C7, CA4_13_C1,
BC2_6_C2, and BC2_9_C4 have a lot of missing data relative to other samples.
We can remove them from our dataset for now, and just keep exploring our
data, but if you decide to exclude them permanently from the analysis
you should eventually rerun your genotyping pipeline without those
samples, esp. if doing a de novo assembly.</p>
<ol start="2" style="list-style-type: decimal">
<li>Another great way to explore missingness with your data is to
construct a heatmap of loci that are genotyped between pairs of
samples. Generally, samples that are more closely related will share
more loci with each other due to cutsite dropout. Deviations from that
pattern can pinpoint bad samples as well as batch effects.</li>
</ol>
<ul>
<li><a
href="https://ipyrad.readthedocs.io/en/latest/API-analysis/cookbook-sharing.html">ipyrad
API tutorial</a> for making a locus sharing heatmap, requires ipyrad
output or ipyrad’s vcf2hdf5 conversion<br />
</li>
<li><a href="https://github.com/atcg/clustOpt">code to generate locus
sharing/missingness heatmap from a VCF</a></li>
</ul>
</div>
<div id="the-power-of-pca" class="section level3">
<h3>The power of PCA</h3>
<p>One of the most powerful methods for exploring your data is a
Principal Component Analysis (PCA). From <a
href="https://comppopgenworkshop2019.readthedocs.io/en/latest/contents/03_pca/pca.html">this
genetics tutorial</a>: “To understand how PCA works, consider a single
individual and its representation by its 593,124 markers. Formally, each
individual is a point in a 593,124-dimensional space, where each
dimension can take only the three possible genotypes indicated above, or
have missing data. To visualize this high-dimensional dataset, we would
like to project it down to two dimensions. But as there are many ways to
project the shadow of a three-dimensional object on a two dimensional
plane, there are many (and even more) ways to project a
593,124-dimensional cloud of points to two dimensions. What PCA does is
figuring out the “best” way to do this project in order to visualise the
major components of variance in the data.” See <a
href="https://michaelaalcorn.medium.com/yet-another-pca-explanation-727dff6ce26a">here</a>
for a linear algebra-based explanation of PCA.</p>
<p>When you have a VCF file with SNPs, use PCA before extensive
filtering or playing with parameters to look at the data. Check which
SNPs are associated with axes showing the most variation.</p>
<p>Here we use <em>SeqArray</em> and <em>SNPRelate</em> to run a PCA in
R.</p>
<!--Other popular methods for PCA include pcangsd(link), ...., but note that ... requires some file conversions everytime you filter.-->
<p><em>Reminder:</em> Missing data is a feature of RAD. Be aware of how
different analysis tools deal with missingness, especially PCA which
will fill in all missing data with some values. Here, the PCA from
<em>SNPRelate</em> imputes missing genotypes as the mean genotype across
samples.</p>
<div id="doing-a-pca-with-seqarray-and-snprelate-in-r."
class="section level5">
<h5>Doing a PCA with SeqArray and SNPRelate in R.</h5>
<pre class="r"><code>#open the gds file previously created, if not already open
#filename.gds = &quot;POP_02_RADseq_files/OL_subset.gds&quot; #replace with your file name 
#gdsin = SeqArray::seqOpen(filename.gds)
# exclude samples previously identified as having too much missing data
bad_samples = c(&quot;OR3_13_C3&quot;,&quot;CA4_12_C1&quot;,&quot;OR3_3_C7&quot;,&quot;CA4_13_C1&quot;,&quot;BC2_6_C2&quot;,&quot;BC2_9_C4&quot;)
sample.ids = seqGetData(gdsin, &quot;sample.id&quot;)
keep = sample.ids[which(!sample.ids %in% bad_samples)]</code></pre>
<p>Whether your data are de novo or reference-based, it is important to
filter out linked sites before performing a PCA. Here’s how to do it in
SeqArray. Note, if you have already filtered your VCF file to have only
1 SNP per RAD locus, you don’t need to do this:</p>
<pre class="r"><code>snpset &lt;- SNPRelate::snpgdsLDpruning(gdsin, ld.threshold=0.2, autosome.only = F, start.pos=&quot;random&quot;, num.thread=1, remove.monosnp = T, sample.id = keep)  
snpset.id &lt;- unlist(unname(snpset))</code></pre>
<p>Now we will actually run the PCA, again removing the samples with
missing data, keeping unlinked SNPs, and removing SNPs with less than 5%
minor allele frequency.</p>
<pre class="r"><code># PCA only on SNPs with a minor allele freq greater than 5%
pca.out = SNPRelate::snpgdsPCA(autosome.only = F, gdsin, num.thread=2, remove.monosnp = T, maf = 0.05,
                               snp.id=snpset.id,
                               sample.id = keep) # filtering for pruned SNPs</code></pre>
<pre><code>## Principal Component Analysis (PCA) on genotypes:
## Calculating allele counts/frequencies ...
## # of selected variants: 22,693
## Excluding 5,350 SNVs (monomorphic: TRUE, MAF: 0.05, missing rate: NaN)
##     # of samples: 12
##     # of SNVs: 22,693
##     using 2 threads
##     # of principal components: 32
## CPU capabilities:
## Sat Mar 18 13:57:03 2023    (internal increment: 40960)
## [..................................................]  0%, ETC: ---        [==================================================] 100%, completed, 0s
## Sat Mar 18 13:57:03 2023    Begin (eigenvalues and eigenvectors)
## Sat Mar 18 13:57:03 2023    Done.</code></pre>
<pre class="r"><code>#close the gds file (saves memory)
#seqClose(gdsin)

eig = pca.out$eigenval[!is.na(pca.out$eigenval)]
barplot(100*eig/sum(eig), main=&quot;PCA Eigenvalues&quot;)</code></pre>
<p><img src="POP_02_RADseq_files/figure-html/unnamed-chunk-12-1.png" alt="Bar plot of the percentage of variance explained by each principal component" width="960" /></p>
<!-- Data Info: Below we use RAD data from an Australian marine species complex. --->
<p>First color/shape the individuals by the factor you expect to matter
(eg, sampling site or region, family, ecotype).</p>
<pre class="r"><code>#PLOT PCA
#PC1 v PC2 colored by collection location
id.order = sapply(keep, function(x,df){which(df$ID == x)}, df=sample.strata) #incase your strata file is not in the same order as your vcf
sample.strata.order = sample.strata[id.order,]
print(
  as.data.frame(pca.out$eigenvect) %&gt;%
      tibble::add_column(., STRATA =  sample.strata.order$STRATA) %&gt;%
      ggplot(., aes(x=V1, y=V2, color = STRATA)) + 
      geom_point(size=2) +
      stat_ellipse(level = 0.95, size = 1) +
      geom_hline(yintercept = 0) +
      geom_vline(xintercept = 0) +
      theme_bw() +
      xlab(paste0(&quot;PC1 [&quot;,paste0(round(eig[1], 2)), &quot;%]&quot;)) +
      ylab(paste0(&quot;PC2 [&quot;,paste0(round(eig[2], 2)), &quot;%]&quot;)) +
      ggtitle(&quot;PCA Colored by Collection Location&quot;)
)</code></pre>
<pre><code>## Warning: Using `size` aesthetic for lines was deprecated in
## ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.</code></pre>
<p><img src="POP_02_RADseq_files/figure-html/unnamed-chunk-14-1.png" alt="PCA plot of PC1 versus PC2 with samples colored by collection location" width="960" /></p>
</div>
<div id="questions-to-ask-about-your-pca"
class="section level4 unlisted unnumbered">
<h4 class="unlisted unnumbered">Questions to ask about your PCA:</h4>
<ul>
<li>Is there any clustering? If so, is it different than your
expectation?
<ul>
<li>If different than expectation, suggests either <a
href="#batch">batch effects</a>, <a href="#cryptic">cryptic
variation</a> among individuals, or systematic biological issues
affecting assembly/mapping</li>
</ul></li>
<li>Are there outlier samples driving a large amount of variation?
<ul>
<li>If so, they may be <a href="#cryptic">cryptic species</a>,
clones/sample replicates, or they <a href="#bad">sequenced/genotyped
poorly</a></li>
<li>In our example we see 2 potential outlier samples in the bottom left
corner. We can check which samples they are by plotting the PCA with
sample ID and/or looking at the actual PC scores.</li>
</ul></li>
</ul>
<pre class="r"><code>#PLOT PCA
#PC1 v PC2 with Sample Labels
print(
  as.data.frame(pca.out$eigenvect) %&gt;%
      tibble::add_column(., ID =  sample.strata.order$ID) %&gt;%
      ggplot(., aes(x=V1, y=V2, label = ID)) +
      geom_text(size =3) +
      geom_hline(yintercept = 0) +
      geom_vline(xintercept = 0) +
      theme_bw() +
      xlab(paste0(&quot;PC1 [&quot;,paste0(round(eig[1], 2)), &quot;%]&quot;)) +
      ylab(paste0(&quot;PC2 [&quot;,paste0(round(eig[2], 2)), &quot;%]&quot;)) +
      ggtitle(&quot;PCA with Labels&quot;)
)</code></pre>
<p><img src="POP_02_RADseq_files/figure-html/unnamed-chunk-15-1.png" alt="PCA plot of PC1 versus PC2 with each point labeled by sample ID" width="960" /></p>
<pre class="r"><code># which samples have PC1 &gt; -0.7 and PC2 &lt; -0.3?
as.data.frame(pca.out$eigenvect) %&gt;%
      tibble::add_column(., ID =  sample.strata.order$ID) %&gt;%
  filter(V1 &gt; 0.7 &amp; V2 &lt; -0.3) %&gt;%
  select(ID,V1,V2)</code></pre>
<ul>
<li>Is there no clustering at all? If so, it may be due to:
<ul>
<li>bioinformatic artifacts leading to noise, <a href="#test">see
Testing Parameters</a></li>
<li>issues with how you are <a href="#filter">filtering the data</a>
(eg, too strict or too lax)</li>
<li>issues with how your PCA is treating missing data</li>
<li>or you actually have no structure in your dataset!</li>
</ul></li>
</ul>
<!--<span style='color: red;'>figure example, no clustering</span>-->
<ul>
<li>If you used sample replicates, first see if replicates cluster very
close together. If not, then there may be batch effects or bioinformatic
artifacts leading to genotyping error that must be addressed. Then
remove replicates for a subsequent PCA evaluation.</li>
</ul>
<!--Here is an example of a PCA with sample replicates, and then with sample replicates removed: -->
<p>When removing a sample replicate, you can choose one at random or
pick the replicate with the least <a href="#bad">missing data</a>.</p>
<pre class="r"><code># example code for removing samples from a SNPRelate PCA 
# assumes you have already loaded a vcf file as a GDS
samples.to.remove = c(&quot;SampleA_rep&quot;,&quot;SampleB_rep&quot;)
sample.ids = seqGetData(gdsin, &quot;sample.id&quot;)
keep = sample.ids[which(!sample.ids %in% c(bad_samples, samples.to.remove))]

# PCA only on SNPs with a minor allele freq greater than 5%, only keeping samples in keep
pca.out.noreps= SNPRelate::snpgdsPCA(autosome.only = F, gdsin, num.thread=1, remove.monosnp = T, maf = 0.05,
                               snp.id=snpset.id,
                               sample.id = keep)</code></pre>
<p>Next, color the individuals by potential sources of batch effects
(sequencing lane, library prep batch, age of tissue sample, person doing
the DNA extraction…).</p>
<pre class="r"><code># again, but colored by batch
print(
  as.data.frame(pca.out$eigenvect) %&gt;%
      tibble::add_column(., PLATE =  as.factor(sample.strata.order$PLATE)) %&gt;%
      ggplot(., aes(x=V1, y=V2, color = PLATE))  + #label = ID
      geom_point(size=2) +
      stat_ellipse(level = 0.95, size = 1) +
      geom_hline(yintercept = 0) +
      geom_vline(xintercept = 0) +
      theme_bw() +
      xlab(paste0(&quot;PC1 [&quot;,paste0(round(eig[1], 2)), &quot;%]&quot;)) +
      ylab(paste0(&quot;PC2 [&quot;,paste0(round(eig[2], 2)), &quot;%]&quot;)) +
      ggtitle(&quot;PCA colored by Batch/Sequencing Plate&quot;)
)</code></pre>
<pre><code>## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse</code></pre>
<pre><code>## Warning: Removed 7 rows containing missing values (`geom_path()`).</code></pre>
<p><img src="POP_02_RADseq_files/figure-html/unnamed-chunk-18-1.png" alt="PCA plot of PC1 versus PC2 with samples colored by sequencing plate" width="960" /></p>
<ul>
<li>Do any observed clusters match a source of batch effect? <a
href="#batch">How to remove batch effects</a></li>
</ul>
<!-- #### Visually inspect alignments {#align}  
Another useful approach is to visually inspect alignments for SNPs or loci that are identified as "weird" in the subsequent analyses. In ipyrad this can be done by searching the .loci file, in dDocent... If the alignment looks wonky, you can feel better about removing the whole locus (?).  
<span style='color: red;'>code example .loci</span>
<span style='color: red;'>code dDocent</span>  
<span style='color: red;'>code stacks</span>    
-->
</div>
</div>
<div id="batch" class="section level3">
<h3>Batch effects</h3>
<p>As discussed in <a href="#labwork">Considerations During Lab
Work</a>, batch effects can arise by minor (or not so minor) differences
during the library prep and sequencing stage. Randomizing samples across
libraries and sequencing lanes can help mitigate the influence of batch
effects on your downstream analyses, but sometimes this isn’t possible,
esp. when using pre-existing data. Even if you randomize your samples,
it is a good idea to check for batch effects with a PCA and then try to
mitigate them.</p>
<p>Nicolas Lou and Nina Therkildsen have a <a
href="https://onlinelibrary.wiley.com/doi/abs/10.1111/1755-0998.13559">great
article</a> and <a
href="https://github.com/therkildsen-lab/batch-effect">Github repo</a>
discussing causes, detection, and mitigation of batch effects in low
coverage WGS, where batch effects can have particularly large effects
<span class="citation">(Lou and Therkildsen 2021)</span>. While some
of their recommendations are specific to reference-based analyses,
others are relevant to RAD methods too (eg, batch effects due to
different sequencing platforms, DNA degradation levels, and sequencing
depth).</p>
<!--<span style='color: red;'>PCA figure of batch effect</span>-->
<p>You can identify which loci are driving the batch effect by 1)
identifying the top SNPs contributing to PC variation (termed
“loadings”) and/or 2) doing an Fst outlier analysis with the batches
specified as populations (eg, Bayescan). Both of these methods are most
appropriate if you’ve randomized your samples between batches. If you
have not randomized samples but are certain there is a batch effect
driving variation in your dataset, you can still remove loci this way
but you may also be removing biologically informative loci.</p>
<p>In SeqArray/SNPRelate, if you have batch effects that are
identifiable by PCA, you would extract the PC loadings for each SNP at
the PC which represents the batch effect, plot a histogram to see the
distribution, choose a cutoff, then filter those SNPs out and create a
new VCF for downstream analyses. For this example, let’s pretend that we
see a batch effect on PC1.</p>
<pre class="r"><code># assume you have a GDS object called gdsin and a PCA object called pca.out
# extract PCA loadings
SnpLoad &lt;- snpgdsPCASNPLoading(pca.out, gdsin)</code></pre>
<pre><code>## SNP Loading:
##     # of samples: 12
##     # of SNPs: 22,693
##     using 1 thread
##     using the top 12 eigenvectors
## Sat Mar 18 13:57:04 2023    (internal increment: 65536)
## [..................................................]  0%, ETC: ---        [==================================================] 100%, completed, 0s
## Sat Mar 18 13:57:04 2023    Done.</code></pre>
<pre class="r"><code>names(SnpLoad)</code></pre>
<pre><code>## [1] &quot;sample.id&quot;  &quot;snp.id&quot;     &quot;eigenval&quot;   &quot;snploading&quot; &quot;TraceXTX&quot;  
## [6] &quot;Bayesian&quot;   &quot;avgfreq&quot;    &quot;scale&quot;</code></pre>
<pre class="r"><code>dim(SnpLoad$snploading)</code></pre>
<pre><code>## [1]    12 22693</code></pre>
<pre class="r"><code>#plot a histogram of absolute PC loading values for PC1, with larger loading meaning more effect on PC1
pc&lt;-1
hist(sort(abs(SnpLoad$snploading[pc,]),decreasing=T,index.return=T)[[1]],breaks = 30,main=&quot;PC loadings: PC1&quot;,)</code></pre>
<p><img src="POP_02_RADseq_files/figure-html/unnamed-chunk-19-1.png" alt="Histogram of absolute SNP loadings on PC1" width="960" />
Let’s remove the loci with the largest effects (absolute loading of 0.010
or greater) and see if it changes our PCA.</p>
<pre class="r"><code>#extract SNP.IDs for loadings on PC 1 &gt; abs(0.010) 
batch.snp.ids &lt;- SnpLoad$snp.id[which(abs(SnpLoad$snploading[1,]) &gt;= 0.010)]

#rerun PCA
snp.id.allfilt &lt;- setdiff(snpset.id,batch.snp.ids)
pca.out.pcfilt = SNPRelate::snpgdsPCA(autosome.only = F, gdsin, num.thread=2, remove.monosnp = T, maf = 0.05,
                               snp.id=snp.id.allfilt,
                               sample.id = keep) # filtering for pruned SNPs</code></pre>
<pre><code>## Principal Component Analysis (PCA) on genotypes:
## Calculating allele counts/frequencies ...
## # of selected variants: 19,885
## Excluding 5,350 SNVs (monomorphic: TRUE, MAF: 0.05, missing rate: NaN)
##     # of samples: 12
##     # of SNVs: 19,885
##     using 2 threads
##     # of principal components: 32
## CPU capabilities:
## Sat Mar 18 13:57:04 2023    (internal increment: 40960)
## [..................................................]  0%, ETC: ---        [==================================================] 100%, completed, 0s
## Sat Mar 18 13:57:04 2023    Begin (eigenvalues and eigenvectors)
## Sat Mar 18 13:57:04 2023    Done.</code></pre>
<pre class="r"><code>print(
  as.data.frame(pca.out.pcfilt$eigenvect) %&gt;%
      tibble::add_column(., STRATA =  sample.strata.order$STRATA) %&gt;%
      ggplot(., aes(x=V1, y=V2, color = STRATA)) + 
      geom_point(size=2) +
      stat_ellipse(level = 0.95, size = 1) +
      geom_hline(yintercept = 0) +
      geom_vline(xintercept = 0) +
      theme_bw() +
      xlab(paste0(&quot;PC1 [&quot;,paste0(round(eig[1], 2)), &quot;%]&quot;)) +
      ylab(paste0(&quot;PC2 [&quot;,paste0(round(eig[2], 2)), &quot;%]&quot;)) +
      ggtitle(&quot;PCA Colored by Collection Location&quot;)
)</code></pre>
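<p><img src="POP_02_RADseq_files/figure-html/unnamed-chunk-20-1.png" alt="PCA plot of PC1 versus PC2 after removing SNPs with high PC1 loadings, colored by collection location" width="960" /></p>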
<p>Our PCA is now clustering better by population, so we will now export
a new VCF that includes all the filtering we’ve done so far: 1) removed
individuals with missing data, 2) thinned our SNPs for linkage, and 3)
removed SNPs driving a batch effect.</p>
<p>To save your filtered GDS object as a VCF in SeqArray:</p>
<pre class="r"><code># set a filter to exclude samples with missing data, and SNPs in linkage or with a batch effect
seqSetFilter(gdsin, sample.id=keep,variant.id=snp.id.allfilt)</code></pre>
<pre><code>## # of selected samples: 12
## # of selected variants: 25,235</code></pre>
<pre class="r"><code># convert to vcf
seqGDS2VCF(gdsin, &quot;POP_02_RADseq_files/OL-miss-pc1-linkage-filt.vcf.gz&quot;)</code></pre>
<pre><code>## Loading required namespace: Rsamtools</code></pre>
<pre><code>## Hint: install Rsamtools to enable the BGZF-format output.</code></pre>
<pre><code>## Sat Mar 18 13:57:04 2023
## VCF Export: OL-miss-pc1-linkage-filt.vcf.gz
##     12 samples, 25,235 variants
##     INFO Field: NS, DP
##     FORMAT Field: DP, CATG
##     output to a general gzip file
## [..................................................]  0%, ETC: ---    [==================================================] 100%, completed, 1s
## Sat Mar 18 13:57:05 2023    Done.</code></pre>
<pre class="r"><code>#close the gds file to save memory
closefn.gds(gdsin)</code></pre>
</div>
<div id="cryptic" class="section level3">
<h3>Cryptic species/contamination/clones</h3>
<p>One issue that is particularly common, especially for marine
invertebrates, is the presence of cryptic yet highly diverged lineages occurring in
the same geographic location. If you are trying to do a
landscape/seascape genetics study, cryptic species can dramatically
throw off population genetic analyses. A PCA using all the samples can
help identify cryptic species if samples are forming clusters that do
not align with any other possible factor (eg, geography, sequencing
batch).</p>
<!--Note: Notice the huge variation explained along the first PC-axis.-->
<p>You can also check heterozygosity. Large differences in
heterozygosity between outliers can suggest different species or cryptic
hybrids. Two outliers are obvious in the previous PCA. Large differences
in heterozygosity between samples are clear, with one location showing
two distinct clusters of samples. These samples were later determined
using mtDNA and morphological analysis to be cryptic species.
<em>Unpublished data from a marine species in Australian
waters.</em></p>
<!--Note: -->
<p>You can then estimate genetic divergence between these clusters using
pairwise Fst and/or phylogenetic inference to determine how diverged the
groups are. If they are very diverged, it may be better to assemble each
cryptic species separately, particularly for <em>de novo</em> assemblies with
dDocent and Stacks, since restriction cut-sites
are often not conserved; see <span class="citation">(Wagner et al.
2013; Rubin, Ree, and Moreau 2012)</span>. These two pipelines use a
set of given samples to create a “reference” or “catalog” to which all
other samples are mapped. If highly diverged samples are included in
creating this reference, it can lead to <a
href="#cluster">oversplitting</a>.</p>
<p>Cryptic clones can also dramatically throw off population genetic
analyses. In a PCA, clones will often appear as their own very tight
cluster, similar to sample replicates. A calculation of sample pairwise
distance (e.g., Manhattan distance) can verify duplicates/clones.</p>
<!--Below the PCA shows x and a simple calculate of pairwise-distance (i.e. here manhattan distance) shows x known duplicates, and x unknown clones. 
<span style='color: red;'>PCA figure of cryptic clones</span>-->
</div>
</div>
<div id="test" class="section level2">
<h2>Test a range of key parameters</h2>
<p>There are many parameter choices to make when assembling and
genotyping, and those parameters vary slightly across pipelines. The
parameters below are general across RADseq pipelines (although with
different names), and are shown to have the largest impact on genotyping
error rates.</p>
<div id="cluster" class="section level3">
<h3>Clustering threshold</h3>
<div id="de-novo-assembly-only" class="section level4">
<h4>(de novo assembly only)</h4>
<p>During <em>de novo</em> RADseq assembly, reads are assembled into
contigs (contiguous sequences), so that each contig represents a single
locus. This assembly can occur separately for each individual (ipyrad)
or jointly for multiple individuals (dDocent, Stacks). To call SNPs, in
the case of ipyrad, contigs are then clustered and aligned across
individuals, while in dDocent and Stacks, reads for each individual are
mapped to the “reference” contigs. All of these pipelines use sequence
similarity (or <strong>clustering threshold</strong>) to determine if
fragments are homologous (from the same place in the genome) and
therefore should be clustered together.</p>
<p>If the clustering threshold is too low, <em>undersplitting</em> can
occur so that multiple loci are combined into a single contig.
Undersplitting is common with repetitive regions and paralogs. It can
inflate mean observed heterozygosity. If the threshold is too high, it
can lead to <em>oversplitting</em>, where alleles of the same locus are
split into two or more contigs. This can lead to reduced mean observed
heterozygosity.</p>
<p>If severely off, both of these issues can bias the models used to
identify SNPs. Undersplitting a little is better than oversplitting, as
it is easier to identify lumped paralogs downstream and remove them.</p>
</div>
<div id="what-influences-the-optimal-clustering-threshold-of-your-data"
class="section level4">
<h4><strong>What influences the “optimal” clustering threshold of your
data?</strong></h4>
<p>Clustering threshold is usually determined by how much genetic
diversity you have in your dataset. If your dataset has high genetic
diversity (eg., a single species with high heterozygosity, multiple
highly diverged populations, multiple species), then you will need a
lower clustering threshold. If your dataset has low genetic diversity (a
single species with lower heterozygosity, a single family), then you
will need a higher clustering threshold. Polyploid or highly repetitive
(i.e., large) genomes may also have different requirements than diploid
species with smaller genomes. Other factors, such as sequencing error
rate, may play a role as well. <strong>We recommend always testing a few
clustering thresholds with a new dataset and/or new species</strong>.
However, as long as your clustering threshold is in the ballpark it
shouldn’t affect downstream analyses (according to <a
href="https://isaacovercast.github.io/">Isaac Overcast</a>).</p>
<p>There are a few strategies to identify a good clustering threshold
for your data, all of which require assembling your data with a range of
clustering thresholds.<br />
1) Metrics developed by <span class="citation">(McCartney-Melstad,
Gidiş, and Shaffer 2019)</span>, based on popgen theory and landscape
genetic expectations<br />
2) Utilize sample replicates (either exact replicates or individuals
from the same locality) to quantify genotyping error and minimize error
while maximizing the number of informative loci <span
class="citation">(Mastretta-Yanes et al. 2015)</span><br />
3) Evaluate the
number of haplotypes across loci, with many 3+ haplotypes indicating
undersplitting <span class="citation">(Harvey et al. 2015; Ilut, Nydam,
and Hare 2014)</span><br />
4) Make assemblies at a range of thresholds and
parameters, plot the # of SNPs or contigs per assembly, and make a
judgement call (<a href="https://www.ddocent.com/assembly/">example in
dDocent tutorial</a>). If most parameters give you about the same # of
SNPs or contigs, you can feel confident that this parameter doesn’t
matter much for your data and just choose a reasonable value.</p>
</div>
</div>
<div id="mapping-parameters" class="section level3">
<h3>Mapping parameters</h3>
<div id="reference-based-and-some-de-novo-assemblers"
class="section level4 unlisted unnumbered">
<h4 class="unlisted unnumbered">(reference-based and some de novo
assemblers)</h4>
<p>When a sequencing read maps to the wrong assembled contig or place in
a reference genome (ie, a non-homologous locus), this can result in
erroneous SNPs. In many cases, these SNPs can be filtered out later
(see <a href="#filter">Filtering SNPs</a>). However, data should be checked to make sure there
aren’t systematic, wide-scale mapping issues that need to be
addressed.</p>
<p>For reference-based assembly, both dDocent and ipyrad make use of
<a href="https://github.com/lh3/bwa">bwa</a> to map reads, while Stacks allows the user to choose their
aligner. dDocent also utilizes bwa for de novo assemblies.</p>
<p>Mapping defaults for popular RADseq pipelines:<br />
<code>L</code>: clipping penalty for 5’ and 3’<br />
<code>A</code>: matching score<br />
<code>B</code>: mismatch penalty<br />
<code>O</code>: gap open penalty<br />
<code>T</code>: minimum alignment score for an alignment to be output</p>
<p>dDocent:
<code>bwa mem -L 20,5 -t 8 -a -M -T 10 -A1 -B 3 -O 5 -R</code></p>
<ul>
<li>can change -A and -O interactively during the dDocent pipeline.</li>
</ul>
<p>ipyrad: (defaults) <code>-L 5,5 -B 4 -A1 -O 6 -T 30</code></p>
<ul>
<li>requires changing the ipyrad source code to change parameters</li>
</ul>
<p>Our <a href="https://marineomics.github.io/WGS_intro.html">whole
genome resequencing tutorials</a> have some good content on mapping.
General considerations with RAD data:</p>
<ul>
<li>How many reads aligned for each sample? If many reads didn’t
align, this could be an issue with the reference, read
trimming/filtering, or the mapping parameters.</li>
<li>What is the distribution of mapping scores?<br />
</li>
<li>ratio of mean (across individuals) mapping quality scores for
reference and alternate allele <span class="citation">(O’Leary et al.
2018)</span>. 0.25-1.75 cut off implemented in dDocent.
<ul>
<li>this info might not be coded in all VCFs</li>
</ul></li>
</ul>
</div>
</div>
<div id="seq" class="section level3">
<h3>Read depth threshold</h3>
<p>Mistakes that occur during sequencing can lead to a base-calling
error in a read, which can then lead to an erroneous SNP. This type of
error most commonly manifests as rare alleles (eg, only observed in one
individual) or singletons (only one copy of the minor allele across all
individuals). A conservative approach to deal with these errors is to
filter out all loci with rare alleles (see <a href="#filter">Filtering SNPs</a>), but for some
demographic analyses this filtering is inappropriate as they require the
site frequency spectrum.<br />
The best ways to minimize the impact of sequencing error during
bioinformatic processing are to 1) properly trim and filter your data,
and 2) require higher read depth thresholds for assembly and
genotyping.</p>
<div id="filtering-reads" class="section level5">
<h5>1) Filtering reads</h5>
<p>FASTQ files that typically come from the sequencing center include a
<a
href="https://gatk.broadinstitute.org/hc/en-us/articles/360035531872-Phred-scaled-quality-scores">Phred
scale quality score</a> for each base, which indicates the probability
of that base call being correct. This information can be used prior to
assembly to trim low-quality sections from either end of reads or remove
reads altogether. Some SNP callers, including FreeBayes which is part of
dDocent, also use Phred scores when calling SNPs (ipyrad and Stacks do
not). All three of the most popular RAD pipelines include steps to
filter reads due to low quality bases or adapter
sequences, but vary slightly on what kinds of read trimming are allowed.
Always make sure to filter out adapter sequences (this is not default
with stacks), otherwise default parameters are usually adequate. If your
initial <a href="#fastqc">fastqc</a> indicates that you have many low
quality bases in your reads, you may lose too much of your data during
filtering. In this case, you may want to make your read filtering a
little more permissive (keeping in mind that this may increase SNP
error rate).<br />
<strong>Always make sure to check with your sequencing center to see if
they have done anything to the raw reads before you received
them!</strong> <!---turn these into html tabs---></p>
<p><strong>dDocent: </strong></p>
<ul>
<li>does not allow you to use reads that have already been trimmed and
filtered for de novo assembly, only for reference-based assembly.</li>
<li>Uses Trimmomatic to trim Illumina TruSeq adapters and low quality
bases. Make sure to change the defaults if you used a different
adapter/sequencing system.</li>
</ul>
<p><strong>Stacks: </strong></p>
<ul>
<li>requires your reads to be all the same length, so if they were
previously trimmed of low quality bases or adapter sequence this must be
specified and they will be truncated to the same length.</li>
</ul>
</div>
<div
id="require-higher-read-depth-thresholds-for-assembly-and-genotyping."
class="section level5">
<h5>2) Require higher read depth thresholds for assembly and
genotyping.</h5>
<p>For RADseq, ideally you are generating an average of <em>at
least</em> 10-20 sequence reads (10x-20x coverage) for each RAD locus in
your library preparation (but this doesn’t always happen…). For diploid
species, this would be sequencing each RAD haplotype at least 5x-10x.
The SNP calling step of RAD pipelines all require setting a minimum
number of reads to determine whether a site is homozygous or
heterozygous. If this threshold is too low, sequencing errors are more
likely to be included and the SNP caller will have a harder time
accurately identifying SNPs (<a
href="https://github.com/therkildsen-lab/genomic-data-analysis/blob/master/lcwgs_data_analysis.md">in
which case, you’d be better off with genotype likelihood methods</a>). If
it’s too high, then real polymorphic loci will be excluded. The <em>too
high</em> threshold for your dataset can usually best be determined by
using a few different read depth thresholds and comparing how the
results of downstream analyses change.</p>
</div>
</div>
</div>
<div id="filter" class="section level2">
<h2>Filtering SNPs</h2>
<div id="and-embrace-the-missingness"
class="section level3 unlisted unnumbered">
<h3 class="unlisted unnumbered">(and embrace the missingness!)</h3>
<p><span style="color: red;">Tutorials for this section are still under
development! Please contribute on Github or provide feedback in <a
href="https://github.com/MarineOmics/marineomics.github.io/discussions">Discussions</a></span></p>
<p>Even with a reduced representation sequencing approach, your assembly
and genotyping pipeline will generate data for thousands of SNPs across
your individuals. These SNPs will always need to be filtered before
doing downstream analyses.</p>
<div id="main-reasons-to-filter-your-data" class="section level4">
<h4>Main reasons to filter your data:</h4>
<ul>
<li>Remove potentially erroneous SNPs that remain after optimizing your
assembly pipeline</li>
<li>Remove SNPs that are uninformative for downstream analyses</li>
<li>Subsample to remove SNPs that are in linkage disequilibrium
(required for analyses that assume independence like PCA,
Structure)</li>
</ul>
<p>Unique to RADseq approaches is the missingness of RAD data.</p>
<ul>
<li>Don’t over filter for missingness
<ul>
<li>Papers showing how too much filtering can skew results:
<ul>
<li>Empirical: <span class="citation">(Eaton et al. 2017; Tripp et al.
2017; Martı́n-Hernanz et al. 2019; Silliman et al. 2021; Dı́az-Arce and
Rodrı́guez-Ezpeleta 2019)</span><br />
</li>
<li>Simulation investigating effects and sources of missing data: <span
class="citation">(Huang and Knowles 2016; Leaché et al. 2015; Gautier et
al. 2013; Rivera-Colón, Rochette, and Catchen 2021)</span></li>
</ul></li>
</ul></li>
<li>Some ways to account for missing data in downstream analyses:
<ul>
<li>impute missing data (PCA), subsample (PCA, structure, pretty much
anything with SNPs), try a few different missing filters and see what
impact it has on results</li>
</ul></li>
</ul>
<p>Jon Puritz has a <a href="http://www.ddocent.com/filtering/">great
tutorial</a> on filtering SNPs from RADseq data assembled with dDocent.
While it requires certain VCF fields to be present that aren’t generated
by ipyrad or Stacks, it contains great concepts on how to filter.</p>
<!-- use [pcANGSD](link)-->
<!--## Special Cases  
### DARTseq  
### 2bRAD

### Reference-based vs. de novo RADseq
-->
</div>
</div>
</div>
<div id="repeat" class="section level2">
<h2>Repeat as needed</h2>
</div>
</div>
<div id="helpful-tutorials" class="section level1">
<h1>Helpful tutorials</h1>
<ul>
<li><a href="https://baylab.github.io/MarineGenomics/">Marine Genomics
open course</a></li>
<li><a href="http://evomics.org/learning/unix-tutorial/">UNIX
tutorial</a></li>
<li><a
href="http://evomics.org/learning/population-and-speciation-genomics/2020-population-and-speciation-genomics/first-steps-in-genomic-data-analysis/">what
is a VCF file and how can I filter it?</a></li>
</ul>
</div>
<div id="references" class="section level1 unnumbered">
<h1 class="unnumbered">References</h1>
<div id="refs" class="references csl-bib-body hanging-indent"
entry-spacing="0">
<div id="ref-Andrews2016-ay" class="csl-entry">
Andrews, Kimberly R, Jeffrey M Good, Michael R Miller, Gordon Luikart,
and Paul A Hohenlohe. 2016. <span>“Harnessing the Power of
<span>RADseq</span> for Ecological and Evolutionary Genomics.”</span>
<em>Nat. Rev. Genet.</em> 17 (2): 81–92. <a
href="https://doi.org/10.1038/nrg.2015.28">https://doi.org/10.1038/nrg.2015.28</a>.
</div>
<div id="ref-Diaz-Arce2019-nh" class="csl-entry">
Dı́az-Arce, Natalia, and Naiara Rodrı́guez-Ezpeleta. 2019.
<span>“Selecting <span>RAD-Seq</span> Data Analysis Parameters for
Population Genetics: The More the Better?”</span> <em>Front. Genet.</em>
10 (May): 533. <a
href="https://doi.org/10.3389/fgene.2019.00533">https://doi.org/10.3389/fgene.2019.00533</a>.
</div>
<div id="ref-Eaton2017-gx" class="csl-entry">
Eaton, Deren A R, Elizabeth L Spriggs, Brian Park, and Michael J
Donoghue. 2017. <span>“Misconceptions on Missing Data in <span
class="nocase">RAD-seq</span> Phylogenetics with a Deep-Scale Example
from Flowering Plants.”</span> <em>Syst. Biol.</em> 66 (3): 399–412. <a
href="https://doi.org/10.1093/sysbio/syw092">https://doi.org/10.1093/sysbio/syw092</a>.
</div>
<div id="ref-Elshire2011-or" class="csl-entry">
Elshire, Robert J, Jeffrey C Glaubitz, Qi Sun, Jesse A Poland, Ken
Kawamoto, Edward S Buckler, and Sharon E Mitchell. 2011. <span>“A
Robust, Simple Genotyping-by-Sequencing (<span>GBS</span>) Approach for
High Diversity Species.”</span> <em>PLoS One</em> 6 (5): e19379. <a
href="https://doi.org/10.1371/journal.pone.0019379">https://doi.org/10.1371/journal.pone.0019379</a>.
</div>
<div id="ref-Gautier2013-vj" class="csl-entry">
Gautier, Mathieu, Karim Gharbi, Timothee Cezard, Julien Foucaud, Carole
Kerdelhué, Pierre Pudlo, Jean-Marie Cornuet, and Arnaud Estoup. 2013.
<span>“The Effect of <span>RAD</span> Allele Dropout on the Estimation
of Genetic Variation Within and Between Populations.”</span> <em>Mol.
Ecol.</em> 22 (11): 3165–78. <a
href="https://doi.org/10.1111/mec.12089">https://doi.org/10.1111/mec.12089</a>.
</div>
<div id="ref-Harvey2015-vh" class="csl-entry">
Harvey, Michael G, Caroline Duffie Judy, Glenn F Seeholzer, James M
Maley, Gary R Graves, and Robb T Brumfield. 2015. <span>“Similarity
Thresholds Used in <span>DNA</span> Sequence Assembly from Short Reads
Can Reduce the Comparability of Population Histories Across
Species.”</span> <em>PeerJ</em> 3 (April): e895. <a
href="https://doi.org/10.7717/peerj.895">https://doi.org/10.7717/peerj.895</a>.
</div>
<div id="ref-Huang2016-na" class="csl-entry">
Huang, Huateng, and L Lacey Knowles. 2016. <span>“Unforeseen
Consequences of Excluding Missing Data from Next-Generation Sequences:
Simulation Study of <span>RAD</span> Sequences.”</span> <em>Syst.
Biol.</em> 65 (3): 357–65. <a
href="https://doi.org/10.1093/sysbio/syu046">https://doi.org/10.1093/sysbio/syu046</a>.
</div>
<div id="ref-Ilut2014-ja" class="csl-entry">
Ilut, Daniel C, Marie L Nydam, and Matthew P Hare. 2014. <span>“Defining
Loci in Restriction-Based Reduced Representation Genomic Data from
Nonmodel Species: Sources of Bias and Diagnostics for Optimal
Clustering.”</span> <em>Biomed Res. Int.</em> 2014 (June): 675158. <a
href="https://doi.org/10.1155/2014/675158">https://doi.org/10.1155/2014/675158</a>.
</div>
<div id="ref-LaCava2020-bg" class="csl-entry">
LaCava, Melanie E F, Ellen O Aikens, Libby C Megna, Gregg Randolph,
Charley Hubbard, and C Alex Buerkle. 2020. <span>“Accuracy of de Novo
Assembly of <span>DNA</span> Sequences from Double-Digest Libraries
Varies Substantially Among Software.”</span> <em>Mol. Ecol. Resour.</em>
20 (2): 360–70. <a
href="https://doi.org/10.1111/1755-0998.13108">https://doi.org/10.1111/1755-0998.13108</a>.
</div>
<div id="ref-Leache2015-eg" class="csl-entry">
Leaché, Adam D, Andreas S Chavez, Leonard N Jones, Jared A Grummer,
Andrew D Gottscho, and Charles W Linkem. 2015. <span>“Phylogenomics of
Phrynosomatid Lizards: Conflicting Signals from Sequence Capture Versus
Restriction Site Associated <span>DNA</span> Sequencing.”</span>
<em>Genome Biol. Evol.</em> 7 (3): 706–19. <a
href="https://doi.org/10.1093/gbe/evv026">https://doi.org/10.1093/gbe/evv026</a>.
</div>
<div id="ref-Lou2021-ew" class="csl-entry">
Lou, Runyang Nicolas, and Nina Overgaard Therkildsen. 2021. <span>“Batch
Effects in Population Genomic Studies with Low-Coverage Whole Genome
Sequencing Data: Causes, Detection, and Mitigation.”</span> <em>Authorea
Preprints</em>, August. <a
href="https://doi.org/10.22541/au.162791857.78788821/v2">https://doi.org/10.22541/au.162791857.78788821/v2</a>.
</div>
<div id="ref-Martin-Hernanz2019-bu" class="csl-entry">
Martı́n-Hernanz, Sara, Abelardo Aparicio, Mario Fernández-Mazuecos,
Encarnación Rubio, J Alfredo Reyes-Betancort, Arnoldo Santos-Guerra,
Marı́a Olangua-Corral, and Rafael G Albaladejo. 2019. <span>“Maximize
Resolution or Minimize Error? Using
<span>Genotyping-By-Sequencing</span> to Investigate the Recent
Diversification of Helianthemum (Cistaceae).”</span> <em>Front. Plant
Sci.</em> 10 (November): 1416. <a
href="https://doi.org/10.3389/fpls.2019.01416">https://doi.org/10.3389/fpls.2019.01416</a>.
</div>
<div id="ref-Mastretta-Yanes2015-jb" class="csl-entry">
Mastretta-Yanes, A, N Arrigo, N Alvarez, T H Jorgensen, D Piñero, and B
C Emerson. 2015. <span>“Restriction Site-Associated <span>DNA</span>
Sequencing, Genotyping Error Estimation and de Novo Assembly
Optimization for Population Genetic Inference.”</span> <em>Mol. Ecol.
Resour.</em> 15 (1): 28–41. <a
href="https://doi.org/10.1111/1755-0998.12291">https://doi.org/10.1111/1755-0998.12291</a>.
</div>
<div id="ref-McCartney-Melstad2019-ac" class="csl-entry">
McCartney-Melstad, Evan, Müge Gidiş, and H Bradley Shaffer. 2019.
<span>“An Empirical Pipeline for Choosing the Optimal Clustering
Threshold in <span>RADseq</span> Studies.”</span> <em>Mol. Ecol.
Resour.</em> 19 (5): 1195–1204. <a
href="https://doi.org/10.1111/1755-0998.13029">https://doi.org/10.1111/1755-0998.13029</a>.
</div>
<div id="ref-OLeary2018-iy" class="csl-entry">
O’Leary, Shannon J, Jonathan B Puritz, Stuart C Willis, Christopher M
Hollenbeck, and David S Portnoy. 2018. <span>“These Aren’t the Loci
You’re Looking for: Principles of Effective <span>SNP</span> Filtering
for Molecular Ecologists.”</span> <em>Mol. Ecol.</em>, July. <a
href="https://doi.org/10.1111/mec.14792">https://doi.org/10.1111/mec.14792</a>.
</div>
<div id="ref-Peterson2012-tr" class="csl-entry">
Peterson, Brant K, Jesse N Weber, Emily H Kay, Heidi S Fisher, and Hopi
E Hoekstra. 2012. <span>“Double Digest <span>RADseq</span>: An
Inexpensive Method for de Novo <span>SNP</span> Discovery and Genotyping
in Model and Non-Model Species.”</span> <em>PLoS One</em> 7 (5): e37135.
<a
href="https://doi.org/10.1371/journal.pone.0037135">https://doi.org/10.1371/journal.pone.0037135</a>.
</div>
<div id="ref-Rivera-Colon2021-en" class="csl-entry">
Rivera-Colón, Angel G, Nicolas C Rochette, and Julian M Catchen. 2021.
<span>“Simulation with <span>RADinitio</span> Improves
<span>RADseq</span> Experimental Design and Sheds Light on Sources of
Missing Data.”</span> <em>Mol. Ecol. Resour.</em> 21 (2): 363–78. <a
href="https://doi.org/10.1111/1755-0998.13163">https://doi.org/10.1111/1755-0998.13163</a>.
</div>
<div id="ref-Rubin2012-hw" class="csl-entry">
Rubin, Benjamin E R, Richard H Ree, and Corrie S Moreau. 2012.
<span>“Inferring Phylogenies from <span>RAD</span> Sequence
Data.”</span> <em>PLoS One</em> 7 (4): e33394. <a
href="https://doi.org/10.1371/journal.pone.0033394">https://doi.org/10.1371/journal.pone.0033394</a>.
</div>
<div id="ref-Silliman2021-oo" class="csl-entry">
Silliman, Katherine, Jane L Indorf, Nancy Knowlton, William E Browne,
and Carla Hurt. 2021. <span>“Base-Substitution Mutation Rate Across the
Nuclear Genome of Alpheus Snapping Shrimp and the Timing of Isolation by
the Isthmus of Panama.”</span> <em>BMC Ecol Evol</em> 21 (1): 104. <a
href="https://doi.org/10.1186/s12862-021-01836-3">https://doi.org/10.1186/s12862-021-01836-3</a>.
</div>
<div id="ref-Tripp2017-cp" class="csl-entry">
Tripp, Erin A, Yi-Hsin Erica Tsai, Yongbin Zhuang, and Kyle G Dexter.
2017. <span>“<span>RADseq</span> Dataset with 90% Missing Data Fully
Resolves Recent Radiation of Petalidium (Acanthaceae) in the Ultra-Arid
Deserts of Namibia.”</span> <em>Ecol. Evol.</em> 7 (19): 7920–36. <a
href="https://doi.org/10.1002/ece3.3274">https://doi.org/10.1002/ece3.3274</a>.
</div>
<div id="ref-Wagner2013-ff" class="csl-entry">
Wagner, Catherine E, Irene Keller, Samuel Wittwer, Oliver M Selz, Salome
Mwaiko, Lucie Greuter, Arjun Sivasundar, and Ole Seehausen. 2013.
<span>“Genome-Wide <span>RAD</span> Sequence Data Provide Unprecedented
Resolution of Species Boundaries and Relationships in the Lake Victoria
Cichlid Adaptive Radiation.”</span> <em>Mol. Ecol.</em> 22 (3): 787–98.
<a
href="https://doi.org/10.1111/mec.12023">https://doi.org/10.1111/mec.12023</a>.
</div>
<div id="ref-Zheng2017-pl" class="csl-entry">
Zheng, Xiuwen, Stephanie M Gogarten, Michael Lawrence, Adrienne Stilp,
Matthew P Conomos, Bruce S Weir, Cathy Laurie, and David Levine. 2017.
<span>“<span class="nocase">SeqArray-a</span> Storage-Efficient
High-Performance Data Format for <span>WGS</span> Variant Calls.”</span>
<em>Bioinformatics</em> 33 (15): 2251–57. <a
href="https://doi.org/10.1093/bioinformatics/btx145">https://doi.org/10.1093/bioinformatics/btx145</a>.
</div>
</div>
</div>


</div>
</div>

</div>

<script>

// Give pandoc-generated tables the Bootstrap "table" and "table-condensed"
// classes. The tables are located by walking up from their `tr.odd` rows
// through the enclosing tbody.
function bootstrapStylePandocTables() {
  var $oddRows = $('tr.odd');
  var $pandocTables = $oddRows.parent('tbody').parent('table');
  $pandocTables.addClass('table table-condensed');
}

// Apply the table styling once the DOM is ready.
$(document).ready(bootstrapStylePandocTables);


</script>

<!-- tabsets -->

<script>
// Once the DOM is ready, build the tabsets via window.buildTabsets,
// passing it the "TOC" identifier.
$(document).ready(function () {
  window.buildTabsets("TOC");
});

// Once the DOM is ready, make each item of a dropdown-style tabset toggle
// the 'nav-tabs-open' class on its parent tab list when clicked.
$(document).ready(function () {
  var $dropdownItems = $('.tabset-dropdown > .nav-tabs > li');
  $dropdownItems.click(function () {
    var $tabList = $(this).parent();
    $tabList.toggleClass('nav-tabs-open');
  });
});
</script>

<!-- code folding -->

<script>
$(document).ready(function ()  {

    // elements marked both "unlisted" and "unnumbered" are tagged with
    // toc-ignore, for consistency with Pandoc
    $('.unlisted.unnumbered').addClass('toc-ignore');

    // a toc-ignore class sitting on a section div is moved down onto that
    // section's direct h1-h5 header children
    var $ignoredSections = $('div.section.toc-ignore');
    $ignoredSections.removeClass('toc-ignore');
    $ignoredSections.children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // turn header text into a hash: drop the characters . \ / ? & ! # < >
    // and replace every whitespace character with an underscore
    function headerTextToHash(text) {
      var stripped = text.replace(/[.\\/?&!#<>]/g, '');
      return stripped.replace(/\s/g, '_');
    }

    // tocify settings
    var tocSettings = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      hashGenerator: headerTextToHash,
      ignoreSelector: ".toc-ignore",
      scrollTo: 0,
      showAndHide: false,
      smoothScroll: true
    };

    // build the table of contents in #TOC; the widget instance is looked up
    // but not used any further inside this handler
    var tocWidget = $("#TOC").tocify(tocSettings).data("toc-tocify");
});
</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  // Load MathJax at runtime by appending a script element to the document head.
  (function () {
    var mathjaxLoader = document.createElement("script");
    var pageHead = document.getElementsByTagName("head")[0];
    mathjaxLoader.type = "text/javascript";
    mathjaxLoader.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    pageHead.appendChild(mathjaxLoader);
  })();
</script>

</body>
</html>