title = "Github resources library"
avaliable_cfg = ["github.toml"]
prefix_url = ""
cfg_dir = "@>@system.file('extdata', 'config/github', package = 'BioInstaller')@<@"
title = "Burrow-Wheeler Aligner for pairwise alignment between DNA sequences"
description = "BWA is a software package for mapping DNA sequences against a large reference genome, such as the human genome. It consists of three algorithms: BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina sequence reads up to 100bp, while the rest two for longer sequences ranged from 70bp to a few megabases. BWA-MEM and BWA-SW share similar features such as the support of long reads and chimeric alignment, but BWA-MEM, which is the latest, is generally recommended as it is faster and more accurate. BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina reads."
publication = [
"Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168]",
"Li H. and Durbin R. (2010) Fast and accurate long-read alignment with Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505]",
"Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN]"]
title = "a project-oriented pipeline for processing of RNA-seq data in high performance cluster environments"
description = "The wide range of RNA-seq applications and their high-computational needs require the development of pipelines orchestrating the entire workflow and optimizing usage of available computational resources. We present aRNApipe, a project-oriented pipeline for processing of RNA-seq data in high-performance cluster environments. aRNApipe is highly modular and can be easily migrated to any high-performance computing (HPC) environment. The current applications included in aRNApipe combine the essential RNA-seq primary analyses, including quality control metrics, transcript alignment, count generation, transcript fusion identification, alternative splicing and sequence variant calling. aRNApipe is project-oriented and dynamic so users can easily update analyses to include or exclude samples or enable additional processing modules. Workflow parameters are easily set using a single configuration file that provides centralized tracking of all analytical processes. Finally, aRNApipe incorporates interactive web reports for sample tracking and a tool for managing the genome assemblies available to perform an analysis."
publication = "Alonso A, Lasseigne B N, Williams K, et al. aRNApipe: a balanced, efficient and distributed pipeline for processing RNA-seq data in high-performance computing environments[J]. Bioinformatics, 2017, 33(11): 1727-1729."
title = "A toolkit for the quality control (QC) of next generation sequencing (NGS) data"
description = "Next generation sequencing (NGS) technologies provide a high-throughput means to generate large amount of sequence data. However, quality control (QC) of sequence data generated from these technologies is extremely important for meaningful downstream analysis. Further, highly efficient and fast processing tools are required to handle the large volume of datasets. Here, we have developed an application, NGS QC Toolkit, for quality check and filtering of high-quality data. This toolkit is a standalone and open source application freely available at All the tools in the application have been implemented in Perl programming language. The toolkit is comprised of user-friendly tools for QC of sequencing data generated using Roche 454 and Illumina platforms, and additional tools to aid QC (sequence format converter and trimming tools) and analysis (statistics tools). A variety of options have been provided to facilitate the QC at user-defined parameters. The toolkit is expected to be very useful for the QC of NGS data to facilitate better downstream analysis."
publication = "Patel R K, Jain M. NGS QC Toolkit: a toolkit for quality control of next generation sequencing data[J]. PloS one, 2012, 7(2): e30619."
title = "A wrapper around Cutadapt and FastQC to consistently apply adapter and quality trimming to FastQ files, with extra functionality for RRBS data"
description = "Trim Galore! is a wrapper script to automate quality and adapter trimming as well as quality control, with some added functionality to remove biased methylation positions for RRBS sequence files (for directional, non-directional (or paired-end) sequencing)."
publication = ""
title = "Aggregate results from bioinformatics analyses across many samples into a single report."
description = """MultiQC is a tool to create a single report with interactive plots for multiple bioinformatics analyses across many samples.
MultiQC is written in Python (tested with v2.7, 3.4, 3.5 and 3.6). It is available on the Python Package Index and through conda using Bioconda.
Reports are generated by scanning given directories for recognised log files. These are parsed and a single HTML report is generated summarising the statistics for all logs found. MultiQC reports can describe multiple analysis steps, large numbers of samples and multiple analysis tools within a single plot, making it ideal for routine, fast quality control."""
publication = "Ewels P, Magnusson M, Lundin S, et al. MultiQC: summarize analysis results for multiple tools and samples in a single report[J]. Bioinformatics, 2016, 32(19): 3047-3048."
title = "OLego: fast and sensitive mapping of spliced mRNA-Seq reads using small seeds"
description = "A crucial step in analyzing mRNA-Seq data is to accurately and efficiently map hundreds of millions of reads to the reference genome and exon junctions. Here we present OLego, an algorithm specifically designed for de novo mapping of spliced mRNASeq reads. OLego adopts a multiple-seed-andextend scheme, and does not rely on a separate external aligner. It achieves high sensitivity of junction detection by strategic searches with small seeds (14 nt for mammalian genomes). To improve accuracy and resolve ambiguous mapping at junctions, OLego uses a built-in statistical model to score exon junctions by splice-site strength and intron size. Burrows–Wheeler transform is used in multiple steps of the algorithm to efficiently map seeds, locate junctions and identify small exons. OLego is implemented in C++ with fully multithreaded execution, and allows fast processing of large-scale data. We systematically evaluated the performance of OLego in comparison with published tools using both simulated and real data. OLego demonstrated better sensitivity, higher or comparable accuracy and substantially improved speed. OLego also identified hundreds of novel micro-exons (<30 nt) in the mouse transcriptome, many of which are phylogenetically conserved and can be validated experimentally in vivo. OLego is freely available at http://zhanglab.c2b2.columbia. edu/index.php/OLego."
publication = "Wu J, et al. OLego: fast and sensitive mapping of spliced mRNA-Seq reads using small seeds[J]. Nucleic acids research, 2013, 41(10): 5149-5163."
title = "ChronQC: A Quality Control Monitoring System for Clinical Next Generation Sequencing"
description = "ChronQC is a quality control (QC) tracking system for clinical implementation of next-generation sequencing (NGS). ChronQC generates time series plots for various QC metrics to allow comparison of current runs to historical runs. ChronQC has multiple features for tracking QC data including Westgard rules for clinical validity, laboratory-defined thresholds, and historical observations within a specified time period. Users can record their notes and corrective actions directly onto the plots for long-term recordkeeping. ChronQC facilitates regular monitoring of clinical NGS to enable adherence to high quality clinical standards."
publication = "Tawari N R, Seow J J W, Dharuman P, et al. ChronQC: A Quality Control Monitoring System for Clinical Next Generation Sequencing[J]. Bioinformatics, 2017."
title = "DART: a fast and accurate RNA-seq mapper with a partitioning strategy"
description = """
We proposed a novel RNA-seq de novo mapping algorithm, called DART, which adopts a partitioning strategy to avoid the extension step. The experimental results on synthetic datasets and real NGS datasets showed that DART is a highly efficient aligner that yields the highest or comparable sensitivity and accuracy compared to most state-of-the-art aligners, and, more importantly, it spends the least amount of time among the selected aligners."""
publication = "Lin H N, Hsu W L. DART: a fast and accurate RNA-seq mapper with a partitioning strategy[J]. Bioinformatics, 2017, 34(2): 190-197."
title = "rHAT: fast alignment of noisy long reads with regional hashing."
description = """MOTIVATION:Single Molecule Real-Time (SMRT) sequencing has been widely applied in cutting-edge genomic studies. However, it is still an expensive task to align the noisy long SMRT reads to reference genome by state-of-the-art aligners, which is becoming a bottleneck in applications with SMRT sequencing. Novel approach is on demand for improving the efficiency and effectiveness of SMRT read alignment.
RESULTS:We propose Regional Hashing-based Alignment Tool (rHAT), a seed-and-extension-based read alignment approach specifically designed for noisy long reads. rHAT indexes reference genome by regional hash table (RHT), a hash table-based index which describes the short tokens within local windows of reference genome. In the seeding phase, rHAT utilizes RHT for efficiently calculating the occurrences of short token matches between partial read and local genomic windows to find highly possible candidate sites. In the extension phase, a sparse dynamic programming-based heuristic approach is used for reducing the cost of aligning read to the candidate sites. By benchmarking on the real and simulated datasets from various prokaryote and eukaryote genomes, we demonstrated that rHAT can effectively align SMRT reads with outstanding throughput."""
publication = "Liu B, Guan D, Teng M, et al. rHAT: fast alignment of noisy long reads with regional hashing[J]. Bioinformatics, 2015, 32(11): 1625-1631."
title = "GIGGLE: a search engine for large-scale integrated genome analysis"
description = "GIGGLE is a genomics search engine that identifies and ranks the significance of genomic loci shared between query features and thousands of genome interval files. GIGGLE (https:// scales to billions of intervals and is over three orders of magnitude faster than existing methods. Its speed extends the accessibility and utility of resources such as ENCODE , Roadmap Epigenomics, and GTE x by facilitating data integration and hypothesis generation."
publication = "Layer, R.M. et al. GIGGLE: a search engine for large-scale integrated genome analysis. Nat Methods (2018)."
title = "RSEM: accurate quantification of gene and isoform expression from RNA-Seq data"
description = "RSEM is a software package for estimating gene and isoform expression levels from RNA-Seq data. The RSEM package provides an user-friendly interface, supports threads for parallel computation of the EM algorithm, single-end and paired-end read data, quality scores, variable-length reads and RSPD estimation. In addition, it provides posterior mean and 95% credibility interval estimates for expression levels. For visualization, It can generate BAM and Wiggle files in both transcript-coordinate and genomic-coordinate. Genomic-coordinate files can be visualized by both UCSC Genome browser and Broad Institute's Integrative Genomics Viewer (IGV). Transcript-coordinate files can be visualized by IGV. RSEM also has its own scripts to generate transcript read depth plots in pdf format. The unique feature of RSEM is, the read depth plots can be stacked, with read depth contributed to unique reads shown in black and contributed to multi-reads shown in red. In addition, models learned from data can also be visualized. Last but not least, RSEM contains a simulator."
publication = "Li B, Dewey C N. RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome[J]. BMC bioinformatics, 2011, 12(1): 323."
title = "RADIA: RNA and DNA Integrated Analysis for Somatic Mutation Detection"
description = """RADIA identifies RNA and DNA variants in BAM files. RADIA is typically run on 3 BAM files consisting of the Normal DNA, Tumor DNA and Tumor RNA. If no RNA is available from the tumor, then it is run on the normal/tumor pairs. For the normal DNA, RADIA outputs any differences compared to the reference which could be potential Germline mutations. For the tumor DNA, RADIA outputs any differences compared to the reference and the normal DNA which could be potential Somatic mutations. RADIA combines the tumor DNA and tumor RNA to augment the somatic mutation calls. It also uses the tumor RNA to identify potential RNA editing events.
The DNA Only Method (DOM) uses just the tumor/normal pairs of DNA (ignoring the RNA), while the Triple BAM Method (TBM) uses all three datasets from the same patient to detect somatic mutations. The mutations from the TBM are further categorized into 2 sub-groups: RNA Confirmation and RNA Rescue calls. RNA Confirmation calls are those that are made by both the DOM and the TBM due to the strong read support in both the DNA and RNA. RNA Rescue calls are those that had very little DNA support, hence not called by the DOM, but strong RNA support, and thus called by the TBM. RNA Rescue calls are typically missed by traditional methods that only interrogate the DNA."""
publication = "Radenbaugh AJ, Ma S, Ewing A, Stuart JM, Collisson EA, Zhu J, Haussler D. (2014) RADIA: RNA and DNA Integrated Analysis for Somatic Mutation Detection. PLoS ONE 9(11): e111516. doi:10.1371/journal.pone.0111516"
title = "Manta: rapid detection of structural variants and indels for germline and cancer sequencing applications"
description = """Manta calls structural variants (SVs) and indels from mapped paired-end sequencing reads. It is optimized for analysis of germline variation in small sets of individuals and somatic variation in tumor/normal sample pairs. Manta discovers, assembles and scores large-scale SVs, medium-sized indels and large insertions within a single efficient workflow. The method is designed for rapid analysis on standard compute hardware: NA12878 at 50x genomic coverage is analyzed in less than 20 minutes on a 20 core server, and most WGS tumor/normal analyses can be completed within 2 hours. Manta combines paired and split-read evidence during SV discovery and scoring to improve accuracy, but does not require split-reads or successful breakpoint assemblies to report a variant in cases where there is strong evidence otherwise. It provides scoring models for germline variants in small sets of diploid samples and somatic variants in matched tumor/normal sample pairs. There is experimental support for analysis of unmatched tumor samples as well. Manta accepts input read mappings from BAM or CRAM files and reports all SV and indel inferences in VCF 4.1 format. See the user guide for a full description of capabilities and limitations.
publication = "Chen, X. et al. (2016) Manta: rapid detection of structural variants and indels for germline and cancer sequencing applications. Bioinformatics, 32, 1220-1222. doi:10.1093/bioinformatics/btv710"
title = "JAFFA is a multi-step pipeline that takes either raw RNA-Seq reads, or pre-assembled transcripts, then searches for gene fusions"
description = """JAFFA is a multi-step pipeline that takes either raw RNA-Seq reads, or pre-assembled transcripts, then searches for gene fusions. It will output the names and locations of candidate gene fusions along with the cDNA sequence of their breakpoints. JAFFA is based on the idea of comparing a transcriptome (e.g. in a cancer sample) against a reference transcriptome. In this way, it is a transcript-centric approach rather than a genome-centric approach like other fusion finders. In validation studies, JAFFA performed well over a range of read lengths - from 50bp to full-length transcripts and on single and paired-end reads.
publication = "Davidson N M, Majewski I J, Oshlack A. JAFFA: High sensitivity transcriptome-focused fusion gene detection[J]. Genome Medicine,7,1(2015-05-11), 2015, 7(1):1-12. doi:10.1186/s13073-015-0167-x"
title = "The MARVEL assembler"
description = """MARVEL consists of a set of tools that facilitate the overlapping, patching, correction and assembly of noisy (not so noisy ones as well) long reads."""
publication = ""
title = "Structural Variants Pipeline for Long Reads"
description = """Picky is a structural variant pipeline for long reads developed by Genome Technologies, The Jackson Laboratory. Picky was initially designed for Oxford Nanopore long reads, but will also work for PacBio reads. Picky uses LAST to generate all possible High-scoring Segment Pairs (HSPs) for a read, and 'pick'-and-stitch the segments into the representative alignments with a greedy algorithm instead of using last-split. Picky's picking does not assume colinearity nor mandate that each read base must only be aligned to at most a single genomic location."""
publication = """(Gong et al., 2017) Nanopore Sequencing Reveals High-Resolution Structural Variation in the Cancer Genome. BioRxiv.
DOI: 10.1101/209718"""
title = "Control-FREEC: Copy number and genotype annotation in whole genome and whole exome sequencing data"
description = """Control-FREEC is a tool for detection of copy-number changes and allelic imbalances (including LOH) using deep-sequencing data originally developed by the Bioinformatics Laboratory of Institut Curie (Paris). Since 2016, the project has moved to Insitut Cochin, INSERM U1016 (Paris).
Control-FREEC automatically computes, normalizes, segments copy number and beta allele frequency (BAF) profiles, then calls copy number alterations and LOH. The control (matched normal) sample is optional for whole genome sequencing data but mandatory for whole exome or targeted sequencing data. For whole genome sequencing data analysis, the program can also use mappability data (files created by GEM).
Starting from version 8.0, Control-FREEC can also detect subclonal gains and losses and evaluate the most likely average ploidy of the sample. In addition, the procedure for evaluating the level of contamination by normal cells has been improved."""
publication = """Boeva V, Zinovyev A, Bleakley K, Vert JP, Janoueix-Lerosey I, Delattre O, Barillot E. (2011) Control-free calling of copy number alterations in deep-sequencing data using GC-content normalization. Bioinformatics 2011; 27(2):268-9. PMID: 21081509.
Boeva V, Popova T, Bleakley K, Chiche P, Cappo J, Schleiermacher G, Janoueix-Lerosey I, Delattre O, Barillot E. (2011) Control-FREEC: a tool for assessing copy number and allelic content using next generation sequencing data. Bioinformatics. 2011 Dec 6. [Epub ahead of print] PubMed PMID: 22155870."""
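# Usage sketch (illustrative only): Control-FREEC is driven by its own
# configuration file that points to the sample (and optional control) data;
# config.txt here is a placeholder.
#   freec -conf config.txt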
title = "OncodriveCLUST: a method aimed to identify genes whose mutations are biased towards a large spatial clustering"
description = """This method is designed to exploit the feature that mutations in cancer genes, especially oncogenes, often cluster in particular positions of the protein. We consider this as a sign that mutations in these regions change the function of these proteins in a manner that provides an adaptive advantage to cancer cells and consequently are positively selected during clonal evolution of tumours, and this property can thus be used to nominate novel candidate driver genes.
The method does not assume that the baseline mutation probability is homogeneous across all gene positions but it creates a background model using silent mutations. Coding silent mutations are supposed to be under no positive selection and may reflect the baseline clustering of somatic mutations. Given recent evidences of non-random mutation processes along the genome, the assumption of homogenous mutation probabilities is likely an oversimplication introducing bias in the detection of meaningful events."""
publication = "Tamborero D, Gonzalez-Perez A and Lopez-Bigas N. OncodriveCLUST: exploiting the positional clustering of somatic mutations to identify cancer genes. Bioinformatics. 2013; doi: 10.1093/bioinformatics/btt395s"
title = "Sequenza: allele-specific copy number and mutation profiles from tumor sequencing data"
description = """Sequenza is a novel set of tools providing a fast python script to genotype cancer samples, and an R package to estimate cancer cellularity, ploidy, genome wide copy number profile and infer for mutated alleles."""
publication = """F. Favero, T. Joshi,A. M. Marquard, N. J. Birkbak, M. Krzystanek, Q. Li, Z. Szallasi, and A. C. Eklund. “Sequenza: allele-specific copy number and mutation profiles from tumor sequencing data”. Annals of Oncology, 2015, vol. 26, issue 1, 64-70."""
title = "Ultra-efficient taxonomic mapping of NGS data"
description = "taxMaps is an ultra-efficient, customizable and fully scalable taxonomic classification tool for short-read data designed to deal with large DNA/RNA metagenomics samples. Its performance and comprehensiveness makes it highly suitable for unbiased contamination detection in large-scale sequencing operations, microbiome studies comprising a large number of samples, and for applications where the analysis delivery time is a critical factor, such as pathogen identification from clinical or environmental samples."
publication = "Corvelo A, Clarke W E, Robine N, et al. taxMaps: Comprehensive and highly accurate taxonomic classification of short-read data in reasonable time.[J]. Genome Research, 2018. doi: 10.1101/gr.225276.117"
title = "Structural variation and indel detection by local assembly"
description = """
SvABA is a method for detecting structural variants in sequencing data using genome-wide local assembly. Under the hood, SvABA uses a custom implementation of SGA (String Graph Assembler) by Jared Simpson, and BWA-MEM by Heng Li. Contigs are assembled for every 25kb window (with some small overlap) for every region in the genome. The default is to use only clipped, discordant, unmapped and indel reads, although this can be customized to any set of reads at the command line using VariantBam rules. These contigs are then immediately aligned to the reference with BWA-MEM and parsed to identify variants. Sequencing reads are then realigned to the contigs with BWA-MEM, and variants are scored by their read support.
SvABA is currently configured to provide indel and rearrangement calls (and anything "in between"). It can jointly call any number of BAM/CRAM/SAM files, and has built-in support for case-control experiments (e.g. tumor/normal, or trios or quads). In case/control mode, any number of cases and controls (but a minimum of 1 case) can be input, and all sequences will be jointly assembled together. If both a case and control are present, variants are output separately in "somatic" and "germline" VCFs. If only a single BAM is present (input with the -t flag), a single SV and a single indel VCF will be emitted."""
publication = "Wala J A, Bandopadhayay P, Greenwald N F, et al. SvABA: genome-wide detection of structural variants and indels by local assembly[J]. Genome research, 2018, 28(4): 581-591. doi: 10.1101/gr.221028.117"
title = "The Read Origin Protocol (ROP) is a computational protocol that aims to discover the source of all reads, including those originating from repeat sequences, recombinant B and T cell receptors, and microbial communities."
description = """ROP is a computational protocol aimed to discover the source of all unmapped, which originate from complex RNA molecules, recombinant B and T cell receptors and microbial communities. We have tested ROP on 1 trillion reads from 10641 RNA-Seq samples across at least 54 tissues and 2630 individuals. The ROP accounts for 99.9% of all reads, compared to 82.9% by conventional mapping-based protocols. ROP is able to profile:
- repeats
- hyper-edited RNAs
- circRNAs, gene fusions, trans-splicing events
- recombined B and T cell receptor repertoires
- microbial communities
The 'dumpster diving' profile of unmapped reads output by our method is not limited to RNA-Seq technology and may be applied to whole-exome and whole-genome sequencing."""
publication = "Mangul S, Yang H T, Strauli N, et al. ROP: dumpster diving in RNA-sequencing to find the source of 1 trillion reads across diverse adult human tissues[J]. Genome Biology, 2018, 19(1):36. doi: 10.1186/s13059-018-1403-7"
title = "software tool for the manipulation, annotation, selection, simulation, and analysis of variants in the context of next-gen sequencing analysis"
description = "VariantTools, software tool for the manipulation, annotation, selection, simulation, and analysis of variants in the context of next-gen sequencing analysis."
publication = "Integrated annotation and analysis of genetic variants from next-generation sequencing studies with variant tools, Bioinformatics 28 (3): 421-422."
title = "A tool for automated alignment trimming in large-scale phylogenetic analyses."
description = "trimAl can consider several parameters, alone or in multiple combinations, in order to select the most-reliable positions in the alignment. These include the proportion of sequences with a gap, the level of residue similarity and, if several alignments for the same set of sequences are provided, the consistency level of columns among alignments. Moreover, trimAl is able to manually select a set of columns to be removed from the alignment."
publication = "trimAl: a tool for automated alignment trimming in large-scale phylogenetic analyses, Bioinformatics, 2009, doi:10.1093/bioinformatics/btp348"
title = "An implementation of a Gated Recurrent Unit (GRU) network for classification of transcripts as either coding or noncoding"
description = "mRNN is a package for distinguishing coding transcripts from noncoding using gated recurrent neural networks (GRNNs)."
publication = "Hill, S. T., Kuintzle, R. C., Teegarden, A., Merrill, E., Danaee, P., & Hendrix, D. A. (2017). A Deep Recurrent Neural Network Discovers Complex Biological Rules to Decipher RNA Protein-Coding Potential. Nucleic acids research,"
description = "This Snakemake pipeline implements the GATK best-practices workflow"
title = "A flexible package manager that supports multiple versions, configurations, platforms, and compilers."
description = """
Spack is a package manager for supercomputers, Linux, and macOS. It makes installing scientific software easy. With Spack, you can build a package with multiple versions, configurations, platforms, and compilers, and all of these builds can coexist on the same machine. Spack isn't tied to a particular language; you can build a software stack in Python or R, link to libraries written in C, C++, or Fortran, and easily swap compilers. Use Spack to install in your home directory, to manage shared installations and modules on a cluster, or to build combinatorial versions of software for testing."""
publication = "Todd Gamblin, Matthew P. LeGendre, Michael R. Collette, Gregory L. Lee, Adam Moody, Bronis R. de Supinski, and W. Scott Futral. The Spack Package Manager: Bringing Order to HPC Software Chaos. In Supercomputing 2015 (SC’15), Austin, Texas, November 15-20 2015. LLNL-CONF-669890."