From 81d4804f8f1159926db4c1f87bf5fa992eff1388 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 7 Apr 2022 13:33:35 -0700 Subject: [PATCH 01/15] Counter now offers basic support for counting SAM files which were not produced by the pipeline. This commit simply changes how sequence counts are determined from each alignment's QNAME field. If this field does not have Collapser's signature "#_count=#" format, it next attempts to parse it as a fastx_collapse string. If that fails the alignment defaults to a count of 1. More broadly speaking, if Counter fails to parse the QNAME field, a default count of 1 is assumed. --- tiny/rna/counter/statistics.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index c6cb5aa3..2cbe83b2 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -1,3 +1,5 @@ +import re + import pandas as pd import mmap import json @@ -11,6 +13,7 @@ from ..util import make_filename +_re_fastx = r'seq\d+_x(\d+)$' class LibraryStats: @@ -43,8 +46,14 @@ def count_bundle(self, aln_bundle: iter) -> dict: loci_counts = len(aln_bundle) nt5, seqlen = bundle_read['nt5'], len(bundle_read['seq']) + try: + read_counts = int(bundle_read['name'].split('=')[1]) + except IndexError: + # Check if fastx collapsed; default to count=1 + fastx = re.match(_re_fastx, bundle_read['name']) + read_counts = 1 if fastx is None else int(fastx[1]) + # Calculate counts for multi-mapping - read_counts = int(bundle_read['name'].split('=')[1]) corr_counts = read_counts / loci_counts # Fill in 5p nt/length matrix From e8d071f6e2aad11d6329c7590af4358ea630c00c Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 8 Apr 2022 12:15:11 -0700 Subject: [PATCH 02/15] The SAM_reader class has been adapted to handle fastx_collapser QNAME in addition to those made by tiny-collapse. 
If a SAM file contains fastx style QNAMEs then it is also required that the SAM file's header reports SO:queryname so that multiple-alignments can be properly bundled --- tests/unit_tests_hts_parsing.py | 2 +- tiny/rna/counter/hts_parsing.py | 60 ++++++++++++++++++++++++++++----- tiny/rna/counter/statistics.py | 2 +- 3 files changed, 54 insertions(+), 10 deletions(-) diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index 1278778b..ec406448 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -509,7 +509,7 @@ def test_SAM_reader_read_thru_header(self): ] sam_out.assert_has_calls(expected_writelines) - self.assertTrue(len(reader._headers) == 1) + self.assertTrue(len(reader._header_lines) == 1) """Does SAM_reader._write_decollapsed_sam() write the correct number of duplicates to the decollapsed file?""" diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 2f591234..0c4eb596 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -13,6 +13,9 @@ _re_attr_main = re.compile(r"\s*([^\s=]+)[\s=]+(.*)") _re_attr_empty = re.compile(r"^\s*$") +# For SAM_reader +_re_fastx = r'seq\d+_x(\d+)' + # For Alignment complement = {ord('A'): 'T', ord('T'): 'A', ord('G'): 'C', ord('C'): 'G'} @@ -23,10 +26,12 @@ class SAM_reader: def __init__(self, **prefs): self.decollapse = prefs.get("decollapse", False) self.out_prefix = prefs.get("out_prefix", None) + self.collapser_type = None + self.file = None self._decollapsed_filename = None self._decollapsed_reads = [] - self._headers = [] - self.file = None + self._header_lines = [] + self._header_dict = {} def bundle_multi_alignments(self, file) -> Iterator[List[dict]]: """Bundles multiple alignments by name""" @@ -50,8 +55,8 @@ def bundle_multi_alignments(self, file) -> Iterator[List[dict]]: def _parse_alignments(self, file_obj) -> Iterator[dict]: """Parses and yields individual SAM alignments from the open file_obj""" - 
line = self._read_thru_header(file_obj) - line_no = len(self._headers) + line, line_no = self._read_thru_header(file_obj) + self._validate_collapsed_input(line) decollapse_sam = self.decollapse try: @@ -101,24 +106,57 @@ def _get_decollapsed_filename(self): def _read_thru_header(self, file_obj): """Advance file_obj past the SAM header and return the first alignment unparsed""" + lineno = 1 line = file_obj.readline() while line[0] == ord('@'): - self._headers.append(line.decode('utf-8')) + self._save_header_line(line.decode('utf-8')) line = file_obj.readline() + lineno += 1 if self.decollapse: # Write the same header data to the decollapsed file with open(self._get_decollapsed_filename(), 'w') as f: - f.writelines(self._headers) + f.writelines(self._header_lines) + + return line, lineno + + def _save_header_line(self, headerline: str): + # Header lines + self._header_lines.append(headerline) - return line + # Header dict + rec_type = headerline[:3] + fields = headerline[3:].strip().split('\t') + + if rec_type is "@CO": + self._header_dict[rec_type] = fields[0] + else: + self._header_dict[rec_type] = \ + {field[:2]: field[3:].strip() for field in fields} + + def _validate_collapsed_input(self, first_aln_line): + if re.match(_re_fastx, first_aln_line) is not None: + sort_order = self._header_dict.get('@HD', {}).get('SO', None) + if sort_order is None or sort_order != "queryname": + raise ValueError("SAM files from fastx collapsed outputs must be sorted by queryname\n" + "(and the @HD [...] 
SO header must be set accordingly).") + + self.collapser_type = "fastx" + + elif re.match(r"\d+_count=\d+", first_aln_line) is None: + raise ValueError("SAM files produced outside of the tinyRNA pipeline must be derived\n" + "from either a tiny-collapse or fastx_collapser output, and sorted\n" + "by queryname.") + else: + self.collapser_type = "tiny-collapse" def _write_decollapsed_sam(self): aln_out, prevname, seq_count = [], None, 0 + token = self.get_counts_split_token() for name, line in self._decollapsed_reads: # Parse count just once per multi-alignment if name != prevname: - seq_count = int(name.split(b"=")[1]) + seq_count = int(name.split(token)[1]) aln_out.extend([line] * seq_count) prevname = name @@ -127,6 +165,12 @@ def _write_decollapsed_sam(self): sam_o.writelines(aln_out) self._decollapsed_reads.clear() + def get_counts_split_token(self): + if self.collapser_type is "tiny-collapse": + return b"=" + elif self.collapser_type is "fastx": + return b"_x" + def infer_strandedness(sam_file: str, intervals: dict) -> str: """Infers strandedness from a sample SAM file and intervals from a parsed GFF file diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 2cbe83b2..25929949 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -11,9 +11,9 @@ from typing import Tuple, Optional from collections import Counter, defaultdict +from tiny.rna.counter.hts_parsing import _re_fastx from ..util import make_filename -_re_fastx = r'seq\d+_x(\d+)$' class LibraryStats: From e9305840e8778094425a7a3badfc7896f7809794 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 26 Jul 2022 13:13:40 -0700 Subject: [PATCH 03/15] Minor bugs related to the recent upgrade from Python 3.7 to 3.9 have been addressed. SAM file requirements have been relaxed to once again allow non-collapsed SAM files. 
When these files are encountered, SAM_reader cannot produce decollapsed outputs without parsing and counting the entire file, so a warning is produced and the decollapse routine is skipped --- tests/testdata/counter/non-collapsed.sam | 2 ++ tests/unit_tests_hts_parsing.py | 23 +++++++++++++++-- tiny/rna/counter/hts_parsing.py | 32 +++++++++++++++--------- 3 files changed, 43 insertions(+), 14 deletions(-) create mode 100644 tests/testdata/counter/non-collapsed.sam diff --git a/tests/testdata/counter/non-collapsed.sam b/tests/testdata/counter/non-collapsed.sam new file mode 100644 index 00000000..cadeadb6 --- /dev/null +++ b/tests/testdata/counter/non-collapsed.sam @@ -0,0 +1,2 @@ +@SQ SN:I LN:21 +NON_COLLAPSED_QNAME 16 I 15064570 255 21M * 0 0 CAAGACAGAGCTTCACCGTTC IIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:21 NM:i:0 XM:i:2 \ No newline at end of file diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index efc68442..f4366144 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -1,9 +1,11 @@ import collections +import contextlib import unittest -from random import randint - import HTSeq +import io + from copy import deepcopy +from random import randint from unittest.mock import patch, mock_open, call from tiny.rna.counter.features import FeatureSelector @@ -566,6 +568,7 @@ def test_SAM_reader_read_thru_header(self): def test_SAM_reader_write_decollapsed_sam(self): reader = SAM_reader(decollapse=True) + reader.collapser_type = "tiny-collapse" reader._decollapsed_reads = [(b"0_count=5", b"mock line from SAM file")] reader._decollapsed_filename = "mock_outfile_name.sam" @@ -601,6 +604,22 @@ def test_SAM_reader_parse_alignments_decollapse(self): self.exhaust_iterator(reader._parse_alignments(sam_in)) write_fn.assert_called_once() + """Are decollapsed outputs skipped when non-collapsed SAM files are supplied?""" + + def test_SAM_reader_no_decollapse_non_collapsed_SAM_files(self): + stdout_capture = io.StringIO() + with 
patch.object(SAM_reader, "_write_decollapsed_sam") as write_fn: + with contextlib.redirect_stderr(stdout_capture): + reader = SAM_reader(decollapse=True) + records = reader.bundle_multi_alignments(f"{resources}/non-collapsed.sam") + self.exhaust_iterator(records) + + write_fn.assert_not_called() + self.assertEqual(reader.collapser_type, None) + self.assertEqual(stdout_capture.getvalue(), + "Decollapsed SAM files will not be produced because input alignments " + "are not derived from a tiny-collapse or fastx_collapser output\n") + """Does CaseInsensitiveAttrs correctly store, check membership, and retrieve?""" def test_CaseInsensitiveAttrs(self): diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 848d3d7b..5fe9ed7f 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -57,7 +57,7 @@ def _parse_alignments(self, file_obj) -> Iterator[dict]: """Parses and yields individual SAM alignments from the open file_obj""" line, line_no = self._read_thru_header(file_obj) - self._validate_collapsed_input(line) + self._validate_collapsed_input(line.decode()) decollapse_sam = self.decollapse try: @@ -117,6 +117,9 @@ def _read_thru_header(self, file_obj): line = file_obj.readline() lineno += 1 + # Todo: this is probably better off in _validate_collapsed_input + # or perhaps part of a slightly larger refactor to group these routines into a prepare() fn + # the problem is if input is non-collapsed and decollapse=True, the header is written before issue is discovered if self.decollapse: # Write the same header data to the decollapsed file with open(self._get_decollapsed_filename(), 'w') as f: @@ -132,29 +135,32 @@ def _save_header_line(self, headerline: str): rec_type = headerline[:3] fields = headerline[3:].strip().split('\t') - if rec_type is "@CO": + if rec_type == "@CO": self._header_dict[rec_type] = fields[0] else: self._header_dict[rec_type] = \ {field[:2]: field[3:].strip() for field in fields} def 
_validate_collapsed_input(self, first_aln_line): - if re.match(_re_fastx, first_aln_line) is not None: + if re.match(r"\d+_count=\d+", first_aln_line) is not None: + self.collapser_type = "tiny-collapse" + + elif re.match(_re_fastx, first_aln_line) is not None: + self.collapser_type = "fastx" sort_order = self._header_dict.get('@HD', {}).get('SO', None) if sort_order is None or sort_order != "queryname": raise ValueError("SAM files from fastx collapsed outputs must be sorted by queryname\n" "(and the @HD [...] SO header must be set accordingly).") - self.collapser_type = "fastx" - - elif re.match(r"\d+_count=\d+", first_aln_line) is None: - raise ValueError("SAM files produced outside of the tinyRNA pipeline must be derived\n" - "from either a tiny-collapse or fastx_collapser output, and sorted\n" - "by queryname.") else: - self.collapser_type = "tiny-collapse" + self.collapser_type = None + if self.decollapse: + self.decollapse = False + print("Decollapsed SAM files will not be produced because input alignments are not derived " + "from a tiny-collapse or fastx_collapser output", file=sys.stderr) def _write_decollapsed_sam(self): + assert self.collapser_type is not None aln_out, prevname, seq_count = [], None, 0 token = self.get_counts_split_token() for name, line in self._decollapsed_reads: @@ -170,10 +176,12 @@ def _write_decollapsed_sam(self): self._decollapsed_reads.clear() def get_counts_split_token(self): - if self.collapser_type is "tiny-collapse": + if self.collapser_type == "tiny-collapse": return b"=" - elif self.collapser_type is "fastx": + elif self.collapser_type == "fastx": return b"_x" + elif self.collapser_type is None: + return None def infer_strandedness(sam_file: str, intervals: dict) -> str: From 496ed8bc71af6b9413a110dd6fd6f889691f6b4c Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 27 Jul 2022 12:29:31 -0700 Subject: [PATCH 04/15] Mild refactor to clean up the method signatures of run_cwltool_subprocess and 
run_cwltool_native. This was prompted by the addition of another configurable cwltool argument: --tmpdir-prefix. This will allow users to specify the location for temporary/intermediate files if the default location is on a space-limited volume --- tiny/entry.py | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/tiny/entry.py b/tiny/entry.py index 83a2bbe2..7f8e50c7 100644 --- a/tiny/entry.py +++ b/tiny/entry.py @@ -89,21 +89,16 @@ def run(tinyrna_cwl_path: str, config_file: str) -> None: # First get the configuration file set up for this run config_object = Configuration(config_file) run_directory = config_object.create_run_directory() - cwl_conf_file = config_object.save_run_profile() + config_object.save_run_profile() workflow = f"{tinyrna_cwl_path}/workflows/tinyrna_wf.cwl" - parallel = config_object['run_parallel'] - loudness = config_object['verbosity'] - if config_object['run_native']: # experimental + if config_object['run_native']: # Execute the CWL runner via native Python - return_code = run_native(config_object, workflow, run_directory, verbosity=loudness) + return_code = run_cwltool_native(config_object, workflow, run_directory) else: # Use the cwltool CWL runner via command line - return_code = run_cwltool_subprocess( - cwl_conf_file, workflow, - run_directory=run_directory, - parallel=parallel, verbosity=loudness) + return_code = run_cwltool_subprocess(config_object, workflow, run_directory) # If the workflow completed without errors, we want to update # the Paths Sheet to point to the new bowtie index prefix @@ -149,43 +144,46 @@ def resume(tinyrna_cwl_path: str, config_file: str, step: str) -> None: if config['run_native']: # We can pass our config object directly without writing to disk first - run_native(config, resume_wf, verbosity=config['verbosity']) + run_cwltool_native(config, resume_wf) else: # Processed Run Config must be written to disk first - resume_conf_file = "resume_" + 
os.path.basename(config_file) + resume_conf_file = config.get_outfile_path() config.write_processed_config(resume_conf_file) - run_cwltool_subprocess(resume_conf_file, resume_wf, verbosity=config['verbosity']) + run_cwltool_subprocess(config, resume_wf) if os.path.isfile(resume_wf): # We don't want the generated workflow to be returned by a call to setup-cwl os.remove(resume_wf) -def run_cwltool_subprocess(config_file: str, workflow: str, run_directory=None, parallel=False, verbosity="normal") -> int: +def run_cwltool_subprocess(config_object: 'ConfigBase', workflow: str, run_directory='.') -> int: """Executes the workflow using a command line invocation of cwltool Args: - config_file: the processed configuration file produced by configuration.py + config_object: a constructed ConfigBase-derived object workflow: the path to the workflow to be executed run_directory: the destination folder for workflow output subdirectories (default: CWD) - parallel: process libraries in parallel where possible - verbosity: controls the depth of information written to terminal by cwltool Returns: None """ - command = ['cwltool --timestamps --relax-path-checks --on-error continue'] + processed_configfile = config_object.get_outfile_path() + tmpdir = config_object['tmp_directory'] + parallel = config_object['run_parallel'] + verbosity = config_object['verbosity'] + + command = [f'cwltool --timestamps --relax-path-checks --on-error continue --outdir {run_directory}'] + if tmpdir is not None: command.append(f'--tmpdir-prefix {tmpdir}') if verbosity == 'debug': command.append('--debug --js-console --leave-tmpdir') if verbosity == 'quiet': command.append('--quiet') - if run_directory: command.append(f'--outdir {run_directory}') if parallel: command.append('--parallel') - cwl_runner = ' '.join(command + [workflow, config_file]) + cwl_runner = ' '.join(command + [workflow, processed_configfile]) return subprocess.run(cwl_runner, shell=True).returncode -def run_native(config_object: 
'ConfigBase', workflow: str, run_directory: str = '.', verbosity="normal") -> int: +def run_cwltool_native(config_object: 'ConfigBase', workflow: str, run_directory: str = '.') -> int: """Executes the workflow using native Python rather than subprocess "command line" Args: @@ -214,6 +212,7 @@ def furnish_if_file_record(file_dict): furnish_if_file_record(config_param) # Set overall config for cwltool + verbosity = config_object['verbosity'] runtime_context = RuntimeContext({ 'secret_store': cwltool.secrets.SecretStore(), 'outdir': run_directory, From c19e7759f303bfb4a5c256ac88db27b5e1648598 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 27 Jul 2022 12:31:10 -0700 Subject: [PATCH 05/15] Adding support for the new tmp_directory parameter --- tiny/rna/configuration.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 6ef9b379..ebaf0c10 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -124,8 +124,9 @@ def create_run_directory(self) -> str: return run_dir - def get_outfile_path(self, infile: str) -> str: - """Prepend date+time to the Config File name. 
Change its path to reside in the Run Directory.""" + def get_outfile_path(self, infile: str = None) -> str: + """Returns the path and filename for the processed run config""" + if infile is None: infile = self.inf return self.joinpath(self['run_directory'], os.path.basename(infile)) def write_processed_config(self, filename: str = None) -> str: @@ -184,7 +185,7 @@ def to_cwl_file_class(input_file_path): path_to_input = self.paths.from_here(input_file_path) return self.cwl_file(path_to_input) - for absorb_key in ['ebwt', 'plot_style_sheet', 'adapter_fasta']: + for absorb_key in ['ebwt', 'plot_style_sheet', 'adapter_fasta', 'tmp_directory']: self[absorb_key] = self.paths[absorb_key] self['run_directory'] = self.paths.from_here(self.paths['run_directory']) From c7bcb640d2ec6812bde5a0ccd27afefb87eb84aa Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 27 Jul 2022 12:31:43 -0700 Subject: [PATCH 06/15] Minor addition to ResumeBase for compatibility with the refactor in entry.py --- tiny/rna/resume.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tiny/rna/resume.py b/tiny/rna/resume.py index b8aa6edb..a3444375 100644 --- a/tiny/rna/resume.py +++ b/tiny/rna/resume.py @@ -97,6 +97,11 @@ def _add_timestamps(self, steps): # Update run_name output prefix variable for the current date and time self['run_name'] = re.sub(timestamp_format, self.dt, self['run_name']) + # Override + def get_outfile_path(self, infile: str = None) -> str: + if infile is None: infile = self.inf + return "resume_" + os.path.basename(infile) + def write_workflow(self, workflow_outfile: str) -> None: with open(workflow_outfile, "w") as wf: self.yaml.dump(self.workflow, wf) From 219113b380574a868d533e7fc8947b098f53a366 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 27 Jul 2022 12:32:24 -0700 Subject: [PATCH 07/15] tmp_directory key and placeholder key added to paths.yml and run_config.yml --- START_HERE/paths.yml | 5 ++++- 
START_HERE/run_config.yml | 1 + tiny/templates/paths.yml | 3 +++ tiny/templates/run_config_template.yml | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml index 93ce213b..33ad4c32 100644 --- a/START_HERE/paths.yml +++ b/START_HERE/paths.yml @@ -17,6 +17,9 @@ features_csv: ./features.csv ##-- The final output directory for files produced by the pipeline --# run_directory: run_directory +##-- The directory for temporary files. Determined by cwltool if blank. --## +tmp_directory: + ######-------------------------------- BOWTIE-BUILD ---------------------------------###### # # To build bowtie indexes: @@ -33,7 +36,7 @@ run_directory: run_directory ##-- The prefix for your bowtie index, include relative path (relative to this config file) --## ##-- If you do not have a bowtie index, change this to ebwt: '' -ebwt: '' +ebwt: /Users/alex/Projects/PycharmProjects/tinyrna/START_HERE/tinyrna_2022-07-26_13-35-30_run_directory/bowtie-build/ram1 ##-- If you do not have a bowtie index, provide your reference genome file(s) here --## ##-- One file per line, with "- " at the beginning (think: bulleted list) --## diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index 235c7090..ea9d4aa3 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -328,6 +328,7 @@ dir_name_plotter: plots ######-------------------------------------------------------------------------------###### run_directory: ~ +tmp_directory: ~ features_csv: { } samples_csv: { } reference_genome_files: [ ] diff --git a/tiny/templates/paths.yml b/tiny/templates/paths.yml index 24490eae..6d8ed972 100644 --- a/tiny/templates/paths.yml +++ b/tiny/templates/paths.yml @@ -17,6 +17,9 @@ features_csv: './features.csv' ##-- The final output directory for files produced by the pipeline --# run_directory: run_directory +##-- The directory for temporary files. Determined by cwltool if blank. 
--## +tmp_directory: + ######-------------------------------- BOWTIE-BUILD ---------------------------------###### # # To build bowtie indexes: diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml index 49ffff01..a1ec8610 100644 --- a/tiny/templates/run_config_template.yml +++ b/tiny/templates/run_config_template.yml @@ -328,6 +328,7 @@ dir_name_plotter: plots ######-------------------------------------------------------------------------------###### run_directory: ~ +tmp_directory: ~ features_csv: { } samples_csv: { } reference_genome_files: [ ] From edbeb706843fe596ea9de1b7afad8da9a6c51ba1 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 30 Jul 2022 13:55:20 -0700 Subject: [PATCH 08/15] Light refactoring to help clean up SAM_reader --- tests/unit_tests_hts_parsing.py | 15 +++++++----- tiny/rna/counter/hts_parsing.py | 43 +++++++++++++++------------------ 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index f4366144..7479472f 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -544,7 +544,7 @@ def test_SAM_reader_get_decollapsed_filename(self): self.assertEqual(sam_out, "sam_file_decollapsed.sam") - """Does SAM_reader._read_thru_header() correctly identify header lines and write them to the decollapsed file?""" + """Does SAM_reader._read_to_first_aln() correctly identify header lines and write them to the decollapsed file?""" def test_SAM_reader_read_thru_header(self): reader = SAM_reader(decollapse=True) @@ -552,7 +552,7 @@ def test_SAM_reader_read_thru_header(self): with open(self.short_sam_file, 'rb') as sam_in: with patch('builtins.open', mock_open()) as sam_out: - line = reader._read_thru_header(sam_in) + line = reader._read_to_first_aln(sam_in) expected_writelines = [ call('mock_outfile_name.sam', 'w'), @@ -608,17 +608,20 @@ def test_SAM_reader_parse_alignments_decollapse(self): 
def test_SAM_reader_no_decollapse_non_collapsed_SAM_files(self): stdout_capture = io.StringIO() - with patch.object(SAM_reader, "_write_decollapsed_sam") as write_fn: + with patch.object(SAM_reader, "_write_decollapsed_sam") as write_sam, \ + patch.object(SAM_reader, "_write_header_for_decollapsed_sam") as write_header: + with contextlib.redirect_stderr(stdout_capture): reader = SAM_reader(decollapse=True) records = reader.bundle_multi_alignments(f"{resources}/non-collapsed.sam") self.exhaust_iterator(records) - write_fn.assert_not_called() + write_sam.assert_not_called() + write_header.assert_not_called() self.assertEqual(reader.collapser_type, None) self.assertEqual(stdout_capture.getvalue(), - "Decollapsed SAM files will not be produced because input alignments " - "are not derived from a tiny-collapse or fastx_collapser output\n") + "Alignments do not appear to be derived from a supported collapser input. " + "Decollapsed SAM files will therefore not be produced.\n") """Does CaseInsensitiveAttrs correctly store, check membership, and retrieve?""" diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 5fe9ed7f..f1c71a08 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -56,8 +56,7 @@ def bundle_multi_alignments(self, file) -> Iterator[List[dict]]: def _parse_alignments(self, file_obj) -> Iterator[dict]: """Parses and yields individual SAM alignments from the open file_obj""" - line, line_no = self._read_thru_header(file_obj) - self._validate_collapsed_input(line.decode()) + line, line_no = self._read_to_first_aln(file_obj) decollapse_sam = self.decollapse try: @@ -101,33 +100,22 @@ def _parse_alignments(self, file_obj) -> Iterator[dict]: e.args = (str(e.args[0]) + '\n' + f"Error occurred on line {line_no} of {self.file}",) raise e.with_traceback(sys.exc_info()[2]) from e - def _get_decollapsed_filename(self): - if self._decollapsed_filename is None: - basename = 
os.path.splitext(os.path.basename(self.file))[0] - self._decollapsed_filename = make_filename([basename, "decollapsed"], ext='.sam') - return self._decollapsed_filename - - def _read_thru_header(self, file_obj): + def _read_to_first_aln(self, file_obj): """Advance file_obj past the SAM header and return the first alignment unparsed""" lineno = 1 line = file_obj.readline() while line[0] == ord('@'): - self._save_header_line(line.decode('utf-8')) + self._parse_header_line(line.decode('utf-8')) line = file_obj.readline() lineno += 1 - # Todo: this is probably better off in _validate_collapsed_input - # or perhaps part of a slightly larger refactor to group these routines into a prepare() fn - # the problem is if input is non-collapsed and decollapse=True, the header is written before issue is discovered - if self.decollapse: - # Write the same header data to the decollapsed file - with open(self._get_decollapsed_filename(), 'w') as f: - f.writelines(self._header_lines) + self._determine_collapser_type(line.decode()) + if self.decollapse: self._write_header_for_decollapsed_sam() return line, lineno - def _save_header_line(self, headerline: str): + def _parse_header_line(self, headerline: str): # Header lines self._header_lines.append(headerline) @@ -141,23 +129,32 @@ def _save_header_line(self, headerline: str): self._header_dict[rec_type] = \ {field[:2]: field[3:].strip() for field in fields} - def _validate_collapsed_input(self, first_aln_line): + def _determine_collapser_type(self, first_aln_line): if re.match(r"\d+_count=\d+", first_aln_line) is not None: self.collapser_type = "tiny-collapse" - elif re.match(_re_fastx, first_aln_line) is not None: self.collapser_type = "fastx" sort_order = self._header_dict.get('@HD', {}).get('SO', None) if sort_order is None or sort_order != "queryname": raise ValueError("SAM files from fastx collapsed outputs must be sorted by queryname\n" "(and the @HD [...] 
SO header must be set accordingly).") - else: self.collapser_type = None if self.decollapse: self.decollapse = False - print("Decollapsed SAM files will not be produced because input alignments are not derived " - "from a tiny-collapse or fastx_collapser output", file=sys.stderr) + print("Alignments do not appear to be derived from a supported collapser input. " + "Decollapsed SAM files will therefore not be produced.", file=sys.stderr) + + def _get_decollapsed_filename(self): + if self._decollapsed_filename is None: + basename = os.path.splitext(os.path.basename(self.file))[0] + self._decollapsed_filename = make_filename([basename, "decollapsed"], ext='.sam') + return self._decollapsed_filename + + def _write_header_for_decollapsed_sam(self): + assert self.collapser_type is not None + with open(self._get_decollapsed_filename(), 'w') as f: + f.writelines(self._header_lines) def _write_decollapsed_sam(self): assert self.collapser_type is not None From cc32895872ff329d5951341f45dbadef4401a8eb Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 30 Jul 2022 17:27:14 -0700 Subject: [PATCH 09/15] Further refinements to SAM_reader. Mostly just experimenting with moving the bundled read count determination out of statistics.py and into the sam reader. I'm still a little on the fence about it. While this makes it much cleaner to support a variety of collapser utilities (used pre-alignment), it also doesn't make nearly as much sense in the new location. The change slightly increases the runtime, and while it isn't by a significant amount, the %time reduced in statistics.py is far less than the %time increased by having it in SAM_reader. May continue testing. For now, I'm going to reflect on whether this is worth fretting over any further. 
--- tests/unit_tests_hts_parsing.py | 6 +- tiny/rna/counter/features.py | 4 +- tiny/rna/counter/hts_parsing.py | 103 +++++++++++++++++++++----------- tiny/rna/counter/statistics.py | 9 +-- 4 files changed, 74 insertions(+), 48 deletions(-) diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index 7479472f..12002f49 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -71,7 +71,7 @@ def test_sam_reader(self): self.assertEqual(sam_record['start'], 15064569) self.assertEqual(sam_record['end'], 15064590) self.assertEqual(sam_record['strand'], '-') - self.assertEqual(sam_record['name'], "0_count=5") + self.assertEqual(sam_record['name'], b"0_count=5") self.assertEqual(sam_record['seq'], b"CAAGACAGAGCTTCACCGTTC") self.assertEqual(sam_record['len'], 21) self.assertEqual(sam_record['nt5'], 'G') @@ -89,13 +89,13 @@ def test_sam_parser_comparison(self): ours = SAM_reader().bundle_multi_alignments(file) theirs = HTSeq.bundle_multiple_alignments(HTSeq.BAM_Reader(file)) - for our_bundle, their_bundle in zip(ours, theirs): + for (our_bundle, _), their_bundle in zip(ours, theirs): self.assertEqual(len(our_bundle), len(their_bundle)) for our, their in zip(our_bundle, their_bundle): self.assertEqual(our['chrom'], their.iv.chrom) self.assertEqual(our['start'], their.iv.start) self.assertEqual(our['end'], their.iv.end) - self.assertEqual(our['name'], their.read.name) + self.assertEqual(our['name'].decode(), their.read.name) self.assertEqual(our['nt5'], chr(their.read.seq[0])) # See note above self.assertEqual(our['strand'], their.iv.strand) if our['strand'] == '-': # See note above diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 785b6567..1a01874f 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -65,8 +65,8 @@ def count_reads(self, library: dict): self.stats.assign_library(library) # For each sequence in the sam file... 
- for bundle in read_seq: - bstat = self.stats.count_bundle(bundle) + for bundle, read_count in read_seq: + bstat = self.stats.count_bundle(bundle, read_count) # For each alignment of the given sequence... for alignment in bundle: diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index f1c71a08..e426cbb1 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -4,7 +4,7 @@ import re from collections import Counter, defaultdict, namedtuple -from typing import Tuple, List, Dict, Iterator, Optional, DefaultDict, Set, Union +from typing import Tuple, List, Dict, Iterator, Optional, DefaultDict, Set, Union, IO from inspect import stack from tiny.rna.counter.matching import Wildcard @@ -15,6 +15,9 @@ _re_attr_empty = re.compile(r"^\s*$") # For SAM_reader +AlignmentDict = Dict[str, Union[str, int, bytes]] +Bundle = Tuple[List[AlignmentDict], int] +_re_tiny = r"\d+_count=\d+" _re_fastx = r'seq\d+_x(\d+)' # For Alignment @@ -22,38 +25,50 @@ class SAM_reader: - """A minimal SAM reader that bundles multiple-alignments and only parses data relevant to the workflow""" + """A minimal SAM reader that bundles multiple-alignments and only parses data relevant to the workflow.""" def __init__(self, **prefs): self.decollapse = prefs.get("decollapse", False) self.out_prefix = prefs.get("out_prefix", None) self.collapser_type = None self.file = None + self._collapser_token = None self._decollapsed_filename = None self._decollapsed_reads = [] self._header_lines = [] self._header_dict = {} - def bundle_multi_alignments(self, file) -> Iterator[List[dict]]: - """Bundles multiple alignments by name""" + def bundle_multi_alignments(self, file: str) -> Iterator[Bundle]: + """Bundles multi-alignments (determined by a shared QNAME) and reports the associated read's count""" self.file = file - sam_parser = self._parse_alignments with open(file, 'rb') as f: - aln_iter = iter(sam_parser(f)) - bundle = [next(aln_iter)] + aln_iter = 
iter(self._parse_alignments(f)) + bundle, read_count = self._new_bundle(next(aln_iter)) + for aln in aln_iter: if aln['name'] != bundle[0]['name']: - yield bundle - bundle = [aln] + yield bundle, read_count + bundle, read_count = self._new_bundle(aln) else: bundle.append(aln) - yield bundle + yield bundle, read_count if self.decollapse and len(self._decollapsed_reads): self._write_decollapsed_sam() - def _parse_alignments(self, file_obj) -> Iterator[dict]: + def _new_bundle(self, aln: AlignmentDict) -> Bundle: + """Wraps the provided alignment in a list and reports the read's count""" + + if self.collapser_type is not None: + token = self.collapser_token + count = int(aln['name'].split(token)[-1]) + else: + count = 1 + + return [aln], count + + def _parse_alignments(self, file_obj: IO) -> Iterator[AlignmentDict]: """Parses and yields individual SAM alignments from the open file_obj""" line, line_no = self._read_to_first_aln(file_obj) @@ -86,7 +101,7 @@ def _parse_alignments(self, file_obj) -> Iterator[dict]: nt5 = chr(seq[0]) yield { - "name": cols[0].decode(), + "name": cols[0], "len": length, "seq": seq, "nt5": nt5, @@ -100,7 +115,7 @@ def _parse_alignments(self, file_obj) -> Iterator[dict]: e.args = (str(e.args[0]) + '\n' + f"Error occurred on line {line_no} of {self.file}",) raise e.with_traceback(sys.exc_info()[2]) from e - def _read_to_first_aln(self, file_obj): + def _read_to_first_aln(self, file_obj: IO) -> Tuple[bytes, int]: """Advance file_obj past the SAM header and return the first alignment unparsed""" lineno = 1 @@ -115,13 +130,14 @@ def _read_to_first_aln(self, file_obj): return line, lineno - def _parse_header_line(self, headerline: str): - # Header lines - self._header_lines.append(headerline) + def _parse_header_line(self, header_line: str) -> None: + """Parses and saves information from the provided header line""" + + self._header_lines.append(header_line) # Header dict - rec_type = headerline[:3] - fields = headerline[3:].strip().split('\t') + 
rec_type = header_line[:3] + fields = header_line[3:].strip().split('\t') if rec_type == "@CO": self._header_dict[rec_type] = fields[0] @@ -129,56 +145,73 @@ def _parse_header_line(self, headerline: str): self._header_dict[rec_type] = \ {field[:2]: field[3:].strip() for field in fields} - def _determine_collapser_type(self, first_aln_line): - if re.match(r"\d+_count=\d+", first_aln_line) is not None: + def _determine_collapser_type(self, first_aln_line: str) -> None: + """Attempts to determine the collapsing utility that was used before producing the + input alignment file, then checks basic requirements for the utility's outputs.""" + + if re.match(_re_tiny, first_aln_line) is not None: self.collapser_type = "tiny-collapse" + elif re.match(_re_fastx, first_aln_line) is not None: self.collapser_type = "fastx" + sort_order = self._header_dict.get('@HD', {}).get('SO', None) if sort_order is None or sort_order != "queryname": raise ValueError("SAM files from fastx collapsed outputs must be sorted by queryname\n" "(and the @HD [...] SO header must be set accordingly).") else: self.collapser_type = None + if self.decollapse: self.decollapse = False print("Alignments do not appear to be derived from a supported collapser input. 
" "Decollapsed SAM files will therefore not be produced.", file=sys.stderr) - def _get_decollapsed_filename(self): + def _get_decollapsed_filename(self) -> str: + """Returns the filename to be used for writing decollapsed outputs""" + if self._decollapsed_filename is None: basename = os.path.splitext(os.path.basename(self.file))[0] self._decollapsed_filename = make_filename([basename, "decollapsed"], ext='.sam') return self._decollapsed_filename - def _write_header_for_decollapsed_sam(self): + def _write_header_for_decollapsed_sam(self) -> None: + """Writes the input file's header lines to the decollapsed output file""" + assert self.collapser_type is not None with open(self._get_decollapsed_filename(), 'w') as f: f.writelines(self._header_lines) - def _write_decollapsed_sam(self): + def _write_decollapsed_sam(self) -> None: + """Expands the collapsed alignments in the _decollapsed_reads buffer + and writes the result to the decollapsed output file""" + assert self.collapser_type is not None - aln_out, prevname, seq_count = [], None, 0 - token = self.get_counts_split_token() + aln_out, prev_name, seq_count = [], None, 0 for name, line in self._decollapsed_reads: # Parse count just once per multi-alignment - if name != prevname: - seq_count = int(name.split(token)[1]) + if name != prev_name: + seq_count = int(name.split(self.collapser_token)[-1]) aln_out.extend([line] * seq_count) - prevname = name + prev_name = name with open(self._get_decollapsed_filename(), 'ab') as sam_o: sam_o.writelines(aln_out) self._decollapsed_reads.clear() - def get_counts_split_token(self): - if self.collapser_type == "tiny-collapse": - return b"=" - elif self.collapser_type == "fastx": - return b"_x" - elif self.collapser_type is None: - return None + @property + def collapser_token(self) -> bytes: + """Returns the split token to be used for determining read count from the QNAME field""" + + if self._collapser_token is None: + self._collapser_token = { + "tiny-collapse": b"=", + "fastx": 
b"_x", + "BioSeqZip": b":" + }[self.collapser_type] + + return self._collapser_token def infer_strandedness(sam_file: str, intervals: dict) -> str: diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 6f963ff7..0dd480fc 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -40,20 +40,13 @@ def __init__(self, out_prefix: str = None, report_diags: bool = False, **prefs): def assign_library(self, library: dict): self.library = library - def count_bundle(self, aln_bundle: iter) -> dict: + def count_bundle(self, aln_bundle: iter, read_counts: int) -> dict: """Called for each multiple-alignment bundle before it is processed""" bundle_read = aln_bundle[0] loci_counts = len(aln_bundle) nt5, seqlen = bundle_read['nt5'], len(bundle_read['seq']) - try: - read_counts = int(bundle_read['name'].split('=')[1]) - except IndexError: - # Check if fastx collapsed; default to count=1 - fastx = re.match(_re_fastx, bundle_read['name']) - read_counts = 1 if fastx is None else int(fastx[1]) - # Calculate counts for multi-mapping corr_counts = read_counts / loci_counts From fff006c8b5807da5e89e5891794a8e3d0fc2f39b Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 31 Jul 2022 12:19:41 -0700 Subject: [PATCH 10/15] Added an additional test to check that a read count of 1 is reported for non-collapsed SAM records --- tests/testdata/counter/non-collapsed.sam | 2 +- tests/unit_tests_hts_parsing.py | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/testdata/counter/non-collapsed.sam b/tests/testdata/counter/non-collapsed.sam index cadeadb6..94e2d107 100644 --- a/tests/testdata/counter/non-collapsed.sam +++ b/tests/testdata/counter/non-collapsed.sam @@ -1,2 +1,2 @@ @SQ SN:I LN:21 -NON_COLLAPSED_QNAME 16 I 15064570 255 21M * 0 0 CAAGACAGAGCTTCACCGTTC IIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:21 NM:i:0 XM:i:2 \ No newline at end of file +NON_COLLAPSED_QNAME 16 I 15064570 255 21M * 
0 0 CAAGACAGAGCTTCACCGTTC IIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:21 NM:i:0 XM:i:2 diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index 12002f49..a62bd0ec 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -64,7 +64,7 @@ def exhaust_iterator(self, it): """Did SAM_reader correctly skip header values and parse all pertinent info from a single record SAM file?""" def test_sam_reader(self): - sam_bundle = next(SAM_reader().bundle_multi_alignments(self.short_sam_file)) + sam_bundle, read_count = next(SAM_reader().bundle_multi_alignments(self.short_sam_file)) sam_record = sam_bundle[0] self.assertEqual(sam_record['chrom'], "I") @@ -604,6 +604,23 @@ def test_SAM_reader_parse_alignments_decollapse(self): self.exhaust_iterator(reader._parse_alignments(sam_in)) write_fn.assert_called_once() + """Does SAM_reader report a single read count for non-collapsed SAM records?""" + + def test_SAM_reader_single_readcount_non_collapsed_SAM(self): + # Read non-collapsed.sam but duplicate its single record twice + with open(f"{resources}/non-collapsed.sam", 'rb') as f: + sam_lines = f.readlines() + sam_lines.extend([sam_lines[1]] * 2) + mock_file = mock_open(read_data=b''.join(sam_lines)) + + with patch('tiny.rna.counter.hts_parsing.open', new=mock_file): + reader = SAM_reader() + bundle, read_count = next(reader.bundle_multi_alignments('mock_file')) + + self.assertEqual(bundle[0]['name'], b'NON_COLLAPSED_QNAME') + self.assertEqual(len(bundle), 3) + self.assertEqual(read_count, 1) + """Are decollapsed outputs skipped when non-collapsed SAM files are supplied?""" def test_SAM_reader_no_decollapse_non_collapsed_SAM_files(self): From df56f065c605d4eb18f064da74d90207786b2d90 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 31 Jul 2022 12:44:32 -0700 Subject: [PATCH 11/15] Irrelevant information was accidentally included in a prior commit --- START_HERE/paths.yml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml index 33ad4c32..c83a7607 100644 --- a/START_HERE/paths.yml +++ b/START_HERE/paths.yml @@ -36,7 +36,7 @@ tmp_directory: ##-- The prefix for your bowtie index, include relative path (relative to this config file) --## ##-- If you do not have a bowtie index, change this to ebwt: '' -ebwt: /Users/alex/Projects/PycharmProjects/tinyrna/START_HERE/tinyrna_2022-07-26_13-35-30_run_directory/bowtie-build/ram1 +ebwt: '' ##-- If you do not have a bowtie index, provide your reference genome file(s) here --## ##-- One file per line, with "- " at the beginning (think: bulleted list) --## From e304c19557dab8c64be52315d46a295f97c6db32 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 4 Aug 2022 10:17:21 -0700 Subject: [PATCH 12/15] Detail polish --- tiny/rna/counter/statistics.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 0dd480fc..48443d1b 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -1,5 +1,3 @@ -import re - import pandas as pd import mmap import json @@ -12,7 +10,6 @@ from typing import Tuple, Optional, Union from collections import Counter, defaultdict -from tiny.rna.counter.hts_parsing import _re_fastx from ..util import make_filename @@ -45,10 +42,8 @@ def count_bundle(self, aln_bundle: iter, read_counts: int) -> dict: bundle_read = aln_bundle[0] loci_counts = len(aln_bundle) - nt5, seqlen = bundle_read['nt5'], len(bundle_read['seq']) - - # Calculate counts for multi-mapping corr_counts = read_counts / loci_counts + nt5, seqlen = bundle_read['nt5'], len(bundle_read['seq']) # Fill in 5p nt/length matrix self.mapped_nt_len[nt5][seqlen] += read_counts From 6e7c0205f1c3a65c7f9e593314241891a7946423 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 4 Aug 2022 11:15:43 -0700 Subject: [PATCH 13/15] Detail polish after resolving 
merge conflicts --- tiny/entry.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tiny/entry.py b/tiny/entry.py index 02410d68..9db63c74 100644 --- a/tiny/entry.py +++ b/tiny/entry.py @@ -190,13 +190,15 @@ def run_cwltool_native(config_object: 'ConfigBase', workflow: str, run_directory config_object: a constructed ConfigBase-derived object workflow: the path to the workflow to be executed run_directory: the destination folder for workflow output subdirectories (default: CWD) - parallel: process libraries in parallel where possible - verbosity: controls the depth of information written to terminal by cwltool Returns: None """ + tmpdir = config_object['tmp_directory'] + parallel = config_object['run_parallel'] + verbosity = config_object['verbosity'] + def furnish_if_file_record(file_dict): if isinstance(file_dict, dict) and file_dict.get('class', None) == 'File': file_dict['basename'] = os.path.basename(file_dict['path']) @@ -212,7 +214,6 @@ def furnish_if_file_record(file_dict): furnish_if_file_record(config_param) # Set overall config for cwltool - verbosity = config_object['verbosity'] runtime_context = RuntimeContext({ 'secret_store': cwltool.secrets.SecretStore(), 'outdir': run_directory, @@ -221,8 +222,10 @@ def furnish_if_file_record(file_dict): 'debug': verbosity == "debug" }) - # Set proper temp directory for Mac users - if sys.platform == "darwin": + # Set temp directory for intermediate outputs + if tmpdir is not None: + runtime_context.tmp_outdir_prefix = tmpdir + elif sys.platform == "darwin": default_mac_path = "/private/tmp/docker_tmp" if runtime_context.tmp_outdir_prefix == DEFAULT_TMP_PREFIX: runtime_context.tmp_outdir_prefix = default_mac_path From 1c600b794cf63533e44fa0cc3bc843596cd50248 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 9 Aug 2022 10:57:33 -0700 Subject: [PATCH 14/15] Documentation update to indicate that third party and non-collapsed SAM inputs are now accepted by 
tiny-count. Requested changes have been made to tiny-count's command line arguments. The helpstring has also been updated to indicate that the --decollapse option will be ignored for non-collapsed inputs. --- doc/Parameters.md | 17 +++++++++-------- doc/tiny-count.md | 2 +- tiny/rna/counter/counter.py | 8 ++++---- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/doc/Parameters.md b/doc/Parameters.md index dd1560a1..aec57919 100644 --- a/doc/Parameters.md +++ b/doc/Parameters.md @@ -75,14 +75,14 @@ By default, tiny-count will only evaluate alignments to features which match a ` |----------------------------|---------------------------| | counter-normalize-by-hits: | `--normalize-by-hits T/F` | -By default, tiny-count will divide the number of counts associated with each sequence, twice, before they are assigned to a feature. Each unique sequence's count is determined by tiny-collapse and is preserved through the alignment process. The original count is divided first by the number of loci that the sequence aligns to, and second by the number of features passing selection at each locus. Switching this option "off" disables the latter normalization step. +By default, tiny-count will divide the number of counts associated with each sequence, twice, before they are assigned to a feature. Each unique sequence's count is determined by tiny-collapse (or a compatible collapsing utility) and is preserved through the alignment process. The original count is divided first by the number of loci that the sequence aligns to, and second by the number of features passing selection at each locus. Switching this option "off" disables the latter normalization step. 
### Decollapse | Run Config Key | Commandline Argument | |---------------------|------------------------| | counter_decollapse: | `--decollapse` | -The SAM files produced by the tinyRNA pipeline are collapsed by default; alignments sharing a SEQ field are strictly multi-alignments and do not reflect original sequence counts. If this option is switched "on", tiny-count will produce a decollapsed copy of each input SAM file. Each alignment in the decollapsed SAM will be duplicated by the sequence's original count. This is useful for browsing in IGV. +The SAM files produced by the tinyRNA pipeline are collapsed by default; alignments sharing a SEQ field are strictly multi-alignments and do not reflect original sequence counts. If this option is switched "on", tiny-count will produce a decollapsed copy of each input SAM file. Each alignment in the decollapsed SAM will be duplicated by the sequence's original count. This is useful for browsing in IGV. If non-collapsed inputs are provided to tiny-count in standalone mode, this option will be ignored. ### Filters | Run Config Key | Commandline Argument | @@ -108,9 +108,9 @@ Diagnostic information will include intermediate alignment files for each librar ### Full tiny-count Help String ``` -tiny-count -i SAMPLES -c CONFIGFILE -o OUTPUTPREFIX [-h] - [-sf [SOURCE [SOURCE ...]]] [-tf [TYPE [TYPE ...]]] - [-nh T/F] [-dc] [-a] [-p] [-d] +tiny-count -i SAMPLES -f FEATURES -o OUTPUTPREFIX [-h] + [-sf [SOURCE ...]] [-tf [TYPE ...]] [-nh T/F] [-dc] [-a] + [-p] [-d] This submodule assigns feature counts for SAM alignments using a Feature Sheet ruleset. If you find that you are sourcing all of your input files from a @@ -118,9 +118,9 @@ prior run, we recommend that you instead run `tiny recount` within that run's directory. 
Required arguments: - -i SAMPLES, --input-csv SAMPLES + -i SAMPLES, --samples-csv SAMPLES your Samples Sheet - -c CONFIGFILE, --config CONFIGFILE + -f FEATURES, --features-csv FEATURES your Features Sheet -o OUTPUTPREFIX, --out-prefix OUTPUTPREFIX output prefix to use for file names @@ -137,7 +137,8 @@ Optional arguments: If T/true, normalize counts by (selected) overlapping feature counts. Default: true. -dc, --decollapse Create a decollapsed copy of all SAM files listed in - your Samples Sheet. + your Samples Sheet. This option is ignored for non- + collapsed inputs. -a, --all-features Represent all features in output counts table, even if they did not match a Select for / with value. -p, --is-pipeline Indicates that tiny-count was invoked as part of a diff --git a/doc/tiny-count.md b/doc/tiny-count.md index a647a247..111fa33d 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -7,7 +7,7 @@ For an explanation of tiny-count's parameters in the Run Config and by commandli tiny-count offers a variety of options for refining your analysis. You might find that repeat analyses are required while tuning these options to your goals. However, the earlier pipeline steps are resource and time intensive, so it is inconvenient to rerun an end-to-end analysis to test new selection rules. Using the command `tiny recount`, tinyRNA will run the workflow starting at the tiny-count step using inputs from a prior end-to-end run. See the [pipeline resume documentation](Pipeline.md#resuming-a-prior-analysis) for details and prerequesites. ## Running as a Standalone Tool -If you would like to run tiny-count as a standalone tool, not as part of an end-to-end or resumed analysis, you can do so with the command `tiny-count`. The command requires that you specify the paths to your Samples Sheet and Features Sheet, and a filename prefix for outputs. [All other arguments are optional](Parameters.md#full-tiny-count-help-string). 
You will need to make a copy of your Samples Sheet and modify it so that the `Input FASTQ Files` column instead contains paths to the corresponding SAM files from a prior end-to-end run. SAM files from non-tinyRNA sources are not currently supported as the tiny-collapse step is critical to its function. +If you would like to run tiny-count as a standalone tool, not as part of an end-to-end or resumed analysis, you can do so with the command `tiny-count`. The command requires that you specify the paths to your Samples Sheet and Features Sheet, and a filename prefix for outputs. [All other arguments are optional](Parameters.md#full-tiny-count-help-string). You will need to make a copy of your Samples Sheet and modify it so that the `Input FASTQ Files` column instead contains paths to the corresponding SAM files from a prior end-to-end run. SAM files from third party sources are also supported, and can be produced from reads collapsed by tiny-collapse or fastx, or from non-collapsed reads. SAM files from non-collapsed reads will result in substantially higher resource usage and runtimes, so we strongly recommend collapsing prior to alignment. >**Important:** reusing the same output filename prefix between standalone runs will result in prior outputs being overwritten. 
diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index 7733907e..a6d83855 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -30,9 +30,9 @@ def get_args(): optional_args = arg_parser.add_argument_group("Optional arguments") # Required arguments - required_args.add_argument('-i', '--input-csv', metavar='SAMPLES', required=True, + required_args.add_argument('-i', '--samples-csv', metavar='SAMPLES', required=True, help='your Samples Sheet') - required_args.add_argument('-c', '--config', metavar='CONFIGFILE', required=True, + required_args.add_argument('-f', '--features-csv', metavar='FEATURES', required=True, help='your Features Sheet') required_args.add_argument('-o', '--out-prefix', metavar='OUTPUTPREFIX', required=True, help='output prefix to use for file names') @@ -49,8 +49,8 @@ def get_args(): help='If T/true, normalize counts by (selected) ' 'overlapping feature counts. Default: true.') optional_args.add_argument('-dc', '--decollapse', action='store_true', - help='Create a decollapsed copy of all SAM ' - 'files listed in your Samples Sheet.') + help='Create a decollapsed copy of all SAM files listed in your ' + 'Samples Sheet. This option is ignored for non-collapsed inputs.') optional_args.add_argument('-a', '--all-features', action='store_true', help='Represent all features in output counts table, ' 'even if they did not match a Select for / with value.') From 4993b88f6a71353821e2604feb7672b7e2aeb69c Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 10 Aug 2022 17:49:21 -0700 Subject: [PATCH 15/15] Expanded on the "non-collapsed SAM file caveats" in tiny-count.md. 
The description now addresses the effect these files will have on sequence-related stats --- doc/tiny-count.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/tiny-count.md b/doc/tiny-count.md index 111fa33d..b769df31 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -7,10 +7,12 @@ For an explanation of tiny-count's parameters in the Run Config and by commandli tiny-count offers a variety of options for refining your analysis. You might find that repeat analyses are required while tuning these options to your goals. However, the earlier pipeline steps are resource and time intensive, so it is inconvenient to rerun an end-to-end analysis to test new selection rules. Using the command `tiny recount`, tinyRNA will run the workflow starting at the tiny-count step using inputs from a prior end-to-end run. See the [pipeline resume documentation](Pipeline.md#resuming-a-prior-analysis) for details and prerequesites. ## Running as a Standalone Tool -If you would like to run tiny-count as a standalone tool, not as part of an end-to-end or resumed analysis, you can do so with the command `tiny-count`. The command requires that you specify the paths to your Samples Sheet and Features Sheet, and a filename prefix for outputs. [All other arguments are optional](Parameters.md#full-tiny-count-help-string). You will need to make a copy of your Samples Sheet and modify it so that the `Input FASTQ Files` column instead contains paths to the corresponding SAM files from a prior end-to-end run. SAM files from third party sources are also supported, and can be produced from reads collapsed by tiny-collapse or fastx, or from non-collapsed reads. SAM files from non-collapsed reads will result in substantially higher resource usage and runtimes, so we strongly recommend collapsing prior to alignment. +If you would like to run tiny-count as a standalone tool, not as part of an end-to-end or resumed analysis, you can do so with the command `tiny-count`. 
The command requires that you specify the paths to your Samples Sheet and Features Sheet, and a filename prefix for outputs. [All other arguments are optional](Parameters.md#full-tiny-count-help-string). You will need to make a copy of your Samples Sheet and modify it so that the `Input FASTQ Files` column instead contains paths to the corresponding SAM files from a prior end-to-end run. SAM files from third party sources are also supported, and can be produced from reads collapsed by tiny-collapse or fastx, or from non-collapsed reads. >**Important:** reusing the same output filename prefix between standalone runs will result in prior outputs being overwritten. +#### Using Non-collapsed Sequence Alignments +While third-party SAM files from non-collapsed reads are supported, there are some caveats. These files will result in substantially higher resource usage and runtimes; we strongly recommend collapsing prior to alignment. Additionally, the sequence-related stats produced by tiny-count will no longer represent _unique_ sequences. These stats will instead refer to all sequences with unique QNAMEs (that is, multi-alignment bundles still carry a sequence count of 1). # Feature Selection