Skip to content

Commit

Permalink
Bc generalize attach barcodes (#57)
Browse files Browse the repository at this point in the history
* first draft for generalizing attaching barcodes

* updated generalized class

* Fixed PR #1

* styling comment changes

* styling comment changes

* styling comment changes

* barcode end postion bug

* Fixed PR #2

* Fixed PR #2, made arg validation more human readable

* add comments/documentation to the class

* updated comments/documentation style for return values

* Fixed PR #57 comments

* Updated doc strings, error handling

* provided descriptions for input files and purpose of class

* updated class description
  • Loading branch information
benjamincarlin committed Mar 11, 2019
1 parent ad3fbc5 commit 09eb7f4
Show file tree
Hide file tree
Showing 2 changed files with 316 additions and 0 deletions.
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
],
entry_points={
'console_scripts': [
'AttachBarcodes = sctools.platform:BarcodePlatform.attach_barcodes',
'Attach10xBarcodes = sctools.platform:TenXV2.attach_barcodes',
'SplitBam = sctools.platform:GenericPlatform.split_bam',
'CalculateGeneMetrics = sctools.platform:GenericPlatform.calculate_gene_metrics',
Expand Down
315 changes: 315 additions & 0 deletions src/sctools/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,3 +629,318 @@ def attach_barcodes(cls, args=None):
cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators)

return 0


class BarcodePlatform(GenericPlatform):
"""Command Line Interface for extracting and attaching barcodes with specified positions
generalizing TenXV2 attach barcodes
Sample, cell and/or molecule barcodes can be extracted and attached to an unmapped bam when the
corresponding barcode's start position and and length are provided. The sample barcode is extracted
from the index i7 fastq file and the cell and molecule barcode are extracted from the r1 fastq file
This class defines several methods that are created as CLI tools when sctools is installed
(see setup.py)
Attributes
----------
cell_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the cell barcode and the tags to
assign the sequence and quality of the cell barcode
molecule_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the molecule barcode and the tags
to assign the sequence and quality of the molecule barcode
sample_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the sample barcode and the tags
to assign the sequence and quality of the sample barcode
Methods
-------
attach_barcodes()
Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse
(r2) bam file
"""
cell_barcode = None
molecule_barcode = None
sample_barcode = None

@classmethod
def _validate_barcode_args(cls, args):
"""Validates that the barcode start position is greater than 0
Parameters
----------
args : object
arguments list, The default value of None, when passed to `parser.parse_args`
causes the parser to read `sys.argv`
Returns
-------
args : object
return arguments list if valid
"""
# check that if a barcode start position is provided, its length is also (and vice versa)
cls._validate_barcode_length_and_position(args.cell_barcode_start_pos, args.cell_barcode_length)
cls._validate_barcode_length_and_position(args.molecule_barcode_start_pos, args.molecule_barcode_length)
cls._validate_barcode_length_and_position(args.sample_barcode_start_pos, args.sample_barcode_length)

# check that an index fastq is provided sample barcode length and position are given
if args.i1 is None and args.sample_barcode_length:
raise argparse.ArgumentError('An i7 index fastq file must be given to attach a sample barcode')

# check that cell and molecule barcodes don't overlap
if args.cell_barcode_length and args.molecule_barcode_length:
cls._validate_barcode_input(args.molecule_barcode_start_pos,
args.cell_barcode_start_pos + args.cell_barcode_length)

return args

@classmethod
def _validate_barcode_length_and_position(cls, barcode_start_position, barcode_length):
"""Checks that either that both barcode length and position are given or that neither are given as arguments
Parameters
----------
barcode_start_position : int
the user defined start position (base pairs) of the barcode
barcode_length : int
the user defined length (base pairs) of the barcode
Returns
-------
given_value : int
return given value if valid
"""
barcode_start_pos_exists = bool(barcode_start_position) or (barcode_start_position == 0)
barcode_length_exists = bool(barcode_length)
# (XOR boolean logic)
if (barcode_start_pos_exists != barcode_length_exists):
raise argparse.ArgumentError('Invalid position/length, both position and length must be provided by the user together')

@classmethod
def _validate_barcode_input(cls, given_value, min_value):
"""Validates that the barcode input is greater than a min value
Parameters
----------
given_value : int
the given value that must be greater than the min_value,
(barcode length or barcode starting position)
min_value : int
the min value that the given_value must be greater than
Returns
-------
given_value : int
return given value if valid
"""
if given_value < min_value:
raise argparse.ArgumentTypeError('Invalid barcode length/position')
return given_value

@classmethod
def _validate_barcode_start_pos(cls, given_value):
"""Validates that the barcode start position is greater than 0
Parameters
----------
given_value : Union[int, str]
the given start position of the barcode to validate
Returns
-------
given_value : int
returns the start position if it is valid
"""
return cls._validate_barcode_input(int(given_value), 0)

@classmethod
def _validate_barcode_length(cls, given_value):
"""Validates that the barcode length is greater than 1
Parameters
----------
given_value : Union[int, str]
the given length of the barcode to validate
Returns
-------
given_value : int
returns the length if it is valid
"""
return cls._validate_barcode_input(int(given_value), 1)

@classmethod
def _tag_bamfile(cls,
input_bamfile_name: str,
output_bamfile_name: str,
tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator]) -> None:
"""Adds tags from fastq file(s) to a bam file.
Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from
`input_bamfile_name`, and writes the result to `output_bamfile_name`
Parameters
----------
input_bamfile_name : str
input bam
output_bamfile_name : str
output bam
tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator]
Iterable of generators that yield barcodes from fastq files
"""
bam_tagger = bam.Tagger(input_bamfile_name)
bam_tagger.tag(output_bamfile_name, tag_generators)

@classmethod
def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.EmbeddedBarcodeGenerator]:
"""Create tag generators from fastq files.
Tag generators are iterators that run over fastq records, they extract and yield all of the
barcodes embedded in each fastq record. This means extracting the cell, umi, and/or the sample barcode.
Parameters
----------
r1 : str
forward fastq file, where possibly the cell and/or molecule barcode is found
i1 : str, optional
index fastq file, where the sample barcode is found
whitelist : str, optional
A file that contains a list of acceptable cell barcodes
Returns
-------
tag_generators : List[EmbeddedBarcodeGenerator]
EmbeddedBarcodeGenerators containing barcodes from the given fastq
"""
tag_generators = []
barcode_args = {'fastq_files': r1}

if i1:
barcode_args['embedded_barcodes'] = [cls.sample_barcode]
tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args))

if whitelist:
barcode_args['whitelist'] = whitelist
if cls.cell_barcode:
barcode_args['embedded_cell_barcode'] = cls.cell_barcode
if cls.molecule_barcode:
barcode_args['other_embedded_barcodes'] = cls.molecule_barcode
tag_generators.append(fastq.BarcodeGeneratorWithCorrectedCellBarcodes(**barcode_args))

else:
# for all the barcodes that have a length and starting position specified
barcode_args['embedded_barcodes'] = [barcode for barcode in [cls.cell_barcode, cls.molecule_barcode] if barcode]
tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args))

return tag_generators

@classmethod
def attach_barcodes(cls, args=None):
"""Command line entrypoint for attaching barcodes to a bamfile.
Parameters
----------
args : Iterable[str], optional
arguments list, The default value of None, when passed to `parser.parse_args`
causes the parser to read `sys.argv`
Returns
-------
return_call : 0
return call if the program completes successfully
"""
parser = argparse.ArgumentParser()
parser.add_argument('--r1',
required=True,
help='read 1 fastq file, where the cell and molecule barcode is found')
parser.add_argument('--u2',
required=True,
help='unaligned bam, can be converted from fastq read 2'
'using picard FastqToSam')
parser.add_argument('-o',
'--output-bamfile',
required=True,
help='filename for tagged bam')
parser.add_argument('-w',
'--whitelist',
default=None,
help='optional cell barcode whitelist. If provided, corrected barcodes '
'will also be output when barcodes are observed within 1ED of a '
'whitelisted barcode')
parser.add_argument('--i1',
default=None,
help='(optional) i7 index fastq file, where the sample barcode is found')
parser.add_argument('--sample-barcode-start-position',
dest='sample_barcode_start_pos',
default=None,
help='the user defined start position (base pairs) of the sample barcode',
type=cls._validate_barcode_start_pos)
parser.add_argument('--sample-barcode-length',
dest='sample_barcode_length',
default=None,
help='the user defined length (base pairs) of the sample barcode',
type=cls._validate_barcode_length)
parser.add_argument('--cell-barcode-start-position',
dest='cell_barcode_start_pos',
default=None,
help='the user defined start position, in base pairs, of the cell barcode',
type=cls._validate_barcode_start_pos)
parser.add_argument('--cell-barcode-length',
dest='cell_barcode_length',
default=None,
help='the user defined length, in base pairs, of the cell barcode',
type=cls._validate_barcode_length)
parser.add_argument('--molecule-barcode-start-position',
dest='molecule_barcode_start_pos',
default=None,
help='the user defined start position, in base pairs, of the molecule barcode '
'(must be not overlap cell barcode if cell barcode is provided)',
type=cls._validate_barcode_start_pos)
parser.add_argument('--molecule-barcode-length',
dest='molecule_barcode_length',
default=None,
help='the user defined length, in base pairs, of the molecule barcode',
type=cls._validate_barcode_length)

# parse and validate the args
if args:
args = parser.parse_args(args)
else:
args = parser.parse_args()
cls._validate_barcode_args(args)

# if the length and there for the start pos have been given as args
# get the appropriate barcodes
if args.cell_barcode_length:
cls.cell_barcode = fastq.EmbeddedBarcode(start=args.cell_barcode_start_pos,
end=args.cell_barcode_start_pos + args.cell_barcode_length,
quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY)
if args.molecule_barcode_length:
cls.molecule_barcode = fastq.EmbeddedBarcode(start=args.molecule_barcode_start_pos,
end=args.molecule_barcode_start_pos + args.molecule_barcode_length,
quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY)
if args.sample_barcode_length:
cls.sample_barcode = fastq.EmbeddedBarcode(start=args.sample_barcode_start_pos,
end=args.sample_barcode_start_pos + args.sample_barcode_length,
quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY)

# make the tags and attach the barcodes
tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist)
cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators)

return 0

0 comments on commit 09eb7f4

Please sign in to comment.