diff --git a/Bio/AlignIO/MafIO.py b/Bio/AlignIO/MafIO.py index 8f4a26ad9df..b358084273c 100644 --- a/Bio/AlignIO/MafIO.py +++ b/Bio/AlignIO/MafIO.py @@ -1,6 +1,7 @@ # Copyright 2011, 2012 by Andrew Sczesnak. All rights reserved. # Revisions Copyright 2011, 2017 by Peter Cock. All rights reserved. # Revisions Copyright 2014, 2015 by Adam Novak. All rights reserved. +# Revisions Copyright 2015, 2017 by Blaise Li. All rights reserved. # # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included @@ -17,9 +18,58 @@ You are expected to use this module via the Bio.AlignIO functions(or the Bio.SeqIO functions if you want to work directly with the gapped sequences). +According to the MAF format documentation +(https://genome.ucsc.edu/FAQ/FAQformat.html#format5), alignment blocks are +composed of sequence lines: +----- + Lines starting with 's' -- a sequence within an alignment block + + s hg16.chr7 27707221 13 + 158545518 gcagctgaaaaca + s panTro1.chr6 28869787 13 + 161576975 gcagctgaaaaca + s baboon 249182 13 + 4622798 gcagctgaaaaca + s mm4.chr6 53310102 13 + 151104725 ACAGCTGAAAATA + + The 's' lines together with the 'a' lines define a multiple alignment. The 's' + lines have the following fields which are defined by position rather than + name=value pairs. + + * src -- The name of one of the source sequences for the alignment. For + sequences that are resident in a browser assembly, the form + 'database.chromosome' allows automatic creation of links to other assemblies. + Non-browser sequences are typically reference by the species name alone. + + * start -- The start of the aligning region in the source sequence. This is a + zero-based number. If the strand field is '-' then this is the start relative + to the reverse-complemented source sequence (see Coordinate Transforms). + + * size -- The size of the aligning region in the source sequence. This number + is equal to the number of non-dash characters in the alignment text field + below. + + * strand -- Either '+' or '-'. If '-', then the alignment is to the + reverse-complemented source. + + * srcSize -- The size of the entire source sequence, not just the parts + involved in the alignment. + + * text -- The nucleotides (or amino acids) in the alignment and any insertions (dashes) as well. +----- + +We see that the coordinates in the MAF format are defined in terms of +zero-based start positions and aligning region sizes. + +A minimal aligned region of length one and starting at first position in the +source sequence would have start == 0 and size == 1. + +As we can see on this example, start + size will give one more than the +zero-based end position. We can therefore manipulate start and start + size as +python list slice boundaries. + +If we want an inclusive end coordinate, we need to use end = start + size - 1: +a 1-column wide alignment would have start == end. """ import os -import itertools +from itertools import islice try: from sqlite3 import dbapi2 as _sqlite @@ -40,7 +90,6 @@ class MafWriter(SequentialAlignmentWriter): """Accepts a MultipleSeqAlignment object, writes a MAF file""" - def write_header(self): """Writes the MAF header""" self.handle.write("##maf version=1 scoring=none\n") @@ -54,6 +103,7 @@ def _write_record(self, record): elif record.annotations.get("strand") == -1: strand = "-" else: + # TODO: issue warning? 
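+            # For example, a record created without any "strand" annotation,
+            # such as (hypothetical values, for illustration only):
+            #     SeqRecord(Seq("ACTG"), id="hg38.chr1",
+            #               annotations={"start": 0, "size": 4, "srcSize": 100})
+            # falls through to this branch and is written as forward-strand.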
strand = "+" fields = ["s", @@ -64,7 +114,7 @@ def _write_record(self, record): strand, "%15s" % record.annotations.get("srcSize", 0), str(record.seq)] - self.handle.write(" ".join(fields) + "\n") + self.handle.write("%s\n" % " ".join(fields)) def write_alignment(self, alignment): """ @@ -83,7 +133,9 @@ def write_alignment(self, alignment): # for now, use ._annotations private property, but restrict keys to those # specifically supported by the MAF format, according to spec try: - anno = " ".join(["%s=%s" % (x, y) for x, y in alignment._annotations.items() if x in ("score", "pass")]) + anno = " ".join(["%s=%s" % (x, y) + for x, y in alignment._annotations.items() + if x in ("score", "pass")]) except AttributeError: anno = "score=0.00" @@ -101,6 +153,8 @@ def write_alignment(self, alignment): return recs_out +# Invalid function name according to pylint, but kept for compatibility +# with Bio* conventions. def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet): """ Iterates over lines in a MAF file-like object (handle), yielding @@ -133,6 +187,7 @@ def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet): elif line_split[4] == "-": strand = -1 else: + # TODO: issue warning, set to 0? strand = 1 # s (literal), src (ID), start, size, strand, srcSize, text (sequence) @@ -143,7 +198,7 @@ def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet): sequence = line_split[6] - # interpret a dot/period to mean same the first sequence + # interpret a dot/period to mean the same as the first sequence if "." in sequence: if not records: raise ValueError("Found dot/period in first sequence of alignment") @@ -151,16 +206,16 @@ def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet): ref = str(records[0].seq) new = [] - for (s, r) in zip(sequence, ref): - new.append(r if s == "." else s) + for (letter, ref_letter) in zip(sequence, ref): + new.append(ref_letter if letter == "." else letter) sequence = "".join(new) records.append(SeqRecord(Seq(sequence, alphabet), - id=line_split[1], - name=line_split[1], - description="", - annotations=anno)) + id=line_split[1], + name=line_split[1], + description="", + annotations=anno)) elif line.startswith("i"): # TODO: information about what is in the aligned species DNA before # and after the immediately preceding "s" line @@ -175,6 +230,11 @@ def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet): # Can then store in each SeqRecord's .letter_annotations dictionary, # perhaps as the raw string or turned into integers / None for gap? pass + elif line.startswith("#"): + # ignore comments + # (not sure whether comments + # are in the maf specification, though) + pass elif not line.strip(): # end a bundle of records if seq_count is not None: @@ -197,11 +257,10 @@ def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet): elif line.startswith("a"): # start a bundle of records in_a_bundle = True - - if len(line.strip().split()[1:]) != line.count("="): + annot_strings = line.strip().split()[1:] + if len(annot_strings) != line.count("="): raise ValueError("Error parsing alignment - invalid key in 'a' line") - - annotations = dict([x.split("=") for x in line.strip().split()[1:]]) + annotations = dict([a_string.split("=") for a_string in annot_strings]) elif line.startswith("#"): # ignore comments pass @@ -210,6 +269,11 @@ def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet): class MafIndex(object): + """This is used as an index for a MAF file. 
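+
+    A rough usage sketch (the index file, MAF file and sequence name below
+    are made-up examples, not data shipped with Biopython)::
+
+        from Bio.AlignIO.MafIO import MafIndex
+
+        idx = MafIndex("ucsc_mm9_chr10.mafindex", "ucsc_mm9_chr10.maf",
+                       "mm9.chr10")
+        # starts/ends are 0-based, half-open reference coordinates
+        spliced = idx.get_spliced([3009860], [3013074])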
+ + The index is a sqlite3 database that is built upon creation of the object + if necessary, and queried when methods *search* or *get_spliced* are + used.""" def __init__(self, sqlite_file, maf_file, target_seqname): """Indexes or loads the index of a MAF file""" self._target_seqname = target_seqname @@ -231,27 +295,37 @@ def __init__(self, sqlite_file, maf_file, target_seqname): def __check_existing_db(self): """Basic sanity checks upon loading an existing index""" try: - idx_version = int(self._con.execute("SELECT value FROM meta_data WHERE key = 'version'").fetchone()[0]) + idx_version = int(self._con.execute( + "SELECT value FROM meta_data WHERE key = 'version'").fetchone()[0]) if idx_version != MAFINDEX_VERSION: - raise ValueError("Index version (%s) incompatible with this version of MafIndex" % (idx_version,)) + raise ValueError("Index version (%s) incompatible with " + "this version of MafIndex" % idx_version) - filename = self._con.execute("SELECT value FROM meta_data WHERE key = 'filename'").fetchone()[0] + filename = self._con.execute( + "SELECT value FROM meta_data WHERE key = 'filename'").fetchone()[0] if filename != self._maf_file: - raise ValueError("Index uses a different file (%s != %s)" % (filename, self._maf_file)) + raise ValueError("Index uses a different file (%s != %s)" + % (filename, self._maf_file)) - db_target = self._con.execute("SELECT value FROM meta_data WHERE key = 'target_seqname'").fetchone()[0] + db_target = self._con.execute( + "SELECT value FROM meta_data WHERE key = 'target_seqname'").fetchone()[0] if db_target != self._target_seqname: - raise ValueError("Provided database indexed for %s, expected %s" % (db_target, self._target_seqname)) + raise ValueError("Provided database indexed for %s, expected %s" + % (db_target, self._target_seqname)) - record_count = int(self._con.execute("SELECT value FROM meta_data WHERE key = 'record_count'").fetchone()[0]) + record_count = int(self._con.execute( + "SELECT value FROM meta_data WHERE key = 'record_count'").fetchone()[0]) if record_count == -1: raise ValueError("Unfinished/partial database provided") - records_found = int(self._con.execute("SELECT COUNT(*) FROM offset_data").fetchone()[0]) + records_found = int(self._con.execute( + "SELECT COUNT(*) FROM offset_data").fetchone()[0]) if records_found != record_count: - raise ValueError("Expected %s records, found %s. Corrupt index?" % (record_count, records_found)) + raise ValueError("Expected %s records, found %s. Corrupt index?" 
+ % (record_count, records_found)) return records_found + except (_sqlite.OperationalError, _sqlite.DatabaseError) as err: raise ValueError("Problem with SQLite database: %s" % err) @@ -261,8 +335,10 @@ def __make_new_index(self): self._con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);") self._con.execute("INSERT INTO meta_data (key, value) VALUES ('version', 1);") self._con.execute("INSERT INTO meta_data (key, value) VALUES ('record_count', -1);") - self._con.execute("INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');" % (self._target_seqname,)) - self._con.execute("INSERT INTO meta_data (key, value) VALUES ('filename', '%s');" % (self._maf_file,)) + self._con.execute("INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');" % + (self._target_seqname,)) + self._con.execute("INSERT INTO meta_data (key, value) VALUES ('filename', '%s');" % + (self._maf_file,)) self._con.execute("CREATE TABLE offset_data (bin INTEGER, start INTEGER, end INTEGER, offset INTEGER);") insert_count = 0 @@ -271,13 +347,14 @@ def __make_new_index(self): mafindex_func = self.__maf_indexer() while True: - batch = list(itertools.islice(mafindex_func, 100)) + batch = list(islice(mafindex_func, 100)) if not batch: break - self._con.executemany("INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);", batch) + # batch is made from self.__maf_indexer(), + self._con.executemany( + "INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);", batch) self._con.commit() - insert_count += len(batch) # then make indexes on the relevant fields @@ -285,14 +362,20 @@ def __make_new_index(self): self._con.execute("CREATE INDEX IF NOT EXISTS start_index ON offset_data(start);") self._con.execute("CREATE INDEX IF NOT EXISTS end_index ON offset_data(end);") - self._con.execute("UPDATE meta_data SET value = '%s' WHERE key = 'record_count'" % (insert_count,)) + self._con.execute( + "UPDATE meta_data SET value = '%s' WHERE key = 'record_count'" % (insert_count,)) self._con.commit() return insert_count def __maf_indexer(self): - """Generator function, returns index information for each bundle""" + """Generator function, returns index information for each bundle. + + Yields index information for each bundle in the form of + (bin, start, end, offset) tuples where start and end are + 0-based inclusive coordinates. + """ line = self._maf_fp.readline() while line: @@ -305,7 +388,9 @@ def __maf_indexer(self): line = self._maf_fp.readline() if not line.strip() or line.startswith("a"): - raise ValueError("Target for indexing (%s) not found in this bundle" % (self._target_seqname,)) + # Empty line or new alignment record + raise ValueError("Target for indexing (%s) not found in this bundle" + % (self._target_seqname,)) elif line.startswith("s"): # s (literal), src (ID), start, size, strand, srcSize, text (sequence) line_split = line.strip().split() @@ -324,6 +409,7 @@ def __maf_indexer(self): line = self._maf_fp.readline() + # TODO: check coordinate correctness for the two bin-related static methods @staticmethod def _region2bin(start, end): """Finds bins that a region may belong to. @@ -344,25 +430,26 @@ def _region2bin(start, end): def _ucscbin(start, end): """Returns the smallest bin a given region will fit into. 
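+
+        For example (values worked out by hand as an illustration, not taken
+        from the UCSC page)::
+
+            _ucscbin(0, 1000)     # -> 585 (512 + 64 + 8 + 1, finest level)
+            _ucscbin(0, 200000)   # -> 73, the region crosses a 128 kb
+                                  #    boundary and needs a coarser bin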
- Taken from http://genomewiki.ucsc.edu/index.php/Bin_indexing_system + + Adapted from http://genomewiki.ucsc.edu/index.php/Bin_indexing_system """ - binOffsets = [512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0] + bin_offsets = [512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0] - _binFirstShift = 17 - _binNextShift = 3 + _bin_first_shift = 17 + _bin_next_shift = 3 - startBin = start - endBin = end - 1 + start_bin = start + end_bin = end - 1 - startBin >>= _binFirstShift - endBin >>= _binFirstShift + start_bin >>= _bin_first_shift + end_bin >>= _bin_first_shift - for i in range(0, len(binOffsets)): - if startBin == endBin: - return binOffsets[i] + startBin + for i in range(0, len(bin_offsets)): + if start_bin == end_bin: + return bin_offsets[i] + start_bin - startBin >>= _binNextShift - endBin >>= _binNextShift + start_bin >>= _bin_next_shift + end_bin >>= _bin_next_shift return 0 @@ -374,8 +461,13 @@ def _get_record(self, offset): def search(self, starts, ends): """Searches index database for MAF records overlapping ranges provided. - Returns results in order by start, then end, then internal offset - field. + Returns *MultipleSeqAlignment* results in order by start, then end, then + internal offset field. + + *starts* should be a list of 0-based start coordinates of segments in the reference. + *ends* should be the list of the corresponding segment ends + (in the half-open UCSC convention: + http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/). """ # verify the provided exon coordinates if len(starts) != len(ends): @@ -384,23 +476,33 @@ def search(self, starts, ends): for exonstart, exonend in zip(starts, ends): if exonstart >= exonend: raise ValueError("Exon coordinates invalid (%s >= %s)" % (exonstart, exonend)) - con = self._con # search for every exon - for exonstart, exonend in zip(starts, ends): try: possible_bins = ", ".join(map(str, self._region2bin(exonstart, exonend))) except TypeError: raise TypeError("Exon coordinates must be integers " - "(start=%s, end=%s)" % (exonstart, exonend)) + "(start=%d, end=%d)" % (exonstart, exonend)) + + # https://www.sqlite.org/lang_expr.html + # ----- + # The BETWEEN operator + # + # The BETWEEN operator is logically equivalent to a pair of + # comparisons. "x BETWEEN y AND z" is equivalent to "x>=y AND x<=z" + # except that with BETWEEN, the x expression is only evaluated + # once. The precedence of the BETWEEN operator is the same as the + # precedence as operators == and != and LIKE and groups left to + # right. 
+ # ----- result = con.execute("SELECT DISTINCT start, end, offset FROM " - "offset_data WHERE bin IN (%s) AND (end BETWEEN %s AND %s " - "OR %s BETWEEN start AND end) ORDER BY start, end, " - "offset ASC;" - % (possible_bins, exonstart, exonend, exonend)) + "offset_data WHERE bin IN (%s) AND (end BETWEEN %s AND %s " + "OR %s BETWEEN start AND end) ORDER BY start, end, " + "offset ASC;" + % (possible_bins, exonstart, exonend, exonend)) rows = result.fetchall() @@ -412,14 +514,13 @@ def search(self, starts, ends): for record in fetched: if record.id == self._target_seqname: + # start and size come from the maf lines start = record.annotations["start"] - end = record.annotations["start"] + \ - record.annotations["size"] + end = record.annotations["start"] + record.annotations["size"] if not (start == rec_start and end == rec_end): - raise ValueError( - "Expected %s-%s @ offset %s, found %s-%s" % - (rec_start, rec_end, offset, start, end)) + raise ValueError("Expected %s-%s @ offset %s, found %s-%s" % + (rec_start, rec_end, offset, start, end)) yield fetched @@ -427,18 +528,29 @@ def get_spliced(self, starts, ends, strand=1): """Returns a multiple alignment of the exact sequence range provided. Accepts two lists of start and end positions on target_seqname, representing - exons to be spliced in silico. Returns a MultipleSeqAlignment of the + exons to be spliced in silico. Returns a *MultipleSeqAlignment* of the desired sequences spliced together. + + *starts* should be a list of 0-based start coordinates of segments in the reference. + *ends* should be the list of the corresponding segment ends + (in the half-open UCSC convention: + http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/). + + To ask for the alignment portion corresponding to the first 100 + nucleotides of the reference sequence, you would use + ``search([0], [100])`` """ # validate strand if strand not in (1, -1): - raise ValueError("Strand must be 1 or -1") + raise ValueError("Strand must be 1 or -1, got %s" % str(strand)) # pull all alignments that span the desired intervals - fetched = [x for x in self.search(starts, ends)] + fetched = [multiseq for multiseq in self.search(starts, ends)] # keep track of the expected letter count - expected_letters = sum([y - x for x, y in zip(starts, ends)]) + # (sum of lengths of [start, end) segments, + # where [start, end) half-open) + expected_letters = sum([end - start for start, end in zip(starts, ends)]) # if there's no alignment, return filler for the assembly of the length given if len(fetched) == 0: @@ -446,10 +558,17 @@ def get_spliced(self, starts, ends, strand=1): id=self._target_seqname)]) # find the union of all IDs in these alignments - all_seqnames = set([x.id for y in fetched for x in y]) + all_seqnames = set( + [sequence.id for multiseq in fetched for sequence in multiseq]) # split every record by base position - split_by_position = dict([(x, {}) for x in all_seqnames]) + # key: sequence name + # value: dictionary + # key: position in the reference sequence + # value: letter(s) (including letters + # aligned to the "-" preceding the letter + # at the position in the reference, if any) + split_by_position = dict([(seq_name, {}) for seq_name in all_seqnames]) # keep track of what the total number of (unspliced) letters should be total_rec_length = 0 @@ -469,12 +588,13 @@ def get_spliced(self, starts, ends, strand=1): if ref_first_strand not in (1, -1): raise ValueError("Strand must be 1 or -1") elif ref_first_strand != 
seqrec.annotations["strand"]: - raise ValueError("Encountered strand='%s' on target seqname, expected '%s'" % - (seqrec.annotations["strand"], ref_first_strand)) + raise ValueError("Encountered strand='%s' on target seqname, " + "expected '%s'" % + (seqrec.annotations["strand"], ref_first_strand)) except KeyError: raise ValueError("No strand information for target seqname (%s)" % - (self._target_seqname,)) - + self._target_seqname) + # length including gaps (i.e. alignment length) rec_length = len(seqrec) rec_start = seqrec.annotations["start"] rec_end = seqrec.annotations["start"] + seqrec.annotations["size"] @@ -493,12 +613,16 @@ def get_spliced(self, starts, ends, strand=1): # the true, chromosome/contig/etc position in the target seqname real_pos = rec_start + # loop over the alignment to fill split_by_position for gapped_pos in range(0, rec_length): for seqrec in multiseq: # keep track of this position's value for the target seqname if seqrec.id == self._target_seqname: track_val = seqrec.seq[gapped_pos] + # Here, a real_pos that corresponds to just after a series of "-" + # in the reference will "accumulate" the letters found in other sequences + # in front of the "-"s split_by_position[seqrec.id][real_pos] += seqrec.seq[gapped_pos] # increment the real_pos counter only when non-gaps are found in @@ -509,10 +633,14 @@ def get_spliced(self, starts, ends, strand=1): # make sure the number of bp entries equals the sum of the record lengths if len(split_by_position[self._target_seqname]) != total_rec_length: raise ValueError("Target seqname (%s) has %s records, expected %s" % - (self._target_seqname, len(split_by_position[self._target_seqname]), total_rec_length)) + (self._target_seqname, + len(split_by_position[self._target_seqname]), + total_rec_length)) # translates a position in the target_seqname sequence to its gapped length - realpos_to_len = dict([(x, len(y)) for x, y in split_by_position[self._target_seqname].items() if len(y) > 1]) + realpos_to_len = dict([(x, len(y)) + for x, y in split_by_position[self._target_seqname].items() + if len(y) > 1]) # splice together the exons subseq = {} @@ -544,7 +672,8 @@ def get_spliced(self, starts, ends, strand=1): # make sure we're returning the right number of letters if len(subseq[self._target_seqname].replace("-", "")) != expected_letters: raise ValueError("Returning %s letters for target seqname (%s), expected %s" % - (len(subseq[self._target_seqname].replace("-", "")), self._target_seqname, expected_letters)) + (len(subseq[self._target_seqname].replace("-", "")), + self._target_seqname, expected_letters)) # check to make sure all sequences are the same length as the target seqname ref_subseq_len = len(subseq[self._target_seqname]) @@ -552,7 +681,7 @@ def get_spliced(self, starts, ends, strand=1): for seqid, seq in subseq.items(): if len(seq) != ref_subseq_len: raise ValueError("Returning length %s for %s, expected %s" % - (len(seq), seqid, ref_subseq_len)) + (len(seq), seqid, ref_subseq_len)) # finally, build a MultipleSeqAlignment object for our final sequences result_multiseq = [] @@ -571,7 +700,7 @@ def get_spliced(self, starts, ends, strand=1): def __repr__(self): return "MafIO.MafIndex(%r, target_seqname=%r)" % (self._maf_fp.name, - self._target_seqname) + self._target_seqname) def __len__(self): return self._record_count diff --git a/Tests/test_AlignIO.py b/Tests/test_AlignIO.py index b252464b4f4..14a60344d7a 100644 --- a/Tests/test_AlignIO.py +++ b/Tests/test_AlignIO.py @@ -115,7 +115,7 @@ def check_simple_write_read(alignments, 
indent=" "): records_per_alignment = None # Can we expect this format to work? if not records_per_alignment \ - and format not in test_write_read_alignment_formats: + and format not in test_write_read_alignment_formats: continue print(indent + "Checking can write/read as '%s' format" % format) @@ -226,8 +226,8 @@ def check_phylip_reject_duplicate(): # Expected - check the error assert "Repeated name 'longsequen'" in str(err) -check_phylip_reject_duplicate() +check_phylip_reject_duplicate() # Check parsers can cope with an empty file for t_format in AlignIO._FormatToIterator: @@ -265,8 +265,8 @@ def check_phylip_reject_duplicate(): # Try as an iterator using handle with open(t_filename, "r") as handle: alignments = list(AlignIO.parse(handle, format=t_format)) - assert len(alignments) == t_count, \ - "Found %i alignments but expected %i" % (len(alignments), t_count) + msg = "Found %i alignments but expected %i" % (len(alignments), t_count) + assert len(alignments) == t_count, msg if t_per is not None: for alignment in alignments: assert len(alignment) == t_per, \ @@ -355,7 +355,6 @@ def check_phylip_reject_duplicate(): except ValueError as err: if str(err) != "Error in alphabet: not Nucleotide or Protein, supply expected frequencies": raise err - pass if t_count == 1 and t_format not in ["nexus", "emboss", "fasta-m10"]: # print(" Trying to read a triple concatenation of the input file")
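As a closing illustration of the coordinate conventions documented in the MafIO
docstrings above (0-based starts and half-open start/end pairs for search and
get_spliced), a minimal sketch of parsing and indexed access could look like
the following; the file and sequence names are placeholders, not files shipped
with the test suite:

    from Bio import AlignIO
    from Bio.AlignIO.MafIO import MafIndex

    # Plain parsing of a MAF file (no index needed):
    for alignment in AlignIO.parse("ucsc_mm9_chr10.maf", "maf"):
        print("block with %i rows, %i columns"
              % (len(alignment), alignment.get_alignment_length()))

    # Indexed access: 0-based, half-open coordinates, so [0, 100) asks for
    # the alignment covering the first 100 positions of mm9.chr10.
    idx = MafIndex("ucsc_mm9_chr10.mafindex", "ucsc_mm9_chr10.maf", "mm9.chr10")
    spliced = idx.get_spliced([0], [100])
    target = next(rec for rec in spliced if rec.id == "mm9.chr10")
    assert len(str(target.seq).replace("-", "")) == 100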