From 3e797d60092c526fcc4774a4b611636ae28a086a Mon Sep 17 00:00:00 2001 From: Yin Hoon Chew Date: Wed, 12 Dec 2018 12:27:18 -0500 Subject: [PATCH] Modify IO to provide options of linking to genome sequence path --- tests/fixtures/eukaryote_seq.fna.fai | 25 +++++++ tests/test_io.py | 97 ++++++++++++++++++++-------- wc_kb/io.py | 63 +++++++++--------- 3 files changed, 127 insertions(+), 58 deletions(-) create mode 100644 tests/fixtures/eukaryote_seq.fna.fai diff --git a/tests/fixtures/eukaryote_seq.fna.fai b/tests/fixtures/eukaryote_seq.fna.fai new file mode 100644 index 0000000..da461e5 --- /dev/null +++ b/tests/fixtures/eukaryote_seq.fna.fai @@ -0,0 +1,25 @@ +chromosome1 840 13 70 71 +chromosome2 630 878 70 71 +chromosome3 560 1530 70 71 +chromosome4 630 2111 70 71 +chromosome5 700 2763 70 71 +chromosome6 910 3486 70 71 +chromosome7 630 4422 70 71 +chromosome8 770 5074 70 71 +chromosome9 910 5868 70 71 +chromosome10 560 6805 70 71 +chromosome11 700 7387 70 71 +chromosome12 630 8111 70 71 +chromosome13 630 8764 70 71 +chromosome14 910 9417 70 71 +chromosome15 1330 10354 70 71 +chromosome16 910 11717 70 71 +chromosome17 770 12654 70 71 +chromosome18 1120 13449 70 71 +chromosome19 700 14599 70 71 +chromosome20 1190 15323 70 71 +chromosome21 1050 16544 70 71 +chromosome22 1120 17623 70 71 +chromosomeX 1120 18772 70 71 +chromosomeY 980 19921 70 71 +chromosomeMT 700 20929 70 71 diff --git a/tests/test_io.py b/tests/test_io.py index 9616c95..3c35cc9 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -10,6 +10,8 @@ from wc_kb import core, prokaryote_schema from wc_kb import io import Bio.Seq +import Bio.SeqRecord +import filecmp import obj_model.io import os import random @@ -22,21 +24,26 @@ class TestIO(unittest.TestCase): def setUp(self): + self.dir = tempfile.mkdtemp() + self.seq_path = os.path.join(self.dir, 'seq.fna') + self.kb = kb = core.KnowledgeBase(id='genus_species', name='Genus species', version='0.0.1') cell = kb.cell = core.Cell(id='genus_species_cell') + dna_seqs = [] for i_chr in range(5): - dna = core.DnaSpeciesType(id='chr_{}'.format(i_chr + 1)) + dna = core.DnaSpeciesType(id='chr_{}'.format(i_chr + 1), sequence_path=self.seq_path) cell.species_types.append(dna) - + seq_len = random.randint(100, 200) bases = 'ACGT' seq = '' for i_nt in range(seq_len): seq += bases[random.randint(0, 3)] - dna.seq = Bio.Seq.Seq(seq) - + dna_seqs.append(Bio.SeqRecord.SeqRecord( + Bio.Seq.Seq(seq), dna.id)) + for i_trn in range(5): trn = prokaryote_schema.TranscriptionUnitLocus(id='tu_{}_{}'.format(i_chr + 1, i_trn + 1)) trn.cell = cell @@ -45,32 +52,35 @@ def setUp(self): trn.end = ((trn.start + random.randint(1, 200) - 1) % seq_len) + 1 trn.strand = core.PolymerStrand.positive - self.dir = tempfile.mkdtemp() + with open(self.seq_path, 'w') as file: + writer = Bio.SeqIO.FastaIO.FastaWriter( + file, wrap=70, record2title=lambda record: record.id) + writer.write_file(dna_seqs) def tearDown(self): shutil.rmtree(self.dir) def test_write_read(self): - core_path = os.path.join(self.dir, 'core.xlsx') - seq_path = os.path.join(self.dir, 'seq.fna') + core_path = os.path.join(self.dir, 'core.xlsx') writer = io.Writer() - writer.run(self.kb, core_path, seq_path, set_repo_metadata_from_path=False) + writer.run(self.kb, core_path, set_repo_metadata_from_path=False) reader = io.Reader() - kb = reader.run(core_path, seq_path) + kb = reader.run(core_path, self.seq_path) core_path = os.path.join(self.dir, 'core2.xlsx') seq_path = os.path.join(self.dir, 'seq2.fna') writer.run(kb, core_path, seq_path, set_repo_metadata_from_path=False) self.assertTrue(self.kb.is_equal(kb)) + self.assertTrue(filecmp.cmp(self.seq_path, seq_path, shallow=False)) def test_read_write_prokaryote(self): fixtures = os.path.join(os.path.dirname(__file__), 'fixtures') core_path = os.path.join(fixtures, 'core.xlsx') seq_path = os.path.join(fixtures, 'seq.fna') - + reader = io.Reader() kb = reader.run(core_path, seq_path) @@ -80,15 +90,16 @@ def test_read_write_prokaryote(self): writer = io.Writer() writer.run(kb, tmp_core_path, tmp_seq_path, set_repo_metadata_from_path=False) - tmp_kb = reader.run(tmp_core_path, tmp_seq_path) + tmp_kb = reader.run(tmp_core_path, seq_path) self.assertTrue(kb.is_equal(tmp_kb)) + self.assertTrue(filecmp.cmp(tmp_seq_path, seq_path, shallow=False)) def test_read_write_eukaryote(self): fixtures = os.path.join(os.path.dirname(__file__), 'fixtures') core_path = os.path.join(fixtures, 'eukaryote_core.xlsx') seq_path = os.path.join(fixtures, 'eukaryote_seq.fna') - + reader = io.Reader() kb = reader.run(core_path, seq_path, schema=False) @@ -98,9 +109,30 @@ def test_read_write_eukaryote(self): writer = io.Writer() writer.run(kb, tmp_core_path, tmp_seq_path, schema=False, set_repo_metadata_from_path=False) - tmp_kb = reader.run(tmp_core_path, tmp_seq_path, schema=False) + tmp_kb = reader.run(tmp_core_path, seq_path, schema=False) self.assertTrue(kb.is_equal(tmp_kb)) + self.assertTrue(filecmp.cmp(tmp_seq_path, seq_path, shallow=False)) + + def test_rewrite_seq_path_in_read_write(self): + path_core_1 = os.path.join(self.dir, 'core_1.xlsx') + path_core_2 = os.path.join(self.dir, 'core_2.xlsx') + path_seq_1 = os.path.join(self.dir, 'seq_1.fna') + path_seq_2 = os.path.join(self.dir, 'seq_2.fna') + + io.Writer().run(self.kb, path_core_1, path_seq_1, set_repo_metadata_from_path=False) + kb1 = io.Reader().run(path_core_1, path_seq_1) + kb2 = io.Reader().run(path_core_1, path_seq_1, rewrite_seq_path=False) + self.assertFalse(kb1.is_equal(self.kb)) + self.assertTrue(kb2.is_equal(self.kb)) + self.assertTrue(filecmp.cmp(path_seq_1, self.seq_path, shallow=False)) + + io.Writer().run(self.kb, path_core_2, path_seq_2, rewrite_seq_path=True, set_repo_metadata_from_path=False) + kb3 = io.Reader().run(path_core_2, self.seq_path) + kb4 = io.Reader().run(path_core_2, self.seq_path, rewrite_seq_path=False) + self.assertFalse(kb3.is_equal(self.kb)) + self.assertTrue(kb4.is_equal(self.kb)) + self.assertTrue(filecmp.cmp(path_seq_2, self.seq_path, shallow=False)) def test_write_with_repo_md(self): _, core_path = tempfile.mkstemp(suffix='.xlsx', dir='.') @@ -122,10 +154,12 @@ def test_write_with_repo_md(self): def test_write_without_cell_relationships(self): core_path = os.path.join(self.dir, 'core.xlsx') - seq_path = os.path.join(self.dir, 'seq.fna') + seq_path = os.path.join(self.dir, 'test_seq.fna') - dna = core.DnaSpeciesType(id='chr_x') - dna.seq = Bio.Seq.Seq('ACGT') + with open(seq_path, 'w') as file: + file.write('>chr_x\nACGT\n') + + dna = core.DnaSpeciesType(id='chr_x', sequence_path=seq_path) self.kb.cell.species_types.append(dna) trn = prokaryote_schema.TranscriptionUnitLocus(id='tu_x_0') @@ -138,8 +172,8 @@ def test_write_without_cell_relationships(self): def test_write_read_sloppy(self): core_path = os.path.join(self.dir, 'core.xlsx') - seq_path = os.path.join(self.dir, 'seq.fna') - + seq_path = os.path.join(self.dir, 'test_seq.fna') + writer = io.Writer() writer.run(self.kb, core_path, seq_path, set_repo_metadata_from_path=False) @@ -150,16 +184,17 @@ def test_write_read_sloppy(self): reader = io.Reader() with self.assertRaisesRegex(ValueError, "The columns of worksheet 'Knowledge base' must be defined in this order"): - kb = reader.run(core_path, seq_path) - kb = reader.run(core_path, seq_path, strict=False) + kb = reader.run(core_path, self.seq_path) + kb = reader.run(core_path, self.seq_path, strict=False) self.assertTrue(kb.is_equal(self.kb)) + self.assertTrue(filecmp.cmp(self.seq_path, seq_path, shallow=False)) def test_reader_no_kb(self): core_path = os.path.join(self.dir, 'core.xlsx') obj_model.io.WorkbookWriter().run(core_path, [], io.PROKARYOTE_MODEL_ORDER, include_all_attributes=False) - seq_path = os.path.join(self.dir, 'seq.fna') + seq_path = os.path.join(self.dir, 'test_seq.fna') with open(seq_path, 'w') as file: pass @@ -177,7 +212,7 @@ def test_reader_error_multiple_kbs(self): core_path = os.path.join(self.dir, 'core.xlsx') obj_model.io.WorkbookWriter().run(core_path, [kb1, kb2], io.PROKARYOTE_MODEL_ORDER, include_all_attributes=False) - seq_path = os.path.join(self.dir, 'seq.fna') + seq_path = os.path.join(self.dir, 'test_seq.fna') with open(seq_path, 'w') as file: pass @@ -191,7 +226,7 @@ def test_reader_error_no_cell(self): core_path = os.path.join(self.dir, 'core.xlsx') obj_model.io.WorkbookWriter().run(core_path, [kb, dna], io.PROKARYOTE_MODEL_ORDER, include_all_attributes=False) - seq_path = os.path.join(self.dir, 'seq.fna') + seq_path = os.path.join(self.dir, 'test_seq.fna') with open(seq_path, 'w') as file: pass @@ -206,7 +241,7 @@ def test_reader_error_multiple_cells(self): core_path = os.path.join(self.dir, 'core.xlsx') obj_model.io.WorkbookWriter().run(core_path, [kb, cell1, cell2], io.PROKARYOTE_MODEL_ORDER, include_all_attributes=False) - seq_path = os.path.join(self.dir, 'seq.fna') + seq_path = os.path.join(self.dir, 'test_seq.fna') with open(seq_path, 'w') as file: pass @@ -222,14 +257,17 @@ def test_convert(self): path_seq_3 = os.path.join(self.dir, 'seq_3.fna') io.Writer().run(self.kb, path_core_1, path_seq_1, set_repo_metadata_from_path=False) + self.assertTrue(filecmp.cmp(path_seq_1, self.seq_path, shallow=False)) io.convert(path_core_1, path_seq_1, path_core_2, path_seq_2) - kb = io.Reader().run(path_core_2, path_seq_2) + kb = io.Reader().run(path_core_2, self.seq_path) self.assertTrue(kb.is_equal(self.kb)) + self.assertTrue(filecmp.cmp(path_seq_1, path_seq_2, shallow=False)) io.convert(path_core_2, path_seq_2, path_core_3, path_seq_3) - kb = io.Reader().run(path_core_3, path_seq_3) + kb = io.Reader().run(path_core_3, self.seq_path) self.assertTrue(kb.is_equal(self.kb)) + self.assertTrue(filecmp.cmp(path_seq_2, path_seq_3, shallow=False)) def test_convert_sloppy(self): path_core_1 = os.path.join(self.dir, 'core_1.xlsx') @@ -240,6 +278,7 @@ def test_convert_sloppy(self): path_seq_3 = os.path.join(self.dir, 'seq_3.fna') io.Writer().run(self.kb, path_core_1, path_seq_1, set_repo_metadata_from_path=False) + self.assertTrue(filecmp.cmp(path_seq_1, self.seq_path, shallow=False)) wb = wc_utils.workbook.io.read(path_core_1) row = wb['Knowledge base'].pop(0) @@ -249,12 +288,14 @@ def test_convert_sloppy(self): with self.assertRaisesRegex(ValueError, "The columns of worksheet 'Knowledge base' must be defined in this order"): io.convert(path_core_1, path_seq_1, path_core_2, path_seq_2) io.convert(path_core_1, path_seq_1, path_core_2, path_seq_2, strict=False) - kb = io.Reader().run(path_core_2, path_seq_2) + kb = io.Reader().run(path_core_2, self.seq_path) self.assertTrue(kb.is_equal(self.kb)) + self.assertTrue(filecmp.cmp(path_seq_1, path_seq_2, shallow=False)) io.convert(path_core_2, path_seq_2, path_core_3, path_seq_3) - kb = io.Reader().run(path_core_3, path_seq_3) + kb = io.Reader().run(path_core_3, self.seq_path) self.assertTrue(kb.is_equal(self.kb)) + self.assertTrue(filecmp.cmp(path_seq_2, path_seq_3, shallow=False)) def test_create_template(self): path_core = os.path.join(self.dir, 'template.xlsx') diff --git a/wc_kb/io.py b/wc_kb/io.py index dbada4a..ac63047 100644 --- a/wc_kb/io.py +++ b/wc_kb/io.py @@ -75,13 +75,15 @@ class Writer(object): """ Write knowledge base to file(s) """ - def run(self, knowledge_base, core_path, seq_path, schema=True, set_repo_metadata_from_path=True): + def run(self, knowledge_base, core_path, seq_path=None, rewrite_seq_path=False, schema=True, set_repo_metadata_from_path=True): """ Write knowledge base to file(s) Args: knowledge_base (:obj:`core.KnowledgeBase`): knowledge base core_path (:obj:`str`): path to save core knowledge base - seq_path (:obj:`str`): path to save genome sequence + seq_path (:obj:`str`, optional): path to save genome sequence + rewrite_seq_path (:obj:`bool`, optional): if :obj:`True`, the path to genome sequence in the saved knowledge base + will be updated to the newly saved seq_path schema (:obj:`bool`, optional): if :obj:`True`, use model order for prokaryote, else use model order for eukaryote set_repo_metadata_from_path (:obj:`bool`, optional): if :obj:`True`, set the Git repository metadata (URL, branch, revision) for the knowledge base from the parent directory of :obj:`core_path` @@ -113,15 +115,22 @@ def run(self, knowledge_base, core_path, seq_path, schema=True, set_repo_metadat if set_repo_metadata_from_path: util.set_git_repo_metadata_from_path(knowledge_base, core_path) - # gather DNA sequences - dna_seqs = [] - if cell: - dna_species_types = cell.species_types.get( - __type=core.DnaSpeciesType) - for species_type in dna_species_types: - dna_seqs.append(Bio.SeqRecord.SeqRecord( - species_type.seq, species_type.id)) - species_type.seq = None + # export sequences if a path is provided + if seq_path: + dna_seqs = [] + if cell: + dna_species_types = cell.species_types.get( + __type=core.DnaSpeciesType) + for species_type in dna_species_types: + dna_seqs.append(Bio.SeqRecord.SeqRecord( + species_type.get_seq(), species_type.id)) + if rewrite_seq_path: + species_type.sequence_path = seq_path + + with open(seq_path, 'w') as file: + writer = Bio.SeqIO.FastaIO.FastaWriter( + file, wrap=70, record2title=lambda record: record.id) + writer.write_file(dna_seqs) # export core _, ext = os.path.splitext(core_path) @@ -138,18 +147,7 @@ def run(self, knowledge_base, core_path, seq_path, schema=True, set_repo_metadat title=knowledge_base.id, description=knowledge_base.name, version=knowledge_base.version, - **kwargs) - - # export sequences - with open(seq_path, 'w') as file: - writer = Bio.SeqIO.FastaIO.FastaWriter( - file, wrap=70, record2title=lambda record: record.id) - writer.write_file(dna_seqs) - - # restore DNA sequences - if cell: - for species_type, seq in zip(dna_species_types, dna_seqs): - species_type.seq = seq.seq + **kwargs) @classmethod def validate_implicit_relationships(cls): @@ -200,12 +198,14 @@ class Reader(object): """ Read knowledge base from file(s) """ #@wc_utils.cache.memoize(filename_args=[1, 2]) - def run(self, core_path, seq_path, schema=True, strict=True): + def run(self, core_path, seq_path, rewrite_seq_path=True, schema=True, strict=True): """ Read knowledge base from file(s) Args: core_path (:obj:`str`): path to core knowledge base seq_path (:obj:`str`): path to genome sequence + rewrite_seq_path (:obj:`bool`, optional): if :obj:`True`, the path to genome sequence in the knowledge base + will be updated to the provided seq_path schema (:obj:`bool`, optional): if :obj:`True`, use model order for prokaryote, else use model order for eukaryote strict (:obj:`bool`, optional): if :obj:`True`, validate that the the model file(s) strictly follow the :obj:`obj_model` serialization format: @@ -289,9 +289,10 @@ def run(self, core_path, seq_path, schema=True, strict=True): for model_obj in model_objects: setattr(model_obj, attr.name, cell) - # read genome sequence and link to the DNA species types - for dna in Bio.SeqIO.parse(seq_path, "fasta"): - kb.cell.species_types.get_one(id=dna.id).seq = dna.seq + # link path to genome sequence to the DNA species types if rewrite_seq_path is True + if rewrite_seq_path: + for dna in Bio.SeqIO.parse(seq_path, "fasta"): + kb.cell.species_types.get_one(id=dna.id).sequence_path = seq_path # validate objs = [] @@ -307,7 +308,7 @@ def run(self, core_path, seq_path, schema=True, strict=True): return kb -def convert(source_core, source_seq, dest_core, dest_seq, strict=True): +def convert(source_core, source_seq, dest_core, dest_seq, rewrite_seq_path=False, strict=True): """ Convert among Excel (.xlsx), comma separated (.csv), and tab separated (.tsv) file formats Read a knowledge base from the `source` files(s) and write it to the `destination` files(s). A path to a @@ -319,6 +320,8 @@ def convert(source_core, source_seq, dest_core, dest_seq, strict=True): source_seq (:obj:`str`): path to the genome sequence of the source knowledge base dest_core (:obj:`str`): path to save the converted core of the knowledge base dest_seq (:obj:`str`): path to save the converted genome sequence of the knowledge base + rewrite_seq_path (:obj:`bool`, optional): if :obj:`True`, the path to genome sequence in the converted + core of the knowledge base will be updated to the path of the converted genome sequence strict (:obj:`bool`, optional): if :obj:`True`, validate that the the model file(s) strictly follow the :obj:`obj_model` serialization format: @@ -330,14 +333,14 @@ def convert(source_core, source_seq, dest_core, dest_seq, strict=True): * There are no extra columns """ kb = Reader().run(source_core, source_seq, strict=strict) - Writer().run(kb, dest_core, dest_seq, set_repo_metadata_from_path=False) + Writer().run(kb, dest_core, dest_seq, rewrite_seq_path=rewrite_seq_path, set_repo_metadata_from_path=False) def create_template(core_path, seq_path, set_repo_metadata_from_path=True): """ Create file with knowledge base template, including row and column headings Args: - core_path (:obj:`str`): path to save temploate of core knowledge base + core_path (:obj:`str`): path to save template of core knowledge base seq_path (:obj:`str`): path to save genome sequence set_repo_metadata_from_path (:obj:`bool`, optional): if :obj:`True`, set the Git repository metadata (URL, branch, revision) for the knowledge base from the parent directory of :obj:`core_path`