Skip to content

Commit

Permalink
Modify IO to provide options of linking to genome sequence path
Browse files Browse the repository at this point in the history
  • Loading branch information
YinHoon committed Dec 12, 2018
1 parent 63470b0 commit 3e797d6
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 58 deletions.
25 changes: 25 additions & 0 deletions tests/fixtures/eukaryote_seq.fna.fai
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
chromosome1 840 13 70 71
chromosome2 630 878 70 71
chromosome3 560 1530 70 71
chromosome4 630 2111 70 71
chromosome5 700 2763 70 71
chromosome6 910 3486 70 71
chromosome7 630 4422 70 71
chromosome8 770 5074 70 71
chromosome9 910 5868 70 71
chromosome10 560 6805 70 71
chromosome11 700 7387 70 71
chromosome12 630 8111 70 71
chromosome13 630 8764 70 71
chromosome14 910 9417 70 71
chromosome15 1330 10354 70 71
chromosome16 910 11717 70 71
chromosome17 770 12654 70 71
chromosome18 1120 13449 70 71
chromosome19 700 14599 70 71
chromosome20 1190 15323 70 71
chromosome21 1050 16544 70 71
chromosome22 1120 17623 70 71
chromosomeX 1120 18772 70 71
chromosomeY 980 19921 70 71
chromosomeMT 700 20929 70 71
97 changes: 69 additions & 28 deletions tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from wc_kb import core, prokaryote_schema
from wc_kb import io
import Bio.Seq
import Bio.SeqRecord
import filecmp
import obj_model.io
import os
import random
Expand All @@ -22,21 +24,26 @@
class TestIO(unittest.TestCase):

def setUp(self):
self.dir = tempfile.mkdtemp()
self.seq_path = os.path.join(self.dir, 'seq.fna')

self.kb = kb = core.KnowledgeBase(id='genus_species', name='Genus species', version='0.0.1')

cell = kb.cell = core.Cell(id='genus_species_cell')

dna_seqs = []
for i_chr in range(5):
dna = core.DnaSpeciesType(id='chr_{}'.format(i_chr + 1))
dna = core.DnaSpeciesType(id='chr_{}'.format(i_chr + 1), sequence_path=self.seq_path)
cell.species_types.append(dna)

seq_len = random.randint(100, 200)
bases = 'ACGT'
seq = ''
for i_nt in range(seq_len):
seq += bases[random.randint(0, 3)]
dna.seq = Bio.Seq.Seq(seq)

dna_seqs.append(Bio.SeqRecord.SeqRecord(
Bio.Seq.Seq(seq), dna.id))

for i_trn in range(5):
trn = prokaryote_schema.TranscriptionUnitLocus(id='tu_{}_{}'.format(i_chr + 1, i_trn + 1))
trn.cell = cell
Expand All @@ -45,32 +52,35 @@ def setUp(self):
trn.end = ((trn.start + random.randint(1, 200) - 1) % seq_len) + 1
trn.strand = core.PolymerStrand.positive

self.dir = tempfile.mkdtemp()
with open(self.seq_path, 'w') as file:
writer = Bio.SeqIO.FastaIO.FastaWriter(
file, wrap=70, record2title=lambda record: record.id)
writer.write_file(dna_seqs)

def tearDown(self):
shutil.rmtree(self.dir)

def test_write_read(self):
core_path = os.path.join(self.dir, 'core.xlsx')
seq_path = os.path.join(self.dir, 'seq.fna')
core_path = os.path.join(self.dir, 'core.xlsx')

writer = io.Writer()
writer.run(self.kb, core_path, seq_path, set_repo_metadata_from_path=False)
writer.run(self.kb, core_path, set_repo_metadata_from_path=False)

reader = io.Reader()
kb = reader.run(core_path, seq_path)
kb = reader.run(core_path, self.seq_path)

core_path = os.path.join(self.dir, 'core2.xlsx')
seq_path = os.path.join(self.dir, 'seq2.fna')
writer.run(kb, core_path, seq_path, set_repo_metadata_from_path=False)

self.assertTrue(self.kb.is_equal(kb))
self.assertTrue(filecmp.cmp(self.seq_path, seq_path, shallow=False))

def test_read_write_prokaryote(self):
fixtures = os.path.join(os.path.dirname(__file__), 'fixtures')
core_path = os.path.join(fixtures, 'core.xlsx')
seq_path = os.path.join(fixtures, 'seq.fna')

reader = io.Reader()
kb = reader.run(core_path, seq_path)

Expand All @@ -80,15 +90,16 @@ def test_read_write_prokaryote(self):
writer = io.Writer()
writer.run(kb, tmp_core_path, tmp_seq_path, set_repo_metadata_from_path=False)

tmp_kb = reader.run(tmp_core_path, tmp_seq_path)
tmp_kb = reader.run(tmp_core_path, seq_path)

self.assertTrue(kb.is_equal(tmp_kb))
self.assertTrue(filecmp.cmp(tmp_seq_path, seq_path, shallow=False))

def test_read_write_eukaryote(self):
fixtures = os.path.join(os.path.dirname(__file__), 'fixtures')
core_path = os.path.join(fixtures, 'eukaryote_core.xlsx')
seq_path = os.path.join(fixtures, 'eukaryote_seq.fna')

reader = io.Reader()
kb = reader.run(core_path, seq_path, schema=False)

Expand All @@ -98,9 +109,30 @@ def test_read_write_eukaryote(self):
writer = io.Writer()
writer.run(kb, tmp_core_path, tmp_seq_path, schema=False, set_repo_metadata_from_path=False)

tmp_kb = reader.run(tmp_core_path, tmp_seq_path, schema=False)
tmp_kb = reader.run(tmp_core_path, seq_path, schema=False)

self.assertTrue(kb.is_equal(tmp_kb))
self.assertTrue(filecmp.cmp(tmp_seq_path, seq_path, shallow=False))

def test_rewrite_seq_path_in_read_write(self):
path_core_1 = os.path.join(self.dir, 'core_1.xlsx')
path_core_2 = os.path.join(self.dir, 'core_2.xlsx')
path_seq_1 = os.path.join(self.dir, 'seq_1.fna')
path_seq_2 = os.path.join(self.dir, 'seq_2.fna')

io.Writer().run(self.kb, path_core_1, path_seq_1, set_repo_metadata_from_path=False)
kb1 = io.Reader().run(path_core_1, path_seq_1)
kb2 = io.Reader().run(path_core_1, path_seq_1, rewrite_seq_path=False)
self.assertFalse(kb1.is_equal(self.kb))
self.assertTrue(kb2.is_equal(self.kb))
self.assertTrue(filecmp.cmp(path_seq_1, self.seq_path, shallow=False))

io.Writer().run(self.kb, path_core_2, path_seq_2, rewrite_seq_path=True, set_repo_metadata_from_path=False)
kb3 = io.Reader().run(path_core_2, self.seq_path)
kb4 = io.Reader().run(path_core_2, self.seq_path, rewrite_seq_path=False)
self.assertFalse(kb3.is_equal(self.kb))
self.assertTrue(kb4.is_equal(self.kb))
self.assertTrue(filecmp.cmp(path_seq_2, self.seq_path, shallow=False))

def test_write_with_repo_md(self):
_, core_path = tempfile.mkstemp(suffix='.xlsx', dir='.')
Expand All @@ -122,10 +154,12 @@ def test_write_with_repo_md(self):

def test_write_without_cell_relationships(self):
core_path = os.path.join(self.dir, 'core.xlsx')
seq_path = os.path.join(self.dir, 'seq.fna')
seq_path = os.path.join(self.dir, 'test_seq.fna')

dna = core.DnaSpeciesType(id='chr_x')
dna.seq = Bio.Seq.Seq('ACGT')
with open(seq_path, 'w') as file:
file.write('>chr_x\nACGT\n')

dna = core.DnaSpeciesType(id='chr_x', sequence_path=seq_path)
self.kb.cell.species_types.append(dna)

trn = prokaryote_schema.TranscriptionUnitLocus(id='tu_x_0')
Expand All @@ -138,8 +172,8 @@ def test_write_without_cell_relationships(self):

def test_write_read_sloppy(self):
core_path = os.path.join(self.dir, 'core.xlsx')
seq_path = os.path.join(self.dir, 'seq.fna')

seq_path = os.path.join(self.dir, 'test_seq.fna')
writer = io.Writer()
writer.run(self.kb, core_path, seq_path, set_repo_metadata_from_path=False)

Expand All @@ -150,16 +184,17 @@ def test_write_read_sloppy(self):

reader = io.Reader()
with self.assertRaisesRegex(ValueError, "The columns of worksheet 'Knowledge base' must be defined in this order"):
kb = reader.run(core_path, seq_path)
kb = reader.run(core_path, seq_path, strict=False)
kb = reader.run(core_path, self.seq_path)
kb = reader.run(core_path, self.seq_path, strict=False)

self.assertTrue(kb.is_equal(self.kb))
self.assertTrue(filecmp.cmp(self.seq_path, seq_path, shallow=False))

def test_reader_no_kb(self):
core_path = os.path.join(self.dir, 'core.xlsx')
obj_model.io.WorkbookWriter().run(core_path, [], io.PROKARYOTE_MODEL_ORDER, include_all_attributes=False)

seq_path = os.path.join(self.dir, 'seq.fna')
seq_path = os.path.join(self.dir, 'test_seq.fna')
with open(seq_path, 'w') as file:
pass

Expand All @@ -177,7 +212,7 @@ def test_reader_error_multiple_kbs(self):
core_path = os.path.join(self.dir, 'core.xlsx')
obj_model.io.WorkbookWriter().run(core_path, [kb1, kb2], io.PROKARYOTE_MODEL_ORDER, include_all_attributes=False)

seq_path = os.path.join(self.dir, 'seq.fna')
seq_path = os.path.join(self.dir, 'test_seq.fna')
with open(seq_path, 'w') as file:
pass

Expand All @@ -191,7 +226,7 @@ def test_reader_error_no_cell(self):
core_path = os.path.join(self.dir, 'core.xlsx')
obj_model.io.WorkbookWriter().run(core_path, [kb, dna], io.PROKARYOTE_MODEL_ORDER, include_all_attributes=False)

seq_path = os.path.join(self.dir, 'seq.fna')
seq_path = os.path.join(self.dir, 'test_seq.fna')
with open(seq_path, 'w') as file:
pass

Expand All @@ -206,7 +241,7 @@ def test_reader_error_multiple_cells(self):
core_path = os.path.join(self.dir, 'core.xlsx')
obj_model.io.WorkbookWriter().run(core_path, [kb, cell1, cell2], io.PROKARYOTE_MODEL_ORDER, include_all_attributes=False)

seq_path = os.path.join(self.dir, 'seq.fna')
seq_path = os.path.join(self.dir, 'test_seq.fna')
with open(seq_path, 'w') as file:
pass

Expand All @@ -222,14 +257,17 @@ def test_convert(self):
path_seq_3 = os.path.join(self.dir, 'seq_3.fna')

io.Writer().run(self.kb, path_core_1, path_seq_1, set_repo_metadata_from_path=False)
self.assertTrue(filecmp.cmp(path_seq_1, self.seq_path, shallow=False))

io.convert(path_core_1, path_seq_1, path_core_2, path_seq_2)
kb = io.Reader().run(path_core_2, path_seq_2)
kb = io.Reader().run(path_core_2, self.seq_path)
self.assertTrue(kb.is_equal(self.kb))
self.assertTrue(filecmp.cmp(path_seq_1, path_seq_2, shallow=False))

io.convert(path_core_2, path_seq_2, path_core_3, path_seq_3)
kb = io.Reader().run(path_core_3, path_seq_3)
kb = io.Reader().run(path_core_3, self.seq_path)
self.assertTrue(kb.is_equal(self.kb))
self.assertTrue(filecmp.cmp(path_seq_2, path_seq_3, shallow=False))

def test_convert_sloppy(self):
path_core_1 = os.path.join(self.dir, 'core_1.xlsx')
Expand All @@ -240,6 +278,7 @@ def test_convert_sloppy(self):
path_seq_3 = os.path.join(self.dir, 'seq_3.fna')

io.Writer().run(self.kb, path_core_1, path_seq_1, set_repo_metadata_from_path=False)
self.assertTrue(filecmp.cmp(path_seq_1, self.seq_path, shallow=False))

wb = wc_utils.workbook.io.read(path_core_1)
row = wb['Knowledge base'].pop(0)
Expand All @@ -249,12 +288,14 @@ def test_convert_sloppy(self):
with self.assertRaisesRegex(ValueError, "The columns of worksheet 'Knowledge base' must be defined in this order"):
io.convert(path_core_1, path_seq_1, path_core_2, path_seq_2)
io.convert(path_core_1, path_seq_1, path_core_2, path_seq_2, strict=False)
kb = io.Reader().run(path_core_2, path_seq_2)
kb = io.Reader().run(path_core_2, self.seq_path)
self.assertTrue(kb.is_equal(self.kb))
self.assertTrue(filecmp.cmp(path_seq_1, path_seq_2, shallow=False))

io.convert(path_core_2, path_seq_2, path_core_3, path_seq_3)
kb = io.Reader().run(path_core_3, path_seq_3)
kb = io.Reader().run(path_core_3, self.seq_path)
self.assertTrue(kb.is_equal(self.kb))
self.assertTrue(filecmp.cmp(path_seq_2, path_seq_3, shallow=False))

def test_create_template(self):
path_core = os.path.join(self.dir, 'template.xlsx')
Expand Down
Loading

0 comments on commit 3e797d6

Please sign in to comment.