Skip to content

Commit

Permalink
Include N's in processing DNA sequence
Browse files Browse the repository at this point in the history
  • Loading branch information
YinHoon committed Dec 14, 2018
1 parent 3e797d6 commit 1642567
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 7 deletions.
16 changes: 15 additions & 1 deletion tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ def test(self):
self.tmp_dirname = tempfile.mkdtemp()
filepath = os.path.join(self.tmp_dirname, 'test_seq.fasta')
with open(filepath, 'w') as f:
f.write('>dna1\nACGTACGT\n')
f.write('>dna1\nACGTACGT\n'
'>dna2\nACGTACGTNNNN\n')

dna = core.DnaSpeciesType(id='dna1', name='dna1', sequence_path=filepath,
circular=False, double_stranded=False, ploidy=2)
Expand Down Expand Up @@ -270,6 +271,19 @@ def test(self):

self.assertAlmostEqual(dna.get_mol_wt(), exp_mol_wt, places=0)

# If there are N's in the DNA sequence
dna2 = core.DnaSpeciesType(id='dna2', sequence_path=filepath,
circular=False, double_stranded=True)

L = dna2.get_len()
self.assertEqual(dna2.get_empirical_formula(),
chem.EmpiricalFormula('C10H12N5O6P') * 3 * 2
+ chem.EmpiricalFormula('C9H12N3O7P') * 3 * 2
+ chem.EmpiricalFormula('C10H12N5O7P') * 3 * 2
+ chem.EmpiricalFormula('C10H13N2O8P') * 3 * 2
- chem.EmpiricalFormula('OH') * (L - 1) * 2
)

shutil.rmtree(self.tmp_dirname)


Expand Down
21 changes: 15 additions & 6 deletions wc_kb/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1594,16 +1594,25 @@ def get_empirical_formula(self):
:math:`N_A * dAMP + N_C * dCMP + N_G * dGMP + N_T * dTMP - L * OH`
N's in the sequence will be distributed into the four bases by preserving the original ratio
Returns:
:obj:`chem.EmpiricalFormula`: empirical formula
"""
seq = self.get_seq()
n_a = seq.count('A')
n_c = seq.count('C')
n_g = seq.count('G')
n_t = seq.count('T')
l = len(seq)

n_a = seq.upper().count('A')
n_c = seq.upper().count('C')
n_g = seq.upper().count('G')
n_t = seq.upper().count('T')
n_n = seq.upper().count('N')

l = len(seq)
known_bases = n_a + n_c + n_g + n_t
n_a += round(n_a / known_bases * n_n)
n_c += round(n_c / known_bases * n_n)
n_g += round(n_g / known_bases * n_n)
n_t = l - (n_a + n_c + n_g)

if self.double_stranded:
n_a = n_a + n_t
n_t = n_a
Expand Down

0 comments on commit 1642567

Please sign in to comment.