Bugfix for DNA/RNA masses

In Bio.Data.IUPACData: - corrected masses for monophosphate nucleotides in unambiguous_dna_weights and unambiguous_rna_weights (most values where too high by a mass of 16 Da) - added two dictionaries with monoisotopic masses for monophosphate nucleotides (monoisotopic_unambiguous_dna_weights and monoisotopic_unambiguous_rna_weights) - added average and monisotopic masses for selenocysteine and pyrrolysine in protein_weights and monoisotopic_protein_weights In Bio.SeqUtils.__init__: Rewrote method molecular_weight to - correct the calculation (sum masses of sequence elements and substract 18 Da for each formed bond) - allow mass calculation for RNA and protein sequences - allow mass calculation for double stranded nucleic acids
MarkusPiotrowski · Sep 2, 2013 · fd8914f · peterjc · Mar 12, 2014 · MarkusPiotrowski
1 parent 11af0cd
commit fd8914f
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 17 deletions.
diff --git a/Bio/Data/IUPACData.py b/Bio/Data/IUPACData.py
@@ -138,21 +138,38 @@ def _make_ranges(mydict):
         d[key] = (value, value)
     return d
 
-# From bioperl's SeqStats.pm
+# Average masses of monophosphate deoxy nucleotides
 unambiguous_dna_weights = {
-    "A": 347.,
-    "C": 323.,
-    "G": 363.,
-    "T": 322.,
+    "A": 331.22,
+    "C": 307.20,
+    "G": 347.22,
+    "T": 322.21
     }
+
+# Monoisotopic masses of monophospate deoxy nucleotides
+monoisotopic_unambiguous_dna_weights = {
+    "A": 331.07,
+    "C": 307.06,
+    "G": 347.06,
+    "T": 322.06
+    }
+
 unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)
 
 unambiguous_rna_weights = {
-    "A": unambiguous_dna_weights["A"] + 16.,  # 16 for the oxygen
-    "C": unambiguous_dna_weights["C"] + 16.,
-    "G": unambiguous_dna_weights["G"] + 16.,
-    "U": 340.,
+    "A": 347.22,
+    "C": 323.20,
+    "G": 363.22,
+    "U": 324.18
 }
+
+monoisotopic_unambiguous_rna_weights = {
+    "A": 347.06,
+    "C": 323.05,
+    "G": 363.06,
+    "U": 324.04
+}
+
 unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)
 
 
@@ -193,13 +210,13 @@ def _make_ambiguous_ranges(mydict, weight_table):
     "L": 131.18,
     "M": 149.21,
     "N": 132.12,
-    #"O": 0.0, # Needs to be recorded!
+    "O": 255.31, 
     "P": 115.13,
     "Q": 146.15,
     "R": 174.20,
     "S": 105.09,
     "T": 119.12,
-    #"U": 168.05, # To be confirmed
+    "U": 168.05, 
     "V": 117.15,
     "W": 204.23,
     "Y": 181.19
@@ -218,11 +235,13 @@ def _make_ambiguous_ranges(mydict, weight_table):
     "L": 131.09,
     "M": 149.05,
     "N": 132.05,
+    "O": 255.16,
     "P": 115.06,
     "Q": 146.07,
     "R": 174.11,
     "S": 105.04,
     "T": 119.06,
+    "U": 167.96,
     "V": 117.08,
     "W": 204.09,
     "Y": 181.07,
@@ -255,6 +274,7 @@ def _make_ambiguous_ranges(mydict, weight_table):
     "X": "ACDEFGHIKLMNPQRSTVWY",
     #TODO - Include U and O in the possible values of X?
     #This could alter the extended_protein_weight_ranges ...
+    #by MP: Won't do this, because they are so rare.
     "Y": "Y",
     "Z": "QE",
 }

diff --git a/Bio/SeqUtils/__init__.py b/Bio/SeqUtils/__init__.py
@@ -165,12 +165,33 @@ def xGC_skew(seq, window=1000, zoom=100,
     canvas.configure(scrollregion=canvas.bbox(ALL))
 
 
-def molecular_weight(seq):
-    """Calculate the molecular weight of a DNA sequence."""
-    if isinstance(seq, str):
-        seq = Seq(seq, IUPAC.unambiguous_dna)
-    weight_table = IUPACData.unambiguous_dna_weights
-    return sum(weight_table[x] for x in seq)
+def molecular_weight(seq, type='DNA', double_stranded=False):
+    # Rewritten by Markus Piotrowski
+    """Calculate the molecular weight of a DNA, RNA or protein sequence."""
+    seq = ''.join(str(seq).split()).upper() # Do the minimum formatting
+
+    if type == 'DNA':
+        weight_table = IUPACData.unambiguous_dna_weights
+    elif type == 'RNA':
+        weight_table = IUPACData.unambiguous_rna_weights
+    elif type == 'protein':
+        weight_table = IUPACData.protein_weights
+    else:
+        raise ValueError('allowed types are DNA, RNA or protein')
+
+    try:
+        weight = sum(weight_table[x] for x in seq) - (len(seq)-1) * 18.02
+    except KeyError as e:
+        raise ValueError('%s is not a valid unambiguous letter for the ' %e +
+                         'chosen sequence type (DNA, RNA or protein)')
+    except:
+        raise
+
+    if type in ('DNA', 'RNA') and double_stranded:
+        seq = str(Seq(seq).complement())
+        weight += sum(weight_table[x] for x in seq) - (len(seq)-1) * 18.02
+
+    return weight
 
 
 def nt_search(seq, subseq):