In [1]:
#| default_exp constants

# Constants

> Where all constats are stored

This notebook stores all constants.

In [2]:
#| hide
from nbdev.showdoc import *

## Amino Acids
A set of valid amino acids.

In [3]:
#| export
AAs = set('ACDEFGHIKLMNPQRSTUVWY')

In [4]:
print(AAs)

{'U', 'R', 'S', 'M', 'P', 'I', 'E', 'H', 'L', 'N', 'C', 'G', 'D', 'W', 'K', 'F', 'T', 'V', 'A', 'Q', 'Y'}


## Mass dict
A numba compatible mass dict dictionary. This is created from the `modifications.tsv`. Change to allow custom modifications.

In [5]:
#| export

from numba import types
from numba.typed import Dict
import os

#generates the mass dictionary from table
def get_mass_dict(modfile:str, aasfile: str, verbose:bool=True):
    """
    Function to create a mass dict based on tsv files. 
    This is used to create the hardcoded dict in the constants notebook.
    The dict needs to be hardcoded because of importing restrictions when using numba.
    More specifically, a global needs to be typed at runtime.

    Args:
        modfile (str): Filename of modifications file.
        aasfile (str): Filename of AAs file.
        verbose (bool, optional): Flag to print dict.

    Returns:
        Returns a numba compatible dictionary with masses.

    Raises:
        FileNotFoundError: If files are not found.
        
    """
    import pandas as pd

    mods = pd.read_csv(modfile, delimiter="\t")
    aas = pd.read_csv(aasfile, delimiter="\t")
    
    mass_dict = Dict.empty(key_type=types.unicode_type, value_type=types.float64)

    for identifier, mass in aas[["Identifier", "Monoisotopic Mass (Da)"]].values:
        mass_dict[identifier] = float(mass)

    for identifier, aar, mass in mods[
        ["Identifier", "Amino Acid Residue", "Monoisotopic Mass Shift (Da)"]
    ].values:
        #print(identifier, aar, mass)

        if ("<" in identifier) or (">" in identifier):
            for aa_identifier, aa_mass in aas[["Identifier", "Monoisotopic Mass (Da)"]].values:
                if '^' in identifier:
                    new_identifier = identifier[:-2] + aa_identifier
                    mass_dict[new_identifier] = float(mass) + mass_dict[aa_identifier]
                elif aar == aa_identifier:
                    new_identifier = identifier[:-2] + aa_identifier
                    mass_dict[new_identifier] = float(mass) + mass_dict[aa_identifier]
                else:
                    pass
        else:
            mass_dict[identifier] = float(mass) + mass_dict[aar]

    # Manually add other masses
    mass_dict[
        "Electron"
    ] = (
        0.000548579909070
    )  # electron mass, half a millimass error if not taken into account
    mass_dict["Proton"] = 1.00727646687  # proton mass
    mass_dict["Hydrogen"] = 1.00782503223  # hydrogen mass
    mass_dict["C13"] = 13.003354835  # C13 mass
    mass_dict["Oxygen"] = 15.994914619  # oxygen mass
    mass_dict["OH"] = mass_dict["Oxygen"] + mass_dict["Hydrogen"]  # OH mass
    mass_dict["H2O"] = mass_dict["Oxygen"] + 2 * mass_dict["Hydrogen"]  # H2O mass
    
    mass_dict["NH3"] = 17.02654910112
    mass_dict["delta_M"] = 1.00286864
    mass_dict["delta_S"] = 0.0109135
    
    if verbose:
        
        for element in mass_dict:
            print('mass_dict["{}"] = {}'.format(element, mass_dict[element]))

    return mass_dict

try:
    base = os.path.dirname(os.path.abspath(__file__)) #Cant do this in notebook
except NameError:
    base = os.path.join(os.pardir, 'alphapept')

if not os.path.isfile(os.path.join(base, "modifications.tsv")):
    #Calling nbdev build_docs from parent dir
    base = os.path.join(base, 'alphapept')
      
modfile_path = os.path.join(base, "modifications.tsv")  
aafile_path = os.path.join(base, "amino_acids.tsv")  

mass_dict = get_mass_dict(modfile=modfile_path, aasfile=aafile_path, verbose=False)

In [6]:
for _ in mass_dict:
    print(f"{_+':': <12}{mass_dict[_]}")

A:          71.0371138
C:          103.0091845
D:          115.0269431
E:          129.0425931
F:          147.0684139
G:          57.02146373
H:          137.0589119
I:          113.084064
K:          128.094963
L:          113.084064
M:          131.0404846
N:          114.0429275
P:          97.05276386
Q:          128.0585775
R:          156.101111
S:          87.03202843
T:          101.0476785
U:          150.9536333957
V:          99.06841392
W:          186.079313
Y:          163.0633286
cC:         160.03064823
oxM:        147.03539923000002
aA:         113.04767849000001
aC:         145.01974919
aD:         157.03750779
aE:         171.05315779
aF:         189.07897859
aG:         99.03202842
aH:         179.06947659
aI:         155.09462869
aK:         170.10552769
aL:         155.09462869
aM:         173.05104929
aN:         156.05349219000001
aP:         139.06332855
aQ:         170.06914219
aR:         198.11167569
aS:         129.04259312
aT:         143.05824319
aU:    

In [7]:
# Test that there is an entry for each AA
for _ in AAs:
    assert _ in mass_dict.keys()
    
print(mass_dict['A'])
print(mass_dict['K'])

71.0371138
128.094963


In [8]:
#| hide
def test_get_mass_dict():
    m_dict = mass_dict
    assert m_dict["Electron"] == 0.000548579909070

test_get_mass_dict()

In [9]:
#| hide
def test_get_mass_dict():
    m_dict = get_mass_dict(modfile=modfile_path, aasfile=aafile_path, verbose=False)
    assert m_dict == mass_dict
    
test_get_mass_dict()

## Isotopes

We define a `jitclass` that stores the base mass, the number of isotopes, and their abundances. We create the typed dictionary `isotopes` that stores different default isotopes. 

In [10]:
#| export
import numpy as np
from numba import int32, float32, float64, njit, types
from numba.experimental import jitclass
from numba.typed import Dict

spec = [
    ('m0', float32),
    ('dm', int32),
    ('intensities', float32[:]),
]

@jitclass(spec)
class Isotope:
    """
    Jit-compatible class to store isotopes

    Attributes:
        m0 (int): Mass of pattern
        dm0 (int): dm of pattern (number of isotopes)
        int0 (np.float32[:]): Intensities of pattern
    """
    def __init__(self, m0:int, dm:int, intensities:np.ndarray):
        self.m0 = m0
        self.dm = dm
        self.intensities = intensities

isotopes = Dict.empty(key_type=types.unicode_type, value_type=Isotope.class_type.instance_type)

isotopes["C"] = Isotope(12, 3, np.array([0.9893, 0.0107, 0.0], dtype=np.float32))
isotopes["H"] = Isotope(1.007940, 3,  np.array([0.999885, 0.000115, 0.0], dtype=np.float32))
isotopes["O"] = Isotope(15.9949146221, 3,  np.array([0.99757, 0.00038, 0.00205], dtype=np.float32))
isotopes["N"] = Isotope(14.0030740052, 2,  np.array([0.99636, 0.00364], dtype=np.float32))
isotopes["S"] = Isotope(31.97207069, 4,  np.array([0.9499, 0.0075, 0.0425, 0.0001], dtype=np.float32))

isotopes["I"] = Isotope(126.904473, 1,  np.array([1], dtype=np.float32))
isotopes["K"] = Isotope(38.9637069, 3,  np.array([0.932581, 0.000117, 0.067302], dtype=np.float32))

In [11]:
#| hide
def test_isotope():
    assert isotopes["C"].m0 == 12
    assert isotopes["C"].dm == 3
    assert np.allclose(isotopes["C"].intensities[0], 0.9893)
    assert np.allclose(isotopes["C"].intensities[1], 0.0107)
    assert np.allclose(isotopes["C"].intensities[2], 0)
    
test_isotope()

In [12]:
for _ in isotopes:
    print(f'Element {_}: base mass {isotopes[_].m0:<20} intensities {isotopes[_].intensities}')
    

Element C: base mass 12.0                 intensities [0.9893 0.0107 0.    ]
Element H: base mass 1.0079400539398193   intensities [9.99885e-01 1.15000e-04 0.00000e+00]
Element O: base mass 15.994915008544922   intensities [9.9757e-01 3.8000e-04 2.0500e-03]
Element N: base mass 14.003073692321777   intensities [0.99636 0.00364]
Element S: base mass 31.972070693969727   intensities [9.499e-01 7.500e-03 4.250e-02 1.000e-04]
Element I: base mass 126.90447235107422   intensities [1.]
Element K: base mass 38.963706970214844   intensities [9.32581e-01 1.17000e-04 6.73020e-02]


## Averagine Model

In [13]:
#| export
averagine_aa = Dict.empty(key_type=types.unicode_type, value_type=types.float64)

averagine_aa["C"] = 4.9384
averagine_aa["H"] = 7.7583
averagine_aa["N"] = 1.3577
averagine_aa["O"] = 1.4773
averagine_aa["S"] = 0.0417

averagine_avg = 111.1254

In [14]:
#Masses of the averagine model
for _ in averagine_aa:
    print(f"{_} {averagine_aa[_]}")

C 4.9384
H 7.7583
N 1.3577
O 1.4773
S 0.0417


## Protease dict

A numba compatible dictionary that stores different regular expressions needed for digestion. The dictionary was largely taken from the [Pyteomics](https://pyteomics.readthedocs.io/en/latest/_modules/pyteomics/parser.html) website which in turn derived the rules are from [expasy](https://web.expasy.org/peptide_cutter/peptidecutter_enzymes.html). Some entries (Lys-C/ Lys-N) were updated according to [OpenMS](https://github.com/OpenMS/OpenMS/blob/develop/share/OpenMS/CHEMISTRY/Enzymes.xml).
A useful resource for testing Regex can be found [here](https://regex101.com/).

In [15]:
#| export
protease_dict = Dict.empty(key_type=types.unicode_type, value_type=types.unicode_type)

protease_dict["arg-c"] = "R"
protease_dict["asp-n"] = "\w(?=D)"
protease_dict["bnps-skatole"] = "W"
protease_dict["caspase 1"] = "(?<=[FWYL]\w[HAT])D(?=[^PEDQKR])"
protease_dict["caspase 2"] = "(?<=DVA)D(?=[^PEDQKR])"
protease_dict["caspase 3"] = "(?<=DMQ)D(?=[^PEDQKR])"
protease_dict["caspase 4"] = "(?<=LEV)D(?=[^PEDQKR])"
protease_dict["caspase 5"] = "(?<=[LW]EH)D"
protease_dict["caspase 6"] = "(?<=VE[HI])D(?=[^PEDQKR])"
protease_dict["caspase 7"] = "(?<=DEV)D(?=[^PEDQKR])"
protease_dict["caspase 8"] = "(?<=[IL]ET)D(?=[^PEDQKR])"
protease_dict["caspase 9"] = "(?<=LEH)D"
protease_dict["caspase 10"] = "(?<=IEA)D"
protease_dict["chymotrypsin high specificity"] = "([FY](?=[^P]))|(W(?=[^MP]))"
protease_dict["chymotrypsin low specificity"] = "([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))"
protease_dict["clostripain"] = "R"
protease_dict["cnbr"] = "M"
protease_dict["enterokinase"] = "(?<=[DE]{3})K"
protease_dict["factor xa"] = "(?<=[AFGILTVM][DE]G)R"
protease_dict["formic acid"] = "D"
protease_dict["glutamyl endopeptidase"] = "E"
protease_dict["granzyme b"] = "(?<=IEP)D"
protease_dict["hydroxylamine"] = "N(?=G)"
protease_dict["iodosobenzoic acid"] = "W"
protease_dict["lys_c"] = "K(?!P)" #Lys-C cuts after K if not followed by P.
protease_dict["lys_c/p"] = "K" #Lys-C cuts after every K
protease_dict["lys_n"] = ".K"  #Lys-N cuts before K.
protease_dict["ntcb"] = "\w(?=C)"
protease_dict["pepsin ph1.3"] = "((?<=[^HKR][^P])[^R](?=[FL][^P]))|((?<=[^HKR][^P])[FL](?=\w[^P]))"
protease_dict["pepsin ph2.0"] = "((?<=[^HKR][^P])[^R](?=[FLWY][^P]))|((?<=[^HKR][^P])[FLWY](?=\w[^P]))"
protease_dict["proline endopeptidase"] = "(?<=[HKR])P(?=[^P])"
protease_dict["proteinase k"] = "[AEFILTVWY]"
protease_dict["staphylococcal peptidase i"] = "(?<=[^E])E"
protease_dict["thermolysin"] = "[^DE](?=[AFILMV])"
protease_dict["thrombin"] = "((?<=G)R(?=G))|((?<=[AFGILTVM][AFGILTVWA]P)R(?=[^DE][^DE]))"
protease_dict["trypsin_full"] = "([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))"
protease_dict["trypsin_exception"] = "((?<=[CD])K(?=D))|((?<=C)K(?=[HY]))|((?<=C)R(?=K))|((?<=R)R(?=[HR]))"
protease_dict["non-specific"] = "()"
protease_dict["trypsin"] = "([KR](?=[^P]))"

In [16]:
#Entries in the protease_dict:
for _ in protease_dict:
    print(f"{_:<35} {protease_dict[_]}")

arg-c                               R
asp-n                               \w(?=D)
bnps-skatole                        W
caspase 1                           (?<=[FWYL]\w[HAT])D(?=[^PEDQKR])
caspase 2                           (?<=DVA)D(?=[^PEDQKR])
caspase 3                           (?<=DMQ)D(?=[^PEDQKR])
caspase 4                           (?<=LEV)D(?=[^PEDQKR])
caspase 5                           (?<=[LW]EH)D
caspase 6                           (?<=VE[HI])D(?=[^PEDQKR])
caspase 7                           (?<=DEV)D(?=[^PEDQKR])
caspase 8                           (?<=[IL]ET)D(?=[^PEDQKR])
caspase 9                           (?<=LEH)D
caspase 10                          (?<=IEA)D
chymotrypsin high specificity       ([FY](?=[^P]))|(W(?=[^MP]))
chymotrypsin low specificity        ([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))
clostripain                         R
cnbr                                M
enterokinase                        (?<=[DE]{3})K
factor xa                  

In [17]:
#| hide
def test_get_protease_dict():
    p_dict = protease_dict
    assert p_dict["trypsin"] == "([KR](?=[^P]))"
    
test_get_protease_dict()

## Losses

In [18]:
#| export
from numba.typed import Dict
loss_dict = Dict()
loss_dict[''] = 0.0
loss_dict['-H2O'] = 18.01056468346
loss_dict['-NH3'] = 17.03052

In [19]:
#Entries in the loss_dict:
for _ in loss_dict:
    print(f"{_:<5} {loss_dict[_]}")

      0.0
-H2O  18.01056468346
-NH3  17.03052


## Labels

In [20]:
#| export
from collections import namedtuple
import numpy as np
LABEL = namedtuple('label', ['mod_name', 'channels', 'masses', 'reference_channel','mods_fixed_terminal','mods_variable'])

label_dict = {}

label_dict['TMT10plex'] = LABEL('tmt6',
    ['tmt10-126',
 'tmt10-127N',
 'tmt10-127C',
 'tmt10-128N',
 'tmt10-128C',
 'tmt10-129N',
 'tmt10-129C',
 'tmt10-130N',
 'tmt10-130C',
 'tmt10-131',
 'tmt10-131C'],
np.array([126.127726,
 127.124761,
 127.131081,
 128.128116,
 128.134436,
 129.131471,
 129.13779,
 130.134825,
 130.141145,
 131.13818,
 131.144499]),
'tmt10-126',
['tmt6<^'],
['tmt6Y','tmt6K'],
   )

In [21]:
for label in label_dict:
    print(label_dict[label])

label(mod_name='tmt6', channels=['tmt10-126', 'tmt10-127N', 'tmt10-127C', 'tmt10-128N', 'tmt10-128C', 'tmt10-129N', 'tmt10-129C', 'tmt10-130N', 'tmt10-130C', 'tmt10-131', 'tmt10-131C'], masses=array([126.127726, 127.124761, 127.131081, 128.128116, 128.134436,
       129.131471, 129.13779 , 130.134825, 130.141145, 131.13818 ,
       131.144499]), reference_channel='tmt10-126', mods_fixed_terminal=['tmt6<^'], mods_variable=['tmt6Y', 'tmt6K'])


In [22]:
#|hide
import nbdev; nbdev.nbdev_export()

Converted 00_settings.ipynb.
Converted 01_chem.ipynb.
Converted 02_io.ipynb.
Converted 03_fasta.ipynb.
Converted 04_feature_finding.ipynb.
Converted 05_search.ipynb.
Converted 06_score.ipynb.
Converted 07_recalibration.ipynb.
Converted 08_quantification.ipynb.
Converted 09_matching.ipynb.
Converted 10_constants.ipynb.
Converted 11_interface.ipynb.
Converted 12_performance.ipynb.
Converted 13_export.ipynb.
Converted 14_display.ipynb.
Converted 15_label.ipynb.
Converted additional_code.ipynb.
Converted contributing.ipynb.
Converted file_formats.ipynb.
Converted index.ipynb.
