In [None]:
#| default_exp constants.aa

# Amino acid information

`alphabase.constants.aa`

In [None]:
#| export
 
import os
import pandas as pd
import numpy as np

from typing import Union, Tuple

from alphabase.yaml_utils import load_yaml

from alphabase.constants.element import (
    calc_mass_from_formula, 
    MASS_H2O, parse_formula,
)

from alphabase.constants._const import CONST_FILE_FOLDER

In [None]:
#| export

#: Amino acid chemistry loaded from `amino_acid.yaml` in `CONST_FILE_FOLDER`.
#: Used in this module as a mapping from a one-character AA code to its formula.
AA_CHEM:dict = load_yaml(
    os.path.join(CONST_FILE_FOLDER, 'amino_acid.yaml')
)

We use all 128 ASCII codes to represent amino acids, which leaves room for flexible extensions in the future.

The amino acid masses are stored in the 128-length array `AA_ASCII_MASS`. If an ASCII code is not in `AA_CHEM`, its mass is set to `1e8` to disable it for MS search.

We also provide an AA table (the `AA_DF` dataframe) for users.

In [None]:
#| export

def reset_AA_mass()->np.ndarray:
    """
    Build the ASCII-indexed amino acid mass table from `AA_CHEM`.

    Returns
    -------
    np.ndarray
        Array with shape (128,). Entry `ord(aa)` holds the mass computed
        from the formula of `aa`; every code without an entry in
        `AA_CHEM` keeps the placeholder mass `1e8`.
    """
    # Start with the placeholder everywhere, then overwrite the defined AAs
    ascii_masses = np.full(128, 1e8)
    for aa_char, aa_formula in AA_CHEM.items():
        ascii_masses[ord(aa_char)] = calc_mass_from_formula(aa_formula)
    return ascii_masses

#: AA mass array indexed by ASCII code: the mass of 'A' is AA_ASCII_MASS[ord('A')].
#: Built by `reset_AA_mass()` when this cell/module is executed.
AA_ASCII_MASS:np.ndarray = reset_AA_mass()

def reset_AA_df():
    """
    Build the amino acid table from `AA_CHEM` and `AA_ASCII_MASS`.

    Returns
    -------
    pd.DataFrame
        One row per ASCII code (the row index equals the code) with the
        columns `aa` (the character), `formula` (the `AA_CHEM` entry,
        or '' when there is none) and `mass` (the `AA_ASCII_MASS` value).
    """
    n_codes = len(AA_ASCII_MASS)
    # Codes without an `AA_CHEM` entry keep an empty formula
    code_formulas = [''] * n_codes
    for aa_char, aa_formula in AA_CHEM.items():
        code_formulas[ord(aa_char)] = aa_formula
    return pd.DataFrame({
        'aa': [chr(code) for code in range(n_codes)],
        'formula': code_formulas,
        'mass': AA_ASCII_MASS,
    })

#: 128-row AA dataframe with the columns `aa`, `formula` and `mass`
AA_DF:pd.DataFrame = reset_AA_df()

#: Nested dict from AA character to its parsed formula.
#: For example: {'K': {'C': n, 'O': m, ...}}
AA_formula:dict = {}
# `mass` is unpacked only because each row carries all three columns
for aa, formula, mass in AA_DF.itertuples(index=False, name=None):
    AA_formula[aa] = dict(parse_formula(formula))

In [None]:
# Preview the rows whose index is the ASCII code of 'A' through 'Z'
AA_DF.loc[ord('A'):ord('Z'),:]

Unnamed: 0,aa,formula,mass
65,A,C(3)H(5)N(1)O(1)S(0),71.03711
66,B,C(1000000),12000000.0
67,C,C(3)H(5)N(1)O(1)S(1),103.0092
68,D,C(4)H(5)N(1)O(3)S(0),115.0269
69,E,C(5)H(7)N(1)O(3)S(0),129.0426
70,F,C(9)H(9)N(1)O(1)S(0),147.0684
71,G,C(2)H(3)N(1)O(1)S(0),57.02146
72,H,C(6)H(7)N(3)O(1)S(0),137.0589
73,I,C(6)H(11)N(1)O(1)S(0),113.0841
74,J,C(6)H(11)N(1)O(1)S(0),113.0841


`calc_sequence_mass` returns the mass of each amino acid in a sequence. 
The key step is `np.array(sequence, 'c').view(np.int8)`, which converts a string into an array of ASCII codes.

Note that this function is rarely used in alphabase as it is not fast for a set of peptides.

In [None]:
#| export
def calc_sequence_mass(
    sequence: str
)->np.ndarray:
    '''
    Parameters
    ----------
    sequence : str
        Unmodified peptide sequence

    Returns
    -------
    np.ndarray
        Masses of each amino acid.
    '''
    return AA_ASCII_MASS[np.array(sequence,'c').view(np.int8)]

We provide the `calc_AA_masses_for_same_len_seqs()` and `calc_sequence_masses_for_same_len_seqs()` functions to quickly calculate masses for a given array of AA sequences with the same length. They are fast because they both use `slicing` and `reshape` operations based on the `AA_ASCII_MASS` array.

In [None]:
#| export
def calc_AA_masses_for_same_len_seqs(
    sequence_array: np.ndarray
)->np.ndarray:
    '''
    Calculate AA masses for the array of same-len AA sequences.

    Parameters
    ----------
    sequence_array : np.ndarray
        Unmodified sequences with the same length. A list of `str` or an
        array with unicode (`U`), bytes (`S`) or object dtype is accepted.

    Returns
    -------
    np.ndarray
        2-D (array_size, sequence_len) array of masses. An empty input
        gives an array with shape (0, 0).

    Notes
    -----
    No error is raised when the sequences differ in length: numpy pads
    the shorter strings with null characters, so every padded position
    gets the value of `AA_ASCII_MASS[0]`.
    '''
    # np.array (not asarray) keeps the data contiguous, which `.view`
    # needs when the item size changes.
    seqs = np.array(sequence_array)
    if seqs.dtype.kind in ('O', 'S'):
        # object/bytes arrays (e.g. from a pandas column) cannot be viewed
        # as 4-byte code points, so convert them to fixed-width unicode.
        seqs = seqs.astype('U')
    if seqs.size == 0:
        # reshape(0, -1) cannot infer the second dimension of an empty array
        return np.zeros((len(sequence_array), 0))
    return AA_ASCII_MASS[
        # we use np.int32 here because unicode str
        # uses 4 bytes for a char.
        seqs.view(np.int32)
    ].reshape(len(sequence_array), -1)

def calc_sequence_masses_for_same_len_seqs(
    sequence_array: np.ndarray
)->np.ndarray:
    '''
    Calculate sequence masses for the array of same-len AA sequences.

    Parameters
    ----------
    sequence_array : np.ndarray
        unmodified sequences with the same length.

    Returns
    -------
    np.ndarray
        1-D (array_size,) array with one mass per sequence: the sum of
        its AA masses plus `MASS_H2O`.
    '''
    aa_masses = calc_AA_masses_for_same_len_seqs(sequence_array)
    # Sum the residues of each sequence (row), then add one water
    return aa_masses.sum(axis=1) + MASS_H2O


For a single sequence

In [None]:
# Expected masses for the residues A, C, D, E, F, G, N, Y, K, in sequence order
expected_aa_masses = [
    71.03711379, 103.00918496, 115.02694302, 129.04259309,
    147.06841391,  57.02146372, 114.04292744, 163.06332853,
    128.09496302,
]
assert np.allclose(calc_sequence_mass('ACDEFGNYK'), expected_aa_masses)

For sequences with the same length

In [None]:
#| hide
same_len_seqs = np.array(['ACDEFGHIK','BCDEFGHIK','CCDEFGHIK'])
# The three sequences differ only in their first residue (A, B, C),
# so the expected rows share the masses of the common tail CDEFGHIK.
shared_tail_masses = [
    103.00918496, 115.02694302, 129.04259309, 147.06841391,
    57.02146372, 137.05891186, 113.08406398, 128.09496302,
]
first_residue_masses = [71.03711379, 12000000, 103.00918496]
expected_aa_masses = np.array([
    [first_mass] + shared_tail_masses
    for first_mass in first_residue_masses
])
assert np.allclose(
    calc_AA_masses_for_same_len_seqs(same_len_seqs),
    expected_aa_masses
)

In [None]:
#| hide
same_len_seqs = np.array(['ACDEFGHIK','BCDEFGHIK','CCDEFGHIK'])
# One expected peptide mass per input sequence
expected_pep_masses = [1018.45421603, 12000947.41710224, 1050.4262872]
assert np.allclose(
    calc_sequence_masses_for_same_len_seqs(same_len_seqs),
    expected_pep_masses
)

It is very easy to generate b/y ions from a sequence or a list of sequences with the same length

In [None]:
same_len_seqs = ['ACDEFGHIK','BCDEFGHIK','CCDEFGHIK']
aa_masses = calc_AA_masses_for_same_len_seqs(same_len_seqs)
# Running sum along each sequence: column i is the summed mass of residues 0..i
cum_masses = np.cumsum(aa_masses, axis=1)
# Every column except the last one is a b-fragment sum
b_masses = cum_masses[:, :-1]
# The last column is the sum of all residues; adding water gives the peptide mass
pepmass = cum_masses[:, -1:] + MASS_H2O
{'pepmass':pepmass, 'b masses':b_masses, 'y masses':pepmass-b_masses}

{'pepmass': array([[1.01845422e+03],
        [1.20009474e+07],
        [1.05042629e+03]]),
 'b masses': array([[7.10371138e+01, 1.74046299e+02, 2.89073242e+02, 4.18115835e+02,
         5.65184249e+02, 6.22205712e+02, 7.59264624e+02, 8.72348688e+02],
        [1.20000000e+07, 1.20001030e+07, 1.20002180e+07, 1.20003471e+07,
         1.20004941e+07, 1.20005512e+07, 1.20006882e+07, 1.20008013e+07],
        [1.03009185e+02, 2.06018370e+02, 3.21045313e+02, 4.50087906e+02,
         5.97156320e+02, 6.54177784e+02, 7.91236696e+02, 9.04320760e+02]]),
 'y masses': array([[947.41710224, 844.40791728, 729.38097426, 600.33838117,
         453.26996726, 396.24850354, 259.18959168, 146.1055277 ],
        [947.41710224, 844.40791728, 729.38097426, 600.33838117,
         453.26996726, 396.24850354, 259.18959168, 146.1055277 ],
        [947.41710224, 844.40791728, 729.38097426, 600.33838117,
         453.26996726, 396.24850354, 259.18959168, 146.1055277 ]])}

`calc_AA_masses_for_var_len_seqs` is rarely used in alphabase.

In [None]:
#| export
def calc_AA_masses_for_var_len_seqs(
    sequence_array: np.ndarray
)->np.ndarray:
    '''
    Calculate AA masses for an array of AA sequences with variable lengths.

    We recommend to use `calc_AA_masses_for_same_len_seqs` as it is much faster.

    Parameters
    ----------
    sequence_array : np.ndarray
        Sequences with variable lengths. A list of `str` or an array with
        unicode (`U`), bytes (`S`) or object dtype is accepted.

    Returns
    -------
    np.ndarray
        2-D (array_size, max_sequence_len) array of masses. Positions
        after the end of a shorter sequence hold `AA_ASCII_MASS[0]`,
        because numpy pads shorter strings with null characters.
        An empty input gives an array with shape (0, 0).
    '''
    # np.array (not asarray) keeps the data contiguous, which `.view`
    # needs when the item size changes.
    seqs = np.array(sequence_array)
    if seqs.dtype.kind in ('O', 'S'):
        # object/bytes arrays (e.g. from a pandas column) cannot be viewed
        # as 4-byte code points, so convert them to fixed-width unicode.
        seqs = seqs.astype('U')
    if seqs.size == 0:
        # reshape(0, -1) cannot infer the second dimension of an empty array
        return np.zeros((len(sequence_array), 0))
    # unicode str uses 4 bytes for a char, hence the int32 view
    return AA_ASCII_MASS[
        seqs.view(np.int32)
    ].reshape(len(sequence_array), -1)

In [None]:
# Sequences of different lengths: every row is as wide as the longest sequence
masses = calc_AA_masses_for_var_len_seqs(['EFGHIK','AAAGCDEFGHIK','DDDDCCDEFGHIK'])
masses

array([[1.29042593e+02, 1.47068414e+02, 5.70214637e+01, 1.37058912e+02,
        1.13084064e+02, 1.28094963e+02, 1.00000000e+08, 1.00000000e+08,
        1.00000000e+08, 1.00000000e+08, 1.00000000e+08, 1.00000000e+08,
        1.00000000e+08],
       [7.10371138e+01, 7.10371138e+01, 7.10371138e+01, 5.70214637e+01,
        1.03009185e+02, 1.15026943e+02, 1.29042593e+02, 1.47068414e+02,
        5.70214637e+01, 1.37058912e+02, 1.13084064e+02, 1.28094963e+02,
        1.00000000e+08],
       [1.15026943e+02, 1.15026943e+02, 1.15026943e+02, 1.15026943e+02,
        1.03009185e+02, 1.03009185e+02, 1.15026943e+02, 1.29042593e+02,
        1.47068414e+02, 5.70214637e+01, 1.37058912e+02, 1.13084064e+02,
        1.28094963e+02]])