In [None]:
#default_exp constants.aa

In [None]:
#hide
import alphabase.constants.aa
__file__ = alphabase.constants.aa.__file__

# Amino Acid in the AlphaX Ecosystem

All amino acids are stored as chemical formulas in `alphabase/constants/amino_acid.yaml`. We load the yaml file into the `AA_CHEM` dict.

In [None]:
#export
 
import os
import pandas as pd
import numpy as np
from typing import Union, Tuple
from alphabase.yaml_utils import load_yaml

from alphabase.constants.element import calc_mass_from_formula
from alphabase.constants.element import MASS_H2O

# Amino acid -> chemical formula mapping, read from the yaml file located
# in the same directory as this module.
AA_CHEM = load_yaml(
    os.path.join(
        os.path.dirname(__file__), 'amino_acid.yaml'
    )
)

We use all 128 ASCII codes to represent amino acids, which leaves room for flexible extensions in the future.

The amino acid masses are stored in the 128-length array `AA_ASCII_MASS`. If an ASCII code is not in `AA_CHEM`, its mass is set to `1e8` to disable it for MS search.

We also provide an AA table (the `AA_DF` dataframe) for users.

In [None]:
#export

def reset_AA_mass():
    '''
    Build the ASCII-indexed amino acid mass lookup table from `AA_CHEM`.
    Returns:
        np.array: 128-length float array. Element `ord(aa)` holds the mass
        returned by `calc_mass_from_formula` for every `aa` in `AA_CHEM`;
        all other elements are `1e8`.
    '''
    # Start with every ASCII code set to the 1e8 placeholder mass.
    ascii_masses = np.full(128, 1e8)
    for residue, formula in AA_CHEM.items():
        ascii_masses[ord(residue)] = calc_mass_from_formula(formula)
    return ascii_masses

AA_ASCII_MASS = reset_AA_mass()

def ret_set_AA_df():
    '''
    Build the amino acid table from `AA_CHEM` and `AA_ASCII_MASS`.
    Returns:
        pd.DataFrame: one row per ASCII code (row label == ASCII code) with
        columns `aa` (the character), `formula` (the `AA_CHEM` formula, or
        an empty string if the character is not in `AA_CHEM`) and `mass`
        (the matching `AA_ASCII_MASS` value).
    '''
    n_codes = len(AA_ASCII_MASS)
    # Formula per ASCII code; codes without an AA_CHEM entry keep ''.
    formula_by_code = [''] * n_codes
    for residue, formula in AA_CHEM.items():
        formula_by_code[ord(residue)] = formula
    return pd.DataFrame({
        'aa': [chr(code) for code in range(n_codes)],
        'formula': formula_by_code,
        'mass': AA_ASCII_MASS,
    })
AA_DF = ret_set_AA_df()


In [None]:
# Rows for the upper-case letters 'A'..'Z' (`.loc` label slices include both ends).
AA_DF.loc[ord('A'):ord('Z')]

Unnamed: 0,aa,formula,mass
65,A,C(3)H(5)N(1)O(1)S(0),71.037114
66,B,C(10000),120000.0
67,C,C(3)H(5)N(1)O(1)S(1),103.009185
68,D,C(4)H(5)N(1)O(3)S(0),115.026943
69,E,C(5)H(7)N(1)O(3)S(0),129.042593
70,F,C(9)H(9)N(1)O(1)S(0),147.068414
71,G,C(2)H(3)N(1)O(1)S(0),57.021464
72,H,C(6)H(7)N(3)O(1)S(0),137.058912
73,I,C(6)H(11)N(1)O(1)S(0),113.084064
74,J,C(6)H(11)N(1)O(1)S(0),113.084064


### `calc_sequence_mass` can easily get the mass list of each amino acid. 
The key is: `np.array(sequence, 'c').view(np.int8)` converts a string into an ASCII code array

Note that this function is rarely used in alphabase as it is not fast for a set of peptides.

In [None]:
#export
def calc_sequence_mass(
    sequence: str
)->np.array:
    '''
    Look up the mass of every amino acid in a single sequence.
    Args:
        sequence (str): unmodified peptide sequence
    Returns:
        np.array: masses of each amino acid, in sequence order.
    '''
    # The 'c' dtype splits the string into single-byte characters; viewing
    # them as int8 yields the ASCII codes used to index `AA_ASCII_MASS`.
    aa_codes = np.array(sequence, 'c').view(np.int8)
    return AA_ASCII_MASS[aa_codes]

We provide the `calc_AA_masses_for_same_len_seqs()` and `calc_sequence_masses_for_same_len_seqs()` functions to quickly calculate masses for a given array of AA sequences with the same length. They are fast because they both use `slicing` and `reshape` operations based on the `AA_ASCII_MASS` array.

In [None]:
#export
def calc_AA_masses_for_same_len_seqs(
    sequence_array: np.array
)->np.array:
    '''
    Calculate AA masses for the array of same-len AA sequences.
    Args:
        sequence_array (np.array): unmodified sequences with the same length.
            Any array-like of str is accepted (unicode array, object-dtype
            array, list, ...); it is converted to a fixed-width unicode array.
    Returns:
        np.array: 2-D (array_size, sequence_len) array of masses.
        An empty input gives an empty 2-D array with zero columns.
    Note:
        Sequences with different lengths do NOT raise: numpy right-pads the
        shorter ones with NUL characters, so the padded positions receive
        `AA_ASCII_MASS[0]`.
    Raise:
        IndexError: if a character code is not a valid index
            into `AA_ASCII_MASS`.
    '''
    # Force a fixed-width unicode dtype: the int32 view below is only
    # meaningful for 'U' arrays and fails on object-dtype input.
    seqs = np.array(sequence_array, dtype='U')
    if seqs.size == 0:
        # `reshape(0, -1)` cannot infer the second dimension of an empty array.
        return np.empty((len(seqs), 0), dtype=AA_ASCII_MASS.dtype)
    return AA_ASCII_MASS[
        # we use np.int32 here because unicode str
        # uses 4 bytes for a char.
        seqs.view(np.int32)
    ].reshape(len(seqs), -1)

def calc_sequence_masses_for_same_len_seqs(
    sequence_array: np.array
)->np.array:
    '''
    Calculate the total mass of each sequence in an array of
    same-len AA sequences.
    Args:
        sequence_array (np.array): unmodified sequences with the same length.
    Returns:
        np.array: 1-D (array_size,) array with one mass per sequence: the
        sum of its AA masses plus `MASS_H2O`.
    '''
    aa_masses = calc_AA_masses_for_same_len_seqs(sequence_array)
    # Sum along the sequence axis, then add `MASS_H2O` once per sequence.
    return aa_masses.sum(axis=1) + MASS_H2O


### For a single sequence

In [None]:
# One mass per residue of 'ACDEFGNYK', in sequence order.
assert np.allclose(
    calc_sequence_mass('ACDEFGNYK'),
    [
        71.03711379, 103.00918496, 115.02694302,
        129.04259309, 147.06841391, 57.02146372,
        114.04292744, 163.06332853, 128.09496302,
    ],
)

### For sequences with the same length

In [None]:
# The three sequences differ only in their first residue (A / B / C),
# so the expected rows differ only in their first column.
assert np.allclose(
    calc_AA_masses_for_same_len_seqs(
        np.array(['ACDEFGHIK', 'BCDEFGHIK', 'CCDEFGHIK'])
    ),
    np.array([
        [
            71.03711379, 103.00918496, 115.02694302,
            129.04259309, 147.06841391, 57.02146372,
            137.05891186, 113.08406398, 128.09496302,
        ],
        [
            120000, 103.00918496, 115.02694302,
            129.04259309, 147.06841391, 57.02146372,
            137.05891186, 113.08406398, 128.09496302,
        ],
        [
            103.00918496, 103.00918496, 115.02694302,
            129.04259309, 147.06841391, 57.02146372,
            137.05891186, 113.08406398, 128.09496302,
        ],
    ]),
)

In [None]:
# One total mass (sum of AA masses + MASS_H2O) per sequence.
assert np.allclose(
    calc_sequence_masses_for_same_len_seqs(
        np.array(['ACDEFGHIK', 'BCDEFGHIK', 'CCDEFGHIK'])
    ),
    [1018.45421603, 120947.41710224, 1050.4262872],
)

### It is very easy to generate b/y ions from a sequence or a list of sequences with same length

In [None]:
aa_masses = calc_AA_masses_for_same_len_seqs(
    ['ACDEFGHIK', 'BCDEFGHIK', 'CCDEFGHIK']
)
# Running sum along each sequence: column i is the summed mass of residues 0..i.
b_masses = np.cumsum(aa_masses, axis=1)
# The last column is the full residue sum; adding MASS_H2O gives the peptide mass.
pepmass = b_masses[:, -1:] + MASS_H2O
# All remaining columns are the b masses.
b_masses = b_masses[:, :-1]
{
    'pepmass': pepmass,
    'b masses': b_masses,
    'y masses': pepmass - b_masses,
}

{'pepmass': array([[  1018.45421603],
        [120947.41710224],
        [  1050.4262872 ]]),
 'b masses': array([[7.10371138e+01, 1.74046299e+02, 2.89073242e+02, 4.18115835e+02,
         5.65184249e+02, 6.22205712e+02, 7.59264624e+02, 8.72348688e+02],
        [1.20000000e+05, 1.20103009e+05, 1.20218036e+05, 1.20347079e+05,
         1.20494147e+05, 1.20551169e+05, 1.20688228e+05, 1.20801312e+05],
        [1.03009185e+02, 2.06018370e+02, 3.21045313e+02, 4.50087906e+02,
         5.97156320e+02, 6.54177784e+02, 7.91236696e+02, 9.04320760e+02]]),
 'y masses': array([[947.41710224, 844.40791728, 729.38097426, 600.33838117,
         453.26996726, 396.24850354, 259.18959168, 146.1055277 ],
        [947.41710224, 844.40791728, 729.38097426, 600.33838117,
         453.26996726, 396.24850354, 259.18959168, 146.1055277 ],
        [947.41710224, 844.40791728, 729.38097426, 600.33838117,
         453.26996726, 396.24850354, 259.18959168, 146.1055277 ]])}

`calc_AA_masses_for_var_len_seqs` is rarely used in alphabase.

In [None]:
#export
def calc_AA_masses_for_var_len_seqs(
    sequence_array: np.array
)->np.array:
    '''
    We recommend to use `calc_AA_masses_for_same_len_seqs` as it is much faster.
    Args:
        sequence_array (np.array): sequences with variable lengths.
            Any array-like of str is accepted (unicode array, object-dtype
            array, list, ...); it is converted to a fixed-width unicode array.
    Returns:
        np.array: 2-D (array_size, max_sequence_len) array of masses.
        Sequences shorter than the longest one are right-padded by numpy
        with NUL characters, so the padded positions hold
        `AA_ASCII_MASS[0]` (not zero).
        An empty input gives an empty 2-D array with zero columns.
    Raise:
        IndexError: if a character code is not a valid index
            into `AA_ASCII_MASS`.
    '''
    # Force a fixed-width unicode dtype: the int32 view below is only
    # meaningful for 'U' arrays and fails on object-dtype input.
    seqs = np.array(sequence_array, dtype='U')
    if seqs.size == 0:
        # `reshape(0, -1)` cannot infer the second dimension of an empty array.
        return np.empty((len(seqs), 0), dtype=AA_ASCII_MASS.dtype)
    return AA_ASCII_MASS[
        # np.int32 because a unicode str uses 4 bytes per char.
        seqs.view(np.int32)
    ].reshape(len(seqs), -1)

In [None]:
# Rows shorter than the longest sequence are filled up on the right.
masses = calc_AA_masses_for_var_len_seqs(
    ['EFGHIK', 'AAAGCDEFGHIK', 'DDDDCCDEFGHIK']
)
masses

array([[1.29042593e+02, 1.47068414e+02, 5.70214637e+01, 1.37058912e+02,
        1.13084064e+02, 1.28094963e+02, 1.00000000e+08, 1.00000000e+08,
        1.00000000e+08, 1.00000000e+08, 1.00000000e+08, 1.00000000e+08,
        1.00000000e+08],
       [7.10371138e+01, 7.10371138e+01, 7.10371138e+01, 5.70214637e+01,
        1.03009185e+02, 1.15026943e+02, 1.29042593e+02, 1.47068414e+02,
        5.70214637e+01, 1.37058912e+02, 1.13084064e+02, 1.28094963e+02,
        1.00000000e+08],
       [1.15026943e+02, 1.15026943e+02, 1.15026943e+02, 1.15026943e+02,
        1.03009185e+02, 1.03009185e+02, 1.15026943e+02, 1.29042593e+02,
        1.47068414e+02, 5.70214637e+01, 1.37058912e+02, 1.13084064e+02,
        1.28094963e+02]])