In [None]:
#default_exp constants.aa

In [None]:
#hide
# A notebook kernel defines no __file__; set it (presumably to the path of the
# exported module) so that os.path.dirname(__file__) in the export cell below
# resolves data files relative to the package directory.
__file__ = '../../alphabase/constants/aa.py'

In [None]:
#export
 
import os
import pandas as pd
import numpy as np
from typing import Union, Tuple
from alphabase.yaml_utils import load_yaml

from alphabase.constants.element import calc_formula_mass
from alphabase.constants.element import MASS_H2O

# Mapping of one-letter amino acid code -> chemical formula, read from the
# 'amino_acid.yaml' file located in the same directory as this module.
AA_CHEM = load_yaml(
    os.path.join(os.path.dirname(__file__),
    'amino_acid.yaml')
)

def reset_AA_mass():
    '''
    Build the ASCII-indexed mass lookup table from `AA_CHEM`.

    Returns:
        np.array: 1-D array of length 128. The entry at index `ord(aa)`
          is the mass computed by `calc_formula_mass` for the formula of
          `aa`; every index without an entry in `AA_CHEM` stays 0.
    '''
    mass_by_ascii = np.zeros(128)
    for residue in AA_CHEM:
        # The character code of the residue letter is its slot in the table.
        mass_by_ascii[ord(residue)] = calc_formula_mass(AA_CHEM[residue])
    return mass_by_ascii
    
# Module-level lookup table: AA_ASCII_MASS[ord(aa)] is the mass of residue `aa`.
AA_ASCII_MASS = reset_AA_mass()

def ret_set_AA_df():
    '''
    Assemble a table view of the amino acid constants.

    Returns:
        pd.DataFrame: one row per index of `AA_ASCII_MASS`, with columns
          'aa' (the character for that index), 'formula' (the formula from
          `AA_CHEM`, or '' when the character has no entry) and 'mass'
          (the corresponding value of `AA_ASCII_MASS`).
    '''
    n_codes = len(AA_ASCII_MASS)
    # Start with empty formulas everywhere, then fill in the known residues.
    formula_column = [''] * n_codes
    for residue, chem_formula in AA_CHEM.items():
        formula_column[ord(residue)] = chem_formula
    return pd.DataFrame({
        'aa': [chr(code) for code in range(n_codes)],
        'formula': formula_column,
        'mass': AA_ASCII_MASS,
    })
# Module-level DataFrame with the 'aa', 'formula' and 'mass' columns built above.
AA_DF = ret_set_AA_df()


In [None]:
# Show the rows for the uppercase letters; .loc slices by label, so both
# ord('A') and ord('Z') are included.
AA_DF.loc[ord('A'):ord('Z'),:]

Unnamed: 0,aa,formula,mass
65,A,C(3)H(5)N(1)O(1)S(0),71.03711
66,B,C(0),0.0
67,C,C(3)H(5)N(1)O(1)S(1),103.00918
68,D,C(4)H(5)N(1)O(3)S(0),115.026938
69,E,C(5)H(7)N(1)O(3)S(0),129.042588
70,F,C(9)H(9)N(1)O(1)S(0),147.068409
71,G,C(2)H(3)N(1)O(1)S(0),57.021461
72,H,C(6)H(7)N(3)O(1)S(0),137.058906
73,I,C(6)H(11)N(1)O(1)S(0),113.084058
74,J,C(4)H(6)N(2)O(2)S(0),114.042922


### `get_sequence_mass` returns the mass of each amino acid in a sequence
The key step is `np.array(sequence, 'c').view(np.int8)`, which converts a string into an array of ASCII codes that can directly index `AA_ASCII_MASS`.

In [None]:
#export
def get_sequence_mass(
    sequence: str
)->np.array:
    '''
    Look up the mass of every residue of one sequence.

    Args:
        sequence (str): unmodified peptide sequence
    Returns:
        np.array: masses of each amino acid.
    '''
    # Split the string into single characters and reinterpret each one as
    # its ASCII code, which is the index into the mass table.
    ascii_codes = np.array(sequence, 'c').view(np.int8)
    return AA_ASCII_MASS[ascii_codes]

In [None]:
#export
def get_same_len_sequences_mass(
    sequence_array: np.array
)->np.array:
    '''
    Look up the residue masses of many equal-length sequences at once.

    The sequences are packed into a fixed-width unicode array whose
    characters are reinterpreted as int32 code points; those code points
    index `AA_ASCII_MASS`.

    Args:
        sequence_array (np.array): unmodified sequences with the same length.
          Lists and object-dtype arrays of str are accepted as well.
    Returns:
        np.array: 2-D (array_size, sequence_len) array of masses.
          An empty input gives an array of shape (0, 0).
    Raises:
        ValueError: if sequences are not with the same length.
    '''
    # Force a fixed-width unicode dtype so that `.view(np.int32)` is valid
    # even when the caller passes an object-dtype array.
    seqs = np.array(sequence_array, dtype='U')
    if seqs.size == 0:
        return np.zeros((0, 0))

    # numpy pads shorter strings with null characters, so ragged input would
    # otherwise be reshaped silently into a wrong-looking mass matrix.
    seq_lens = np.char.str_len(seqs)
    seq_len = int(seq_lens.flat[0])
    if (seq_lens != seq_len).any():
        raise ValueError(
            'All sequences must have the same length, got lengths from '
            f'{int(seq_lens.min())} to {int(seq_lens.max())}.'
        )
    if seq_len == 0:
        # Only empty strings: no residues, hence no mass columns.
        return np.zeros((seqs.size, 0))

    return AA_ASCII_MASS[
        seqs.view(np.int32)
    ].reshape(-1, seq_len)

### For a single sequence

In [None]:
# One mass per residue of a single unmodified sequence.
get_sequence_mass('ACDEFGNYK')

array([ 71.0371103, 103.0091803, 115.0269385, 129.0425877, 147.0684087,
        57.0214611, 114.0429222, 163.0633228, 128.0949557])

### For sequences with the same length

In [None]:
# Three sequences of equal length -> one row of residue masses per sequence.
get_same_len_sequences_mass(np.array(['ACDEFGHIK','BCDEFGHIK','CCDEFGHIK']))

array([[ 71.0371103, 103.0091803, 115.0269385, 129.0425877, 147.0684087,
         57.0214611, 137.0589059, 113.0840579, 128.0949557],
       [  0.       , 103.0091803, 115.0269385, 129.0425877, 147.0684087,
         57.0214611, 137.0589059, 113.0840579, 128.0949557],
       [103.0091803, 103.0091803, 115.0269385, 129.0425877, 147.0684087,
         57.0214611, 137.0589059, 113.0840579, 128.0949557]])

### It is very easy to generate b/y ion masses from a sequence or a list of sequences with the same length

In [None]:
# Residue masses, one row per sequence.
aa_masses = get_same_len_sequences_mass(['ACDEFGHIK','BCDEFGHIK','CCDEFGHIK'])
# Running sum along each sequence: all columns but the last are the b masses,
# the last column is the sum over all residues.
cumulative_masses = aa_masses.cumsum(axis=1)
b_masses = cumulative_masses[:, :-1]
# Peptide mass = sum of residue masses plus one water.
pepmass = cumulative_masses[:, -1:] + MASS_H2O
{'pepmass':pepmass, 'b masses':b_masses, 'y masses':pepmass-b_masses}

{'pepmass': array([[1018.4541694],
        [ 947.4170591],
        [1050.4262394]]),
 'b masses': array([[ 71.0371103, 174.0462906, 289.0732291, 418.1158168, 565.1842255,
         622.2056866, 759.2645925, 872.3486504],
        [  0.       , 103.0091803, 218.0361188, 347.0787065, 494.1471152,
         551.1685763, 688.2274822, 801.3115401],
        [103.0091803, 206.0183606, 321.0452991, 450.0878868, 597.1562955,
         654.1777566, 791.2366625, 904.3207204]]),
 'y masses': array([[947.4170591, 844.4078788, 729.3809403, 600.3383526, 453.2699439,
         396.2484828, 259.1895769, 146.105519 ],
        [947.4170591, 844.4078788, 729.3809403, 600.3383526, 453.2699439,
         396.2484828, 259.1895769, 146.105519 ],
        [947.4170591, 844.4078788, 729.3809403, 600.3383526, 453.2699439,
         396.2484828, 259.1895769, 146.105519 ]])}

In [None]:
#export
def get_sequence_array_mass(
    sequence_array: np.array
)->Tuple[np.array, np.array]:
    '''
    Look up residue masses for sequences of arbitrary lengths.

    We recommend using `get_same_len_sequences_mass` when all sequences
    share one length, as it is much faster.

    Args:
        sequence_array (np.array): sequences with arbitrary lengths.
    Returns:
        np.array: 1D array of masses of all residues, sequence after sequence.
          Residues whose mass in `AA_ASCII_MASS` is 0 are kept, so the
          array stays aligned with the offsets.
        np.array: 1D integer array of offsets with `len(sequence_array)+1`
          entries. Owing to arbitrary lengths, we need an offsets array
          for slicing: the masses of sequence `i` are
          `masses[offsets[i]:offsets[i+1]]`.
    '''
    # Fixed-width unicode array; shorter sequences are padded with null
    # characters, i.e. with code point 0.
    seqs = np.array(sequence_array, dtype='U')
    codes = seqs.view(np.int32).reshape(-1)
    # Drop the padding by code point rather than by mass, so that real
    # residues with a mass of 0 are not removed together with the padding.
    masses = AA_ASCII_MASS[codes[codes != 0]]
    # Integer dtype so the offsets can be used directly as slice bounds.
    offsets = np.zeros(seqs.size + 1, dtype=np.int64)
    offsets[1:] = np.cumsum(np.char.str_len(seqs))
    return masses, offsets

In [None]:
# Sequences of different lengths: `masses` is one flat array for all residues,
# `offsets` marks where each sequence starts and ends inside it.
masses, offsets = get_sequence_array_mass(['CCACDEFGHIK','AAAGCDEFGHIK','DDDDCCDEFGHIK'])
masses, offsets

(array([103.0091803, 103.0091803,  71.0371103, 103.0091803, 115.0269385,
        129.0425877, 147.0684087,  57.0214611, 137.0589059, 113.0840579,
        128.0949557,  71.0371103,  71.0371103,  71.0371103,  57.0214611,
        103.0091803, 115.0269385, 129.0425877, 147.0684087,  57.0214611,
        137.0589059, 113.0840579, 128.0949557, 115.0269385, 115.0269385,
        115.0269385, 115.0269385, 103.0091803, 103.0091803, 115.0269385,
        129.0425877, 147.0684087,  57.0214611, 137.0589059, 113.0840579,
        128.0949557]),
 array([ 0., 11., 23., 36.]))