In [None]:
#---#| default_exp peptide.precursor

# Precursor Functionalities

In [None]:
import numpy as np
import pandas as pd

from alphabase.constants.atom import MASS_ISOTOPE
from alphabase.peptide.precursor import calc_precursor_isotope_info, hash_precursor_df, get_mod_seq_hash, \
    get_mod_seq_charge_hash, calc_precursor_isotope_intensity, calc_precursor_isotope_intensity_mp

### Testing

In [None]:
# Test fixture: each distinct peptide is listed `repeat` times so that later
# cells can check duplicate detection. The modified peptide comes first.
repeat = 2
peptides = ['AGHCEWQMKAADER']*repeat + ['AGHCEWQMK']*repeat
mods = ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat + ['']*repeat
sites = ['0;4;8']*repeat + ['']*repeat

# `nAA` is the sequence length; every precursor gets charge 2.
precursor_df = pd.DataFrame(
    {'sequence': peptides, 'mods': mods, 'mod_sites': sites}
).assign(
    nAA=lambda frame: frame['sequence'].str.len(),
    charge=2,
)

In [None]:
#| hide
# Compute precursor m/z and the isotope summary columns on `precursor_df`,
# then check them against reference values.
calc_precursor_isotope_info(precursor_df)
assert np.allclose(
    precursor_df.precursor_mz.values, 
    [873.86977, 873.86977,545.233862, 545.233862],
    atol=1e-4
),precursor_df.precursor_mz.values

# An isotope peak `offset` neutrons above the monoisotopic peak is expected at
#   precursor_mz + MASS_ISOTOPE * offset / charge
# Read the charge from the row under test instead of hard-coding it.
test_row = 1
test_charge = precursor_df.loc[test_row,'charge']
test_mono_mz = precursor_df.loc[test_row,'precursor_mz']

# M+1 peak: offset is 1 by definition.
assert abs(
    precursor_df.loc[test_row,'isotope_m1_mz']-test_mono_mz-MASS_ISOTOPE/test_charge
)<=1e-5

# Apex peak. The previous form `a - b - MASS_ISOTOPE/offset if offset > 0 else 0`
# parsed as `(a - b - MASS_ISOTOPE/offset) if offset > 0 else 0`, so the check
# collapsed to `abs(0) <= 1e-5` whenever the offset was 0, and it divided by
# the offset instead of scaling by offset/charge. The expression below is valid
# for any offset, including 0 (no division by the offset).
assert abs(
    precursor_df.loc[test_row,'isotope_apex_mz']-test_mono_mz
    -MASS_ISOTOPE*precursor_df.loc[test_row,'isotope_apex_offset']/test_charge
)<=1e-5

# Right-most peak: its offset is non-zero for this peptide, so this exercises
# the same relation non-trivially.
assert abs(
    precursor_df.loc[test_row,'isotope_right_most_mz']-test_mono_mz
    -MASS_ISOTOPE*precursor_df.loc[test_row,'isotope_right_most_offset']/test_charge
)<=1e-5
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,isotope_m1_intensity,isotope_apex_intensity,isotope_apex_offset,isotope_right_most_intensity,isotope_right_most_offset,isotope_m1_mz,isotope_apex_mz,isotope_right_most_mz
0,AGHCEWQMKAADER,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869771,0.888952,1.0,0,0.235889,3,874.371421,873.869771,875.374721
1,AGHCEWQMKAADER,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869771,0.888952,1.0,0,0.235889,3,874.371421,873.869771,875.374721
2,AGHCEWQMK,,,9,2,545.233862,0.576623,1.0,0,0.277542,2,545.735512,545.233862,546.237162
3,AGHCEWQMK,,,9,2,545.233862,0.576623,1.0,0,0.277542,2,545.735512,545.233862,546.237162


In [None]:
# Adds the `mod_seq_hash` and `mod_seq_charge_hash` columns to `precursor_df`;
# as the last expression of the cell, the result is rendered below.
hash_precursor_df(precursor_df)

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,isotope_m1_intensity,isotope_apex_intensity,isotope_apex_offset,isotope_right_most_intensity,isotope_right_most_offset,isotope_m1_mz,isotope_apex_mz,isotope_right_most_mz,mod_seq_hash,mod_seq_charge_hash
0,AGHCEWQMKAADER,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869771,0.888952,1.0,0,0.235889,3,874.371421,873.869771,875.374721,13232847304557946767,13232847304557946769
1,AGHCEWQMKAADER,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869771,0.888952,1.0,0,0.235889,3,874.371421,873.869771,875.374721,13232847304557946767,13232847304557946769
2,AGHCEWQMK,,,9,2,545.233862,0.576623,1.0,0,0.277542,2,545.735512,545.233862,546.237162,9211182545585790536,9211182545585790538
3,AGHCEWQMK,,,9,2,545.233862,0.576623,1.0,0,0.277542,2,545.735512,545.233862,546.237162,9211182545585790536,9211182545585790538


In [None]:
#| hide
def detect_duplicated_items(
    precursor_df:pd.DataFrame, 
    item_column:str='mod_seq_charge_hash'
)->pd.Index:
    """Return the index labels of rows that repeat an earlier value.

    Parameters
    ----------
    precursor_df : pd.DataFrame
        Frame containing `item_column`.
    item_column : str
        Column whose values are checked for repeats.
        Defaults to 'mod_seq_charge_hash'.

    Returns
    -------
    pd.Index
        Labels of every row whose `item_column` value already occurred in a
        previous row; the first occurrence of each value is not included.
    """
    is_repeat = precursor_df[item_column].duplicated()
    return precursor_df.index[is_repeat.to_numpy()]

# Recompute the hash columns on the frame before checking them.
hash_precursor_df(precursor_df)

hash_columns = ('mod_seq_hash', 'mod_seq_charge_hash')

# Rows 1 and 3 are the second copies of each peptide, so both hash columns
# must report exactly those rows as repeats.
for hash_column in hash_columns:
    repeated_rows = detect_duplicated_items(precursor_df, hash_column).values
    assert (repeated_rows==(1,3)).all()

# Both hash columns are unsigned 64-bit integers.
for hash_column in hash_columns:
    assert precursor_df[hash_column].dtype == np.uint64

# The scalar hash helpers must agree with the values written into the frame:
# the unmodified peptide sits in the last row, the modified one in the first.
short_peptide = ("AGHCEWQMK",'','')
long_peptide = ("AGHCEWQMKAADER",'Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M','0;4;8')
assert get_mod_seq_hash(*short_peptide) == precursor_df.mod_seq_hash.values[-1]
assert get_mod_seq_hash(*long_peptide) == precursor_df.mod_seq_hash.values[0]
assert get_mod_seq_charge_hash(*short_peptide,2) == precursor_df.mod_seq_charge_hash.values[-1]
assert get_mod_seq_charge_hash(*long_peptide,2) == precursor_df.mod_seq_charge_hash.values[0]

In [None]:
# Reference isotope intensities: one row per precursor, six isotope peaks per
# row. Each distinct envelope is listed once and repeated for two consecutive
# rows.

# sum normalization
sum_norm_intens = np.repeat(
    [
        [0.504251,0.290763,0.139951,0.048122,0.013660,0.003253],
        [0.360538,0.320501,0.190923,0.085047,0.030905,0.009528],
    ],
    2,
    axis=0,
)

# mono normalization (first peak scaled to 1)
mono_norm_intens = np.repeat(
    [
        [1., 0.5766, 0.2775, 0.0954, 0.0270, 0.0064],
        [1., 0.8889, 0.5295, 0.2358, 0.0857, 0.0264],
    ],
    2,
    axis=0,
)

In [None]:
# Render the current frame (isotope info plus hash columns) for visual inspection.
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,isotope_m1_intensity,isotope_apex_intensity,isotope_apex_offset,isotope_right_most_intensity,isotope_right_most_offset,isotope_m1_mz,isotope_apex_mz,isotope_right_most_mz,mod_seq_hash,mod_seq_charge_hash
0,AGHCEWQMKAADER,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869771,0.888952,1.0,0,0.235889,3,874.371421,873.869771,875.374721,13232847304557946767,13232847304557946769
1,AGHCEWQMKAADER,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869771,0.888952,1.0,0,0.235889,3,874.371421,873.869771,875.374721,13232847304557946767,13232847304557946769
2,AGHCEWQMK,,,9,2,545.233862,0.576623,1.0,0,0.277542,2,545.735512,545.233862,546.237162,9211182545585790536,9211182545585790538
3,AGHCEWQMK,,,9,2,545.233862,0.576623,1.0,0,0.277542,2,545.735512,545.233862,546.237162,9211182545585790536,9211182545585790538


In [None]:
# Fresh fixture for the isotope-intensity test: the unmodified peptide comes
# first here, each peptide listed `repeat` times.
repeat = 2
peptides = ['AGHCEWQMK']*repeat + ['AGHCEWQMKAADER']*repeat
mods = ['']*repeat + ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['']*repeat + ['0;4;8']*repeat

precursor_df = pd.DataFrame(
    {'sequence': peptides, 'mods': mods, 'mod_sites': sites}
).assign(
    nAA=lambda frame: frame['sequence'].str.len(),
    charge=2,
)

precursor_df = calc_precursor_isotope_intensity(precursor_df,normalize="mono")

# The call is expected to add one intensity column per isotope peak: i_0 .. i_5.
isotope_columns = [f'i_{peak}' for peak in range(6)]
assert all(col in precursor_df.columns for col in isotope_columns)

# Compare with the mono-normalized reference using a 1% relative tolerance.
assert np.allclose(
    precursor_df[isotope_columns].values,
    mono_norm_intens,
    rtol=0.01
)

# test precursor.calc_precursor_isotope_intensity_mp

In [None]:
# Fixture for the multiprocessing variant: the unmodified peptide first, then
# the modified one, each listed `repeat` times.
repeat = 2
peptides = ['AGHCEWQMK']*repeat + ['AGHCEWQMKAADER']*repeat
mods = ['']*repeat + ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['']*repeat + ['0;4;8']*repeat

# `nAA` is the sequence length; every precursor gets charge 2.
precursor_df = pd.DataFrame(
    {'sequence': peptides, 'mods': mods, 'mod_sites': sites}
).assign(
    nAA=lambda frame: frame['sequence'].str.len(),
    charge=2,
)

In [None]:
precursor_df = calc_precursor_isotope_intensity_mp(precursor_df,normalize="sum")

# The call is expected to add one intensity column per isotope peak: i_0 .. i_5.
isotope_columns = [f'i_{peak}' for peak in range(6)]
assert all(col in precursor_df.columns for col in isotope_columns)

# Compare with the sum-normalized reference using a 1% relative tolerance.
assert np.allclose(
    precursor_df[isotope_columns].values,
    sum_norm_intens,
    rtol=0.01
)

100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
