In [None]:
#---#| default_exp peptide.fragment

In [None]:
#| hide
%reload_ext autoreload
%autoreload 2

In [None]:
from alphabase.peptide.fragment import *
from alphabase.peptide.precursor import *

# Fragment Functionalities

### First, it is worth mentioning that, in AlphaBase:
 1. peptide N-term modification site is 0
 2. C-term modification site is -1 
 3. other modification sites are integers from 1 to nAA

This distinction is needed in case a peptide carries two modifications, one on the peptide N-term and the other on the side chain of the N-terminal AA. The same applies to C-term sites.

In [None]:
# With a max charge of 2, both fragment types are expected to expand into
# `<type>_z1` and `<type>_z2`, grouped by fragment type.
expected_charged_types = np.array(['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2'])
charged_types = np.array(get_charged_frag_types(['b','b_modloss'],2))
assert np.all(charged_types==expected_charged_types)

In [None]:
# Each charged fragment name must parse into its (fragment type, charge) pair.
expected_parses = {
    'b_z2': ('b',2),
    'b_modloss_z2': ('b_modloss', 2),
}
for charged_name, type_and_charge in expected_parses.items():
    assert parse_charged_frag_type(charged_name)==type_and_charge

# Fragment dataframe processing

In the AlphaX ecosystem, library fragments are stored in a dataframe, where the columns are charged_frag_types (`['b_z1','b_z2','y_z1','y_z2','b_modloss_z1','y_H2O_z1'...]`) and the rows are the corresponding positions (starting from the peptide N-term) of the fragments. The library precursor/peptide dataframe must contain `frag_start_idx` and `frag_stop_idx` columns to tell us where the fragments of each precursor/peptide are located.

We provide different ways to initialize fragment dataframes, see below:

For a subset of the precursor dataframe, we need to set or get fragment values for the corresponding slices (given by `frag_start_idx` and `frag_stop_idx` in `precursor_df`) of the fragment dataframe. We use `update_sliced_fragment_dataframe` to set the values, and `get_sliced_fragment_dataframe` to get the values.

Some search engines report a separate result file for each raw file. After loading them separately, we concatenate `precursor_df_list` and `fragment_df_list` into single dataframes respectively. The main processing step here is to accumulate `frag_start_idx` and `frag_stop_idx` across the different `precursor_df`s.

# Create fragment mz dataframe
 This is one of the most important functions in alphabase. For a given `precursor_df`, it calculates the fragment ion dataframe, and also sets the `frag_start_idx` and `frag_stop_idx` column values to connect the `precursor_df` and `fragment_mz_df`.

 When creating a new fragment mz/intensity dataframe for a precursor, alphabase will check if `frag_start_idx` exists. As `frag_start_idx` points to an existing fragment dataframe (referred to as `reference_frag_df`), we have to provide the `reference_frag_df` to make sure that `reference_frag_df` and the newly created fragment_df are consistent.


 For the more convenient and faster calculation, we should do as follows:
 - Sort `precursor_df` by 'nAA' (`precursor_df.sort_values('nAA', inplace=True)`) to make sure groupby('nAA') will not change the order of the `precursor_df`.
 - Reset index (`precursor_df.reset_index(drop=True, inplace=True)`) to make sure iloc and loc will index the same dataframe subset.
 - Delete `frag_start_idx` and `frag_stop_idx` columns if they exist, otherwise the creation speed will be slower.
 - Call `create_fragment_mz_dataframe_by_sort_precursor(precursor_df, charged_frag_types)` or `create_fragment_mz_dataframe(precursor_df, charged_frag_types)`. `create_fragment_mz_dataframe` will also call `create_fragment_mz_dataframe_by_sort_precursor` if there is no `frag_start_idx` column.
 - If we need to predict/calculate `fragment_intensity_df`, we can redo step 3 (delete frag idxes columns) and then call 'intensity prediction' or 'intensity calculation'.

### Examples and unittests:

Test `create_fragment_mz_dataframe_by_sort_precursor`

`create_fragment_mz_dataframe_by_sort_precursor` will sort `precursor_df` by its `nAA` column.

In [None]:
# Build an input that is deliberately NOT sorted by nAA:
# two modified 14-AA peptides followed by two unmodified 9-AA peptides.
n_copies = 2
modified_pep = {
    'sequence': 'AGHCEWQMKAADER',
    'mods': 'Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M',
    'mod_sites': '0;4;8',
}
unmodified_pep = {'sequence': 'AGHCEWQMK', 'mods': '', 'mod_sites': ''}
precursor_df = pd.DataFrame([modified_pep]*n_copies + [unmodified_pep]*n_copies)
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = [1,1,2,2]
precursor_df = update_precursor_mz(precursor_df)

frag_type_columns = get_charged_frag_types(['b','y','b_modloss','y_modloss'],2)
fragment_mz_df = create_fragment_mz_dataframe_by_sort_precursor(
    precursor_df, frag_type_columns
)

# After the call the precursors must be ordered by nAA: the 9-AA peptides
# (charge 2) come first, the 14-AA peptides (charge 1) last.
assert precursor_df.nAA.is_monotonic_increasing
expected_precursor_mz = [545.233862, 545.233862, 1746.732265, 1746.732265]
assert np.allclose(
    precursor_df.precursor_mz.values, expected_precursor_mz, atol=1e-4
), precursor_df.precursor_mz.values

# (start, stop) row range in `fragment_mz_df` for every precursor.
frag_idx_pairs = precursor_df[['frag_start_idx','frag_stop_idx']].values

# First precursor: singly charged b-ions of the 9-AA peptide.
ith_pep = 0
frag_start, frag_end = frag_idx_pairs[ith_pep]
expected_b_z1 = [
    72.04439025,  129.06585397,  266.12476583,  369.13395079,
    498.17654388,  684.25585683,  812.31443434,  943.35491942,
]
assert np.allclose(
    fragment_mz_df['b_z1'].values[frag_start:frag_end],
    expected_b_z1,
    atol=1e-4
)

# Third precursor has charge 1, so its b_z2 column is expected to be all zeros.
ith_pep = 2
frag_start, frag_end = frag_idx_pairs[ith_pep]
b_z2_values = fragment_mz_df['b_z2'].values[frag_start:frag_end]
assert (b_z2_values==0).all()
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,frag_start_idx,frag_stop_idx
0,AGHCEWQMK,,,9,2,545.233862,0,8
1,AGHCEWQMK,,,9,2,545.233862,8,16
2,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,1,1746.732265,16,29
3,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,1,1746.732265,29,42


Test `get_sliced_fragment_dataframe` and `update_sliced_fragment_dataframe`

In [None]:
frag_idx_cols = ['frag_start_idx','frag_stop_idx']

# Slicing with the first precursor's (start, stop) pair must return the same
# rows as a direct iloc slice, both with and without an explicit column list.
first_start = precursor_df['frag_start_idx'].values[0]
first_stop = precursor_df['frag_stop_idx'].values[0]
for optional_columns in [(fragment_mz_df.columns.values,), ()]:
    sliced_frag_df = get_sliced_fragment_dataframe(
        fragment_mz_df,
        precursor_df.loc[:0,frag_idx_cols].values,
        *optional_columns
    )
    assert np.allclose(
        fragment_mz_df.iloc[first_start:first_stop,:].values,
        sliced_frag_df.values
    )

# Overwrite the fragment rows of one precursor with a constant and read them
# back: first without, then with an explicit `charged_frag_types` argument.
update_cases = [
    (1, -1, {}),
    (2, -2, {'charged_frag_types': fragment_mz_df.columns.values}),
]
for ith_pep, fill_value, update_kwargs in update_cases:
    # One row per fragment position (nAA-1), one column per charged fragment type.
    block_shape = (precursor_df.nAA.values[ith_pep]-1,len(fragment_mz_df.columns))
    frag_slice = (
        precursor_df['frag_start_idx'].values[ith_pep],
        precursor_df['frag_stop_idx'].values[ith_pep],
    )

    # The update is applied to a copy of the values, which is then written
    # back into the dataframe.
    frag_mz_values = fragment_mz_df.to_numpy(copy=True)
    update_sliced_fragment_dataframe(
        fragment_mz_df,
        frag_mz_values,
        fill_value*np.ones(block_shape),
        [frag_slice],
        **update_kwargs
    )
    fragment_mz_df.iloc[:] = frag_mz_values

    sliced_frag_df = get_sliced_fragment_dataframe(
        fragment_mz_df,
        precursor_df.loc[ith_pep:ith_pep,frag_idx_cols].values,
        fragment_mz_df.columns.values
    )
    assert np.allclose(
        fill_value*np.ones(block_shape),
        sliced_frag_df.values
    )

Test `create_fragment_mz_dataframe`

If the `nAA` column is not sorted, `create_fragment_mz_dataframe` also works, but it is much slower for large peptide sets.

In [None]:
# Same four peptides as before (14-AA modified first, 9-AA unmodified last),
# this time all with charge 2.
n_copies = 2
modified_pep = {
    'sequence': 'AGHCEWQMKAADER',
    'mods': 'Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M',
    'mod_sites': '0;4;8',
}
unmodified_pep = {'sequence': 'AGHCEWQMK', 'mods': '', 'mod_sites': ''}
precursor_df = pd.DataFrame([modified_pep]*n_copies + [unmodified_pep]*n_copies)
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2
assert not precursor_df.nAA.is_monotonic_increasing

frag_type_columns = get_charged_frag_types(['b','y','b_modloss','y_modloss'],2)

# Reference fragments, created via the sort-by-nAA route.
fragment_mz_df = create_fragment_mz_dataframe_by_sort_precursor(
    precursor_df, frag_type_columns
)

# Re-order the precursors (longest first) so nAA is no longer increasing, then
# recompute against the reference; the fragment values must be unchanged.
precursor_df = precursor_df.sort_values('nAA', ascending=False)
fragment_mz_df1 = create_fragment_mz_dataframe(
    precursor_df,
    frag_type_columns,
    reference_fragment_df=fragment_mz_df
)
assert np.allclose(fragment_mz_df.values, fragment_mz_df1.values)

frag_idx_pairs = precursor_df[['frag_start_idx','frag_stop_idx']].values

# Third row after re-ordering is a 9-AA peptide: check its b_z1 series.
ith_pep = 2
frag_start, frag_end = frag_idx_pairs[ith_pep]
expected_b_z1 = [
    72.04439025,  129.06585397,  266.12476583,  369.13395079,
    498.17654388,  684.25585683,  812.31443434,  943.35491942,
]
assert np.allclose(
    fragment_mz_df['b_z1'].values[frag_start:frag_end],
    expected_b_z1,
    atol=1e-4
)

# First row after re-ordering is a modified 14-AA peptide: check its b_z2 series.
ith_pep = 0
frag_start, frag_end = frag_idx_pairs[ith_pep]
expected_b_z2 = [
    57.5311157 ,  86.04184756, 154.57130349, 234.58662783,
    299.10792438, 392.14758085, 456.1768696 , 529.69456946,
    593.74205097, 629.26060786, 664.77916475, 722.29263626,
    786.81393281,
]
assert np.allclose(
    fragment_mz_df['b_z2'].values[frag_start:frag_end],
    expected_b_z2,
    atol=1e-4
)
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,frag_start_idx,frag_stop_idx
2,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,16,29
3,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,29,42
0,AGHCEWQMK,,,9,2,0,8
1,AGHCEWQMK,,,9,2,8,16


In [None]:
# Requesting only a subset of the charged fragment types must reproduce the
# matching columns of the reference fragment dataframe.
_reference_frag_df = fragment_mz_df
subset_frag_types = ['b_z1','y_z1']
fragment_mz_df = create_fragment_mz_dataframe(
    precursor_df,
    subset_frag_types,
    reference_fragment_df=_reference_frag_df
)
reference_subset = _reference_frag_df[fragment_mz_df.columns]
assert np.allclose(fragment_mz_df.values, reference_subset)

## Test other ions

In [None]:
# Same unsorted peptide set as in the previous tests, all with charge 2.
n_copies = 2
modified_pep = {
    'sequence': 'AGHCEWQMKAADER',
    'mods': 'Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M',
    'mod_sites': '0;4;8',
}
unmodified_pep = {'sequence': 'AGHCEWQMK', 'mods': '', 'mod_sites': ''}
precursor_df = pd.DataFrame([modified_pep]*n_copies + [unmodified_pep]*n_copies)
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2
assert not precursor_df.nAA.is_monotonic_increasing

fragment_mz_df = create_fragment_mz_dataframe_by_sort_precursor(
    precursor_df,
    get_charged_frag_types(['a','b','c','x','y','z','b_H2O','y_NH3'],1)
)

# Every derived ion series must differ from its base series (b or y) by the
# mass of a fixed chemical formula: (derived column, base column, formula).
expected_offsets = [
    ('a_z1', 'b_z1', 'C(-1)O(-1)'),
    ('c_z1', 'b_z1', 'N(1)H(3)'),
    ('x_z1', 'y_z1', 'C(1)O(1)H(-2)'),
    ('z_z1', 'y_z1', 'N(-1)H(-2)'),
    ('b_H2O_z1', 'b_z1', 'H(-2)O(-1)'),
    ('y_NH3_z1', 'y_z1', 'N(-1)H(-3)'),
]
for derived_col, base_col, formula in expected_offsets:
    assert np.allclose(
        fragment_mz_df[derived_col]-fragment_mz_df[base_col],
        calc_mass_from_formula(formula)
    )

## Test AA mod diffs

In [None]:
# Two copies of the same modified peptide. Only the first one additionally
# carries arbitrary mass shifts, so the second one serves as the baseline.
mod_names = 'Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M'
precursor_df = pd.DataFrame({
    'sequence': ['AGHCEWQMK']*2,
    'mods': [mod_names]*2,
    'mod_sites': ['0;4;8']*2,
})
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2

# First precursor: +100 Da at site 0 (peptide N-term) and +200 Da at site -1
# (C-term); the second precursor has no mass diffs.
precursor_df['aa_mass_diffs'] = ['100;200', '']
precursor_df['aa_mass_diff_sites'] = ['0;-1', '']

# Reassign the result, consistent with the other cells in this notebook.
precursor_df = update_precursor_mz(precursor_df)
# 300 Da extra mass at charge 2 shifts the precursor m/z by 150.
assert np.allclose(
    precursor_df.precursor_mz.values, [752.747333, 602.747333],
    atol=1e-4
)

# Request the charged fragment types explicitly instead of reusing the columns
# of whatever `fragment_mz_df` an earlier cell left behind, so that this cell
# does not depend on hidden state. The list matches the "Test other ions" cell.
fragment_mz_df = create_fragment_mz_dataframe(
    precursor_df,
    charged_frag_types=get_charged_frag_types(
        ['a','b','c','x','y','z','b_H2O','y_NH3'],1
    )
)

# Fragment row range of the precursor with index label 0 (the one with mass diffs).
frag_start = precursor_df.frag_start_idx[0]
frag_stop = precursor_df.frag_stop_idx[0]

assert np.allclose(
    fragment_mz_df['y_z1'].values[frag_start:frag_stop],
    [1291.43971168, 1234.41824796, 1097.3593361 ,  937.32868742,
        808.28609433,  622.20678138,  494.14820387,  347.11280417],
    atol=1e-4
), '200 Da must be added to all y-ions'
assert np.allclose(
    fragment_mz_df['b_z1'].values[frag_start:frag_stop],
    [214.05495494,  271.07641866,  408.13533052,  568.1659792 ,
        697.20857228,  883.28788524, 1011.34646274, 1158.38186245],
    atol=1e-4
), '100 Da must be added to all b-ions'
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,aa_mass_diffs,aa_mass_diff_sites,precursor_mz,frag_start_idx,frag_stop_idx
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,100;200,0;-1,752.747333,0,8
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,,,602.747333,8,16


In [None]:
#| hide
# Three peptide species, two copies each: modified 14-AA, unmodified 9-AA and
# a phosphorylated 7-AA peptide.
n_copies = 2
peptide_specs = [
    ('AGHCEWQMKAADER', 'Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M', '0;4;8'),
    ('AGHCEWQMK', '', ''),
    ('PEPSIDE', 'Phospho@S', '4'),
]
peptides, mods, sites = [], [], []
for sequence, mod_names, mod_sites in peptide_specs:
    peptides += [sequence]*n_copies
    mods += [mod_names]*n_copies
    sites += [mod_sites]*n_copies

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
})
# Fixed seed so the random charges and intensities below are reproducible.
np.random.seed(0)
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = np.random.randint(1,4, size=len(mods))
precursor_df = update_precursor_mz(precursor_df)

fragment_mz_df = create_fragment_mz_dataframe_by_sort_precursor(
    precursor_df,
    get_charged_frag_types(['b','y','b_modloss','y_modloss'],2)
)
# Random intensities in {0.0, 0.1, ..., 1.0} with the same layout as the m/z dataframe.
fragment_intensity_df = fragment_mz_df.copy()
fragment_intensity_df[fragment_intensity_df.columns] = np.random.randint(0,11, size=(fragment_mz_df.shape))/10.0

# min_fragment_intensity=-1 and a large top-k are used so that no fragment is
# dropped by the intensity filter (see the count check below).
precursor_new_df, fragment_df = flatten_fragments(
    precursor_df, fragment_mz_df, fragment_intensity_df,
    min_fragment_intensity=-1,keep_top_k_fragments=1000,
    custom_columns=['type','position']
)
assert isinstance(precursor_new_df, pd.DataFrame)
assert isinstance(fragment_df, pd.DataFrame)
# Only the requested custom columns may be present.
for requested_col in ('type', 'position'):
    assert requested_col in fragment_df.columns
for unrequested_col in ('number', 'charge', 'loss_type'):
    assert unrequested_col not in fragment_df.columns

# Every fragment with a non-zero m/z must show up exactly once in the flat table.
fragment_count = (fragment_mz_df.values>0).sum()

assert len(fragment_df) == fragment_count
assert precursor_new_df['flat_frag_stop_idx'].iloc[-1] == fragment_count
fragment_df

Unnamed: 0,mz,intensity,type,position
0,98.060040,0.4,98,0
1,49.533658,0.7,98,0
2,769.265157,0.6,121,0
3,385.136217,0.8,121,0
4,671.288262,0.1,121,0
...,...,...,...,...
193,152.584411,0.0,121,11
194,1572.620589,0.6,98,12
195,786.813933,0.9,98,12
196,175.118952,0.3,121,12


In [None]:
#| hide
# Display the precursor table returned by `flatten_fragments` in the previous cell.
precursor_new_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,frag_start_idx,frag_stop_idx,flat_frag_start_idx,flat_frag_stop_idx
0,PEPSIDE,Phospho@S,4,7,2,433.662599,0,6,0,36
1,PEPSIDE,Phospho@S,4,7,3,289.444158,6,12,36,72
2,AGHCEWQMK,,,9,1,1089.460447,12,20,72,88
3,AGHCEWQMK,,,9,2,545.233862,20,28,88,120
4,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,1,1746.732265,28,41,120,146
5,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869771,41,54,146,198


In [None]:
#| hide
# Same flattening as before, but keeping at most `top_k` fragments per precursor.
top_k = 6
precursor_new_df, fragment_df = flatten_fragments(
    precursor_df, fragment_mz_df, fragment_intensity_df,
    min_fragment_intensity=-1,
    keep_top_k_fragments=top_k,
    custom_columns=['type','position']
)
# Number of flat fragment rows assigned to each precursor.
flat_frag_counts = (
    precursor_new_df.flat_frag_stop_idx.values
    - precursor_new_df.flat_frag_start_idx.values
)
assert flat_frag_counts.max() <= 6
fragment_df

Unnamed: 0,mz,intensity,type,position
0,385.136217,0.8,121,0
1,542.245669,0.8,121,1
2,271.626473,0.9,121,1
3,604.237821,0.9,98,4
4,302.622548,0.9,98,4
5,132.047327,1.0,121,4
6,98.06004,0.8,98,0
7,671.288262,1.0,121,0
8,336.147769,0.8,121,0
9,320.61492,0.9,121,1


In [None]:
#| hide
# Display the precursor table returned by the top-6 `flatten_fragments` call in the previous cell.
precursor_new_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,frag_start_idx,frag_stop_idx,flat_frag_start_idx,flat_frag_stop_idx
0,PEPSIDE,Phospho@S,4,7,2,433.662599,0,6,0,6
1,PEPSIDE,Phospho@S,4,7,3,289.444158,6,12,6,12
2,AGHCEWQMK,,,9,1,1089.460447,12,20,12,18
3,AGHCEWQMK,,,9,2,545.233862,20,28,18,24
4,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,1,1746.732265,28,41,24,30
5,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869771,41,54,30,36


In [None]:
#| hide

# Three unmodified peptides of different length, each as a charge-2 and a
# charge-3 precursor.
n_copies = 2
sequences = ['AGHCEWQMKAADER', 'AGHCEWQMK', 'PEPTIDE']
n_precursors = n_copies*len(sequences)

precursor_df = pd.DataFrame({
    'sequence': [seq for seq in sequences for _copy in range(n_copies)],
    'mods': ['']*n_precursors,
    'mod_sites': ['']*n_precursors,
    'charge': [2, 3]*len(sequences)
})
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df = update_precursor_mz(precursor_df)

fragment_mz_df = create_fragment_mz_dataframe_by_sort_precursor(
    precursor_df,
    get_charged_frag_types(['b','y'],2)
)

# Random intensities with the same layout as the m/z dataframe.
fragment_intensity_df = fragment_mz_df.copy()
fragment_intensity_df[fragment_intensity_df.columns] = np.random.random_sample(size=(fragment_mz_df.shape))

# Keep only the charge-2 precursors, in shuffled order, so that their fragment
# ranges are neither contiguous nor ordered in the original fragment dataframes.
is_charge_2 = precursor_df['charge'] == 2
small_precursor_df = precursor_df[is_charge_2].sample(frac=1)
small_precursor_df.reset_index(drop=True, inplace=True)
new_small_precursor_df, (new_fragment_mz_df, new_fragment_intensity_df) = remove_unused_fragments(
    small_precursor_df, (fragment_mz_df, fragment_intensity_df)
)

# For every precursor, the fragments addressed by the new indices must equal
# the fragments addressed by the old indices (precursor order is preserved).
frag_idx_cols = ['frag_start_idx','frag_stop_idx']
old_idx_pairs = small_precursor_df[frag_idx_cols].values
new_idx_pairs = new_small_precursor_df[frag_idx_cols].values
for i in range(len(small_precursor_df)):
    old_start, old_stop = old_idx_pairs[i]
    new_start, new_stop = new_idx_pairs[i]

    # fragment intensities
    assert np.allclose(
        fragment_intensity_df.values[old_start:old_stop],
        new_fragment_intensity_df.values[new_start:new_stop]
    )

    # fragment mzs
    assert np.allclose(
        fragment_mz_df.values[old_start:old_stop],
        new_fragment_mz_df.values[new_start:new_stop]
    )


In [None]:
#| hide
def test_join_left():
    """Check `join_left` on a lookup array whose values equal their own positions.

    `right` is `arange(10)`, so every value sits at the index of the same
    number; the result of `join_left(left, right)` is therefore expected to be
    element-wise equal to `left` itself.
    """
    lookup_values = np.arange(0,10)
    query_values = np.random.randint(0,10,20)

    joined_idx = join_left(query_values, lookup_values)

    assert all(query_values==joined_idx)

test_join_left()

In [None]:
# Two elution groups with four precursors each (two targets, two decoys);
# every precursor owns ten consecutive fragment rows.
frag_starts = list(range(0, 80, 10))
precursor_df = pd.DataFrame({
    'elution_group_idx': [0]*4 + [1]*4,
    'frag_start_idx': frag_starts,
    'frag_stop_idx': [start + 10 for start in frag_starts],
    'decoy': [0, 0, 1, 1]*2,
})

# All m/z values are distinct, except that every even row is set to 0 so that
# these rows are identical across precursors.
fragment_mz = np.arange(0,160).reshape(80,2)

fragment_mz[0::2,:] = 0

fragment_df = pd.DataFrame(
    fragment_mz,
    columns=['y1','y2']
)

# Expected cardinality of the shared (even) rows: 2 when targets and decoys are
# counted separately, 4 when they are counted together. The distinct (odd)
# rows must always have cardinality 1.
for split_target_decoy, shared_cardinality in ((True, 2), (False, 4)):
    cardinality_df = calc_fragment_cardinality(
        precursor_df,
        fragment_df,
        group_column='elution_group_idx',
        split_target_decoy=split_target_decoy
    )

    assert np.all(cardinality_df.values[0::2,:]==shared_cardinality)
    assert np.all(cardinality_df.values[1::2,:]==1)



In [None]:
# The same sequence with two different N-terminal dimethyl labels, both in the
# same elution group and with the same charge.
dimethyl_labels = ['Dimethyl:2H(4)@Any N-term', 'Dimethyl@Any N-term']
n_channels = len(dimethyl_labels)

precursor_df = pd.DataFrame({
    'elution_group_idx': [0]*n_channels,
    'sequence': ['AGHCEWQMKAADER']*n_channels,
    'mods': dimethyl_labels,
    'mod_sites': ['0']*n_channels,
    'charge': [2]*n_channels
})
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df = update_precursor_mz(precursor_df)

fragment_mz_df = create_fragment_mz_dataframe_by_sort_precursor(
    precursor_df,
    get_charged_frag_types(['b','y'],2)
)
# Random intensities in {0.0, 0.1, ..., 1.0} with the same layout as the m/z dataframe.
fragment_intensity_df = fragment_mz_df.copy()
fragment_intensity_df[fragment_intensity_df.columns] = np.random.randint(0,11, size=(fragment_mz_df.shape))/10.0

cardinality_df = calc_fragment_cardinality(
    precursor_df,
    fragment_mz_df,
    group_column='elution_group_idx',
    split_target_decoy=False
)

# Flattening produces one output column per input dataframe: `mz` from
# fragment_mz_df, `intensity` from fragment_intensity_df, and one column for
# each entry of the `custom_df` dictionary (here: `cardinality`).
extra_frag_dfs = {'cardinality': cardinality_df}
precursor_new_df, fragment_df = flatten_fragments(
    precursor_df, fragment_mz_df, fragment_intensity_df,
    min_fragment_intensity=-1,
    keep_top_k_fragments=6,
    custom_columns=['type','position'],
    custom_df=extra_frag_dfs
)

In [None]:
# Show the flattened fragments, including the extra `cardinality` column.
# A bare last expression gives the rich DataFrame display, like the other
# cells in this notebook, instead of the plain-text `print` output.
fragment_df

             mz  intensity  cardinality  type  position
0    265.620114        1.0            1    98         4
1    358.659770        1.0            1    98         5
2    948.456741        0.9            2   121         5
3    410.702720        0.9            2   121         6
4    680.807368        1.0            1    98        11
5    304.161545        1.0            2   121        11
6    157.097154        1.0            1    98         1
7    712.287157        1.0            1    98         5
8    474.732009        0.9            2   121         5
9    550.244230        0.9            1    98         8
10  1356.582353        0.9            1    98        11
11    88.063114        1.0            2   121        12
