In [3]:
from peptdeep.model.ms2 import pDeepModel, normalize_fragment_intensities
from peptdeep.model.rt import IRT_PEPTIDE_DF
from alphabase.spectral_library.flat import SpecLibFlat
import numpy as np

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
def get_prediction_dataset():
    df=IRT_PEPTIDE_DF.copy()
    df['charge'] = 2
    df['mods'] = ''
    df['mod_sites'] = ''
    # sort by nAA
    df = df.sort_values('nAA')
    idxes = np.zeros(len(df)+1,dtype=np.int64)
    idxes[1:] = np.cumsum(df.nAA.values-1)
    df['frag_start_idx'] = idxes[:-1]
    df['frag_stop_idx'] = idxes[1:]
    df['nce'] = 30
    df['instrument'] = "Lumos"
    # sort by 
    return df
get_prediction_dataset().head()

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,charge,frag_start_idx,frag_stop_idx,nce,instrument
0,LGGNEQVTR,RT-pep a,-24.92,,,9,2,0,8,30,Lumos
3,YILAGVENSK,RT-pep d,19.79,,,10,2,8,17,30,Lumos
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,2,17,28,30,Lumos
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,2,28,39,30,Lumos
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,2,39,50,30,Lumos


#### Legacy weights vs new weights

- Both weights share the same exact underlying weights for the model, the only difference is with the new format we save the charged frag types used during training in the weights file.
- So both models are trained on frag types: 'b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'


In [None]:
legacy_path = "../legacy_pretrained_models/generic/ms2.pth"
new_path = "../new_pretrained_models/generic/ms2.pth"

# Ms2 Prediction 

## User importing a legacy model 

a) Using incorrect *len* of frag types when initialization (Should raise mismatch error)

In [None]:
model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2'])
model.load(legacy_path)

RuntimeError: Error(s) in loading state_dict for ModelMS2Bert:
	size mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).
	size mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).
	size mismatch for modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).
	size mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).

b) Using the correct *len* of frag types when initialization
- This is the ideal use case for the legacy weights were users request exactly the same frag types used when training. 
- It's important to notice that the old implementation won't raise an error if the user requested different frag types as long as the number of frag types are the same.

In [7]:
# Notice replacing the y_z1 and y_z2 with x_z1 and x_z2 and the model is loaded successfully and we get an incorrect prediction
model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'x_z1', 'x_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])
model.load(legacy_path)

In [8]:
preds = model.predict(get_prediction_dataset())
preds.head()

Unnamed: 0,b_z1,b_z2,x_z1,x_z2,b_modloss_z1,b_modloss_z2,y_modloss_z1,y_modloss_z2
0,0.0,0.0,1.0,0.004739,0.0,0.0,0.0,0.0
1,0.162034,0.0,0.360414,0.0,0.0,0.0,0.0,0.0
2,0.04666,0.0,0.10992,0.005516,0.0,0.0,0.0,0.0
3,0.018628,0.0,0.203326,0.0,0.0,0.0,0.0,0.0
4,0.01353,0.0,0.267507,0.0,0.0,0.0,0.0,0.0


In [39]:
# Ideal use case requested frag types == training frag types
model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])
model.load(legacy_path)


In [40]:
legacy_full_preds = model.predict(get_prediction_dataset())
legacy_full_preds.head()

Unnamed: 0,b_z1,b_z2,y_z1,y_z2,b_modloss_z1,b_modloss_z2,y_modloss_z1,y_modloss_z2
0,0.0,0.0,1.0,0.004739,0.0,0.0,0.0,0.0
1,0.162034,0.0,0.360414,0.0,0.0,0.0,0.0,0.0
2,0.04666,0.0,0.10992,0.005516,0.0,0.0,0.0,0.0
3,0.018628,0.0,0.203326,0.0,0.0,0.0,0.0,0.0
4,0.01353,0.0,0.267507,0.0,0.0,0.0,0.0,0.0


In [None]:
"""
If you don't have the new weighst format uncomment the following line and run the cell 
after loading the legacy model weights with the correct frag types (last 2 cells)
it should the save the new weights in the new path.
"""
# model.save(new_path)

## User importing weighst in the new format  

 Using the correct *len* of frag types when initialization (ideal use case)

In [42]:
model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])
model.load(new_path)

In [43]:
new_full_preds = model.predict(get_prediction_dataset())
# verify the predictions are the same
assert np.allclose(legacy_full_preds.values, new_full_preds.values, atol=1e-5)
new_full_preds.head()

Unnamed: 0,b_z1,b_z2,y_z1,y_z2,b_modloss_z1,b_modloss_z2,y_modloss_z1,y_modloss_z2
0,0.0,0.0,1.0,0.004739,0.0,0.0,0.0,0.0
1,0.162034,0.0,0.360414,0.0,0.0,0.0,0.0,0.0
2,0.04666,0.0,0.10992,0.005516,0.0,0.0,0.0,0.0
3,0.018628,0.0,0.203326,0.0,0.0,0.0,0.0,0.0
4,0.01353,0.0,0.267507,0.0,0.0,0.0,0.0,0.0


Using incorrect *len* of frag types when initialization but still a subset of what was used during training. 

This is use case where a user request a subset of the frag types used during training for example:

1) Excluding the modloss frags, preivously done by setting mask_modloss = True

In [44]:
# Excluding the modloss fragment types
model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2'])
model.load(new_path)
print(f"Model Interface has charged_frag_types {model.charged_frag_types}")
print(f"Supported charged_frag_types in the loaded weights  {model.model.supported_charged_frag_types}")

Model Interface has charged_frag_types ['b_z1', 'b_z2', 'y_z1', 'y_z2']
Supported charged_frag_types in the loaded weights  ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']


In [45]:
# verify that the predictions are the same with the legacy model for the selected subset of charged_frag_types
new_subset_preds = model.predict(get_prediction_dataset())
assert np.allclose(legacy_full_preds[new_subset_preds.columns], new_subset_preds)
new_subset_preds.head()

Unnamed: 0,b_z1,b_z2,y_z1,y_z2
0,0.0,0.0,1.0,0.004739
1,0.162034,0.0,0.360414,0.0
2,0.04666,0.0,0.10992,0.005516
3,0.018628,0.0,0.203326,0.0
4,0.01353,0.0,0.267507,0.0


2) Excluding frag types that are not modloss (New feature)

In [46]:
# Excluding the the y fragments while keeping the modloss fragments
model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2'])
model.load(new_path)

In [47]:
# verify that the predictions are the same with the legacy model for the selected subset of charged_frag_types
new_subset_preds = model.predict(get_prediction_dataset())
assert np.allclose(legacy_full_preds[new_subset_preds.columns], new_subset_preds)
new_subset_preds.head()

Unnamed: 0,b_z1,b_z2,b_modloss_z1,b_modloss_z2
0,0.0,0.0,0.0,0.0
1,0.162034,0.0,0.0,0.0
2,0.04666,0.0,0.0,0.0
3,0.018628,0.0,0.0,0.0
4,0.01353,0.0,0.0,0.0


Using the new format we have more semantics on what charged frag types are supported, so when a user request frag types that are not supported we can detect and raise an interpretable *error* (New feature)

In [48]:
model = pDeepModel(charged_frag_types=['x_z1', 'x_z2'])
model.load(new_path)

new_subset_preds = model.predict(get_prediction_dataset())



ValueError: The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['x_z1', 'x_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please retrain the model or use a pretrained model with the correct charged_frag_types.

Even if the user requested correct *len* of frag types when initialization but the requested frag types are not a subset of what was used during training we should raise an *error*.

In [49]:
model =  pDeepModel(charged_frag_types=['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'])
model.load(new_path)

new_subset_preds = model.predict(get_prediction_dataset())


ValueError: The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please retrain the model or use a pretrained model with the correct charged_frag_types.

User has a weights file and want to predict all fragment types used for training without knowing what exactly was used during training (New feature)

Notice how the requested frag types are overridden in the model interface

In [50]:
model =  pDeepModel(
    charged_frag_types=['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'], # Will be overridden by the model weights
    override_from_weights=True
    )
model.load(new_path)

print(f"Model Interface has requested charged_frag_types {model.charged_frag_types}")
print(f"Supported charged_frag_types in the loaded weights  {model.model.supported_charged_frag_types}")

Model Interface has requested charged_frag_types ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']
Supported charged_frag_types in the loaded weights  ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']


In [51]:
new_full_preds = model.predict(get_prediction_dataset())
# verify the predictions are the same
assert np.allclose(legacy_full_preds.values, new_full_preds.values, atol=1e-5)
new_full_preds.head()

Unnamed: 0,b_z1,b_z2,y_z1,y_z2,b_modloss_z1,b_modloss_z2,y_modloss_z1,y_modloss_z2
0,0.0,0.0,1.0,0.004739,0.0,0.0,0.0,0.0
1,0.162034,0.0,0.360414,0.0,0.0,0.0,0.0,0.0
2,0.04666,0.0,0.10992,0.005516,0.0,0.0,0.0,0.0
3,0.018628,0.0,0.203326,0.0,0.0,0.0,0.0,0.0
4,0.01353,0.0,0.267507,0.0,0.0,0.0,0.0,0.0


# Ms2 model training

In [52]:
trainin_data_path = "C:/Users/USER/Desktop/Germany/work/MPIB/alphadia/2oh_evidence_txt_0_batch_0.hdf"
speclib = SpecLibFlat()
speclib.load_hdf(trainin_data_path)
speclib.fragment_intensity_df["b_modloss_z1"] = 0
speclib.fragment_intensity_df["b_modloss_z2"] = 0
speclib.fragment_intensity_df["y_modloss_z1"] = 0
speclib.fragment_intensity_df["y_modloss_z2"] = 0
frgament_types_in_data = speclib.fragment_intensity_df.columns

speclib.precursor_df['nce'] = 30
speclib.precursor_df['instrument'] = "Lumos"
# sample only 100 samples
speclib.precursor_df = speclib.precursor_df.sample(100)

# normalize intensity 
normalize_fragment_intensities(speclib.precursor_df, speclib.fragment_intensity_df)
print(f"Fragment types in the training data: {frgament_types_in_data}")

Fragment types in the training data: Index(['a_z1', 'a_z2', 'b_z1', 'b_z2', 'c_z1', 'c_z2', 'x_z1', 'x_z2', 'y_z1',
       'y_z2', 'z_z1', 'z_z2', 'b_H2O_z1', 'b_H2O_z2', 'b_NH3_z1', 'b_NH3_z2',
       'c_lossH_z1', 'c_lossH_z2', 'y_H2O_z1', 'y_H2O_z2', 'y_NH3_z1',
       'y_NH3_z2', 'z_addH_z1', 'z_addH_z2', 'b_modloss_z1', 'b_modloss_z2',
       'y_modloss_z1', 'y_modloss_z2'],
      dtype='object')


## User importing a legacy model 

Using correct *len* of frag types when initialization 


In [53]:
target_frag_types = ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']
model = pDeepModel(charged_frag_types=target_frag_types)
model.load(legacy_path)
model.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=1, verbose=1)

2025-02-12 19:42:53> Training with fixed sequence length: 0
[Training] Epoch=1, Mean Loss=0.0174776264175307


Using incorrect *len* of frag types when initialization (Should raise a mismatch error)


In [54]:
target_frag_types = ['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2']
model = pDeepModel(charged_frag_types=target_frag_types)
model.load(legacy_path)
model.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=1, verbose=1)

RuntimeError: Error(s) in loading state_dict for ModelMS2Bert:
	size mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).
	size mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).
	size mismatch for modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).
	size mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).

## User importing a new model 

Trining on new fragment types that were not part of the training of the original weights (New feature). 
- This is not training from scratch but rather loading the pre-trained backbone and only the prediction heads are initialized from scratch which results in a much faster convergence and reduce the risk of overfiting.
- Notice how when the requested frag types are not a subset of the supported the model is not safe to use for prediction, but after training the model is now safe to predict. 


In [55]:
target_frag_types = ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2'] 
model = pDeepModel(charged_frag_types=target_frag_types)
model.load(new_path)
print("Trying to predict when the requested fragment types are not supported by the pretrained model")
try: 
    # try to predict with the new model
    model.predict(get_prediction_dataset())
except Exception as e:
    print(f"Error: {e}")

print("Training the model with the requested fragment types")
model.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=1, verbose=1)

print("Trying to predict after training with the requested fragment types")
try: 
    # try to predict with the new model
    preds = model.predict(get_prediction_dataset())
except Exception as e:
    print(f"Error: {e}")

preds.head()

Trying to predict when the requested fragment types are not supported by the pretrained model
Error: The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please retrain the model or use a pretrained model with the correct charged_frag_types.
Training the model with the requested fragment types
2025-02-12 19:43:01> Training with fixed sequence length: 0
[Training] Epoch=1, Mean Loss=0.08573874365538359
Trying to predict after training with the requested fragment types


Unnamed: 0,a_z1,a_z2,b_H2O_z1,b_H2O_z2
0,0.0,0.0,1.0,0.0
1,0.156433,0.0,0.352,0.0
2,0.042533,0.0,0.101634,0.001894
3,0.015229,0.0,0.188399,0.0
4,0.009935,0.0,0.25053,0.0


After training the the underlying supported fragment types is aligned with teh requested frag types which can then be saved.

In [56]:
print(f"Model Interface has requested charged_frag_types {model.charged_frag_types}")
print(f"Supported charged_frag_types in the loaded weights  {model.model.supported_charged_frag_types}")

Model Interface has requested charged_frag_types ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2']
Supported charged_frag_types in the loaded weights  ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2']
