In [2]:
from peptdeep.model.ms2 import pDeepModel, normalize_fragment_intensities
from peptdeep.model.rt import IRT_PEPTIDE_DF
from alphabase.spectral_library.flat import SpecLibFlat
import numpy as np

### MS2 Transfer Learning 

In this notebook we will walk you through multiple use cases for fine-tuning the MS2 models. There are various reasons why fine-tuning a pre-trained model can be a good option, for example:
- Improving performance on a new dataset while benefiting from the pre-trained models provided by peptDeep. 
- Modifying the model prediction head to be able to predict additional fragment types. 

Here are the use cases we support in peptDeep: 
- Fine-tuning the same model architecture (with the same target charged fragment types) (1)
- Fine-tuning an MS2 model for new charged fragment types. (2)

(1) For the first use case we can benefit from the whole pre-trained model: since no architecture changes are required, we can directly fine-tune the model as long as we have a target value for every fragment type the model supports. 

(2) For the second use case, the architecture has to change, since instead of predicting the default 8 fragment types we need to predict, for example, 12 fragment types. But instead of training the model from scratch, peptdeep allows changing the requested fragment types while still benefiting from a pre-trained model by only re-initializing a new prediction head. 




In [None]:
# Load the example spectral library that serves as training data in this notebook.
trainin_data_path = "../data/2oh_evidence_txt_0_batch_0.hdf"
speclib = SpecLibFlat()
speclib.load_hdf(trainin_data_path)

# Add all-zero modloss columns so the intensity frame carries a target column
# for each of these fragment types (the order of insertion is kept as-is).
for modloss_column in ("b_modloss_z1", "b_modloss_z2", "y_modloss_z1", "y_modloss_z2"):
    speclib.fragment_intensity_df[modloss_column] = 0
fragment_types_in_data = speclib.fragment_intensity_df.columns

speclib.precursor_df['nce'] = 30
speclib.precursor_df['instrument'] = "Lumos"

# Keep the demo fast: train on at most 100 precursors.
# A fixed random_state makes the selected subset identical on every
# Restart & Run All; the size is capped so that a library with fewer than
# 100 precursors does not make `sample` raise a ValueError.
# NOTE: this only fixes the subset; the training itself may still be stochastic.
n_training_samples = min(100, len(speclib.precursor_df))
speclib.precursor_df = speclib.precursor_df.sample(n_training_samples, random_state=42)

# normalize intensity
normalize_fragment_intensities(speclib.precursor_df, speclib.fragment_intensity_df)
print(f"Fragment types in the training data: {fragment_types_in_data}")

Fragment types in the training data: Index(['a_z1', 'a_z2', 'b_z1', 'b_z2', 'c_z1', 'c_z2', 'x_z1', 'x_z2', 'y_z1',
       'y_z2', 'z_z1', 'z_z2', 'b_H2O_z1', 'b_H2O_z2', 'b_NH3_z1', 'b_NH3_z2',
       'c_lossH_z1', 'c_lossH_z2', 'y_H2O_z1', 'y_H2O_z2', 'y_NH3_z1',
       'y_NH3_z2', 'z_addH_z1', 'z_addH_z2', 'b_modloss_z1', 'b_modloss_z2',
       'y_modloss_z1', 'y_modloss_z2'],
      dtype='object')


In [None]:
# Location of the pre-trained MS2 model weights; later cells pass this path to
# `model_interface.load(...)`. The path is relative — presumably to the
# notebook's working directory (confirm for your checkout).
model_path = "../new_pretrained_models/generic/ms2.pth"

- Training a model to enhance performance without changing the supported fragment types for the model. 

In [12]:
# Create the MS2 model interface and load the pre-trained weights into it.
# NOTE(review): `override_from_weights=True` presumably lets the settings stored
# in the weights file (e.g. the supported fragment types) take precedence over
# the defaults — confirm against the peptdeep documentation.
model_interface = pDeepModel(override_from_weights=True)
model_interface.load(model_path)

# Read back which charged fragment types the loaded model predicts; the
# fine-tuning cell below selects exactly these columns as training targets.
supported_fragment_types = model_interface.model.supported_charged_frag_types
print(f"Fragment types supported by the model: {supported_fragment_types}")

Fragment types supported by the model: ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']


In [13]:
# Fine-tune the loaded model. Only the intensity columns matching the model's
# supported fragment types are handed over as targets.
model_interface.train(
    precursor_df=speclib.precursor_df,
    fragment_intensity_df=speclib.fragment_intensity_df.loc[:, supported_fragment_types],
    epoch=10,
    verbose=1,
)


2025-03-10 04:31:20> Training with fixed sequence length: 0
[Training] Epoch=1, Mean Loss=0.018098749686032534
[Training] Epoch=2, Mean Loss=0.016522509627975523
[Training] Epoch=3, Mean Loss=0.015607067674864084
[Training] Epoch=4, Mean Loss=0.014744784915819764
[Training] Epoch=5, Mean Loss=0.014450938149821013
[Training] Epoch=6, Mean Loss=0.014458817313425243
[Training] Epoch=7, Mean Loss=0.013100229378324002
[Training] Epoch=8, Mean Loss=0.013234045763965696
[Training] Epoch=9, Mean Loss=0.012910482473671436
[Training] Epoch=10, Mean Loss=0.013018820900470018


Now for the interesting use case, where we can benefit from a pre-trained model with different charged fragment types compared to what we actually want to train on (Full 24 Fragment types)

- Before seeing why this might be a good option, let's try training a model from scratch instead of fine-tuning. 

In [18]:
# Request every fragment type that is present in the training data.
requested_fragment_types = fragment_types_in_data
print(f"Requested fragment types: {requested_fragment_types}")

# Baseline: a freshly created model (no pre-trained weights are loaded),
# trained on all intensity columns.
model_interface = pDeepModel(charged_frag_types=requested_fragment_types)
model_interface.train(
    precursor_df=speclib.precursor_df,
    fragment_intensity_df=speclib.fragment_intensity_df,
    epoch=10,
    verbose=1,
)

Requested fragment types: Index(['a_z1', 'a_z2', 'b_z1', 'b_z2', 'c_z1', 'c_z2', 'x_z1', 'x_z2', 'y_z1',
       'y_z2', 'z_z1', 'z_z2', 'b_H2O_z1', 'b_H2O_z2', 'b_NH3_z1', 'b_NH3_z2',
       'c_lossH_z1', 'c_lossH_z2', 'y_H2O_z1', 'y_H2O_z2', 'y_NH3_z1',
       'y_NH3_z2', 'z_addH_z1', 'z_addH_z2', 'b_modloss_z1', 'b_modloss_z2',
       'y_modloss_z1', 'y_modloss_z2'],
      dtype='object')
2025-03-10 04:35:29> Training with fixed sequence length: 0
[Training] Epoch=1, Mean Loss=0.2627508668228984
[Training] Epoch=2, Mean Loss=0.2113290736451745
[Training] Epoch=3, Mean Loss=0.17837902437895536
[Training] Epoch=4, Mean Loss=0.15775226894766092
[Training] Epoch=5, Mean Loss=0.13848155084997416
[Training] Epoch=6, Mean Loss=0.12690765224397182
[Training] Epoch=7, Mean Loss=0.11536069307476282
[Training] Epoch=8, Mean Loss=0.10829603718593717
[Training] Epoch=9, Mean Loss=0.10202393215149641
[Training] Epoch=10, Mean Loss=0.09558012848719954


Now let's train on the same requested fragment types, but this time we will use a pre-trained model as the base for fine-tuning.

In [19]:
print(f"Requested fragment types: {requested_fragment_types}")

# Build a model for the requested fragment types, then load the pre-trained
# weights into it before training, so this run is fine-tuning rather than
# training from scratch. (Per the notebook text, only the prediction head is
# expected to be re-initialized for the new fragment types — confirm in peptdeep.)
model_interface = pDeepModel(charged_frag_types=requested_fragment_types)
model_interface.load(model_path)
model_interface.train(
    precursor_df=speclib.precursor_df,
    fragment_intensity_df=speclib.fragment_intensity_df,
    epoch=10,
    verbose=1,
)

Requested fragment types: Index(['a_z1', 'a_z2', 'b_z1', 'b_z2', 'c_z1', 'c_z2', 'x_z1', 'x_z2', 'y_z1',
       'y_z2', 'z_z1', 'z_z2', 'b_H2O_z1', 'b_H2O_z2', 'b_NH3_z1', 'b_NH3_z2',
       'c_lossH_z1', 'c_lossH_z2', 'y_H2O_z1', 'y_H2O_z2', 'y_NH3_z1',
       'y_NH3_z2', 'z_addH_z1', 'z_addH_z2', 'b_modloss_z1', 'b_modloss_z2',
       'y_modloss_z1', 'y_modloss_z2'],
      dtype='object')
2025-03-10 04:35:45> Training with fixed sequence length: 0
[Training] Epoch=1, Mean Loss=0.11859677266329527
[Training] Epoch=2, Mean Loss=0.11320584965869784
[Training] Epoch=3, Mean Loss=0.10764327785000205
[Training] Epoch=4, Mean Loss=0.1021554390899837
[Training] Epoch=5, Mean Loss=0.09915261343121529
[Training] Epoch=6, Mean Loss=0.09576836554333568
[Training] Epoch=7, Mean Loss=0.09286501817405224
[Training] Epoch=8, Mean Loss=0.08949193079024553
[Training] Epoch=9, Mean Loss=0.08687052177265286
[Training] Epoch=10, Mean Loss=0.08428588975220919


Testing out the prediction for the new model

In [20]:
def get_prediction_dataset():
    """Build a precursor table from ``IRT_PEPTIDE_DF`` for running predictions.

    A new frame is derived from ``IRT_PEPTIDE_DF`` (the original is not
    modified): every peptide gets charge 2, empty ``mods`` / ``mod_sites``,
    ``nce`` 30 and instrument "Lumos". Rows are sorted by ``nAA`` and each
    row is assigned a contiguous ``[frag_start_idx, frag_stop_idx)`` range of
    length ``nAA - 1``.

    Returns:
        The derived frame, sorted by ``nAA``.
    """
    peptides = (
        IRT_PEPTIDE_DF
        .assign(charge=2, mods='', mod_sites='')
        .sort_values('nAA')
    )

    # Range boundaries: a leading 0 followed by the running total of nAA - 1.
    boundaries = np.zeros(len(peptides) + 1, dtype=np.int64)
    boundaries[1:] = np.cumsum(peptides['nAA'].values - 1)

    return peptides.assign(
        frag_start_idx=boundaries[:-1],
        frag_stop_idx=boundaries[1:],
        nce=30,
        instrument="Lumos",
    )
get_prediction_dataset().head()

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,charge,frag_start_idx,frag_stop_idx,nce,instrument
0,LGGNEQVTR,RT-pep a,-24.92,,,9,2,0,8,30,Lumos
3,YILAGVENSK,RT-pep d,19.79,,,10,2,8,17,30,Lumos
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,2,17,28,30,Lumos
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,2,28,39,30,Lumos
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,2,39,50,30,Lumos


In [21]:
# Run the most recently trained model on the iRT peptide table built above
# and show the first rows of the result (one column per fragment type in the
# recorded output).
prediction = model_interface.predict(get_prediction_dataset())
prediction.head()

Unnamed: 0,a_z1,a_z2,b_z1,b_z2,c_z1,c_z2,x_z1,x_z2,y_z1,y_z2,...,c_lossH_z1,c_lossH_z2,y_H2O_z1,y_H2O_z2,y_NH3_z1,y_NH3_z2,y_modloss_z1,y_modloss_z2,z_addH_z1,z_addH_z2
0,0.0,0.0,0.0,0.0,0.0,0.16629,0.710155,0.715457,0.839183,0.0,...,0.571916,0.596842,0.381391,1.0,0.160475,0.0,0.0,0.0,0.0,0.0
1,0.0,0.172437,0.0,0.212494,0.0,0.526594,0.155132,0.428784,0.357634,0.0,...,0.135188,0.0,0.0,0.946745,0.0,0.0,0.0,0.0,0.0,0.0
2,0.169509,0.0,0.0,0.0,0.0,0.016571,0.0,0.120425,0.0,0.0,...,0.160255,0.0,0.0,0.471156,0.0,0.0,0.0,0.0,0.0,0.0
3,0.006227,0.0,0.0,0.018475,0.0,0.119066,0.012204,0.17037,0.016484,0.0,...,0.080222,0.0,0.0,0.631714,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.065317,0.0,0.242804,0.138258,0.234232,0.075323,0.0,...,0.051258,0.0,0.0,0.770283,0.0,0.0,0.0,0.0,0.0,0.0
