In [1]:
from peptdeep.model.ms2 import pDeepModel, normalize_fragment_intensities
from peptdeep.model.rt import IRT_PEPTIDE_DF
from alphabase.spectral_library.flat import SpecLibFlat
from alphabase.peptide.precursor import refine_precursor_df
import numpy as np

### MS2 Transfer Learning 

In this notebook we will walk you through multiple use cases for fine tuning the ms2 models. There various reasons why fine-tuning a pre-trained model can be a good option for example:
- Improving performance on a new dataset while befitting from the pre-trained models provided by peptDeep. 
- Modifying the model prediction head to be able to predict additional fragment types. 

Here are the use cases we support in peptDeep: 
- Finetuning the same model architecture (with the same target charged fragment types) (1)
- Finetuning a MS2 model for a new sharged fragment types. (2)

(1)For the first use case we can benefit from the whole pretrained model, since no architecture changes is required we can directly finetune the model as long as we have a target value for all the fragment types the model is supporting. 

(2)For the second use case, the architecture has to change, since instead of predicting the default 8 fragment types we need to predict 12 fragament types for example. But instead of training the model from scratch, peptdeep allows changing the requested fragment types and still benefiting from a pre-trained model by only re-intializing a new prediction head. 




In [2]:
import pandas as pd
def calculate_similarity(precursor_df_a, precursor_df_b, intensity_df_a, intensity_df_b):

    _a_df = precursor_df_a[['precursor_idx', 'frag_start_idx', 'frag_stop_idx']].copy()
    _b_df = precursor_df_b[['precursor_idx', 'frag_start_idx', 'frag_stop_idx']].copy()

    _merged_df = pd.merge(_a_df, _b_df, on='precursor_idx', suffixes=('_a', '_b'))
    # keep only first precursor
    _merged_df = _merged_df.drop_duplicates(subset='precursor_idx', keep='first')
    similarity_list = []

    for i, (start_a, stop_a, start_b, stop_b) in enumerate(zip(_merged_df['frag_start_idx_a'], _merged_df['frag_stop_idx_a'], _merged_df['frag_start_idx_b'], _merged_df['frag_stop_idx_b'])):
        observed_intensity = intensity_df_a.loc[start_a:stop_a, :].values.flatten()
        predicted_intensity = intensity_df_b.loc[start_b:stop_b, :].values.flatten()

        similarity = np.dot(observed_intensity, predicted_intensity) / ((np.linalg.norm(observed_intensity) * np.linalg.norm(predicted_intensity)) + 1e-6)
        similarity_list.append({'similarity': similarity, 'index': i, 'precursor_idx': _merged_df.iloc[i]['precursor_idx']})

    return pd.DataFrame(similarity_list)

In [None]:
# loading test dataset for training the model
trainin_data_path = "../data/sample_speclib.hdf"
speclib = SpecLibFlat()
speclib.load_hdf(trainin_data_path)
speclib.fragment_intensity_df["b_modloss_z1"] = 0
speclib.fragment_intensity_df["b_modloss_z2"] = 0
speclib.fragment_intensity_df["y_modloss_z1"] = 0
speclib.fragment_intensity_df["y_modloss_z2"] = 0
fragment_types_in_data = speclib.fragment_intensity_df.columns

speclib.precursor_df['nce'] = 30.0
speclib.precursor_df['instrument'] = "Lumos"
speclib.precursor_df['precursor_idx'] = speclib.precursor_df.index  

# sample only 100 samples
# speclib.precursor_df = speclib.precursor_df.sample(100)
refine_precursor_df(speclib.precursor_df)
# normalize intensity 
normalize_fragment_intensities(speclib.precursor_df, speclib.fragment_intensity_df)
print(f"Fragment types in the training data: {fragment_types_in_data}")

Fragment types in the training data: Index(['a_z1', 'a_z2', 'b_z1', 'b_z2', 'c_z1', 'c_z2', 'x_z1', 'x_z2', 'y_z1',
       'y_z2', 'z_z1', 'z_z2', 'b_H2O_z1', 'b_H2O_z2', 'b_NH3_z1', 'b_NH3_z2',
       'c_lossH_z1', 'c_lossH_z2', 'y_H2O_z1', 'y_H2O_z2', 'y_NH3_z1',
       'y_NH3_z2', 'z_addH_z1', 'z_addH_z2', 'b_modloss_z1', 'b_modloss_z2',
       'y_modloss_z1', 'y_modloss_z2'],
      dtype='object')


In [None]:
model_path = "../pretrained_models/generic/ms2.pth"

- Training a model to enhance performance without changing the supported fragment types for the model. 

In [5]:
model_interface = pDeepModel(override_from_weights=True)
model_interface.load(model_path)

supported_fragment_types = model_interface.model.supported_charged_frag_types
print(f"Fragment types supported by the model: {supported_fragment_types}")

Fragment types supported by the model: ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']


In [6]:
# training the model
model_interface.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,supported_fragment_types], epoch=10, verbose=1)


2025-03-24 06:27:55> Training with fixed sequence length: 0


[Training] Epoch=1, Mean Loss=0.01714216558262706
[Training] Epoch=2, Mean Loss=0.014999589333310723
[Training] Epoch=3, Mean Loss=0.01401507736183703
[Training] Epoch=4, Mean Loss=0.013427118873223662
[Training] Epoch=5, Mean Loss=0.01310566863976419
[Training] Epoch=6, Mean Loss=0.012583116488531231
[Training] Epoch=7, Mean Loss=0.01228506090119481
[Training] Epoch=8, Mean Loss=0.011949550099670888
[Training] Epoch=9, Mean Loss=0.011700581423938275
[Training] Epoch=10, Mean Loss=0.011566510824486613


In [7]:
print(f"Prediction after training on fragment types: {supported_fragment_types}") 
prediction_precursor_df = speclib.precursor_df.copy()
prediction_precursor_df.drop(columns=["frag_start_idx", "frag_stop_idx"], inplace=True)
predictions = model_interface.predict(prediction_precursor_df) # predict wil set the frag_start_idx and frag_stop_idx inplace
similarity_df = calculate_similarity(speclib.precursor_df, prediction_precursor_df, speclib.fragment_intensity_df.loc[:, supported_fragment_types], predictions)
print(f"Median similarity: {similarity_df['similarity'].median()}")

Prediction after training on fragment types: ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']
Median similarity: 0.9788682196185345


Now for the interesting use case, where we can benefit from a pre-trained model with different charged fragment types compared to what we actually want to train on (12 Fragment types)

- Before seeing why this might be a good option. Lets try training a model from scratch instead of fine tuning. 

In [23]:
requested_fragment_types = ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_H2O_z1','b_H2O_z2','b_NH3_z1', 'b_NH3_z2', 'y_H2O_z1', 'y_H2O_z2','y_NH3_z1', 'y_NH3_z2']
print(f"Requested fragment types: {requested_fragment_types}")

model_interface = pDeepModel(charged_frag_types=requested_fragment_types) # creating a model from scratch
model_interface.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,requested_fragment_types], epoch=20, verbose=True)

Requested fragment types: ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_H2O_z1', 'b_H2O_z2', 'b_NH3_z1', 'b_NH3_z2', 'y_H2O_z1', 'y_H2O_z2', 'y_NH3_z1', 'y_NH3_z2']
2025-03-24 06:36:08> Training with fixed sequence length: 0
[Training] Epoch=1, Mean Loss=0.18290741086006165
[Training] Epoch=2, Mean Loss=0.12341278985142708
[Training] Epoch=3, Mean Loss=0.09590978175401688
[Training] Epoch=4, Mean Loss=0.07777980178594589
[Training] Epoch=5, Mean Loss=0.06758781857788562
[Training] Epoch=6, Mean Loss=0.06245427638292313
[Training] Epoch=7, Mean Loss=0.05913622558116913
[Training] Epoch=8, Mean Loss=0.05696883402764797
[Training] Epoch=9, Mean Loss=0.0552051093429327
[Training] Epoch=10, Mean Loss=0.05408928550779819
[Training] Epoch=11, Mean Loss=0.05306584112346172
[Training] Epoch=12, Mean Loss=0.05200395107269287
[Training] Epoch=13, Mean Loss=0.051182358488440514
[Training] Epoch=14, Mean Loss=0.04973145142197609
[Training] Epoch=15, Mean Loss=0.04895635634660721
[Training] Epoch=16, Mean Los

In [24]:
print(f"Prediction after training on fragment types: {requested_fragment_types}") 
prediction_precursor_df = speclib.precursor_df.copy()
prediction_precursor_df.drop(columns=["frag_start_idx", "frag_stop_idx"], inplace=True)
predictions = model_interface.predict(prediction_precursor_df) # predict wil set the frag_start_idx and frag_stop_idx inplace
similarity_df = calculate_similarity(speclib.precursor_df, prediction_precursor_df, speclib.fragment_intensity_df.loc[:,requested_fragment_types], predictions)
print(f"Median similarity: {similarity_df['similarity'].median()}"  )

Prediction after training on fragment types: ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_H2O_z1', 'b_H2O_z2', 'b_NH3_z1', 'b_NH3_z2', 'y_H2O_z1', 'y_H2O_z2', 'y_NH3_z1', 'y_NH3_z2']
Median similarity: 0.6148259002143182


Now lets see for the same requested fragment types, but this time we will use a pre-trained model as base for the fine-tuning .

In [25]:
print(f"Requested fragment types: {requested_fragment_types}")
model_interface = pDeepModel(charged_frag_types=requested_fragment_types)
model_interface.load(model_path)
model_interface.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,requested_fragment_types], epoch=20, verbose=True)

Requested fragment types: ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_H2O_z1', 'b_H2O_z2', 'b_NH3_z1', 'b_NH3_z2', 'y_H2O_z1', 'y_H2O_z2', 'y_NH3_z1', 'y_NH3_z2']
2025-03-24 06:37:51> Training with fixed sequence length: 0
[Training] Epoch=1, Mean Loss=0.13062387451529503
[Training] Epoch=2, Mean Loss=0.11805687174201011
[Training] Epoch=3, Mean Loss=0.1085664016008377
[Training] Epoch=4, Mean Loss=0.0950324584543705
[Training] Epoch=5, Mean Loss=0.0680735632032156
[Training] Epoch=6, Mean Loss=0.05339862667024135
[Training] Epoch=7, Mean Loss=0.044806683547794816
[Training] Epoch=8, Mean Loss=0.04099671371281147
[Training] Epoch=9, Mean Loss=0.038800872154533865
[Training] Epoch=10, Mean Loss=0.03747128788381815
[Training] Epoch=11, Mean Loss=0.03655099641531706
[Training] Epoch=12, Mean Loss=0.03583796564489603
[Training] Epoch=13, Mean Loss=0.035318268463015556
[Training] Epoch=14, Mean Loss=0.034866768941283224
[Training] Epoch=15, Mean Loss=0.03474729720503092
[Training] Epoch=16, Mean Lo

In [26]:
print(f"Prediction after training on fragment types: {requested_fragment_types}") 
prediction_precursor_df = speclib.precursor_df.copy()
prediction_precursor_df.drop(columns=["frag_start_idx", "frag_stop_idx"], inplace=True)
predictions = model_interface.predict(prediction_precursor_df) # predict wil set the frag_start_idx and frag_stop_idx inplace
similarity_df = calculate_similarity(speclib.precursor_df, prediction_precursor_df, speclib.fragment_intensity_df.loc[:,requested_fragment_types], predictions)
print(f"Median similarity: {similarity_df['similarity'].median()}"  )

Prediction after training on fragment types: ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_H2O_z1', 'b_H2O_z2', 'b_NH3_z1', 'b_NH3_z2', 'y_H2O_z1', 'y_H2O_z2', 'y_NH3_z1', 'y_NH3_z2']
Median similarity: 0.7050544920322075
