In [7]:
%reload_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
from alphabase.spectral_library.base import SpecLibBase
from alphadia.transferlearning.train import *
from alphabase.tools.data_downloader import DataShareDownloader
import tempfile
import seaborn as sns
# Apply seaborn's default plot styling to every figure drawn in this notebook.
sns.set()

# Handle to the root logger (no name is passed to getLogger).
# NOTE(review): `logging` is not imported explicitly in this cell; presumably it is
# provided by the star import from alphadia.transferlearning.train — confirm.
logger = logging.getLogger()


In [3]:
import torch

# Number of CPU threads PyTorch is allowed to use; adjust to the local machine.
NUM_TORCH_THREADS = 10
torch.set_num_threads(NUM_TORCH_THREADS)

In [8]:
# Public datashare link to the transfer-learning spectral library (speclib.transfer.hdf).
speclib_url = "https://datashare.biochem.mpg.de/s/1GiKQSwlPf6YlMm/download?path=%2Ftransfer_pass&files=speclib.transfer.hdf"

# The file is downloaded into the system temp directory.
tempdir = tempfile.gettempdir()
DataShareDownloader(speclib_url, tempdir).download()

# Load the downloaded library; every following cell works on `transfer_lib`.
transfer_lib = SpecLibBase()
transfer_lib.load_hdf(f'{tempdir}/speclib.transfer.hdf', load_mod_seq=True)


/var/folders/6r/8h59xkv90qs2dyy8mdn_ch840000gn/T/speclib.transfer.hdf does not yet exist
/var/folders/6r/8h59xkv90qs2dyy8mdn_ch840000gn/T/speclib.transfer.hdf successfully downloaded (73.11343574523926 MB)


FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'tempdir/speclib.transfer.hdf', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [9]:
# Remove every precursor whose modification string mentions Dimethyl@C.
has_dimethyl_c = transfer_lib.precursor_df['mods'].str.contains('Dimethyl@C')
transfer_lib.precursor_df = transfer_lib.precursor_df[~has_dimethyl_c]

KeyError: 'mods'

In [10]:
# Single manager object used for all prediction and fine-tuning calls below.
# NOTE(review): test_interval=3 presumably means "evaluate every 3 epochs" — confirm.
tune_mgr = FinetuneManager(device="gpu", test_interval=3)


## CCS Fine-tuning


In [None]:
# Expose the observed mobility under the column name "mobility".
transfer_lib.precursor_df["mobility"] = transfer_lib.precursor_df["mobility_observed"]

# Predict mobility with the current (not yet fine-tuned) model and compare to the observed values.
transfer_lib.precursor_df = tune_mgr.predict_mobility(transfer_lib.precursor_df)
plt.scatter(
    transfer_lib.precursor_df["mobility"],
    transfer_lib.precursor_df["mobility_pred"],
    s=1,
    alpha=0.1,
)
plt.xlabel("mobility observed")
plt.ylabel("mobility predicted")

In [None]:
# Fine-tune the CCS model on the transfer library; the returned statistics are plotted in the next cell.
ccs_stats = tune_mgr.finetune_ccs(transfer_lib.precursor_df)

# Re-predict with the fine-tuned CCS model and compare against the observed values.
# NOTE(review): the cell before fine-tuning plots mobility, this one plots CCS — confirm this is intended.
transfer_lib.precursor_df = tune_mgr.ccs_model.predict(transfer_lib.precursor_df)
plt.scatter(
    transfer_lib.precursor_df["ccs"],
    transfer_lib.precursor_df["ccs_pred"],
    s=1,
    alpha=0.1,
)
plt.xlabel("ccs observed")
plt.ylabel("ccs predicted")

In [None]:
# CCS training curves: one facet per metric, one line per data split, independent axes per facet.
g = sns.relplot(
    data=ccs_stats,
    x="epoch",
    y="value",
    hue="data_split",
    col="metric_name",
    col_wrap=2,
    kind="line",
    marker="o",
    dashes=False,
    facet_kws={"sharex": False, "sharey": False, "legend_out": False},
)
g.set_titles("{col_name}")
g.legend.set_title("Data split")

## RT Fine-tuning


In [None]:

# Predict retention times with the current (not yet fine-tuned) model and compare to the observed values.
transfer_lib.precursor_df = tune_mgr.predict_rt(transfer_lib.precursor_df)
plt.scatter(
    transfer_lib.precursor_df["rt_norm"],
    transfer_lib.precursor_df["rt_norm_pred"],
    s=1,
    alpha=0.1,
)
plt.xlabel("RT observed")
plt.ylabel("RT predicted")

In [None]:
# Fine-tune the RT model on the transfer library; the returned statistics are plotted in the next cell.
rt_stats = tune_mgr.finetune_rt(transfer_lib.precursor_df)

# Re-predict retention times with the fine-tuned model and compare to the observed values.
transfer_lib.precursor_df = tune_mgr.predict_rt(transfer_lib.precursor_df)
plt.scatter(
    transfer_lib.precursor_df["rt_norm"],
    transfer_lib.precursor_df["rt_norm_pred"],
    s=0.1,
    alpha=0.1,
)
plt.xlabel("RT observed")
plt.ylabel("RT predicted")


In [None]:
# RT training curves: one facet per metric, one line per data split, independent axes per facet.
g = sns.relplot(
    data=rt_stats,
    x="epoch",
    y="value",
    hue="data_split",
    col="metric_name",
    col_wrap=2,
    kind="line",
    marker="o",
    dashes=False,
    facet_kws={"sharex": False, "sharey": False, "legend_out": False},
)
g.set_titles("{col_name}")
g.legend.set_title("Data split")

## Charge Fine-tuning

In [None]:

# Fine-tune the charge model on the transfer library and keep the training statistics for plotting.
charge_stats = tune_mgr.finetune_charge(
    psm_df=transfer_lib.precursor_df,
)

In [None]:
# Charge training curves: one facet per metric, one line per data split, independent axes per facet.
g = sns.relplot(
    data=charge_stats,
    x="epoch",
    y="value",
    hue="data_split",
    col="metric_name",
    col_wrap=2,
    kind="line",
    marker="o",
    dashes=False,
    facet_kws={"sharex": False, "sharey": False, "legend_out": False},
)
g.set_titles("{col_name}")
g.legend.set_title("Data split")


## MS2 Fine-tuning

In [None]:
# Uncomment the following line to fine-tune the MS2 model only on high-quality spectra
# transfer_lib.precursor_df = transfer_lib.precursor_df[transfer_lib.precursor_df['use_for_ms2']]


In [None]:
def calculate_similarity(precursor_df_a, precursor_df_b, intensity_df_a, intensity_df_b, n_fragment_columns=4):
    """Cosine similarity between the fragment spectra of matching precursors.

    Precursors of the two libraries are matched on ``precursor_idx``. If a
    precursor matches more than once, only the first match is kept. For each
    matched precursor, the rows ``frag_start_idx:frag_stop_idx`` of the
    respective intensity table are taken, restricted to the first
    ``n_fragment_columns`` columns, flattened, and compared by cosine
    similarity.

    Parameters
    ----------
    precursor_df_a, precursor_df_b : pd.DataFrame
        Precursor tables with the columns ``precursor_idx``,
        ``frag_start_idx`` and ``frag_stop_idx``.
    intensity_df_a, intensity_df_b : pd.DataFrame
        Fragment intensity tables, addressed positionally by the
        start/stop indices of the matching precursor table.
    n_fragment_columns : int, default 4
        Number of leading intensity columns that enter the comparison.

    Returns
    -------
    pd.DataFrame
        One row per matched precursor with the columns ``similarity``,
        ``index`` (running position of the match) and ``precursor_idx``.
        ``similarity`` is NaN when either spectrum has zero norm. The
        columns are present even when no precursor matches.

    Raises
    ------
    ValueError
        Raised by ``np.dot`` if the two fragment slices of a precursor have
        different lengths.
    """
    key_columns = ['precursor_idx', 'frag_start_idx', 'frag_stop_idx']
    merged_df = pd.merge(
        precursor_df_a[key_columns],
        precursor_df_b[key_columns],
        on='precursor_idx',
        suffixes=('_a', '_b'),
    )
    # keep only the first match of every precursor
    merged_df = merged_df.drop_duplicates(subset='precursor_idx', keep='first')

    # Convert the relevant columns to numpy once; slicing a DataFrame with
    # .iloc inside the loop is far slower than slicing an array.
    values_a = intensity_df_a.iloc[:, :n_fragment_columns].to_numpy()
    values_b = intensity_df_b.iloc[:, :n_fragment_columns].to_numpy()

    similarity_list = []
    matches = zip(
        merged_df['precursor_idx'],
        merged_df['frag_start_idx_a'],
        merged_df['frag_stop_idx_a'],
        merged_df['frag_start_idx_b'],
        merged_df['frag_stop_idx_b'],
    )
    for i, (precursor_idx, start_a, stop_a, start_b, stop_b) in enumerate(matches):
        spectrum_a = values_a[start_a:stop_a].ravel()
        spectrum_b = values_b[start_b:stop_b].ravel()

        norm_product = np.linalg.norm(spectrum_a) * np.linalg.norm(spectrum_b)
        if norm_product > 0:
            similarity = float(np.dot(spectrum_a, spectrum_b) / norm_product)
        else:
            # Undefined for an all-zero (or empty) spectrum: report NaN
            # explicitly instead of dividing 0/0 and emitting a warning.
            similarity = np.nan

        similarity_list.append({'similarity': similarity, 'index': i, 'precursor_idx': precursor_idx})

    # Passing the columns keeps the schema stable for an empty result.
    return pd.DataFrame(similarity_list, columns=['similarity', 'index', 'precursor_idx'])

In [None]:
# Baseline: predict MS2 spectra with the model as it is BEFORE fine-tuning.
# The results are stored under *_before_df names so that they match the plot title
# and are not overwritten by the cell that evaluates the model after fine-tuning.
res = tune_mgr.predict_all(transfer_lib.precursor_df.copy(), predict_items=['ms2'])

precursor_before_df = res['precursor_df']
fragment_mz_before_df = res['fragment_mz_df']
fragment_intensity_before_df = res['fragment_intensity_df']

# Compare the predicted spectra (first and third argument) with the spectra stored in the transfer library.
similarity_before_df = calculate_similarity(precursor_before_df, transfer_lib.precursor_df, fragment_intensity_before_df, transfer_lib.fragment_intensity_df)
print(similarity_before_df['similarity'].median())
plt.scatter(similarity_before_df['index'], similarity_before_df['similarity'], s=0.1)
plt.xlabel('Index')
plt.ylabel('Similarity')
plt.title('Similarity between observed and predicted MS2 spectra before fine-tuning')

In [None]:

# Fine-tune the MS2 model on the transfer library; copies are passed so the library tables are not handed over directly.
ms2_stats = tune_mgr.finetune_ms2(
    psm_df=transfer_lib.precursor_df.copy(),
    matched_intensity_df=transfer_lib.fragment_intensity_df.copy(),
)

In [None]:
# Predict MS2 spectra again, now with the fine-tuned model.
res = tune_mgr.predict_all(transfer_lib.precursor_df.copy(), predict_items=["ms2"])
precursor_after_df, fragment_mz_after_df, fragment_intensity_after_df = (
    res["precursor_df"],
    res["fragment_mz_df"],
    res["fragment_intensity_df"],
)

# Compare the predicted spectra (the *_a arguments) with the spectra stored in the transfer library.
similarity_after_df = calculate_similarity(
    precursor_df_a=precursor_after_df,
    precursor_df_b=transfer_lib.precursor_df,
    intensity_df_a=fragment_intensity_after_df,
    intensity_df_b=transfer_lib.fragment_intensity_df,
)
print(similarity_after_df["similarity"].median())

plt.scatter(similarity_after_df["index"], similarity_after_df["similarity"], s=0.1)
plt.xlabel("Index")
plt.ylabel("Similarity")
plt.title("Similarity between observed and predicted MS2 spectra after fine-tuning")

In [None]:
# MS2 training curves: one facet per metric, one line per data split, independent axes per facet.
g = sns.relplot(
    data=ms2_stats,
    x="epoch",
    y="value",
    hue="data_split",
    col="metric_name",
    col_wrap=2,
    kind="line",
    marker="o",
    dashes=False,
    facet_kws={"sharex": False, "sharey": False, "legend_out": False},
)
g.set_titles("{col_name}")
g.legend.set_title("Data split")
