In [6]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
from prosit_t.eval import prosit_transformer_eval
import wandb
import tensorflow as tf
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import os
from prosit_t.wandb_agent.train_utils import get_proteometools_data
from prosit_t.models import PrositTransformerV2
import pandas as pd
from dlomix.models import PrositIntensityPredictor
from dlomix.losses import masked_pearson_correlation_distance
from prosit_t.models import PrositTransformerV2
from dlomix.losses import masked_spectral_distance
from prosit_t.models.variable_seq_length_models import TestModelDrop
from prosit_t.data.parquet_to_tfdataset import get_tfdatasets

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this hides the warning rather than fixing its cause; prefer an
# explicit .copy() on any slice that is modified later.
pd.set_option('mode.chained_assignment', None)

In [8]:
# Execute tf.function-wrapped code eagerly (easier to debug, slower to run).
tf.config.run_functions_eagerly(True)

# Expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
physical_devices = tf.config.list_physical_devices("GPU")
# Iterate instead of indexing [0]: the list is empty on a machine without a
# visible GPU, and memory growth has to be configured identically on every
# device that is listed.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [15]:
# Training and validation datasets that are passed to model.fit further down.
# NOTE(review): the meaning of the positional argument 2 is not visible here
# (possibly a batch size) -- confirm against get_tfdatasets.
train_data, val_data = get_tfdatasets(2)



In [4]:
# Dataset settings. Both parquet locations are absolute paths; note that the
# "val" entry points at the first_pool_test file.
data_config = dict(
    dataset="proteometools",
    data_source=dict(
        train="/cmnfs/proj/prosit/Transformer/first_pool_train.parquet",
        val="/cmnfs/proj/prosit/Transformer/first_pool_test.parquet",
    ),
    fragmentation="HCD",
    batch_size=1,
    seq_length=30,
)

In [5]:
# Read the training split named in data_config and preview its first rows.
train_parquet_path = data_config["data_source"]["train"]
train_df = pd.read_parquet(train_parquet_path)
train_df.head()

Unnamed: 0,raw_file,scan_number,modified_sequence,precursor_charge,fragmentation,mass_analyzer,andromeda_score,peptide_length,orig_collision_energy,aligned_collision_energy,intensities_raw,masses_raw,collision_energy_aligned_normed,method_nbr,precursor_charge_onehot
0,01650b_BF4-TUM_first_pool_72_01_01-2xIT_2xHCD-...,17405,AEEAAGPGAAALQR,2,HCD,ITMS,243.37,14,28.0,29.663639,"[0.05, 0.0, -1.0, 0.0, 0.0, -1.0, 0.05, 0.0, -...","[175.32356, 0.0, -1.0, 0.0, 0.0, -1.0, 303.386...",0.296636,1,"[0, 1, 0, 0, 0, 0]"
1,01625b_GA1-TUM_first_pool_1_01_01-2xIT_2xHCD-1...,28098,VSWQDLK,2,HCD,FTMS,183.04,7,23.0,25.222923,"[0.06, 0.0, -1.0, 0.0, 0.0, -1.0, 0.24, 0.0, -...","[147.11295, 0.0, -1.0, 0.0, 0.0, -1.0, 260.196...",0.252229,1,"[0, 1, 0, 0, 0, 0]"
2,01625b_GA1-TUM_first_pool_1_01_01-3xHCD-1h-R1,11793,DIHHIDYYK,2,HCD,FTMS,117.7,9,35.0,37.352073,"[0.43, 0.0, -1.0, 0.0, 0.0, -1.0, 0.33, 0.0, -...","[147.11281, 0.0, -1.0, 0.0, 0.0, -1.0, 310.175...",0.373521,1,"[0, 1, 0, 0, 0, 0]"
3,01650b_BB4-TUM_first_pool_68_01_01-3xHCD-1h-R2,42651,LVSDEM[UNIMOD:35]VVELIEK,2,HCD,FTMS,143.21,13,25.0,26.651139,"[0.09, 0.0, -1.0, 0.0, 0.0, -1.0, 0.22, 0.0, -...","[147.11278, 0.0, -1.0, 0.0, 0.0, -1.0, 276.155...",0.266511,1,"[0, 1, 0, 0, 0, 0]"
4,01709a_GD2-TUM_first_pool_110_01_01-DDA-1h-R1,27132,IRDLSGNLWERSSGDGEELER,4,HCD,FTMS,134.87,21,28.0,31.333845,"[0.22, 0.0, 0.0, 0.0, 0.0, 0.0, 0.15, 0.0, 0.0...","[175.11949, 0.0, 0.0, 0.0, 0.0, 0.0, 304.16235...",0.313338,1,"[0, 0, 0, 1, 0, 0]"


In [6]:
# Empty frame with the three model-input columns; later cells fill it column by column.
model_input_columns = ["sequence", "precursor_charge", "collision_energy"]
df = pd.DataFrame(columns=model_input_columns)

In [7]:
# Remove the two UNIMOD tags from the modified sequences. The tags are matched
# as literal text (regex=False), in the same order as before.
sequence_without_tags = train_df["modified_sequence"]
for unimod_tag in ("[UNIMOD:35]", "[UNIMOD:4]"):
    sequence_without_tags = sequence_without_tags.str.replace(unimod_tag, "", regex=False)
df["sequence"] = sequence_without_tags

In [8]:
def concatenate_columns(row):
    """Return the values of ``row`` as a plain Python list.

    Delegates to ``row.tolist()``, so ``row`` must provide that method
    (for example a pandas Series or a NumPy array).
    """
    values = row.tolist()
    return values

# One-hot encode the precursor charge, then collapse each encoded row into a
# single list per peptide.
# NOTE(review): get_dummies only creates columns for charge values present in
# the data, so the vector width depends on the data -- confirm that is intended.
charge_one_hot = pd.get_dummies(train_df["precursor_charge"], dtype=float)
df["precursor_charge"] = charge_one_hot.apply(concatenate_columns, axis=1)

In [9]:
# Copy train_df's `collision_energy_aligned_normed` column into the
# `collision_energy` feature column.
df["collision_energy"] = train_df["collision_energy_aligned_normed"]

In [10]:
# Carry the raw intensities along temporarily so the target can be derived
# row-wise; the column is dropped again once the target exists.
df["intensities_raw"] = train_df["intensities_raw"]

In [11]:
def truncate_target(row, ions_per_position=6):
    """Cut a row's raw intensity vector down to the length implied by its sequence.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``"sequence"``
            entry that supports ``len`` and a sliceable ``"intensities_raw"``
            entry.
        ions_per_position: Number of intensity values kept per gap between
            adjacent sequence elements. Defaults to 6, the value that used to
            be hard-coded here (presumably ion types x charge states -- TODO
            confirm).

    Returns:
        The first ``(len(sequence) - 1) * ions_per_position`` values of
        ``row["intensities_raw"]``; an empty slice when the sequence has fewer
        than two elements.
    """
    sequence = row["sequence"]
    # Clamp at zero: for an empty sequence the unclamped bound is negative, and
    # a negative slice end would silently keep everything except the trailing
    # values instead of returning nothing.
    n_positions = max(len(sequence) - 1, 0)
    return row["intensities_raw"][: n_positions * ions_per_position]

In [12]:
# Build the per-row target by applying truncate_target to every row (axis=1).
df["target"] = df.apply(truncate_target, axis=1)

In [13]:
# The raw intensities were only needed to derive "target"; remove that column.
df = df.drop(columns="intensities_raw")

In [14]:
# Small working subset: the first 40 rows. Take an explicit copy so that later
# column rewrites on `dff` can neither write through to `df` nor depend on
# pandas' view-versus-copy behaviour.
dff = df.iloc[:40].copy()

In [15]:
# Convert each feature column to the per-row container used when building the
# tensors: sequence -> array of single characters, precursor_charge -> array,
# collision_energy -> one-element list.
# `assign` replaces whole columns on a new frame. The previous `.loc[:, col] = ...`
# form attempts an in-place write, which relies on silent dtype upcasting when
# lists are stored into the float `collision_energy` column, and mutates a
# slice of `df`.
dff = dff.assign(
    sequence=dff["sequence"].apply(lambda x: np.array(list(x))),
    precursor_charge=dff["precursor_charge"].apply(np.array),
    collision_energy=dff["collision_energy"].apply(lambda x: [x]),
)

In [17]:
# Pull the four columns out of the frame as plain Python lists.
sequence_col, collision_energy_col, precursor_charge_col, target_col = (
    dff[column_name].tolist()
    for column_name in ("sequence", "collision_energy", "precursor_charge", "target")
)

# Wrap every list in a ragged tensor so rows may differ in length.
# NOTE(review): the target is float64 while the other numeric inputs are
# float32 -- confirm this is intended.
sequence_col_ragged = tf.ragged.constant(sequence_col, dtype=tf.string)
collision_energy_col_ragged = tf.ragged.constant(
    collision_energy_col, dtype=tf.float32
)
precursor_charge_col_ragged = tf.ragged.constant(
    precursor_charge_col, dtype=tf.float32
)
target_col_ragged = tf.ragged.constant(target_col, dtype=tf.float64)

2023-10-06 13:21:07.446284: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 43640 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:21:00.0, compute capability: 8.6


In [18]:
def ragged_to_dense(x, y):
    """Pass both arguments through unchanged as an ``(x, y)`` tuple.

    Despite its name, the function performs no conversion of its inputs.
    """
    passthrough = (x, y)
    return passthrough

def merge_tuples(item1, item2):
    """Combine two indexable items into a ``(features, target)`` pair.

    Args:
        item1: Indexable whose first element is the sequence and whose last
            element is the target.
        item2: Indexable whose elements 0 and 1 are the precursor charge and
            the collision energy.

    Returns:
        A tuple ``(features, target)`` where ``features`` is a dict with the
        keys ``"sequence"``, ``"precursor_charge"`` and ``"collision_energy"``.
    """
    features = {
        "sequence": item1[0],
        "precursor_charge": item2[0],
        "collision_energy": item2[1],
    }
    target = item1[-1]
    return features, target

In [19]:
# Batch size used by the tf.data pipeline cells (padded_batch and batch).
batch_size = 2

In [20]:
# (sequence, target) pipeline: slice per example, pad within groups of
# `batch_size`, then split back into single (now padded) examples.
seq_target_slices = tf.data.Dataset.from_tensor_slices(
    (sequence_col_ragged, target_col_ragged)
)
seq_target_padded = seq_target_slices.map(ragged_to_dense).padded_batch(batch_size)
dataset_seq_target = seq_target_padded.unbatch()



In [21]:
# (precursor_charge, collision_energy) pipeline: one element per example.
meta_columns = (precursor_charge_col_ragged, collision_energy_col_ragged)
dataset_meta = tf.data.Dataset.from_tensor_slices(meta_columns).map(ragged_to_dense)

In [22]:
# Pair every (sequence, target) example with its metadata, reshape the pair
# into (features dict, target), then batch and prefetch.
zipped_examples = tf.data.Dataset.zip(dataset_seq_target, dataset_meta)
dataset = (
    zipped_examples
    .map(merge_tuples)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [16]:
# Hyperparameters gathered in one place, then handed to the model constructor
# as keyword arguments.
model_hparams = {
    "seq_length": 30,
    "embedding_output_dim": 64,
    "num_heads": 16,
    "num_transformers": 6,
    "dense_dim_factor": 4,
}
model = TestModelDrop(**model_hparams)

In [17]:
# Adam with a fixed learning rate; the model is trained against the masked
# spectral distance loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
compile_kwargs = dict(optimizer=optimizer, loss=masked_spectral_distance)
model.compile(**compile_kwargs)

In [18]:
# Train for 10 epochs. Keep the returned History object so the per-epoch
# metrics can be inspected afterwards, instead of leaving only its repr in the
# cell output.
history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=10,
)

Epoch 1/10


2023-10-06 17:56:42.201946: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-10-06 17:56:43.620747: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8801


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f01aa31ceb0>