In [1]:
from ms_data_funcs import *
from transformer_funcs import *

import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
if torch.cuda.is_available():
    print("CUDA is available.")
    print("PyTorch version:", torch.__version__)
    print("CUDA version:", torch.version.cuda)
    print("Number of available GPUs:", torch.cuda.device_count())
    print("GPU name:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")

CUDA is available.
PyTorch version: 2.0.1+cu118
CUDA version: 11.8
Number of available GPUs: 1
GPU name: NVIDIA GeForce RTX 3080


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
df = pd.read_csv('data/MoNA/in-silico-SMILES.csv') 
print(f"Original shape: {df.shape}")

  df = pd.read_csv('data/MoNA/in-silico-SMILES.csv')


Original shape: (1837926, 18)


In [5]:
method='direct'

In [6]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['molecular_formula'])
max_mz = calculate_max_mz(df)

X_train, X_test = df, df
y_train, y_test = y, y

In [7]:
df['molecular_formula'].unique().shape

(19867,)

In [None]:
smiles_vocabs = get_or_create_smiles_vocabs(df)

In [None]:
results = {}
print(f"\nSpectra tokenized with {method} tokenization:")
print(f"\nSMILES tokenized with {'character'} tokenization")
smiles_vocab = smiles_vocabs['character']

train_loader, test_loader = load_tokenized_data_with_smiles(X_train, y_train, 
                                                            X_test, y_test, 
                                                            method, 
                                                            smiles_vocab, 
                                                            max_mz=max_mz)

#num_classes = len(label_encoder.classes_)
smiles_vocab_size = len(smiles_vocab)

# sample batch used for input dimensions
sample_batch, _ = next(iter(train_loader))
print("Sample shape:", sample_batch.shape)
embed_depth = sample_batch.shape[3]

In [None]:
model = MS_VIT_Seq2Seq(
    smiles_vocab_size=len(smiles_vocab),
    embed_depth=embed_depth,
    d_model=64,           # Reduced from 256
    nhead=4,              # Reduced from 8
    num_layers=2,         # Reduced from 6
    dim_feedforward=256,  # Reduced from 2048
    dropout=0.1,
    num_classes=None
)
print(model)

In [None]:
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion_cls = nn.CrossEntropyLoss()
criterion_seq = nn.CrossEntropyLoss(ignore_index=smiles_vocab['<pad>'])

model, history = train_model_seq2seq(model, train_loader, test_loader, 
                                     optimizer, criterion_seq, 
                                     num_epochs=50, evaluate=True, verbose=1,
                                     checkpoint_path="./model_checkpoints/",
                                     meta_tag=(method+"_character"))

In [None]:
cls_accuracy, seq_accuracy = evaluate_model_seq2seq(model, test_loader, smiles_vocab)

In [None]:
# Visualize training history
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history['seq_loss'].values(), label='Sequence Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history['seq_accuracy'].values(), label='Sequence Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()