In [None]:
from ms_data_funcs import *
from transformer_funcs import *

import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
# Report the PyTorch / CUDA setup so the run output records the hardware in use.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    print("CUDA is not available.")
else:
    print("CUDA is available.")
    print("PyTorch version:", torch.__version__)
    print("CUDA version:", torch.version.cuda)
    print("Number of available GPUs:", torch.cuda.device_count())
    # Only the first visible GPU (index 0) is named here.
    print("GPU name:", torch.cuda.get_device_name(0))

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

In [None]:
# Load the training table from the MoNA in-silico CSV (path is relative to the
# notebook's working directory).
train_csv_path = 'data/MoNA/in-silico-SMILES.csv'
df = pd.read_csv(train_csv_path)
print(f"Train shape: {df.shape}")

In [None]:
# Load the held-out table from the MoNA experimental CSV (path is relative to the
# notebook's working directory).
test_csv_path = 'data/MoNA/experimental-SMILES.csv'
df_test = pd.read_csv(test_csv_path)
print(f"Test shape: {df_test.shape}")

In [None]:
# Minimal test: run the rest of the notebook on a random subsample of both tables
# so a full pass stays fast.
n = 50000
# Fixed seed so that Restart & Run All draws the same rows on every run.
sample_seed = 42
# Cap each request at the frame's length: DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling is without replacement by default),
# which would otherwise break the cell whenever a CSV has fewer rows than requested.
df = df.sample(min(n, len(df)), random_state=sample_seed)
df_test = df_test.sample(min(n // 2, len(df_test)), random_state=sample_seed)

In [None]:
# Name of the spectra tokenization scheme; it is passed to the data-loading helper
# and reused in the checkpoint tag further down.
method = 'direct'

In [None]:
# Compute max_mz from the (already subsampled) training frame; the value is
# forwarded to load_tokenized_data_with_smiles via its max_mz keyword below.
# NOTE(review): only `df` is inspected here, not `df_test` — confirm the helper
# and the tokenizer tolerate test spectra that exceed this value.
max_mz = calculate_max_mz(df)

In [None]:
# Display how many distinct molecular formulas the training subsample contains
# (shown as the shape tuple of the array of unique values).
unique_formulas = df['molecular_formula'].unique()
unique_formulas.shape

In [None]:
# Obtain the SMILES vocabularies for the training frame; the result is indexed by
# tokenization name (the 'character' entry is selected in the next cell).
# NOTE(review): the vocabularies are derived from `df` only — presumably tokens that
# occur solely in `df_test` are not covered; verify how the helper handles them.
smiles_vocabs = get_or_create_smiles_vocabs(df)

In [None]:
# Tokenize both tables and build the loaders used for training and evaluation.
results = {}
print(f"\nSpectra tokenized with {method} tokenization:")
print(f"\nSMILES tokenized with {'character'} tokenization")

# Select the character-level SMILES vocabulary from the available vocabularies.
smiles_vocab = smiles_vocabs['character']

loaders = load_tokenized_data_with_smiles(
    df, df_test, method, smiles_vocab, max_mz=max_mz
)
train_loader, test_loader = loaders

smiles_vocab_size = len(smiles_vocab)

# Pull a single batch from the training loader to read off the input dimensions.
first_batch = next(iter(train_loader))
sample_batch, target_batch = first_batch
print("Spectra shape:", sample_batch.shape)
print("SMILES shape:", target_batch.shape)

# The size of axis 3 of the spectra batch is what the model receives as embed_depth.
embed_depth = sample_batch.shape[3]

In [None]:
# Hyperparameters for a deliberately small model; the trailing comments record the
# larger values each setting was reduced from.
model_kwargs = dict(
    smiles_vocab_size=len(smiles_vocab),
    embed_depth=embed_depth,
    d_model=64,            # reduced from 256
    nhead=4,               # reduced from 8
    num_layers=2,          # reduced from 6
    dim_feedforward=256,   # reduced from 2048
    dropout=0.1,
    num_classes=None,
)
# NOTE(review): `device` is computed in an earlier cell but the model is not moved
# to it here — confirm train_model_seq2seq takes care of device placement.
model = MS_VIT_Seq2Seq(**model_kwargs)

In [None]:
# Optimizer and loss for the seq2seq training run.
# Adam and CrossEntropyLoss are reached through the explicitly imported `torch`
# package instead of the bare `optim` / `nn` names: no cell in this notebook imports
# those names, so they only resolved if a wildcard import happened to leak them.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# NOTE(review): ignore_index stays disabled, as in the original, so every target
# position — presumably including padding — contributes to the loss. Confirm the
# vocabulary's pad token before enabling ignore_index=smiles_vocab['<pad>'].
criterion_seq = torch.nn.CrossEntropyLoss()

# Train and evaluate; `model` is rebound to the returned model and `history` is kept
# for the plotting cell at the end of the notebook. Checkpoints are written under
# ./model_checkpoints/ with a tag built from the spectra tokenization method.
model, history = train_model_seq2seq(
    model, train_loader, test_loader,
    optimizer, criterion_seq,
    num_epochs=100, evaluate=True, verbose=1,
    checkpoint_path="./model_checkpoints/",
    meta_tag=(method+"_character"),
    use_tensorboard=True,
)

In [None]:
# Evaluate the trained model on the test loader; the SMILES vocabulary is passed
# along with test=True. This rebinds `results`, replacing the empty dict created
# in the tokenization cell.
results = evaluate_model_seq2seq(model, test_loader, smiles_vocab, test=True)

In [None]:
# Plot the history object returned by train_model_seq2seq in the training cell.
plot_training_history(history)