In [1]:
from ms_data_funcs import *
from transformer_funcs import *

import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm

In [2]:
# Report whether PyTorch can see a CUDA device and, when it can, print the
# PyTorch/CUDA versions, the GPU count and the name of GPU 0.
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    print("CUDA is available.")
    print("PyTorch version:", torch.__version__)
    print("CUDA version:", torch.version.cuda)
    print("Number of available GPUs:", torch.cuda.device_count())
    print("GPU name:", torch.cuda.get_device_name(0))

CUDA is available.
PyTorch version: 2.0.1+cu118
CUDA version: 11.8
Number of available GPUs: 1
GPU name: NVIDIA GeForce RTX 3080


In [3]:
# Select the compute device: the GPU when CUDA is available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Load data/MoNA/in-silico.csv into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference can leave a column with mixed
# types (and emit a DtypeWarning) on a CSV of this size.
df = pd.read_csv('data/MoNA/in-silico.csv', low_memory=False)
print(df.shape)

  df = pd.read_csv('data/MoNA/in-silico.csv')


(1844352, 17)


In [5]:
# Tokenization methods to compare; the training cell loops over this list.
methods = [
    "direct",
    "wavelet2",
]

In [6]:
# Turn the molecular-formula strings into integer class ids; the fitted
# encoder is kept so the number of classes can be read from it later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['molecular_formula'])

# Not referenced again in the visible cells — presumably consumed by the
# tokenization helpers; TODO confirm.
max_mz = calculate_max_mz(df)

# NOTE(review): the "train" and "test" sets are the same frame and the same
# label array, so any accuracy computed on the test set is measured on data
# the model was trained on. Confirm this is intended.
X_train = X_test = df
y_train = y_test = y

In [7]:
# Shape of the array of distinct molecular formulas, i.e. (number of distinct formulas,).
df["molecular_formula"].unique().shape

(19868,)

In [8]:
def _evaluate_accuracy(model, data_loader, device):
    """Return the fraction of samples in ``data_loader`` that ``model`` classifies correctly.

    The model is switched to eval mode and run without gradient tracking.
    Each batch is moved to ``device``, dimension 1 of the inputs is squeezed
    away, and the index of the largest output along dimension 1 is taken as
    the predicted class.

    Args:
        model: network invoked as ``model(x_batch)``; its output is reduced
            along dimension 1 to obtain one predicted class id per sample.
        data_loader: iterable yielding ``(x_batch, y_batch)`` tensor pairs.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``correct / total`` as a float, or NaN when the loader yields no
        samples (instead of raising ``ZeroDivisionError``).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for x_batch, y_batch in data_loader:
            x_batch = x_batch.to(device).squeeze(1)
            y_batch = y_batch.to(device)
            # Gradients are already disabled here, so argmax on the output
            # replaces the legacy ``torch.max(outputs.data, 1)`` idiom.
            predicted = model(x_batch).argmax(dim=1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()
    return correct / total if total else float("nan")


# The label set comes from the fitted encoder, so the class count is the same
# for every tokenization method and only needs computing once.
num_classes = len(label_encoder.classes_)

results = {}    # method name -> final accuracy
histories = {}  # method name -> history returned by train_model

for method in methods:
    print(f"\nTraining with {method} tokenization:")

    train_loader, test_loader = load_tokenized_data(X_train, y_train,
                                                    X_test, y_test,
                                                    method)

    # Peek at one batch: its last dimension (index 3) sets the model's
    # embedding depth.
    sample_batch, _ = next(iter(train_loader))
    print("Sample shape:", sample_batch.shape)

    embed_depth = sample_batch.shape[3]

    model = MS_VIT(num_classes, embed_depth).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    criterion = nn.CrossEntropyLoss()

    # NOTE(review): every method is given the same checkpoint directory;
    # verify that train_model does not overwrite one method's checkpoints
    # with the next method's.
    model, history = train_model(model, train_loader, test_loader,
                                 optimizer, criterion,
                                 evaluate=False, verbose=1,
                                 checkpoint_path="./model_checkpoints/")
    # Keep each method's history; a single ``history`` variable would only
    # retain the last method's run.
    histories[method] = history

    # Final eval
    accuracy = _evaluate_accuracy(model, test_loader, device)
    results[method] = accuracy
    print(f"Final accuracy with {method} tokenization: {accuracy:.4f}")


Training with direct tokenization:
Sample shape: torch.Size([32, 1, 221, 16])


Epoch 1/50 [Train]:   5%|▍         | 2644/57636 [02:50<1:02:06, 14.76it/s, loss=8.1939]