In [1]:
from massspecgym.data.datasets import MSnDataset
from massspecgym.featurize import SpectrumFeaturizer
from massspecgym.data import RetrievalDataset, MassSpecDataModule
import os
from notebooks.MSn.transformer_model.mine_tokenizers import ByteBPETokenizerWithSpecialTokens
import selfies as sf

In [2]:
spectra_mgf = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/20241211_msn_library_pos_all_lib_MSn.mgf"
split_file = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/20241211_split.tsv"

In [3]:
config = {
    'features': ['binned_peaks'],
    'feature_attributes': {
        'binned_peaks': {
            'max_mz': 1000,
            'bin_width': 0.25,
            'to_rel_intensities': True,
        },
    },
}

In [4]:
featurizer = SpectrumFeaturizer(config, mode='torch')
batch_size = 12

In [5]:
msn_dataset = MSnDataset(
    pth=spectra_mgf,
    featurizer=featurizer,
    mol_transform=None,
    max_allowed_deviation=0.005
)

In [6]:
data_module_msn = MassSpecDataModule(
    dataset=msn_dataset,
    batch_size=batch_size,
    split_pth=split_file,
    num_workers=0,
)

In [7]:
# Retrieve SMILES from the dataset
smiles_list = msn_dataset.smiles 

# Train on MSn SMILES

In [8]:
SMILES_TOKENIZER_SAVE_PATH = "/Users/macbook/CODE/Majer:MassSpecGym/data/tokenizers/smiles_tokenizer.json"
SELFIES_TOKENIZER_SAVE_PATH = "/Users/macbook/CODE/Majer:MassSpecGym/data/tokenizers/selfies_tokenizer.json"

os.makedirs(os.path.dirname(SMILES_TOKENIZER_SAVE_PATH), exist_ok=True)
os.makedirs(os.path.dirname(SELFIES_TOKENIZER_SAVE_PATH), exist_ok=True)

In [9]:
# Retrieve SMILES from the dataset
smiles_list = msn_dataset.smiles 
selfies_list = [sf.encoder(smi, strict=False) for smi in smiles_list]

In [10]:
print("\nInitializing and Training SMILES Tokenizer...")
smiles_tokenizer = ByteBPETokenizerWithSpecialTokens(max_len=200)

smiles_tokenizer.train(
    texts=smiles_list,
    vocab_size=1000,
    min_frequency=2,
    save_path=SMILES_TOKENIZER_SAVE_PATH,
    show_progress=True
)


Initializing and Training SMILES Tokenizer...
Initialized a new Byte-Level BPE Tokenizer.
Starting training on 16476 texts...



Training complete.
Tokenizer saved to /Users/macbook/CODE/Majer:MassSpecGym/data/tokenizers/smiles_tokenizer.json.


In [11]:
# Step 5: Initialize and Train the SELFIES Tokenizer
print("\nInitializing and Training SELFIES Tokenizer...")
selfies_tokenizer = ByteBPETokenizerWithSpecialTokens(max_len=200)

selfies_tokenizer.train(
    texts=selfies_list,
    vocab_size=1000,       # SELFIES typically have a smaller, fixed set of symbols
    min_frequency=1,       # Each SELFIES symbol is meaningful; set min_frequency to 1
    save_path=SELFIES_TOKENIZER_SAVE_PATH,
    show_progress=True
)


Initializing and Training SELFIES Tokenizer...
Initialized a new Byte-Level BPE Tokenizer.
Starting training on 16476 texts...



Training complete.
Tokenizer saved to /Users/macbook/CODE/Majer:MassSpecGym/data/tokenizers/selfies_tokenizer.json.


In [12]:
# Step 6: Verify SMILES Tokenizer
print("\nVerifying SMILES Tokenizer...")
sample_smiles = "COC1=C(C=C(C=C1)CCC(=O)C2=C#(C=C(C=C2OC)OC)O)OC"  # Ethanol
smiles_tokenizer = ByteBPETokenizerWithSpecialTokens(tokenizer_path=SMILES_TOKENIZER_SAVE_PATH)
encoded_smiles = smiles_tokenizer.encode(sample_smiles)
print(f"Encoded SMILES: {encoded_smiles}")

decoded_smiles = smiles_tokenizer.decode(encoded_smiles)
print(f"Decoded SMILES: {decoded_smiles}")

# Step 7: Verify SELFIES Tokenizer
selfies_tokenizer = ByteBPETokenizerWithSpecialTokens(tokenizer_path=SELFIES_TOKENIZER_SAVE_PATH)
print("\nVerifying SELFIES Tokenizer...")
encoded_selfies = selfies_tokenizer.encode(sample_smiles)  # Encode SMILES via SELFIES tokenizer
print(f"Encoded SELFIES: {encoded_selfies}")
decoded_selfies = selfies_tokenizer.decode(encoded_selfies)
print(f"Decoded SMILES: {decoded_selfies}")


Verifying SMILES Tokenizer...
Loaded tokenizer from /Users/macbook/CODE/Majer:MassSpecGym/data/tokenizers/smiles_tokenizer.json.
Encoded SMILES: [1, 283, 20, 32, 38, 11, 38, 32, 38, 11, 38, 32, 38, 20, 12, 269, 261, 50, 12, 38, 21, 32, 38, 6, 11, 38, 32, 38, 11, 38, 32, 38, 21, 265, 12, 265, 12, 50, 12, 265, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded SMILES:  COC1=C(C=C(C=C1)CCC(=O)C2=C#(C=C(C=C2OC)OC)O)OC
Loaded tokenizer from /Users/macbook/CODE/Majer:MassSpecGym/data/tokenizers/selfies_tokenizer.json.

Verifying SELFIES Tokenizer...
Encoded SELF

In [14]:
import tokenizers
print(tokenizers.__version__)

0.19.1


In [None]:
!pip freeze | grep tokenizers