In [1]:
import json
from matchms.importing import load_from_mgf
from rdkit import Chem
from massspecgym.tools.analyzers import analyze_canonical_smiles
import os

In [2]:
file_mgf = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/20241211_msn_library_pos_all_lib_MSn.mgf"
file_json = "/Users/macbook/CODE/Majer:MassSpecGym/data/Retrieval/MassSpecGym_retrieval_candidates_mass.json"
split_file = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/20240929_split.tsv"

In [None]:
print("Loading spectra from MGF file...")
spectra = list(load_from_mgf(file_mgf))
print(f"Total number of spectra loaded: {len(spectra)}")

Loading spectra from MGF file...


In [4]:
print("Filtering spectra with SPECTYPE=ALL_ENERGIES and MS_LEVEL=2...")
filtered_spectra = [
    s for s in spectra
    if s.metadata.get("spectype") == "ALL_ENERGIES" and int(s.metadata["ms_level"]) == 2
]
print(f"Number of spectra after filtering: {len(filtered_spectra)}")

Filtering spectra with SPECTYPE=ALL_ENERGIES and MS_LEVEL=2...
Number of spectra after filtering: 16476


In [5]:
_ = analyze_canonical_smiles(filtered_spectra)

==== RDKit Canonical SMILES ====
=== SMILES Processing Statistics ===
Mode: SPECTRA
Total SMILES extracted: 13984
Unique original SMILES: 13984
Unique canonical SMILES: 13984
Number of invalid SMILES: 0
Number of SMILES unchanged after RDKit canonicalization: 67



({'CC1=C(C=CC(=C1)OC)NC(=O)NC2=CN=C(N=C2N3CCCC3)OC',
  'C[C@H]1/C=C/C=CC=C(C(=O)NC2=CC(=O)C3=C(C2=O)C=C(C(=C3C(=O)/C(=C/[C@@H]([C@H]([C@H](/C=C/[C@H](C/C=C(/C(=O)C[C@@H]1O)\\C)O)C)O)C)/C)O)C)C',
  'CC(=CCCC1(C=CC2=C(O1)C3=C(C=C2)NC(=O)C(C3(C4=CC=C(C=C4)OC)O)OC)C)C',
  'CCCC(=O)O[C@H]1[C@H](C[C@]2([C@H]1[C@H]([C@@]3(C(=O)[C@@H]4[C@H]([C@@]3(C2=O)C)CC([C@@H]4OC(=O)C)(C)C)C)OC(=O)C)OC(=O)C)C',
  'CCNC(=O)/C=C/C1=CC(=CC=C1)Br',
  'CCCCCOC1=CC=C(C=C1)CSC2=NNC(=N2)CC',
  'CC1=C(C=CC(=C1)OCC(=O)N2CC3CN(CC3C2)C4=NC=CC=N4)Cl',
  'COC1=C(C=C(C=C1)CS(=O)(=O)/C=C/C2=C(C=C(C=C2OC)OC)OC)O',
  'CC1=C(C=C(C=C1)N2CC(CC2=O)C(=O)NCC3=CC=CO3)Cl',
  'CC(CCC(C(C)(C)O)O)C1CCC2(C1(CCC34C2CCC5C3(C4)C(CC(C5(C)C(=O)OC6C(C(C(C(O6)CO)O)O)O)O)O)C)C',
  'CC1=CN=C(N1CC2=CC=CC=C2OCCC[C@@H](C)CC(=O)O)C3=CC=C(C=C3)OC(F)(F)F',
  'C[C@]12CC[C@@H]([C@@](C1CC[C@@]3(C2CC=C4[C@]3(CC[C@@]5([C@H]4CC(CC5)(C)C)C(=O)O)C)C)(C)CO)O[C@H]6[C@@H]([C@H]([C@@H]([C@H](O6)CO)O)O)O[C@H]7[C@@H]([C@H]([C@@H]([C@H](O7)CO)O)O)O',
  'C[C@@H]1/C=

In [6]:
print("Extracting and canonicalizing SMILES from filtered spectra...")
smiles_set = set()

for spectrum in filtered_spectra:
    smiles = spectrum.get("smiles")
    smiles_set.add(smiles)

Extracting and canonicalizing SMILES from filtered spectra...


In [7]:
# if len(invalid_smiles_mgf) > 0:
#     print(f"Number of invalid SMILES skipped from MGF: {len(invalid_smiles_mgf)}")
# else:
#     print("No valid SMILES skipped from MGF")

In [8]:
with open(file_json, 'r') as f:
    smiles_dict = json.load(f)

In [9]:
_ = analyze_canonical_smiles(smiles_dict, mode='json')

==== RDKit Canonical SMILES ====
=== SMILES Processing Statistics ===
Mode: JSON
Total SMILES extracted: 32010
Unique original SMILES: 32010
Unique canonical SMILES: 32010
Number of invalid SMILES: 0
Number of SMILES unchanged after RDKit canonicalization: 1447



In [10]:

json_keys_set = set()
invalid_smiles_json = set()

for key in smiles_dict.keys():
    # canonical_key = canonicalize_smiles(key)
    json_keys_set.add(key)
    # if canonical_key:
    #     json_keys_set.add(canonical_key)
    # else:
    #     invalid_smiles_json.add(key)
if len(invalid_smiles_json) > 0:
    print(f"Number of invalid SMILES skipped from JSON: {len(invalid_smiles_json)}")
else:
    print("No valid SMILES skipped from JSON")


No valid SMILES skipped from JSON


In [11]:
print("Comparing SMILES from MGF with JSON keys...")
smiles_in_json = smiles_set.intersection(json_keys_set)
smiles_not_in_json = smiles_set.difference(json_keys_set)

all_present = len(smiles_not_in_json) == 0

print("\n--- Comparison Results ---")
if all_present:
    print("All SMILES from the filtered MGF file are present in the JSON file.")
else:
    print(f"Not all SMILES from the filtered MGF file are present in the JSON file.")
    print(f"Number of SMILES present in JSON: {len(smiles_in_json)}")
    print(f"Number of SMILES NOT present in JSON: {len(smiles_not_in_json)}")

print("\n--- Detailed Summary ---")
print(f"Total SMILES extracted from MGF: {len(smiles_set)}")
print(f"Total SMILES in JSON: {len(json_keys_set)}")
print(f"SMILES present in JSON and MGF: {len(smiles_in_json)}")
print(f"SMILES from MGF not present in JSON: {len(smiles_not_in_json)}")

Comparing SMILES from MGF with JSON keys...

--- Comparison Results ---
Not all SMILES from the filtered MGF file are present in the JSON file.
Number of SMILES present in JSON: 13274
Number of SMILES NOT present in JSON: 710

--- Detailed Summary ---
Total SMILES extracted from MGF: 13984
Total SMILES in JSON: 32010
SMILES present in JSON and MGF: 13274
SMILES from MGF not present in JSON: 710


# MSnRetrieval

In [3]:
from massspecgym.data.transforms import MolFingerprinter, MolToInChIKey, MolToFormulaVector, SpecTokenizer
from massspecgym.data.datasets import MSnDataset, MSnRetrievalDataset
from massspecgym.featurize import SpectrumFeaturizer
from massspecgym.data.data_module import MassSpecDataModule

In [4]:
config = {
    'features': ['collision_energy', 'ionmode', 'adduct', 'spectrum_stats', 'atom_counts', 'value', "retention_time", 'ion_source', 'binned_peaks'],
    'feature_attributes': {
        'atom_counts': {
            'top_n_atoms': 12,
            'include_other': True,
        },
    },
}

In [5]:
featurizer = SpectrumFeaturizer(config, mode='torch')

In [6]:
# Instantiate the dataset
mol_transform = MolFingerprinter(fp_size=2048)

msn_retrieval_dataset = MSnRetrievalDataset(
    pth=file_mgf,
    mol_transform=mol_transform,
    featurizer=featurizer,
    candidates_pth=file_json,
    max_allowed_deviation=0.005
)


Total valid indices: 15674
Dataset length: 15674


In [7]:
# Initialize the data module
data_module = MassSpecDataModule(
    dataset=msn_retrieval_dataset,
    batch_size=12,
    num_workers=0,
    split_pth=split_file
)

In [8]:
data_module.prepare_data()
data_module.setup()

train_loader = data_module.train_dataloader()

Train dataset size: 11892
Val dataset size: 1890


AttributeError: 'MassSpecDataModule' object has no attribute 'test_dataset'

In [None]:
# Test the data loader
for batch in train_loader:
    print(batch['spec'])  # PyG Batch object
    # print(f"batch['mol'] shape: {batch['mol'].shape}")  # Should be [batch_size, fp_size]
    # print(f"batch['candidates'] shape: {batch['candidates'].shape}")  # [total_candidates, fp_size]
    # print(f"batch['labels'] shape: {batch['labels'].shape}")  # [total_candidates]
    # print(f"batch['batch_ptr']: {batch.ptr}")  # [batch_size]
    break

# Original Retrieval

In [4]:
from massspecgym.data import RetrievalDataset, MassSpecDataModule

In [5]:
pth_massspecgym_original = "/Users/macbook/CODE/Majer:MassSpecGym/data/MassSpecGym/MassSpecGym.tsv"

In [6]:
# Init hyperparameters
n_peaks = 60
fp_size = 4096
batch_size = 12

# Load dataset
dataset_original = RetrievalDataset(
    pth=pth_massspecgym_original,
    candidates_pth=file_json,
    spec_transform=SpecTokenizer(n_peaks=n_peaks),
    mol_transform=MolFingerprinter(fp_size=fp_size),
)


In [7]:
# Init data module
data_module_original = MassSpecDataModule(
    dataset=dataset_original,
    batch_size=batch_size,
    num_workers=0,
)

In [8]:
data_module_original.prepare_data()
data_module_original.setup()

train_loader_original = data_module_original.train_dataloader()

Train dataset size: 194119
Val dataset size: 19429


In [9]:
tmp = []
for batch in train_loader_original:
    print(batch['spec'])
    tmp = batch
    break

tensor([[[4.5319e+02, 1.1000e+00],
         [5.5050e+01, 1.0119e-04],
         [5.7071e+01, 1.2012e-02],
         ...,
         [3.8278e+02, 1.0691e-04],
         [4.0595e+02, 1.1269e-04],
         [4.4735e+02, 1.5205e-04]],

        [[3.9615e+02, 1.1000e+00],
         [5.8065e+01, 2.2362e-01],
         [8.4081e+01, 9.4040e-02],
         ...,
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00]],

        [[1.8107e+02, 1.1000e+00],
         [5.4034e+01, 2.4247e-03],
         [6.5039e+01, 2.4672e-02],
         ...,
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00]],

        ...,

        [[3.9716e+02, 1.1000e+00],
         [1.1645e+02, 1.4494e-02],
         [1.3236e+02, 1.8559e-02],
         ...,
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00]],

        [[3.7613e+02, 1.1000e+00],
         [5.5018e+01, 7.2072e-02],
         [5.

In [10]:
tmp.keys()

dict_keys(['spec', 'mol', 'precursor_mz', 'adduct', 'mol_freq', 'identifier', 'smiles', 'candidates', 'candidates_smiles', 'labels'])

In [11]:
tmp['spec'].shape, tmp['mol'].shape, 

(torch.Size([12, 61, 2]), torch.Size([12, 4096]))

In [12]:
tmp['precursor_mz']

tensor([453.1884, 396.1530, 181.0720, 600.4350, 237.1234, 521.2150, 445.1800,
        152.0710, 162.0760, 397.1618, 376.1310, 369.2000])

In [13]:
tmp['adduct'], 

(['[M+Na]+',
  '[M+H]+',
  '[M+H]+',
  '[M+Na]+',
  '[M+H]+',
  '[M+Na]+',
  '[M+Na]+',
  '[M+H]+',
  '[M+H]+',
  '[M+Na]+',
  '[M+H]+',
  '[M+Na]+'],)

In [14]:
tmp['mol_freq']

tensor([  5.,   3., 123.,   3.,   9.,  18.,  38.,  83.,  31.,   1.,  18.,  65.])

In [15]:
tmp['identifier']

['MassSpecGymID0154061',
 'MassSpecGymID0229345',
 'MassSpecGymID0007090',
 'MassSpecGymID0253067',
 'MassSpecGymID0062695',
 'MassSpecGymID0362443',
 'MassSpecGymID0285628',
 'MassSpecGymID0018665',
 'MassSpecGymID0397170',
 'MassSpecGymID0159697',
 'MassSpecGymID0078614',
 'MassSpecGymID0133486']

In [16]:
tmp['smiles']

['CC(C)CC(=O)OC1C(OC2=C1C3=C(C=C2)C=CC(=O)O3)C(C)(C)OC(=O)CC(C)C',
 'C[C@@H]1CN(C[C@@H](N1)C)C2=C(C3=C(C(=C2F)F)C(=O)C(=CN3C4CC4)C(=O)O)F',
 'CN1C2=C(C(=O)N(C1=O)C)NC=N2',
 'C[C@H](CCC(=O)NCCCNCCCCNC(=O)C)[C@H]1CC[C@@H]2[C@@]1([C@H](C[C@H]3[C@H]2[C@@H](C[C@H]4[C@@]3(CC[C@H](C4)O)C)O)O)C',
 'C[C@@H](C(=O)N[C@@H](CC1=CC=CC=C1)C(=O)O)N',
 'CC=C(C)C(=O)OC1C(C(CC2=CC3=C(C(=C2C4=C(C(=C(C=C14)OC)OC)OC)OC)OCO3)C)C',
 'CC(C)CC(=O)OC1C=C2C(C13CO3)C(OC=C2COC(=O)C)OC(=O)CC(C)C',
 'CC(=O)NC1=CC=C(C=C1)O',
 'CNC(CCC(=O)O)C(=O)O',
 'CC(=O)OC(CCCCC1=CC(=C(C=C1)O)O)CCC2=CC(=C(C=C2)O)O',
 'CC(C)(C(=O)OC)OC1=CC=C(C=C1)CCNC(=O)C2=CC=C(C=C2)Cl',
 'CN1CCC23C4NC5=CC=CC=C5C2(C1NC6=CC=CC=C36)CCN4C']

In [17]:
tmp['labels']

[tensor([True, True, True, True, True, True, True, True, True, True, True, True]),
 tensor([False, False, False, False, False, False, False, False, False, False,
         False, False]),
 tensor([False, False, False, False, False, False, False, False, False, False,
         False, False]),
 tensor([False, False, False, False, False, False, False, False, False, False,
         False, False]),
 tensor([False, False, False, False, False, False, False, False, False, False,
         False, False]),
 tensor([False, False, False, False, False, False, False, False, False, False,
         False, False]),
 tensor([False, False, False, False, False, False, False, False, False, False,
         False, False]),
 tensor([False, False, False, False, False, False, False, False, False, False,
         False, False]),
 tensor([False, False, False, False, False, False, False, False, False, False,
         False, False]),
 tensor([False, False, False, False, False, False, False, False, False, False,
      

In [18]:
len(tmp['candidates'])

173

In [19]:
tmp['candidates'][5].shape

torch.Size([12, 4096])

In [20]:
len(tmp['candidates_smiles'])

173

In [21]:
with open(file_json, "r") as file:
    json_dict = json.load(file)

In [48]:
a = list(json_dict.keys())

In [49]:
a[0]

'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC'

In [51]:
len(json_dict[a[0]])

256