In [1]:
import json
from matchms.importing import load_from_mgf
from rdkit import Chem
from massspecgym.tools.analyzers import analyze_canonical_smiles
import os

In [2]:
# Root directory of the local MassSpecGym data. Set the MASSSPECGYM_DATA_DIR
# environment variable to run this notebook against a different checkout; when
# it is unset, the original hard-coded location is used, so existing setups
# keep working unchanged.
DATA_DIR = os.environ.get(
    "MASSSPECGYM_DATA_DIR",
    "/Users/macbook/CODE/Majer:MassSpecGym/data",
)

# Inputs used below: the MSn spectral library (MGF), the retrieval candidates
# file (JSON) and the split table (TSV).
file_mgf = f"{DATA_DIR}/MSn/20241211_msn_library_pos_all_lib_MSn.mgf"
file_json = f"{DATA_DIR}/Retrieval/MassSpecGym_retrieval_candidates_mass.json"
split_file = f"{DATA_DIR}/MSn/20240929_split.tsv"

In [4]:
# Materialise everything load_from_mgf yields for the MSn library into a list,
# then report how many spectra were read.
print("Loading spectra from MGF file...")
spectra = [spectrum for spectrum in load_from_mgf(file_mgf)]
print(f"Total number of spectra loaded: {len(spectra)}")

Loading spectra from MGF file...
Total number of spectra loaded: 803405


In [5]:
# Keep only spectra whose metadata marks them as SPECTYPE == "ALL_ENERGIES"
# and whose MS level, converted to int, equals 2.
print("Filtering spectra with SPECTYPE=ALL_ENERGIES and MS_LEVEL=2...")
filtered_spectra = []
for spectrum in spectra:
    # A missing "spectype" yields None from .get() and is simply skipped.
    if spectrum.metadata.get("spectype") != "ALL_ENERGIES":
        continue
    # "ms_level" is indexed directly, so a matching spectrum without it raises KeyError.
    if int(spectrum.metadata["ms_level"]) == 2:
        filtered_spectra.append(spectrum)
print(f"Number of spectra after filtering: {len(filtered_spectra)}")

Filtering spectra with SPECTYPE=ALL_ENERGIES and MS_LEVEL=2...
Number of spectra after filtering: 16476


In [7]:
# Run the SMILES analysis helper on the filtered spectra, leaving its `mode`
# argument at the default. Judging by the recorded output it prints a
# "SMILES Processing Statistics" report (totals, unique counts, invalid count).
analyze_canonical_smiles(filtered_spectra)

=== SMILES Processing Statistics ===
Mode: SPECTRA
Total SMILES extracted: 13984
Unique original SMILES: 13984
Unique canonical SMILES: 13984
Number of invalid SMILES: 0
Number of SMILES unchanged after RDKit canonicalization: 67



In [21]:
# Collect the distinct "smiles" values of the filtered spectra.
# NOTE: despite the wording of the message, nothing is canonicalized here; the
# values are used exactly as stored, and a spectrum without a "smiles" entry
# contributes whatever spectrum.get() returns for a missing key.
print("Extracting and canonicalizing SMILES from filtered spectra...")
smiles_set = {spectrum.get("smiles") for spectrum in filtered_spectra}

Extracting and canonicalizing SMILES from filtered spectra...


In [9]:
# if len(invalid_smiles_mgf) > 0:
#     print(f"Number of invalid SMILES skipped from MGF: {len(invalid_smiles_mgf)}")
# else:
#     print("No invalid SMILES skipped from MGF")

No valid SMILES skipped from MGF


In [22]:
# Load the retrieval candidates file. The encoding is given explicitly so the
# result does not depend on the platform's locale default; JSON interchange
# text is UTF-8.
with open(file_json, "r", encoding="utf-8") as candidates_file:
    smiles_dict = json.load(candidates_file)

In [24]:
# Run the SMILES analysis helper on the candidates JSON (mode='json').
# The return value is bound to `_` so it is not echoed as the cell's output.
_ = analyze_canonical_smiles(smiles_dict, mode='json')

=== SMILES Processing Statistics ===
Mode: JSON
Total SMILES extracted: 32010
Unique original SMILES: 32010
Unique canonical SMILES: 32010
Number of invalid SMILES: 0
Number of SMILES unchanged after RDKit canonicalization: 1447



In [25]:

# The JSON keys are used verbatim as SMILES strings: no canonicalization or
# validation is applied, so no key is ever rejected and `invalid_smiles_json`
# stays empty. The set and the report below are kept so that a validation step
# can be added later without changing the names other cells rely on.
json_keys_set = set(smiles_dict)
invalid_smiles_json = set()

if invalid_smiles_json:
    print(f"Number of invalid SMILES skipped from JSON: {len(invalid_smiles_json)}")
else:
    # Reports that no *invalid* SMILES were skipped (the message used to read
    # "No valid SMILES skipped", which states the opposite).
    print("No invalid SMILES skipped from JSON")


No valid SMILES skipped from JSON


In [27]:
# Compare the SMILES collected from the MGF spectra against the JSON keys using
# exact string equality, then print a short verdict and a count summary.
print("Comparing SMILES from MGF with JSON keys...")
smiles_in_json = smiles_set & json_keys_set
smiles_not_in_json = smiles_set - json_keys_set

# An empty difference means every MGF SMILES was found among the JSON keys.
all_present = not smiles_not_in_json

print("\n--- Comparison Results ---")
if not all_present:
    print("Not all SMILES from the filtered MGF file are present in the JSON file.")
    print(f"Number of SMILES present in JSON: {len(smiles_in_json)}")
    print(f"Number of SMILES NOT present in JSON: {len(smiles_not_in_json)}")
else:
    print("All SMILES from the filtered MGF file are present in the JSON file.")

print("\n--- Detailed Summary ---")
print(f"Total SMILES extracted from MGF: {len(smiles_set)}")
print(f"Total SMILES in JSON: {len(json_keys_set)}")
print(f"SMILES present in JSON and MGF: {len(smiles_in_json)}")
print(f"SMILES from MGF not present in JSON: {len(smiles_not_in_json)}")

Comparing SMILES from MGF with JSON keys...

--- Comparison Results ---
Not all SMILES from the filtered MGF file are present in the JSON file.
Number of SMILES present in JSON: 13274
Number of SMILES NOT present in JSON: 710

--- Detailed Summary ---
Total SMILES extracted from MGF: 13984
Total SMILES in JSON: 32010
SMILES present in JSON and MGF: 13274
SMILES from MGF not present in JSON: 710


# MSnRetrieval

In [14]:
from massspecgym.data.transforms import MolFingerprinter, MolToInChIKey, MolToFormulaVector
from massspecgym.data.datasets import MSnDataset, MSnRetrievalDataset
from massspecgym.featurize import SpectrumFeaturizer
from massspecgym.data.data_module import MassSpecDataModule

In [15]:
# Configuration handed to SpectrumFeaturizer: the list of feature names to
# produce, plus per-feature options (currently only for 'atom_counts').
config = dict(
    features=[
        'collision_energy',
        'ionmode',
        'adduct',
        'spectrum_stats',
        'atom_counts',
        'value',
        'retention_time',
        'ion_source',
        'binned_peaks',
    ],
    feature_attributes=dict(
        atom_counts=dict(top_n_atoms=12, include_other=True),
    ),
)

In [16]:
# Build the spectrum featurizer from `config`.
# NOTE(review): mode='torch' presumably makes it emit torch tensors — confirm.
featurizer = SpectrumFeaturizer(config, mode='torch')

In [17]:
# Molecule transform: a MolFingerprinter configured with fp_size=2048.
mol_transform = MolFingerprinter(fp_size=2048)

# Retrieval dataset over the MSn library, paired with the candidates JSON.
# NOTE(review): the meaning and units of max_allowed_deviation=0.005 are not
# visible from this notebook — confirm against MSnRetrievalDataset.
msn_retrieval_dataset = MSnRetrievalDataset(
    pth=file_mgf,
    candidates_pth=file_json,
    featurizer=featurizer,
    mol_transform=mol_transform,
    max_allowed_deviation=0.005,
)




In [18]:
# Wrap the retrieval dataset in a data module that reads the split table.
# Batches hold 12 items; num_workers=0 keeps loading in the main process.
data_module = MassSpecDataModule(
    dataset=msn_retrieval_dataset,
    split_pth=split_file,
    batch_size=12,
    num_workers=0,
)

In [19]:
# Run the data module's preparation and setup steps before requesting a loader.
# NOTE(review): presumably the usual Lightning DataModule hooks, with setup()
# applying the split file — confirm against MassSpecDataModule.
data_module.prepare_data()
data_module.setup()

# Loader for the training portion of the dataset.
train_loader = data_module.train_dataloader()

In [20]:
# Smoke-test the training loader: pull the first batch, print its contents, stop.
# NOTE(review): the recorded output of this cell is
# "TypeError: DataLoader found invalid type: '<class 'numpy.ndarray'>'".
# Presumably some item field reaches the loader's collate step as a NumPy array
# instead of a tensor — confirm in the dataset / transform code.
for batch in train_loader:
    print(batch['spec'])  # described by the original author as a PyG Batch object
    # Expected shapes per the original notes: mol -> [batch_size, fp_size],
    # candidates -> [total_candidates, fp_size], labels -> [total_candidates].
    for key in ('mol', 'candidates', 'labels'):
        print(f"batch['{key}'] shape: {batch[key].shape}")
    print(f"batch['batch_ptr']: {batch['batch_ptr']}")  # original note: [batch_size]
    break

TypeError: DataLoader found invalid type: '<class 'numpy.ndarray'>'