In [1]:
import json
from matchms.importing import load_from_mgf
from rdkit import Chem
import os

In [2]:
# Root of the MassSpecGym data directory. Set the MASSSPECGYM_DATA_DIR
# environment variable to run this notebook on another machine; when it is
# unset, the original local location is used, so both paths below resolve to
# exactly the same strings as before.
DATA_DIR = os.environ.get(
    "MASSSPECGYM_DATA_DIR",
    "/Users/macbook/CODE/Majer:MassSpecGym/data",
).rstrip("/")

# MGF spectral library that is read and filtered further down.
file_mgf = f"{DATA_DIR}/MSn/20240929_msn_library_pos_all_lib_MSn.mgf"
# JSON file whose top-level keys are compared against the MGF SMILES.
file_json = f"{DATA_DIR}/Retrieval/MassSpecGym_retrieval_candidates_mass.json"

In [3]:
def canonicalize_smiles(smiles):
    """
    Canonicalize a SMILES string using RDKit.

    Parameters
    ----------
    smiles : str or None
        The SMILES string to canonicalize.

    Returns
    -------
    str or None
        The canonical SMILES, or None when the input is not a non-empty
        string or when RDKit cannot parse it into a molecule.
    """
    # Reject missing / non-string / blank input up front so callers get the
    # documented None instead of an exception or an empty result.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    # A failed parse is signalled by None; test for it explicitly rather
    # than relying on the truthiness of the molecule object.
    if mol is None:
        return None
    return Chem.MolToSmiles(mol, canonical=True)

In [4]:
# Materialise every spectrum yielded by the MGF reader into a list so it can
# be counted here and filtered in the following cells.
print("Loading spectra from MGF file...")
spectra = [loaded for loaded in load_from_mgf(file_mgf)]
n_loaded = len(spectra)
print(f"Total number of spectra loaded: {n_loaded}")

Loading spectra from MGF file...
Total number of spectra loaded: 803405


In [25]:
print("Filtering spectra with SPECTYPE=ALL_ENERGIES and MS_LEVEL=2...")


def _ms_level(spectrum):
    """Return the spectrum's "ms_level" metadata as an int.

    Returns None when the field is missing or cannot be converted to an
    integer, so a single incomplete record does not abort the whole filter.
    """
    try:
        return int(spectrum.get("ms_level"))
    except (TypeError, ValueError):
        return None


# Keep only spectra whose spectype is ALL_ENERGIES and whose MS level is 2.
# Metadata is read through spectrum.get(...), the same accessor used for
# "smiles" in the extraction cell, so a missing key yields None rather than
# raising KeyError.
filtered_spectra = [
    spectrum for spectrum in spectra
    if spectrum.get("spectype") == "ALL_ENERGIES" and _ms_level(spectrum) == 2
]
print(f"Number of spectra after filtering: {len(filtered_spectra)}")

Filtering spectra with SPECTYPE=ALL_ENERGIES and MS_LEVEL=2...
Number of spectra after filtering: 16476


In [26]:
# Collect the unique canonical SMILES found in the filtered spectra.
# Raw strings that fail canonicalization are kept in a separate set so they
# can be reported afterwards; spectra without a SMILES entry are ignored.
print("Extracting and canonicalizing SMILES from filtered spectra...")
smiles_set, invalid_smiles_mgf = set(), set()

for spec in filtered_spectra:
    raw_smiles = spec.get("smiles")
    if not raw_smiles:
        continue
    canon = canonicalize_smiles(raw_smiles)
    if canon:
        smiles_set.add(canon)
        continue
    invalid_smiles_mgf.add(raw_smiles)

Extracting and canonicalizing SMILES from filtered spectra...


In [35]:
# Report how many distinct raw SMILES from the MGF could not be canonicalized.
if invalid_smiles_mgf:
    print(f"Number of invalid SMILES skipped from MGF: {len(invalid_smiles_mgf)}")
else:
    # Nothing was skipped, i.e. there were no *invalid* SMILES.
    print("No invalid SMILES skipped from MGF")

In [36]:
# Load the JSON file; only its top-level keys (SMILES strings) are used below.
# The encoding is given explicitly so reading does not depend on the
# platform's default locale.
with open(file_json, 'r', encoding='utf-8') as f:
    smiles_dict = json.load(f)

json_keys_set = set()        # canonical form of every parseable key
invalid_smiles_json = set()  # raw keys that could not be canonicalized

# Canonicalize the keys the same way as the MGF SMILES so both sides of the
# later set comparison use the same representation.
for key in smiles_dict:
    canonical_key = canonicalize_smiles(key)
    if canonical_key:
        json_keys_set.add(canonical_key)
    else:
        invalid_smiles_json.add(key)

if invalid_smiles_json:
    print(f"Number of invalid SMILES skipped from JSON: {len(invalid_smiles_json)}")
else:
    # Nothing was skipped, i.e. there were no *invalid* SMILES.
    print("No invalid SMILES skipped from JSON")


No valid SMILES skipped from JSON


In [34]:
# Set algebra on canonical SMILES: the overlap between the MGF and the JSON
# keys, and the MGF entries that have no matching JSON key.
print("Comparing SMILES from MGF with JSON keys...")
smiles_in_json = smiles_set & json_keys_set
smiles_not_in_json = smiles_set - json_keys_set

# Every MGF SMILES is covered exactly when nothing is left over.
all_present = not smiles_not_in_json

n_present = len(smiles_in_json)
n_missing = len(smiles_not_in_json)

print("\n--- Comparison Results ---")
if not all_present:
    print("Not all SMILES from the filtered MGF file are present in the JSON file.")
    print(f"Number of SMILES present in JSON: {n_present}")
    print(f"Number of SMILES NOT present in JSON: {n_missing}")
else:
    print("All SMILES from the filtered MGF file are present in the JSON file.")

# The summary is printed unconditionally, whichever branch ran above.
print("\n--- Detailed Summary ---")
print(f"Total SMILES extracted from MGF: {len(smiles_set)}")
print(f"Total SMILES in JSON: {len(json_keys_set)}")
print(f"SMILES present in JSON: {n_present}")
print(f"SMILES not present in JSON: {n_missing}")

Comparing SMILES from MGF with JSON keys...

--- Comparison Results ---
Not all SMILES from the filtered MGF file are present in the JSON file.
Number of SMILES present in JSON: 12786
Number of SMILES NOT present in JSON: 1198

--- Detailed Summary ---
Total SMILES extracted from MGF: 13984
Total SMILES in JSON: 32010
SMILES present in JSON: 12786
SMILES not present in JSON: 1198
