In [1]:
import os
import numpy as np
from datetime import datetime
from matchms import Spectrum
from matchms.importing import load_from_mgf
from matchms.exporting import save_as_mgf
from collections import Counter
from massspecgym.utils import standardize_smiles
import os
import json
import time
import typing as T
from pathlib import Path
from matchms import Spectrum
from rdkit import Chem

In [2]:
# File names of the MGF inputs that are joined with BASE_PATH and loaded below.
FILE_1, FILE_2, FILE_3, FILE_4 = (
    "20231031_nihnp_library_pos_all_lib_MSn.mgf",
    "20231130_mcescaf_library_pos_all_lib_MSn.mgf",
    "20231130_otavapep_library_pos_all_lib_MSn.mgf",
    "20240411_mcebio_library_pos_all_lib_MSn.mgf",
)

In [3]:
# Directory containing the input MGF files. The default is a machine-specific
# absolute path; set the MSN_BASE_PATH environment variable to run the notebook
# elsewhere without editing this cell.
BASE_PATH = os.environ.get("MSN_BASE_PATH", "/Users/macbook/CODE/MS/data/MSn/ALL")
# Directory (relative to the notebook's working directory) for the combined output.
SAVE_PATH = "../../../data/MSn"

In [4]:
# Full path of every input file, in FILE_1..FILE_4 order.
file_paths = [
    os.path.join(BASE_PATH, file_name)
    for file_name in (FILE_1, FILE_2, FILE_3, FILE_4)
]

In [5]:
# Load every MGF file, tag each spectrum with an IDENTIFIER of the form
# "<root>_<fragment>" (both zero-padded to 7 digits) and collect all spectra
# into one list.
all_spectra = []
spectra_counts = {}

# Initialize counters
root_id_counter = 0  # Counter for root MS2 spectra (keeps running across files)
fragment_id_counter = 0  # Counter for fragments under each root


for file_path in file_paths:
    spectra = list(load_from_mgf(file_path))
    spectra_counts[file_path] = len(spectra)
    idx_list = []  # IDENTIFIERs assigned in this file; checked for uniqueness below
    # No root seen yet in this file. Resetting here prevents fragments at the
    # start of a file from being attached to the last root of the previous file.
    root_id = None

    for spectrum in spectra:
        raw_ms_level = spectrum.get('ms_level')
        if raw_ms_level is None:
            raise ValueError(
                f"Spectrum without 'ms_level' metadata in {file_path}; cannot assign an IDENTIFIER."
            )
        ms_level = int(raw_ms_level)

        if ms_level == 2:
            # New root: bump the root counter and restart fragment numbering
            # (the root itself carries fragment ID '0000000').
            root_id_counter += 1
            fragment_id_counter = 0
            root_id = f"{root_id_counter:07d}"
        else:
            if root_id is None:
                raise ValueError(
                    f"Spectrum with ms_level={ms_level} appears before any MS2 spectrum "
                    f"in {file_path}; cannot determine its root."
                )
            # Fragment of the current root: next fragment number.
            fragment_id_counter += 1

        fragment_id = f"{fragment_id_counter:07d}"
        idx = f"{root_id}_{fragment_id}"
        spectrum.set('IDENTIFIER', idx)
        idx_list.append(idx)

    # Validation: every identifier handed out for this file must be distinct.
    if len(set(idx_list)) != len(idx_list):
        raise ValueError(f"Duplicate IDENTIFIER values were generated for {file_path}.")

    all_spectra.extend(spectra)

for file, count in spectra_counts.items():
    print(f"{file}: {count} spectra")

combined_count = len(all_spectra)
print(f"Total combined spectra: {combined_count}")

/Users/macbook/CODE/MS/data/MSn/ALL/20231031_nihnp_library_pos_all_lib_MSn.mgf: 122837 spectra
/Users/macbook/CODE/MS/data/MSn/ALL/20231130_mcescaf_library_pos_all_lib_MSn.mgf: 243033 spectra
/Users/macbook/CODE/MS/data/MSn/ALL/20231130_otavapep_library_pos_all_lib_MSn.mgf: 63123 spectra
/Users/macbook/CODE/MS/data/MSn/ALL/20240411_mcebio_library_pos_all_lib_MSn.mgf: 374412 spectra
Total combined spectra: 803405


In [7]:
# all_spectra[0].metadata.get("smiles")

'COc1cc(O)c(C(=O)CCc2cc(OC)c(OC)cc2)c(OC)c1'

In [9]:
# all_spectra[0].set("smiles", "fijip")

Spectrum(precursor m/z=347.15, 135 fragments between 41.0 and 347.1)

In [10]:
# all_spectra[0].metadata.get("smiles")

'fijip'

# Retrieve canonical SMILES for all spectra

In [6]:
def canonicalize_smiles_in_spectra(
    spectra_list: T.List["Spectrum"],
    standardize_smiles_func: T.Callable[[T.Union[str, T.List[str]]], T.Union[str, T.List[str]]],
    save_path: str,
    batch_size: int = 500,
    max_retries: int = 10,
    delay_between_retries: int = 60,  # in seconds
    retry_failed: bool = False,
) -> T.Dict[str, T.Optional[str]]:
    """
    Canonicalize the unique SMILES found in a list of spectrum objects.

    Progress is persisted to ``save_path`` as JSON so that an interrupted run
    can be resumed; SMILES already present in that file are not processed again.
    The spectra themselves are not modified.

    Args:
        spectra_list: Objects exposing ``.metadata.get('smiles')``; entries with
            a missing or empty SMILES are ignored.
        standardize_smiles_func: Callable that standardizes one SMILES. It may
            return a string or a list (the first element is used); a falsy
            result is recorded as ``None``.
        save_path: Path of the JSON progress file (read if it exists, rewritten
            after every ``batch_size`` SMILES and after the last one).
        batch_size: Number of SMILES to process between two saves.
        max_retries: Maximum number of attempts per SMILES when
            ``standardize_smiles_func`` raises. With ``max_retries <= 0`` no
            attempt is made and the SMILES is recorded as ``None``.
        delay_between_retries: Seconds to sleep between two attempts.
        retry_failed: If True, SMILES stored as ``None`` (or another falsy
            value) in the progress file are processed again instead of being
            treated as done. Defaults to False (previous behaviour).

    Returns:
        Dictionary mapping each original SMILES to its canonical SMILES, or to
        ``None`` when canonicalization failed.
    """

    def _save_progress(mapping: T.Dict[str, T.Optional[str]]) -> None:
        # Write to a sibling temp file and swap it in, so that an interruption
        # during the write cannot leave a truncated (unloadable) progress file.
        tmp_path = f"{save_path}.tmp"
        with open(tmp_path, 'w') as f:
            json.dump(mapping, f)
        os.replace(tmp_path, save_path)

    # Extract all unique, non-empty SMILES from spectra
    all_smiles = set()
    for spectrum in spectra_list:
        smiles = spectrum.metadata.get('smiles')
        if smiles:
            all_smiles.add(smiles)

    print(f"Total unique SMILES to process: {len(all_smiles)}")

    # Check if there is a saved progress file
    if os.path.exists(save_path):
        print(f"Resuming from saved progress at {save_path}")
        with open(save_path, 'r') as f:
            smiles_dict = json.load(f)
    else:
        smiles_dict = {}

    if retry_failed:
        # Only entries with a usable result count as done.
        processed_smiles = {key for key, value in smiles_dict.items() if value}
    else:
        processed_smiles = set(smiles_dict.keys())
    remaining_smiles = all_smiles - processed_smiles
    print(f"SMILES remaining to process: {len(remaining_smiles)}")

    smiles_list = list(remaining_smiles)
    total_smiles = len(smiles_list)

    # Number of SMILES handled since the last save
    batch_counter = 0
    for idx, original_smiles in enumerate(smiles_list, 1):
        canonical_smiles = None  # stays None when every attempt fails
        for attempt in range(1, max_retries + 1):
            try:
                result = standardize_smiles_func(original_smiles)
            except Exception as e:
                # Broad on purpose: the standardizer's failure modes are not
                # known here, and a failure must not abort the whole run.
                print(f"Error processing SMILES '{original_smiles}': {e}")
                if attempt < max_retries:
                    print(f"Retrying in {delay_between_retries} seconds... (Attempt {attempt}/{max_retries})")
                    time.sleep(delay_between_retries)
                else:
                    print(f"Max retries reached for SMILES '{original_smiles}'. Skipping.")
                continue

            # If the function returns a list, extract the first element
            if isinstance(result, list):
                result = result[0] if result else None
            if result:
                canonical_smiles = result
            else:
                print(f"Warning: Invalid SMILES '{original_smiles}'. Skipping.")
            break

        smiles_dict[original_smiles] = canonical_smiles

        batch_counter += 1
        # Save progress every batch_size SMILES or at the end
        if batch_counter >= batch_size or idx == total_smiles:
            print(f"Processing progress: {idx}/{total_smiles} SMILES")
            _save_progress(smiles_dict)
            print(f"Progress saved to {save_path}")
            batch_counter = 0

    print("Canonicalization completed.")
    return smiles_dict

In [7]:
# JSON progress file: existing results are loaded from it and new ones are
# written back to it by canonicalize_smiles_in_spectra.
save_progress_path = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/smiles_canonicalization_all.json"

# Build (or resume) the original-SMILES -> canonical-SMILES lookup table.
smiles_canonical_dict = canonicalize_smiles_in_spectra(
    all_spectra,
    standardize_smiles,
    save_progress_path,
    batch_size=500,
    max_retries=30,
    delay_between_retries=20,
)

Total unique SMILES to process: 14008
Resuming from saved progress at /Users/macbook/CODE/Majer:MassSpecGym/data/MSn/smiles_canonicalization_all.json
SMILES remaining to process: 0
Canonicalization completed.


In [None]:
# # Update spectra with canonical SMILES
# for spectrum in all_spectra:
#     original_smiles = spectrum.metadata.get('smiles')
#     if original_smiles:
#         canonical_smiles = smiles_canonical_dict.get(original_smiles)
#         if canonical_smiles:
#             spectrum.set('smiles', canonical_smiles)
#         else:
#             print(f"Warning: Could not canonicalize SMILES '{original_smiles}'.")

In [8]:
# Replace each spectrum's SMILES with its canonical form and collect the SMILES
# for which no canonical form is available.
problematic_smiles = []

# Canonical forms present in the lookup table. Needed so that re-running this
# cell (after the spectra were already rewritten in place) does not report the
# already-canonical SMILES as problematic.
already_canonical = {canonical for canonical in smiles_canonical_dict.values() if canonical}

for spectrum in all_spectra:
    original_smiles = spectrum.metadata.get('smiles')
    if not original_smiles:
        continue

    canonical_smiles = smiles_canonical_dict.get(original_smiles)
    if canonical_smiles:
        spectrum.set('smiles', canonical_smiles)
    elif original_smiles not in smiles_canonical_dict and original_smiles in already_canonical:
        # Not a key of the table but equal to one of its canonical values:
        # this spectrum was updated by a previous run of this cell.
        continue
    else:
        # Keep the original SMILES on the spectrum and remember it for the
        # follow-up handling below.
        problematic_smiles.append(original_smiles)
        print(f"Warning: Could not canonicalize SMILES '{original_smiles}'. Keeping the original SMILES.")

In [9]:
# Split the problematic SMILES by whether RDKit can parse them.
valid_smiles, invalid_smiles = [], []

for smi in problematic_smiles:
    mol = Chem.MolFromSmiles(smi)
    # Parsable entries go to valid_smiles, everything else to invalid_smiles.
    target_list = valid_smiles if mol else invalid_smiles
    target_list.append(smi)

In [10]:
# Summarize how many SMILES could not be canonicalized and why.
report_lines = [
    "\nValidation Results:",
    f"Total problematic SMILES: {len(problematic_smiles)}",
    f"Total unqiue problematic SMILES: {len(set(problematic_smiles))}",
    f"Valid SMILES that couldn't be canonicalized: {len(valid_smiles)}",
    f"Invalid SMILES: {len(invalid_smiles)}",
]
print("\n".join(report_lines))


Validation Results:
Total problematic SMILES: 0
Total unqiue problematic SMILES: 0
Valid SMILES that couldn't be canonicalized: 0
Invalid SMILES: 0


In [12]:
# RDKit-canonical form of every distinct problematic SMILES that RDKit can parse.
problematic_smiles_canonized = []
for smi in set(problematic_smiles):
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        print(f"Cannot canonicalize invalid SMILES '{smi}'.")
        continue

    canonical_smi = Chem.MolToSmiles(mol, canonical=True)
    # The spectra are intentionally left untouched here; only the list is built.
    print(f"Canonicalized '{smi}' to '{canonical_smi}'")
    problematic_smiles_canonized.append(canonical_smi)

# Corrected

Canonicalized 'CC(C)CN(CC(=O)N1CCN(CC(OCc2cc3ccccc3cc2)c2c(Cl)cc(Cl)cc2)CC1)CC(C)C' to 'CC(C)CN(CC(=O)N1CCN(CC(OCc2ccc3ccccc3c2)c2ccc(Cl)cc2Cl)CC1)CC(C)C'
Canonicalized 'CC1CCC2(O1)C(C)(C)CC(O[C@@H]1O[C@H](CO[C@@H]3OC[C@](O)(CO)[C@H]3O)[C@@H](O)[C@H](O)[C@H]1O)CC2(C)O' to 'CC1CCC2(O1)C(C)(C)CC(O[C@@H]1O[C@H](CO[C@@H]3OC[C@](O)(CO)[C@H]3O)[C@@H](O)[C@H](O)[C@H]1O)CC2(C)O'
Canonicalized 'CCN(CC)CCOC(=O)C1(c2cc(OC)c(OC)cc2)CCCC1' to 'CCN(CC)CCOC(=O)C1(c2ccc(OC)c(OC)c2)CCCC1'

In [15]:
# Standardize the RDKit-canonical SMILES in one call by passing the whole list
# (rather than a single string) to standardize_smiles.
problematic_pubchcem_canonical = standardize_smiles(problematic_smiles_canonized)

In [18]:
problematic_pubchcem_canonical

['CC(C)CN(CC(C)C)CC(=O)N1CCN(CC1)CC(C2=C(C=C(C=C2)Cl)Cl)OCC3=CC4=CC=CC=C4C=C3',
 'CC1CCC2(O1)C(CC(CC2(C)O)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO[C@H]4[C@@H]([C@](CO4)(CO)O)O)O)O)O)(C)C',
 'CCN(CC)CCOC(=O)C1(CCCC1)C2=CC(=C(C=C2)OC)OC']

In [19]:
problematic_smiles_canonized

['CC(C)CN(CC(=O)N1CCN(CC(OCc2ccc3ccccc3c2)c2ccc(Cl)cc2Cl)CC1)CC(C)C',
 'CC1CCC2(O1)C(C)(C)CC(O[C@@H]1O[C@H](CO[C@@H]3OC[C@](O)(CO)[C@H]3O)[C@@H](O)[C@H](O)[C@H]1O)CC2(C)O',
 'CCN(CC)CCOC(=O)C1(c2ccc(OC)c(OC)c2)CCCC1']

In [20]:
def handle_problematic_smiles(problematic_smiles):
    """
    Re-try standardization of SMILES that could not be canonicalized initially.

    Each distinct SMILES is parsed and canonicalized with RDKit, and the
    RDKit-canonical string is then passed to ``standardize_smiles``.

    Args:
        problematic_smiles: List of SMILES strings that could not be canonicalized
            initially (duplicates are processed once).

    Returns:
        Dictionary mapping each original problematic SMILES to its standardized
        SMILES, or to ``None`` when RDKit cannot parse it, when
        ``standardize_smiles`` raises, or when it returns no result.
    """
    mapping_dict = {}

    for smi in set(problematic_smiles):
        mol = Chem.MolFromSmiles(smi)
        if not mol:
            print(f"Cannot canonicalize invalid SMILES '{smi}'.")
            mapping_dict[smi] = None
            continue

        rdkit_canonical_smi = Chem.MolToSmiles(mol, canonical=True)
        try:
            pubchem_canonical_smi = standardize_smiles(rdkit_canonical_smi)
        except Exception as e:
            print(f"Error standardizing SMILES '{smi}' after RDKit canonicalization: {e}")
            mapping_dict[smi] = None
            continue

        # A list result is reduced to its first element; an empty list counts
        # as "no result".
        if isinstance(pubchem_canonical_smi, list):
            pubchem_canonical_smi = pubchem_canonical_smi[0] if pubchem_canonical_smi else None

        if pubchem_canonical_smi:
            mapping_dict[smi] = pubchem_canonical_smi
            print(f"Canonicalized '{smi}' to '{pubchem_canonical_smi}'")
        else:
            # Do not report an empty/None result as a successful canonicalization.
            mapping_dict[smi] = None
            print(f"Warning: Standardization returned no result for SMILES '{smi}'.")

    return mapping_dict

In [21]:
# Map each problematic original SMILES to its standardized form
# (None where RDKit parsing or standardization fails).
problematic_mapping_dict = handle_problematic_smiles(problematic_smiles)

Canonicalized 'CC(C)CN(CC(=O)N1CCN(CC(OCc2cc3ccccc3cc2)c2c(Cl)cc(Cl)cc2)CC1)CC(C)C' to 'CC(C)CN(CC(C)C)CC(=O)N1CCN(CC1)CC(C2=C(C=C(C=C2)Cl)Cl)OCC3=CC4=CC=CC=C4C=C3'
Canonicalized 'CC1CCC2(O1)C(C)(C)CC(O[C@@H]1O[C@H](CO[C@@H]3OC[C@](O)(CO)[C@H]3O)[C@@H](O)[C@H](O)[C@H]1O)CC2(C)O' to 'CC1CCC2(O1)C(CC(CC2(C)O)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO[C@H]4[C@@H]([C@](CO4)(CO)O)O)O)O)O)(C)C'
Canonicalized 'CCN(CC)CCOC(=O)C1(c2cc(OC)c(OC)cc2)CCCC1' to 'CCN(CC)CCOC(=O)C1(CCCC1)C2=CC(=C(C=C2)OC)OC'


In [22]:
# Work on a shallow copy of the full lookup table so that merging further
# entries into merged_canonical leaves smiles_canonical_dict unchanged.
merged_canonical = smiles_canonical_dict.copy()

In [23]:
# Merge the re-standardized entries; for keys present in both dictionaries the
# value from problematic_mapping_dict replaces the copied one.
merged_canonical.update(problematic_mapping_dict)

In [24]:
# NOTE: this is the same file as save_progress_path (the resume cache read by
# canonicalize_smiles_in_spectra), so enabling the dump below overwrites that
# cache with the merged mapping.
output_merged_path = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/smiles_canonicalization_all.json"
# with open(output_merged_path, 'w') as outfile:
#     json.dump(merged_canonical, outfile, indent=4)

In [25]:
len(merged_canonical)

14008

In [13]:
# Build a date-stamped (YYYYMMDD) file name for the combined library under SAVE_PATH.
current_date = f"{datetime.now():%Y%m%d}"
output_file_name = current_date + "_msn_library_pos_all_lib_MSn.mgf"
output_file_path = os.path.join(SAVE_PATH, output_file_name)

# save_as_mgf(all_spectra, output_file_path)

dict_keys(['spectra'])


In [7]:
# Read the exported file back and check that no spectrum was lost on the way.
loaded_spectra = list(load_from_mgf(output_file_path))
loaded_count = len(loaded_spectra)

print(f"Loaded spectra from saved file: {loaded_count}")

# The round trip is considered valid when both counts are equal.
if combined_count != loaded_count:
    print("Validation failed: Counts do not match!")
    print(f"Combined count: {combined_count}, Loaded count: {loaded_count}")
else:
    print("Validation successful: Combined and loaded spectra counts match.")

Loaded spectra from saved file: 803405
Validation successful: Combined and loaded spectra counts match.


In [8]:
loaded_spectra[0].metadata_dict()

{'charge': 1,
 'description': 'NIH NPAC ACONN collection of NP',
 'formula': 'C19H22O6',
 'inchi': 'InChI=1S/C19H22O6/c1-22-13-10-15(21)19(18(11-13)25-4)14(20)7-5-12-6-8-16(23-2)17(9-12)24-3/h6,8-11,21H,5,7H2,1-4H3',
 'smiles': 'COc1cc(O)c(C(=O)CCc2cc(OC)c(OC)cc2)c(OC)c1',
 'feature_id': '-1',
 'adduct': '[M+H]+',
 'spectype': 'ALL_MSN_TO_PSEUDO_MS2',
 'fragmentation_method': 'HCD',
 'isolation_window': '1.2000000476840569',
 'acquisition': 'Crude',
 'instrument_type': 'Orbitrap',
 'ims_type': 'none',
 'ion_source': 'ESI',
 'ionmode': 'positive',
 'dataset_id': 'MSVPLACEHOLDERID',
 'usi': 'mzspec:MSVPLACEHOLDERID:20230404_pluskal_nih_01P_A3_id_positive.mzML:-1',
 'scans': '-1',
 'precursor_purity': '1.0',
 'quality_chimeric': 'PASSED',
 'quality_explained_intensity': '0.97712445',
 'quality_explained_signals': '0.8',
 'collision_energy': '60.0',
 'num_peaks': '135',
 'compound_name': 'MEGxp0_001769',
 'parent_mass': '346.141638',
 'inchi_aux': 'MQEOTHTYNCMSAN-UHFFFAOYSA-N',
 'ms_level'