In [1]:
import os
import numpy as np
from datetime import datetime
from matchms import Spectrum
from matchms.importing import load_from_mgf
from matchms.exporting import save_as_mgf
from collections import Counter
from massspecgym.utils import standardize_smiles
from massspecgym.tools.data import canonicalize_smiles_in_spectra, handle_problematic_smiles
import os
import json
import time
import typing as T
from pathlib import Path
from matchms import Spectrum
from rdkit import Chem

In [2]:
# File names of the four MGF libraries that are loaded and merged below.
FILE_1, FILE_2, FILE_3, FILE_4 = (
    "20231031_nihnp_library_pos_all_lib_MSn.mgf",
    "20231130_mcescaf_library_pos_all_lib_MSn.mgf",
    "20231130_otavapep_library_pos_all_lib_MSn.mgf",
    "20240411_mcebio_library_pos_all_lib_MSn.mgf",
)

In [3]:
# Directory that holds the input MGF libraries. The default is the original
# author's local directory; set the MSN_BASE_PATH environment variable to run
# the notebook against another copy of the data without editing this cell.
BASE_PATH = os.environ.get("MSN_BASE_PATH", "/Users/macbook/CODE/MS/data/MSn/ALL")
# Directory used to build the path of the merged output MGF file.
SAVE_PATH = "../../../data/MSn"

In [4]:
# Full path of every input library, in FILE_1..FILE_4 order.
file_paths = [
    os.path.join(BASE_PATH, file_name)
    for file_name in (FILE_1, FILE_2, FILE_3, FILE_4)
]

In [None]:
# Load every library, tag each spectrum with an IDENTIFIER of the form
# "<root>_<fragment>" (both zero-padded to 7 digits) and collect everything in
# `all_spectra`. An MS2 spectrum starts a new root and gets fragment 0000000;
# every following non-MS2 spectrum is numbered consecutively under that root.
all_spectra = []
spectra_counts = {}

# Running number of MS2 (root) spectra. It is deliberately NOT reset between
# files, so the root part of an IDENTIFIER is unique across the merged set.
root_id_counter = 0
# Running number of non-MS2 spectra seen under the current root.
fragment_id_counter = 0


for file_path in file_paths:
    spectra = list(load_from_mgf(file_path))
    spectra_counts[file_path] = len(spectra)
    idx_list = []  # IDENTIFIERs assigned in this file, validated after the loop
    # No root is known yet in this file. Without this reset, a file that starts
    # with a non-MS2 spectrum would hit an unbound `root_id` (first file) or be
    # silently attached to the last root of the previous file (later files).
    root_id = None

    for position, spectrum in enumerate(spectra):
        raw_ms_level = spectrum.get('ms_level')
        if raw_ms_level is None:
            # Fail with a message that names the offending spectrum instead of
            # the opaque TypeError that int(None) would raise.
            raise ValueError(
                f"Spectrum #{position} in {file_path} has no 'ms_level' metadata; "
                "cannot assign an IDENTIFIER."
            )
        ms_level = int(raw_ms_level)

        if ms_level == 2:
            root_id_counter += 1
            fragment_id_counter = 0  # Reset fragment counter for new root
            root_id = f"{root_id_counter:07d}"
        else:
            if root_id is None:
                raise ValueError(
                    f"Spectrum #{position} in {file_path} has ms_level {ms_level} "
                    "but no preceding MS2 spectrum in the same file to attach it to."
                )
            fragment_id_counter += 1

        # For a root the fragment part is '0000000'; otherwise it is the
        # running fragment number under the current root.
        fragment_id = f"{fragment_id_counter:07d}"
        idx = f"{root_id}_{fragment_id}"
        spectrum.set('IDENTIFIER', idx)
        idx_list.append(idx)

    # Validation the original list was collected for: every IDENTIFIER handed
    # out within a file must be distinct.
    if len(set(idx_list)) != len(idx_list):
        raise ValueError(f"Duplicate IDENTIFIERs were generated for {file_path}.")

    all_spectra.extend(spectra)

for file, count in spectra_counts.items():
    print(f"{file}: {count} spectra")

combined_count = len(all_spectra)
print(f"Total combined spectra: {combined_count}")

# Retrieve canonical version

In [6]:
# File handed to the helper as `save_path`. The recorded output of this cell
# shows the helper resuming from saved progress stored at this location.
save_progress_path = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/smiles_canonicalization_all.json"

# Batching / retry settings, forwarded unchanged as keyword arguments.
canonicalization_settings = {
    "batch_size": 500,
    "max_retries": 30,
    "delay_between_retries": 20,
}

# Mapping later queried with the spectra's original SMILES as keys.
smiles_canonical_dict = canonicalize_smiles_in_spectra(
    spectra_list=all_spectra,
    standardize_smiles_func=standardize_smiles,
    save_path=save_progress_path,
    **canonicalization_settings,
)

Total unique SMILES to process: 14008
Resuming from saved progress at /Users/macbook/CODE/Majer:MassSpecGym/data/MSn/smiles_canonicalization_all.json
SMILES remaining to process: 0
Canonicalization completed.


In [7]:
# SMILES for which `smiles_canonical_dict` holds no (non-empty) canonical form.
# One entry is appended per affected spectrum, so duplicates are expected.
problematic_smiles = []

for spectrum in all_spectra:
    original_smiles = spectrum.metadata.get('smiles')
    if not original_smiles:
        # Spectrum carries no SMILES: nothing to replace.
        continue

    canonical_smiles = smiles_canonical_dict.get(original_smiles)
    if not canonical_smiles:
        # Leave the spectrum's SMILES untouched and remember it for the
        # follow-up handling of problematic entries.
        problematic_smiles.append(original_smiles)
        print(f"Warning: Could not canonicalize SMILES '{original_smiles}'. Keeping the original SMILES.")
        continue

    spectrum.set('smiles', canonical_smiles)

In [8]:
# Initialize lists to categorize SMILES
valid_smiles = []
invalid_smiles = []

for smi in problematic_smiles:
    mol = Chem.MolFromSmiles(smi)
    if mol:
        valid_smiles.append(smi)
    else:
        invalid_smiles.append(smi)

In [9]:
# Report the results
print("\nValidation Results:")
print(f"Total problematic SMILES: {len(problematic_smiles)}")
print(f"Total unqiue problematic SMILES: {len(set(problematic_smiles))}")
print(f"Valid SMILES that couldn't be canonicalized: {len(valid_smiles)}")
print(f"Invalid SMILES: {len(invalid_smiles)}")


Validation Results:
Total problematic SMILES: 0
Total unqiue problematic SMILES: 0
Valid SMILES that couldn't be canonicalized: 0
Invalid SMILES: 0


In [10]:
# Fallback for SMILES the PubChem step could not canonicalize: parse each
# distinct one with RDKit and collect RDKit's canonical SMILES, so that these
# can be sent through the PubChem canonicalization once more.
problematic_smiles_canonized = []
for smi in set(problematic_smiles):
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        # RDKit could not parse the string; report it and move on.
        print(f"Cannot canonicalize invalid SMILES '{smi}'.")
        continue

    canonical_smi = Chem.MolToSmiles(mol, canonical=True)
    print(f"Canonicalized '{smi}' to '{canonical_smi}'")
    problematic_smiles_canonized.append(canonical_smi)

# Corrected
SMILES that PubChem could not canonicalize, shown as original SMILES to RDKit-canonicalized SMILES:

Canonicalized ```CC(C)CN(CC(=O)N1CCN(CC(OCc2cc3ccccc3cc2)c2c(Cl)cc(Cl)cc2)CC1)CC(C)C``` to ```CC(C)CN(CC(=O)N1CCN(CC(OCc2ccc3ccccc3c2)c2ccc(Cl)cc2Cl)CC1)CC(C)C```

Canonicalized ```CC1CCC2(O1)C(C)(C)CC(O[C@@H]1O[C@H](CO[C@@H]3OC[C@](O)(CO)[C@H]3O)[C@@H](O)[C@H](O)[C@H]1O)CC2(C)O``` to ```CC1CCC2(O1)C(C)(C)CC(O[C@@H]1O[C@H](CO[C@@H]3OC[C@](O)(CO)[C@H]3O)[C@@H](O)[C@H](O)[C@H]1O)CC2(C)O```

Canonicalized ```CCN(CC)CCOC(=O)C1(c2cc(OC)c(OC)cc2)CCCC1``` to ```CCN(CC)CCOC(=O)C1(c2ccc(OC)c(OC)c2)CCCC1```

In [15]:
# Run the RDKit-canonical SMILES back through standardize_smiles, the same
# function that is handed to canonicalize_smiles_in_spectra for the main
# (PubChem) canonicalization.
# NOTE(review): "pubchcem" in the variable name looks like a typo for
# "pubchem"; it is left unchanged here because renaming it would alter code.
problematic_pubchcem_canonical = standardize_smiles(problematic_smiles_canonized)

In [18]:
# PubChem canonized
# problematic_pubchcem_canonical

['CC(C)CN(CC(C)C)CC(=O)N1CCN(CC1)CC(C2=C(C=C(C=C2)Cl)Cl)OCC3=CC4=CC=CC=C4C=C3',
 'CC1CCC2(O1)C(CC(CC2(C)O)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO[C@H]4[C@@H]([C@](CO4)(CO)O)O)O)O)O)(C)C',
 'CCN(CC)CCOC(=O)C1(CCCC1)C2=CC(=C(C=C2)OC)OC']

```CC(C)CN(CC(C)C)CC(=O)N1CCN(CC1)CC(C2=C(C=C(C=C2)Cl)Cl)OCC3=CC4=CC=CC=C4C=C3```

 ```CC1CCC2(O1)C(CC(CC2(C)O)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO[C@H]4[C@@H]([C@](CO4)(CO)O)O)O)O)O)(C)C```
 
 ```CCN(CC)CCOC(=O)C1(CCCC1)C2=CC(=C(C=C2)OC)OC```

In [19]:
# RDKit canonized
# problematic_smiles_canonized

['CC(C)CN(CC(=O)N1CCN(CC(OCc2ccc3ccccc3c2)c2ccc(Cl)cc2Cl)CC1)CC(C)C',
 'CC1CCC2(O1)C(C)(C)CC(O[C@@H]1O[C@H](CO[C@@H]3OC[C@](O)(CO)[C@H]3O)[C@@H](O)[C@H](O)[C@H]1O)CC2(C)O',
 'CCN(CC)CCOC(=O)C1(c2ccc(OC)c(OC)c2)CCCC1']

``` CC(C)CN(CC(=O)N1CCN(CC(OCc2ccc3ccccc3c2)c2ccc(Cl)cc2Cl)CC1)CC(C)C```

 ```CC1CCC2(O1)C(C)(C)CC(O[C@@H]1O[C@H](CO[C@@H]3OC[C@](O)(CO)[C@H]3O)[C@@H](O)[C@H](O)[C@H]1O)CC2(C)O```
 
 ```CCN(CC)CCOC(=O)C1(c2ccc(OC)c(OC)c2)CCCC1```

In [13]:
# Resolve the SMILES that the main canonicalization left unresolved.
# Judging by how the result is used further down (`.get(original_smiles)` and
# `merged_canonical.update(...)`), the helper presumably returns a dict-like
# {original SMILES: canonical SMILES} — TODO confirm against
# massspecgym.tools.data.handle_problematic_smiles.
problematic_mapping_dict = handle_problematic_smiles(problematic_smiles, standardize_smiles)

#### Result:

Canonicalized ```CC(C)CN(CC(=O)N1CCN(CC(OCc2cc3ccccc3cc2)c2c(Cl)cc(Cl)cc2)CC1)CC(C)C``` to ```CC(C)CN(CC(C)C)CC(=O)N1CCN(CC1)CC(C2=C(C=C(C=C2)Cl)Cl)OCC3=CC4=CC=CC=C4C=C3```

Canonicalized ```CC1CCC2(O1)C(C)(C)CC(O[C@@H]1O[C@H](CO[C@@H]3OC[C@](O)(CO)[C@H]3O)[C@@H](O)[C@H](O)[C@H]1O)CC2(C)O``` to ```CC1CCC2(O1)C(CC(CC2(C)O)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO[C@H]4[C@@H]([C@](CO4)(CO)O)O)O)O)O)(C)C```

Canonicalized ```CCN(CC)CCOC(=O)C1(c2cc(OC)c(OC)cc2)CCCC1``` to ```CCN(CC)CCOC(=O)C1(CCCC1)C2=CC(=C(C=C2)OC)OC```


In [14]:
# Work on a copy so that `smiles_canonical_dict` itself is not modified when
# the fixes for the problematic SMILES are merged in.
# NOTE(review): presumably a plain dict, making this a shallow copy — confirm
# the type returned by canonicalize_smiles_in_spectra.
merged_canonical = smiles_canonical_dict.copy()

In [15]:
# Merge in the mapping for the problematic SMILES. Assuming `merged_canonical`
# is a plain dict, entries from `problematic_mapping_dict` win on a key clash.
merged_canonical.update(problematic_mapping_dict)

In [16]:
# Destination for the merged SMILES mapping.
# NOTE: this is the very same file as `save_progress_path`, which is passed to
# canonicalize_smiles_in_spectra as its progress file. Re-enabling the dump
# below (opened with mode 'w') therefore overwrites that file.
output_merged_path = "/Users/macbook/CODE/Majer:MassSpecGym/data/MSn/smiles_canonicalization_all.json"
# with open(output_merged_path, 'w') as outfile:
#     json.dump(merged_canonical, outfile, indent=4)

In [17]:
# Number of entries in the merged mapping (displayed as the cell's output).
len(merged_canonical)

14008

In [18]:
# Second pass over the spectra: any spectrum whose current SMILES is a key of
# `problematic_mapping_dict` gets the mapped (PubChem-canonicalized) SMILES.
# Spectra without a SMILES, or without a non-empty mapping entry, are left
# untouched.
#
# `problematic_smiles` is intentionally NOT reset here. This cell never uses
# it, and clearing it would discard the list built by the first pass, leaving
# the validation/report cells with an empty list if they are re-run afterwards.
for spectrum in all_spectra:
    original_smiles = spectrum.metadata.get('smiles')
    if original_smiles:
        canonical_smiles = problematic_mapping_dict.get(original_smiles)
        if canonical_smiles:
            spectrum.set('smiles', canonical_smiles)
            print(f"Canonicalize SMILES '{original_smiles}' to PubChem-canonicalized SMILES {canonical_smiles}")


In [19]:
# Define the output file name with the current date
current_date = datetime.now().strftime("%Y%m%d")
output_file_name = f"{current_date}_msn_library_pos_all_lib_MSn.mgf"
output_file_path = os.path.join(SAVE_PATH, output_file_name)

# save_as_mgf(all_spectra, output_file_path)

dict_keys(['spectra'])


In [7]:
# Load the saved file to verify the number of spectra
loaded_spectra = list(load_from_mgf(output_file_path))
loaded_count = len(loaded_spectra)

# Compare counts
print(f"Loaded spectra from saved file: {loaded_count}")

if combined_count == loaded_count:
    print("Validation successful: Combined and loaded spectra counts match.")
else:
    print("Validation failed: Counts do not match!")
    print(f"Combined count: {combined_count}, Loaded count: {loaded_count}")

Loaded spectra from saved file: 803405
Validation successful: Combined and loaded spectra counts match.


In [8]:
# Spot check: display the metadata of the first reloaded spectrum.
loaded_spectra[0].metadata_dict()

{'charge': 1,
 'description': 'NIH NPAC ACONN collection of NP',
 'formula': 'C19H22O6',
 'inchi': 'InChI=1S/C19H22O6/c1-22-13-10-15(21)19(18(11-13)25-4)14(20)7-5-12-6-8-16(23-2)17(9-12)24-3/h6,8-11,21H,5,7H2,1-4H3',
 'smiles': 'COc1cc(O)c(C(=O)CCc2cc(OC)c(OC)cc2)c(OC)c1',
 'feature_id': '-1',
 'adduct': '[M+H]+',
 'spectype': 'ALL_MSN_TO_PSEUDO_MS2',
 'fragmentation_method': 'HCD',
 'isolation_window': '1.2000000476840569',
 'acquisition': 'Crude',
 'instrument_type': 'Orbitrap',
 'ims_type': 'none',
 'ion_source': 'ESI',
 'ionmode': 'positive',
 'dataset_id': 'MSVPLACEHOLDERID',
 'usi': 'mzspec:MSVPLACEHOLDERID:20230404_pluskal_nih_01P_A3_id_positive.mzML:-1',
 'scans': '-1',
 'precursor_purity': '1.0',
 'quality_chimeric': 'PASSED',
 'quality_explained_intensity': '0.97712445',
 'quality_explained_signals': '0.8',
 'collision_energy': '60.0',
 'num_peaks': '135',
 'compound_name': 'MEGxp0_001769',
 'parent_mass': '346.141638',
 'inchi_aux': 'MQEOTHTYNCMSAN-UHFFFAOYSA-N',
 'ms_level'