In [1]:
!pip install rdkit-pypi


import itertools
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
from IPython.display import display
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.optimize import minimize
import random
import json
from rdkit.Chem import inchi

Collecting rdkit-pypi
  Using cached rdkit_pypi-2022.9.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


# GNPS-JSON reader

In [2]:
# Load the GNPS library export. The parsed value is iterated further down as a
# sequence of per-compound records (dict-like objects queried with .get()).
file_path = "GNPS-LIBRARY.json"
# JSON text is UTF-8; state the encoding explicitly so the read does not depend on
# the platform's default locale encoding.
with open(file_path, 'r', encoding="utf-8") as f:
    data = json.load(f)

# Results of the parsing loop below, keyed by compound name; each value is a dict
# with a "metadata" dict and a "spectra" DataFrame.
compound_data = {}

def is_valid_smiles(smiles):
    """Return True if ``smiles`` is a string that RDKit can parse into a molecule.

    Missing values, non-string values and the placeholders "", "N/A" and "NA"
    (any letter case, surrounding whitespace ignored) are rejected before the
    RDKit parser is called, so they neither raise nor produce parser error logs.

    Args:
        smiles: Candidate SMILES value taken from a library record (may be None).

    Returns:
        bool: True only when ``Chem.MolFromSmiles`` returns a molecule.
    """
    if not isinstance(smiles, str):  # None, numbers, lists, ... have no .strip()
        return False
    if smiles.strip().upper() in {"", "N/A", "NA"}:
        return False
    # Parse the string exactly as given: callers re-parse the same value later.
    return Chem.MolFromSmiles(smiles) is not None

def inchi_to_smiles(inchi_str):
    """Convert an InChI string to a SMILES string, or return None when impossible.

    Args:
        inchi_str: Candidate InChI value taken from a library record (may be None).

    Returns:
        str | None: SMILES produced by ``Chem.MolToSmiles``; None when the input is
        missing, not a string, a placeholder ("", "N/A", "NA" in any letter case),
        cannot be parsed, or the conversion raises.
    """
    if not isinstance(inchi_str, str):  # None, numbers, ... have no .strip()
        return None
    cleaned = inchi_str.strip()
    if cleaned.upper() in {"", "N/A", "NA"}:
        return None
    try:
        # Surrounding whitespace is removed before parsing.
        mol = inchi.MolFromInchi(cleaned)
        return Chem.MolToSmiles(mol) if mol else None
    except Exception:
        # Best-effort conversion: a failure means "no structure", not a crash.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return None

# Process each compound record: resolve a usable structure, collect metadata,
# parse the peak list and store everything in compound_data.
# NOTE(review): compound_data is keyed by compound name, so a later record with the
# same name replaces an earlier one — confirm this is intended for this library.

# Record fields copied verbatim into each compound's metadata (None when absent).
METADATA_KEYS = (
    "spectrum_id", "source_file", "task", "scan", "ms_level",
    "library_membership", "Precursor_MZ", "ExactMass", "Charge",
    "Compound_Source", "Instrument", "Ion_Source", "Ion_Mode",
)

# Log lines for skipped records; written in one go after the loop instead of
# reopening the log file once per skipped record.
skipped_messages = []

for compound in data:
    compound_name = compound.get("Compound_Name", "Unknown")
    smiles = compound.get("Smiles") or compound.get("SMILES")
    inchi_str = compound.get("INCHI")

    # If SMILES is invalid, try generating it from InChI
    if not is_valid_smiles(smiles):
        smiles = inchi_to_smiles(inchi_str)

    if not is_valid_smiles(smiles):  # If still invalid, skip
        skipped_messages.append(f"Skipping {compound_name}: Missing or invalid SMILES/InChI\n")
        continue

    # Extract metadata
    metadata = {key: compound.get(key) for key in METADATA_KEYS}

    # The validated SMILES is always stored, even if the InChI conversion below
    # fails, so every stored record has a "smiles" entry.
    metadata.update({"smiles": smiles, "inchi": None, "inchikey": None})

    # Convert SMILES to InChI/InChIKey
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        try:
            generated_inchi = inchi.MolToInchi(mol)
            generated_inchikey = inchi.MolToInchiKey(mol)
        except Exception:
            # Leave both identifiers as None when either conversion fails.
            pass
        else:
            metadata["inchi"] = generated_inchi
            metadata["inchikey"] = generated_inchikey

    # Parse peaks JSON; anything that is not a parseable string (including the
    # "n/a" placeholder) yields an empty spectrum with the expected columns.
    peaks_json = compound.get("peaks_json")
    spectra_df = pd.DataFrame(columns=["m/z", "intensity"])
    if isinstance(peaks_json, str) and peaks_json.strip().lower() != "n/a":
        try:
            spectra_df = pd.DataFrame(json.loads(peaks_json), columns=["m/z", "intensity"])
        except Exception:
            # Malformed peak list: keep the empty spectrum created above.
            pass

    # Store the compound data
    compound_data[compound_name] = {"metadata": metadata, "spectra": spectra_df}

# Append the skip messages to the log (the file is only touched when something was skipped).
if skipped_messages:
    with open("missing_smiles.log", "a", encoding="utf-8") as log_file:
        log_file.writelines(skipped_messages)
















































































[02:02:03] ERROR: 





































[02:02:03] Explicit valence for atom # 49 Na, 2, is greater than permitted













































[02:02:04] SMILES Parse Error: syntax error while parsing: InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)
[02:02:04] SMILES Parse Error: Failed parsing SMILES ' InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)' for input: ' InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)'
[02:02:04] ERROR: 



































[02:02:06] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
[02:02:06] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16






























[02:02:07] SMILES Parse Error: unclosed ring for input: 'OC1=CC(C(OC)=O)=C(OC2=CC(C)=CC(O)=C2C(O)=O)C(OC)=C2'
[02:02:07] SMILES Parse Error: unclosed ring for input: 'O=C1C2=C(C=C(C)C=C2O)OC3=CC(O)=CC(C(OC)=O)=C32'
[02:02:07] SMILES Parse Error: unclosed ring for input: 'O=C([C@H](CC)C)O[C@H]1CCC=C2C1[C@@H](CC[C@@H](O)C[C@@H](O)CC(OC)=O)[C@@H](C)C=C3'
[02:02:07] SMILES Parse Error: unclosed ring for input: 'O=C(N[C@@H](CCCCCC(CC)=O)C(N[C@@H](CC1=CN(OC)C2=C1C=CC=C2)C3=O)=O)[C@@H]4N(C([C@H]([C@H](CC)C)N3)=O)CCCC5'

[02:02:07] SMILES Parse Error: unclosed ring for input: 'O=C(N(C(C=CC=C1)=C1C(N(C)[C@@]2([H])CC3=CC=CC=C3)=O)C2=N4)C5=C4C=CC=C6'




[02:02:07] SMILES Parse Error: unclosed ring for input: 'OC1=CC=C(CC(C(NC(C(CC)C)C(OC(C(CCCCCCCCCC)C)CC(NC(C(NC(C(NC(C(NC2CCC(N)=O)=O)C)=O)C)=O)C(O)C)=O)=O)=O)NC2=O)C=C2'




















[02:02:10] ERROR: 

[02:02:10] ERROR: 

[02:02:10] ERROR: 

[02:02:10] ERROR: 

[02:02:10] ERROR: 

















[02:02:11] Explicit valence for atom # 22 O, 3, is greater than permitted




[02:02:11] Explicit valence for atom # 31 O, 3, is greater than permitted

[02:02:11] Explicit valence for atom # 6 O, 3, is greater than permitted
[02:02:11] Explicit valence for atom # 2 O, 3, is greater than permitted



[02:02:11] Explicit valence for atom # 4 O, 3, is greater than permitted
[02:02:11] Explicit valence for atom # 7 O, 3, is greater than permitted

[02:02:11] Explicit valence for atom # 35 O, 3, is greater than permitted

[02:02:11] Explicit valence for atom # 35 O, 3, is greater than permitted


[02:02:11] Explicit valence for atom # 35 O, 3, is greater than permitted
[02:02:11] Explicit valence for atom # 35 O, 3, is greater than permitted










[02:02:11] ERROR: 




































































































































































































































































[02:02:13] SMILES Parse Error: syntax error while parsing: NA
[02:02:13] SMILES Parse Error: Failed parsing SMILES 'NA' for input: 'NA'
[02:02:13] ERROR: 









































































[02:02:13] ERROR: 

























[02:02:14] SMILES Parse Error: syntax error while parsing: (CC(=O)O3)O)C
[02:02:14] SMILES Parse Error: Failed parsing SMILES '(CC(=O)O3)O)C' for input: '(CC(=O)O3)O)C'


































































[02:02:14] Explicit valence for atom # 17 N, 4, is greater than permitted
[02:02:14] Explicit valence for atom # 19 N, 4, is greater than permitted



[02:02:14] SMILES Parse Error: syntax error while parsing: CC1(C)[C@H](OC(C)=O)[C@H](O)C[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;
[02:02:14] SMILES Parse Error: Failed parsing SMILES 'CC1(C)[C@H](OC(C)=O)[C@H](O)C[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;' for input: 'CC1(C)[C@H](OC(C)=O)[C@H](O)C[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;'






















































































































































































































































































[02:02:19] SMILES Parse Error: syntax error while parsing: CC1(C)[C@H](O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(C=O)C(C=O)=C4O3&gt;&gt;
[02:02:19] SMILES Parse Error: Failed parsing SMILES 'CC1(C)[C@H](O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(C=O)C(C=O)=C4O3&gt;&gt;' for input: 'CC1(C)[C@H](O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(C=O)C(C=O)=C4O3&gt;&gt;'
[02:02:19] SMILES Parse Error: syntax error while parsing: CC1(C)[C@H](OC(C)=O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;
[02:02:19] SMILES Parse Error: Failed parsing SMILES 'CC1(C)[C@H](OC(C)=O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;' for input: 'CC1(C)[C@H](OC(C)=O)CC[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;'
[02:02:19] SMILES Parse Error: syntax error while parsing: CC1(C)[C@H](OC(C)=O)[C@H](O)C[C@]([C@@]1([H])CC[C@H]2C)(C)[C@]32CC4=C(O)C=C(CO)C(C=O)=C4O3&gt;&gt;
[02:02:1








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































[02:02:28] SMILES Parse Error: extra open parentheses for input: 'COC(=O)C1=CCCC2C3(C)CC(OC(=O)C3CC(OC3OC(CO)C(O)C(O)C3O'
[02:02:28] ERROR: 










[02:02:28] Explicit valence for atom # 19 N, 4, is greater than permitted





[02:02:32] SMILES Parse Error: syntax error while parsing: 1S/C17H22O2/c1-3-5-6-7-8-9-10-14-17(19)15-12-11-13-16(18)4-2/h3-4,10,14,16-19H,1-2,5-9H2
[02:02:32] SMILES Parse Error: Failed parsing SMILES '1S/C17H22O2/c1-3-5-6-7-8-9-10-14-17(19)15-12-11-13-16(18)4-2/h3-4,10,14,16-19H,1-2,5-9H2' for input: '1S/C17H22O2/c1-3-5-6-7-8-9-10-14-17(19)15-12-11-13-16(18)4-2/h3-4,10,14,16-19H,1-2,5-9H2'
[02:02:32] ERROR: 




[02:02:40] ERROR: 












[02:02:40] SMILES Parse Error: syntax error while parsing: -O=C1O[C@@H](C2=CC=C(C(OC)=C2)O)[C@@H](C(O)=O)/C1=C\C3=CC=C(C(OC)=C3)O
[02:02:40] SMILES Parse Error: Failed parsing SMILES '-O=C1O[C@@H](C2=CC=C(C(OC)=C2)O)[C@@H](C(O)=O)/C1=C\C3=CC=C(C(OC)=C3)O' for input: '-O=C1O














































[02:02:41] SMILES Parse Error: syntax error while parsing: O=C1C=2C=CC=CC2OC3=C(O)C(OC)=C(OC)C=C13;
[02:02:41] SMILES Parse Error: Failed parsing SMILES 'O=C1C=2C=CC=CC2OC3=C(O)C(OC)=C(OC)C=C13;' for input: 'O=C1C=2C=CC=CC2OC3=C(O)C(OC)=C(OC)C=C13;'













[02:02:41] Explicit valence for atom # 38 O, 3, is greater than permitted





[02:02:41] SMILES Parse Error: syntax error while parsing: ;O=C1C=2C(O)=CC(O)=CC2OC=3C(O)=CC=C(C13)CC=C(C)C
[02:02:41] SMILES Parse Error: Failed parsing SMILES ';O=C1C=2C(O)=CC(O)=CC2OC=3C(O)=CC=C(C13)CC=C(C)C' for input: ';O=C1C=2C(O)=CC(O)=CC2OC=3C(O)=CC=C(C13)CC=C(C)C'































































































































































[02:02:42] SMILES Parse Error: syntax error while parsing: p(MeOx6)H+
[02:02:42] SMILES Parse Error: Failed parsing SMILES 'p(MeOx6)H+' for input: 'p(MeOx6)H+'




























































































































































































































































































[02:02:43] Explicit valence for atom # 10 Na, 2, is greater than permitted











[02:02:43] Explicit valence for atom # 10 Na, 2, is greater than permitted









[02:02:43] Explicit valence for atom # 9 Na, 2, is greater than permitted











[02:02:43] Explicit valence for atom # 50 Na, 2, is greater than permitted


[02:02:43] Explicit valence for atom # 29 Na, 2, is greater than permitted



















[02:02:43] Explicit valence for atom # 35 Na, 2, is greater than permitted






























[02:02:43] Explicit valence for atom # 53 Na, 2, is greater than permitted






















































































































































[02:02:44] SMILES Parse Error: extra open parentheses for input: '[H][C@]1([C@H](CC2=CNC3=CC=CC=C32)'
[02:02:44] SMILES Parse Error: extra open parentheses for input: '[H][C@]1([C@H](CC2=CNC3=CC=CC=C32)'
[02:02:44] SMILES Parse Error: extra open parentheses for input: '[H][C@]1([C@H](CC2=CNC3=CC=CC=C32)N'
[02:02:44] SMILES Parse Error: syntax error while parsing: CC([C@H]1C)=C[C@@](/C=C/C[C@H](C)/C=
[02:02:44] SMILES Parse Error: Failed parsing SMILES 'CC([C@H]1C)=C[C@@](/C=C/C[C@H](C)/C=' for input: 'CC([C@H]1C)=C[C@@](/C=C/C[C@H](C)/C='






[02:02:45] SMILES Parse Error: syntax error while parsing: lydicamycin
[02:02:45] SMILES Parse Error: Failed parsing SMILES 'lydicamycin' for input: 'lydicamycin'
[02:02:45] ERROR: 

[02:02:45] SMILES Parse Error: syntax error while parsing: lydicamycin
[02:02:45] SMILES Parse Error: Failed parsing SMILES 'lydicamycin' for input: 'lydicamycin'
[02:02:45] ERROR: 

































































[02:02:46] ERROR: 












































































In [4]:
# # Normalize intensity values
# if not spectra_df.empty:
#     max_intensity = spectra_df["intensity"].max()
#     spectra_df["normalized_intensity"] = (spectra_df["intensity"] / max_intensity) * 100

# # Print metadata
# print("Compound Name:", first_compound_name)
# print("Metadata:", first_compound["metadata"])
# print("Stored spectra:", first_compound["spectra"], "\n")

# fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# # --- Plot Molecular Structure ---
# if smiles:
#     mol = Chem.MolFromSmiles(smiles)
#     img = Draw.MolToImage(mol, size=(300, 300))
#     axes[0].imshow(img)
#     axes[0].axis("off")
#     axes[0].set_title("Molecular Structure")

# # --- Plot Normalized Mass Spectrum ---
# if not spectra_df.empty:
#     axes[1].bar(spectra_df["m/z"], spectra_df["normalized_intensity"], width=1.0, color="black")
#     axes[1].set_xlabel("m/z")
#     axes[1].set_ylabel("Relative Intensity (%)")
#     axes[1].set_title(f"Normalized Mass Spectrum of {first_compound_name}")
# else:
#     axes[1].text(0.5, 0.5, "No spectral data available", ha="center", va="center", fontsize=12)
#     axes[1].axis("off")

# plt.tight_layout()
# plt.show()


In [None]:
# Simulate fragmentation of the first stored compound: ionise it, then repeatedly
# apply the reaction templates below to every new fragment, recording one peak per
# distinct fragment mass and one edge per parent -> fragment transition.
molecule_smiles = next(iter(compound_data.values()))["metadata"]["smiles"]
molecule = Chem.MolFromSmiles(molecule_smiles)
initial_mass = CalcExactMolWt(molecule)

# Two masses closer than this (in Da) are treated as the same peak.
PEAK_MERGE_TOLERANCE = 0.5

fragmentations = {
    # CMF Reactions
    ## Positive ion mode
    'simple_inductive_cleavage': '[O+,N+,S+:2]-[C:1]>>[C+:1].[O,N,S:2]',  # Simple inductive cleavage with charge migration
    'inductive_cleavage_heteroatom': '[O,N,S:1]-[C:2]-[*+:3]>>[O+,N+,S+:1]=[C+:2].[*:3]',  # Heteroatom-assisted cleavage
    'displacement_reaction_positive_a': '[O,N,S:1]-[C:2]-[C:3]-[*+:4]>>[C:3]1-[C:2]-[O+,N+,S+:1]-1.[*:4]',  # Displacement reaction in positive ion mode
    'displacement_reaction_positive_b': '[O,N,S,C:1]=[C:2]-[C:3]-[*+:4]>>[O+,N+,S+,C+:1]-[C:2]=[C:3].[*:4]',  # Displacement reaction in positive ion mode
    'beta_hydrogen_removal_positive': '[O,N,S:1]-[C:2]-[C;H:3]-[C:4]-[*+:5]>>[O+,N+,S+;H1:1]-[C:2]-[C:3]=[C:4].[*:5]',  # β-hydrogen removal with charge migration
    'grob_wharton_fragmentation': '[O,N,S:1]-[C:2]-[C:3]-[C:4]-[*+:5]>>[O+,N+,S+:1]=[C:2].[C:3]=[C:4].[*:5]',  # Grob-Wharton fragmentation

    ## Negative ion mode
    'alpha_elimination': '[C:1]-[C:2](=[O,S,N:3])-[O-,N-,S-:4]>>[C-:1].[C:2](=[O,S,N:3])-[O,N,S:4]',  # α-Elimination
    'gamma_elimination': '[O-,N-,S-:1]-[C:2]=[C:3]-[C:4]-[*:5]>>[O,N,S:1]=[C:2]-[C:3]=[C:4].[*-:5]',  # γ-Elimination
    'epsilon_elimination': '[O-,N-,S-:1]-[C:2]=[C:3]-[C:4]=[C:5]-[C:6][*:7]>>[O,N,S:1]=[C:2]-[C:3]=[C:4]-[C:5]=[C:6].[*-:7]',  # ε-Elimination
    'displacement_reaction_negative': '[O-,N-,S-:1]-[C:2]-[C:3]-[*:4]>>[O,N,S:1]1-[C:2]-[C:3]-1.[*-:4]',  # Displacement reaction (negative mode)
    'beta_hydrogen_removal_negative': '[O-,N-,S-:1]-[C:2]-[C;H:3]-[C:4]-[*:5]>>[O,N,S;H1:1]-[C:2]-[C:3]=[C:4].[*-:5]',  # β-Hydrogen removal (negative mode)

    # CRF Reactions
    'remote_hydrogen_rearrangement_a': '[O,N,S:1]-[C:2]-[C:3]-[H:4]>>[C:2]=[C:3].[H:4]-[O,N,S:1]',  # Remote H rearrangement
    'remote_hydrogen_rearrangement_b': '[C:1]-[C:2]-[O:3]-[H:4]>>[C:1]-[H:4].[C:2]=[O:3]',  # Alternative remote H rearrangement
    'retro_diels_alder': '[C:1]1=[C:2]-[C:3]-[C:4]-[C:5]-[C:6]-1>>[C:6]=[C:1]-[C:2]=[C:3].[C:4]=[C:5]',  # Retro-Diels-Alder (RDA) reaction
    'retro_ene': '[C:2]=[C:1]-[C:3]-[C:4]-[C:5]-[H:6]>>[H:6]-[C:2]-[C:3]=[C:1].[C:4]=[C:5]',  # Retro-ene reaction
    'retro_heteroene': '[O,N,S:1]=[C:2]-[C:3]-[C:4]-[C:5]-[H:6]>>[H:6]-[O,N,S:1]-[C:2]=[C:3].[C:4]=[C:5]',  # Retro-heteroene reaction
    'charge_remote_fragmentation': '[H:1]-[C:2]-[C:3]-[C:4]-[C:5]-[H:6]>>[C:2]=[C:3].[C:4]=[C:5].[H:1]-[H:6]',  # Charge remote fragmentation
    'aromatic_elimination': '[C:1]-[C:2]-[C:3]-[C:4]-[C:5]-[C:6]-[C:7]-[C:8]-[O,N,S:9]>>[C:1]=[C:8]-[O,N,S:9].[c:2]1[c:3][c:4][c:5][c:6][c:7]1',
    'pericyclic_shift': '[C:1]-[C:2]-[C:3]-[C:4]-[C:5]-[C:6]>>[C:1]=[C:2].[C:5]=[C:6].[C:3]=[C:4]',
    'pericyclic_1_3_shift': '[H:1]-[C:2]-[C:3]=[C:4]-[C:5]=[C:6]>>[C:2]=[C:3]-[C:4](-[H:1])=[C:5]-[C:6]',
    'pericyclic_1_5_shift': '[H:1]-[C:2]-[C:3]=[C:4]-[C:5]=[C:6]>>[C:2]=[C:3]-[C:4]=[C:5]-[C:6](-[H:1])',
    'carbon_monoxide_elimination_a': '[C:1]1-[C:2]-[C:3]-[C:4]-[C:5]-[C:6](=O)-1>>[C:1]1-[C:2]-[C:3]-[C:4]-[C:5]-1.[C:6](#O)',  # CO elimination from cyclic carbonyls
    'carbon_monoxide_elimination_b': '[C:6](#[O+1])-[C:1]-[C:2]-[C:3]-[C:4]-[C-:5]>>[C:1]1-[C:2]-[C:3]-[C:4]-[C:5]-1.[C:6](#O)',  # Alternative CO elimination
#     'radical_fragmentation': '[C:1]-[O,N,S:2]>>[C^1:1].[O^1,N^1,S^1:2]',  # Radical fragmentation
}

# Compile every template once; the ionisation template adds [H] to an O/N/S atom
# bonded to carbon and is applied to the intact molecule only.
reactions = {name: AllChem.ReactionFromSmarts(smarts) for name, smarts in fragmentations.items()}
ionisation_rxn = AllChem.ReactionFromSmarts('[O,N,S:2][C:1]>>[H][O+,N+,S+:2][C:1]')
product_sets = ionisation_rxn.RunReactants((molecule,))

if not product_sets:
    raise ValueError("Ionisation failed, no products generated.")
ionised_product = product_sets[0][0]
Chem.SanitizeMol(ionised_product)
ionised_mass = CalcExactMolWt(ionised_product)

# Peak masses are collected in a plain list and turned into peaks_df once the
# search is finished (growing a DataFrame with pd.concat inside the loops is
# quadratic). peaks_df is also created here so it exists from this point on.
peak_masses = [ionised_mass]
peaks_df = pd.DataFrame({'mz': [ionised_mass], 'intensity': ["1"]})
print(f"length of products_sets (ionsied products formed) : ", len(product_sets))


def is_known_peak(mass):
    """Return True if a recorded peak lies within PEAK_MERGE_TOLERANCE of ``mass``."""
    return any(abs(mass - known) < PEAK_MERGE_TOLERANCE for known in peak_masses)


edges = []                    # (parent_mass, fragment_mass)
fragmentation_tree = []       # (parent_mass, fragment_mass, reaction name)
processed_fragments = set()   # SMILES of molecules that were already expanded
fragment_images = []          # (parent image, fragment image, parent mass, fragment mass, reaction name)

for ionised_product_tuple in product_sets:
    for ionised_product in ionised_product_tuple:
        Chem.SanitizeMol(ionised_product)
        ionised_mass = CalcExactMolWt(ionised_product)

        # Add ionized product mass to the peak list if not already present
        if not is_known_peak(ionised_mass):
            peak_masses.append(ionised_mass)

        # Breadth-first expansion: each pass expands the fragments found in the previous one.
        new_fragments = [(ionised_product, ionised_mass)]

        while new_fragments:
            current_fragments = new_fragments
            new_fragments = []

            for parent, parent_mass in current_fragments:
                parent_key = Chem.MolToSmiles(parent)  # Use SMILES instead of InChIKey to allow more variations
                if parent_key in processed_fragments:
                    continue
                processed_fragments.add(parent_key)

                for loss_name, reaction in reactions.items():
                    try:
                        # A separate name is used here so the ionisation results in
                        # product_sets are not rebound while the outer loop runs over them.
                        fragment_sets = reaction.RunReactants((parent,))

                        for product_tuple in fragment_sets:  # Process ALL products, not just the first
                            for fragment in product_tuple:
                                try:
                                    Chem.SanitizeMol(fragment)
                                except Exception:
                                    # The template produced a chemically invalid product: ignore it.
                                    continue

                                fragment_mass = CalcExactMolWt(fragment)

                                # Only fragments that introduce a new peak are recorded and expanded further.
                                if is_known_peak(fragment_mass):
                                    continue

                                peak_masses.append(fragment_mass)
                                new_fragments.append((fragment, fragment_mass))
                                edges.append((parent_mass, fragment_mass))
                                fragmentation_tree.append((parent_mass, fragment_mass, loss_name))

                                # Generate and store images for visualization
                                parent_img = Draw.MolToImage(parent)
                                fragment_img = Draw.MolToImage(fragment)
                                fragment_images.append((parent_img, fragment_img, parent_mass, fragment_mass, loss_name))
                    except Exception as e:
                        print(f"Reaction {loss_name} failed on {Chem.MolToSmiles(parent)}: {e}")
                        continue

# One row per recorded peak; the intensity is a placeholder ("1") at this stage.
peaks_df = pd.DataFrame({'mz': peak_masses, 'intensity': ["1"] * len(peak_masses)})

print(edges)
print(peaks_df)

# Display the parent and corresponding fragment pairs side by side.
# NOTE(review): the figure height grows with the number of pairs; a very large
# number of pairs may be impractical to render — confirm for large molecules.
if fragment_images:
    # squeeze=False keeps `axes` two-dimensional even when there is a single row,
    # so axes[i, 0] / axes[i, 1] are always valid.
    fig, axes = plt.subplots(
        len(fragment_images), 2, figsize=(6, 3 * len(fragment_images)), squeeze=False
    )

    for i, (parent_img, fragment_img, parent_mass, fragment_mass, loss_name) in enumerate(fragment_images):
        # Parent molecule
        axes[i, 0].imshow(parent_img)
        axes[i, 0].set_title(f"Parent (m/z={parent_mass:.2f})")
        axes[i, 0].axis("off")

        # Fragment molecule
        axes[i, 1].imshow(fragment_img)
        axes[i, 1].set_title(f"Fragment (m/z={fragment_mass:.2f})\n{loss_name}")
        axes[i, 1].axis("off")

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots() cannot create a grid with zero rows.
    print("No fragments were generated; nothing to display.")

length of products_sets (ionsied products formed) :  21


[02:03:03] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:04] Explicit valence for atom # 1 C greater than permitted
[02:03:05]

[(981.54039419609, 966.5294951640901), (981.54039419609, 913.47779394009), (981.54039419609, 955.5611296400901), (981.54039419609, 912.5189304800901), (981.54039419609, 870.47198028809), (981.54039419609, 968.5451452280902), (981.54039419609, 885.48287932009), (966.5294951640901, 71.08552677208999), (913.47779394009, 896.4512448440901), (913.47779394009, 18.03382554809), (955.5611296400901, 938.5345805440901), (955.5611296400901, 940.5502306080901), (912.5189304800901, 895.4923813840901), (912.5189304800901, 897.5080314480902), (870.47198028809, 853.44543119209), (968.5451452280902, 950.5345805440902), (968.5451452280902, 19.01784113609), (885.48287932009, 868.45633022409), (896.4512448440901, 867.4479566121801), (896.4512448440901, 29.00219107209), (940.5502306080901, 922.5396659240902), (897.5080314480902, 879.4974667640902), (853.44543119209, 824.44214296018), (868.45633022409, 839.45304199218), (867.4479566121801, 825.41413106409), (867.4479566121801, 43.041102000180004), (824.4421

In [None]:
import matplotlib.pyplot as plt

# Make the placeholder intensities numeric and express each one as a percentage
# of the tallest peak.
peaks_df["intensity"] = pd.to_numeric(peaks_df["intensity"])
max_intensity = peaks_df["intensity"].max()
peaks_df["normalized_intensity"] = peaks_df["intensity"].div(max_intensity).mul(100)

# Draw the simulated spectrum as a bar chart (one bar per peak).
spectrum_fig, spectrum_ax = plt.subplots(figsize=(10, 5))
spectrum_ax.bar(peaks_df["mz"], peaks_df["normalized_intensity"], width=1.0, color="black")

# Axis labels and title
spectrum_ax.set_xlabel("m/z")
spectrum_ax.set_ylabel("Relative Intensity (%)")
spectrum_ax.set_title("Simulated Mass Spectrum (NO INTENSITY)")

# Tilt the x tick labels and add horizontal guide lines for readability
spectrum_ax.tick_params(axis="x", labelrotation=45)
spectrum_ax.grid(axis="y", linestyle="--", alpha=0.7)

plt.show()

# Add experimental data below

In [None]:
# Example experimental intensities (Needs to be provided from real data)
experimental_intensities = {180.0655: 18.39, 162.0549: 1.26, 135.0684: 6.20, 133.0516: 2.55, 105.0334: 100}

# Largest m/z difference (in Da) at which a simulated peak and an experimental peak
# are treated as the same peak.
# NOTE(review): 0.01 Da is an assumed default — tune it to the data's mass accuracy.
MZ_MATCH_TOLERANCE = 0.01


def match_observed_intensity(mz, experimental, tolerance):
    """Return the intensity of the experimental peak closest to ``mz``.

    Args:
        mz: Simulated peak position.
        experimental: Mapping of experimental m/z -> intensity.
        tolerance: Largest accepted |experimental m/z - mz|.

    Returns:
        The matched intensity, or NaN when ``experimental`` is empty or its
        closest peak is further away than ``tolerance``.
    """
    nearest_mz = min(experimental, key=lambda exp_mz: abs(exp_mz - mz), default=None)
    if nearest_mz is None or abs(nearest_mz - mz) > tolerance:
        return float("nan")
    return experimental[nearest_mz]


# Add observed experimental intensities to peaks_df.
# Peaks are matched within a tolerance: a plain dictionary lookup needs the
# simulated float to equal the experimental key exactly, which computed exact
# masses essentially never do.
peaks_df["observed_intensity"] = [
    match_observed_intensity(mz, experimental_intensities, MZ_MATCH_TOLERANCE)
    for mz in peaks_df["mz"]
]

# Normalize observed intensities so the largest matched peak is 1.
# The comparison is False when nothing matched (max is NaN) or the maximum is 0,
# in which case the column is left as it is instead of dividing by NaN/0.
max_observed = peaks_df["observed_intensity"].max()
if max_observed > 0:
    peaks_df["observed_intensity"] = peaks_df["observed_intensity"] / max_observed


# Genetic algorithm

In [None]:
population_size = 50  # Number of individuals in each generation
generations = 1000  # Maximum number of generations to evolve
mutation_rate = 0.1  # Probability of mutating each transition probability
tolerance = 0.05  # Error tolerance for intensity difference

# Seed both random number generators used by the genetic algorithm so that a
# fresh run of this cell reproduces the same optimisation.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)     # random.uniform / random.random (population, crossover, mutation)
np.random.seed(RANDOM_SEED)  # np.random.choice (parent selection)

def fitness_function(transition_probs, edges, peaks_df, tolerance=0.05):
    """
    Score how well a set of transition probabilities reproduces the observed intensities.

    The precursor (first node in topological order of the fragmentation graph)
    starts with intensity 1. Walking the graph in topological order, every node
    passes ``intensity * probability`` to each child and keeps the remainder. The
    resulting intensities are compared peak by peak with
    ``peaks_df['observed_intensity']``.

    Args:
        transition_probs: dict mapping an edge ``(parent_mz, child_mz)`` to a
            probability. Keys may be the edges exactly as given in ``edges`` or
            the same edges rounded to 6 decimals. Edges without an entry count
            as probability 0.
        edges: iterable of ``(parent_mz, child_mz)`` pairs; must form an acyclic graph.
        peaks_df: DataFrame with the columns ``'mz'`` and ``'observed_intensity'``.
            It is not modified.
        tolerance: relative error up to which a peak is rewarded instead of penalised.

    Returns:
        float: the score (higher is better); ``-inf`` when there are no edges or
        the precursor node is not among the peaks.

    Side effects:
        Whenever the outgoing probabilities of a node sum to more than 1 they are
        rescaled in place inside ``transition_probs`` so that they sum to 1.
    """
    precision = 6  # decimals used to compare m/z values from edges and from peaks_df

    # Rounded local copies: the same rounding function is used for peaks and edges
    # so their values compare equal, and the caller's DataFrame stays untouched.
    rounded_mz = [round(float(mz), precision) for mz in peaks_df['mz']]
    observed_intensities = list(peaks_df['observed_intensity'])

    # Map every rounded edge to the key under which its probability is stored.
    # Without this, rounded edges would be looked up in a dict keyed by unrounded
    # edges and every probability would silently read as 0.
    prob_key = {}
    for parent, child in edges:
        rounded_edge = (round(parent, precision), round(child, precision))
        if rounded_edge in prob_key:
            continue
        if (parent, child) in transition_probs:
            prob_key[rounded_edge] = (parent, child)
        elif rounded_edge in transition_probs:
            prob_key[rounded_edge] = rounded_edge
        else:
            prob_key[rounded_edge] = None  # no probability supplied for this edge

    # Create a directed graph of the fragmentation tree
    G = nx.DiGraph()
    G.add_edges_from(prob_key.keys())

    # Perform topological sorting (process precursors before fragments)
    topo_order = list(nx.topological_sort(G))
    if not topo_order:
        return -float('inf')  # no edges: nothing to evaluate

    # Initialize calculated intensities using the rounded mz values as keys
    calculated_intensities = {mz: 0 for mz in rounded_mz}

    # Ensure precursor node is in `calculated_intensities`
    precursor_node = topo_order[0]
    if precursor_node not in calculated_intensities:
        print(f"Error: Precursor node {precursor_node} not in peaks_df['mz']")
        return -float('inf')

    calculated_intensities[precursor_node] = 1  # Set precursor intensity to 1

    # Normalize transition probabilities: outgoing probabilities of one node may
    # not sum to more than 1 (in-place repair of the individual).
    for node in topo_order:
        outgoing_keys = [
            prob_key[(node, child)]
            for child in G.successors(node)
            if prob_key[(node, child)] is not None
        ]
        total_prob = sum(transition_probs[key] for key in outgoing_keys)
        if total_prob > 1:
            for key in outgoing_keys:
                transition_probs[key] /= total_prob

    # Propagate intensities through the fragmentation tree. Parents are always
    # processed before their children, so a node's intensity is final when read.
    for node in topo_order:
        node_intensity = calculated_intensities.get(node, 0)
        outgoing_total = 0
        for child in G.successors(node):
            key = prob_key[(node, child)]
            probability = transition_probs[key] if key is not None else 0
            calculated_intensities[child] = calculated_intensities.get(child, 0) + node_intensity * probability
            outgoing_total += probability
        # The node keeps whatever it did not pass on to its children.
        calculated_intensities[node] = node_intensity - node_intensity * outgoing_total

    # Compare calculated intensities with observed experimental intensities
    score = 0
    for mz, observed_intensity in zip(rounded_mz, observed_intensities):
        calculated_intensity = calculated_intensities.get(mz, 0)

        if observed_intensity > 0:
            error = abs(calculated_intensity - observed_intensity) / observed_intensity
        else:
            # Reached for an observed value of 0 and for NaN (NaN > 0 is False):
            # penalize non-zero calculated values when nothing was observed.
            error = 0 if calculated_intensity == 0 else 1

        if error <= tolerance:
            score += 1 - error  # Reward small errors
        else:
            score -= error  # Penalize larger errors

    return score

def select_parents(population, fitness_scores):
    """
    Select two distinct parents by roulette-wheel selection on the fitness scores.

    Args:
        population: sequence of individuals.
        fitness_scores: one score per individual (higher is better). Scores may be
            negative or non-finite (e.g. ``-inf``).

    Returns:
        The result of ``np.random.choice(population, size=2, replace=False)``:
        two different individuals, drawn with probability proportional to their
        (shifted) score.

    Notes:
        * Non-finite scores are replaced by the worst finite score (or by equal
          weights when no score is finite), so they cannot turn the
          probabilities into NaN.
        * If any score is negative, all scores are shifted so the lowest becomes 1.
        * When the weights sum to 0, or fewer than two of them are non-zero (two
          distinct parents could not be drawn), selection falls back to uniform.
    """
    weights = np.asarray(fitness_scores, dtype=float)

    finite = np.isfinite(weights)
    if finite.any():
        weights = np.where(finite, weights, weights[finite].min())
    else:
        weights = np.ones(len(weights))

    lowest = weights.min()
    if lowest < 0:
        weights = weights - lowest + 1

    total = weights.sum()
    if total > 0 and np.count_nonzero(weights) >= 2:
        selection_probs = weights / total
    else:
        selection_probs = np.full(len(weights), 1 / len(weights))

    return np.random.choice(population, size=2, p=selection_probs, replace=False)

def initialize_population(edges):
    """
    Build the starting population for the genetic algorithm.

    Returns a list of ``population_size`` individuals. Each individual is a dict
    that gives every edge its own random transition probability, drawn uniformly
    from [0, 1].
    """
    return [
        {edge: random.uniform(0, 1) for edge in edges}
        for _ in range(population_size)
    ]

def crossover(parent1, parent2):
    """
    Create one offspring by uniform crossover.

    For every key of ``parent1`` the value is copied from ``parent1`` or from
    ``parent2`` with equal probability. Neither parent is modified.
    """
    offspring = {}
    for gene in parent1:
        donor = parent1 if random.random() < 0.5 else parent2
        offspring[gene] = donor[gene]
    return offspring

def mutate(individual):
    """
    Randomly perturb the transition probabilities of an individual.

    Each value is changed with probability ``mutation_rate`` by adding a uniform
    offset from [-0.1, 0.1]; the result is clamped to the range 0..1. The dict is
    modified in place and also returned.
    """
    for gene in individual:
        if random.random() >= mutation_rate:
            continue  # this gene is left unchanged
        shifted = individual[gene] + random.uniform(-0.1, 0.1)
        individual[gene] = max(0, min(1, shifted))  # keep the probability within 0..1
    return individual

def genetic_algorithm(edges, peaks_df, patience=10, epsilon=1e-6):
    """
    Run the Genetic Algorithm to optimize transition probabilities for the fragmentation tree.

    Args:
        edges: ``(parent_mz, child_mz)`` pairs of the fragmentation tree.
        peaks_df: peak table handed to ``fitness_function``.
        patience: number of consecutive generations without improvement after
            which the search stops early.
        epsilon: minimum increase of the best fitness that counts as an improvement.

    Returns:
        dict: a copy of the best-scoring individual seen in any evaluated
        generation (edge -> transition probability). Earlier generations are
        included because the population is fully replaced each generation, so a
        good individual can otherwise be lost before the search ends.
    """
    # Initialize population
    population = initialize_population(edges)

    best_fitness = float('-inf')  # reference value for the early-stopping test
    no_improvement_count = 0

    best_individual = None  # best individual found so far, across all generations
    best_individual_fitness = float('-inf')

    def remember_best(candidates, scores):
        """Keep a copy of the top candidate if it beats the best one seen so far."""
        nonlocal best_individual, best_individual_fitness
        top = max(range(len(candidates)), key=scores.__getitem__)
        if best_individual is None or scores[top] > best_individual_fitness:
            # Copy: the candidate object is discarded with its generation.
            best_individual = dict(candidates[top])
            best_individual_fitness = scores[top]

    for generation in range(generations):
        fitness_scores = [fitness_function(individual, edges, peaks_df) for individual in population]
        remember_best(population, fitness_scores)

        current_best_fitness = max(fitness_scores)

        if current_best_fitness > best_fitness + epsilon:
            best_fitness = current_best_fitness
            no_improvement_count = 0
        else:
            no_improvement_count += 1

        if no_improvement_count >= patience:
            print(f"Early stopping after {generation} generations with best fitness: {best_fitness}")
            break

        # Breed the next generation: every selected pair produces two offspring.
        new_population = []
        for _ in range(population_size // 2):
            parent1, parent2 = select_parents(population, fitness_scores)
            offspring1, offspring2 = mutate(crossover(parent1, parent2)), mutate(crossover(parent1, parent2))
            new_population.extend([offspring1, offspring2])

        population = new_population
    else:
        # The loop ran out of generations (or generations == 0): the current
        # population has not been scored yet, so score it before deciding.
        final_scores = [fitness_function(individual, edges, peaks_df) for individual in population]
        remember_best(population, final_scores)

    return best_individual

# Optimise the transition probabilities of the simulated fragmentation tree
# against the observed intensities stored in peaks_df.
transition_probabilities = genetic_algorithm(edges=edges, peaks_df=peaks_df)

# Show the result of the optimisation (one probability per edge).
print(f"Optimized transition probabilities: {transition_probabilities}")

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict

def _match_peak_index(peaks_df, target_mz, tolerance):
    """Return the index label of the peak nearest to target_mz, or None if none lies within tolerance."""
    deviations = (peaks_df['mz'] - target_mz).abs().dropna()
    if deviations.empty or not deviations.min() < tolerance:
        return None
    return deviations.idxmin()


def plot_fragmentation_tree(peaks_df, fragmentation_tree, transition_probabilities, mz_tolerance=0.01):
    """
    Plot a fragmentation tree with Δm/z values and transition probabilities on a layered layout.

    Parameters
    ----------
    peaks_df : DataFrame with an 'mz' column; its index labels become the graph nodes.
    fragmentation_tree : iterable of (parent_mass, child_mass, reaction) triples.
        The reaction is stored as the ``reaction`` edge attribute but is not drawn.
    transition_probabilities : mapping looked up with ``(parent_mass, child_mass)``
        keys; edges without an entry are labelled with probability 0.
    mz_tolerance : float
        Maximum |peak m/z - tree m/z| for a tree mass to be matched to a peak.
        Edges whose parent or child has no peak within this tolerance are skipped
        with a warning instead of raising.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """

    # Create a directed graph
    G = nx.DiGraph()

    node_labels = {}
    node_mz_map = {}  # Mapping node index to the tree m/z it was first matched with

    for parent_mass, child_mass, reaction in fragmentation_tree:
        # Match each tree mass to the nearest peak (vectorised; no per-row lambda).
        parent_node = _match_peak_index(peaks_df, parent_mass, mz_tolerance)
        child_node = _match_peak_index(peaks_df, child_mass, mz_tolerance)
        if parent_node is None or child_node is None:
            print(f"⚠ Warning: No peak within {mz_tolerance} of m/z {parent_mass} -> {child_mass}; skipping edge.")
            continue

        # Register nodes and labels the first time each peak is seen
        for node, mass in ((parent_node, parent_mass), (child_node, child_mass)):
            if node not in G:
                G.add_node(node)
                node_labels[node] = f"m/z {mass:.4f}"
                node_mz_map[node] = mass

        # Add directed edge carrying the reaction as an attribute
        G.add_edge(parent_node, child_node, reaction=reaction)

    # Identify the precursor node (largest m/z value); an empty frame has none
    precursor_node = peaks_df['mz'].idxmax() if not peaks_df.empty else None

    # Compute node depths (distance from precursor ion)
    if precursor_node in G:
        node_depths = dict(nx.single_source_shortest_path_length(G, precursor_node))
    else:
        print(f"⚠ Warning: Precursor node {precursor_node} is not in the graph.")
        node_depths = {node: 0 for node in G.nodes()}  # Default all nodes to level 0

    # Nodes unreachable from the precursor all share ONE extra level below the tree.
    # The level is computed once so successive orphans do not cascade ever deeper.
    orphan_depth = max(node_depths.values(), default=0) + 1
    for node in G.nodes():
        node_depths.setdefault(node, orphan_depth)

    # Group nodes by depth (tree levels)
    levels = defaultdict(list)
    for node, depth in node_depths.items():
        levels[depth].append(node)

    # Custom layout: one row per depth; every node has a depth, hence a position
    pos = {}
    horizontal_spacing = 2.0  # Spacing between nodes at the same level
    vertical_spacing = 1.5    # Spacing between levels

    for depth, nodes in levels.items():
        num_nodes = len(nodes)
        for i, node in enumerate(nodes):
            pos[node] = (i * horizontal_spacing - num_nodes * horizontal_spacing / 2, -depth * vertical_spacing)

    # Add m/z differences and transition probabilities as edge labels
    edge_labels = {}
    for parent, child in G.edges():
        parent_mass = node_mz_map[parent]
        child_mass = node_mz_map[child]
        mz_diff = abs(parent_mass - child_mass)

        # Keys are float tuples, so the lookup only hits on exactly equal masses
        transition_prob = transition_probabilities.get((parent_mass, child_mass), 0)

        print(f"Transition prob for ({parent_mass}, {child_mass}): {transition_prob}")

        # Format the label with Δm/z and transition probability
        edge_labels[(parent, child)] = f"Δm/z: {mz_diff:.2f}, p={transition_prob:.2f}"

    # Draw the graph
    plt.figure(figsize=(12, 10))

    # Draw the nodes
    nx.draw_networkx_nodes(G, pos, node_size=700, node_color='lightblue', edgecolors="black")

    # Draw the edges
    nx.draw_networkx_edges(G, pos, arrowstyle='-|>', arrowsize=15, edge_color='gray', alpha=0.5)

    # Draw the node labels (m/z values)
    nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=10, font_color='black')

    # Draw the edge labels (Δm/z and transition probability)
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=9)

    # Set plot title
    plt.title("Fragmentation Tree with Δm/z, Transition Probabilities", fontsize=14)

    # Show the plot
    plt.show()

# Draw the tree for the current spectrum. `transition_probabilities` is the value
# returned by the GA call earlier in this notebook; `peaks_df` and
# `fragmentation_tree` are presumably built in earlier cells — verify before running.
plot_fragmentation_tree(peaks_df, fragmentation_tree, transition_probabilities)


In [None]:
# Example: pull one compound's stored record out of `compound_data` by name.
compound_name = "3-Des-Microcystein_LR"
metadata = compound_data[compound_name]["metadata"]
spectra_df = compound_data[compound_name]["spectra"]

# Show the stored metadata and spectra, each followed by a blank line,
# then the individual SMILES and InChI fields from the metadata.
print("Stored metadata:", metadata, "\n")
print("Stored spectra:", spectra_df, "\n")
print(metadata["smiles"], "\n")
print(metadata["inchi"])