In [1]:
#!pip install rdkit-pypi


import itertools
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
from IPython.display import display
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.optimize import minimize
import random
import json
from rdkit.Chem import inchi

In [2]:
   ## Negative ion mode
#    'alpha_elimination': '[#6:1]-[#6:2](=[O,S,N:3])-[O-,N-,S-:4]>>[#6-:1].[#6:2](=[O,S,N:3])-[O,N,S:4]',  # α-Elimination
#    'gamma_elimination': '[O-,N-,S-:1]-[#6:2]=[#6:3]-[#6:4]-[*:5]>>[O,N,S:1]=[#6:2]-[#6:3]=[#6:4].[*-:5]',  # γ-Elimination
#    'epsilon_elimination': '[O-,N-,S-:1]-[#6:2]=[#6:3]-[#6:4]=[#6:5]-[#6:6][*:7]>>[O,N,S:1]=[#6:2]-[#6:3]=[#6:4]-[#6:5]=[#6:6].[*-:7]',  # ε-Elimination
#    'displacement_reaction_negative': '[O-,N-,S-:1]-[#6:2]-[#6:3]-[*:4]>>[O,N,S:1]1-[#6:2]-[#6:3]-1.[*-:4]',  # Displacement reaction (negative mode)
#    'beta_hydrogen_removal_negative': '[O-,N-,S-:1]-[#6:2]-[#6;H:3]-[#6:4]-[*:5]>>[O,N,S;H1:1]-[#6:2]-[#6:3]=[#6:4].[*-:5]',  # β-Hydrogen removal (negative mode)
    
 
fragmentations = { 
    # CMF Reactions
    ## Positive ion mode 
    'simple_inductive_cleavage': '[O+,N+,S+:2]-[#6:1]>>[#6+:1].[O,N,S;+0:2]',  # Simple inductive cleavage with charge migration
    'inductive_cleavage_heteroatom': '[O,N,S:1]-[#6:2]-[*+:3]>>[O+,N+,S+:1]=[#6:2].[*;+0:3]',  # Heteroatom-assisted cleavage
    'displacement_reaction_positive_a': '[O,N,S:1]-[#6:2]-[#6:3]-[*+:4]>>[#6:3]1-[#6:2]-[O+,N+,S+:1]-1.[*;+0:4]',  # Displacement reaction in positive ion mode
    'displacement_reaction_positive_b': '[O,N,S,#6:1]=[#6:2]-[#6:3]-[*+:4]>>[O+,N+,S+,C+:1]-[#6:2]=[#6:3].[*;+0:4]',  # Displacement reaction in positive ion mod
    'beta_hydrogen_removal_positive': '[O,N,S:1]-[#6:2]-[#6;H:3]-[#6:4]-[*+:5]>>[O+,N+,S+;H1:1]-[#6:2]-[#6:3]=[#6:4].[*;+0:5]',  # β-hydrogen removal with charge migration
    'grob_wharton_fragmentation': '[O,N,S:1]-[#6:2]-[#6:3]-[#6:4]-[*+:5]>>[O+,N+,S+:1]=[#6:2].[#6:3]=[#6:4].[*;+0:5]',  # Grob-Wharton fragmentation

    #CRF Reactions 
    'remote_hydrogen_rearrangement_a': '[O,N,S:1]-[#6:2]-[#6:3]-[H:4]>>[#6:2]=[#6:3].[H:4]-[O,N,S:1]',  # Remote H rearrangement
    'remote_hydrogen_rearrangement_b': '[#6:1]-[#6:2]-[O:3]-[H:4]>>[#6:1]-[H:4].[#6:2]=[O:3]',  # Alternative remote H rearrangement
    'retro_diels_alder': '[#6:1]1=[#6:2]-[#6:3]-[#6:4]-[#6:5]-[#6:6]-1>>[#6:6]=[#6:1]-[#6:2]=[#6:3].[#6:4]=[#6:5]',  # Retro-Diels-Alder (RDA) reaction
    'retro_ene': '[#6:2]=[#6:1]-[#6:3]-[#6:4]-[#6:5]-[H:6]>>[H:6]-[#6:2]-[#6:3]=[#6:1].[#6:4]=[#6:5]',  # Retro-ene reaction
    'retro_heteroene': '[O,N,S:1]=[#6:2]-[#6:3]-[#6:4]-[#6:5]-[H:6]>>[H:6]-[O,N,S:1]-[#6:2]=[#6:3].[#6:4]=[#6:5]',  # Retro-heteroene reaction
    'charge_remote_fragmentation': '[H:1]-[#6:2]-[#6:3]-[#6:4]-[#6:5]-[H:6]>>[#6:2]=[#6:3].[#6:4]=[#6:5].[H:1]-[H:6]',  # Charge remote fragmentation
    'aromatic_elimination': '[#6:1]-[#6:2]-[#6:3]-[#6:4]-[#6:5]-[#6:6]-[#6:7]-[#6:8]-[O,N,S:9]>>[#6:1]=[#6:8]-[O,N,S:9].[#6:2]1[#6:3][#6:4][#6:5][#6:6][#6:7]1',  
    'pericyclic_shift': '[#6:1]-[#6:2]-[#6:3]-[#6:4]-[#6:5]-[#6:6]>>[#6:1]=[#6:2].[#6:5]=[#6:6].[#6:3]=[#6:4]', 
    'pericyclic_1_3_shift': '[H:1]-[#6:2]-[#6:3]=[#6:4]-[#6:5]=[#6:6]>>[#6:2]=[#6:3]-[#6:4](-[H:1])=[#6:5]-[#6:6]', 
    'pericyclic_1_5_shift': '[H:1]-[#6:2]-[#6:3]=[#6:4]-[#6:5]=[#6:6]>>[#6:2]=[#6:3]-[#6:4]=[#6:5]-[#6:6](-[H:1])', 
    'carbon_monoxide_elimination_a': '[#6:1]1-[#6:2]-[#6:3]-[#6:4]-[#6:5]-[#6:6](=O)-1>>[#6:1]1-[#6:2]-[#6:3]-[#6:4]-[#6:5]-1.[#6:6](#O)',  # CO elimination from cyclic carbonyls
    'carbon_monoxide_elimination_b': '[#6:6](#[O+1])-[#6:1]-[#6:2]-[#6:3]-[#6:4]-[#6-:5]>>[#6:1]1-[#6:2]-[#6:3]-[#6:4]-[#6:5]-1.[#6:6](#O;+0)',  # Alternative CO elimination
    'radical_fragmentation': '[#6:1]-[O,N,S:2]>>[#6^1:1].[O^1,N^1,S^1:2]',  # Radical fragmentation
}


# GNPS-JSON reader

In [3]:
# Load the JSON data from a file
file_path = "GNPS-LIBRARY.json"
with open(file_path, 'r') as f:
    data = json.load(f)
    
compound_data = {}

# Iterate over each compound in the JSON data
for compound in data[:5000]:
    compound_name = compound.get("Compound_Name", "Unknown")

    metadata = {
        "spectrum_id": compound.get("spectrum_id"),
        "source_file": compound.get("source_file"),
        "task": compound.get("task"),
        "scan": compound.get("scan"),
        "ms_level": compound.get("ms_level"),
        "library_membership": compound.get("library_membership"),
        "precursor_mz": compound.get("Precursor_MZ"),
        "exact_mass": compound.get("ExactMass"),
        "charge": compound.get("Charge"),
        "compound_source": compound.get("Compound_Source"),
        "instrument": compound.get("Instrument"),
        "ion_source": compound.get("Ion_Source"),
        "ion_mode": compound.get("Ion_Mode")
    }
    
    def is_valid_smiles(smiles):
        """Check if a SMILES string is valid using RDKit."""
        if not smiles:
            return False
        mol = Chem.MolFromSmiles(smiles)
        return mol is not None

    def generate_inchi_inchikey(smiles):
        """Convert a SMILES string to InChI and InChIKey using RDKit."""
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            inchi_str = inchi.MolToInchi(mol)
            inchikey_str = inchi.MolToInchiKey(mol)
            return inchi_str, inchikey_str
        return None, None

    def generate_smiles_from_inchi(inchi_str):
        """Convert an InChI string to a SMILES string using RDKit."""
        mol = inchi.MolFromInchi(inchi_str)
        return Chem.MolToSmiles(mol) if mol else None

    # Retrieve SMILES and InChI
    smiles = compound.get("Smiles") or compound.get("SMILES")
    inchi_str = compound.get("INCHI")
    if smiles == "": continue
    if inchi == "N/A": continue
        
    # Validate and process SMILES
    inchikey_str = None
    if is_valid_smiles(smiles):
        inchi_str, inchikey_str = generate_inchi_inchikey(smiles)
    elif inchi_str:
        smiles = generate_smiles_from_inchi(inchi_str)

    # Store processed identifiers
    metadata["smiles"] = smiles or ""
    metadata["inchi"] = inchi_str
    metadata["inchikey"] = inchikey_str

    if metadata["smiles"] == "" and metadata["inchi"] == "N/A": continue
    if metadata['charge'] == "0": continue
        
    try: 
        metadata['precursor_mz'] = float(metadata['precursor_mz'])
    except: continue

    if metadata['precursor_mz'] > 300: continue
    if metadata["ion_mode"] == 'Negative': continue
        
    
    # Validate and parse peaks_json
    peaks_json = compound.get("peaks_json")
    if peaks_json and peaks_json != "N/A":
        try:
            peaks = json.loads(peaks_json)
            spectra_df = pd.DataFrame(peaks, columns=["mz", "intensity"])
        except json.JSONDecodeError:
            spectra_df = pd.DataFrame(columns=["mz", "intensity"])
    else:
        spectra_df = pd.DataFrame(columns=["mz", "intensity"])

    # Normalize intensity (max = 1)
    spectra_df["intensity"] = spectra_df["intensity"] / spectra_df["intensity"].max()
    
    # Filter out peaks with normalized intensity < 0.05
    df_filtered = spectra_df[spectra_df["intensity"] >= 0.05]
    
    # Store the metadata and spectra dataframe in the dictionary
    compound_data[compound_name] = {
        "metadata": metadata,
        "spectra": df_filtered
    }


[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 

[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 

[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 

[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 






[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 

[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 

[15:57:46] SMILES Parse Error: syntax er

[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 

[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 

[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 

[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 

[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 

[15:57:46] SMILES Parse Error: syntax error while parsing: N/A
[15:57:46] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:46] ERROR: 

[15:57:46] SMILES Parse Error: syntax error wh

[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error wh





[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax erro









[15:57:47] SMILES Parse Error: syntax error while parsing: N/A
[15:57:47] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: 
[15:57:47] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:47] ERROR: 











[15:57:47] SMILES Parse Error: syntax error while parsing: 
[15:57:47] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: 
[15:57:47] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: 
[15:57:47] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: 
[15:57:47] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:47] ERROR: 

[15:57:47] SMILES Parse Error: syntax error while parsing: 
[15


























[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15





[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse 

[15:57:48] ERROR: 

[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: unclosed ring for input: 'O=C([C@H](CC)C)O[C@H]1CCC=C2C1[C@@H](CC[C@@H](O)C[C@@H](O)CC(OC)=O)[C@@H](C)C=C3'
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: unclosed ring for input: 'O=C(N[C@@H](CCCCCC(CC)=O)C(N[C@@H](CC1=CN(OC)C2=C1C=CC=C2)C3=O)=O)[C@@H]4N(C([C@H]([C@H](CC)C)N3)=O)CCCC5'
[15:57:48] ERROR: 


[15:57:48] SMILES Parse Error: unclosed ring for input: 'O=C(N(C(C=CC=C1)=C1C(N(C)[C@@]2([H])CC3=CC=CC=C3)=O)C2=N4)C5=C4C=CC=C6'
[15:57:48] ERROR: 





[15:57:48] SMILES Parse Error: unclosed ring for input: 'OC1=CC=C(CC(C(NC(C(CC)C)C(OC(C(CCCCCCCCCC)C)CC(NC(C(NC(C(NC(C(NC2CCC(N)=O)=O)C)=O)C)=O)C(O)C)=O)=O)=O)NC2=O)C=C2'
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:48] ERROR: 

[15:57:48] SMILES Parse Error: syntax error while parsing: 
[15:57:48] SMILES Parse Error: Failed parsing SMILES ' ' for

[15:57:49] SMILES Parse Error: syntax error while parsing: 
[15:57:49] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:49] ERROR: 

[15:57:49] SMILES Parse Error: syntax error while parsing: 
[15:57:49] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:49] ERROR: 

[15:57:49] SMILES Parse Error: syntax error while parsing: 
[15:57:49] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:49] ERROR: 

[15:57:49] SMILES Parse Error: syntax error while parsing: 
[15:57:49] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:49] ERROR: 

[15:57:49] SMILES Parse Error: syntax error while parsing: 
[15:57:49] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:49] ERROR: 

[15:57:49] SMILES Parse Error: syntax error while parsing: 
[15:57:49] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:49] ERROR: 

[15:57:49] SMILES Parse Error: syntax error while parsing: 
[15:57:49] SMILES Parse Erro

[15:57:50] SMILES Parse Error: syntax error while parsing: 
[15:57:50] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:50] ERROR: 

[15:57:50] SMILES Parse Error: syntax error while parsing: 
[15:57:50] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:50] ERROR: 

[15:57:50] SMILES Parse Error: syntax error while parsing: 
[15:57:50] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:50] ERROR: 

[15:57:50] SMILES Parse Error: syntax error while parsing: 
[15:57:50] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:50] ERROR: 

[15:57:50] SMILES Parse Error: syntax error while parsing: 
[15:57:50] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:50] ERROR: 

[15:57:50] SMILES Parse Error: syntax error while parsing: 
[15:57:50] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:50] ERROR: 

[15:57:50] SMILES Parse Error: syntax error while parsing: 
[15:57:50] SMILES Parse Erro

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 


[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 


[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Er

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 



[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Er







[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Pars


[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 

[15:57:51] SMILES Parse Error: syntax error while parsing: 
[15:57:51] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:51] ERROR: 











































































































[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 


[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '






[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse 













[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILE



[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 



[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse Error: Failed parsing SMILES ' ' for input: ' '
[15:57:52] ERROR: 

[15:57:52] SMILES Parse Error: syntax error while parsing: 
[15:57:52] SMILES Parse 

In [4]:
keys = list(compound_data.keys())

In [5]:
keys

['1-hydroxyphenazine',
 '1-methoxyphenazine',
 'phenazine-1-carboxylic acid',
 'phenazine-1-carboxamide',
 'phenazine',
 '2-heptyl-3-hydroxy 4-quinolone',
 'pyocyanin',
 'Chrysophanic acid',
 'Citrinin',
 'cyclo(L-Phe-L-Pro)',
 'cyclo(L-Leu-L-Pro)',
 'cyclo(Phe-Leu)',
 'cyclo(Trp-Ala)',
 'pateamine',
 'cyclo(L-Trp-L-Pro)',
 'cyclo(L-Val-L-Pro)',
 'S-4-benzyl-3-isobutyryloxazolidin-2-one',
 'R-4-benzyl-3-isobutyryloxazolidin-2-one',
 '"S-4-benzyl-3-isobutyryl-5,5-dimethyloxazolidin-2-one"',
 '"R-4-benzyl-3-isobutyryl-5,5-dimethyloxazolidin-2-one"',
 'tBuO-val-hexanoate (tert-butyl hexanoylvalinate)',
 'Hexose',
 'HHQ aka 2-heptylquinolin-4(1H)-one',
 'NHQ aka 2-nonylquinolin-4(1H)-one',
 'Phenazine-1-carboxylic acid',
 'Phosphocholine',
 'PQS-C9 and NQNO mixture, 2-nonyl-3-hydroxy 4(1H)-quinolone ',
 'UHQ C11:1 aka 2-undecenyl-quinoloin-4(1H)-one position of double bond unknown',
 '5-methyl phenazine-1-carboxylic acid',
 'Octadecenoic acid',
 'Hexadecenoic acid',
 'cis-9,10-methylene-he

In [6]:
molecule_smiles = compound_data[keys[5]]['metadata']['smiles']
molecule = Chem.MolFromSmiles(molecule_smiles)
initial_mass = CalcExactMolWt(molecule)


reactions = {name: AllChem.ReactionFromSmarts(smarts) for name, smarts in fragmentations.items()}
ionisation_rxn = AllChem.ReactionFromSmarts('[O,N,S:2]>>[H][O+,N+,S+:2]')

product_sets = ionisation_rxn.RunReactants((molecule,))
if not product_sets:
    raise ValueError("Ionisation failed, no products generated.")
ionised_product = product_sets[0][0]
Chem.SanitizeMol(ionised_product)
ionised_mass = CalcExactMolWt(ionised_product)
random_intensity = random.uniform(0.1, 100)  # Generate a random intensity
peaks_df = pd.DataFrame({'mz': [ionised_mass], 'intensity': [random_intensity]})

print(peaks_df)

[15:57:53] SMARTS Parse Error: syntax error while parsing: [#6:6](#O;+0)
[15:57:53] SMARTS Parse Error: Failed parsing SMARTS '[#6:6](#O;+0)' for input: '[#6:6](#O;+0)'


ValueError: ChemicalReactionParserException: Problems constructing product from SMARTS: [#6:6](#O;+0)

In [None]:
molecule_smiles = 'OC(=O)CNC(=O)C1=CC=C(C=C1)' #compound_data[keys[0]]['metadata']['smiles']
molecule = Chem.MolFromSmiles(molecule_smiles)

In [None]:
molecule

In [None]:
ionisation_rxn

In [None]:
initial_mass = CalcExactMolWt(molecule)
ionised_sets = ionisation_rxn.RunReactants((molecule,))
# Flatten the list of products
ionised_mols = [product[0] for product in ionised_sets]
print(ionised_mols)
print([Chem.MolToSmiles(product[0]) for product in ionised_sets] )
# Draw the products in a grid
Draw.MolsToGridImage(ionised_mols, molsPerRow=2, subImgSize=(200, 200))

In [None]:
# Example of accessing data:
compound_name = "Caramboxin"
spectra_df = compound_data[compound_name]["spectra"]

print("Stored metadata:", compound_data[compound_name]["metadata"],"\n")
#print("Stored spectra:\n", compound_data[compound_name]["spectra"], "\n")
#print(metadata["smiles"],"\n")  
#print(metadata["inchi"])

def generateIonisedMolecules(compound_name, compound_data = compound_data):
    metadata = compound_data[compound_name]["metadata"]
    spectra_df = compound_data[compound_name]["spectra"]
    try:
        molecule = Chem.MolFromSmiles(metadata["smiles"])
    except:
        try:
            molecule = Chem.MolFromInchi(metadata['inchi'])
        except:
            print('structure generation fail')
    
    initial_mass = CalcExactMolWt(molecule)
    print(initial_mass)
    ionised_sets = ionisation_rxn.RunReactants((molecule,))
    # Flatten the list of products
    ionised_mols = [product[0] for product in ionised_sets]
    print([Chem.MolToSmiles(product[0]) for product in ionised_sets] )
    print(len(ionised_mols))
    
    if len(ionised_mols)>0: return ionised_mols
    else: return []

#for compound_name in keys:
ionised_mols = generateIonisedMolecules(compound_name)
try:
    img = Draw.MolsToGridImage(ionised_mols, molsPerRow=2, subImgSize=(200, 200))
except: 
    print('no ionisedMols')
    img = None
img

In [None]:
def recursive_reaction_processing(reactions, starting_mols):
    """
    Recursively applies reactions and tracks parent-child relationships.

    Args:
        reactions (dict): Dictionary {reaction_name: RDKit Reaction}.
        starting_mols (list): List of RDKit molecules.

    Returns:
        tuple: (all_products, parent_map)
    """
    all_products = {}
    parent_map = {}  # Maps child InChIKey -> list of (Parent InChIKey, Reaction Name, Level)
    queue = deque([(mol, "Starting Material", 1) for mol in starting_mols])

    while queue:
        current_mols = [mol for mol, _, level in queue]
        current_level = queue[0][2]

        # Apply reactions and track new products
        new_products = apply_reactions(reactions, current_mols, current_level + 1, all_products)

        # Track parent-child relationships
        for inchikey, (parent_inchikey, prod, rxn_name, lvl) in new_products.items():
            prod_inchikey = Chem.MolToInchiKey(prod)

            # Ensure multiple parents are stored
            if prod_inchikey not in parent_map:
                parent_map[prod_inchikey] = []
            parent_map[prod_inchikey].append((parent_inchikey, rxn_name, lvl))

            # Avoid duplicate processing
            if prod_inchikey not in all_products:
                queue.append((prod, rxn_name, lvl))

        # Remove processed molecules
        queue.popleft()

    return all_products, parent_map


In [None]:
def build_reaction_graph(all_products, parent_map, starting_mols):
    """
    Builds a directed graph from reaction products, supporting multiple parents per node.
    """
    G = nx.DiGraph()
    edges = []  # Store edges for debugging
    fragmentation_tree = []  # Store fragmentation info

    # Add starting materials
    for mol in starting_mols:
        inchikey = Chem.MolToInchiKey(mol)
        smiles = Chem.MolToSmiles(mol)
        mass = Descriptors.ExactMolWt(mol)
        G.add_node(inchikey, smiles=smiles, mass=mass, label=f"{smiles}\n{mass:.2f}", level=0)

    # Add reaction products
    for inchikey, (parent_inchikey, mol, reaction_name, level) in all_products.items():
        smiles = Chem.MolToSmiles(mol)
        mass = Descriptors.ExactMolWt(mol)
        G.add_node(inchikey, smiles=smiles, mass=mass, label=f"{smiles}\n{mass:.2f}", level=level)

    # Add edges and track debugging info
    for child, parent_list in parent_map.items():  # Iterate over children
        for parent, reaction_name, lvl in parent_list:  # Iterate over multiple parents
            if parent in G.nodes and child in G.nodes:
                parent_mass = G.nodes[parent]["mass"]
                child_mass = G.nodes[child]["mass"]
                mass_loss = parent_mass - child_mass  # Calculate mass loss

                G.add_edge(parent, child, reaction=reaction_name)

                # Store debugging output
                edges.append((parent_mass, child_mass))
                fragmentation_tree.append((parent_mass, child_mass, reaction_name))
    # Debugging print statements
    #print("\nEdges:")
    #for edge in edges:
    #    print(f"  {edge}")

    #print("\nFragmentation Tree:")
    #for frag in fragmentation_tree:
    #    print(f"  {frag}")

    return G, edges, fragmentation_tree



def tree_layout(G):
    """Generates a tree-like layout based on reaction level."""
    levels = {}  # Track nodes by their level
    for node in G.nodes:
        level = G.nodes[node]["level"]
        if level not in levels:
            levels[level] = []
        levels[level].append(node)

    pos = {}
    max_width = max(len(nodes) for nodes in levels.values())  # Widest level
    for y, (level, nodes) in enumerate(sorted(levels.items())):
        x_positions = np.linspace(-max_width / 2, max_width / 2, num=len(nodes))
        for x, node in zip(x_positions, nodes):
            pos[node] = (x, -y)  # Y-axis goes downward

    return pos

def draw_reaction_graph(G):
    """Visualises the reaction graph with tree structure."""
    plt.figure(figsize=(12, 8))

    pos = tree_layout(G)  # Tree layout based on levels
    labels = {node: G.nodes[node]["label"] for node in G.nodes}
    edge_labels = {(u, v): G.edges[u, v]["reaction"] for u, v in G.edges}

    nx.draw(G, pos, with_labels=True, labels=labels, node_color="lightblue", edge_color="black", 
            node_size=2000, font_size=8, arrows=True)
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=7)

    plt.title("Possible Reaction Network")
    plt.show()

# Example usage:
all_results, parent_relationships = recursive_reaction_processing(reactions, ionised_mols)
reaction_graph, edges, fragmentation_tree = build_reaction_graph(all_results, parent_relationships, ionised_mols)
draw_reaction_graph(reaction_graph)


In [None]:
# Extract unique m/z values from parent_mass and child_mass
mz_values = set()
for parent_mass, child_mass, _ in fragmentation_tree:
    mz_values.add(parent_mass)
    mz_values.add(child_mass)

# Create a DataFrame with intensity initialized to 100
peaks_df = pd.DataFrame({"mz": list(mz_values), "intensity": 1}).sort_values(by="mz", ascending=False)

# Display the DataFrame
print(peaks_df)

# Genetic algorithm

In [None]:
population_size = 50  # Number of individuals in each generation
generations = 1000  # Number of generations to evolve
mutation_rate = 0.1  # Probability of mutating a transition probability
tolerance = 0.05  # Error tolerance for intensity difference

import numpy as np
import pandas as pd
import networkx as nx
from scipy.spatial import cKDTree

import numpy as np
import pandas as pd
import networkx as nx
from scipy.spatial import cKDTree

def fitness_function(transition_probs, edges, peaks_df, tolerance=0.05):
    """
    Compare predicted intensities to observed intensities and optimize transition probabilities.
    Uses m/z tolerance for matching peaks during scoring.
    """
    # Create a directed graph of the fragmentation tree
    G = nx.DiGraph()
    G.add_edges_from(edges)

    # Perform topological sorting (process precursors before fragments)
    try:
        topo_order = list(nx.topological_sort(G))
    except nx.NetworkXUnfeasible:
        print("Graph contains a cycle, cannot perform topological sort.")
        return -float('inf')

    # Initialize calculated intensities using edges and peaks_df mz values
    all_mz = set(peaks_df['mz']).union({i for edge in edges for i in edge})
    calculated_intensities = {mz: 0.0 for mz in all_mz}

    precursor_node = topo_order[0]
    if precursor_node not in calculated_intensities:
        print(f"Precursor node {precursor_node} not in peaks_df or edges")
        return -float('inf')

    calculated_intensities[precursor_node] = 1.0  # Initial intensity

    # Normalize transition probabilities to ensure sum <= 1
    for node in topo_order:
        successors = list(G.successors(node))
        total_prob = sum(transition_probs.get((node, succ), 0) for succ in successors)
        
        if total_prob > 1 + 1e-6:  # Check if probabilities need normalization
            for succ in successors:
                transition_probs[(node, succ)] /= total_prob

    # Propagate intensities through the fragmentation tree
    for node in topo_order:
        current_intensity = calculated_intensities[node]
        successors = list(G.successors(node))
        
        total_prob = sum(transition_probs.get((node, succ), 0) for succ in successors)
        intensity_to_distribute = current_intensity * total_prob
        
        # Distribute intensity to successors
        for succ in successors:
            prob = transition_probs.get((node, succ), 0)
            calculated_intensities[succ] += current_intensity * prob
        
        # Remove distributed intensity from current node
        calculated_intensities[node] -= intensity_to_distribute

    # Prepare data for matching
    observed_mz = peaks_df['mz'].values
    observed_int = peaks_df['intensity'].values
    calculated_mz = np.array(list(calculated_intensities.keys()))
    calculated_int = np.array([calculated_intensities[mz] for mz in calculated_mz])

    # Match observed peaks to calculated peaks within tolerance
    calc_tree = cKDTree(calculated_mz.reshape(-1, 1))
    distances, indices = calc_tree.query(observed_mz.reshape(-1, 1), k=1, distance_upper_bound=tolerance)
    
    # Calculate score components
    score = 0
    matched_calc = set()
    
    # Process matched pairs
    for obs_idx, (dist, calc_idx) in enumerate(zip(distances, indices)):
        if dist <= tolerance:
            matched_calc.add(calc_idx)
            obs_i = observed_int[obs_idx]
            calc_i = calculated_int[calc_idx]
            
            if obs_i > 0:
                error = abs(calc_i - obs_i) / obs_i
                if error <= tolerance:
                    score += 1 - error
                else:
                    score -= error
            else:
                if calc_i > 0:
                    score -= 1  # Penalize false positive

    # Penalize unmatched observed peaks (false negatives)
    for obs_idx in range(len(observed_mz)):
        if distances[obs_idx] > tolerance:
            obs_i = observed_int[obs_idx]
            if obs_i > 0:
                score -= 1  # Expected intensity but nothing detected

    # Penalize unmatched calculated peaks (false positives)
    for calc_idx in range(len(calculated_mz)):
        if calc_idx not in matched_calc:
            calc_i = calculated_int[calc_idx]
            if calc_i > 1e-6:  # Only penalize significant intensities
                score -= calc_i  # Penalize by intensity magnitude

    return score

def select_parents(population, fitness_scores):
    """
    Select parents using a roulette-wheel selection based on fitness scores.
    """
    min_fitness = min(fitness_scores)
    if min_fitness < 0:
        fitness_scores = [score - min_fitness + 1 for score in fitness_scores]

    total_fitness = sum(fitness_scores)
    selection_probs = [score / total_fitness if total_fitness != 0 else 1/len(fitness_scores) for score in fitness_scores]
    
    return np.random.choice(population, size=2, p=selection_probs, replace=False)

def initialize_population(edges):
    """
    Initialize a population of random transition probabilities for each edge.
    """
    population = []
    for _ in range(population_size):
        individual = {edge: random.uniform(0, 1) for edge in edges}
        population.append(individual)
    return population

def crossover(parent1, parent2):
    """
    Perform crossover and ensure transition probabilities are valid.
    """
    offspring = {key: parent1[key] if random.random() < 0.5 else parent2[key] for key in parent1}
    return normalize_transition_probs(offspring, edges)


def normalize_transition_probs(individual, edges):
    """
    Ensure that transition probabilities from each node sum to at most 1.
    """
    outgoing_edges = {}
    
    # Collect all outgoing edges for each node
    for (i, j) in edges:
        if i not in outgoing_edges:
            outgoing_edges[i] = []
        outgoing_edges[i].append((i, j))
    
    # Normalize probabilities per node
    for node, edges_out in outgoing_edges.items():
        total_prob = sum(individual.get(edge, 0) for edge in edges_out)
        if total_prob > 1:
            for edge in edges_out:
                individual[edge] /= total_prob  # Scale down
    
    return individual

def mutate(individual):
    """
    Perform mutation on an individual while keeping probabilities valid.
    """
    for key in individual:
        if random.random() < mutation_rate:
            individual[key] += random.uniform(-0.1, 0.1)
            individual[key] = max(0, min(1, individual[key]))  # Keep probabilities in [0,1]

    return normalize_transition_probs(individual, edges)  # Ensure normalization after mutation


def genetic_algorithm(edges, peaks_df, patience=10, epsilon=1e-6):
    """
    Run the Genetic Algorithm to optimize transition probabilities for the fragmentation tree.
    """
    # Initialize population
    population = initialize_population(edges)
    
    best_fitness = float('-inf')
    no_improvement_count = 0

    for generation in range(generations):
        fitness_scores = [fitness_function(individual, edges, peaks_df) for individual in population]

        current_best_fitness = max(fitness_scores)
        
        if current_best_fitness > best_fitness + epsilon:
            best_fitness = current_best_fitness
            no_improvement_count = 0
        else:
            no_improvement_count += 1
        
        if no_improvement_count >= patience:
            print(f"Early stopping after {generation} generations with best fitness: {best_fitness}")
            break

        new_population = []
        for _ in range(population_size // 2):
            parent1, parent2 = select_parents(population, fitness_scores)
            offspring1, offspring2 = mutate(crossover(parent1, parent2)), mutate(crossover(parent1, parent2))
            new_population.extend([offspring1, offspring2])
        
        population = new_population
    
    best_individual = max(population, key=lambda ind: fitness_function(ind, edges, peaks_df))
    return best_individual



def calculateSpectra(transition_probs, edges, peaks_df, tolerance=0.05):
    """
    Compare predicted intensities to observed intensities and optimize transition probabilities.
    Uses m/z tolerance for matching peaks during scoring.
    """
    # Create a directed graph of the fragmentation tree
    G = nx.DiGraph()
    G.add_edges_from(edges)

    # Perform topological sorting (process precursors before fragments)
    try:
        topo_order = list(nx.topological_sort(G))
    except nx.NetworkXUnfeasible:
        print("Graph contains a cycle, cannot perform topological sort.")
        return -float('inf')

    # Initialize calculated intensities using edges and peaks_df mz values
    all_mz = set(peaks_df['mz']).union({i for edge in edges for i in edge})
    calculated_intensities = {mz: 0.0 for mz in all_mz}

    precursor_node = topo_order[0]
    if precursor_node not in calculated_intensities:
        print(f"Precursor node {precursor_node} not in peaks_df or edges")
        return -float('inf')

    calculated_intensities[precursor_node] = 1.0  # Initial intensity

    # Normalize transition probabilities to ensure sum <= 1
    for node in topo_order:
        successors = list(G.successors(node))
        total_prob = sum(transition_probs.get((node, succ), 0) for succ in successors)
        
        if total_prob > 1 + 1e-6:  # Check if probabilities need normalization
            for succ in successors:
                transition_probs[(node, succ)] /= total_prob

    # Propagate intensities through the fragmentation tree
    for node in topo_order:
        current_intensity = calculated_intensities[node]
        successors = list(G.successors(node))
        
        total_prob = sum(transition_probs.get((node, succ), 0) for succ in successors)
        intensity_to_distribute = current_intensity * total_prob
        
        # Distribute intensity to successors
        for succ in successors:
            prob = transition_probs.get((node, succ), 0)
            calculated_intensities[succ] += current_intensity * prob
        
        # Remove distributed intensity from current node
        calculated_intensities[node] -= intensity_to_distribute

    # Prepare data for matching
    observed_mz = peaks_df['mz'].values
    observed_int = peaks_df['intensity'].values
    calculated_mz = np.array(list(calculated_intensities.keys()))
    calculated_int = np.array([calculated_intensities[mz] for mz in calculated_mz])

    calculatedSpectra = pd.DataFrame({'mz':calculated_mz, "intensity": calculated_int})
    calculatedSpectra['intensity'] = calculatedSpectra['intensity'] / calculatedSpectra['intensity'].max()
    return calculatedSpectra

# Call the GA to calculate transition probabilities
transition_probabilities = genetic_algorithm(edges, spectra_df)
calculatedSpectra = calculateSpectra(transition_probabilities, edges, spectra_df, tolerance=0.05)

# Print the final optimized transition probabilities
print(f"Optimized transition probabilities: {transition_probabilities}")

In [None]:
calculatedSpectra

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict

def plot_fragmentation_tree(peaks_df, fragmentation_tree, transition_probabilities):
    """
    Plots a fragmentation tree with Δm/z values and transition probabilities,
    excluding nodes where all incoming edges have a transition probability of 0.
    """
    # Create a directed graph
    G = nx.DiGraph()
    node_labels = {}
    node_mz_map = {}

    for parent_mass, child_mass, reaction in fragmentation_tree:
        parent_node = peaks_df[peaks_df['mz'].apply(lambda x: abs(x - parent_mass) < 0.01)].index[0]
        child_node = peaks_df[peaks_df['mz'].apply(lambda x: abs(x - child_mass) < 0.01)].index[0]

        transition_prob = transition_probabilities.get((parent_mass, child_mass), 0)
        if transition_prob > 0:
            if parent_node not in G:
                G.add_node(parent_node)
                node_labels[parent_node] = f"m/z {parent_mass:.1f}"
                node_mz_map[parent_node] = parent_mass

            if child_node not in G:
                G.add_node(child_node)
                node_labels[child_node] = f"m/z {child_mass:.1f}"
                node_mz_map[child_node] = child_mass

            G.add_edge(parent_node, child_node, reaction=reaction, weight=transition_prob)

    if not G.nodes:
        print("No valid fragmentation pathways with nonzero transition probabilities.")
        return

    precursor_node = peaks_df['mz'].idxmax()
    node_depths = nx.single_source_shortest_path_length(G, precursor_node) if precursor_node in G else {}
    
    levels = defaultdict(list)
    for node, depth in node_depths.items():
        levels[depth].append(node)

    pos = {}
    horizontal_spacing = 2.0
    vertical_spacing = 1.5
    for depth, nodes in levels.items():
        for i, node in enumerate(nodes):
            pos[node] = (i * horizontal_spacing - len(nodes) * horizontal_spacing / 2, -depth * vertical_spacing)

    edge_labels = {}
    for parent, child in G.edges():
        parent_mass = node_mz_map[parent]
        child_mass = node_mz_map[child]
        mz_diff = abs(parent_mass - child_mass)
        transition_prob = transition_probabilities.get((parent_mass, child_mass), 0)
        edge_labels[(parent, child)] = f"Δm/z: {mz_diff:.2f}, p={transition_prob:.2f}"

    plt.figure(figsize=(12, 10))
    nx.draw_networkx_nodes(G, pos, node_size=700, node_color='lightblue', edgecolors="black")
    nx.draw_networkx_edges(G, pos, arrowstyle='-|>', arrowsize=15, edge_color='gray', alpha=0.5)
    nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=10, font_color='black')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=9)
    plt.title("Filtered Fragmentation Tree", fontsize=14)
    plt.show()


plot_fragmentation_tree(peaks_df, fragmentation_tree, transition_probabilities)


In [None]:
spectra_df

In [None]:
import matplotlib.pyplot as plt

def plot_mass_spectrum(peaks_df, title):
    """
    Plots a mass spectrum using the optimised intensities from peaks_df.
    """

    # Extract m/z values and their corresponding optimized intensities
    mz_values = peaks_df["mz"].values
    optimized_intensities = peaks_df["intensity"].values  # GA-optimized intensities

    # Create the plot
    plt.figure(figsize=(12, 6))
    plt.vlines(mz_values, ymin=0, ymax=optimized_intensities, color='blue', linewidth=1.5, label="Optimised Intensities")

    # Add labels and title
    plt.xlabel("m/z", fontsize=14)
    plt.ylabel("Intensity", fontsize=14)
    plt.title(title, fontsize=16)

    # Customize the x-axis to show only relevant peaks
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # Show grid for better readability
    plt.grid(True, linestyle='--', alpha=0.6)

    # Show the plot
    plt.show()

# Call the function to plot the spectrum
plot_mass_spectrum(spectra_df, title = "Experimental Data")

In [None]:
import matplotlib.pyplot as plt

def plot_comparison_spectra(peaks_df, spectra_df):
    """
    Plots the actual (experimental) mass spectrum from spectra_df and 
    the optimised mass spectrum from peaks_df side by side, with the same x-axis range.
    """

    # Extract Experimental (Actual) Data
    exp_mz_values = spectra_df["mz"].values
    exp_intensities = spectra_df["intensity"].values  # Experimental intensities

    # Extract Optimized (GA) Data
    opt_mz_values = peaks_df["mz"].values
    opt_intensities = peaks_df["intensity"].values  # Optimized intensities from GA

    # Determine the global x-axis range (m/z)
    global_min_mz = min(min(exp_mz_values), min(opt_mz_values))  # Minimum m/z across both spectra
    global_max_mz = max(max(exp_mz_values), max(opt_mz_values))  # Maximum m/z across both spectra

    # Create a figure with 2 subplots
    fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

    # --- Plot Experimental Spectrum ---
    axes[0].vlines(exp_mz_values, ymin=0, ymax=exp_intensities, color='red', linewidth=1.5)
    axes[0].set_xlabel("m/z", fontsize=12)
    axes[0].set_ylabel("Intensity", fontsize=12)
    axes[0].set_title("Experimental Mass Spectrum", fontsize=14)
    axes[0].grid(True, linestyle='--', alpha=0.5)
    axes[0].set_xlim(global_min_mz, global_max_mz)  # Set x-axis range

    # --- Plot Optimized Spectrum ---
    axes[1].vlines(opt_mz_values, ymin=0, ymax=opt_intensities, color='blue', linewidth=1.5)
    axes[1].set_xlabel("m/z", fontsize=12)
    axes[1].set_title("Simulated Mass Spectrum", fontsize=14)
    axes[1].grid(True, linestyle='--', alpha=0.5)
    axes[1].set_xlim(global_min_mz, global_max_mz)  # Set x-axis range

    # Show the plot
    plt.tight_layout()
    plt.show()

# Call the function to plot both spectra
plot_comparison_spectra(calculatedSpectra, spectra_df)