In [81]:
import pandas as pd
import numpy as np
from collections import defaultdict
import ms_pred.magma.fragmentation as fe
from ms_pred.common.plot_utils import *
from ms_pred.dag_pred import joint_model
set_style()
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [82]:
# Query molecule (as a SMILES string) and the adduct label used for prediction.
test_smiles, test_ionization = (
    "COC1=C(C2=C[N+]3=C(C=C2C=C1)C4=CC5=C(C=C4CC3)OCO5)OC",
    "[M+H]+",
)

In [83]:
# Checkpoint files for the two model stages. The paths are relative, so they
# resolve against the notebook's working directory.
gen_ckpt = "../quickstart/iceberg/models/nist_iceberg_generate.ckpt"
inten_ckpt = "../quickstart/iceberg/models/nist_iceberg_score.ckpt"

# Build the joint model from both checkpoints.
model = joint_model.JointModel.from_checkpoints(
    gen_checkpoint=gen_ckpt,
    inten_checkpoint=inten_ckpt,
)

In [84]:
# Prediction settings, collected in one place before the call.
predict_kwargs = dict(
    smi=test_smiles,
    adduct=test_ionization,
    device="cpu",
    max_nodes=100,
    binned_out=False,
    threshold=0,
)
outputs = model.predict_mol(**predict_kwargs)

In [85]:
# Pull the root molecule's InChI and the predicted fragments out of the result.
root_inchi, frags = outputs["root_inchi"], outputs["frags"]

# Fragmentation engine built from the root InChI string.
engine = fe.FragmentEngine(mol_str_type="inchi", mol_str=root_inchi)

In [86]:
# Collapse the per-fragment predictions in `frags` into one entry per m/z value.
# Each entry keeps the summed intensity plus the hash and formula of one of the
# fragments that contributed to it.
mass_to_obj = defaultdict(dict)
for k, val in frags.items():
    masses, intens, form = val["mz_charge"], val["intens"], val["form"]
    for m, i in zip(masses, intens):
        if i <= 0:
            # Peaks without positive intensity are ignored entirely.
            continue
        cur_obj = mass_to_obj[m]
        if cur_obj.get("inten", 0) > 0:
            # This m/z was seen before: relabel it when the new contribution
            # exceeds the intensity accumulated so far, then add it on.
            if cur_obj.get("inten") < i:
                cur_obj["frag_hash"] = k
                cur_obj["form"] = form
            cur_obj["inten"] += i
        else:
            # First positive contribution for this m/z.
            cur_obj["inten"] = i
            cur_obj["frag_hash"] = k
            cur_obj["form"] = form

# Largest summed intensity, floored at 1e-9 so the division below is always
# defined. `default=0` covers the case where no peak had positive intensity;
# the previous `max(*[...], 1e-9)` degenerated to `max(1e-9)` there and raised
# TypeError.
max_inten = max(max((v["inten"] for v in mass_to_obj.values()), default=0), 1e-9)

# Rescale intensities so that the strongest peak has intensity 1.
mass_to_obj = {
    k: dict(inten=v["inten"] / max_inten, frag_hash=v["frag_hash"], form=v["form"])
    for k, v in mass_to_obj.items()
}

In [87]:
# Rank the m/z entries by intensity (highest first) and keep the first ten.
top_10_fragments = sorted(
    mass_to_obj.items(),
    key=lambda entry: entry[1]["inten"],
    reverse=True,
)[:10]

# Print the ten most intense fragments.
for mz, fragment_info in top_10_fragments:
    inten, frag_hash, form = (
        fragment_info[field] for field in ("inten", "frag_hash", "form")
    )
    print(f"MZ: {mz}, Intensidad: {inten}, Fragmento Hash: {frag_hash}, Fórmula: {form}")

MZ: 337.1308595079999, Intensidad: 1.0, Fragmento Hash: bcb0f21edf852ed98ce35caa94305f1445777e9cfdfcf1887d8f5f8a464f4c8a, Fórmula: C20NO4H18
MZ: 321.09955937999985, Intensidad: 0.9475625367366349, Fragmento Hash: 63dffda56fa1d50df5cb0a55854d7241b310eedfdd940a2974d3f051a5b471e5, Fórmula: C19NO4H15
MZ: 178.086255044, Intensidad: 0.7670157928701044, Fragmento Hash: e0850b8bf2b2928c2fa62257e1a96d712346e152b99391fbb7a87cd82205107a, Fórmula: C10NO2H8
MZ: 244.07569036, Intensidad: 0.7132755426204191, Fragmento Hash: 95c29adebee64621268ff5e1b2b5d819d023344146ce2e329c1dccaad320bcd8, Fórmula: C17NOH12
MZ: 307.0839093159999, Intensidad: 0.6638795968083683, Fragmento Hash: a592d5549897d650946afe3981a84246e7572e755f58ac1a1e23d9cd62e2057a, Fórmula: C18NO4H12
MZ: 291.08899469599993, Intensidad: 0.6581041122428617, Fragmento Hash: c3b48e57fcf4bb059960b84afb460b45e7cf9ab6b7072568a33c1683e885dc93, Fórmula: C18NO3H12
MZ: 261.07843001199996, Intensidad: 0.6495318794822018, Fragmento Hash: ffc2d7ddaf0afc42

In [88]:
# Split the ten most intense entries into parallel m/z and intensity lists.
_ranked_top_10 = sorted(
    mass_to_obj.items(),
    key=lambda entry: entry[1]["inten"],
    reverse=True,
)[:10]
fragment_mz = [mz_value for mz_value, _info in _ranked_top_10]
fragment_intensidades = [_info["inten"] for _mz_value, _info in _ranked_top_10]

In [89]:
# from pathlib import Path

# test_path= Path("/home/javier_rodriguez/quimica_py/ms-pred/results/dag_nist20/split_1_rnd1/inten_thresh_sweep/10/form_preds/")

In [90]:
# Path of the CSV file that was saved from R. The commented-out assignments are
# alternative input files; exactly one `ruta_csv` assignment is active.
# ruta_csv = "/home/javier_rodriguez/ms2net/quimicaR/datos_spectros/mona_agilent_normalizada.csv"
# ruta_csv = "/home/javier_rodriguez/quimica_py/ms-pred/notebooks/mona_2.csv"
# ruta_csv = "/home/javier_rodriguez/ms2net/quimicaR/datos_spectros/mona_agilent_qtof.csv"
# ruta_csv = "/data/home/javier_rodriguez/quimica_py/ms-pred/notebooks/rendimiento_mona_smile2.csv"

ruta_csv = "/data/home/javier_rodriguez/quimica_py/ms-pred/notebooks/rendimiento_nist20_normalizadoo.csv"


# ruta_csv = "/home/javier_rodriguez/ms2net/quimicaR/datos_spectros/mona_smile_real.csv"

# Load the CSV file into a pandas DataFrame.
df = pd.read_csv(ruta_csv)
# Optional, currently disabled: restrict the run to the first 10 rows.
# df = df.head(10)
# Optional, currently disabled: keep only the rows with a valid SMILES.
# df = df[df['smiles'].apply(lambda x: isinstance(x, str) and Chem.MolFromSmiles(x) is not None)]
# df = df[~df['smiles'].str.contains(",")]


In [91]:
# Spot-check a single SMILES entry (index label 1505) of the loaded table.
# df.columns
df["smiles"][1505]

'CC(=O)NCCCC[C@H](N)C(=O)O'

In [92]:
# for ruta in test_path.glob("*.json"):
#     print(ruta)
#     print(type(ruta))
#     break



In [93]:
# Show which columns the loaded table provides.
column_names = df.columns
print(column_names)

# Switch to the "canopus_*" checkpoint pair; the "nist_*" pair is kept
# commented out for reference.
# inten_ckpt = f"../quickstart/iceberg/models/nist_iceberg_score.ckpt"
# gen_ckpt = f"../quickstart/iceberg/models/nist_iceberg_generate.ckpt"
gen_ckpt = "../quickstart/iceberg/models/canopus_iceberg_generate.ckpt"
inten_ckpt = "../quickstart/iceberg/models/canopus_iceberg_score.ckpt"

# Rebuild the joint model from the selected checkpoints.
model = joint_model.JointModel.from_checkpoints(
    gen_checkpoint=gen_ckpt,
    inten_checkpoint=inten_ckpt,
)


Index(['Name', 'NOTES', 'PRECURSOR_TYPE', 'SPECTRUM_TYPE', 'PRECURSOR_M.Z',
       'INSTRUMENT_TYPE', 'instrument', 'SAMPLE_INLET', 'IONIZATION',
       'COLLISION_GAS', 'COLLISION_ENERGY', 'ION_MODE', 'inchikey', 'SYNONYMS',
       'formula', 'MW', 'EXACT_MASS', 'CASNO', 'NISTNO', 'SPEC_ID', 'ID',
       'CHARGE', 'COMMENT', 'NUM_PEAKS', 'smiles', 'Num.Peaks', 'mz',
       'intensity', 'RELATED_CASNO', 'dataOrigin', 'msLevel', 'MeanIntensity',
       'FragmentMZ', 'FragmentIntensidades', 'Original_SMILES'],
      dtype='object')


In [94]:
# Default checkpoint locations, relative to the notebook's working directory.
_ICEBERG_INTEN_CKPT = "../quickstart/iceberg/models/canopus_iceberg_score.ckpt"
_ICEBERG_GEN_CKPT = "../quickstart/iceberg/models/canopus_iceberg_generate.ckpt"

# Holds the model built by the first `iceberg` call that needed the default one.
_ICEBERG_MODEL_CACHE = {}


def _default_iceberg_model():
    """Return the joint model for the default checkpoints, loading it only once."""
    if "model" not in _ICEBERG_MODEL_CACHE:
        _ICEBERG_MODEL_CACHE["model"] = joint_model.JointModel.from_checkpoints(
            inten_checkpoint=_ICEBERG_INTEN_CKPT, gen_checkpoint=_ICEBERG_GEN_CKPT
        )
    return _ICEBERG_MODEL_CACHE["model"]


def _sum_intensity_by_mz(frags):
    """Sum the positive intensities of all fragments per m/z value.

    The returned dict is ordered by first positive occurrence of each m/z.
    """
    totals = {}
    for frag in frags.values():
        for mz, inten in zip(frag["mz_charge"], frag["intens"]):
            if inten <= 0:
                # Peaks without positive intensity do not contribute.
                continue
            if mz in totals:
                totals[mz] += inten
            else:
                totals[mz] = inten
    return totals


def iceberg(test_smiles, test_ionization, model=None):
    """Predict a fragment spectrum and return it sorted by intensity.

    Calls ``model.predict_mol`` on CPU with ``max_nodes=100``,
    ``binned_out=False`` and ``threshold=0``, sums the predicted intensities
    per m/z value, rescales them so the strongest peak is 1, and sorts the
    peaks from most to least intense.

    Compared with the earlier version, the model is no longer re-read from the
    checkpoint files on every call, and the fragmentation engine and top-10
    list that were built but never used are gone.

    Args:
        test_smiles: SMILES string, passed to ``predict_mol`` as ``smi``.
        test_ionization: Adduct label, passed to ``predict_mol`` as ``adduct``.
        model: Optional object exposing ``predict_mol``. When omitted, the
            joint model for the default checkpoints is loaded on first use and
            reused by later calls.

    Returns:
        A ``(fragment_mz, fragment_intensidades)`` tuple of two parallel lists.
        Both lists are empty when no predicted peak has positive intensity.

    Raises:
        Whatever ``predict_mol`` raises, and ``KeyError`` if the prediction
        lacks the ``"frags"``, ``"mz_charge"`` or ``"intens"`` keys.
    """
    if model is None:
        model = _default_iceberg_model()

    outputs = model.predict_mol(
        smi=test_smiles,
        adduct=test_ionization,
        device="cpu",
        max_nodes=100,
        binned_out=False,
        threshold=0,
    )
    totals = _sum_intensity_by_mz(outputs["frags"])

    # Floor the scale at 1e-9 so the division is always defined. `default=0`
    # handles an empty peak set, which previously raised TypeError.
    max_inten = max(max(totals.values(), default=0), 1e-9)

    # Normalize first and sort on the normalized value (stable, descending),
    # so ties are ordered exactly as before.
    ranked = [(mz, total / max_inten) for mz, total in totals.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    fragment_mz = [mz for mz, _inten in ranked]
    fragment_intensidades = [inten for _mz, inten in ranked]
    return fragment_mz, fragment_intensidades


In [None]:
# Optional clean-up of the SMILES column, currently disabled.
# df['smiles'] = df['smiles'].str.replace(': ', '=')

# One record per input row, collected in row order.
resultados = []

for index, row in df.iterrows():
    test_smiles = row['smiles']
    # test_ionization = row['Precursor_type']
    test_ionization = row['PRECURSOR_TYPE']

    # Rows that cannot be predicted keep None in both result columns.
    fragment_mz = None
    fragment_intensidades = None

    # Only call iceberg for string SMILES values; non-string entries
    # (e.g. NaN read from an empty CSV cell) are skipped.
    if isinstance(test_smiles, str):
        try:
            fragment_mz, fragment_intensidades = iceberg(test_smiles, test_ionization)
        except KeyError as e:
            # Report the failure and leave this row's results as None.
            print(f"Error: {e}")

    resultados.append({
        'SMILES': test_smiles,
        'FragmentMZ': fragment_mz,
        'FragmentIntensidades': fragment_intensidades
    })

# Turn the records into a DataFrame that shares df's index. Column assignment
# aligns on index labels, so a fresh 0..n-1 index would attach results to the
# wrong rows (or produce NaN) whenever df has been filtered. The explicit
# column list also keeps the lookups below valid when df has no rows.
resultados_df = pd.DataFrame(
    resultados,
    index=df.index,
    columns=['SMILES', 'FragmentMZ', 'FragmentIntensidades'],
)

# Attach the predicted spectra to the original DataFrame.
df['FragmentMZ'] = resultados_df['FragmentMZ']
df['FragmentIntensidades'] = resultados_df['FragmentIntensidades']

# Also store the SMILES that was actually used for each prediction.
df['Original_SMILES'] = resultados_df['SMILES']

KeyError: 'molecular_weight'

anged

 Omitted undefined stereo
ndefined stereo
tereo
NG: Charges were rearranged
 undefined stereo
ged
: Omitted undefined stereo
ING: Omitted undefined stereo
ed undefined stereo
ed undefined stereo
 rearranged
defined stereo
 stereo
 stereo
ereo
ted undefined stereo
efined stereo
o
itted undefined stereo
fined stereo
tted undefined stereo
ined stereo
defined stereo
 stereo
undefined stereo
stereo
ndefined stereo
reo
fined stereo
d
ed stereo
ARNING: Charges were rearranged
NG: Charges were rearranged
efined stereo
ted undefined stereo
ned stereo
efined stereo
reo
mitted undefined stereo
o

RNING: Omitted undefined stereo
tted undefined stereo
NING: Omitted undefined stereo
ted undefined stereo
ING: Omitted undefined stereo
ed undefined stereo
NG: Omitted undefined stereo
d undefined stereo
tted undefined stereo
ined stereo
d undefined stereo
d stereo
tereo

ed stereo
tereo
NG: Omitted undefined stereo
d undefined stereo
mitted undefined stereo
efined stereo
d undefined stereo
d ster

In [96]:
# 
# !pip install openpyxl
import openpyxl
# df.to_excel('rendimiento_mona_canopus.xlsx', index=False)


In [97]:
# Print the installed pandas version.
import pandas as pd
print(pd.__version__)


2.0.3


In [98]:
# Alternative output file name, currently disabled.
# df.to_csv("rendimiento_nist20_normalizadoo.csv", index=False)

# Write df to a CSV file in the current working directory, without the index
# column.
df.to_csv("rendimiento_nist_canopus.csv", index=False)