In [19]:
from dreams.api import dreams_embeddings
from matchms.importing import load_from_mgf
from matchms import Spectrum
from matchms.exporting import save_as_mgf

from dreams.definitions import DREAMS_EMBEDDING
from dreams.utils.data import MSData

In [2]:
massspectgym_path ="/workspace/mol2DreaMS/data/data/MassSpecGym.mgf"

In [6]:
dreams_embs = dreams_embeddings(massspectgym_path, prec_mz_col='PRECURSOR_MZ')

Computing DreaMS embedding: 100%|██████████| 231104/231104 [05:43<00:00, 673.74it/s]


In [9]:
dreams_embs.shape

(231104, 1024)

In [8]:
dreams_embs

array([[ 0.22195248, -0.5375411 , -0.5991391 , ..., -0.45315918,
         0.80612165,  1.337087  ],
       [-0.28255856, -0.4625993 , -0.76268137, ..., -0.2596016 ,
         0.61134255,  0.64017355],
       [ 0.11121915, -0.35258758, -0.4906476 , ..., -0.5522816 ,
         0.2577802 ,  0.9908015 ],
       ...,
       [-0.39851522, -0.84326875, -0.6090119 , ..., -1.122334  ,
         0.26077583, -0.7364357 ],
       [-1.1556643 ,  0.2024601 , -0.24088667, ..., -0.58375216,
        -1.0787734 , -0.89217705],
       [ 0.20935775,  0.70727056, -0.2958385 , ..., -1.0314586 ,
        -0.35540977, -0.85699624]], dtype=float32)

In [12]:
spectra_from_path = list(load_from_mgf(massspectgym_path))

In [14]:
if len(spectra_from_path) != dreams_embs.shape[0]:
    raise ValueError("The number of embeddings does not match the number of spectra.")

for idx, (spectrum, embedding) in enumerate(zip(spectra_from_path, dreams_embs)):
    spectrum.set("dreams_embedding", embedding.tolist())  

print("All spectra have been updated with 'dreams_embedding' attributes.")

All spectra have been updated with 'dreams_embedding' attributes.


In [15]:
output_mgf_path = "/workspace/mol2DreaMS/data/data/Enveda/DreaMS_MassSpecGym.mgf"

# save_as_mgf(spectra_from_path, output_mgf_path)

dict_keys(['spectra'])


In [25]:
loaded_spectra_from_path = list(load_from_mgf(output_mgf_path))

In [29]:
# spectra_from_path[0].metadata

In [20]:
hdf5_path = "../../data/data/MassSpecGym_DreaMS.hdf5"
msdata = MSData.from_hdf5(hdf5_path, prec_mz_col='precursor_mz')
embs = msdata[DREAMS_EMBEDDING]

In [22]:
msdata.columns()

['COLLISION_ENERGY',
 'DreaMS_embedding',
 'FOLD',
 'FORMULA',
 'IDENTIFIER',
 'INCHIKEY',
 'INSTRUMENT_TYPE',
 'PARENT_MASS',
 'PRECURSOR_FORMULA',
 'SIMULATION_CHALLENGE',
 'adduct',
 'precursor_mz',
 'smiles',
 'spectrum']

In [27]:
identifiers_hdf5 = msdata['IDENTIFIER']
embeddings_hdf5 = msdata['DreaMS_embedding']

In [28]:
print(f"Number of spectra in HDF5: {len(identifiers_hdf5)}")

Number of spectra in HDF5: 213548


In [33]:
# Create a mapping from IDENTIFIER to DreaMS_embedding in HDF5
identifier_to_embedding_hdf5 = {
    identifier: embedding for identifier, embedding in zip(identifiers_hdf5, embeddings_hdf5)
}

# Create a mapping from IDENTIFIER to dreams_embedding in MGF
identifier_to_embedding_mgf = {
    spectrum.get("identifier"): spectrum.get("dreams_embedding") for spectrum in loaded_spectra_from_path
}

# Verify the mappings
print(f"Number of identifiers in HDF5 mapping: {len(identifier_to_embedding_hdf5)}")
print(f"Number of identifiers in MGF mapping: {len(identifier_to_embedding_mgf)}")

Number of identifiers in HDF5 mapping: 213548
Number of identifiers in MGF mapping: 231104


In [38]:
import json

In [39]:
# Function to parse the dreams_embedding string to a list
def parse_embedding(embedding_str):
    try:
        return json.loads(embedding_str)
    except json.JSONDecodeError:
        # Handle cases where the embedding is not a valid JSON string
        # For example, if it's stored as a plain string with space-separated values
        # Uncomment the following lines if needed:
        # return list(map(float, embedding_str.strip('[]').split(',')))
        raise ValueError("Embedding string is not in valid JSON format.")

# Create a mapping from IDENTIFIER to dreams_embedding
identifier_to_embedding_mgf = {}
for spectrum in loaded_spectra_from_path:
    identifier = spectrum.get("identifier")
    embedding_str = spectrum.get("dreams_embedding")
    if identifier is None or embedding_str is None:
        continue  # Skip spectra without IDENTIFIER or dreams_embedding
    embedding_list = parse_embedding(embedding_str)
    identifier_to_embedding_mgf[identifier] = embedding_list

print(f"Number of identifiers in MGF mapping: {len(identifier_to_embedding_mgf)}")

Number of identifiers in MGF mapping: 231104


In [40]:
# Create a mapping from IDENTIFIER to DreaMS_embedding in HDF5
identifier_to_embedding_hdf5 = {
    identifier: embedding for identifier, embedding in zip(identifiers_hdf5, embeddings_hdf5)
}

print(f"Number of identifiers in HDF5 mapping: {len(identifier_to_embedding_hdf5)}")

Number of identifiers in HDF5 mapping: 213548


In [41]:
# Get sets of identifiers
identifiers_mgf_set = set(identifier_to_embedding_mgf.keys())
identifiers_hdf5_set = set(identifier_to_embedding_hdf5.keys())

# Find overlapping identifiers
overlapping_identifiers = identifiers_mgf_set.intersection(identifiers_hdf5_set)

print(f"Number of overlapping identifiers: {len(overlapping_identifiers)}")

Number of overlapping identifiers: 213548


In [43]:
total_overlapping = len(overlapping_identifiers)
matched_embeddings = 0
mismatched_embeddings = 0

# List to store mismatched identifiers
mismatched_ids = []

# Iterate over overlapping identifiers and compare embeddings
for identifier in overlapping_identifiers:
    embedding_mgf = np.array(identifier_to_embedding_mgf[identifier])
    embedding_hdf5 = np.array(identifier_to_embedding_hdf5[identifier])
    
    # Check if embeddings are identical within a tolerance
    if np.allclose(embedding_mgf, embedding_hdf5, atol=1e-5):
        matched_embeddings += 1
    else:
        mismatched_embeddings += 1
        mismatched_ids.append(identifier)

# Summary of comparison
print("\n--- Embedding Comparison Summary ---")
print(f"Total overlapping identifiers: {total_overlapping}")
print(f"Matched embeddings: {matched_embeddings}")
print(f"Mismatched embeddings: {mismatched_embeddings}")


--- Embedding Comparison Summary ---
Total overlapping identifiers: 213548
Matched embeddings: 213548
Mismatched embeddings: 0


# enveda_dadta

In [44]:
import pandas as pd

In [62]:
dataframe_path = "/workspace/mol2DreaMS/data/data/Enveda/enveda_spec_df.csv"

In [63]:
df = pd.read_csv(dataframe_path)

In [64]:
df.columns

Index(['Unnamed: 0', 'spec_id', 'mol_id', 'group_id', 'inst_type', 'prec_type',
       'ace', 'prec_mz', 'peaks', 'smiles'],
      dtype='object')

In [65]:
df

Unnamed: 0.1,Unnamed: 0,spec_id,mol_id,group_id,inst_type,prec_type,ace,prec_mz,peaks,smiles
0,0,0,0,0,Orbitrap,[M+H]+,60.000000,417.230011,"[(227.07, 0.03), (240.078, 0.007), (241.086, 0...",COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...
1,1,1,0,0,Orbitrap,[M+H]+,20.000000,417.230011,"[(316.131, 0.04), (347.149, 0.095), (373.155, ...",COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...
2,2,2,0,0,Orbitrap,[M+H]+,10.000000,417.230011,"[(399.141, 0.01), (402.204, 0.006), (417.182, ...",COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...
3,3,3,0,0,Orbitrap,[M+H]+,80.000000,417.230011,"[(169.065, 0.017), (181.065, 0.051), (197.06, ...",COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...
4,4,4,0,0,Orbitrap,[M+H]+,80.000000,417.230011,"[(181.065, 0.014), (197.06, 0.245), (198.067, ...",COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...
...,...,...,...,...,...,...,...,...,...,...
154469,154469,154861,9117,17415,QTOF,[M-H]-,10.000000,276.064172,"[(68.995758, 0.65665666), (69.039554, 0.011011...",O=C(Nc1ccc2c(c1)Cc1ccccc1-2)C(F)(F)F
154470,154470,154862,9118,17416,QTOF,[M-H]-,45.000000,899.565430,"[(223.002, 0.03), (240.995, 0.016), (241.015, ...",CCC=CCC=CCC=CCC=CCCCCCCC(=O)OC(COC(=O)CCCCCCCC...
154471,154471,154863,9118,17416,QTOF,[M-H]-,45.000000,899.565400,"[(223.002, 0.02902903), (240.995, 0.01501502),...",CCC=CCC=CCC=CCC=CCCCCCCC(=O)OC(COC(=O)CCCCCCCC...
154472,154472,154864,9119,17417,Orbitrap,[M+H]+,14.403912,240.065200,"[(56.0498, 1.0), (84.0556, 0.23923924), (90.03...",O=NNC1=NCCN1Cc1ccc(Cl)nc1


In [60]:
df['prec_type'].unique()

array(['[M+H]+', '[M-H]-'], dtype=object)

In [53]:
import numpy as np
from matchms import Spectrum
from dreams.utils.data import MSData

# Function to convert peaks to m/z and intensity lists
def convert_peaks(peaks_str):
    # If peaks are stored as strings, parse them
    if isinstance(peaks_str, str):
        try:
            peaks = json.loads(peaks_str.replace("'", "\""))
        except json.JSONDecodeError:
            # Handle other formats if necessary
            peaks = eval(peaks_str)
    else:
        peaks = peaks_str  # Assuming it's already a list of tuples

    mz, intensity = zip(*peaks) if peaks else ([], [])
    return list(mz), list(intensity)

# Apply the conversion to the 'peaks' column
df[['mz', 'intensity']] = df['peaks'].apply(lambda x: pd.Series(convert_peaks(x)))

# Create an 'IDENTIFIER' column based on 'spec_id'
df['IDENTIFIER'] = 'EnvedaID' + df['spec_id'].astype(str).str.zfill(7)

# Select necessary columns for MSData
msdata_df = df[['IDENTIFIER', 'prec_mz', 'mz', 'intensity', 'smiles']].copy()
msdata_df = msdata_df.rename(columns={
    'prec_mz': 'PRECURSOR_MZ',
    'mz': 'spectrum_mz',
    'intensity': 'spectrum_intensity'
})

# Create the 'spectrum' column as a list of two lists: [mz, intensity]
msdata_df['spectrum'] = msdata_df.apply(lambda row: [row['spectrum_mz'], row['spectrum_intensity']], axis=1)

# Drop intermediate columns
msdata_df = msdata_df.drop(columns=['spectrum_mz', 'spectrum_intensity'])

# Display the transformed DataFrame
print("\nTransformed DataFrame for MSData:")
print(msdata_df.head())


Transformed DataFrame for MSData:
        IDENTIFIER  PRECURSOR_MZ  \
0  EnvedaID0000000    417.230011   
1  EnvedaID0000001    417.230011   
2  EnvedaID0000002    417.230011   
3  EnvedaID0000003    417.230011   
4  EnvedaID0000004    417.230011   

                                              smiles  \
0  COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...   
1  COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...   
2  COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...   
3  COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...   
4  COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...   

                                            spectrum  
0  [[227.07, 240.078, 241.086, 242.094, 243.065, ...  
1  [[316.131, 347.149, 373.155, 386.209, 402.204,...  
2  [[399.141, 402.204, 417.182, 417.227, 423.109]...  
3  [[169.065, 181.065, 197.06, 198.068, 199.075, ...  
4  [[181.065, 197.06, 198.067, 199.075, 211.075, ...  


In [55]:
msdata_df['spectrum'].iloc[0]

[[227.07,
  240.078,
  241.086,
  242.094,
  243.065,
  243.102,
  255.065,
  255.102,
  257.081,
  258.089,
  269.081,
  270.089,
  271.06,
  271.096,
  272.104,
  273.112,
  282.089,
  283.096,
  285.076,
  285.112,
  286.084,
  287.091,
  289.107,
  298.12,
  299.091,
  300.099,
  301.107,
  315.123,
  316.13,
  317.102,
  341.138],
 [0.03,
  0.007,
  0.117,
  0.365,
  0.019,
  0.007,
  0.008,
  0.054,
  0.087,
  0.172,
  0.019,
  0.156,
  0.022,
  0.092,
  0.022,
  0.124,
  0.016,
  0.032,
  0.039,
  0.659,
  0.137,
  0.008,
  0.041,
  0.011,
  0.04,
  0.069,
  1.0,
  0.079,
  0.177,
  0.102,
  0.01]]

### convert csv to mgf

In [66]:
import pandas as pd
import ast
import os

In [67]:
dataframe_path = "/workspace/mol2DreaMS/data/data/Enveda/enveda_spec_df.csv"

df = pd.read_csv(dataframe_path)

print(df.head())

   Unnamed: 0  spec_id  mol_id  group_id inst_type prec_type   ace  \
0           0        0       0         0  Orbitrap    [M+H]+  60.0   
1           1        1       0         0  Orbitrap    [M+H]+  20.0   
2           2        2       0         0  Orbitrap    [M+H]+  10.0   
3           3        3       0         0  Orbitrap    [M+H]+  80.0   
4           4        4       0         0  Orbitrap    [M+H]+  80.0   

      prec_mz                                              peaks  \
0  417.230011  [(227.07, 0.03), (240.078, 0.007), (241.086, 0...   
1  417.230011  [(316.131, 0.04), (347.149, 0.095), (373.155, ...   
2  417.230011  [(399.141, 0.01), (402.204, 0.006), (417.182, ...   
3  417.230011  [(169.065, 0.017), (181.065, 0.051), (197.06, ...   
4  417.230011  [(181.065, 0.014), (197.06, 0.245), (198.067, ...   

                                              smiles  
0  COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...  
1  COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C...  
2  CO

In [68]:
adduct_to_charge = {
    '[M+H]+': 1,
    '[M-H]-': -1
}

df['CHARGE'] = df['prec_type'].map(adduct_to_charge)

if df['CHARGE'].isnull().any():
    unmapped = df[df['CHARGE'].isnull()]['prec_type'].unique()
    raise ValueError(f"Unmapped adduct types found: {unmapped}")

In [69]:
df['IDENTIFIER'] = df['spec_id'].apply(lambda x: f"EnvedaID{int(x):07d}")

In [70]:
# Function to safely parse the 'peaks' string into a list of tuples
def parse_peaks(peaks_str):
    try:
        peaks = ast.literal_eval(peaks_str)
        if isinstance(peaks, list) and all(isinstance(t, tuple) for t in peaks):
            return peaks
        else:
            raise ValueError
    except (ValueError, SyntaxError):
        print(f"Malformed peaks entry: {peaks_str}")
        return []

In [71]:
df['parsed_peaks'] = df['peaks'].apply(parse_peaks)

In [73]:
additional_metadata = ['mol_id', 'group_id', 'inst_type', 'ace']

In [74]:
# Path to save the MGF file
mgf_output_path = "/workspace/mol2DreaMS/data/data/Enveda/enveda_spec_df.mgf"

In [75]:
# Open the MGF file for writing
with open(mgf_output_path, 'w') as mgf_file:
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        # Begin IONS block
        mgf_file.write("BEGIN IONS\n")
        
        # Write mandatory and additional metadata
        mgf_file.write(f"IDENTIFIER={row['IDENTIFIER']}\n")
        mgf_file.write(f"SMILES={row['smiles']}\n")
        mgf_file.write(f"ADDUCT={row['prec_type']}\n")
        mgf_file.write(f"CHARGE={row['CHARGE']}\n")
        mgf_file.write(f"PRECURSOR_MZ={row['prec_mz']}\n")
        
        # Include additional metadata
        for meta in additional_metadata:
            mgf_file.write(f"{meta.upper()}={row[meta]}\n")
        
        # Write the peaks
        for mz, intensity in row['parsed_peaks']:
            mgf_file.write(f"{mz} {intensity}\n")
        
        # End IONS block
        mgf_file.write("END IONS\n\n")

In [76]:
spectra_from_path = list(load_from_mgf(mgf_output_path))

In [77]:
len(spectra_from_path)

154474

In [78]:
dreams_embs = dreams_embeddings(mgf_output_path, prec_mz_col='PRECURSOR_MZ')

Computing DreaMS embedding: 100%|██████████| 154474/154474 [03:50<00:00, 671.10it/s]


In [79]:
if len(spectra_from_path) != dreams_embs.shape[0]:
    raise ValueError("The number of embeddings does not match the number of spectra.")

for idx, (spectrum, embedding) in enumerate(zip(spectra_from_path, dreams_embs)):
    spectrum.set("dreams_embedding", embedding.tolist())  

print("All spectra have been updated with 'dreams_embedding' attributes.")

All spectra have been updated with 'dreams_embedding' attributes.


In [80]:
output_mgf_path = "/workspace/mol2DreaMS/data/data/Enveda/DreaMS_Enveda_spec.mgf"

# save_as_mgf(spectra_from_path, output_mgf_path)

dict_keys(['spectra'])


In [81]:
loaded_spectra_from_path = list(load_from_mgf("/workspace/mol2DreaMS/data/data/Enveda/DreaMS_Enveda_spec.mgf"))

In [83]:
loaded_spectra_from_path[0].metadata

{'charge': 1,
 'identifier': 'EnvedaID0000000',
 'smiles': 'COc1cc2c(c(OC)c1OC)-c1c(cc(OC)c(OC)c1OC)CC(C)C(C)C2',
 'adduct': '[M+H]+',
 'precursor_mz': 417.230010986328,
 'mol_id': '0',
 'group_id': '0',
 'inst_type': 'Orbitrap',
 'ace': '60.0',
 'dreams_embedding': '[0.8425571322441101, 0.0918535366654396, -1.1419888734817505, 0.6671642661094666, -0.0880361944437027, 0.6634528040885925, -0.6003973484039307, -0.2649351954460144, 0.36253616213798523, 0.2710038423538208, -0.8484073877334595, -1.0968399047851562, 0.027199842035770416, 0.9952453374862671, 0.693976879119873, -0.11301381140947342, 0.2924557328224182, 0.39215439558029175, -1.6976791620254517, 0.5363748669624329, 1.437273621559143, -0.21050402522087097, 0.3839256763458252, -0.8289666771888733, -0.9512644410133362, -0.529186487197876, -0.47029027342796326, -0.1632167398929596, -0.4269216060638428, -0.6551788449287415, 1.039275884628296, 0.6268438696861267, 1.215273141860962, 0.0648469477891922, 1.4068809747695923, 0.35655584931