In [1]:
import random
import umap
import numpy as np
import h5py
import torch
import pandas as pd
import matplotlib.pyplot as plt
from dreams.utils.data import MSData
from dreams.api import dreams_embeddings
from dreams.utils.plots import init_plotting
from dreams.utils.mols import formula_type
from dreams.definitions import DREAMS_EMBEDDING
from matchms.importing import load_from_mgf
from matchms.exporting import save_as_mgf
from mol2dreams.featurizer.featurize import MoleculeFeaturizer
from mol2dreams.featurizer.atom_features import AtomFeaturizer
from mol2dreams.featurizer.bond_features import BondFeaturizer

  from .autonotebook import tqdm as notebook_tqdm
Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [7]:
msdata = MSData.load('../../data/data/MassSpecGym_DreaMS.hdf5', in_mem=True)
embs = msdata[DREAMS_EMBEDDING]
embs.shape

Loading dataset MassSpecGym_DreaMS into memory (213548 spectra)...


(213548, 1024)

## Connect

In [9]:
bond_config = {
    'features': {
        'bond_type': True,
        'conjugated': True,
        'in_ring': True,
        'stereochemistry': False,
    }
}

atom_config = {
    'features': {
        'atom_symbol': True,
        'total_valence': True,
        'aromatic': True,
        'hybridization': True,
        'formal_charge': True,
        'default_valence': True,
        'ring_size': True,        
        'hydrogen_count': True,
    },
    'feature_attributes': {
        'atom_symbol': {
            'top_n_atoms': 42,     
            'include_other': True,    
        },

    }
}

In [7]:
spectrum_embedding_size = 1024 
featurizer = MoleculeFeaturizer(atom_config, bond_config, spectrum_embedding_size)

In [8]:
spectra_path = "../../data/data/MassSpecGym.mgf"
spectra = list(load_from_mgf(spectra_path))
spectra_small = spectra[:50] 

In [7]:
msdata = MSData.load('../../data/data/MassSpecGym_DreaMS.hdf5', in_mem=True)
embs = msdata[DREAMS_EMBEDDING]  

Loading dataset MassSpecGym_DreaMS into memory (213548 spectra)...


In [15]:
msdata.columns()

['COLLISION_ENERGY',
 'DreaMS_embedding',
 'FOLD',
 'FORMULA',
 'IDENTIFIER',
 'INCHIKEY',
 'INSTRUMENT_TYPE',
 'PARENT_MASS',
 'PRECURSOR_FORMULA',
 'SIMULATION_CHALLENGE',
 'adduct',
 'precursor_mz',
 'smiles',
 'spectrum']

In [14]:
msdata.to_pandas()

Unnamed: 0,COLLISION_ENERGY,FOLD,FORMULA,IDENTIFIER,INCHIKEY,INSTRUMENT_TYPE,PARENT_MASS,PRECURSOR_FORMULA,SIMULATION_CHALLENGE,adduct,precursor_mz,smiles,spectrum
0,30.0,train,C16H17NO4,MassSpecGymID0000001,VFMQMACUYWGDOJ,Orbitrap,287.115224,C16H18NO4,True,[M+H]+,288.1225,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,"[[91.0542, 125.0233, 154.0499, 155.0577, 185.0..."
1,20.0,train,C16H17NO4,MassSpecGymID0000002,VFMQMACUYWGDOJ,Orbitrap,287.115224,C16H18NO4,True,[M+H]+,288.1225,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,"[[91.0542, 125.0233, 155.0577, 185.0961, 229.0..."
2,40.0,train,C16H17NO4,MassSpecGymID0000003,VFMQMACUYWGDOJ,Orbitrap,287.115224,C16H18NO4,True,[M+H]+,288.1225,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,"[[69.0343, 91.0542, 125.0233, 127.039, 153.069..."
3,55.0,train,C16H17NO4,MassSpecGymID0000004,VFMQMACUYWGDOJ,Orbitrap,287.115224,C16H18NO4,True,[M+H]+,288.1225,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,"[[69.0343, 91.0542, 110.06, 111.0441, 112.0393..."
4,10.0,train,C16H17NO4,MassSpecGymID0000005,VFMQMACUYWGDOJ,Orbitrap,287.115224,C16H18NO4,True,[M+H]+,288.1225,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,"[[91.0542, 125.0233, 185.0961, 229.0859, 246.1..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
213543,,train,C27H30O16,MassSpecGymID0414163,OTUCXMIQUNROBJ,QTOF,610.153724,C27H31O16,False,[M+H]+,611.1610,CC1[C@@H]([C@@H](C([C@@H](O1)OC2=CC(=C3C(=C2)O...,"[[303.049713, 449.108185, 611.161194], [1.0, 0..."
213544,,val,C46H77NO17,MassSpecGymID0414168,WBPYTXDJUQJLPQ,QTOF,915.521724,C46H78NO17,False,[M+H]+,916.5290,CC[C@@H]1[C@H](/C=C(/C=C\C(=O)[C@@H](C[C@@H]([...,"[[55.054474, 58.029369, 58.065601, 59.049339, ..."
213545,,val,C21H43N5O7,MassSpecGymID0414172,CEAZRRDELHUEMR,QTOF,477.317724,C21H44N5O7,False,[M+H]+,478.3250,C[C@H]([C@@H]1CC[C@H]([C@H](O1)O[C@@H]2[C@H](C...,"[[72.080132, 102.233917, 113.082344, 113.12149..."
213546,,val,C21H43N5O7,MassSpecGymID0414173,CEAZRRDELHUEMR,QTOF,477.317724,C21H44N5O7,False,[M+H]+,478.3250,C[C@H]([C@@H]1CC[C@H]([C@H](O1)O[C@@H]2[C@H](C...,"[[55.053627, 56.455425, 58.065819, 67.053543, ..."


In [13]:
msdata['smiles']

['CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC',
 'C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=O)O1',
 'C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=O)O1',
 'C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=O)O1',
 'C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=O)O1',
 'C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=O)O1',
 'C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=O)O1',
 'C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=O)O1',
 'C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=O)O1',
 'C[C@H]1CCCC(=O)C

In [17]:
hdf5_path = "../../data/data/MassSpecGym_DreaMS.hdf5"

In [18]:
with h5py.File(hdf5_path, 'r') as h5f:
    data_dict = {key: h5f[key][:] for key in h5f.keys()}

df = pd.DataFrame(data_dict)

# Display unique folds
print("Unique folds:", df['FOLD'].unique())

ValueError: Per-column arrays must each be 1-dimensional

In [None]:
msdata = MSData.load('../../data/data/MassSpecGym_DreaMS.hdf5', in_mem=True)
embs = msdata[DREAMS_EMBEDDING]  

In [None]:
'IDENTIFIER'

In [11]:
if isinstance(embs, torch.Tensor):
    embs = embs.numpy()
elif not isinstance(embs, np.ndarray):
    embs = np.array(embs)

In [16]:
list_smiles = []
for spectrum in spectra:
    entry = {}
    entry['smiles'] = spectrum.metadata.get('smiles', None)
    list_smiles.append(entry)

# Verify that the number of embeddings matches the number of spectra
assert len(list_smiles) == embs.shape[0], "Mismatch between number of spectra and embeddings."

# Featurize the dataset
data_list = featurizer.featurize_dataset(list_smiles, embs)

print(f"Number of successfully featurized molecules: {len(data_list)}")

AssertionError: Mismatch between number of spectra and embeddings.

In [17]:
len(list_smiles), embs.shape[0]

(231104, 213548)

In [10]:
type(embs)

numpy.ndarray

## Computing embeddings

In [2]:
def extract_first_n_spectra(original_mgf_path, new_mgf_path, n=50):

    spectra = load_from_mgf(original_mgf_path)

    first_n_spectra = []
    for i, spectrum in enumerate(spectra):
        if i >= n:
            break
        first_n_spectra.append(spectrum)

    save_as_mgf(first_n_spectra, new_mgf_path)
    print(f"Extracted {len(first_n_spectra)} spectra and saved to {new_mgf_path}")

# # Example usage
# original_mgf = "../../data/data/MassSpecGym.mgf"
# new_mgf = "../../data/data/MassSpecGym_first50.mgf"
# extract_first_n_spectra(original_mgf, new_mgf, n=50)

Extracted 50 spectra and saved to ../../data/data/MassSpecGym_first50.mgf


In [2]:
new_mgf = "../../data/data/MassSpecGym_first50.mgf"
dreams_embs_50 = dreams_embeddings(new_mgf)

Computing DreaMS embedding: 100%|██████████| 50/50 [00:01<00:00, 42.73it/s]


In [3]:
# hdf5_output = "../../data/data/MassSpecGym_first50_embeddings.hdf5"

In [4]:
# with h5py.File(hdf5_output, 'w') as h5f:
#     h5f.create_dataset("DREAMS_EMBEDDING", data=dreams_embs_50)

In [6]:
# msdata = MSData.load(hdf5_output, in_mem=True)
# embs = msdata[DREAMS_EMBEDDING]  

ValueError: Column "spectrum" is not present in the dataset ../../data/data/MassSpecGym_first50_embeddings.hdf5.

## Preparing dataset

In [2]:
def prepare_datasets(msdata, embs, splits=['train', 'valid', 'test'], 
                    smiles_col='smiles', embedding_col='DreaMS_embedding', fold_col='FOLD'):
    """
    Prepares train, validation, and test datasets from MSData and embeddings.

    Args:
        msdata (MSData): The MSData object loaded from HDF5.
        embs (np.ndarray or torch.Tensor): Embeddings matrix with shape [num_samples, embedding_size].
        splits (list of str): List of fold names to extract. Default is ['train', 'valid', 'test'].
        smiles_col (str): Column name for SMILES strings in the DataFrame. Default is 'smiles'.
        embedding_col (str): Column name for embeddings in the DataFrame. Default is 'DreaMS_embedding'.
        fold_col (str): Column name for fold attribute in the DataFrame. Default is 'FOLD'.

    Returns:
        dict: A dictionary where keys are split names and values are lists of dictionaries 
              with 'smiles' and 'embedding'.
    """
    # Convert msdata to pandas DataFrame
    df = msdata.to_pandas()
    
    # Check alignment
    num_rows = df.shape[0]
    if embs.shape[0] != num_rows:
        raise ValueError(f"Number of embeddings ({embs.shape[0]}) does not match number of data samples ({num_rows}).")
    
    # Assign embeddings to the DataFrame
    # Ensure embeddings are numpy arrays
    if isinstance(embs, torch.Tensor):
        embs = embs.numpy()
    elif not isinstance(embs, np.ndarray):
        embs = np.array(embs)
    
    # Add embedding as a new column
    df[embedding_col] = list(embs)
    
    # Initialize dictionary to hold datasets
    datasets = {split: [] for split in splits}
    
    # Iterate over each split
    for split in splits:
        split_df = df[df[fold_col] == split].reset_index(drop=True)
        print(f"Processing split '{split}' with {len(split_df)} samples.")
        
        # Iterate over rows and collect 'smiles' and 'embedding'
        for idx, row in split_df.iterrows():
            smiles = row.get(smiles_col, None)
            embedding = row.get(embedding_col, None)
            
            # Skip if SMILES is missing or embedding is missing
            if pd.isna(smiles):
                print(f"Skipping row {idx} due to missing SMILES.")
                continue
            if embedding is None or len(embedding) != embs.shape[1]:
                print(f"Skipping row {idx} due to invalid embedding.")
                continue
            
            datasets[split].append({
                'smiles': smiles,
                'embedding': embedding
            })
    
    # Optionally, handle additional splits present in the DataFrame
    unique_folds = df[fold_col].unique()
    additional_folds = set(unique_folds) - set(splits)
    for split in additional_folds:
        split_df = df[df[fold_col] == split].reset_index(drop=True)
        print(f"Processing additional split '{split}' with {len(split_df)} samples.")
        datasets[split] = []
        for idx, row in split_df.iterrows():
            smiles = row.get(smiles_col, None)
            embedding = row.get(embedding_col, None)
            if pd.isna(smiles):
                print(f"Skipping row {idx} in split '{split}' due to missing SMILES.")
                continue
            if embedding is None or len(embedding) != embs.shape[1]:
                print(f"Skipping row {idx} in split '{split}' due to invalid embedding.")
                continue
            datasets[split].append({
                'smiles': smiles,
                'embedding': embedding
            })
    
    return datasets

In [3]:
hdf5_path = "../../data/data/MassSpecGym_DreaMS.hdf5"
msdata = MSData.load(hdf5_path, in_mem=True)

Loading dataset MassSpecGym_DreaMS into memory (213548 spectra)...


In [7]:
embs = msdata[DREAMS_EMBEDDING]  

# Prepare datasets
datasets = prepare_datasets(
    msdata=msdata, 
    embs=embs, 
    splits=['train', 'val'],  # Include 'test' if present
    smiles_col='smiles', 
    embedding_col='DreaMS_embedding', 
    fold_col='FOLD'
)


Processing split 'train' with 194119 samples.
Processing split 'val' with 19429 samples.


In [8]:
datasets.keys()

dict_keys(['train', 'val'])

In [None]:
spectrum_embedding_size = 1024 
featurizer = MoleculeFeaturizer(atom_config, bond_config, spectrum_embedding_size)

data_list_train = featurizer.featurize_dataset(datasets['train'], 
                                              [entry['embedding'] for entry in datasets['train']])
data_list_valid = featurizer.featurize_dataset(datasets['valid'], 
                                              [entry['embedding'] for entry in datasets['valid']])

print(f"Number of successfully featurized training molecules: {len(data_list_train)}")
print(f"Number of successfully featurized validation molecules: {len(data_list_valid)}")

# Try torch dataloader 

In [None]:

from torch_geometric.loader import DataLoader

batch_size_train = 32
batch_size_valid = 32

loader_train = DataLoader(
    data_list_train, 
    batch_size=batch_size_train, 
    shuffle=True, 
    num_workers=1  
)

loader_valid = DataLoader(
    data_list_valid, 
    batch_size=batch_size_valid, 
    shuffle=False, 
    num_workers=1  
)


for batch in loader_train:
    print(batch)
    print(f"Batch size: {batch.num_graphs}")  
    print(f"Node feature shape: {batch.x.shape}")          
    print(f"Edge index shape: {batch.edge_index.shape}")  
    print(f"Edge feature shape: {batch.edge_attr.shape}") 
    print(f"Spectrum embedding shape: {batch.y.shape}")  
    break  