In [1]:
import random
from cgitb import small

import tqdm
import numpy as np
import h5py
import torch
import pandas as pd
import matplotlib.pyplot as plt
from dreams.utils.data import MSData
from dreams.api import dreams_embeddings
from dreams.utils.plots import init_plotting
from dreams.utils.mols import formula_type
from dreams.definitions import DREAMS_EMBEDDING
from matchms.importing import load_from_mgf
from matchms.exporting import save_as_mgf
from mol2dreams.featurizer.featurize import MoleculeFeaturizer
from mol2dreams.utils.data import prepare_datasets
from mol2dreams.featurizer.atom_features import AtomFeaturizer
from mol2dreams.featurizer.bond_features import BondFeaturizer

  from cgitb import small
  from .autonotebook import tqdm as notebook_tqdm
Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [2]:
hdf5_path = "../../data/data/MassSpecGym_DreaMS.hdf5"
msdata = MSData.from_hdf5(hdf5_path, prec_mz_col='precursor_mz')
embs = msdata[DREAMS_EMBEDDING]
embs.shape

(213548, 1024)

## Connect

In [13]:
bond_config = {
    'features': {
        'bond_type': True,
        'conjugated': True,
        'in_ring': True,
        'stereochemistry': False,
    }
}

atom_config = {
    'features': {
        'atom_symbol': True,
        'total_valence': True,
        'aromatic': True,
        'hybridization': True,
        'formal_charge': True,
        'default_valence': True,
        'ring_size': True,        
        'hydrogen_count': True,
    },
    'feature_attributes': {
        'atom_symbol': {
            'top_n_atoms': 42,     
            'include_other': True,    
        },

    }
}

## Computing embeddings

In [4]:
def extract_first_n_spectra(original_mgf_path, new_mgf_path, n=50):

    spectra = load_from_mgf(original_mgf_path)

    first_n_spectra = []
    for i, spectrum in enumerate(spectra):
        if i >= n:
            break
        first_n_spectra.append(spectrum)

    save_as_mgf(first_n_spectra, new_mgf_path)
    print(f"Extracted {len(first_n_spectra)} spectra and saved to {new_mgf_path}")

# # Example usage
# original_mgf = "../../data/data/MassSpecGym.mgf"
# new_mgf = "../../data/data/MassSpecGym_first50.mgf"
# extract_first_n_spectra(original_mgf, new_mgf, n=50)

In [17]:
new_mgf = "../../data/data/MassSpecGym_first50.mgf"
dreams_embs_50 = dreams_embeddings(new_mgf, prec_mz_col='PRECURSOR_MZ')

Computing DreaMS embedding: 100%|██████████| 50/50 [00:01<00:00, 43.97it/s]


## Preparing dataset

In [4]:
msdata.columns()

['COLLISION_ENERGY',
 'DreaMS_embedding',
 'FOLD',
 'FORMULA',
 'IDENTIFIER',
 'INCHIKEY',
 'INSTRUMENT_TYPE',
 'PARENT_MASS',
 'PRECURSOR_FORMULA',
 'SIMULATION_CHALLENGE',
 'adduct',
 'precursor_mz',
 'smiles',
 'spectrum']

In [5]:
embs = msdata[DREAMS_EMBEDDING]  
extra_features = ['COLLISION_ENERGY', 'adduct', 'precursor_mz']
# Prepare datasets
datasets = prepare_datasets(
    msdata=msdata, 
    embs=embs, 
    splits=['train', 'val'],  # Include 'test' if present
    smiles_col='smiles', 
    embedding_col='DreaMS_embedding', 
    fold_col='FOLD'
)


Processing split 'train' with 194119 samples.


Featurizing train: 100%|██████████| 194119/194119 [00:03<00:00, 52709.04it/s]


Processing split 'val' with 19429 samples.


Featurizing val: 100%|██████████| 19429/19429 [00:00<00:00, 51145.91it/s]


In [10]:
# datasets

In [11]:
# datasets['val']

In [8]:
small_dataset = {}
small_dataset['valid'] = datasets['train'][:100]
small_dataset['valid'] = datasets['val'][:100]

In [9]:
spectrum_embedding_size = 1024 
featurizer = MoleculeFeaturizer(atom_config, bond_config, spectrum_embedding_size)

data_list_train = featurizer.featurize_dataset(
    small_dataset['valid'], include_extra_attr=True)

data_list_valid = featurizer.featurize_dataset(
    small_dataset['valid'], include_extra_attr=True)

print(f"Number of successfully featurized training molecules: {len(data_list_train)}")
print(f"Number of successfully featurized validation molecules: {len(data_list_valid)}")

Featurizing dataset: 100%|██████████| 100/100 [00:00<00:00, 132.47it/s]
Featurizing dataset: 100%|██████████| 100/100 [00:00<00:00, 132.78it/s]

Number of successfully featurized training molecules: 100
Number of successfully featurized validation molecules: 100





In [14]:
data_list_train

[Data(x=[64, 84], edge_index=[2, 128], edge_attr=[128, 7], y=[1, 1024], IDENTIFIER=[1], COLLISION_ENERGY=[1, 1], adduct=[1], precursor_mz=[1, 1]),
 Data(x=[64, 84], edge_index=[2, 128], edge_attr=[128, 7], y=[1, 1024], IDENTIFIER=[1], COLLISION_ENERGY=[1, 1], adduct=[1], precursor_mz=[1, 1]),
 Data(x=[64, 84], edge_index=[2, 128], edge_attr=[128, 7], y=[1, 1024], IDENTIFIER=[1], COLLISION_ENERGY=[1, 1], adduct=[1], precursor_mz=[1, 1]),
 Data(x=[64, 84], edge_index=[2, 128], edge_attr=[128, 7], y=[1, 1024], IDENTIFIER=[1], COLLISION_ENERGY=[1, 1], adduct=[1], precursor_mz=[1, 1]),
 Data(x=[64, 84], edge_index=[2, 128], edge_attr=[128, 7], y=[1, 1024], IDENTIFIER=[1], COLLISION_ENERGY=[1, 1], adduct=[1], precursor_mz=[1, 1]),
 Data(x=[64, 84], edge_index=[2, 128], edge_attr=[128, 7], y=[1, 1024], IDENTIFIER=[1], COLLISION_ENERGY=[1, 1], adduct=[1], precursor_mz=[1, 1]),
 Data(x=[42, 84], edge_index=[2, 96], edge_attr=[96, 7], y=[1, 1024], IDENTIFIER=[1], COLLISION_ENERGY=[1, 1], adduct=

# Try torch dataloader 

In [10]:
from torch_geometric.loader import DataLoader

batch_size_train = 32
batch_size_valid = 32

loader_train = DataLoader(
    data_list_train, 
    batch_size=batch_size_train, 
    shuffle=True, 
    num_workers=1  
)

loader_valid = DataLoader(
    data_list_valid, 
    batch_size=batch_size_valid, 
    shuffle=False, 
    num_workers=1  
)


for batch in loader_train:
    print(batch)
    print(f"Batch size: {batch.num_graphs}")  
    print(f"Node feature shape: {batch.x.shape}")          
    print(f"Edge index shape: {batch.edge_index.shape}")  
    print(f"Edge feature shape: {batch.edge_attr.shape}") 
    print(f"Spectrum embedding shape: {batch.y.shape}")  
    break

DataBatch(x=[1406, 84], edge_index=[2, 2998], edge_attr=[2998, 7], y=[32, 1024], IDENTIFIER=[32], COLLISION_ENERGY=[32, 1], adduct=[32], precursor_mz=[32, 1], batch=[1406], ptr=[33])
Batch size: 32
Node feature shape: torch.Size([1406, 84])
Edge index shape: torch.Size([2, 2998])
Edge feature shape: torch.Size([2998, 7])
Spectrum embedding shape: torch.Size([32, 1024])


### Save for later use

In [11]:
torch.manual_seed(42)

num_batches_to_save = 3
saved_batches = []

for i, batch in enumerate(loader_train):
    if i >= num_batches_to_save:
        break
    saved_batches.append(batch)

print(f"Collected {len(saved_batches)} batches.")

Collected 3 batches.


In [12]:
save_path = "../../data/data/precomputed_batches_small.pt"

# torch.save(saved_batches, save_path)

print(f"Saved {len(saved_batches)} batches to {save_path}.")

Saved 3 batches to ../../data/data/precomputed_batches_small.pt.
