In [3]:
import pandas as pd
import numpy as np
from matchms.importing import load_from_mgf

In [4]:
def load_mgf_with_folds(mgf_path):
    """Read an MGF file into a DataFrame with one row per spectrum.

    Every spectrum yielded by ``load_from_mgf`` is converted with its
    ``to_dict()`` method; the resulting dicts become the rows of the frame.

    Args:
        mgf_path: Location of the MGF file, passed unchanged to
            ``load_from_mgf``.

    Returns:
        pandas.DataFrame whose ``collision_energy``, ``parent_mass`` and
        ``precursor_mz`` columns have been cast to float.

    Raises:
        ValueError: If the resulting frame has no ``fold`` column.
    """
    df = pd.DataFrame([spectrum.to_dict() for spectrum in load_from_mgf(mgf_path)])

    # The rest of the notebook selects rows by fold, so refuse unsplit data early.
    if 'fold' not in df.columns:
        raise ValueError("fold column is missing. Ensure the dataset has been split into train/val/test.")

    # Cast the numeric metadata columns to float.
    df.collision_energy = df.collision_energy.astype(float)
    df.parent_mass = df.parent_mass.astype(float)
    df.precursor_mz = df.precursor_mz.astype(float)

    return df


In [5]:
# Relative path to the MassSpecGym MGF file.
spectra_path = "../../data/data/MassSpecGym.mgf"
# One DataFrame row per spectrum; raises ValueError if there is no 'fold' column.
df = load_mgf_with_folds(spectra_path)
# Preview the first rows.
print(df.head())

             identifier                                         smiles  \
0  MassSpecGymID0000001  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
1  MassSpecGymID0000002  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
2  MassSpecGymID0000003  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
3  MassSpecGymID0000004  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
4  MassSpecGymID0000005  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   

         inchikey    formula precursor_formula  parent_mass  precursor_mz  \
0  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4   287.115224      288.1225   
1  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4   287.115224      288.1225   
2  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4   287.115224      288.1225   
3  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4   287.115224      288.1225   
4  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4   287.115224      288.1225   

   adduct instrument_type  collision_energy   fold simulation_challenge  \
0  [M+H]+        

In [6]:
def construct_triplets(df, examples_number=5, mass_tolerance=5.0, seed=None):
    """Pick positive and negative example spectra for every anchor spectrum.

    Anchors are the rows of ``df`` with ``fold == 'train'``,
    ``adduct == '[M+H]+'`` and ``collision_energy == 60.0``. For each anchor:

    * positives are the other filtered rows sharing the first 14 characters
      of its ``inchikey``;
    * negatives are filtered rows with a different prefix whose
      ``parent_mass`` lies within ``mass_tolerance`` of the anchor's
      (both bounds inclusive).

    When more than ``examples_number`` candidates exist, that many are sampled
    without replacement.

    Args:
        df: DataFrame with the columns ``fold``, ``adduct``,
            ``collision_energy``, ``inchikey``, ``identifier``,
            ``parent_mass`` and ``smiles``. It is not modified.
        examples_number: Maximum number of positives and of negatives kept
            per anchor.
        mass_tolerance: Half-width of the parent-mass window used for
            negatives, in the units of ``parent_mass``.
        seed: ``None`` (default) samples from NumPy's global random state,
            exactly as before this parameter existed. Any other value is
            passed to ``np.random.default_rng`` to make the call
            reproducible on its own.

    Returns:
        DataFrame with one row per anchor (in filtered order) and the columns
        ``anchor_smiles``, ``anchor_id``, ``positive_ids``, ``negative_ids``,
        or ``None`` when no row passes the filter.
    """
    # Filter the DataFrame
    df_filtered = df[
        (df['fold'] == 'train') &
        (df['adduct'] == '[M+H]+') &
        (df['collision_energy'] == 60.0)
    ].reset_index(drop=True)

    print(f"Filtered dataset size: {len(df_filtered)} spectra")

    if len(df_filtered) == 0:
        print("No spectra found after filtering. Exiting.")
        return None

    # The np.random module and a Generator both expose .choice(a, size, replace=...).
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Compute the 14-character prefix of the InChI key
    df_filtered['inchikey_prefix'] = df_filtered['inchikey'].str[:14]

    # Map from InChI key prefixes to lists of identifiers
    inchikey_to_identifiers = (
        df_filtered.groupby('inchikey_prefix')['identifier'].apply(list).to_dict()
    )

    # Sort once by parent mass so each anchor's mass window is a contiguous
    # slice that can be located by binary search instead of a full scan.
    mass_df = (
        df_filtered[['identifier', 'parent_mass', 'inchikey_prefix']]
        .sort_values('parent_mass')
        .reset_index(drop=True)
    )
    sorted_masses = mass_df['parent_mass'].to_numpy(dtype=float)
    sorted_ids = mass_df['identifier'].to_numpy(dtype=object)
    sorted_prefixes = mass_df['inchikey_prefix'].to_numpy(dtype=object)

    # For each anchor, find positive and negative examples
    data = []
    anchors = zip(
        df_filtered['identifier'],
        df_filtered['inchikey_prefix'],
        df_filtered['parent_mass'],
        df_filtered['smiles'],
    )
    for anchor_id, anchor_inchikey, anchor_parent_mass, anchor_smiles in anchors:
        # Positives: same InChI key prefix, different identifier
        positive_ids = [
            pid for pid in inchikey_to_identifiers[anchor_inchikey] if pid != anchor_id
        ]
        if len(positive_ids) > examples_number:
            positive_ids = rng.choice(positive_ids, examples_number, replace=False).tolist()

        # Negatives: different InChI key prefix, parent mass within the window
        if pd.isna(anchor_parent_mass):
            # A missing mass compares False against every bound: no candidates.
            negative_ids = []
        else:
            # side='left' keeps masses == lower bound, side='right' keeps
            # masses == upper bound, i.e. the window is inclusive on both ends.
            start = np.searchsorted(
                sorted_masses, anchor_parent_mass - mass_tolerance, side='left'
            )
            stop = np.searchsorted(
                sorted_masses, anchor_parent_mass + mass_tolerance, side='right'
            )
            window_ids = sorted_ids[start:stop]
            keep = (sorted_prefixes[start:stop] != anchor_inchikey) & (window_ids != anchor_id)
            negative_ids = window_ids[keep].tolist()

        if len(negative_ids) > examples_number:
            negative_ids = rng.choice(negative_ids, examples_number, replace=False).tolist()

        # Append the results
        data.append({
            'anchor_smiles': anchor_smiles,
            'anchor_id': anchor_id,
            'positive_ids': positive_ids,
            'negative_ids': negative_ids
        })

    # Convert the list to a DataFrame
    triplets_df = pd.DataFrame(data)

    print(f"Constructed triplets for {len(triplets_df)} anchors.")

    return triplets_df


In [7]:
# construct_triplets draws its samples from NumPy's global random state, so seed
# it here; otherwise the selected positives/negatives differ on every run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
triplets_df = construct_triplets(df)
print(triplets_df.head())

Filtered dataset size: 11186 spectra
Constructed triplets for 11186 anchors.
                                       anchor_smiles             anchor_id  \
0  CC(C)[C@H]1C(=O)O[C@@H](C(=O)N([C@H](C(=O)O[C@...  MassSpecGymID0000261   
1  C[C@@H]1CC2=C(C=C(C(=C2C(=O)O1)O)C(=O)N[C@@H](...  MassSpecGymID0000740   
2  CC[C@H](C)C(=O)O[C@H]1CCC=C2[C@H]1[C@H]([C@H](...  MassSpecGymID0000882   
3       CC1=CC2=C(C(=C1)O)C(=O)C3=C(C2=O)C=C(C=C3O)O  MassSpecGymID0001133   
4  COC1=C2C3=C(C(=O)CC3)C(=O)OC2=C4[C@@H]5C=CO[C@...  MassSpecGymID0001358   

  positive_ids                                       negative_ids  
0           []  [MassSpecGymID0195900, MassSpecGymID0229559, M...  
1           []  [MassSpecGymID0221313, MassSpecGymID0153479, M...  
2           []  [MassSpecGymID0224162, MassSpecGymID0157766, M...  
3           []  [MassSpecGymID0054786, MassSpecGymID0157013, M...  
4           []  [MassSpecGymID0209277, MassSpecGymID0055962, M...  


In [8]:
# Size of the sampled positive / negative list for every anchor.
triplets_df['num_positive'] = triplets_df['positive_ids'].map(len)
triplets_df['num_negative'] = triplets_df['negative_ids'].map(len)

# Number of anchors per positive-list size, ordered by size.
positive_counts = triplets_df['num_positive'].value_counts().sort_index()
print("Positive IDs counts:", positive_counts, sep="\n")

# Number of anchors per negative-list size, ordered by size.
negative_counts = triplets_df['num_negative'].value_counts().sort_index()
print("\nNegative IDs counts:", negative_counts, sep="\n")

# Distinct anchor SMILES strings.
unique_compounds = triplets_df['anchor_smiles'].unique()
print("\nUnique compounds:", len(unique_compounds), sep="\n")

Positive IDs counts:
num_positive
0    7467
1    1756
2    1137
3     440
4     190
5     196
Name: count, dtype: int64

Negative IDs counts:
num_negative
0        2
1       13
2       11
3        8
4       12
5    11140
Name: count, dtype: int64

Unique compounds:
9154


## Join triplets with DreaMS embeddings

In [9]:
from mol2dreams.utils.data import prepare_datasets
from dreams.utils.data import MSData
from dreams.definitions import DREAMS_EMBEDDING

  from .autonotebook import tqdm as notebook_tqdm
Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [10]:
# Relative path to the HDF5 file opened through MSData.
hdf5_path = "../../data/data/MassSpecGym_DreaMS.hdf5"
msdata = MSData.from_hdf5(hdf5_path, prec_mz_col='precursor_mz')
# Entry stored under the DREAMS_EMBEDDING key
# (presumably one embedding row per spectrum — TODO confirm).
embs = msdata[DREAMS_EMBEDDING]
# Show the array shape as the cell output.
embs.shape

(213548, 1024)

In [11]:
# NOTE(review): extra_features is defined here but never passed to
# prepare_datasets below — confirm whether it should be forwarded or removed.
extra_features = ['COLLISION_ENERGY', 'adduct', 'precursor_mz']
# Prepare datasets
# NOTE(review): embedding_col is a string literal although DREAMS_EMBEDDING is
# imported above — confirm the two refer to the same key.
datasets = prepare_datasets(
    msdata=msdata, 
    embs=embs, 
    splits=['train', 'val'],  # Include 'test' if present
    smiles_col='smiles', 
    embedding_col='DreaMS_embedding', 
    fold_col='FOLD'
)

Processing split 'train' with 194119 samples.


Featurizing train: 100%|██████████| 194119/194119 [00:03<00:00, 52873.66it/s]


Processing split 'val' with 19429 samples.


Featurizing val: 100%|██████████| 19429/19429 [00:00<00:00, 49226.48it/s]


In [12]:
# Keep only the 'train' split; the identifier lookups below are built from it.
train_data = datasets['train']

In [13]:
# Index the training entries by their 'IDENTIFIER' field
# (if an identifier repeats, the last entry wins).
identifier_to_data = {entry['IDENTIFIER']: entry for entry in train_data}

In [14]:
# Anchors
anchor_ids = set(triplets_df['anchor_id'])

# Positives
positive_ids_set = set([pid for sublist in triplets_df['positive_ids'] for pid in sublist])

# Negatives
negative_ids_set = set([nid for sublist in triplets_df['negative_ids'] for nid in sublist])

# Union of all identifiers
all_triplet_ids = anchor_ids.union(positive_ids_set).union(negative_ids_set)

In [15]:
# Triplet identifiers that have no entry in the training lookup.
dataset_ids = set(identifier_to_data)
identifiers_not_in_datasets = all_triplet_ids.difference(dataset_ids)

if not identifiers_not_in_datasets:
    print("All identifiers in triplets are present in datasets.")
else:
    print(f"Identifiers in triplets not in datasets: {len(identifiers_not_in_datasets)}")
    print(identifiers_not_in_datasets)

All identifiers in triplets are present in datasets.


In [18]:
# Per-identifier lookups for the 'embedding' and 'smiles' fields of each training entry.
# NOTE(review): both could be derived from identifier_to_data instead of iterating
# train_data two more times. The execution counter also jumps from 15 to 18 here, so
# confirm the notebook still works under Restart & Run All.
identifier_to_embedding = {entry['IDENTIFIER']: entry['embedding'] for entry in train_data}
identifier_to_smiles ={entry['IDENTIFIER']: entry['smiles'] for entry in train_data}

In [19]:
# Sanity check: both lookups should hold the same number of identifiers.
len(identifier_to_embedding), len(identifier_to_smiles)

(194119, 194119)