In [64]:
import pandas as pd
from matchms.importing import load_from_mgf
from rdkit import Chem
from tqdm import tqdm
from collections import defaultdict

import torch
from torch.utils.data import Dataset, DataLoader
import random

# Assuming 'murcko_hist' and 'are_sub_hists' are available from the 'dreams' package
from dreams.algorithms.murcko_hist.murcko_hist import murcko_hist

In [2]:
def load_mgf_with_folds(mgf_path):
    """
    Load the MGF file and assign folds based on the precomputed 'FOLD' column.

    Parameters:
    - mgf_path (str): Path to the MGF file.

    Returns:
    - df (pd.DataFrame): DataFrame containing spectra and associated metadata.
    """
    spectra = list(load_from_mgf(mgf_path))
    records = []
    for spec in spectra:
        record = spec.to_dict()
        records.append(record)
    df = pd.DataFrame(records)
    
    # Assuming 'FOLD' has been assigned previously and is part of the DataFrame
    if 'fold' not in df.columns:
        raise ValueError("FOLD column is missing. Ensure the dataset has been split into train/val/test.")
    
    return df

             identifier                                         smiles  \
0  MassSpecGymID0000001  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
1  MassSpecGymID0000002  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
2  MassSpecGymID0000003  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
3  MassSpecGymID0000004  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
4  MassSpecGymID0000005  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   

         inchikey    formula precursor_formula parent_mass  precursor_mz  \
0  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
1  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
2  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
3  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
4  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   

   adduct instrument_type collision_energy   fold simulation_challenge  \
0  [M+H]+        Orbitra

In [52]:
spectra_path = "../../data/data/MassSpecGym.mgf"
df = load_mgf_with_folds(spectra_path)
print(df.head())

             identifier                                         smiles  \
0  MassSpecGymID0000001  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
1  MassSpecGymID0000002  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
2  MassSpecGymID0000003  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
3  MassSpecGymID0000004  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
4  MassSpecGymID0000005  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   

         inchikey    formula precursor_formula parent_mass  precursor_mz  \
0  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
1  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
2  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
3  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
4  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   

   adduct instrument_type collision_energy   fold simulation_challenge  \
0  [M+H]+        Orbitra

In [56]:
def extract_training_set(df):
    """
    Extract the training set from the DataFrame based on the 'FOLD' column.

    Parameters:
    - df (pd.DataFrame): Complete DataFrame with all folds.

    Returns:
    - df_train (pd.DataFrame): Training set DataFrame.
    """
    df_train = df[df['fold'] == 'train'].reset_index(drop=True)
    print(f"Training set size: {len(df_train)} spectra")
    return df_train

In [57]:
df_train = extract_training_set(df)

Training set size: 194119 spectra


In [58]:
def get_unique_smiles(df_train, smiles_col='smiles'):
    """
    Extract unique SMILES from the training set.

    Parameters:
    - df_train (pd.DataFrame): Training set DataFrame.
    - smiles_col (str): Column name for SMILES strings.

    Returns:
    - df_us (pd.DataFrame): DataFrame with unique SMILES.
    """
    df_us = df_train.drop_duplicates(subset=[smiles_col]).copy()
    print(f"Number of unique SMILES in training set: {df_us[smiles_col].nunique()}")
    return df_us

In [59]:
df_us_train = get_unique_smiles(df_train)

Number of unique SMILES in training set: 25046


In [60]:
def compute_murcko_histograms(df_us, smiles_col='smiles'):
    """
    Compute Murcko histograms for each unique SMILES in the DataFrame.

    Parameters:
    - df_us (pd.DataFrame): DataFrame with unique SMILES.
    - smiles_col (str): Column name for SMILES strings.

    Returns:
    - df_us (pd.DataFrame): Updated DataFrame with Murcko histograms.
    """
    print("Computing Murcko histograms...")
    tqdm.pandas()
    df_us['MurckoHist'] = df_us[smiles_col].progress_apply(
        lambda x: murcko_hist(Chem.MolFromSmiles(x)) if Chem.MolFromSmiles(x) else {}
    )
    
    # Convert dictionaries to strings for easier handling
    df_us['MurckoHistStr'] = df_us['MurckoHist'].astype(str)
    
    print('Number of unique SMILES:', df_us[smiles_col].nunique(), 
          'Number of unique Murcko histograms:', df_us['MurckoHistStr'].nunique())
    
    print('Top 20 most common Murcko histograms:')
    print(df_us['MurckoHistStr'].value_counts().head(20))
    
    return df_us

In [61]:
df_us_train = compute_murcko_histograms(df_us_train)

Computing Murcko histograms...


100%|██████████| 25046/25046 [02:35<00:00, 160.73it/s] 

Number of unique SMILES: 25046 Number of unique Murcko histograms: 338
Top 20 most common Murcko histograms:
MurckoHistStr
{'0_1': 1, '1_0': 1, '1_1': 1}              3351
{}                                          2953
{'0_1': 2}                                  2948
{'0_0': 1}                                  2875
{'1_0': 2}                                  1626
{'0_1': 2, '0_2': 1}                        1436
{'1_0': 2, '2_0': 2}                        1134
{'1_0': 2, '2_0': 1}                        1096
{'0_1': 1, '0_2': 1, '1_0': 1, '1_1': 1}     751
{'0_1': 2, '1_1': 2}                         545
{'0_1': 1, '1_0': 1, '1_1': 1, '2_0': 1}     495
{'0_1': 2, '1_0': 1, '1_2': 1}               481
{'1_0': 2, '1_1': 2}                         456
{'0_1': 3}                                   341
{'1_0': 2, '2_0': 3}                         288
{'0_1': 2, '0_2': 2}                         281
{'0_1': 1, '1_0': 1, '1_1': 1, '2_0': 2}     246
{'0_1': 2, '0_2': 1, '1_1': 2}              




In [62]:
def group_by_murcko_histograms(df_us, smiles_col='smiles'):
    """
    Group molecules by their Murcko histogram strings.

    Parameters:
    - df_us (pd.DataFrame): DataFrame with MurckoHistStr column.
    - smiles_col (str): Column name for SMILES strings.

    Returns:
    - df_gb (pd.DataFrame): Grouped DataFrame by MurckoHistStr.
    """
    df_gb = df_us.groupby('MurckoHistStr').agg(
        count=(smiles_col, 'count'),
        smiles_list=(smiles_col, list)
    ).reset_index()
    
    # Convert MurckoHistStr to MurckoHist
    df_gb['MurckoHist'] = df_gb['MurckoHistStr'].apply(eval)
    
    # Sort by 'count' in descending order and reset index
    df_gb = df_gb.sort_values('count', ascending=False).reset_index(drop=True)
    
    print(f"Grouped into {len(df_gb)} Murcko histogram groups.")
    print(df_gb.head())
    
    return df_gb


In [68]:
df_gb_train = group_by_murcko_histograms(df_us_train)

Grouped into 338 Murcko histogram groups.
                    MurckoHistStr  count  \
0  {'0_1': 1, '1_0': 1, '1_1': 1}   3351   
1                              {}   2953   
2                      {'0_1': 2}   2948   
3                      {'0_0': 1}   2875   
4                      {'1_0': 2}   1626   

                                         smiles_list  \
0  [C1=CC=C(C=C1)C2=C(C(=O)NC3=CC=CC=C32)O, CN1C(...   
1  [CC(=C)C(=O)/C(=C/C(=O)O)/OC, CC[C@@H](C)[C@H]...   
2  [CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC...   
3  [C[C@@H]1C[C@H]2[C@H](O2)/C=C\C(=O)CC(=O)O1, C...   
4  [C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=...   

                       MurckoHist  
0  {'0_1': 1, '1_0': 1, '1_1': 1}  
1                              {}  
2                      {'0_1': 2}  
3                      {'0_0': 1}  
4                      {'1_0': 2}  


In [69]:
df_gb_train

Unnamed: 0,MurckoHistStr,count,smiles_list,MurckoHist
0,"{'0_1': 1, '1_0': 1, '1_1': 1}",3351,"[C1=CC=C(C=C1)C2=C(C(=O)NC3=CC=CC=C32)O, CN1C(...","{'0_1': 1, '1_0': 1, '1_1': 1}"
1,{},2953,"[CC(=C)C(=O)/C(=C/C(=O)O)/OC, CC[C@@H](C)[C@H]...",{}
2,{'0_1': 2},2948,[CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC...,{'0_1': 2}
3,{'0_0': 1},2875,"[C[C@@H]1C[C@H]2[C@H](O2)/C=C\C(=O)CC(=O)O1, C...",{'0_0': 1}
4,{'1_0': 2},1626,[C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=...,{'1_0': 2}
...,...,...,...,...
333,"{'0_1': 1, '2_0': 2, '3_0': 2, '12_1': 2, '18_...",1,[C1C2C3C(C(C(O2)OC(=O)C4=CC(=C(C(=C4)O)O)O)OC(...,"{'0_1': 1, '2_0': 2, '3_0': 2, '12_1': 2, '18_..."
334,"{'1_0': 1, '1_1': 1, '7_0': 1, '7_1': 1, '8_1'...",1,[C1[C@@H]2CC(C[C@H]3N2CC(=O)C1C3)OC(=O)C4=CNC5...,"{'1_0': 1, '1_1': 1, '7_0': 1, '7_1': 1, '8_1'..."
335,"{'1_0': 1, '2_0': 1, '3_0': 1, '4_0': 1, '14_0...",1,[CC(=O)N1CCC2=CC(=C3C=C2[C@H]1CC4=CC=C(C=C4)OC...,"{'1_0': 1, '2_0': 1, '3_0': 1, '4_0': 1, '14_0..."
336,"{'1_0': 1, '2_0': 1, '3_0': 3, '4_0': 1}",1,[C[C@@H]1[C@@H]2C[C@@]3([C@H]4[C@H](O[C@H]([C@...,"{'1_0': 1, '2_0': 1, '3_0': 3, '4_0': 1}"


In [72]:
def create_mappings(df_gb, df_us_train, df_train, smiles_col='smiles', identifier_col='identifier'):
    """
    Create mappings from MurckoHistStr to identifiers and vice versa.

    Parameters:
    - df_gb (pd.DataFrame): Grouped DataFrame by MurckoHistStr.
    - df_us_train (pd.DataFrame): DataFrame with unique SMILES and their MurckoHistStr.
    - df_train (pd.DataFrame): Training set DataFrame.
    - smiles_col (str): Column name for SMILES strings.
    - identifier_col (str): Column name for unique identifiers.

    Returns:
    - hist_to_ids (dict): Mapping from MurckoHistStr to list of identifiers.
    - id_to_hist (dict): Mapping from identifier to MurckoHistStr.
    """
    hist_to_ids = defaultdict(list)
    # Iterate through each group and map identifiers
    for _, row in df_gb.iterrows():
        murcko_hist_str = row['MurckoHistStr']
        smiles_list = row['smiles_list']
        # Get identifiers corresponding to these SMILES
        identifiers = df_train[df_train[smiles_col].isin(smiles_list)][identifier_col].tolist()
        hist_to_ids[murcko_hist_str].extend(identifiers)
    
    # Create mapping from identifier to MurckoHistStr
    id_to_hist = df_train.set_index(identifier_col)['MurckoHistStr'].to_dict()
    
    print(f"Total Murcko histogram groups mapped to identifiers: {len(hist_to_ids)}")
    return hist_to_ids, id_to_hist


In [74]:
df_train

Unnamed: 0,identifier,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge,peaks_json
0,MassSpecGymID0000001,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,30.0,train,True,"[[91.0542, 0.24524524524524524], [125.0233, 1...."
1,MassSpecGymID0000002,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,20.0,train,True,"[[91.0542, 0.0990990990990991], [125.0233, 0.2..."
2,MassSpecGymID0000003,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,40.0,train,True,"[[69.0343, 0.03403403403403404], [91.0542, 0.3..."
3,MassSpecGymID0000004,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,55.0,train,True,"[[69.0343, 0.17917917917917917], [91.0542, 0.4..."
4,MassSpecGymID0000005,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,10.0,train,True,"[[91.0542, 0.07807807807807808], [125.0233, 0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
194114,MassSpecGymID0414159,CC(=O)OC[C@@H]1[C@H](C(C([C@@H](O1)OC2=C(OC3=C...,IGLUNMMNDNWZOA,C23H22O13,C23H23O13,506.106724,507.1140,[M+H]+,QTOF,,train,False,"[[81.034798, 0.07607607607607608], [109.0299, ..."
194115,MassSpecGymID0414160,C[C@@H]1[C@@H]([C@@H]([C@H]([C@@H](O1)OC2=CC(=...,JYXSWDCPHRTYGU,C27H30O15,C27H31O15,594.158724,595.1660,[M+H]+,QTOF,,train,False,"[[287.05481, 1.0], [433.112701, 0.271271271271..."
194116,MassSpecGymID0414161,CC1[C@@H](C([C@@H]([C@@H](O1)OC2[C@@H](C(O[C@H...,YFPYXTNSQOUHPS,C27H30O15,C27H31O15,594.158724,595.1660,[M+H]+,QTOF,,train,False,"[[71.050797, 0.043043043043043044], [85.029099..."
194117,MassSpecGymID0414162,CC1[C@@H]([C@@H](C([C@@H](O1)OC2=CC(=C3C(=C2)O...,DDELFAUOHDSZJL,C33H40O19,C33H41O19,740.216724,741.2240,[M+H]+,QTOF,,train,False,"[[85.029198, 0.07707707707707707], [287.055115..."


In [67]:
df_g

Unnamed: 0,MurckoHistStr,count,smiles_list,MurckoHist
0,"{'0_1': 1, '1_0': 1, '1_1': 1}",3351,"[C1=CC=C(C=C1)C2=C(C(=O)NC3=CC=CC=C32)O, CN1C(...","{'0_1': 1, '1_0': 1, '1_1': 1}"
1,{},2953,"[CC(=C)C(=O)/C(=C/C(=O)O)/OC, CC[C@@H](C)[C@H]...",{}
2,{'0_1': 2},2948,[CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC...,{'0_1': 2}
3,{'0_0': 1},2875,"[C[C@@H]1C[C@H]2[C@H](O2)/C=C\C(=O)CC(=O)O1, C...",{'0_0': 1}
4,{'1_0': 2},1626,[C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=...,{'1_0': 2}
...,...,...,...,...
333,"{'0_1': 1, '2_0': 2, '3_0': 2, '12_1': 2, '18_...",1,[C1C2C3C(C(C(O2)OC(=O)C4=CC(=C(C(=C4)O)O)O)OC(...,"{'0_1': 1, '2_0': 2, '3_0': 2, '12_1': 2, '18_..."
334,"{'1_0': 1, '1_1': 1, '7_0': 1, '7_1': 1, '8_1'...",1,[C1[C@@H]2CC(C[C@H]3N2CC(=O)C1C3)OC(=O)C4=CNC5...,"{'1_0': 1, '1_1': 1, '7_0': 1, '7_1': 1, '8_1'..."
335,"{'1_0': 1, '2_0': 1, '3_0': 1, '4_0': 1, '14_0...",1,[CC(=O)N1CCC2=CC(=C3C=C2[C@H]1CC4=CC=C(C=C4)OC...,"{'1_0': 1, '2_0': 1, '3_0': 1, '4_0': 1, '14_0..."
336,"{'1_0': 1, '2_0': 1, '3_0': 3, '4_0': 1}",1,[C[C@@H]1[C@@H]2C[C@@]3([C@H]4[C@H](O[C@H]([C@...,"{'1_0': 1, '2_0': 1, '3_0': 3, '4_0': 1}"


In [73]:
hist_to_ids, id_to_hist = create_mappings(df_gb_train, df_us_train, df_train)

KeyError: 'MurckoHistStr'

In [None]:
def assign_murcko_hist_to_train(df_train, id_to_hist, identifier_col='identifier'):
    """
    Assign MurckoHistStr to each identifier in the training set.

    Parameters:
    - df_train (pd.DataFrame): Training set DataFrame.
    - id_to_hist (dict): Mapping from identifier to MurckoHistStr.
    - identifier_col (str): Column name for unique identifiers.

    Returns:
    - df_train (pd.DataFrame): Updated DataFrame with MurckoHistStr.
    """
    df_train['MurckoHistStr'] = df_train[identifier_col].map(id_to_hist)
    
    # Handle any missing mappings (assign an empty dictionary string)
    df_train['MurckoHistStr'] = df_train['MurckoHistStr'].fillna('{}')
    
    print(f"Assigned MurckoHistStr to {df_train['MurckoHistStr'].notna().sum()} identifiers.")
    return df_train

In [None]:
df_train = assign_murcko_hist_to_train(df_train, id_to_hist)

# CLOSE

In [6]:
def get_train_set(df):
    """
    Extract the training set from the DataFrame.

    Parameters:
    - df (pd.DataFrame): Complete DataFrame with all folds.

    Returns:
    - df_train (pd.DataFrame): Training set DataFrame.
    """
    df_train = df[df['fold'] == 'train'].reset_index(drop=True)
    return df_train

In [13]:
df_train = get_train_set(df)
print(f"Training set size: {len(df_train)} spectra")

Training set size: 194119 spectra


In [14]:
df_us_train = df_train.drop_duplicates(subset=['smiles']).copy()
print(f"Number of unique SMILES in training set: {df_us_train['smiles'].nunique()}")

Number of unique SMILES in training set: 25046


In [24]:
def compute_murcko_histograms(df_us, smiles_col='smiles'):
    """
    Compute Murcko histograms for each unique SMILES in the DataFrame.

    Parameters:
    - df_us (pd.DataFrame): DataFrame with unique SMILES.
    - smiles_col (str): Column name for SMILES strings.

    Returns:
    - df_us (pd.DataFrame): Updated DataFrame with Murcko histograms.
    """
    print("Computing Murcko histograms...")
    tqdm.pandas()
    df_us['MurckoHist'] = df_us[smiles_col].progress_apply(
        lambda x: murcko_hist(Chem.MolFromSmiles(x)) if Chem.MolFromSmiles(x) else {}
    )
    
    # Convert dictionaries to strings for easier handling
    df_us['MurckoHistStr'] = df_us['MurckoHist'].astype(str)
    
    print('Number of unique SMILES:', df_us[smiles_col].nunique(), 
          'Number of unique Murcko histograms:', df_us['MurckoHistStr'].nunique())
    
    print('Top 20 most common Murcko histograms:')
    print(df_us['MurckoHistStr'].value_counts().head(20))
    
    return df_us

In [25]:
df_us_train = compute_murcko_histograms(df_us_train)

Computing Murcko histograms...


100%|██████████| 25046/25046 [02:35<00:00, 161.44it/s] 

Number of unique SMILES: 25046 Number of unique Murcko histograms: 338
Top 20 most common Murcko histograms:
MurckoHistStr
{'0_1': 1, '1_0': 1, '1_1': 1}              3351
{}                                          2953
{'0_1': 2}                                  2948
{'0_0': 1}                                  2875
{'1_0': 2}                                  1626
{'0_1': 2, '0_2': 1}                        1436
{'1_0': 2, '2_0': 2}                        1134
{'1_0': 2, '2_0': 1}                        1096
{'0_1': 1, '0_2': 1, '1_0': 1, '1_1': 1}     751
{'0_1': 2, '1_1': 2}                         545
{'0_1': 1, '1_0': 1, '1_1': 1, '2_0': 1}     495
{'0_1': 2, '1_0': 1, '1_2': 1}               481
{'1_0': 2, '1_1': 2}                         456
{'0_1': 3}                                   341
{'1_0': 2, '2_0': 3}                         288
{'0_1': 2, '0_2': 2}                         281
{'0_1': 1, '1_0': 1, '1_1': 1, '2_0': 2}     246
{'0_1': 2, '0_2': 1, '1_1': 2}              




In [27]:
def group_by_murcko_histograms(df_us, smiles_col='smiles'):
    """
    Group molecules by their Murcko histogram strings.

    Parameters:
    - df_us (pd.DataFrame): DataFrame with MurckoHistStr column.
    - smiles_col (str): Column name for SMILES strings.

    Returns:
    - df_gb (pd.DataFrame): Grouped DataFrame by MurckoHistStr.
    """
    df_gb = df_us.groupby('MurckoHistStr').agg(
        count=(smiles_col, 'count'),
        smiles_list=(smiles_col, list)
    ).reset_index()
    
    # Convert MurckoHistStr to MurckoHist
    df_gb['MurckoHist'] = df_gb['MurckoHistStr'].apply(eval)
    
    # Sort by 'count' in descending order and reset index
    df_gb = df_gb.sort_values('count', ascending=False).reset_index(drop=True)
    
    print(f"Grouped into {len(df_gb)} Murcko histogram groups.")
    print(df_gb.head())
    
    return df_gb

In [31]:
df_gb_train = group_by_murcko_histograms(df_us_train)

Grouped into 338 Murcko histogram groups.
                    MurckoHistStr  count  \
0  {'0_1': 1, '1_0': 1, '1_1': 1}   3351   
1                              {}   2953   
2                      {'0_1': 2}   2948   
3                      {'0_0': 1}   2875   
4                      {'1_0': 2}   1626   

                                         smiles_list  \
0  [C1=CC=C(C=C1)C2=C(C(=O)NC3=CC=CC=C32)O, CN1C(...   
1  [CC(=C)C(=O)/C(=C/C(=O)O)/OC, CC[C@@H](C)[C@H]...   
2  [CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC...   
3  [C[C@@H]1C[C@H]2[C@H](O2)/C=C\C(=O)CC(=O)O1, C...   
4  [C[C@H]1CCCC(=O)CCC/C=C/C2=C(C(=CC(=C2)O)O)C(=...   

                       MurckoHist  
0  {'0_1': 1, '1_0': 1, '1_1': 1}  
1                              {}  
2                      {'0_1': 2}  
3                      {'0_0': 1}  
4                      {'1_0': 2}  


In [34]:
df_gb_train.keys()

Index(['MurckoHistStr', 'count', 'smiles_list', 'MurckoHist'], dtype='object')

In [37]:
# Explode the 'smiles_list' to have one SMILES per row
df_gb_exploded = df_gb_train.explode('smiles_list')

# Rename 'smiles_list' to 'SMILES' for clarity
df_gb_exploded = df_gb_exploded.rename(columns={'smiles_list': 'SMILES'})

# Reset index for cleanliness
df_gb_exploded = df_gb_exploded.reset_index(drop=True)

# Display the first few rows to verify
print(df_gb_exploded[['MurckoHistStr', 'count', 'SMILES', 'MurckoHist']].head())

                    MurckoHistStr  count  \
0  {'0_1': 1, '1_0': 1, '1_1': 1}   3351   
1  {'0_1': 1, '1_0': 1, '1_1': 1}   3351   
2  {'0_1': 1, '1_0': 1, '1_1': 1}   3351   
3  {'0_1': 1, '1_0': 1, '1_1': 1}   3351   
4  {'0_1': 1, '1_0': 1, '1_1': 1}   3351   

                                              SMILES  \
0             C1=CC=C(C=C1)C2=C(C(=O)NC3=CC=CC=C32)O   
1   CN1C(=O)C2=CC=CC=C2NC(=O)C13[C@H](O3)C4=CC=CC=C4   
2  CCOC(=O)C(CC1=CC=CC=C1)NC(=O)C2=C(C3=C(CC(OC3=...   
3  C[C@@H]1CC2=C(C=C(C(=C2C(=O)O1)O)C(=O)N[C@@H](...   
4  CC1CC2=C(C=C(C(=C2C(=O)O1)O)C(=O)NC(CC3=CC=CC=...   

                       MurckoHist  
0  {'0_1': 1, '1_0': 1, '1_1': 1}  
1  {'0_1': 1, '1_0': 1, '1_1': 1}  
2  {'0_1': 1, '1_0': 1, '1_1': 1}  
3  {'0_1': 1, '1_0': 1, '1_1': 1}  
4  {'0_1': 1, '1_0': 1, '1_1': 1}  


In [38]:
# Create a mapping from SMILES to MurckoHist
smiles_to_murcko_hist = pd.Series(
    df_gb_exploded['MurckoHist'].values,
    index=df_gb_exploded['SMILES']
).to_dict()

# Display a sample of the mapping
print("Sample of SMILES to MurckoHist mapping:")
for i, (smiles, hist) in enumerate(smiles_to_murcko_hist.items()):
    if i >= 5:
        break
    print(f"SMILES: {smiles} => MurckoHist: {hist}")

Sample of SMILES to MurckoHist mapping:
SMILES: C1=CC=C(C=C1)C2=C(C(=O)NC3=CC=CC=C32)O => MurckoHist: {'0_1': 1, '1_0': 1, '1_1': 1}
SMILES: CN1C(=O)C2=CC=CC=C2NC(=O)C13[C@H](O3)C4=CC=CC=C4 => MurckoHist: {'0_1': 1, '1_0': 1, '1_1': 1}
SMILES: CCOC(=O)C(CC1=CC=CC=C1)NC(=O)C2=C(C3=C(CC(OC3=O)C)C=C2)O => MurckoHist: {'0_1': 1, '1_0': 1, '1_1': 1}
SMILES: C[C@@H]1CC2=C(C=C(C(=C2C(=O)O1)O)C(=O)N[C@@H](CC3=CC=CC=C3)C(=O)O)Cl => MurckoHist: {'0_1': 1, '1_0': 1, '1_1': 1}
SMILES: CC1CC2=C(C=C(C(=C2C(=O)O1)O)C(=O)NC(CC3=CC=CC=C3)C(=O)O)Cl => MurckoHist: {'0_1': 1, '1_0': 1, '1_1': 1}


In [40]:
# Map the MurckoHist to df_train based on SMILES
df_train['MurckoHist'] = df_train['smiles'].map(smiles_to_murcko_hist)

# Verify the assignment
print(df_train[['identifier', 'smiles', 'MurckoHist']].head())

             identifier                                         smiles  \
0  MassSpecGymID0000001  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
1  MassSpecGymID0000002  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
2  MassSpecGymID0000003  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
3  MassSpecGymID0000004  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
4  MassSpecGymID0000005  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   

   MurckoHist  
0  {'0_1': 2}  
1  {'0_1': 2}  
2  {'0_1': 2}  
3  {'0_1': 2}  
4  {'0_1': 2}  


In [41]:
# Check for any missing MurckoHist assignments
missing_murcko = df_train['MurckoHist'].isna().sum()
print(f"Number of spectra with missing MurckoHist: {missing_murcko}")

# Optionally, handle missing values
# For example, assign an empty dictionary
df_train['MurckoHist'] = df_train['MurckoHist'].fillna({})

Number of spectra with missing MurckoHist: 0


In [43]:
df_train.head()

Unnamed: 0,identifier,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge,peaks_json,MurckoHist
0,MassSpecGymID0000001,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,30.0,train,True,"[[91.0542, 0.24524524524524524], [125.0233, 1....",{'0_1': 2}
1,MassSpecGymID0000002,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,20.0,train,True,"[[91.0542, 0.0990990990990991], [125.0233, 0.2...",{'0_1': 2}
2,MassSpecGymID0000003,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,40.0,train,True,"[[69.0343, 0.03403403403403404], [91.0542, 0.3...",{'0_1': 2}
3,MassSpecGymID0000004,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,55.0,train,True,"[[69.0343, 0.17917917917917917], [91.0542, 0.4...",{'0_1': 2}
4,MassSpecGymID0000005,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,10.0,train,True,"[[91.0542, 0.07807807807807808], [125.0233, 0....",{'0_1': 2}


In [51]:
from collections import defaultdict

def create_mappings(df_gb, df_us_train, df_train, smiles_col='smiles', identifier_col='identifier'):
    """
    Create mappings from MurckoHistStr to identifiers and vice versa.

    Parameters:
    - df_gb (pd.DataFrame): Grouped DataFrame by MurckoHistStr.
    - df_us_train (pd.DataFrame): DataFrame with unique SMILES and their MurckoHistStr.
    - df_train (pd.DataFrame): Training set DataFrame.
    - smiles_col (str): Column name for SMILES strings.
    - identifier_col (str): Column name for unique identifiers.

    Returns:
    - hist_to_ids (dict): Mapping from MurckoHistStr to list of identifiers.
    - id_to_hist (dict): Mapping from identifier to MurckoHistStr.
    """
    hist_to_ids = defaultdict(list)
    # Iterate through each group and map identifiers
    for _, row in df_gb.iterrows():
        murcko_hist_str = row['MurckoHistStr']
        smiles_list = row['smiles_list']
        # Get identifiers corresponding to these SMILES
        identifiers = df_train[df_train[smiles_col].isin(smiles_list)][identifier_col].tolist()
        hist_to_ids[murcko_hist_str].extend(identifiers)
    
    # Create mapping from identifier to MurckoHistStr
    id_to_hist = df_train.set_index('identifier')['MurckoHistStr'].to_dict()
    
    print(f"Total Murcko histogram groups mapped to identifiers: {len(hist_to_ids)}")
    return hist_to_ids, id_to_hist

# Usage
hist_to_ids, id_to_hist = create_mappings(df_gb_train, df_us_train, df_train)

KeyError: 'MurckoHistStr'

In [46]:
import torch
from torch.utils.data import Dataset
import random

In [47]:
class ContrastiveDataset(Dataset):
    def __init__(self, smiles_list, smiles_to_hist, hist_to_smiles, num_negatives=5, seed=None):
        """
        Initialize the Contrastive Dataset.
        
        Parameters:
        - smiles_list (list): List of unique SMILES strings.
        - smiles_to_hist (dict): Mapping from SMILES to MurckoHistStr.
        - hist_to_smiles (dict): Mapping from MurckoHistStr to list of SMILES.
        - num_negatives (int): Number of negative examples per anchor.
        - seed (int, optional): Random seed for reproducibility.
        """
        self.smiles_list = smiles_list
        self.smiles_to_hist = smiles_to_hist
        self.hist_to_smiles = hist_to_smiles
        self.num_negatives = num_negatives
        self.all_smiles = set(smiles_list)
        self.all_hist_strs = list(hist_to_smiles.keys())
        
        if seed is not None:
            random.seed(seed)
        
    def __len__(self):
        return len(self.smiles_list)
    
    def __getitem__(self, idx):
        """
        Get a triplet for the given index.
        
        Returns:
        - anchor (str): Anchor SMILES.
        - positive (str): Positive SMILES (same MurckoHistStr).
        - negatives (list of str): List of negative SMILES (different MurckoHistStr).
        """
        anchor = self.smiles_list[idx]
        anchor_hist = self.smiles_to_hist[anchor]
        
        # Positive Example
        positive_candidates = self.hist_to_smiles[anchor_hist].copy()
        positive_candidates.remove(anchor)  # Exclude the anchor itself
        
        if not positive_candidates:
            # If no positive available, return anchor with itself and random negatives
            positive = anchor
        else:
            positive = random.choice(positive_candidates)
        
        # Negative Examples
        # Select SMILES from different MurckoHistStrs
        negative_hist_strs = [hist for hist in self.all_hist_strs if hist != anchor_hist]
        negative_smiles_pool = []
        for hist in negative_hist_strs:
            negative_smiles_pool.extend(self.hist_to_smiles[hist])
        
        # If there are fewer negatives than required, adjust
        if len(negative_smiles_pool) < self.num_negatives:
            negatives = random.sample(negative_smiles_pool, len(negative_smiles_pool))
        else:
            negatives = random.sample(negative_smiles_pool, self.num_negatives)
        
        return anchor, positive, negatives

In [49]:
# Extract the list of unique SMILES
unique_smiles = df_us_train['smiles'].tolist()

# Initialize the dataset
contrastive_dataset = ContrastiveDataset(
    smiles_list=unique_smiles,
    smiles_to_hist=smiles_to_hist,
    hist_to_smiles=hist_to_smiles,
    num_negatives=5,
    seed=42  # Optional: for reproducibility
)

print(f"Dataset size: {len(contrastive_dataset)}")

Dataset size: 25046


In [50]:
def collate_fn(batch):
    """
    Custom collate function to handle batches of triplets.
    
    Parameters:
    - batch (list of tuples): Each tuple contains (anchor, positive, negatives)
    
    Returns:
    - anchors (list): List of anchor SMILES.
    - positives (list): List of positive SMILES.
    - negatives (list of lists): List containing lists of negative SMILES.
    """
    anchors = [item[0] for item in batch]
    positives = [item[1] for item in batch]
    negatives = [item[2] for item in batch]
    return anchors, positives, negatives