In [1]:
# Correct imports
from dataloader.dataset_wrapper import DataSetWrapper, DataSetWrapper_noddp
import pandas as pd
import numpy as np

In [None]:
def process_dataframe_to_spectra(df):
    """
    Convert DataFrame rows with a ``peaks_json`` column to matchms Spectrum objects.

    Each row's ``peaks_json`` is parsed as a JSON list of peaks, where the
    first element of a peak is the m/z value and the second is the intensity.
    Peaks are converted to float arrays and sorted by ascending m/z before the
    Spectrum is built, because matchms rejects non-float or unsorted peak
    arrays (without this, such rows would be dropped by the error handler).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``peaks_json`` plus the metadata columns
        ``precursor_mz``, ``adduct``, ``ion_mode``, ``molecular_formula``,
        ``instrument``, ``ion_source``, ``compound_source`` and ``inchikey``.

    Returns
    -------
    tuple(list, list)
        ``(spectra, valid_indices)``: the Spectrum objects that could be
        built, and the index *labels* (as yielded by ``df.iterrows()``) of the
        rows they came from, in the same order.

    Notes
    -----
    Rows that fail for any reason (bad JSON, missing column, non-finite peak
    values, matchms validation) are reported with ``print`` and skipped; no
    exception is propagated for per-row failures.
    """
    from matchms import Spectrum
    import json

    # Metadata columns copied verbatim from each row into the Spectrum.
    metadata_columns = (
        'precursor_mz',
        'adduct',
        'ion_mode',
        'molecular_formula',
        'instrument',
        'ion_source',
        'compound_source',
        'inchikey',
    )

    spectra = []
    valid_indices = []

    for idx, row in df.iterrows():
        try:
            # Parse peaks_json to get mz and intensity arrays
            peaks_data = json.loads(row['peaks_json'])
            # dtype=float: integer-only peak lists would otherwise yield int arrays
            mz = np.array([peak[0] for peak in peaks_data], dtype=float)
            intensities = np.array([peak[1] for peak in peaks_data], dtype=float)

            # dtype=float turns null into NaN; keep skipping such rows explicitly
            if not (np.isfinite(mz).all() and np.isfinite(intensities).all()):
                raise ValueError("peaks_json contains non-finite m/z or intensity values")

            # Stable sort keeps the original order of peaks sharing the same m/z
            order = np.argsort(mz, kind="stable")
            mz = mz[order]
            intensities = intensities[order]

            # Create spectrum object with metadata
            metadata = {column: row[column] for column in metadata_columns}

            spectrum = Spectrum(mz=mz, intensities=intensities, metadata=metadata)
            spectra.append(spectrum)
            valid_indices.append(idx)

        except Exception as e:
            # Best-effort conversion: report the row and carry on with the rest
            print(f"Error processing row {idx}: {e}")
            continue

    return spectra, valid_indices

def dataframe_to_files(df, output_dir="./temp_data"):
    """
    Convert DataFrame to temporary files compatible with existing DataSetWrapper

    Writes two files into ``output_dir`` (created if missing):
    ``smiles.npy`` with the ``smiles`` column of the successfully converted
    rows, and ``spectra.mgf`` with the corresponding spectra, in the same
    row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; must contain ``smiles`` and the columns required by
        ``process_dataframe_to_spectra``. Any index is accepted.
    output_dir : str
        Directory the two files are written to.

    Returns
    -------
    tuple(str, str, pandas.DataFrame)
        ``(smi_file, mgf_file, valid_df)`` where ``valid_df`` holds the rows
        that produced a spectrum, re-indexed from 0.
    """
    import os
    from matchms.exporting import save_as_mgf

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # process_dataframe_to_spectra reports index *labels*; renumber the rows so
    # labels and positions coincide and the .iloc lookup below is correct even
    # for frames with a non-default or duplicated index. (Row numbers in its
    # error messages are therefore positions within ``df``.)
    positional_df = df.reset_index(drop=True)

    # Process DataFrame to get spectra and valid indices
    spectra, valid_indices = process_dataframe_to_spectra(positional_df)

    print(f"Processed {len(spectra)} valid spectra out of {len(df)} total entries.")

    # Filter valid rows
    valid_df = positional_df.iloc[valid_indices].reset_index(drop=True)
    smiles_list = valid_df['smiles'].tolist()

    # Save SMILES as numpy array
    smi_file = os.path.join(output_dir, "smiles.npy")
    np.save(smi_file, np.array(smiles_list))

    # Save spectra as MGF file
    mgf_file = os.path.join(output_dir, "spectra.mgf")
    # NOTE(review): matchms' save_as_mgf appends to an existing file (confirm
    # for the installed version). Remove any previous export so a repeated call
    # cannot leave more spectra in the MGF than entries in smiles.npy.
    if os.path.exists(mgf_file):
        os.remove(mgf_file)
    save_as_mgf(spectra, mgf_file)

    return smi_file, mgf_file, valid_df

def create_wrapper_from_dataframe(df, batch_size=32, num_workers=4, valid_size=0.2,
                                  use_ddp=False, world_size=1, rank=0, output_dir="./temp_data"):
    """
    Build a dataset wrapper directly from a DataFrame.

    The frame is first exported to files via ``dataframe_to_files`` and the
    resulting paths are handed to ``DataSetWrapper`` (when ``use_ddp`` is
    truthy; ``world_size`` and ``rank`` are forwarded) or to
    ``DataSetWrapper_noddp`` (otherwise; ``world_size`` and ``rank`` are
    ignored).

    Returns
    -------
    tuple
        ``(wrapper, processed_df)`` where ``processed_df`` is the frame
        returned by ``dataframe_to_files``.
    """
    # Export the frame to the on-disk files the wrappers read from
    smi_file, mgf_file, processed_df = dataframe_to_files(df, output_dir)

    # Arguments common to both wrapper flavours
    shared_kwargs = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        valid_size=valid_size,
        s=None,  # per the original author's note: not used in current implementation
        ms2_file=mgf_file,
        smi_file=smi_file,
    )

    if not use_ddp:
        return DataSetWrapper_noddp(**shared_kwargs), processed_df

    # Distributed variant additionally needs the process-group coordinates
    return DataSetWrapper(world_size=world_size, rank=rank, **shared_kwargs), processed_df

In [3]:


import os

# Location of the training CSV. Set the CSMP_TRAIN_CSV environment variable to
# point at your own copy; the original author's local path remains the default.
TRAIN_CSV_PATH = os.environ.get(
    "CSMP_TRAIN_CSV",
    "/Users/ivangolov/Desktop/Диплом/CSMP_project/CSMP_spectrum_database/data/production/train_deduplicated.csv",
)

df = pd.read_csv(TRAIN_CSV_PATH)

In [4]:
# Take a reproducible sample of up to 1000 rows for testing.
# Clamp to len(df): DataFrame.sample raises when n exceeds the number of rows
# (sampling without replacement), e.g. when pointed at a smaller CSV.
SAMPLE_SIZE = 1000
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).reset_index(drop=True)

In [5]:
# Preview the sampled frame; the recorded output shows the first and last rows
# with the middle elided.
df_sample

Unnamed: 0,peaks_json,ion_source,compound_source,instrument,adduct,precursor_mz,smiles,inchikey,ion_mode,molecular_formula
0,"[[42.03384, 0.099], [44.049313, 1.612000000000...",ESI,Commercial,Orbitrap,[M+H]+,262.191,Cc1cc(C)c(NC(=O)CN2CCNCC2)c(C)c1,XCJSVHRUHKUAHV-UHFFFAOYSA-N,Positive,C15H23N3O
1,"[[290.13797, 0.6], [305.154572, 0.8], [305.162...",ESI,Isolated,qTof,M-H,399.193,COC1C(O)CC2CN3CCC4=C(NC5=C4C=CC(OC)=C5)C3CC2C1...,JVHNBFFHWQQPLL-UHFFFAOYSA-N,Negative,C22H28N2O5
2,"[[334.026001, 100.0]]",ESI,Commercial standard,Orbitrap,[M+Na]+,537.350,O=C(OC(CC(C1=C2CC(O)C3C4(C)CCC(=O)C(C)(C)C4CCC...,NLOAQXKIIGTTRE-UHFFFAOYSA-N,Positive,C32H50O5
3,"[[78.918015, 100.0], [173.53862, 3.508], [237....",ESI,Crude,Orbitrap,[M-H]-,395.956,Cn1cnc2c1cc(C(=O)O)c(Nc1c(Cl)cc(Br)cc1)c2F,XAAPQRFIXGDKPZ-UHFFFAOYSA-N,Negative,C15H10BrClFN3O2
4,"[[43.017899, 1.8009999999999997], [53.038628, ...",ESI,Commercial,Orbitrap,[M+H]+,414.173,CC1(CNC(=O)c2cc(S(=O)(=O)Cc3ccccc3)ccc2)CCOC1C...,QJZVQDPBHPGSQT-UHFFFAOYSA-N,Positive,C23H27NO4S
...,...,...,...,...,...,...,...,...,...,...
995,"[[85.058609, 100.0], [173.542023, 25.806999], ...",ESI,Commercial,Orbitrap,[M+Na]+,226.990,CP(C)(=O)c1csc(C(=O)O)c1,IAUFTVZQJXFLHG-UHFFFAOYSA-N,Positive,C7H9O3PS
996,"[[163.005997, 10.100000000000001], [188.985001...",ESI,Commercial standard,Orbitrap,[M+Na]+,487.090,O=C1C(OC2OC(C)C(O)C(O)C2O)=C(OC=3C=C(O)C=C(O)C...,DCYOADKBABEMIQ-UHFFFAOYSA-N,Positive,C21H20O12
997,"[[50.007881, 0.092], [50.010555, 0.107], [55.0...",ESI,Crude,Orbitrap,[M+H]+,400.133,COc1cc(S(=O)(=O)N2CCCC(n3cnc4ccccc4c3=O)C2)ccc1,QCVNEMDIXQHJNF-UHFFFAOYSA-N,Positive,C20H21N3O4S
998,"[[40.762592, 0.274], [41.038479, 5.215], [44.9...",ESI,Commercial,Orbitrap,[M+H]+,219.076,O=C(O)c1cc2c(cc1)[nH]c(=O)n2C1CC1,JOALTBZZXLKPMS-UHFFFAOYSA-N,Positive,C11H10N2O3


In [6]:
# Build the non-DDP wrapper from the sampled frame; output_dir, world_size and
# rank are left at create_wrapper_from_dataframe's defaults.
print("Creating DataSetWrapper from DataFrame...")

wrapper, processed_df = create_wrapper_from_dataframe(
    df_sample, batch_size=16, num_workers=2, valid_size=0.2, use_ddp=False
)

Creating DataSetWrapper from DataFrame...
Processed 1000 valid spectra out of 1000 total entries.


In [7]:
# Build the train/validation loaders. In the recorded run this step computes
# molecular graphs for 800 training and 200 validation entries and keeps only
# 686 and 174 graph-spectrum pairs respectively, with at least one SMILES
# reported as a calculation failure.
train_loader, valid_loader = wrapper.get_data_loaders()

calculating molecular graphs


 21%|██        | 168/800 [00:00<00:00, 1674.54it/s]

SMILES [Cl-].O=C1C2=CC=C(O)C(=C2OC(=C1C=3C=CC=4OCCCOC4C3)C)C[NH+](C)C calculation failure


100%|██████████| 800/800 [00:00<00:00, 1728.30it/s]


Calculated 686 molecular graph-mass spectrometry pairs
calculating molecular graphs


100%|██████████| 200/200 [00:00<00:00, 1770.49it/s]

Calculated 174 molecular graph-mass spectrometry pairs





In [8]:
# Report how many batches each loader yields (one line per loader)
print(
    f"Training batches: {len(train_loader)}",
    f"Validation batches: {len(valid_loader)}",
    sep="\n",
)

Training batches: 42
Validation batches: 11


In [11]:
# Re-export the sample to ./temp_data to get the file paths directly. This
# repeats the export already performed inside create_wrapper_from_dataframe
# and rebinds processed_df.
smi_file, mgf_file, processed_df = dataframe_to_files(df_sample, output_dir='./temp_data')

Processed 1000 valid spectra out of 1000 total entries.
