In [1]:
import pandas as pd
from matchms.importing import load_from_mgf
from rdkit import Chem

from collections import defaultdict
import pandas as pd
import numpy as np
import ast

from tqdm import tqdm
import logging

import torch
from torch.utils.data import Dataset, DataLoader
import random

from fiora.MOL.constants import PPM

In [2]:
def load_mgf_with_folds(mgf_path):

    spectra = list(load_from_mgf(mgf_path))
    records = []
    for spec in spectra:
        record = spec.to_dict()
        records.append(record)
    df = pd.DataFrame(records)

    if 'fold' not in df.columns:
        raise ValueError("fold column is missing. Ensure the dataset has been split into train/val/test.")
    
    return df

In [3]:
spectra_path = "/Users/macbook/CODE/mol2DreaMS/data/data/MassSpecGym.mgf"
df = load_mgf_with_folds(spectra_path)
print(df.head())


             identifier                                         smiles  \
0  MassSpecGymID0000001  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
1  MassSpecGymID0000002  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
2  MassSpecGymID0000003  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
3  MassSpecGymID0000004  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   
4  MassSpecGymID0000005  CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC   

         inchikey    formula precursor_formula parent_mass  precursor_mz  \
0  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
1  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
2  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
3  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   
4  VFMQMACUYWGDOJ  C16H17NO4         C16H18NO4  287.115224      288.1225   

   adduct instrument_type collision_energy   fold simulation_challenge  \
0  [M+H]+        Orbitra

In [4]:
# metadata_key_map = {
#                 "name": "Name",
#                 "collision_energy":  "CE", 
#                 "instrument": "Instrument_type",
#                 "ionization": "Ionization",
#                 "precursor_mz": "PrecursorMZ",
#                 "precursor_mode": "Precursor_type",
#                 "retention_time": "RETENTIONTIME",
#                 "ccs": "CCS"
#                 }


In [5]:
def prepare_dataframe_for_fiora(df):
    # Step 1: Rename Columns to Match metadata_key_map
    df = df.rename(columns={
        'identifier': 'Name',
        'collision_energy': 'CE',
        'instrument_type': 'Instrument_type',
        'adduct': 'Precursor_type',          # Adjusted based on your mapping
        'precursor_mz': 'PrecursorMZ',
        'smiles': 'SMILES',
        'inchikey': 'InChiKey',
        'formula': 'Formula',
        'precursor_formula': 'PrecursorFormula',
        'parent_mass': 'ParentMass'
    })
    
    # Step 2: Handle Retention Time and CCS
    if 'RETENTIONTIME' not in df.columns:
        df['RETENTIONTIME'] = np.nan
    else:
        df['RETENTIONTIME'] = pd.to_numeric(df['RETENTIONTIME'], errors='coerce')
    
    if 'CCS' not in df.columns:
        df['CCS'] = np.nan
    else:
        df['CCS'] = pd.to_numeric(df['CCS'], errors='coerce')
    
    # Step 3: Handle 'peaks_json' to 'peaks'
    if 'peaks_json' in df.columns:
        df = df.rename(columns={'peaks_json': 'peaks'})
        
        def parse_peaks(peaks_entry):
            if isinstance(peaks_entry, str):
                try:
                    return ast.literal_eval(peaks_entry)
                except (ValueError, SyntaxError):
                    print(f"Failed to parse peaks: {peaks_entry}")
                    return None
            elif isinstance(peaks_entry, list):
                return peaks_entry
            else:
                print(f"Unexpected type for peaks: {type(peaks_entry)}")
                return None
        
        df['peaks'] = df['peaks'].apply(parse_peaks)
    else:
        raise ValueError("peaks_json column is missing in the DataFrame.")
    
    # Step 4: Convert 'peaks' from list of pairs to dict with 'mz' and 'intensity'
    def convert_peaks(peaks_list):
        if peaks_list is None:
            return {'mz': [], 'intensity': []}
        mz = []
        intensity = []
        for pair in peaks_list:
            if isinstance(pair, list) and len(pair) == 2:
                mz.append(pair[0])
                intensity.append(pair[1])
            else:
                print(f"Invalid peak pair: {pair}")
        return {'mz': mz, 'intensity': intensity}
    
    df['peaks'] = df['peaks'].apply(convert_peaks)
    
    # Step 5: Add Missing Columns Required by Fiora
    if 'PPM_num' not in df.columns:
        df['PPM_num'] = 50  # Default value as used in your example
    if 'ppm_peak_tolerance' not in df.columns:
              # Adjust based on Fiora's definitions
        df['ppm_peak_tolerance'] = df['PPM_num'] * PPM
    
    # Step 6: Add 'lib' and 'origin' Columns
    df['lib'] = 'MassSpecGym'
    df['origin'] = 'MassSpecGym'
    
    # Step 7: Ensure 'fold' Column Exists
    if 'fold' not in df.columns:
        raise ValueError("fold column is missing. Ensure the dataset has been split into train/val/test.")
    
    # Step 8: Add 'simulation_challenge' Column
    if 'simulation_challenge' not in df.columns:
        df['simulation_challenge'] = False  # Adjust based on your dataset's context
    
    # Step 9: Reorder Columns to Match Fiora's Expectations
    desired_order = [
        'Name',
        'SMILES',
        'InChiKey',
        'Formula',
        'PrecursorFormula',
        'ParentMass',
        'PrecursorMZ',
        'Ionization',
        'Instrument_type',
        'CE',
        'fold',
        'simulation_challenge',
        'peaks',
        'RETENTIONTIME',
        'CCS',
        'Precursor_type',
        'PPM_num',
        'ppm_peak_tolerance',
        'lib',
        'origin'
    ]
    
    # Check if all desired columns are present
    missing_columns = set(desired_order) - set(df.columns)
    if missing_columns:
        print(f"Warning: The following expected columns are missing and will be added as NaN: {missing_columns}")
        for col in missing_columns:
            df[col] = np.nan
    
    # Reorder columns, ignoring any extra columns
    df = df.reindex(columns=desired_order)
    
    return df


In [6]:
df_prepared = prepare_dataframe_for_fiora(df)



In [33]:
df_prepared["Ionization"] = 'positive'
df_prepared["RETENTIONTIME"] = 0
df_prepared["CCS"] = 0

In [8]:
df_prepared.head()

Unnamed: 0,Name,SMILES,InChiKey,Formula,PrecursorFormula,ParentMass,PrecursorMZ,Ionization,Instrument_type,CE,fold,simulation_challenge,peaks,RETENTIONTIME,CCS,Precursor_type,PPM_num,ppm_peak_tolerance,lib,origin
0,MassSpecGymID0000001,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,positive,Orbitrap,30.0,train,True,"{'mz': [91.0542, 125.0233, 154.0499, 155.0577,...",,,[M+H]+,50,5e-05,MassSpecGym,MassSpecGym
1,MassSpecGymID0000002,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,positive,Orbitrap,20.0,train,True,"{'mz': [91.0542, 125.0233, 155.0577, 185.0961,...",,,[M+H]+,50,5e-05,MassSpecGym,MassSpecGym
2,MassSpecGymID0000003,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,positive,Orbitrap,40.0,train,True,"{'mz': [69.0343, 91.0542, 125.0233, 127.039, 1...",,,[M+H]+,50,5e-05,MassSpecGym,MassSpecGym
3,MassSpecGymID0000004,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,positive,Orbitrap,55.0,train,True,"{'mz': [69.0343, 91.0542, 110.06, 111.0441, 11...",,,[M+H]+,50,5e-05,MassSpecGym,MassSpecGym
4,MassSpecGymID0000005,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,positive,Orbitrap,10.0,train,True,"{'mz': [91.0542, 125.0233, 185.0961, 229.0859,...",,,[M+H]+,50,5e-05,MassSpecGym,MassSpecGym


In [16]:
df = df_prepared
df = df.iloc[5000:6000].copy()

In [30]:
df.columns

Index(['Name', 'SMILES', 'InChiKey', 'Formula', 'PrecursorFormula',
       'ParentMass', 'PrecursorMZ', 'Ionization', 'Instrument_type', 'CE',
       'fold', 'simulation_challenge', 'peaks', 'RETENTIONTIME', 'CCS',
       'Precursor_type', 'PPM_num', 'ppm_peak_tolerance', 'lib', 'origin',
       'Metabolite'],
      dtype='object')

### Data Filtering

In [31]:
metadata_key_map = {
                "name": "Name",
                "collision_energy":  "CE", 
                "instrument": "Instrument_type",
                "ionization": "Ionization",
                "precursor_mz": "PrecursorMZ",
                "precursor_mode": "Precursor_type",
                "retention_time": "RETENTIONTIME",
                "ccs": "CCS"
                }


In [34]:
from fiora.MOL.Metabolite import Metabolite
from fiora.GNN.AtomFeatureEncoder import AtomFeatureEncoder
from fiora.GNN.BondFeatureEncoder import BondFeatureEncoder
from fiora.GNN.SetupFeatureEncoder import SetupFeatureEncoder
#
filter_spectra = True
CE_upper_limit = 100.0
weight_upper_limit = 1000.0


# if test_run:
#     df = df.iloc[5000:6000,:]
#     #df = df.iloc[5000:20000,:]



df["Metabolite"] = df["SMILES"].apply(Metabolite)
df["Metabolite"].apply(lambda x: x.create_molecular_structure_graph())

node_encoder = AtomFeatureEncoder(feature_list=["symbol", "num_hydrogen", "ring_type"])
bond_encoder = BondFeatureEncoder(feature_list=["bond_type", "ring_type"])
setup_encoder = SetupFeatureEncoder(feature_list=["collision_energy", "molecular_weight", "precursor_mode", "instrument"])
rt_encoder = SetupFeatureEncoder(feature_list=["molecular_weight", "precursor_mode", "instrument"])

if filter_spectra:
    setup_encoder.normalize_features["collision_energy"]["max"] = CE_upper_limit 
    setup_encoder.normalize_features["molecular_weight"]["max"] = weight_upper_limit 
    rt_encoder.normalize_features["molecular_weight"]["max"] = weight_upper_limit 
df["Metabolite"].apply(lambda x: x.compute_graph_attributes(node_encoder, bond_encoder))

df["summary"] = df.apply(lambda x: {key: x[name] for key, name in metadata_key_map.items()}, axis=1)
df.apply(lambda x: x["Metabolite"].add_metadata(x["summary"], setup_encoder, rt_encoder), axis=1)

if filter_spectra:
    num_ori = df.shape[0]
    correct_energy = df["Metabolite"].apply(lambda x: x.metadata["collision_energy"] <= CE_upper_limit and x.metadata["collision_energy"] > 1) 
    df = df[correct_energy]
    correct_weight = df["Metabolite"].apply(lambda x: x.metadata["molecular_weight"] <= weight_upper_limit)
    df = df[correct_weight]    
    print(f"Filtering spectra ({num_ori}) down to {df.shape[0]}")
    print(df["Precursor_type"].value_counts())


TypeError: unsupported operand type(s) for -: 'str' and 'int'

In [19]:
df.head()

Unnamed: 0,Name,SMILES,InChiKey,Formula,PrecursorFormula,ParentMass,PrecursorMZ,Ionization,Instrument_type,CE,...,simulation_challenge,peaks,RETENTIONTIME,CCS,Precursor_type,PPM_num,ppm_peak_tolerance,lib,origin,Metabolite
5000,MassSpecGymID0006183,CCCCCCOC(=O)C1=CC(=C(C=C1)C(=O)OCCCCCC)C(=O)OC...,MXHBQKVKHGQWRB,C27H42O6,C27H43O6,462.298124,463.3054,positive,QTOF,5.0,...,True,"{'mz': [85.1005, 149.0444, 193.0135, 223.0656,...",,,[M+H]+,50,5e-05,MassSpecGym,MassSpecGym,<Metabolite: CCCCCCOC(=O)C1=CC(=C(C=C1)C(=O)OC...
5001,MassSpecGymID0006184,CCCCCCOC(=O)C1=CC(=C(C=C1)C(=O)OCCCCCC)C(=O)OC...,MXHBQKVKHGQWRB,C27H42O6,C27H43O6,462.298124,463.3054,positive,QTOF,35.0,...,True,"{'mz': [57.0708, 81.0328, 109.029, 137.0224, 1...",,,[M+H]+,50,5e-05,MassSpecGym,MassSpecGym,<Metabolite: CCCCCCOC(=O)C1=CC(=C(C=C1)C(=O)OC...
5002,MassSpecGymID0006185,CCCCCCOC(=O)C1=CC(=C(C=C1)C(=O)OCCCCCC)C(=O)OC...,MXHBQKVKHGQWRB,C27H42O6,C27H43O6,462.298124,463.3054,positive,QTOF,20.0,...,True,"{'mz': [57.07, 109.0287, 149.0437, 193.0123, 2...",,,[M+H]+,50,5e-05,MassSpecGym,MassSpecGym,<Metabolite: CCCCCCOC(=O)C1=CC(=C(C=C1)C(=O)OC...
5003,MassSpecGymID0006186,CCCCCCOC(=O)C1=CC(=C(C=C1)C(=O)OCCCCCC)C(=O)OC...,MXHBQKVKHGQWRB,C27H42O6,C27H43O6,462.298124,463.3054,positive,QTOF,45.0,...,True,"{'mz': [55.0183, 57.0704, 69.0708, 81.0342, 10...",,,[M+H]+,50,5e-05,MassSpecGym,MassSpecGym,<Metabolite: CCCCCCOC(=O)C1=CC(=C(C=C1)C(=O)OC...
5004,MassSpecGymID0006187,CCCCCCOC(=O)C1=CC(=C(C=C1)C(=O)OCCCCCC)C(=O)OC...,MXHBQKVKHGQWRB,C27H42O6,C27H43O6,462.298124,463.3054,positive,QTOF,10.0,...,True,"{'mz': [85.1018, 137.0219, 193.0123, 207.0292,...",,,[M+H]+,50,5e-05,MassSpecGym,MassSpecGym,<Metabolite: CCCCCCOC(=O)C1=CC(=C(C=C1)C(=O)OC...


In [20]:
df.columns

Index(['Name', 'SMILES', 'InChiKey', 'Formula', 'PrecursorFormula',
       'ParentMass', 'PrecursorMZ', 'Ionization', 'Instrument_type', 'CE',
       'fold', 'simulation_challenge', 'peaks', 'RETENTIONTIME', 'CCS',
       'Precursor_type', 'PPM_num', 'ppm_peak_tolerance', 'lib', 'origin',
       'Metabolite'],
      dtype='object')

In [21]:
df["Metabolite"] = df["SMILES"].apply(Metabolite)
df["Metabolite"].apply(lambda x: x.create_molecular_structure_graph())

In [25]:
df["Metabolite"].apply(lambda x: x.create_molecular_structure_graph())

5000    None
5001    None
5002    None
5003    None
5004    None
        ... 
5995    None
5996    None
5997    None
5998    None
5999    None
Name: Metabolite, Length: 1000, dtype: object

In [28]:
type(df["Metabolite"].iloc[0])

fiora.MOL.Metabolite.Metabolite