In [20]:
import os
import json
import argparse
import pandas as pd
import requests

from pygoslin.parser.Parser import LipidParser
from pygoslin.domain.LipidLevel import LipidLevel

In [None]:
def parse_lipid_maps_sdf_database(file_path) -> pd.DataFrame:
    """Extract identifiers and names from a LIPID MAPS SDF file.

    The file is scanned line by line with a small state machine:

    * The first line of the file, and the first line after every ``$$$$``
      record terminator, is stored as the record's ``ID``.
    * When a line starts with one of the recognised data headers
      (``> <NAME>``, ``> <HMDB_ID>``, ``> <PUBCHEM_CID>``, ``> <KEGG_ID>``,
      ``> <CHEBI_ID>``, ``> <LIPIDBANK_ID>``), the line that follows it is
      stored as that field's value with trailing whitespace removed.
    * A line starting with ``$$$$`` closes the record. Its ``NAME`` is passed
      to ``LipidParser.parse`` and the resulting ``get_lipid_string()`` is
      stored as ``GOSLIN_NAME``; if parsing raises, ``GOSLIN_NAME`` is ``None``.

    Fields that never appear in a record are left as the empty string. Data
    that is not followed by a ``$$$$`` line is not emitted as a record.

    Parameters
    ----------
    file_path : str or path-like
        Location of the SDF file to read.

    Returns
    -------
    pandas.DataFrame
        One row per ``$$$$``-terminated record with the columns ``ID``,
        ``NAME``, ``GOSLIN_NAME``, ``HMDB_ID``, ``CHEBI_ID``, ``KEGG_ID``,
        ``LIPIDBANK_ID`` and ``PUBCHEM_CID`` (in that order).
    """
    # Data items copied out of each record; the header prefix includes the
    # closing '>' so that e.g. '> <NAME_X>' is not mistaken for '> <NAME>'.
    tagged_fields = ('NAME', 'HMDB_ID', 'PUBCHEM_CID', 'KEGG_ID', 'CHEBI_ID', 'LIPIDBANK_ID')
    header_to_field = {'> <' + field + '>': field for field in tagged_fields}
    output_columns = (
        'ID', 'NAME', 'GOSLIN_NAME', 'HMDB_ID',
        'CHEBI_ID', 'KEGG_ID', 'LIPIDBANK_ID', 'PUBCHEM_CID',
    )

    def new_record():
        # Every raw field defaults to '' so missing data items stay empty.
        return {field: '' for field in ('ID',) + tagged_fields}

    lipid_parser = LipidParser()
    records = []
    record = new_record()
    # Name of the field the *next* line belongs to, or None when the next
    # line should be inspected for a header / terminator instead.
    pending_field = 'ID'

    with open(file_path, 'r') as file:
        for line in file:
            if pending_field is not None:
                # A pending value always wins, even over '$$$$' or a header.
                record[pending_field] = line.rstrip()
                pending_field = None
            elif line.startswith('$$$$'):
                try:
                    record['GOSLIN_NAME'] = lipid_parser.parse(record['NAME']).get_lipid_string()
                except Exception:
                    # Best effort: names Goslin cannot parse are kept in the
                    # table but flagged with a missing GOSLIN_NAME.
                    record['GOSLIN_NAME'] = None
                records.append(record)
                record = new_record()
                pending_field = 'ID'
            else:
                for header, field in header_to_field.items():
                    if line.startswith(header):
                        pending_field = field
                        break

    return pd.DataFrame({
        column: [entry[column] for entry in records]
        for column in output_columns
    })

In [None]:
# Prerequisite: download structures.sdf from LIPID MAPS and place it at the path below.
lipidmaps_db = parse_lipid_maps_sdf_database(
    file_path="../data/lipidmaps/structures.sdf",
)

In [23]:
# Show only the entries whose name produced a Goslin name (GOSLIN_NAME is not missing).
lipidmaps_db.dropna(subset=['GOSLIN_NAME'])

Unnamed: 0,ID,NAME,GOSLIN_NAME,HMDB_ID,CHEBI_ID,KEGG_ID,LIPIDBANK_ID,PUBCHEM_CID
21,LMFA00000031,17-dimethylarsinoyl-9Z-heptadecenoic acid,FA 17:1(9Z);17MMAs,,185174,,,101857689
22,LMFA00000034,13-dimethylarsinoyl-tridecanoic acid,FA 13:0;13MMAs,,185707,,,101857686
24,LMFA00000039,19-dimethylarsinoyl-nonadecanoic acid,FA 19:0;19MMAs,,185432,,,101891710
38,LMFA00000030,9-dimethylarsinoyl-nonanoic acid,FA 9:0;9MMAs,,185761,,,138454165
39,LMFA00000032,16-dimethylarsinoyl-9Z-hexadecenoic acid,FA 16:1(9Z);16MMAs,,187398,,,101891711
...,...,...,...,...,...,...,...,...
41594,LMSL03001293,"AC2SGL(18:0/34:0(2Me[S],4Me[S],6Me[S],8Me[S],1...","AC2SGL 18:0/34:0;2Me,4Me,6Me,8Me,10Me,12Me,14M...",,,,,126457734
41595,LMSL03001294,"AC2SGL(18:0/34:0(2Me[S],4Me[S],6Me[S],8Me[S],1...","AC2SGL 18:0/34:0;2Me,4Me,6Me,8Me,10Me,12Me,14M...",,,,,126457735
41596,LMSL03001295,"AC2SGL(18:0/36:0(2Me[S],4Me[S],6Me[S],8Me[S],1...","AC2SGL 18:0/36:0;2Me,4Me,6Me,8Me,10Me,12Me,14M...",,,,,126457736
41597,LMSL03001296,"AC2SGL(18:0/36:0(2Me[S],4Me[S],6Me[S],8Me[S],1...","AC2SGL 18:0/36:0;2Me,4Me,6Me,8Me,10Me,12Me,14M...",,,,,126457737


In [None]:
# Write the entries that have a Goslin name to a tab-separated file, without the index column.
(
    lipidmaps_db
    .dropna(subset=['GOSLIN_NAME'])
    .to_csv("../data/lipidmaps/lipid_names.tsv", sep='\t', index=False)
)