*MeNu GUIDE*

# Preprocessing FooDB Data

In [1]:
import pandas as pd
import re

## Load relevant data tables

In [None]:
# Input and output locations used throughout the notebook. File names are
# appended directly to these strings, so both must end with a slash.
processed_data_folder = '/path/to/processed/data/folder/'
foodb_folder = '/path/to/downloaded/FooDB/data/'

In [2]:
# Columns of Compound.csv that are read with pandas' nullable 'string' dtype.
compound_string_columns = [
    'moldb_iupac', 'state', 'annotation_quality', 'description',
    'kingdom', 'superklass', 'klass', 'subklass',
]

# Compound table
foodb_compounds = pd.read_csv(
    f"{foodb_folder}Compound.csv",
    dtype=dict.fromkeys(compound_string_columns, 'string'),
)

# External identifiers, linked to the compounds through their compound_id
foodb_descriptors = pd.read_csv(f"{foodb_folder}CompoundExternalDescriptor.csv")

## Process compounds table

### Drop unnecessary columns

In [4]:
# Remove the two columns that are not used in any later step.
columns_to_drop = ['moldb_iupac', 'state']
foodb_compounds = foodb_compounds.drop(columns_to_drop, axis='columns')
foodb_compounds.head()

Unnamed: 0,id,public_id,name,annotation_quality,description,cas_number,moldb_inchikey,moldb_inchi,moldb_smiles,moldb_mono_mass,kingdom,superklass,klass,subklass
0,4,FDB000004,Cyanidin 3-(6''-acetyl-galactoside),Constituent of the leaves of Nymphaea alba [CC...,350602-26-5,[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...,InChI=1S/C23H22O12/c1-9(24)32-8-18-19(29)20(30...,491.118951,HBXXDBKJLPLXPR-DLBZZEGUSA-O,"3-{[(2S,3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4...",Organic compounds,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides
1,13,FDB000013,Cyanidin 3-(6''-succinyl-glucoside),Constituent of Phragmites australis [CCD]. Cya...,216692-08-9,[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...,InChI=1S/C25H24O14/c26-11-6-14(28)12-8-17(24(3...,549.124431,MIYGQTFETYBMKF-WVXUANQFSA-O,"7-methoxy-2,2-dimethyl-2H-chromene-6-carboxyli...",Organic compounds,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides
2,14,FDB000014,Pelargonidin 3-(6''-succinyl-glucoside),Pelargonidin 3-(6''-succinyl-glucoside) is a m...,,[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...,InChI=1S/C25H24O13/c26-12-3-1-11(2-4-12)24-17(...,533.129516,UBUSYXLSGMWUJJ-WVXUANQFSA-O,"3-{[(2S,3R,4S,5S,6R)-6-{[(3-carboxypropanoyl)o...",,,,
3,24,FDB000024,Petunidin 3-O-(6''-acetyl-galactoside),Petunidin 3-o-(6''-acetyl-galactoside) is a me...,,[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...,InChI=1S/C24H24O13/c1-9(25)34-8-18-20(30)21(31...,521.129516,GPUBWXUQPURXOQ-SKKXNPCDSA-O,"3-{[(3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4,5-...",,,,
4,25,FDB000025,Peonidin 3-(6''-acetyl-galactoside),Peonidin 3-(6''-acetyl-galactoside) is a membe...,75-07-0,[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...,InChI=1S/C24H24O12/c1-10(25)33-9-19-20(29)21(3...,505.134601,MBSKDCPWFSMEFD-ZKVZURMCSA-O,"3-{[(3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4,5-...",,,,


### Rename misnamed columns

In [5]:
# The column headers in the downloadable FooDB file do not line up with the
# values stored beneath them, so each header is mapped to the name that matches
# its actual content. The *klass columns are renamed to the "class" spelling.
column_renames = {
    'annotation_quality': 'description',
    'description': 'cas_number',
    'cas_number': 'smiles',
    'moldb_inchikey': 'inchi',
    'moldb_inchi': 'mono_mass',
    'moldb_smiles': 'inchikey',
    'moldb_mono_mass': 'iupac',
    'superklass': 'superclass',
    'klass': 'class',
    'subklass': 'subclass',
}

# Guard on a header that only exists before the rename. The mapping is a chain
# (annotation_quality -> description -> cas_number -> smiles), so running this
# cell a second time would otherwise silently shift the already-corrected
# names once more.
if 'annotation_quality' in foodb_compounds.columns:
    foodb_compounds = foodb_compounds.rename(columns=column_renames)
foodb_compounds.head()

Unnamed: 0,id,public_id,name,description,cas_number,smiles,inchi,mono_mass,inchikey,iupac,kingdom,superclass,class,subclass
0,4,FDB000004,Cyanidin 3-(6''-acetyl-galactoside),Constituent of the leaves of Nymphaea alba [CC...,350602-26-5,[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...,InChI=1S/C23H22O12/c1-9(24)32-8-18-19(29)20(30...,491.118951,HBXXDBKJLPLXPR-DLBZZEGUSA-O,"3-{[(2S,3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4...",Organic compounds,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides
1,13,FDB000013,Cyanidin 3-(6''-succinyl-glucoside),Constituent of Phragmites australis [CCD]. Cya...,216692-08-9,[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...,InChI=1S/C25H24O14/c26-11-6-14(28)12-8-17(24(3...,549.124431,MIYGQTFETYBMKF-WVXUANQFSA-O,"7-methoxy-2,2-dimethyl-2H-chromene-6-carboxyli...",Organic compounds,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides
2,14,FDB000014,Pelargonidin 3-(6''-succinyl-glucoside),Pelargonidin 3-(6''-succinyl-glucoside) is a m...,,[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...,InChI=1S/C25H24O13/c26-12-3-1-11(2-4-12)24-17(...,533.129516,UBUSYXLSGMWUJJ-WVXUANQFSA-O,"3-{[(2S,3R,4S,5S,6R)-6-{[(3-carboxypropanoyl)o...",,,,
3,24,FDB000024,Petunidin 3-O-(6''-acetyl-galactoside),Petunidin 3-o-(6''-acetyl-galactoside) is a me...,,[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...,InChI=1S/C24H24O13/c1-9(25)34-8-18-20(30)21(31...,521.129516,GPUBWXUQPURXOQ-SKKXNPCDSA-O,"3-{[(3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4,5-...",,,,
4,25,FDB000025,Peonidin 3-(6''-acetyl-galactoside),Peonidin 3-(6''-acetyl-galactoside) is a membe...,75-07-0,[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...,InChI=1S/C24H24O12/c1-10(25)33-9-19-20(29)21(3...,505.134601,MBSKDCPWFSMEFD-ZKVZURMCSA-O,"3-{[(3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4,5-...",,,,


## Process external descriptor table

### Add database origin to external identifiers

To match the external descriptors to the compounds table, the descriptor table has to be reshaped slightly: it should contain exactly one row per compound, with the compound's various identifiers (ChEBI, LipidMaps, etc.) as columns.

In [6]:
def match_descriptor_to_database(descriptor:str) -> str:
    """Classify an external identifier by the database it comes from.

    The decision is based purely on the textual shape of the identifier.

    Args:
        descriptor: The external identifier, e.g. ``'CHEBI:6584'``.

    Returns:
        ``'chebi'``, ``'lipid_maps'``, ``'meta_cyc'`` or ``'kegg'`` when the
        identifier matches the respective pattern, otherwise ``'synonym'``.
    """
    # LIPID MAPS identifiers start with "LM" followed by the two-letter code of
    # one of the eight lipid categories. 'LMSL' (saccharolipids) is included so
    # that those identifiers are not misfiled as synonyms.
    lipid_maps_prefixes = ('LMFA', 'LMGL', 'LMGP', 'LMSP', 'LMST', 'LMPR', 'LMSL', 'LMPK')

    if descriptor.startswith('CHEBI'):
        return 'chebi'
    elif descriptor.startswith(lipid_maps_prefixes):
        return 'lipid_maps'
    elif descriptor.startswith('CPD'):
        return 'meta_cyc'
    # KEGG compound identifiers: a capital C followed by exactly five digits
    elif re.match(r'^C[0-9]{5}$', descriptor):
        return 'kegg'
    else:
        # Anything that matches none of the patterns above
        return 'synonym'

In [7]:
# Tag every external identifier with the database it belongs to.
descriptor_ids = foodb_descriptors['external_id']
foodb_descriptors['database'] = descriptor_ids.apply(match_descriptor_to_database)

In [8]:
# Keep only the three columns that the pivot step needs.
pivot_input_columns = ['external_id', 'compound_id', 'database']
foodb_descriptors = foodb_descriptors[pivot_input_columns]
foodb_descriptors.head()

Unnamed: 0,external_id,compound_id,database
0,CHEBI:6584,78,chebi
1,C08652,78,kegg
2,CPD-11945,78,meta_cyc
3,LMPK12120534,87,lipid_maps
4,CHEBI:76132,97,chebi


### Deduplicate entries

In [9]:
# The pivot needs each (compound_id, database) pair to occur only once, so
# list the rows that repeat an earlier pair.
is_repeated_pair = foodb_descriptors.duplicated(subset=['compound_id', 'database'])
foodb_descriptors[is_repeated_pair]

Unnamed: 0,external_id,compound_id,database
319,CHEBI:62084,1161,chebi
1061,CHEBI:19092,8291,chebi


In [10]:
# Show all descriptor rows of the two compounds that have duplicated entries.
affected_compound_ids = [1161, 8291]
foodb_descriptors[foodb_descriptors['compound_id'].isin(affected_compound_ids)]

Unnamed: 0,external_id,compound_id,database
318,CHEBI:47962,1161,chebi
319,CHEBI:62084,1161,chebi
1060,CHEBI:33135,8291,chebi
1061,CHEBI:19092,8291,chebi


In [11]:
# Look up the same two ids in the compounds table.
affected_compound_ids = [1161, 8291]
foodb_compounds[foodb_compounds['id'].isin(affected_compound_ids)]

Unnamed: 0,id,public_id,name,description,cas_number,smiles,inchi,mono_mass,inchikey,iupac,kingdom,superclass,class,subclass
626,1161,FDB001161,D-Galacturonic acid,obtained from the hydrolysis prods. of polymer...,14982-50-4,OC(C=O)C(O)C(O)C(O)C(O)=O.OC(C1OC(O)C(O)C1O)C(...,InChI=1S/3C6H10O7/c7-1-2(8)6(12)13-4(1)3(9)5(1...,582.127958,TYNHVGLYUDYTJD-UHFFFAOYSA-N,"2,3,4,5-tetrahydroxy-6-oxohexanoic acid; 2-hyd...",Organic compounds,Organooxygen compounds,Carbohydrates and carbohydrate conjugates,Sugar acids and derivatives


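Surprisingly, there is no entry for compound_id 8291 in the compounds table, so its descriptors will not appear in the merged result regardless of which duplicate is kept.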

After a closer look at the ChEBI website and the entries for compound 1161 (D-Galacturonic acid), we deduplicate the table by keeping the first entry per compound and database, as CHEBI:47962 seems to be the correct one.

In [12]:
# Retain the first row of every (compound_id, database) pair; keep='first' is
# the pandas default and is only spelled out here for clarity.
foodb_descriptors_deduplicated = foodb_descriptors.drop_duplicates(
    subset=['compound_id', 'database'],
    keep='first',
)

### Pivot table in preparation for merge

In [13]:
# Reshape from long to wide: one row per compound_id, one column per database,
# with the external identifier as the cell value.
foodb_descriptors_deduplicated_pivot = foodb_descriptors_deduplicated.pivot(
    values='external_id',
    columns='database',
    index='compound_id',
)
foodb_descriptors_deduplicated_pivot

database,chebi,kegg,lipid_maps,meta_cyc,synonym
compound_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
78,CHEBI:6584,C08652,,CPD-11945,
87,,,LMPK12120534,,
97,CHEBI:76132,,LMPK12020089,,
176,CHEBI:18010,C01265,LMPK12112731,,345-TRIHYDROXY-37-DIMETHOXYFLAVONE
218,,,LMPK12113021,,
...,...,...,...,...,...
29778,CHEBI:78759,C13914,LMSP01070001,,
29781,CHEBI:71465,,LMFA08040043,,
29974,CHEBI:50399,C14313,LMPK12050055,,
29977,CHEBI:69437,C10510,LMPK12050251,,


In [14]:
# Turn the compound_id index back into a regular column and name it 'id', the
# key that the merge with the compounds table joins on.
foodb_descriptors_deduplicated_pivot = (
    foodb_descriptors_deduplicated_pivot
    .reset_index()
    .rename(columns={'compound_id': 'id'})
)
foodb_descriptors_deduplicated_pivot

database,id,chebi,kegg,lipid_maps,meta_cyc,synonym
0,78,CHEBI:6584,C08652,,CPD-11945,
1,87,,,LMPK12120534,,
2,97,CHEBI:76132,,LMPK12020089,,
3,176,CHEBI:18010,C01265,LMPK12112731,,345-TRIHYDROXY-37-DIMETHOXYFLAVONE
4,218,,,LMPK12113021,,
...,...,...,...,...,...,...
2548,29778,CHEBI:78759,C13914,LMSP01070001,,
2549,29781,CHEBI:71465,,LMFA08040043,,
2550,29974,CHEBI:50399,C14313,LMPK12050055,,
2551,29977,CHEBI:69437,C10510,LMPK12050251,,


## Merging of dataframes

In [15]:
# Left join on 'id': every compound row is kept, and compounds without any
# external descriptor get missing values in the identifier columns.
foodb_compounds_merged = pd.merge(
    foodb_compounds,
    foodb_descriptors_deduplicated_pivot,
    how='left',
    on='id',
)
foodb_compounds_merged

Unnamed: 0,id,public_id,name,description,cas_number,smiles,inchi,mono_mass,inchikey,iupac,kingdom,superclass,class,subclass,chebi,kegg,lipid_maps,meta_cyc,synonym
0,4,FDB000004,Cyanidin 3-(6''-acetyl-galactoside),Constituent of the leaves of Nymphaea alba [CC...,350602-26-5,[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...,InChI=1S/C23H22O12/c1-9(24)32-8-18-19(29)20(30...,491.118951,HBXXDBKJLPLXPR-DLBZZEGUSA-O,"3-{[(2S,3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4...",Organic compounds,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,
1,13,FDB000013,Cyanidin 3-(6''-succinyl-glucoside),Constituent of Phragmites australis [CCD]. Cya...,216692-08-9,[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...,InChI=1S/C25H24O14/c26-11-6-14(28)12-8-17(24(3...,549.124431,MIYGQTFETYBMKF-WVXUANQFSA-O,"7-methoxy-2,2-dimethyl-2H-chromene-6-carboxyli...",Organic compounds,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,
2,14,FDB000014,Pelargonidin 3-(6''-succinyl-glucoside),Pelargonidin 3-(6''-succinyl-glucoside) is a m...,,[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...,InChI=1S/C25H24O13/c26-12-3-1-11(2-4-12)24-17(...,533.129516,UBUSYXLSGMWUJJ-WVXUANQFSA-O,"3-{[(2S,3R,4S,5S,6R)-6-{[(3-carboxypropanoyl)o...",,,,,,,,,
3,24,FDB000024,Petunidin 3-O-(6''-acetyl-galactoside),Petunidin 3-o-(6''-acetyl-galactoside) is a me...,,[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...,InChI=1S/C24H24O13/c1-9(25)34-8-18-20(30)21(31...,521.129516,GPUBWXUQPURXOQ-SKKXNPCDSA-O,"3-{[(3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4,5-...",,,,,,,,,
4,25,FDB000025,Peonidin 3-(6''-acetyl-galactoside),Peonidin 3-(6''-acetyl-galactoside) is a membe...,75-07-0,[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...,InChI=1S/C24H24O12/c1-10(25)33-9-19-20(29)21(3...,505.134601,MBSKDCPWFSMEFD-ZKVZURMCSA-O,"3-{[(3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4,5-...",,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70472,139984,FDB112151,gamma-Glutamylthreonine,,,C[C@@H](O)[C@H](NC(=O)CC[C@H](N)C(O)=O)C(O)=O,InChI=1S/C9H16N2O6/c1-4(12)7(9(16)17)11-6(13)3...,248.100836,GWNXFCYUJXASDX-ZDLURKLDSA-N,"(2S)-2-amino-4-{[(1S,2R)-1-carboxy-2-hydroxypr...",,,,,,,,,
70473,139985,FDB112152,gamma-Glutamyltryptophan,,,N[C@@H](CCC(=O)N[C@@H](CC1=CNC2=CC=CC=C12)C(O)...,InChI=1S/C16H19N3O5/c17-11(15(21)22)5-6-14(20)...,333.132471,CATMPQFFVNKDEY-AAEUAGOBSA-N,(2S)-2-amino-4-{[(1S)-1-carboxy-2-(1H-indol-3-...,,,,,,,,,
70474,139986,FDB112153,TG(i-16:0/18:0/10:0),,,[H][C@@](COC(=O)CCCCCCCCC)(COC(=O)CCCCCCCCCCCC...,InChI=1S/C47H90O6/c1-5-7-9-11-13-14-15-16-17-1...,750.673741,HUSHOYQNDOSYMV-USYZEHPZSA-N,(2R)-1-(decanoyloxy)-3-[(14-methylpentadecanoy...,,,,,,,,,
70475,139987,FDB112154,TG(a-17:0/10:0/8:0)[rac],,,[H][C@@](COC(=O)CCCCCCC)(COC(=O)CCCCCCCCCCCCC(...,InChI=1S/C38H72O6/c1-5-8-10-12-17-23-27-31-38(...,624.532890,IBWSCATYYLCICJ-ICBMVRCQSA-N,(2R)-2-(decanoyloxy)-3-(octanoyloxy)propyl 14-...,,,,,,,,,


In [16]:
# Summarise dtypes and non-null counts of the merged table
foodb_compounds_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70477 entries, 0 to 70476
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70477 non-null  int64  
 1   public_id    70477 non-null  object 
 2   name         70477 non-null  object 
 3   description  23650 non-null  string 
 4   cas_number   15229 non-null  string 
 5   smiles       70413 non-null  object 
 6   inchi        70415 non-null  object 
 7   mono_mass    70409 non-null  float64
 8   inchikey     70415 non-null  object 
 9   iupac        45231 non-null  object 
 10  kingdom      4384 non-null   string 
 11  superclass   4384 non-null   string 
 12  class        4350 non-null   string 
 13  subclass     3901 non-null   string 
 14  chebi        1574 non-null   object 
 15  kegg         546 non-null    object 
 16  lipid_maps   962 non-null    object 
 17  meta_cyc     215 non-null    object 
 18  synonym      201 non-null    object 
dtypes: f

In [17]:
# Preview the first five rows of the merged table
foodb_compounds_merged.head(n=5)

Unnamed: 0,id,public_id,name,description,cas_number,smiles,inchi,mono_mass,inchikey,iupac,kingdom,superclass,class,subclass,chebi,kegg,lipid_maps,meta_cyc,synonym
0,4,FDB000004,Cyanidin 3-(6''-acetyl-galactoside),Constituent of the leaves of Nymphaea alba [CC...,350602-26-5,[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...,InChI=1S/C23H22O12/c1-9(24)32-8-18-19(29)20(30...,491.118951,HBXXDBKJLPLXPR-DLBZZEGUSA-O,"3-{[(2S,3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4...",Organic compounds,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,
1,13,FDB000013,Cyanidin 3-(6''-succinyl-glucoside),Constituent of Phragmites australis [CCD]. Cya...,216692-08-9,[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...,InChI=1S/C25H24O14/c26-11-6-14(28)12-8-17(24(3...,549.124431,MIYGQTFETYBMKF-WVXUANQFSA-O,"7-methoxy-2,2-dimethyl-2H-chromene-6-carboxyli...",Organic compounds,Phenylpropanoids and polyketides,Flavonoids,Flavonoid glycosides,,,,,
2,14,FDB000014,Pelargonidin 3-(6''-succinyl-glucoside),Pelargonidin 3-(6''-succinyl-glucoside) is a m...,,[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...,InChI=1S/C25H24O13/c26-12-3-1-11(2-4-12)24-17(...,533.129516,UBUSYXLSGMWUJJ-WVXUANQFSA-O,"3-{[(2S,3R,4S,5S,6R)-6-{[(3-carboxypropanoyl)o...",,,,,,,,,
3,24,FDB000024,Petunidin 3-O-(6''-acetyl-galactoside),Petunidin 3-o-(6''-acetyl-galactoside) is a me...,,[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...,InChI=1S/C24H24O13/c1-9(25)34-8-18-20(30)21(31...,521.129516,GPUBWXUQPURXOQ-SKKXNPCDSA-O,"3-{[(3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4,5-...",,,,,,,,,
4,25,FDB000025,Peonidin 3-(6''-acetyl-galactoside),Peonidin 3-(6''-acetyl-galactoside) is a membe...,75-07-0,[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...,InChI=1S/C24H24O12/c1-10(25)33-9-19-20(29)21(3...,505.134601,MBSKDCPWFSMEFD-ZKVZURMCSA-O,"3-{[(3R,4S,5R,6R)-6-[(acetyloxy)methyl]-3,4,5-...",,,,,,,,,


In [18]:
# Write the merged table to the processed-data folder, without the row index
merged_output_path = f'{processed_data_folder}foodb_compounds_with_external_descriptors.csv'
foodb_compounds_merged.to_csv(merged_output_path, index=False)