#*MeNu GUIDE*

# Matching ChEBI
Now that HMDB, FooDB and MarkerDB are matched, we will continue with ChEBI. For this purpose we wanted to use the entries with the highest confidence, meaning the 3-star entries, but it seems like some other database entries also reference 2 star entries, therefore we decided to use all possible ChEBI compounds. The ChEBI data will have to be pre-processed, as there is information about KEGG and InCHI identifiers in two separate files: 'chebiId_inchi.tsv' and 'database_accession.tsv'. This will have to be merged with the 'compounds.tsv' file.

In [2]:
import os.path
import pandas as pd
import requests

In [3]:
chebi_folder = '/path/to/chebi/data/folder/'
processed_data_folder = "/path/to/processed/data/folder/"

In [4]:
chebi_compounds = pd.read_csv(os.path.join(chebi_folder, 'compounds.tsv'), sep='\t')
chebi_inchi = pd.read_csv(os.path.join(chebi_folder, 'chebiId_inchi.tsv'), sep='\t')
chebi_accession = pd.read_csv(os.path.join(chebi_folder, 'database_accession.tsv'), sep='\t')
chebi_structures = pd.read_csv(os.path.join(chebi_folder, 'structures.csv'))
chebi_chemical_data = pd.read_csv(os.path.join(chebi_folder, 'chemical_data.tsv'), sep='\t')

## Deduplicate external identifiers

In [20]:
chebi_accession_duplicated = chebi_accession[chebi_accession.duplicated(subset=['COMPOUND_ID', 'TYPE'])]
chebi_accession_duplicated

Unnamed: 0,ID,COMPOUND_ID,SOURCE,TYPE,ACCESSION_NUMBER
4513,15359,17020,KEGG COMPOUND,KEGG COMPOUND accession,C01810
6329,17766,18070,KEGG COMPOUND,KEGG COMPOUND accession,C02416
6463,12897,18241,KEGG COMPOUND,KEGG COMPOUND accession,C00676
6636,1850,37537,KEGG COMPOUND,KEGG COMPOUND accession,C05151
6637,1851,37537,KEGG COMPOUND,CAS Registry Number,16561-29-8
...,...,...,...,...,...
391708,1154710,19720,Europe PMC,PubMed citation,1429971
391709,1154711,19720,Europe PMC,PubMed citation,11375172
391710,1154712,19720,Patent,Patent accession,EP0116944
391711,1154713,19720,Europe PMC,PubMed citation,3104384


### Query KEGG to check which identifiers match
We will have to use the KEGG API to weed out the wrong IDs. Careful when executing the code, this takes quite a long time.

In [21]:
kegg_url = 'https://rest.kegg.jp/find/compound/'


def check_kegg(row):
    if row.TYPE == 'KEGG COMPOUND accession':
        print(row['ID'])
        query_url = f'{kegg_url}{row.ACCESSION_NUMBER}'
        response = requests.request("GET", query_url)
        response = response.text
        if response == '\n':
            return 'no match'
        else:
            response = response.split('\t')[1]
            if ';' in response:
                response = response.split(';')[0]
            else:
                response = response.split('\n')[0]
            return response


chebi_accession['kegg_api_match'] = chebi_accession.apply(check_kegg, axis=1)
chebi_accession.to_csv(os.path.join(chebi_folder ,'accession_ids_kegg_api_check_all_stars.csv'), index=False)

15233
15236
15238
15246
15250
15256
15260
15262
15269
15282
15286
15292
15296
15300
15304
15310
15318
1249
9
10
16
20
22
24
26
30
35
39
43
44
46
48
52
54
58
63
67
73
76
78
84
89
93
95
101
103
109
111
113
118
125
126
129
135
142
144
146
151
155
159
161
167
170
172
180
183
187
191
194
199
208
219
223
227
235
239
243
245
254
262
264
268
275
280
283
303
307
310
316
320
326
327
330
332
335
337
345
347
352
355
359
367
373
375
380
406
408
412
414
416
418
420
423
425
435
1463
1466
1475
1496
1504
1511
1516
1519
1522
1536
1538
1542
1544
1548
1549
1555
1556
1565
1568
1569
1579
1585
1587
1589
1597
1600
1603
1606
1614
1623
1626
1629
1645
1646
1650
1659
1663
1671
1673
1688
1694
1707
1709
1720
1722
1724
1729
1749
1754
1760
1761
1765
1782
1784
1789
1790
1792
1802
1804
1814
1818
1835
1838
1842
1845
1848
1856
1858
1860
1865
1867
1877
1879
1883
5200
5203
5207
5211
5222
5224
5227
5231
5235
5236
5241
5246
5263
5276
5278
5284
5292
5298
5306
5314
5327
5331
5334
5341
5346
5348
5361
5365
5374
5376
5381
5386
53

In [26]:
chebi_accession = chebi_accession[chebi_accession.kegg_api_match != 'no match']
chebi_accession = chebi_accession[chebi_accession.TYPE != 'PubMed citation']

In [40]:
chebi_accession_deduplicated = chebi_accession.drop_duplicates(subset=['COMPOUND_ID', 'TYPE', 'ACCESSION_NUMBER'])

In [41]:
chebi_accession_deduplicated[chebi_accession_deduplicated.duplicated(subset=['COMPOUND_ID', 'TYPE'])]

Unnamed: 0,ID,COMPOUND_ID,SOURCE,TYPE,ACCESSION_NUMBER,kegg_api_match
4513,15359,17020,KEGG COMPOUND,KEGG COMPOUND accession,C01810,Glucomannan
9267,32249,33704,KEGG COMPOUND,KEGG COMPOUND accession,C05167,alpha-Amino acid
11999,65386,15405,ChemIDplus,CAS Registry Number,512-13-0,
12214,32242,35411,KEGG COMPOUND,KEGG COMPOUND accession,C02896,"alpha,omega-Diamine"
13164,72974,16761,ChemIDplus,CAS Registry Number,58-64-0,
...,...,...,...,...,...,...
391193,1154164,6439,Patent,Patent accession,JP2008094780,
391607,1154601,231353,Europe PMC,PubMed Central citation,PMC4218005,
391635,1154629,231356,Europe PMC,PubMed Central citation,PMC6421189,
391636,1154630,231356,Europe PMC,PubMed Central citation,PMC6421168,


In [42]:
chebi_accession_deduplicated = chebi_accession.drop_duplicates(subset=['COMPOUND_ID', 'TYPE'], keep='last')

In [43]:
## Pivot Accessions Table
chebi_descriptors_pivot = chebi_accession_deduplicated.pivot(index='COMPOUND_ID', columns='TYPE', values='ACCESSION_NUMBER')
chebi_descriptors_pivot.to_csv(os.path.join(chebi_folder, 'accession_ids_deduplicated_pivot_all_stars.csv'))
chebi_descriptors_pivot = chebi_descriptors_pivot.reset_index()

In [5]:
chebi_descriptors_pivot = pd.read_csv(os.path.join(chebi_folder,'accession_ids_deduplicated_pivot_all_stars.csv'))
chebi_descriptors_pivot

  chebi_descriptors_pivot = pd.read_csv('/Users/vivy/PycharmProjects/MeNuGUIDE/databases/raw/ChEBI/data_processed/accession_ids_deduplicated_pivot_all_stars.csv')


Unnamed: 0,COMPOUND_ID,Agricola citation,BPDB accession,Beilstein Registry Number,CAS Registry Number,COMe accession,ChemIDplus accession,Chemspider accession,Chinese Abstracts citation,CiteXplore citation,...,PubMed Central citation,Pubchem accession,RESID accession,Reaxys Registry Number,SMID accession,UM-BBD compID,VSDB accession,WebElements accession,Wikipedia accession,YMDB accession
0,7,,,4229885.0,498-15-7,,,,,,...,,,,1902767.0,,,,,,
1,8,,,5257045.0,88642-92-6,,,,,,...,,,,4671990.0,,,,,,
2,9,,,,524-46-9,,,,,,...,,,,,,,,,,
3,10,,,,21008-67-3,,,,,,...,,,,,,,,,,
4,11,,,,477-60-1,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150812,691037,,,2228097.0,382-67-2,,,,,,...,,,,,,,,,Desoximetasone,
150813,691622,IND20602807,,,5415-44-1,,,71754.0,,,...,PMC6818814,,,20503.0,,,,,"1,3,7-Trimethyluric_acid",
150814,724125,,,,33320-16-0,,,,,,...,,,,5921212.0,,,,,Methyl_Aminolevulinate,
150815,741548,,,,516-05-2,,,,,,...,,,,774334.0,,,,,,


## Reformat chemical data table

In [6]:
chebi_chemical_data.TYPE.unique()

array(['FORMULA', 'MASS', 'CHARGE', 'MONOISOTOPIC MASS'], dtype=object)

In [7]:
chebi_chemical_data = chebi_chemical_data[chebi_chemical_data.TYPE.isin(['FORMULA', 'MONOISOTOPIC MASS'])]
chebi_chemical_data = chebi_chemical_data[chebi_chemical_data.SOURCE == 'ChEBI']

In [8]:
chebi_chemical_data[chebi_chemical_data.duplicated(subset=['COMPOUND_ID', 'TYPE'])]

Unnamed: 0,ID,COMPOUND_ID,SOURCE,TYPE,CHEMICAL_DATA
13376,20705,28312,ChEBI,FORMULA,C34H54Cl2N10O14
36887,57319,48391,ChEBI,FORMULA,C22H23ClF3N
36948,57404,48557,ChEBI,FORMULA,C13H16N2.HCl
38933,63244,50059,ChEBI,FORMULA,C3H5N2
39139,65263,556075,ChEBI,FORMULA,C18H17ClO6
...,...,...,...,...,...
592918,2662871,190439,ChEBI,FORMULA,BaCO3
604857,2676132,45379,ChEBI,MONOISOTOPIC MASS,75.03203
607145,2678645,63598,ChEBI,MONOISOTOPIC MASS,361.14378
620316,2692765,87393,ChEBI,MONOISOTOPIC MASS,102.10447


In [9]:
chebi_chemical_data_deduplicated = chebi_chemical_data.drop_duplicates(subset=['COMPOUND_ID', 'SOURCE', 'TYPE', 'CHEMICAL_DATA'])

In [10]:
chebi_chemical_data_deduplicated[chebi_chemical_data_deduplicated.duplicated(subset=['COMPOUND_ID', 'TYPE'])]

Unnamed: 0,ID,COMPOUND_ID,SOURCE,TYPE,CHEMICAL_DATA
13376,20705,28312,ChEBI,FORMULA,C34H54Cl2N10O14
36887,57319,48391,ChEBI,FORMULA,C22H23ClF3N
36948,57404,48557,ChEBI,FORMULA,C13H16N2.HCl
40227,67411,50377,ChEBI,FORMULA,C8H16ClN7O2S3
40359,67572,48602,ChEBI,FORMULA,C7H17ClN2O2
40360,67573,48603,ChEBI,FORMULA,C7H17ClN2O2
40364,67577,48601,ChEBI,FORMULA,C7H17N2O2.Cl
40991,68310,50679,ChEBI,FORMULA,C20H20N8O5.2Na
40993,68312,50682,ChEBI,FORMULA,C20H21N8O5.Na
41019,68357,50697,ChEBI,FORMULA,C23H27N3O7.HCl


In [11]:
chebi_chemical_data_deduplicated = chebi_chemical_data_deduplicated.drop_duplicates(subset=['COMPOUND_ID', 'SOURCE', 'TYPE'])
chebi_chemical_data_pivot = chebi_chemical_data_deduplicated.pivot(index='COMPOUND_ID', columns='TYPE', values='CHEMICAL_DATA')
chebi_chemical_data_pivot = chebi_chemical_data_pivot.reset_index()

In [12]:
chebi_descriptors_pivot_merged = chebi_descriptors_pivot.merge(chebi_chemical_data_pivot, on='COMPOUND_ID', how='outer')

## Match Accessions and InCHI Table with Compounds

In [13]:
print(f'Number of ChEBI compounds without name: {chebi_compounds["ID"].count() - chebi_compounds["NAME"].count()}')

Number of ChEBI compounds without name: 19473


In [44]:
chebi_compounds_extended = chebi_compounds.merge(chebi_descriptors_pivot_merged, left_on='ID', right_on='COMPOUND_ID', how='left')

In [45]:
chebi_inchi = chebi_inchi.drop_duplicates()

In [46]:
chebi_compounds_extended = chebi_compounds_extended.merge(chebi_inchi, left_on='ID', right_on='CHEBI_ID', how='left')

In [47]:
chebi_compounds_extended = chebi_compounds_extended[['ID', 'NAME', 'DEFINITION', 'STAR', 'CAS Registry Number', 'Chemspider accession', 'DrugBank accession', 'FooDB accession', 'HMDB accession', 'KEGG COMPOUND accession', 'KNApSAcK accession', 'LIPID MAPS instance accession', 'Pubchem accession', 'PDB accession', 'Wikipedia accession', 'FORMULA', 'MONOISOTOPIC MASS', 'InChI']]

In [48]:
chebi_compounds_extended = chebi_compounds_extended.rename(
    columns={'ID': 'chebi_id', 'NAME': 'name', 'DEFINITION': 'description', 'STAR': 'stars_chebi', 'CAS Registry Number': 'cas_number', 'DrugBank accession': 'drugbank_id', 'KEGG COMPOUND accession': 'kegg_id', 'HMDB accession': 'hmdb_id', 'Chemspider accession': 'chemspider_id', 'PDB accession': 'pdb_id', 'Wikipedia accession': 'wikipedia_id',
             'FooDB accession': 'foodb_id', 'KNApSAcK accession': 'knapsack_id', 'LIPID MAPS instance accession': 'lipid_maps', 'Pubchem accession': 'pubchem_compound_id', 'FORMULA': 'chemical_formula', 'MONOISOTOPIC MASS': 'mono_mass', 'InChI': 'inchi'})

In [49]:
## Add SMILES from structure file
chebi_smiles = chebi_structures[chebi_structures.TYPE == 'SMILES']
chebi_smiles = chebi_smiles[['COMPOUND_ID', 'STRUCTURE']]
chebi_smiles = chebi_smiles.rename(columns={'COMPOUND_ID': 'chebi_id', 'STRUCTURE': 'smiles'})

In [50]:
chebi_compounds_extended = chebi_compounds_extended.merge(chebi_smiles, on='chebi_id', how='left')

In [54]:
chebi_compounds_extended['name'] = chebi_compounds_extended['name'].str.lower()

In [55]:
chebi_compounds_extended.to_csv(os.path.join(processed_data_folder, 'chebi_compounds_with_accession.csv'), index=False)