In [1]:
from libchebipy._chebi_entity import ChebiEntity
import os
import json

import reframed
import pandas as pd
import numpy as np

import collections

### Load models

In [2]:
directory = os.fsencode("output/GEMs/")

GEMs_dict = {}
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".xml"): 
        print(filename)
        GEMs_dict[filename[:-4]]= reframed.load_cbmodel("output/GEMs/"+filename)
        continue
    else:
        continue

CH9-bin.8.xml
CH7-bin.24.xml
CH7-bin.18.xml
CH15-bin.7.xml
CH13-bin.10.xml
CH13-bin.0.xml
CH1-bin.4.xml
CH1-bin.5.xml
CH13-bin.1.xml
CH13-bin.11.xml
CH15-bin.6.xml
CH7-bin.2.xml
CH7-bin.25.xml
CH15-bin.4.xml
CH15-bin.18.xml
CH13-bin.13.xml
CH1-bin.10.xml
CH1-bin.7.xml
CH1-bin.6.xml
CH13-bin.2.xml
CH13-bin.12.xml
CH15-bin.5.xml
CH7-bin.1.xml
CH15-bin.1.xml
CH13-bin.16.xml
CH13-bin.6.xml
CH1-bin.2.xml
CH1-bin.3.xml
CH13-bin.17.xml
CH15-bin.0.xml
CH15-bin.20.xml
CH7-bin.4.xml
CH7-bin.23.xml
CH7-bin.21.xml
CH7-bin.6.xml
CH15-bin.22.xml
CH15-bin.2.xml
CH1-bin.1.xml
CH1-bin.0.xml
CH13-bin.4.xml
CH13-bin.14.xml
CH15-bin.3.xml
CH15-bin.23.xml
CH7-bin.20.xml
CH8-bin.8.xml
CH8-bin.9.xml
CH8-bin.27.xml
CH8-bin.25.xml
CH14-bin.4.xml
CH8-bin.20.xml
CH14-bin.1.xml
CH8-bin.21.xml
CH14-bin.2.xml
CH8-bin.22.xml
CH8-bin.0.xml
CH8-bin.12.xml
CH8-bin.2.xml
CH8-bin.11.xml
CH8-bin.29.xml
CH8-bin.7.xml
CH8-bin.6.xml
CH3-bin.2.xml
CH8-bin.14.xml
CH8-bin.16.xml
CH3-bin.0.xml
CH8-bin.4.xml
CH8-bin.5.xml
CH3-bin

### Find the best way to map exchange to classes

#### Get Chebi ids associated with extracellular metabolites

In [15]:
chebi_ids = {}
chebi_frac = {}
for key,model in GEMs_dict.items():
    
    ex_met = []
    chebi_ids[key]={}
    chebi_in_met = []
    for met in model.metabolites:

        if model.metabolites[met].compartment=="C_e":
            ex_met.append(met)
            if "chebi" in model.metabolites[met].metadata.keys():
                chebi_in_met.append(met)
                chebi_ids[key][met]=model.metabolites[met].metadata["chebi"]
            else:
                chebi_ids[key][met]=None
    
    chebi_frac[key]=len(chebi_in_met)/len(ex_met)

In [16]:
pd.Series(chebi_frac).min()

0.6823529411764706

In [17]:
pd.Series(chebi_frac).mean()

0.7844137022915882

In [18]:
pd.Series(chebi_frac).max()

0.8881987577639752

**Find all chebi ids for the exchange reactions in all models (this also includes the ones that do not have chebi_id)**

In [19]:
super_dict = collections.defaultdict(set)
for d in chebi_ids.values():
    for k, v in d.items():  
        super_dict[k]=v

In [20]:
all_mets = [met[2:-2] for met in super_dict.keys()]
len(all_mets)

546

#### Get KEGG ids associated with extracellular metabolites

In [21]:
kegg_frac = {}
for key,model in GEMs_dict.items():
    
    ex_met = []
    kegg_in_met = []
    for met in model.metabolites:

        if model.metabolites[met].compartment=="C_e":
            ex_met.append(met)
            if "kegg.compound" in model.metabolites[met].metadata.keys():
                kegg_in_met.append(met)
    
    kegg_frac[key]=len(kegg_in_met)/len(ex_met)

In [22]:
pd.Series(kegg_frac).min()

0.6235294117647059

In [23]:
pd.Series(kegg_frac).mean()

0.74751105074859

In [24]:
pd.Series(kegg_frac).max()

0.8726114649681529

#### HMDB

In [25]:
HMDB_classes = pd.read_csv("input/bigg_classes.tsv",sep="\t")
HMDB_classes

Unnamed: 0,bigg,name,super_class,class,sub_class
0,thm,Thiamine,Organoheterocyclic compounds,Diazines,Pyrimidines and pyrimidine derivatives
1,26dap__M,"Meso-2,6-Diaminoheptanedioate",Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues"
2,3cmp,3 CMP,"Nucleosides, nucleotides, and analogues",Ribonucleoside 3'-phosphates,Ribonucleoside 3'-phosphates
3,5mtr,5-Methylthio-D-ribose,Organooxygen compounds,Carbohydrates and carbohydrate conjugates,Sugar acids and derivatives
4,chols,Choline sulfate,Homogeneous non-metal compounds,Non-metal oxoanionic compounds,Non-metal sulfates
...,...,...,...,...,...
1611,xylnact__D,"D-Xylono-1,5-lactone",Organoheterocyclic compounds,Lactones,Delta valerolactones
1612,xylu__D,D-Xylulose,Organooxygen compounds,Carbohydrates and carbohydrate conjugates,Monosaccharides
1613,xylu__L,L-Threo-2-pentulose,Organic oxygen compounds,Organooxygen compounds,Carbohydrates and carbohydrate conjugates
1614,zeax,Zeaxanthin,Lipids and lipid-like molecules,Prenol lipids,Tetraterpenoids


In [26]:
HMDB_classes[HMDB_classes["bigg"].isin(all_mets)]

Unnamed: 0,bigg,name,super_class,class,sub_class
0,thm,Thiamine,Organoheterocyclic compounds,Diazines,Pyrimidines and pyrimidine derivatives
1,26dap__M,"Meso-2,6-Diaminoheptanedioate",Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues"
2,3cmp,3 CMP,"Nucleosides, nucleotides, and analogues",Ribonucleoside 3'-phosphates,Ribonucleoside 3'-phosphates
4,chols,Choline sulfate,Homogeneous non-metal compounds,Non-metal oxoanionic compounds,Non-metal sulfates
5,citr__L,L-Citrulline,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues"
...,...,...,...,...,...
1606,xtsn,Xanthosine,"Nucleosides, nucleotides, and analogues",Purine nucleosides,
1608,xyl__D,D-Xylose,Organic oxygen compounds,Organooxygen compounds,Carbohydrates and carbohydrate conjugates
1609,xyl__D,Aldehydo-D-xylose,Organic oxygen compounds,Organooxygen compounds,Carbohydrates and carbohydrate conjugates
1610,xylb,Xylobiose,Organic oxygen compounds,Organooxygen compounds,Carbohydrates and carbohydrate conjugates


In [27]:
HMDB_classes[HMDB_classes["bigg"].isin(all_mets)]["sub_class"].dropna()

0          Pyrimidines and pyrimidine derivatives
1            Amino acids, peptides, and analogues
2                    Ribonucleoside 3'-phosphates
4                              Non-metal sulfates
5            Amino acids, peptides, and analogues
                          ...                    
1604               Purines and purine derivatives
1608    Carbohydrates and carbohydrate conjugates
1609    Carbohydrates and carbohydrate conjugates
1610    Carbohydrates and carbohydrate conjugates
1613    Carbohydrates and carbohydrate conjugates
Name: sub_class, Length: 294, dtype: object

## Define substrate classes based on ChEBI

It's necessary to define some classes from CHEBI that we are interested in. 


Strategy:

- Define main classes. NB: Some compounds might fit into several classes
    - Define a hierarchy to avoid placing some compounds in a general class
    - Put the 

In [273]:
really_bigg_classes = collections.OrderedDict({## Really Bigg classes
                "organonitrogen compounds":{
                    "CHEBI:35352":"organonitrogen compound"},
                "organosulfur compounds":{
                    "CHEBI:33261":"organosulfur compound"}, # from thiol CHEBI:29256
                "organophosphorus compounds":{
                    "CHEBI:25710":"organophosphorus compound"},
                "carboxylic acids and anions":{
                    "CHEBI:29067":"carboxylic acid anion",
                    "CHEBI:33575":"carboxylic acid"},
                "carbohydrate derivatives":{
                    "CHEBI:63299":"carbohydrate derivative"}
    })
bigg_classes = collections.OrderedDict({
                ## Bigg classes
                "alcohols and aldehydes":{
                    "CHEBI:15734": "primary alcohol",
                    "CHEBI:35681":"secondary alcohol",
                    "CHEBI:17478":"aldehyde"},
                "phospholipids":{
                    "CHEBI:16247":"phospholipid"},
                "oligosaccharides":{
                    "CHEBI:50699":"oligosaccharide (undefined)"},
                #"cofactors":{} #"CHEBI:5975":"iron chelate"
})
main_classes = collections.OrderedDict({
                "amino acids and derivatives":{
                    "CHEBI:37022":"amino-acid anion",
                    "CHEBI:33709":"amino acid",
                    "CHEBI:83821":"amino acid derivative"
},
                "oligopeptides":{                    
                    "CHEBI:25676":"oligopeptide"},
    
                "fatty acids":{
                    "CHEBI:58954":"straight-chain saturated fatty acid anion",
                    "CHEBI:58956":"branched-chain saturated fatty acid anion"},


                "carboxylic acids and anions":{
                    "CHEBI:33576":"sulfur-containing carboxylic acid"}, #From M_sulfac_e

                ## Carbohydrates
                "simple sugars":{
                    "CHEBI:35381":"monosaccharide",
                    "CHEBI:36233": "disaccharide"},
                "carbohydrate derivatives":{
                    "CHEBI:23639":"deoxy sugar",
                    "CHEBI:33720":"carbohydrate acid",# From M_galctn__L_e 
                },
                "oligosaccharides":{
                    "CHEBI:22590":"arabinan",
                    "CHEBI:37163":"glucan (undefined)"},
                "nucleosides and derivatives":{
                    "CHEBI:18282":"nucleobase", # From ura
                    "CHEBI:33838":"nucleoside",
                    "CHEBI:26401":"purines",
                    "CHEBI:39447":"pyrimidines",
                    "CHEBI:25608":"nucleoside phosphate"},
                    #"CHEBI:35241":"nucleotide-sugar",
                "B-vitamins and cofactors":{
                   "CHEBI:75769": "B vitamin",                    
                    "CHEBI:23357":"cofactor",
                    "CHEBI:33892":"iron coordination entity"},
                "minerals and atoms":{
                    "CHEBI:24835":"inorganic ion",
                    "CHEBI:25585": "nonmetal atom"},
                "gasses":{
                    "CHEBI:138675":"gas molecular entity"},
                "urea and urea derivatives":{
                    "CHEBI:47857":"ureas",
                    "CHEBI:16199":"urea"},
    
                "aromatic compounds":{
                    "CHEBI:33853":"phenols",
                    "CHEBI:27338": "xylene",
                    "CHEBI:27024":"toluenes"}, # From M_tol_e
                "other":{
                    "CHEBI:26191":"polyol",
                    "CHEBI:23217": "cholines",
                    "CHEBI:24828": "indoles",
                    "CHEBI:26188":"polyketide"} # Based on ttrcyc

               })

**Recursive function to find first parent matching with main classes**

In [274]:
def find_main_class(chebi_id, main_classes):
    # If chebi_id is the id of a  main class -> return value
    if chebi_id in main_classes:
        return chebi_id  

    entity = ChebiEntity(chebi_id)
    parents = [rel.get_target_chebi_id() for rel in entity.get_outgoings() if rel.get_type() == "is_a"]
    
    # If we have reached the end of the graph
    if len(parents)==0:
        return None
    
    
    for parent in parents:
        result = find_main_class(parent, main_classes)  
        if result is not None:
            return result  

    return None

In [275]:
met_chebi_dict = {} # previously called main_class_dict

for main_class_dict_nested in [main_classes,bigg_classes,really_bigg_classes]:
    main_class_dict = collections.OrderedDict()
    for d in main_class_dict_nested.values():
        for k, v in d.items():  
            main_class_dict[k]=v
    
    for met_id,chebi_list in super_dict.items():
        if chebi_list is None:
            continue
        
        if met_id in met_chebi_dict.keys():
            continue

        for chebi_id in chebi_list:
            main_class = find_main_class(chebi_id,main_class_dict.keys())
            if main_class is not None:
                met_chebi_dict[met_id]=main_class
                break

In [276]:
len(met_chebi_dict)

367

In [277]:
pd.Series(met_chebi_dict).value_counts()

CHEBI:24835     39
CHEBI:33709     29
CHEBI:29067     25
CHEBI:33720     22
CHEBI:25608     22
CHEBI:63299     16
CHEBI:35381     15
CHEBI:26191     13
CHEBI:33838     13
CHEBI:25710     12
CHEBI:83821     12
CHEBI:33853     11
CHEBI:35352     11
CHEBI:25676     11
CHEBI:33261     10
CHEBI:17478      8
CHEBI:36233      8
CHEBI:37022      7
CHEBI:50699      7
CHEBI:33575      6
CHEBI:33892      6
CHEBI:39447      6
CHEBI:15734      6
CHEBI:75769      5
CHEBI:23639      5
CHEBI:58954      5
CHEBI:26401      5
CHEBI:35681      5
CHEBI:16247      3
CHEBI:47857      3
CHEBI:24828      3
CHEBI:33576      3
CHEBI:23217      2
CHEBI:138675     2
CHEBI:27338      2
CHEBI:26188      2
CHEBI:37163      2
CHEBI:58956      2
CHEBI:27024      1
CHEBI:22590      1
CHEBI:25585      1
Name: count, dtype: int64

In [278]:
met_chebi_df = pd.Series(met_chebi_dict)

met_chebi_df[met_chebi_df=="CHEBI:33575"]

M_ocdcea_e    CHEBI:33575
M_succ_e      CHEBI:33575
M_hdca_e      CHEBI:33575
M_acon_C_e    CHEBI:33575
M_tcb_e       CHEBI:33575
M_cinnm_e     CHEBI:33575
dtype: object

#### Define class chebi ids for new metabolites

In [279]:
iIB746 = reframed.load_cbmodel("/Users/idunmariaburgos/universal_model_extension/input/curated_models_newId/iIB746.xml")

In [280]:
model_specific_data = pd.read_csv("/Users/idunmariaburgos/universal_model_extension/output/model_specific_data.csv")

model_specific_data_iIB746 = model_specific_data[model_specific_data.model =="iIB746"]

In [281]:
xylo_gluc = [rxn for rxn in model_specific_data_iIB746[model_specific_data_iIB746.reaction.str.contains("EX")].reaction.values if "Q" in rxn]
ara_xyl = [rxn for rxn in model_specific_data_iIB746[model_specific_data_iIB746.reaction.str.contains("EX")].reaction.values if "A" in rxn]
cellulose = [rxn for rxn in iIB746.get_exchange_reactions() if "cell" in rxn]
xylan = [rxn for rxn in iIB746.get_exchange_reactions() if "xyla" in rxn]

new_classes_rxns = {"CHEBI:18233":xylo_gluc,"CHEBI:28427":ara_xyl,"CHEBI:3523":cellulose,"CHEBI:60938":xylan}

new_classes = {"oligosaccharides":{"CHEBI:60938":"glucuronoxylan","CHEBI:28427":"arabinoxylan","CHEBI:18233":"xyloglucan","CHEBI:3523":"cellodextrin"}}

In [282]:
for chebi_id,rxns in new_classes_rxns.items():
    for rxn in rxns:
        met_id = rxn.replace("R_EX_","M_")
        met_id = met_id.replace("EX_","M_")
        met_chebi_dict[met_id]=chebi_id

In [283]:
len(met_chebi_dict.keys())

382

In [284]:
len(super_dict.keys())

546

In [285]:
len(super_dict.keys())-len(met_chebi_dict.keys())

164

#### Combine data as dataframe

**First we need a dict to map chebi id to class and the self defined super class**

In [286]:
chebi_classes = collections.defaultdict(set)

for class_dict in [bigg_classes,really_bigg_classes,main_classes,new_classes]:
    for met_super_class_name,sub_class_dict in class_dict.items():
        
        for chebi_id, chebi_class_name in sub_class_dict.items():  

            chebi_classes[chebi_id]={"chebi class":chebi_class_name,"self defined super class":met_super_class_name}

**Make a df for all the metabolites we were able to classify**

In [287]:
met_chebi_df = pd.DataFrame(met_chebi_dict.values(),index=met_chebi_dict.keys())
met_chebi_df.columns=["chebi id"]
met_chebi_df

Unnamed: 0,chebi id
M_12ppd__R_e,CHEBI:26191
M_25dkglcn_e,CHEBI:33720
M_26dap__M_e,CHEBI:33709
M_2hxmp_e,CHEBI:33853
M_3cmp_e,CHEBI:25608
...,...
M_cell5_e,CHEBI:3523
M_cell4_e,CHEBI:3523
M_cell3_e,CHEBI:3523
M_xylan4_e,CHEBI:60938


**Combine data**

In [288]:
met_chebi_df["chebi class"]=met_chebi_df["chebi id"].map(lambda x:chebi_classes[x]["chebi class"])
met_chebi_df["self defined super class"]=met_chebi_df["chebi id"].map(lambda x:chebi_classes[x]["self defined super class"])

In [289]:
met_chebi_df

Unnamed: 0,chebi id,chebi class,self defined super class
M_12ppd__R_e,CHEBI:26191,polyol,other
M_25dkglcn_e,CHEBI:33720,carbohydrate acid,carbohydrate derivatives
M_26dap__M_e,CHEBI:33709,amino acid,amino acids and derivatives
M_2hxmp_e,CHEBI:33853,phenols,aromatic compounds
M_3cmp_e,CHEBI:25608,nucleoside phosphate,nucleosides and derivatives
...,...,...,...
M_cell5_e,CHEBI:3523,cellodextrin,oligosaccharides
M_cell4_e,CHEBI:3523,cellodextrin,oligosaccharides
M_cell3_e,CHEBI:3523,cellodextrin,oligosaccharides
M_xylan4_e,CHEBI:60938,glucuronoxylan,oligosaccharides


In [290]:
met_chebi_df["self defined super class"].unique()

array(['other', 'carbohydrate derivatives', 'amino acids and derivatives',
       'aromatic compounds', 'nucleosides and derivatives',
       'oligosaccharides', 'oligopeptides', 'simple sugars',
       'urea and urea derivatives', 'minerals and atoms', 'gasses',
       'B-vitamins and cofactors', 'fatty acids',
       'carboxylic acids and anions', 'alcohols and aldehydes',
       'phospholipids', 'organonitrogen compounds',
       'organophosphorus compounds', 'organosulfur compounds'],
      dtype=object)

In [291]:
met_chebi_df["self defined super class"].value_counts()

self defined super class
amino acids and derivatives    48
nucleosides and derivatives    46
carbohydrate derivatives       43
minerals and atoms             40
carboxylic acids and anions    34
oligosaccharides               26
simple sugars                  22
other                          20
alcohols and aldehydes         19
aromatic compounds             14
organophosphorus compounds     12
oligopeptides                  11
B-vitamins and cofactors       11
organonitrogen compounds       11
organosulfur compounds         10
fatty acids                     7
urea and urea derivatives       3
phospholipids                   3
gasses                          2
Name: count, dtype: int64

In [292]:
met_chebi_df[met_chebi_df["self defined super class"]=="carboxylic acids and anions"]

Unnamed: 0,chebi id,chebi class,self defined super class
M_lipoate_e,CHEBI:33576,sulfur-containing carboxylic acid,carboxylic acids and anions
M_sulfac_e,CHEBI:33576,sulfur-containing carboxylic acid,carboxylic acids and anions
M_sula_e,CHEBI:33576,sulfur-containing carboxylic acid,carboxylic acids and anions
M_2obut_e,CHEBI:29067,carboxylic acid anion,carboxylic acids and anions
M_ac_e,CHEBI:29067,carboxylic acid anion,carboxylic acids and anions
M_acnam_e,CHEBI:29067,carboxylic acid anion,carboxylic acids and anions
M_bz_e,CHEBI:29067,carboxylic acid anion,carboxylic acids and anions
M_fum_e,CHEBI:29067,carboxylic acid anion,carboxylic acids and anions
M_mal__L_e,CHEBI:29067,carboxylic acid anion,carboxylic acids and anions
M_man6pglyc_e,CHEBI:29067,carboxylic acid anion,carboxylic acids and anions


### Save data

In [293]:
met_chebi_df.to_csv("output/met_chebi_class.tsv",sep="\t")