# Metabolomics Data Inspection

By Garrett Roell and Christina Schenk

Tested on biodesign_3.7 kernel on jprime

This notebook gets metabolite data from the model, and uses it to attempt to match unknown metabolites in LCMS data.                                          
### Method: 
<ol>
<li>Set up imports</li>
<li>Load model and relevant data</li>
<li>Extract metabolite data from the genome scale mode</li>
<li>Check for matches between model metabolites and LCMS data</li>
</ol>


### 1. Set up imports

In [1]:
import cobra
import pandas as pd

### 2. Load model and relevant data

In [2]:
model = cobra.io.read_sbml_model("../models/r_opacus_annotated_curated.xml")

# load LCMS intracellular data (Can use other file names to get other LCMS data sets)
lcms_df = pd.read_csv('../data/metabolomics/LCMS_extracellular_metabolites_positive.csv')
lcms_df.head(2)

Scaling...
 A: min|aij| =  1.000e+00  max|aij| =  1.000e+00  ratio =  1.000e+00
Problem data seem to be well scaled


Unnamed: 0,m/z,RT [min],Name,Tags,Foston_Ex_1-2 (F6),Foston_Ex_1-9 (F7),Foston_Ex_1-10 (F8),Foston_Ex_1-11 (F9),Foston_Ex_1-12 (F10),Foston_Ex_1-13 (F11),...,Foston_Ex_4-52 (F73),Foston_Ex_4-53 (F74),Foston_Ex_4-54 (F75),Foston_Ex_4-55 (F76),Foston_Ex_4-56 (F77),Foston_Ex_4-57 (F78),Foston_Ex_4-58 (F79),Foston_Ex_4-59 (F80),Foston_Ex_4-60 (F81),Foston_Ex_4-64 (F82)
0,113.03449,3.109,Uracil,Confirmed ID (HIgh Confidence),24301.7878,27544.39,27719.69,28568.39,25825.51,27127.06,...,39332.3,37647.01,35963.52,26299.89,23757.63,79872.54,23541.28,22358.29,22917.143,25707.82
1,148.06024,5.053,O-Acetyl-DL-serine,Confirmed ID (HIgh Confidence),82578.31694,34037510.0,27341870.0,48123230.0,9047609.0,19434220.0,...,12145360.0,9120849.0,19831000.0,11413260.0,5268308.0,45010330.0,4042393.0,1120618.0,2237182.321,4857366.0


### 3. Extract metabolite data from the genome scale model

In [3]:
# helper function for getting molecular weight from metanetx.org
def get_metanetx_molecular_weight(metanetx_id):
    url = f'https://www.metanetx.org/chem_info/{metanetx_id}'
    
    metanetx_df = pd.read_html(url)[1]
    
    mass_row = metanetx_df[metanetx_df["Unnamed: 0"] == 'mass']
    
    molecular_weight = float(mass_row.Properties.values[0])
    
    return molecular_weight

get_metanetx_molecular_weight('MNXM61')

149.05105

In [4]:
# create a list to hold metabolite data
row_data = []

# loop over the metabolites in the model
for m in model.metabolites:
    print(m)
    
    # get MetaNetX id if present
    if 'metanetx.chemical' in m.annotation.keys():
        metanetx_id = m.annotation['metanetx.chemical']
        metanetx_molecular_weight = get_metanetx_molecular_weight(metanetx_id)
    else:
        metanetx_id = ''
        metanetx_molecular_weight = ''
    
    # get KEGG id if present
    if 'kegg.compound' in m.annotation.keys():
        kegg_id = m.annotation['kegg.compound']
    else:
        kegg_id = ''
        
    # create a dictionary for each metabolite's information
    row_data.append({
        "metanetx_molecular_weight": metanetx_molecular_weight,
        "formula_molecular_weight": m.formula_weight,
        "name": m.name,
        "formula": m.formula,
        "metabolite_id": m.id,
        "metanetx_id": metanetx_id,
        "kegg_id": kegg_id,
    })
    
# convert the row data into a data frame
metabolite_df = pd.DataFrame(row_data)

# sort by molecular weight
metabolite_df.sort_values(by=['formula_molecular_weight'], inplace=True)

metabolite_df.head(5)

10fthf_c
12dag3p_BS_c
12dgr140_c
12dgr140_p
12dgr160_c
12dgr160_e
12dgr180_e
12dgr_BS_c
12ppd__R_c
12ppd__S_c
13dampp_c
13dpg_c
13ppd_c
14dh2napcoa_c
14dhncoa_c
15dap_c
1ag160_e
1ag180_e
1ag181d9_e
1ag182d9d12_e
1ag3p_BS_c
1agpg160_p
1btol_c
1ddecg3p_c
1hdec9eg3p_c
1hdecg3p_c
1odec11eg3p_c
1odecg3p_c
1odecg3p_p
1p2cbxl_c
1p3h5c_c
1pyr5c_c
1tdec7eg3p_c
23ddhb_c
23dhacoa_c
23dhb_c
23dhba_c
23dhbzs2_c
23dhbzs3_c
23dhbzs_c
23dhdp_c
23dhmb_c
23dhmp_c
24dab_c
24dhhed_c
25aics_c
25dhpp_c
25dkglcn_c
25dkglcn_e
25dkglcn_p
25dop_c
25drapp_c
26dap_LL_c
26dap__M_c
26dap__M_e
26dap__M_p
2agpe120_c
2agpe120_p
2agpe141_c
2agpe141_p
2agpe160_c
2agpe160_p
2agpe161_c
2agpe161_p
2agpe180_c
2agpe180_p
2agpe181_c
2agpe181_p
2agpg120_c
2agpg120_p
2agpg140_c
2agpg140_p
2agpg141_c
2agpg141_p
2agpg160_c
2agpg160_p
2agpg161_c
2agpg161_p
2agpg180_c
2agpg180_p
2agpg181_c
2agpg181_p
2ahbut_c
2ahethmpp_c
2ahhmd_c
2ahhmp_c
2ameph_c
2ameph_e
2ameph_p
2amsa_c
2aobut_c
2cpr5p_c
2dda7p_c
2ddara_c
2ddecg3p_c
2ddecg3p_p
2

fadh2_c
fald_c
fald_p
fbac_kt_c
fbac_kt_p
fc1p_c
fcmcbtt_c
fdp_c
fdxo_2_2_c
fdxox_c
fdxrd_c
fe2_c
fe2_e
fe2_p
fe3_c
fe3_e
fe3_p
fe3dcit_c
fe3dcit_e
fe3dcit_p
fe3dhbzs3_c
fe3pyovd_kt_e
feenter_c
feenter_e
feenter_p
feoxam_c
feoxam_e
feoxam_p
feoxam_un_c
feoxam_un_e
feoxam_un_p
fer_c
fer_e
fer_p
ferulcoa_c
fgam_c
ficytc_c
fmcbtt_c
fmnRD_c
fmn_c
fmnh2_c
focytc_c
fol_c
fol_e
for_c
forcoa_c
forglu_c
fpram_c
fprica_c
frdp_c
frmd_c
fru_c
fru_e
fru_p
fruur_c
fuc_e
fum_c
fum_e
fum_p
g1p_c
g3p_c
g3pe_c
g3pe_e
g3pe_p
g3pg_c
g3pg_p
g6p_A_c
g6p_B_c
g6p_c
ga_c
ga_e
ga_p
gal1p_c
gal_bD_c
gal_bD_e
gal_c
gal_e
gal_p
galct__D_c
galct__D_e
galct__D_p
galctn__D_c
galctn__D_e
galctn__D_p
galman4_c
galman4_e
galman6_c
galman6_e
galt1p_c
galt_e
galur_c
galur_e
gam1p_c
gam6p_c
gam_e
gar_c
gcald_c
gcvHL_ADPr_c
gcvHL_nhLA_c
gdbtal_c
gdp_c
gdpmann_c
gdpmanur_c
gdptp_c
gg13dampp_c
gg15dap_c
gg4abut_c
ggala_B_c
ggaptn_c
ggbamppal_c
ggbdapal_c
ggbutal_c
ggdp_c
ggspmd_c
glc_D_B_c
glc__D_c
glc__D_e
glc__D_p
glc__aD_c

tol_e
tol_p
trans_dd2coa_c
trdox_c
trdrd_c
tre6p_c
tre_c
tre_e
tre_p
trnaglu_c
trp__L_c
trp__L_e
trypta_c
tsul_c
tsul_e
tsul_p
ttdca_c
ttdca_p
ttdcea_c
ttdcea_p
ttdceap_c
tton_e
ttrcyc_c
ttrcyc_e
ttrcyc_p
tyr__L_c
tyr__L_e
uaGgla_c
uaaGgla_c
uaagmda_c
uaccg_c
uacgam_c
uacmam_c
uagmda_c
uama_c
uamag_c
uamr_c
udcpdp_c
udcpp_c
udp_c
udpg_c
udpgal_c
ugmd_c
ugmda_c
um4p_c
ump_c
uppg3_c
ura_c
ura_e
ura_p
uracp_c
urate_c
urcan_c
urdglyc_c
urea_c
urea_p
uri_c
utp_c
vacc2coa_c
vacc_c
vacc_p
vacccoa_c
vaccoa_c
val__D_c
val__D_e
val__D_p
val__L_c
val__L_e
val__L_p
vanln_c
vanln_e
vanln_p
vanlt_c
vanlt_e
vanlt_p
xan_c
xan_e
xan_p
xmp_c
xtp_c
xtsn_c
xtsn_e
xu5p__D_c
xu5p__L_c
xyl3_c
xyl3_e
xyl4_c
xyl__D_c
xyl__D_e
xyl__D_p
xylan4_c
xylan4_e
xylb_c
xylb_e
xylu__D_c
zn2_c
zn2_e
zn2_p
phenol_e
phenol_c
guaiacol_e
guaiacol_c
tag_c
ficytc6_p
focytc6_p
pq_p
pqh2_p


Unnamed: 0,metanetx_molecular_weight,formula_molecular_weight,name,formula,metabolite_id,metanetx_id,kegg_id
1955,,0.0,Plastoquinol,,pqh2_p,,
1953,,0.0,Ferrocytochrome c6,,focytc6_p,,
1952,,0.0,Ferricytochrome c6,,ficytc6_p,,
1954,,0.0,Plastoquinone,,pq_p,,
1131,1.00794,1.00794,H+,H,h_c,MNXM1,C00080


Save metabolite data from the model as a csv

In [5]:
metabolite_df.to_csv('../data/metabolomics/model_metabolites.csv', index=False, header=True)

### 4. Check for matches between model metabolites and LCMS data

In [6]:
# define a helper function to get the model metabilte data from a given molecular weight
def molecular_weight_to_metabolite_data(molecular_weight):

    # keep track of the closest mass distance between the given 
    # molecular weight and model metabolite's molecular weight
    minimum_mass_difference = 1000
    
    # define an arbitrary closest metabolite
    closest_molecular_weight_data = metabolite_df[0]
    
    # loop over metabolite data
    for _, row in metabolite_df.iterrows():
        
        # check if this metabolite is the closest in mass to the given molecular weight
        if abs(row.formula_molecular_weight - molecular_weight) < minimum_mass_difference:
            # if so, the update the data for the the closest metabolite and the min mass distance
            closest_molecular_weight_data = row
            minimum_mass_difference = abs(row.formula_molecular_weight - molecular_weight)

    # return the data from the metabolite with the closest molecular weight
    return closest_molecular_weight_data

# a testing function
# molecular_weight_to_metabolite_data(148.06024)

In [7]:
lcms_df[['m/z', 'RT [min]', 'Name', 'Tags', 'Foston_Ex_1-2 (F6)']].head(5)

Unnamed: 0,m/z,RT [min],Name,Tags,Foston_Ex_1-2 (F6)
0,113.03449,3.109,Uracil,Confirmed ID (HIgh Confidence),24301.79
1,148.06024,5.053,O-Acetyl-DL-serine,Confirmed ID (HIgh Confidence),82578.32
2,162.07599,4.449,N-Methyl-L-Glutamic acid,Confirmed ID (HIgh Confidence),13866.73
3,124.03935,3.352,Nicotinic acid/Niacin,Confirmed ID (HIgh Confidence),1735242.0
4,190.07092,4.881,N-Acetyl-DL-glutamic acid,Confirmed ID (HIgh Confidence),9956.026


In [9]:
# loop over metabolites that have LCMS measurements
for _, row in lcms_df.iterrows():
    molecular_weight = row['m/z']
    print(row.Name, molecular_weight_to_metabolite_data(molecular_weight))

KeyError: 0