# Curating a genome scale model (third pass)

This notebook has been tested on [jprime.lbl.gov](jprime.lbl.gov) with the biodesign_3.7 kernel.

It starts with the model that gets output by the annotation_gr.ipynb notebook.

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from IPython.display import IFrame
import numpy as np
import pandas as pd
import json
import urllib
import cobra
import cplex
import os
import requests
import collections
from itertools import product

# Getting and preparing the genome-scale model

## Load *R.opacus* NCBI model generated by CarveMe

In [2]:
model = cobra.io.read_sbml_model("GSMs/Ropacus_curation_second_pass.xml")
model

0,1
Name,ropacus_curated_second_pass
Memory address,0x07fda8eb6db10
Number of metabolites,1581
Number of reactions,2380
Number of groups,0
Objective expression,1.0*Growth - 1.0*Growth_reverse_699ae
Compartments,"cytosol, periplasm, extracellular space"


## Starting MEMOTE Output

In [3]:
IFrame('memotes/ropacus_carveme_grampos.htm', 1500, 800)

# Function Definitions

In [4]:
def should_be_balanced(r):
    if r.id.startswith('EX_') or r.id.startswith('sink_') or r.id.startswith('Growth'):
        return False
    else:
        return True
    
def has_metabolite_with_multiple_formulas(r):
    for m in r.metabolites:
        if len(m.formula.split(';')) > 1:
            return True
    return False

def get_all_balanced_reactions(m):
    return [r for r in m.reactions if should_be_balanced(r)]

def get_number_of_undefined_metabolites(r):
    return len([m for m in r.metabolites if len(m.formula.split(';')) > 1])

def get_subset_with_one_undefined_metabolite(rxn_list):
    return [r for r in rxn_list if get_number_of_undefined_metabolites(r) == 1]

def reactions_with_m_as_sole_undefined_metabolite(m):
    if ';' not in m.formula:
        return []
    else:
        return [r for r in m.reactions if should_be_balanced(r) and get_number_of_undefined_metabolites(r) == 1]

def get_initial_number_string(substring):
    initial_string = ''
    for char in substring:
        if char.isdigit():
            initial_string += char
        else:
            return initial_string
    return initial_string

def formula_dict_from_string(formula_string):
    formula_dict = {}
    elements = [char for char in formula_string if char.isalpha()]
    for element in elements:
        string_after_element = formula_string.split(element, 1)[1]
        coefficient = get_initial_number_string(string_after_element)
        if coefficient == '':
            coefficient = '1'
        formula_dict[element] = int(coefficient)
    return formula_dict

def ensure_positive_mass_error(mass_error):
    if list(mass_error.values())[0] > 0:
        return mass_error
    else:
        negative_mass_error = {}
        for k in mass_error:
            negative_mass_error[k] = -1 * mass_error[k]
        return negative_mass_error

def all_have_matching_mass_errors(rxn_list):
    for r in rxn_list:
        if ensure_positive_mass_error(r.check_mass_balance()) != ensure_positive_mass_error(rxn_list[0].check_mass_balance()):
            return False
    return True

def check_if_formula_balances_all_rxns(m, formula, rxn_list):
    original_formula = m.formula
    m.formula = formula
    for r in rxn_list:
        if r.check_mass_balance() != {}:
            m.formula = original_formula
            return False
    m.formula = original_formula
    return True

def check_fraction_of_reactions_formula_balances(m, formula, rxn_list):
    original_formula = m.formula
    m.formula = formula
    balanced_reactions   = [r for r in rxn_list if r.check_mass_balance() == {}]
    unbalanced_reactions = [r for r in rxn_list if r.check_mass_balance() != {}]
    m.formula = original_formula
    # avoid divide by zero
    if len(balanced_reactions) + len(unbalanced_reactions) == 0:
        return 0
    return len(balanced_reactions) / (len(balanced_reactions) + len(unbalanced_reactions))

def check_if_formula_balances_all_rxns(m, formula, rxn_list):
    original_formula = m.formula
    m.formula = formula
    balanced_reactions   = [r for r in rxn_list if r.check_mass_balance() == {}]
    m.formula = original_formula
    if len(rxn_list) == len(balanced_reactions):
        return True
    else:
        return False
                
def get_formula_to_balance_rxn(m, rxn_list):
    original_formula = m.formula
    for f in m.formula.split(';'):
        if check_if_formula_balances_all_rxns(m, f, rxn_list):
            return f
    return original_formula

Define model status report

In [5]:
def status_report():
    for i in range(1,5):
        num_formulas = [m for m in model.metabolites if len(m.formula.split(';')) == i]
        print(f'{len(num_formulas)} of {len(model.metabolites)} metabolites have {i} formula(s)')
    print('\n')
    
    unbalanced = [r for r in model.reactions if should_be_balanced(r) and r.check_mass_balance() != {}]
    unbalanced_but_okay = [r for r in model.reactions if not should_be_balanced(r) and r.check_mass_balance() != {}]
    balanced = [r for r in model.reactions if r.check_mass_balance() == {}]
    
    unbalanced_multiple_formulas = [r for r in unbalanced if has_metabolite_with_multiple_formulas(r)]
    unbalanced_but_okay_multiple_formulas = [r for r in unbalanced_but_okay if has_metabolite_with_multiple_formulas(r)]
    balanced_multiple_formulas   = [r for r in   balanced if has_metabolite_with_multiple_formulas(r)]
    
    print(f'{len(unbalanced)} of the {len(model.reactions)} reactions in the model are wrongly unbalanced')
    print(f'{len(unbalanced_but_okay)} of the {len(model.reactions)} reactions in the model are properly unbalanced')
    print(f'{len(balanced)} of the {len(model.reactions)} reactions in the model are balanced')
    print('\n')
    
    print(f'{len(unbalanced_multiple_formulas)} of the {len(unbalanced)} improperly unbalanced reactions in the model have at least one metabolite with multiple formulas')
    print(f'{len(unbalanced_but_okay_multiple_formulas)} of the {len(unbalanced_but_okay)} properly unbalanced reactions in the model have at least one metabolite with multiple formulas')
    print(f'{len(balanced_multiple_formulas)} of the {len(balanced)} balanced reactions in the model have at least one metabolite with multiple formulas')

# Status Report

In [6]:
status_report()

1576 of 1581 metabolites have 1 formula(s)
4 of 1581 metabolites have 2 formula(s)
1 of 1581 metabolites have 3 formula(s)
0 of 1581 metabolites have 4 formula(s)


34 of the 2380 reactions in the model are wrongly unbalanced
228 of the 2380 reactions in the model are properly unbalanced
2118 of the 2380 reactions in the model are balanced


17 of the 34 improperly unbalanced reactions in the model have at least one metabolite with multiple formulas
0 of the 228 properly unbalanced reactions in the model have at least one metabolite with multiple formulas
0 of the 2118 balanced reactions in the model have at least one metabolite with multiple formulas


### Assign formulas to undefined metabolites involved in reactions with multiple undefined metabolites
Get list of undefined metabolites, and a list of lists of possible formulas for those metabolites

In [8]:
undefined_metabolites = []
possible_formulas = []
for m in [m for m in model.metabolites if ';' in m.formula]:
    undefined_metabolites.append(m)
    
    formula_list = []
    for f in m.formula.split(';'):
        formula_list.append(f)
    possible_formulas.append(formula_list)
    
print(f'There are {len(undefined_metabolites)} undefined metabolites')

There are 5 undefined metabolites


Define functions to get number of reactions that are unbalanced by more than just a hydrogen error

In [9]:
def only_hydrogen_unbalanced(r):
    if list(r.check_mass_balance().keys()) == ['H'] and should_be_balanced(r):
        return True
    else:
        return False

In [10]:
def is_balanced(r):
    return abs(sum(list(r.check_mass_balance().values()))) < 1e-5

In [11]:
def reactions_off_by_more_than_hydrogen():
    return [r for r in model.reactions if not is_balanced(r) and not only_hydrogen_unbalanced(r) and should_be_balanced(r)]

In [12]:
print(f'There are {len(reactions_off_by_more_than_hydrogen())} reactions that are unbalanced by more than hydrogen')

There are 4 reactions that are unbalanced by more than hydrogen


Define function to find best combination of metabolites that are undefined and are exclusively in reactions with other undefined metabolites

In [15]:
original_formulas = [m.formula for m in undefined_metabolites]
best_formulas = original_formulas
best_num_reactions_off_by_more_than_hydrogen = len(reactions_off_by_more_than_hydrogen())

# goes through all permutations of formulas for undefined metabolites
for formulas in list(product(*possible_formulas)):
    # assigns the formulas to the metabolites
    for count, m in enumerate(undefined_metabolites):
        model.metabolites.get_by_id(m.id).formula = formulas[count]

    # get the number of reactions that are off by more than hydrogen
    num_reactions_off_by_more_than_hydrogen = len(reactions_off_by_more_than_hydrogen())
    print(num_reactions_off_by_more_than_hydrogen, formulas)
    
    # if its the best fit replace the best formulas
    if num_reactions_off_by_more_than_hydrogen < best_num_reactions_off_by_more_than_hydrogen:
        best_formulas = formulas

2 ('X', 'XH2', 'X', 'C5H7NO3R', 'R')
4 ('X', 'XH2', 'X', 'C5H7NO3R', 'HOX')
4 ('X', 'XH2', 'X', 'C5H7NO3R', 'C15H21N5O10PR')
4 ('X', 'XH2', 'X', 'C5H8NO4X', 'R')
2 ('X', 'XH2', 'X', 'C5H8NO4X', 'HOX')
4 ('X', 'XH2', 'X', 'C5H8NO4X', 'C15H21N5O10PR')
2 ('X', 'XH2', 'Fe8S8X', 'C5H7NO3R', 'R')
4 ('X', 'XH2', 'Fe8S8X', 'C5H7NO3R', 'HOX')
4 ('X', 'XH2', 'Fe8S8X', 'C5H7NO3R', 'C15H21N5O10PR')
4 ('X', 'XH2', 'Fe8S8X', 'C5H8NO4X', 'R')
2 ('X', 'XH2', 'Fe8S8X', 'C5H8NO4X', 'HOX')
4 ('X', 'XH2', 'Fe8S8X', 'C5H8NO4X', 'C15H21N5O10PR')
15 ('X', 'C6H9NO2S2R2', 'X', 'C5H7NO3R', 'R')
17 ('X', 'C6H9NO2S2R2', 'X', 'C5H7NO3R', 'HOX')
17 ('X', 'C6H9NO2S2R2', 'X', 'C5H7NO3R', 'C15H21N5O10PR')
17 ('X', 'C6H9NO2S2R2', 'X', 'C5H8NO4X', 'R')
15 ('X', 'C6H9NO2S2R2', 'X', 'C5H8NO4X', 'HOX')
17 ('X', 'C6H9NO2S2R2', 'X', 'C5H8NO4X', 'C15H21N5O10PR')
15 ('X', 'C6H9NO2S2R2', 'Fe8S8X', 'C5H7NO3R', 'R')
17 ('X', 'C6H9NO2S2R2', 'Fe8S8X', 'C5H7NO3R', 'HOX')
17 ('X', 'C6H9NO2S2R2', 'Fe8S8X', 'C5H7NO3R', 'C15H21N5O10PR')

In [16]:
for count, m in enumerate(undefined_metabolites):
    model.metabolites.get_by_id(m.id).formula = best_formulas[count]

In [17]:
status_report()

1581 of 1581 metabolites have 1 formula(s)
0 of 1581 metabolites have 2 formula(s)
0 of 1581 metabolites have 3 formula(s)
0 of 1581 metabolites have 4 formula(s)


19 of the 2380 reactions in the model are wrongly unbalanced
228 of the 2380 reactions in the model are properly unbalanced
2133 of the 2380 reactions in the model are balanced


0 of the 19 improperly unbalanced reactions in the model have at least one metabolite with multiple formulas
0 of the 228 properly unbalanced reactions in the model have at least one metabolite with multiple formulas
0 of the 2133 balanced reactions in the model have at least one metabolite with multiple formulas


### Balance Reactions off by hydrogen

In [18]:
def fix_unbalanced_hydrogen(r):
    hydrogen_error = int(r.check_mass_balance()['H'])
    r.subtract_metabolites({model.metabolites.get_by_id("h_c"): hydrogen_error})

In [19]:
for r in [r for r in model.reactions if only_hydrogen_unbalanced(r)]:
    fix_unbalanced_hydrogen(r)

In [20]:
status_report()

1581 of 1581 metabolites have 1 formula(s)
0 of 1581 metabolites have 2 formula(s)
0 of 1581 metabolites have 3 formula(s)
0 of 1581 metabolites have 4 formula(s)


5 of the 2380 reactions in the model are wrongly unbalanced
228 of the 2380 reactions in the model are properly unbalanced
2147 of the 2380 reactions in the model are balanced


0 of the 5 improperly unbalanced reactions in the model have at least one metabolite with multiple formulas
0 of the 228 properly unbalanced reactions in the model have at least one metabolite with multiple formulas
0 of the 2147 balanced reactions in the model have at least one metabolite with multiple formulas


look into last 5 reactions

In [22]:
problem_reactions = [r for r in model.reactions if r.check_mass_balance() != {} and should_be_balanced(r)]

for r in problem_reactions:
    print(r)
    print(r.check_mass_balance())
    print()

AGPATr_BS: 0.01 1ag3p_BS_c + 0.07 fa11coa_c + 0.17 fa12coa_c + 0.01 fa1coa_c + 0.2 fa3coa_c + 0.34 fa4coa_c + 0.05 fa6coa_c + 0.1 pmtcoa_c + 0.03 strcoa_c + 0.03 tdcoa_c <=> 0.01 12dag3p_BS_c + coa_c
{'C': -7.105427357601002e-15, 'H': -1.4210854715202004e-14, 'N': -8.881784197001252e-16, 'O': -3.552713678800501e-15, 'S': -2.220446049250313e-16}

FRDO: fdxrd_c + nadp_c <=> fdxox_c + h_c + nadph_c
{'H': 2.0, 'Fe': 6.0, 'S': 6.0}

G3POA_BS: 0.07 fa11coa_c + 0.17 fa12coa_c + 0.01 fa1coa_c + 0.2 fa3coa_c + 0.34 fa4coa_c + 0.05 fa6coa_c + glyc3p_c + 0.1 pmtcoa_c + 0.03 strcoa_c + 0.03 tdcoa_c --> 0.01 1ag3p_BS_c + coa_c
{'C': -7.105427357601002e-15, 'O': -3.552713678800501e-15, 'N': -8.881784197001252e-16, 'S': -2.220446049250313e-16}

NADH8: 2dmmq8_c + 3.8 h_c + nadh_c --> 2dmmql8_c + 2.8 h_e + nad_c
{'H': 2.6645352591003757e-15}

OOR3r: akg_c + coa_c + fdxox_c + h_c --> co2_c + fdxrd_c + succoa_c
{'H': -2.0, 'S': -6.0, 'Fe': -6.0}



In [29]:
for count, r in enumerate(problem_reactions):
    add_placeholder_to_balance_rxn(r, count)

{'C': -7.105427357601002e-15, 'H': -1.4210854715202004e-14, 'N': -8.881784197001252e-16, 'O': -3.552713678800501e-15, 'S': -2.220446049250313e-16}
{'H': 2.0, 'Fe': 6.0, 'S': 6.0}
{'C': -7.105427357601002e-15, 'O': -3.552713678800501e-15, 'N': -8.881784197001252e-16, 'S': -2.220446049250313e-16}
{'H': 2.6645352591003757e-15}
{'H': -2.0, 'S': -6.0, 'Fe': -6.0}


In [31]:
for r in problem_reactions:
    mass_error = r.check_mass_balance()
    print(mass_error.values())
    

dict_values([7.105427357600995, 1.4210854715201862, 8.881784197001252, 3.5527136788004974, 2.220446049250313])
dict_values([])
dict_values([7.105427357600995, 3.5527136788004974, 8.881784197001252, 2.220446049250313])
dict_values([-2.664535259100373])
dict_values([])




In [28]:
def add_placeholder_to_balance_rxn(r, count):
    mass_error = r.check_mass_balance()
    placeholder_id = 'placeholder_' + str(count)
    placeholder_name = 'placeholder ' + str(count)
    
    print(r.check_mass_balance())
    placeholder = cobra.Metabolite(
    placeholder_id,
    formula=formula_string_from_dict(r.check_mass_balance()),
    name= placeholder_name,
    compartment='c')
    
    # check whether to add or subtract the placeholder
    if list(mass_error.values())[0] > 0:
        r.subtract_metabolites({placeholder: 1})
    else:
        r.add_metabolites({placeholder: 1})

In [26]:
def formula_string_from_dict(formula_dict):
    output = ''
    for k in formula_dict:
        output += k
        if formula_dict[k] == int(formula_dict[k]):
            output += str(abs(int(formula_dict[k])))
        else:
            output += str(abs(formula_dict[k]))
    return output

In [32]:
model.id = 'ropacus_curated_third_pass_2'
model.name = 'Rhodococcus opacus PD630 curated third pass 2'
model.description = 'Rhodococcus opacus PD630 model with annotations and almost final curatation'

In [33]:
cobra.io.write_sbml_model(model, "GSMs/Ropacus_curation_third_pass.xml")

In [None]:
for count, r in enumerate(problem_reactions):
    placeholder_id = 'placeholder_' + str(count)
    placeholder_name = 'placeholder ' + str(count)
    
    print(r.check_mass_balance())
    placeholder = cobra.Metabolite(
    placeholder_id,
    formula=formula_string_from_dict(r.check_mass_balance()),
    name= placeholder_name,
    compartment='c')
    
    

In [None]:
for count, r in enumerate(problem_reactions):
    print(r.check_mass_balance())
    formula_string_from_dict(r.check_mass_balance())

Get metabolites with more than one formula, and sort them by the number of reactions they participate in where they are the sole undefined metabolite

In [None]:
multiple_formula_metabolites = [m for m in model.metabolites if len(m.formula.split(';')) > 1]
sorted_multiple_formula_metabolites = sorted(multiple_formula_metabolites, key=lambda m: len(reactions_with_m_as_sole_undefined_metabolite(m)), reverse=True)
    
for m in sorted_multiple_formula_metabolites:
    print(m.id, m.formula)
    for r in m.reactions:
        print (r.reaction)
        print(f"{len([m for m in r.metabolites if ';' in m.formula])} undefined metabolite(s)")
        for m2 in r.metabolites:
            print(m2.formula)
        print()
        
    print()

In [None]:
for m in [m for m in model.metabolites if ';' in m.formula]:
    print(m.id, m.formula)

In [None]:
model.metabolites.get_by_id('trdox_c').fomula = 'C6H7NO2S2R2'
model.metabolites.get_by_id('trdrd_c').fomula = 'C6H9NO2S2R2'
model.metabolites.get_by_id('fdxox_c').fomula = 'Fe8S8X'
# model.metabolites.get_by_id('glutrna_c').fomula = 'C6H7NO2S2R2'
model.metabolites.get_by_id('trnaglu_c').fomula = 'C15H21N5O10PR'

In [None]:
status_report()

In [None]:
for m in [m for m in model.metabolites if ';' in m.formula]:
    print(m.id, m.formula)

In [None]:
model.metabolites.get_by_id('trdox_c').fomula

In [None]:
len([m for m in model.metabolites if ';' in m.formula])

Define function to check if hydrogen is the only problem

In [None]:
def only_hydrogen_unbalanced(r):
    if list(r.check_mass_balance().keys()) == ['H'] and should_be_balanced(r):
        return True
    else:
        return False

Define function to add hydrogens to solve hydrogen imbalance problem

In [None]:
def fix_unbalanced_hydrogen(r):
    hydrogen_error = int(r.check_mass_balance()['H'])
    r.subtract_metabolites({model.metabolites.get_by_id("h_c"): hydrogen_error})

In [None]:
for r in [r for r in model.reactions if only_hydrogen_unbalanced(r)]:
    print(r.reaction)
    fix_unbalanced_hydrogen(r)
    print(r.reaction)
    print(r.check_mass_balance())
    print()

In [None]:
for r in [r for r in model.reactions if should_be_balanced(r) and r.check_mass_balance() != {}]:
    print(r)
    print(r.check_mass_balance())
    
    print()

In [None]:
def is_balanced(r):
    return abs(sum(list(r.check_mass_balance().values()))) < 1e-5

In [None]:
for r in [r for r in model.reactions if should_be_balanced(r) and is_not_balanced(r)]:
    print(r)
    print(r.reaction)
    for m in r.metabolites:
        print(m.id, m.formula)
    print('\n')

In [None]:
len([r for r in model.reactions if should_be_balanced(r) and not is_balanced(r)])

In [None]:
for r in [r for r in model.reactions if should_be_balanced(r) and r.check_mass_balance() != {}]:
    print(r.check_mass_balance())
    print(is_balanced(r))
    print()

In [None]:
len([m for m in model.metabolites if ';' in m.formula])

In [None]:
def choose_longest_formula(m):
    longest_formula = ''
    for f in m.formula.split(';'):
        if len(f) > len(longest_formula):
            longest_formula = f
    m.formula = longest_formula

In [None]:
for m in [m for m in model.metabolites if ';' in m.formula]:
    print(m.id, m.formula)
    choose_longest_formula(m)
    

In [None]:
for r in [r for r in model.reactions if only_hydrogen_unbalanced(r)]:
    print(r.reaction)
    fix_unbalanced_hydrogen(r)
    print(r.reaction)
    print(r.check_mass_balance())
    print()

In [None]:
for r in [r for r in model.reactions if should_be_balanced(r) and is_not_balanced(r)]:
    print(r)
    print(r.reaction)
    for m in r.metabolites:
        print(m.id, m.formula)
    print('\n')

In [None]:
len([r for r in model.reactions if should_be_balanced(r) and r.check_mass_balance() != {}])

In [None]:
status_report()

In [None]:
for r in [r for r in model.reactions if should_be_balanced(r) and r.check_mass_balance() != {} and not is_balanced(r)]:
    print(r.check_mass_balance())
    print(r.reaction)
    for m in r.metabolites:
        print(m.id, m.formula)
    print()

In [None]:
model.metabolites.get_by_id('trnaglu_c').formula = 'R'

In [None]:
for r in [r for r in model.reactions if should_be_balanced(r) and r.check_mass_balance() != {} and not is_balanced(r)]:
    print(r.check_mass_balance())
    print(r.reaction)
    for m in r.metabolites:
        print(m.id, m.formula)
    print()

In [None]:
model.metabolites.get_by_id('fdxox_c').formula = 'X'

In [None]:
for r in [r for r in model.reactions if should_be_balanced(r) and r.check_mass_balance() != {} and not is_balanced(r)]:
    print(r.check_mass_balance())
    print(r.reaction)
    for m in r.metabolites:
        print(m.id, m.formula)
    print()

In [None]:
model.metabolites.get_by_id('fdxrd_c').formula = 'XH2'

In [None]:
for r in [r for r in model.reactions if should_be_balanced(r) and r.check_mass_balance() != {} and not is_balanced(r)]:
    print(r.check_mass_balance())
    print(r.reaction)
    for m in r.metabolites:
        print(m.id, m.formula)
    print()