# Curating a genome scale model (second pass)

This notebook has been tested on [jprime.lbl.gov](jprime.lbl.gov) with the biodesign_3.7 kernel.

It starts with the model that gets output by the annotation_gr.ipynb notebook.

In [2]:
%matplotlib inline
from matplotlib import pyplot as plt
from IPython.display import IFrame
import numpy as np
import pandas as pd
import json
import urllib
import cobra
import cplex
import os
import requests
import collections

# Getting and preparing the genome-scale model

## Load the first-pass curated *R. opacus* model (originally generated by CarveMe)

In [3]:
# Read the SBML model from disk (file name suggests it is the output of the
# first curation pass — confirm against the previous notebook). The bare
# `model` expression makes the notebook display the model summary.
model = cobra.io.read_sbml_model("GSMs/Ropacus_curation_first_pass.xml")
model

0,1
Name,ropacus_curated_first_pass
Memory address,0x07feb51c99bd0
Number of metabolites,1581
Number of reactions,2380
Number of groups,0
Objective expression,1.0*Growth - 1.0*Growth_reverse_699ae
Compartments,"cytosol, periplasm, extracellular space"


## Starting MEMOTE Output

In [4]:
# Embed the saved report HTML file in the notebook as a 1500 x 800 frame.
IFrame('memotes/ropacus_carveme_grampos.htm', 1500, 800)

# Function Definitions

In [12]:
def should_be_balanced(r):
    """Tell whether reaction `r` is expected to be mass balanced.

    Reactions whose id starts with 'EX_', 'sink_' or 'Growth' are exempt and
    yield False; every other reaction yields True.
    """
    exempt_prefixes = ('EX_', 'sink_', 'Growth')
    return not r.id.startswith(exempt_prefixes)
    
def has_metabolite_with_multiple_formulas(r):
    """Return True if any metabolite of reaction `r` has an ambiguous formula.

    A formula is ambiguous when it holds several ';'-separated alternatives.
    """
    return any(met.formula.count(';') > 0 for met in r.metabolites)

def get_all_balanced_reactions(m):
    """List the reactions of model `m` that are expected to be mass balanced.

    Despite the name, no mass-balance check is run here: a reaction is kept
    whenever `should_be_balanced` accepts it.
    """
    expected_balanced = []
    for rxn in m.reactions:
        if should_be_balanced(rxn):
            expected_balanced.append(rxn)
    return expected_balanced

def get_number_of_undefined_metabolites(r):
    """Count the metabolites of reaction `r` whose formula is still ambiguous.

    A metabolite counts once when its formula string contains several
    ';'-separated alternatives, however many alternatives there are.
    """
    return sum(1 for met in r.metabolites if met.formula.count(';') > 0)

def get_subset_with_one_undefined_metabolite(rxn_list):
    """Keep the reactions of `rxn_list` that have exactly one ambiguous metabolite."""
    single_unknown = []
    for rxn in rxn_list:
        if get_number_of_undefined_metabolites(rxn) == 1:
            single_unknown.append(rxn)
    return single_unknown

def reactions_with_m_as_sole_undefined_metabolite(m):
    """List the reactions in which `m` is the only metabolite with an ambiguous formula.

    Returns an empty list when `m` itself has a single formula. Otherwise the
    reactions of `m` are kept when they are expected to be balanced and contain
    exactly one ambiguous metabolite (which is then necessarily `m`).
    """
    # Guard: a metabolite with a single formula is never "undefined".
    if ';' not in m.formula:
        return []
    return [
        rxn
        for rxn in m.reactions
        if should_be_balanced(rxn) and get_number_of_undefined_metabolites(rxn) == 1
    ]

def get_initial_number_string(substring):
    """Return the run of digit characters at the start of `substring`.

    The result is '' when `substring` is empty or does not start with a digit.
    """
    end = 0
    while end < len(substring) and substring[end].isdigit():
        end += 1
    return substring[:end]

def formula_dict_from_string(formula_string):
    """Parse a chemical formula string into an {element: count} dict.

    An element symbol is an uppercase letter followed by any lowercase
    letters, so two-letter symbols such as 'Fe' or 'Cl' are kept whole instead
    of being split into single letters. A symbol with no trailing number
    counts as 1, and a symbol that appears several times has its counts
    summed.

    Parameters
    ----------
    formula_string : str
        A single formula such as 'C6H12O6' (not a ';'-separated list).

    Returns
    -------
    dict
        Maps each element symbol to its integer count; {} for an empty string.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    formula_dict = {}
    for element, count in re.findall(r'([A-Z][a-z]*)(\d*)', formula_string):
        # A missing count means a single atom of that element.
        formula_dict[element] = formula_dict.get(element, 0) + (int(count) if count else 1)
    return formula_dict

def ensure_positive_mass_error(mass_error):
    """Normalize the sign of a mass-error dict so that its first value is positive.

    If the first value (in dict iteration order) is already positive the dict
    is returned unchanged; otherwise a new dict with every value negated is
    returned. An empty dict is returned as-is instead of raising IndexError.

    Parameters
    ----------
    mass_error : dict
        Maps element symbols to their imbalance.

    Returns
    -------
    dict
        `mass_error` itself, or a sign-flipped copy.
    """
    # Nothing to normalize, and there is no "first value" to inspect.
    if not mass_error:
        return mass_error
    # NOTE: the sign decision depends on dict insertion order, so two dicts
    # with the same entries in a different order may normalize differently.
    first_value = next(iter(mass_error.values()))
    if first_value > 0:
        return mass_error
    return {element: -1 * amount for element, amount in mass_error.items()}

def all_have_matching_mass_errors(rxn_list):
    """Return True if every reaction in `rxn_list` has the same sign-normalized mass error.

    Each reaction's `check_mass_balance()` result is passed through
    `ensure_positive_mass_error` and compared with that of the first reaction.
    An empty list yields True.
    """
    if not rxn_list:
        return True
    # Compute the reference once instead of on every iteration.
    reference_error = ensure_positive_mass_error(rxn_list[0].check_mass_balance())
    for r in rxn_list:
        if ensure_positive_mass_error(r.check_mass_balance()) != reference_error:
            return False
    return True

def check_if_formula_balances_all_rxns(m, formula, rxn_list):
    """Return True if giving `m` the formula `formula` balances every reaction in `rxn_list`.

    The formula of `m` is replaced only for the duration of the check and is
    always restored afterwards, even if a mass-balance check raises.

    NOTE: this name is defined a second time further down in the same cell;
    the later definition is the one bound at runtime.

    Parameters
    ----------
    m : metabolite-like object with a writable `formula` attribute
    formula : str
        Candidate formula to try.
    rxn_list : iterable of reaction-like objects exposing `check_mass_balance()`

    Returns
    -------
    bool
        True when every reaction reports an empty mass-balance dict (also for
        an empty `rxn_list`), False as soon as one does not.
    """
    original_formula = m.formula
    m.formula = formula
    try:
        for r in rxn_list:
            if r.check_mass_balance() != {}:
                return False
        return True
    finally:
        # Undo the trial assignment on every exit path.
        m.formula = original_formula

def check_fraction_of_reactions_formula_balances(m, formula, rxn_list):
    """Return the fraction of reactions in `rxn_list` balanced when `m` has `formula`.

    The formula of `m` is replaced only for the duration of the check and is
    always restored afterwards, even if a mass-balance check raises. Each
    reaction is checked exactly once.

    Parameters
    ----------
    m : metabolite-like object with a writable `formula` attribute
    formula : str
        Candidate formula to try.
    rxn_list : iterable of reaction-like objects exposing `check_mass_balance()`

    Returns
    -------
    int or float
        0 when `rxn_list` is empty (avoids dividing by zero), otherwise the
        number of balanced reactions divided by the number of reactions.
    """
    original_formula = m.formula
    m.formula = formula
    try:
        # One flag per reaction: True when its mass-balance dict is empty.
        is_balanced = [r.check_mass_balance() == {} for r in rxn_list]
    finally:
        m.formula = original_formula
    # avoid divide by zero
    if not is_balanced:
        return 0
    return sum(is_balanced) / len(is_balanced)

def check_if_formula_balances_all_rxns(m, formula, rxn_list):
    """Return True if giving `m` the formula `formula` balances every reaction in `rxn_list`.

    This definition follows an earlier one with the same name in this cell and
    is therefore the one bound at runtime. The formula of `m` is replaced only
    for the duration of the check and is always restored afterwards, even if a
    mass-balance check raises.

    Parameters
    ----------
    m : metabolite-like object with a writable `formula` attribute
    formula : str
        Candidate formula to try.
    rxn_list : iterable of reaction-like objects exposing `check_mass_balance()`

    Returns
    -------
    bool
        True when every reaction reports an empty mass-balance dict (also for
        an empty `rxn_list`), False otherwise.
    """
    original_formula = m.formula
    m.formula = formula
    try:
        return all(r.check_mass_balance() == {} for r in rxn_list)
    finally:
        # Undo the trial assignment on every exit path.
        m.formula = original_formula
                
def get_formula_to_balance_rxn(m, rxn_list):
    """Pick the first candidate formula of `m` that balances every reaction in `rxn_list`.

    Candidates are the ';'-separated parts of `m.formula`, tried in order.
    When none of them passes `check_if_formula_balances_all_rxns`, the
    unmodified formula string of `m` is returned instead.
    """
    fallback = m.formula
    passing = (
        candidate
        for candidate in fallback.split(';')
        if check_if_formula_balances_all_rxns(m, candidate, rxn_list)
    )
    # The generator is lazy, so checking stops at the first passing candidate.
    return next(passing, fallback)

Define model status report

In [13]:
def status_report(target_model=None):
    """Print a summary of formula ambiguity and mass balance for a model.

    Three groups of lines are printed:
    1. how many metabolites have 1, 2, 3 or 4 ';'-separated formulas;
    2. how many reactions are unbalanced although `should_be_balanced` expects
       balance, unbalanced but exempt, and balanced;
    3. for each of those three groups, how many reactions contain at least one
       metabolite with multiple formulas.

    Each reaction's `check_mass_balance()` is evaluated once.

    Parameters
    ----------
    target_model : model-like object, optional
        Must expose `metabolites` and `reactions`. When omitted, the
        notebook-level variable `model` is used, so existing `status_report()`
        calls keep working.
    """
    if target_model is None:
        target_model = model

    metabolites = target_model.metabolites
    # Tally metabolites by their number of alternative formulas in one pass.
    formula_counts = collections.Counter(len(m.formula.split(';')) for m in metabolites)
    for i in range(1, 5):
        print(f'{formula_counts[i]} of {len(metabolites)} metabolites have {i} formula(s)')
    print('\n')

    unbalanced = []           # expected to balance, but do not
    unbalanced_but_okay = []  # exempt reactions that do not balance
    balanced = []             # empty mass-balance dict, exempt or not
    for r in target_model.reactions:
        if r.check_mass_balance() == {}:
            balanced.append(r)
        elif should_be_balanced(r):
            unbalanced.append(r)
        else:
            unbalanced_but_okay.append(r)

    unbalanced_multiple_formulas = [r for r in unbalanced if has_metabolite_with_multiple_formulas(r)]
    unbalanced_but_okay_multiple_formulas = [r for r in unbalanced_but_okay if has_metabolite_with_multiple_formulas(r)]
    balanced_multiple_formulas = [r for r in balanced if has_metabolite_with_multiple_formulas(r)]

    n_reactions = len(target_model.reactions)
    print(f'{len(unbalanced)} of the {n_reactions} reactions in the model are wrongly unbalanced')
    print(f'{len(unbalanced_but_okay)} of the {n_reactions} reactions in the model are properly unbalanced')
    print(f'{len(balanced)} of the {n_reactions} reactions in the model are balanced')
    print('\n')

    print(f'{len(unbalanced_multiple_formulas)} of the {len(unbalanced)} improperly unbalanced reactions in the model have at least one metabolite with multiple formulas')
    print(f'{len(unbalanced_but_okay_multiple_formulas)} of the {len(unbalanced_but_okay)} properly unbalanced reactions in the model have at least one metabolite with multiple formulas')
    print(f'{len(balanced_multiple_formulas)} of the {len(balanced)} balanced reactions in the model have at least one metabolite with multiple formulas')
    

# Status Report

In [14]:
# Baseline: counts for the model as loaded, before any formula is assigned.
status_report()

1513 of 1581 metabolites have 1 formula(s)
63 of 1581 metabolites have 2 formula(s)
5 of 1581 metabolites have 3 formula(s)
0 of 1581 metabolites have 4 formula(s)


166 of the 2380 reactions in the model are wrongly unbalanced
228 of the 2380 reactions in the model are properly unbalanced
1986 of the 2380 reactions in the model are balanced


163 of the 166 improperly unbalanced reactions in the model have at least one metabolite with multiple formulas
14 of the 228 properly unbalanced reactions in the model have at least one metabolite with multiple formulas
26 of the 1986 balanced reactions in the model have at least one metabolite with multiple formulas


Get metabolites with more than one formula, and sort them by the number of reactions they participate in where they are the sole undefined metabolite

In [15]:
# Metabolites whose formula still lists several ';'-separated alternatives.
multiple_formula_metabolites = [met for met in model.metabolites if met.formula.count(';') > 0]
# Most constrained first: order by the number of reactions in which the
# metabolite is the only ambiguous one.
sorted_multiple_formula_metabolites = sorted(
    multiple_formula_metabolites,
    key=lambda met: len(reactions_with_m_as_sole_undefined_metabolite(met)),
    reverse=True,
)

In [17]:
# Exact-match pass: give a metabolite a single formula when one candidate
# balances every reaction in which it is the only ambiguous metabolite.
for m in sorted_multiple_formula_metabolites:
    # Computed once per metabolite: the list only changes when `m.formula`
    # is assigned below.
    sole_undefined_rxns = reactions_with_m_as_sole_undefined_metabolite(m)
    for f in m.formula.split(';'):
        if check_fraction_of_reactions_formula_balances(m, f, sole_undefined_rxns) == 1:
            m.formula = f
            # `m` now has a single formula, so no later candidate could match.
            break

In [18]:
# Counts after the first exact-match pass.
status_report()

1559 of 1581 metabolites have 1 formula(s)
20 of 1581 metabolites have 2 formula(s)
2 of 1581 metabolites have 3 formula(s)
0 of 1581 metabolites have 4 formula(s)


73 of the 2380 reactions in the model are wrongly unbalanced
228 of the 2380 reactions in the model are properly unbalanced
2079 of the 2380 reactions in the model are balanced


70 of the 73 improperly unbalanced reactions in the model have at least one metabolite with multiple formulas
4 of the 228 properly unbalanced reactions in the model have at least one metabolite with multiple formulas
5 of the 2079 balanced reactions in the model have at least one metabolite with multiple formulas


In [19]:
# Rebuild the list of still-ambiguous metabolites after the previous pass.
multiple_formula_metabolites = [met for met in model.metabolites if met.formula.count(';') > 0]
# Most constrained first: order by the number of reactions in which the
# metabolite is the only ambiguous one.
sorted_multiple_formula_metabolites = sorted(
    multiple_formula_metabolites,
    key=lambda met: len(reactions_with_m_as_sole_undefined_metabolite(met)),
    reverse=True,
)

In [20]:
# Second exact-match pass: formulas fixed in the previous pass may have left
# further metabolites as the sole ambiguous one in their reactions.
for m in sorted_multiple_formula_metabolites:
    # Computed once per metabolite: the list only changes when `m.formula`
    # is assigned below.
    sole_undefined_rxns = reactions_with_m_as_sole_undefined_metabolite(m)
    for f in m.formula.split(';'):
        if check_fraction_of_reactions_formula_balances(m, f, sole_undefined_rxns) == 1:
            m.formula = f
            # `m` now has a single formula, so no later candidate could match.
            break

In [21]:
# Counts after the second exact-match pass.
status_report()

1560 of 1581 metabolites have 1 formula(s)
20 of 1581 metabolites have 2 formula(s)
1 of 1581 metabolites have 3 formula(s)
0 of 1581 metabolites have 4 formula(s)


72 of the 2380 reactions in the model are wrongly unbalanced
228 of the 2380 reactions in the model are properly unbalanced
2080 of the 2380 reactions in the model are balanced


69 of the 72 improperly unbalanced reactions in the model have at least one metabolite with multiple formulas
3 of the 228 properly unbalanced reactions in the model have at least one metabolite with multiple formulas
5 of the 2080 balanced reactions in the model have at least one metabolite with multiple formulas


In [22]:
# Rebuild the list of still-ambiguous metabolites after the previous pass.
multiple_formula_metabolites = [met for met in model.metabolites if met.formula.count(';') > 0]
# Most constrained first: order by the number of reactions in which the
# metabolite is the only ambiguous one.
sorted_multiple_formula_metabolites = sorted(
    multiple_formula_metabolites,
    key=lambda met: len(reactions_with_m_as_sole_undefined_metabolite(met)),
    reverse=True,
)

In [23]:
# Third exact-match pass, run to confirm that no further exact matches exist.
for m in sorted_multiple_formula_metabolites:
    # Computed once per metabolite: the list only changes when `m.formula`
    # is assigned below.
    sole_undefined_rxns = reactions_with_m_as_sole_undefined_metabolite(m)
    for f in m.formula.split(';'):
        if check_fraction_of_reactions_formula_balances(m, f, sole_undefined_rxns) == 1:
            m.formula = f
            # `m` now has a single formula, so no later candidate could match.
            break

In [24]:
# Counts after the third exact-match pass.
status_report()

1560 of 1581 metabolites have 1 formula(s)
20 of 1581 metabolites have 2 formula(s)
1 of 1581 metabolites have 3 formula(s)
0 of 1581 metabolites have 4 formula(s)


72 of the 2380 reactions in the model are wrongly unbalanced
228 of the 2380 reactions in the model are properly unbalanced
2080 of the 2380 reactions in the model are balanced


69 of the 72 improperly unbalanced reactions in the model have at least one metabolite with multiple formulas
3 of the 228 properly unbalanced reactions in the model have at least one metabolite with multiple formulas
5 of the 2080 balanced reactions in the model have at least one metabolite with multiple formulas


All perfect matches have been made: the last two status reports are identical, so another exact-match round assigns no new formulas.

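Next, resolve the remaining metabolites with imperfect matches: for each one, pick the candidate formula that balances the largest fraction of the reactions in which it is the only undefined metabolite.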

In [31]:
# Best-fit pass over the metabolites that the exact-match passes left ambiguous.
# NOTE(review): the recorded output of this cell lists a count and a metabolite
# id per line, but no statement below prints anything — the output appears to
# come from an earlier version of the cell.
for m in sorted_multiple_formula_metabolites:
    # Keep the first candidate with the highest non-zero score; with no
    # positive score the (possibly still ambiguous) formula is left unchanged.
    best_formula, best_score = m.formula, 0
    for f in m.formula.split(';'):
        formula_score = check_fraction_of_reactions_formula_balances(
            m, f, reactions_with_m_as_sole_undefined_metabolite(m)
        )
        if formula_score > best_score:
            best_formula, best_score = f, formula_score

    m.formula = best_formula

    # Get new list of metabolites that are undefined. Rebinding this name does
    # not change what the enclosing loop iterates over: the `for` statement
    # keeps using the list object it started with.
    multiple_formula_metabolites = [met for met in model.metabolites if len(met.formula.split(';')) > 1]
    sorted_multiple_formula_metabolites = sorted(
        multiple_formula_metabolites,
        key=lambda met: len(reactions_with_m_as_sole_undefined_metabolite(met)),
        reverse=True,
    )

    # Check if the most recent formula assignment makes any metabolites formula fits equal to 1
    for unresolved in sorted_multiple_formula_metabolites:
        for option in unresolved.formula.split(';'):
            option_score = check_fraction_of_reactions_formula_balances(
                unresolved, option, reactions_with_m_as_sole_undefined_metabolite(unresolved)
            )
            if option_score == 1:
                unresolved.formula = option

15 so3_c
8 h2s_c
5 23dhbzs_c
4 34dhpha_c
4 5mthf_c
4 23dhba_c
3 4cml_c
3 dscl_c
3 fpram_c
2 fdxox_c
2 udcpo4_c
0 trdox_c
0 trdrd_c
0 so3_p
0 fe3dhbzs_c
0 fe3dhbzs_p
0 fe3dhbzs_e
0 glutrna_c
0 trnaglu_c
0 h2s_e
0 so3_e


In [32]:
# Counts after the best-fit pass.
status_report()

1576 of 1581 metabolites have 1 formula(s)
4 of 1581 metabolites have 2 formula(s)
1 of 1581 metabolites have 3 formula(s)
0 of 1581 metabolites have 4 formula(s)


34 of the 2380 reactions in the model are wrongly unbalanced
228 of the 2380 reactions in the model are properly unbalanced
2118 of the 2380 reactions in the model are balanced


17 of the 34 improperly unbalanced reactions in the model have at least one metabolite with multiple formulas
0 of the 228 properly unbalanced reactions in the model have at least one metabolite with multiple formulas
0 of the 2118 balanced reactions in the model have at least one metabolite with multiple formulas


### Output after 2 curation rounds

In [38]:
# Relabel the in-memory model so the saved file is identifiable as the
# second-pass result.
model.id = 'ropacus_curated_second_pass'
model.name = 'Rhodococcus opacus PD630 curated second pass'
# NOTE(review): whether `description` is persisted by the SBML writer (or is
# treated as an alias of `name`) depends on the cobrapy version — confirm.
model.description = 'Rhodococcus opacus PD630 model with annotations and initial curation'

In [39]:
# Save the curated model as SBML; the target path is relative to the
# notebook's working directory.
cobra.io.write_sbml_model(model, "GSMs/Ropacus_curation_second_pass.xml")