# Curating a genome scale model

This notebook has been tested on [jprime.lbl.gov](https://jprime.lbl.gov) with the biodesign_3.7 kernel.

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from IPython.display import IFrame
import numpy as np
import pandas as pd
import json
import urllib
import cobra
import cplex
import os
import requests
import collections

# Getting and preparing the genome-scale model

## Load *R.opacus* NCBI model generated by CarveMe

In [None]:
model = cobra.io.read_sbml_model("GSMs/ropacus_annotated.xml")
model

## Starting MEMOTE Output

In [None]:
IFrame('memotes/ropacus_carveme_grampos.htm', 1500, 800)

# Fix unbalanced reactions

Define a function that returns whether a reaction should be balanced

In [None]:
def should_be_balanced(r):
    """Tell whether reaction `r` is expected to be mass balanced.

    Returns False when `r.id` starts with 'EX_', 'sink_' or 'Growth',
    and True for every other reaction id.
    """
    excluded_prefixes = ('EX_', 'sink_', 'Growth')
    return not r.id.startswith(excluded_prefixes)

## Check how many reactions are unbalanced 

In [None]:
# Split the model's reactions into two lists. check_mass_balance() gives a
# dict of element imbalances, so an empty (falsy) result means "balanced".
# Reactions that should_be_balanced() excludes are filed under `balanced`.
unbalanced, balanced = [], []

for r in model.reactions:
    if not should_be_balanced(r) or not r.check_mass_balance():
        balanced.append(r)
    else:
        unbalanced.append(r)

print(f'{len(unbalanced)} of the {len(model.reactions)} reactions in the model are unbalanced')
print(f'{len(balanced)} of the {len(model.reactions)} reactions in the model are balanced')

### Check how many of the reactions involve a metabolite with multiple formulas

In [None]:
# Keep the unbalanced reactions in which at least one metabolite's formula
# string contains ';', i.e. lists more than one formula.
unbalanced_multiple_formulas = [
    r for r in unbalanced
    if any(';' in m.formula for m in r.metabolites)
]

print(f'{len(unbalanced_multiple_formulas)} of the {len(unbalanced)} unbalanced reactions in the model have at least one metabolite with multiple formulas')

In [None]:
# Keep the balanced reactions in which at least one metabolite's formula
# string contains ';', i.e. lists more than one formula.
balanced_multiple_formulas = [
    r for r in balanced
    if any(';' in m.formula for m in r.metabolites)
]

# The count is taken over `balanced`, so the message must say "balanced reactions".
print(f'{len(balanced_multiple_formulas)} of the {len(balanced)} balanced reactions in the model have at least one metabolite with multiple formulas')

## Check how many metabolites have multiple formulas

In [None]:
# A metabolite has "multiple formulas" when its formula string is a
# ';'-separated list with more than one entry.
multiple_formulas = [m for m in model.metabolites if ';' in m.formula]

print(f'{len(multiple_formulas)} of the {len(model.metabolites)} metabolites in the model have multiple formulas')

## Fix equivalent metabolite formulas

### Define functions to convert formula string to dictionary

This function takes in the substring after an element and returns the coefficient as an int

In [None]:
def coeff_from_substring(substring):
    """Return the integer coefficient found at the start of `substring`.

    `substring` is the text that follows an element symbol in a formula.

    Returns:
        1  if `substring` is empty or starts with a letter (implicit coefficient),
        the integer made of ALL leading digits if it starts with a digit,
        -1 if it starts with any other character.
    """
    if len(substring) == 0 or substring[0].isalpha():
        return 1
    if not substring[0].isdigit():
        return -1

    # Consume every leading digit so that coefficients of any length are read
    # in full instead of being cut off after three digits.
    end = 1
    while end < len(substring) and substring[end].isdigit():
        end += 1
    return int(substring[:end])

This function takes in a formula as a string and returns it as a dictionary

In [None]:
def formula_dict_from_string(formula):
    """Parse a chemical formula string into an ``{element symbol: count}`` dict.

    An element symbol is one letter optionally followed by lowercase letters,
    so 'Fe' and 'Cl' are single symbols while 'PRS' is read as P, R and S.
    The digits directly after a symbol are its count (1 when there are none).
    Counts of a symbol that occurs more than once are summed. Characters that
    are not part of a symbol or its count are ignored.

    Args:
        formula: a single formula such as 'C6H12O6' (no ';'-separated list).

    Returns:
        dict mapping each symbol to its integer count; empty for ''.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    formula_dict = {}
    # Scan by position so a symbol's count always comes from the digits that
    # follow that very occurrence, not from the first place the letter appears.
    for symbol, digits in re.findall(r'([A-Za-z][a-z]*)([0-9]*)', formula):
        count = int(digits) if digits else 1
        formula_dict[symbol] = formula_dict.get(symbol, 0) + count

    return formula_dict

Test the function

In [None]:
# Spot-check the parser on the first five multi-formula metabolites:
# print each alternative formula followed by its parsed dictionary.
for m in multiple_formulas[:5]:
    print(m.id)
    for f in m.formula.split(';'):
        print(f, formula_dict_from_string(f), sep='\n')
    print()

### Merge equivalent formulas

In [None]:
# Collapse formula lists whose entries all parse to the same element counts
# (the same formula written differently) down to the first entry.
equivalent_formulas = 0
for m in multiple_formulas:
    formulas = m.formula.split(';')
    formula_dicts = [formula_dict_from_string(f) for f in formulas]
    # Handles any number of alternatives, not just lists of two or three.
    if len(formulas) > 1 and all(d == formula_dicts[0] for d in formula_dicts[1:]):
        print(m.id, *formulas)
        m.formula = formulas[0]
        equivalent_formulas += 1

print(f'There are {equivalent_formulas} metabolites with equivalent formulas, and they have been fixed.')

### Check how many metabolites with multiple formulas remain

In [None]:
# Recount the metabolites whose formula string still lists several
# ';'-separated formulas.
multiple_formulas = [m for m in model.metabolites if ';' in m.formula]

print(f'{len(multiple_formulas)} of the {len(model.metabolites)} metabolites in the model have multiple formulas')

### Check how many unbalanced reactions remain

In [None]:
# Re-partition the reactions. An empty check_mass_balance() result means
# balanced; reactions excluded by should_be_balanced() go to `balanced`.
unbalanced, balanced = [], []

for r in model.reactions:
    if not should_be_balanced(r) or not r.check_mass_balance():
        balanced.append(r)
    else:
        unbalanced.append(r)

print(f'{len(unbalanced)} of the {len(model.reactions)} reactions in the model are unbalanced')
print(f'{len(balanced)} of the {len(model.reactions)} reactions in the model are balanced')

In [None]:
# Unbalanced reactions that involve at least one metabolite whose formula
# string lists several ';'-separated formulas.
unbalanced_multiple_formulas = [
    r for r in unbalanced
    if any(';' in m.formula for m in r.metabolites)
]

print(f'{len(unbalanced_multiple_formulas)} of the {len(unbalanced)} unbalanced reactions in the model have at least one metabolite with multiple formulas')

In [None]:
# Balanced reactions that involve at least one metabolite whose formula
# string lists several ';'-separated formulas.
balanced_multiple_formulas = [
    r for r in balanced
    if any(';' in m.formula for m in r.metabolites)
]

print(f'{len(balanced_multiple_formulas)} of the {len(balanced)} balanced reactions in the model have at least one metabolite with multiple formulas')

# Remove shorthand notation from the model

Check the set of elements present in the model

In [None]:
# Gather every alphabetic character that appears in any metabolite formula.
all_letters = [c for m in model.metabolites for c in m.formula if c.isalpha()]

print(set(all_letters))  # printing the set keeps the output on one line

Check the set of elements present in the growth reaction

In [None]:
# Collect the element symbols of every metabolite in the 'Growth' reaction.
growth_elements = []
for m in model.reactions.get_by_id('Growth').metabolites:
    growth_elements.extend(m.elements.keys())

print(set(growth_elements))  # printing the set keeps the output on one line

## Remove 'PRS' from model

In [None]:
# List every metabolite whose formula string contains the substring 'PRS'.
metabolites_with_PRS = [m for m in model.metabolites if 'PRS' in m.formula]
for m in metabolites_with_PRS:
    print(m.id, m.formula)

print(f'There are {len(metabolites_with_PRS)} metabolites with PRS in their formula')

First check metabolites that have PRS that only have one formula

In [None]:
# Show the metabolites that have 'PRS' in a single (non ';'-separated) formula.
for m in model.metabolites:
    if 'PRS' not in m.formula or ';' in m.formula:
        continue
    print(m.id, m.name, m.formula, '', sep='\n')

There are four such metabolites. They will need to be fixed later. First we need to define what PRS is

Next, check metabolites that have PRS and multiple formulas

In [None]:
# Metabolites whose formula string both lists several formulas and contains 'PRS'.
met_multiple_formulas_PRS = [
    m for m in model.metabolites
    if ';' in m.formula and 'PRS' in m.formula
]
print(f'There are {len(met_multiple_formulas_PRS)} metabolites with multiple formulas and PRS')

define function to subtract element dictionaries

In [None]:
def subtract_element_dicts(elements_1, elements_2):
    """Return the element-wise difference ``elements_1 - elements_2``.

    Both arguments map element symbols to counts. The result has one entry
    for every symbol found in either input: a symbol missing from
    `elements_2` keeps its `elements_1` count, and a symbol missing from
    `elements_1` gets the negated `elements_2` count.
    """
    difference = {}
    for element in set(elements_1) | set(elements_2):
        if element not in elements_2:
            difference[element] = elements_1[element]
        elif element not in elements_1:
            difference[element] = -1 * elements_2[element]
        else:
            difference[element] = elements_1[element] - elements_2[element]
    return difference

Test that the function works

In [None]:
elements_1 = {'C': 394, 'H': 621, 'O': 144, 'N': 96, 'P': 1, 'S': 3}
elements_2 = {'C': 21, 'H': 39, 'N': 2, 'O': 9, 'P': 1, 'R': 1, 'S': 1}
subtract_element_dicts(elements_1, elements_2)

In [None]:
subtract_element_dicts(elements_1, elements_2)

In [None]:
# For each PRS metabolite with exactly two formulas, print the element-wise
# difference between the first and the second formula.
for m in met_multiple_formulas_PRS:
    formulas = m.formula.split(';')
    if len(formulas) != 2:
        continue
    elements_1, elements_2 = map(formula_dict_from_string, formulas)
    print(m.id, m.formula, subtract_element_dicts(elements_1, elements_2))


Seems clear that PRS has formula N94H582S2C373O135R-1

Remove all formulas that contain 'PRS' 

In [None]:
# For metabolites with exactly two formulas, keep the second formula when the
# first one contains 'PRS', otherwise keep the first. Longer lists are untouched.
for m in met_multiple_formulas_PRS:
    formulas = m.formula.split(';')
    if len(formulas) == 2:
        m.formula = formulas[1] if 'PRS' in formulas[0] else formulas[0]

Check how many metabolites with multiple formulas and 'PRS' remain

In [None]:
# Recompute and list the metabolites that still have several formulas and 'PRS'.
met_multiple_formulas_PRS = [
    m for m in model.metabolites
    if ';' in m.formula and 'PRS' in m.formula
]
for m in met_multiple_formulas_PRS:
    print(m.id, m.formula)
print(f'There are {len(met_multiple_formulas_PRS)} metabolites with multiple formulas and PRS')

Update this one manually

In [None]:
# Overwrite the formula of metabolite 'ACP_c' by hand with one fixed formula.
# NOTE(review): where this formula comes from is not recorded here — confirm.
model.metabolites.get_by_id('ACP_c').formula = 'C384H603N96O142P1S3'

Fix metabolites with 'PRS' that only have one formula, by adding 'PRS' elements to it

In [None]:
# Element counts subtracted from each single-formula 'PRS' metabolite below;
# the negative entries therefore ADD those elements to the metabolite.
negative_PRS = {'R': 1, 'H': -582, 'S': -2, 'P': 0, 'N': -94, 'O': -135, 'C': -373}

for m in model.metabolites:
    if ';' in m.formula or 'PRS' not in m.formula:
        continue
    print(m.id, m.name, m.formula, m.elements, sep='\n')
    expanded_elements = subtract_element_dicts(m.elements, negative_PRS)
    print(expanded_elements)
    m.elements = expanded_elements
    print('New formula', m.formula)
    print()

### Check how many metabolites with multiple formulas remain

In [None]:
# Recount the metabolites whose formula string still lists several
# ';'-separated formulas.
multiple_formulas = [m for m in model.metabolites if ';' in m.formula]

print(f'{len(multiple_formulas)} of the {len(model.metabolites)} metabolites in the model have multiple formulas')

### Check how many unbalanced reactions remain

In [None]:
# Re-partition the reactions. An empty check_mass_balance() result means
# balanced; reactions excluded by should_be_balanced() go to `balanced`.
unbalanced, balanced = [], []

for r in model.reactions:
    if not should_be_balanced(r) or not r.check_mass_balance():
        balanced.append(r)
    else:
        unbalanced.append(r)

print(f'{len(unbalanced)} of the {len(model.reactions)} reactions in the model are unbalanced')
print(f'{len(balanced)} of the {len(model.reactions)} reactions in the model are balanced')

In [None]:
# Unbalanced reactions that involve at least one metabolite whose formula
# string lists several ';'-separated formulas.
unbalanced_multiple_formulas = [
    r for r in unbalanced
    if any(';' in m.formula for m in r.metabolites)
]

print(f'{len(unbalanced_multiple_formulas)} of the {len(unbalanced)} unbalanced reactions in the model have at least one metabolite with multiple formulas')

In [None]:
# Balanced reactions that involve at least one metabolite whose formula
# string lists several ';'-separated formulas.
balanced_multiple_formulas = [
    r for r in balanced
    if any(';' in m.formula for m in r.metabolites)
]

print(f'{len(balanced_multiple_formulas)} of the {len(balanced)} balanced reactions in the model have at least one metabolite with multiple formulas')

## Inspect metabolites and reactions to make next decision. Remove this section later 

In [None]:
for r in unbalanced:
    print(r.id)
    print(r.reaction)
    for m in r.metabolites:
        if ';' in m.formula:
            print(m.id, m.formula)
    print(r.check_mass_balance())
    print()

In [None]:
for m in multiple_formulas:
    print(m.id)
    print(m.formula)
    for r in m.reactions:
        
        if r.check_mass_balance() != {} and len([m for m in r.metabolites if ';' in m.formula]) == 1:
            print(r.id)
            print(r.reaction)
            print(r.check_mass_balance())
    print()

## Check out element R

In [None]:
metabolites_with_R = []
for m in model.metabolites:
    if 'R' in m.formula:
        metabolites_with_R.append(m)
        print(m.id, m.formula)
        
print(f'There are {len(metabolites_with_R)} metabolites with R in their formula')

In [None]:
for m in model.metabolites:
    if ';' in m.formula and 'R' in m.formula:
        print(m.id)
        print(m.name)
        print(m.formula)
        print()

## Check out element X

In [None]:
metabolites_with_X = []
for m in model.metabolites:
    if 'X' in m.formula:
        metabolites_with_X.append(m)
        print(m.id, m.formula)
        
print(f'There are {len(metabolites_with_X)} metabolites with X in their formula')

In [None]:
for m in model.metabolites:
    if ';' in m.formula and 'X' in m.formula:
        print(m.id)
        print(m.name)
        print(m.formula)
        print()

## Fix unbalanced reactions starting with reactions with a single undefined metabolite formula

In [None]:
# Unbalanced reactions in which exactly one metabolite has a ';'-separated
# (multi-formula) formula string.
one_undefined_formula_rxns = [
    r for r in unbalanced
    if sum(';' in m.formula for m in r.metabolites) == 1
]

print(f'There are {len(one_undefined_formula_rxns)} unbalanced reactions with a single undefined formula')

Now attempt to fix them by checking whether the mass error corresponds to one of the formulas of the undefined metabolite

## Find the metabolites with multiple formulas that are used in one reaction, and fix them

In [None]:
def fix_metabolite_with_single_reaction(m, r):
    """Print a short report on metabolite `m` and its only reaction `r`.

    Prints the metabolite's name, id and formula, then the reaction's name,
    equation and mass-balance result. When that result is an empty dict an
    extra hint line is printed. The report always ends with a blank line.
    Nothing is modified and nothing is returned.
    """
    print(
        f'The metabolite {m.name} with the id {m.id} is only involved in one reaction {r.id}',
        f'{m.name} has the formula {m.formula}',
        sep='\n',
    )
    mass_error = r.check_mass_balance()
    print(f'This reactions is {r.name} which has the form {r.reaction} and the mass error {mass_error}')

    if mass_error == {}:
        print('The smaller of the two formulas that does not include the metabolites X', end='\n\n')
    else:
        print()

Check the reactions that they are in and see if they can be balanced. 

Fixing these metabolites will have the least harm to the rest of the model

find metabolites with multiple formulas that are used in a single reaction

In [None]:
for m in multiple_formulas:
    if len(m.reactions) == 1:
        print(m.formula)
        r = list(m.reactions)[0]
        fix_metabolite_with_single_reaction(m, r)

In [None]:
# For every multi-formula metabolite that takes part in exactly one reaction,
# show the metabolite, that reaction, the reaction's mass-balance result and
# every metabolite of the reaction.
for m in multiple_formulas:
    if len(m.reactions) == 1:
        reaction = list(m.reactions)[0]
        print(f'The metabolite {m.name} with the id {m.id} is only involved in one reaction {reaction.id}')
        print(f'{m.name} has the formula {m.formula}')
        print(f'This reactions is {reaction.name} which has the form {reaction.reaction}')
        print(reaction.check_mass_balance())
        # Plain loop instead of a list comprehension run only for its print() side effect.
        for participant in reaction.metabolites:
            print(participant)
        print()

### Find reactions with only one metabolite with multiple formulas

In [None]:
# Unbalanced reactions in which exactly one metabolite has a ';'-separated
# (multi-formula) formula string.
one_undefined_formula_rxns = [
    r for r in unbalanced
    if sum(';' in m.formula for m in r.metabolites) == 1
]

print(f'There are {len(one_undefined_formula_rxns)} reactions with a single undefined formula')

In [None]:
for r in one_undefined_formula_rxns:
    print(r.reaction)
    print(r.check_mass_balance())
    for m in r.metabolites:
        if ';' in m.formula:
            print(m.formula)
    print()

Define a function that makes dictionaries hashable, so that they can be placed in sets or counted with `collections.Counter`. Found this solution on [stack overflow](https://stackoverflow.com/questions/56063246/how-to-obtain-a-set-of-dictionaries)

In [None]:
def make_hashable(o):
    """Recursively convert `o` into a hashable equivalent.

    dict -> frozenset of (key, converted value) pairs
    list -> tuple of converted items
    set  -> frozenset of converted items
    Any other object is returned unchanged.
    """
    if isinstance(o, dict):
        pairs = [(key, make_hashable(value)) for key, value in o.items()]
        return frozenset(pairs)
    if isinstance(o, list):
        return tuple(map(make_hashable, o))
    if isinstance(o, set):
        return frozenset(map(make_hashable, o))
    return o

In [None]:
mass_error_list = [make_hashable(r.check_mass_balance()) for r in unbalanced_multiple_formulas]
for mass_error in collections.Counter(mass_error_list).most_common()[:20]:
    print(mass_error)

## Fix most frequent reaction unbalancing issues

Check which mass errors are most common

In [None]:
mass_error_list = [make_hashable(r.check_mass_balance()) for r in unbalanced_multiple_formulas]
for mass_error in collections.Counter(mass_error_list).most_common()[:20]:
    print(mass_error)

Most Frequent Problems:
1) Being off by two hydrogens <br>
2) Being off by a water molecule <br>
3) ('H', 3.0), ('R', 1.0), ('O', -2.0), ('S', 1.0), ('C', -1.0), ('X', 1.0) <br>
4) ('N', 4.0), ('P', 1.0), ('S', 1.0), ('H', 19.0), ('O', 12.0), ('C', 17.0) <br>
5) ('C', -24.0), ('S', -1.0), ('N', -7.0), ('O', -19.0), ('P', -3.0), ('H', -34.0) <br>

### Look into two hydrogen error

In [None]:
# Reactions whose mass-balance result is exactly two hydrogens, in either direction.
two_hydrogen_error = [
    r for r in unbalanced_multiple_formulas
    if r.check_mass_balance() in ({'H': 2.0}, {'H': -2.0})
]

In [None]:
for r in two_hydrogen_error:
    print (r.check_mass_balance(), r.reaction)

Almost all have NAD or NADP. These reactions have the form:<br>
X + NADPH + H --> XH2 + NADP
<br>
The two hydrogen error happens because the formulas of metabolites X and XH2 are both listed twice, so the two hydrogens on XH2 are double counted.
<br>
The fix is to remove one of these formulas from each metabolite. For consistency's sake, the higher molecular weight formula will always be removed.


In [None]:
for m in two_hydrogen_error[1].metabolites:
    print (m.name)
    print (m.formula)
    print (m.elements)
    print()

### Look into water molecule error

In [None]:
# Reactions whose mass-balance result is exactly one water molecule (H2O),
# in either direction.
water_error = [
    r for r in unbalanced_multiple_formulas
    if r.check_mass_balance() in ({'H': 2.0, 'O': 1.0}, {'H': -2.0, 'O': -1.0})
]

In [None]:
for r in water_error:
    print (r.check_mass_balance(), r.reaction)

Notice that these all have water molecule as product<br>
These reactions are where one molecule loses a water <br>
Since the formulas are duplicated it is reading that two water molecules are lost, and only one is accounted for. <br>
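This would be fixed by removing the duplicated formula from each affected metabolite, so that the lost water is only counted once (TODO: not yet implemented).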

In [None]:
for m in water_error[2].metabolites:
    print (m.name)
    print (m.formula)
    print (m.elements)
    print()

In [None]:
# Print formula and name of every multi-formula metabolite with 'acyl' in its name.
# A plain loop is used so the cell does not also echo a list of None values.
for m in multiple_formulas:
    if 'acyl' in m.name:
        print(m.formula, m.name)

### Remove larger of two formulas for acyl-proteins

### Check how many metabolites still have multiple formulas

In [None]:
# Recount the metabolites whose formula string still lists several
# ';'-separated formulas.
multiple_formulas = [m for m in model.metabolites if ';' in m.formula]

print(f'{len(multiple_formulas)} metabolites still have multiple formulas')

### Check how many reactions are still unbalanced

In [None]:
# Reactions with a non-empty mass-balance result that are expected to balance.
unbalanced = [
    r for r in model.reactions
    if r.check_mass_balance() != {} and should_be_balanced(r)
]

print(f'{len(unbalanced)} reactions are still unbalanced')

## Check most common mass errors now

In [None]:
mass_error_list = [make_hashable(r.check_mass_balance()) for r in unbalanced_multiple_formulas]
for mass_error in collections.Counter(mass_error_list).most_common()[:20]:
    print(mass_error)

## Inspect the balanced reactions with multiple formula metabolites

In [None]:
for r in balanced_multiple_formulas:
    if should_be_balanced(r):
        print(r.reaction)
        print([m.id for m in r.metabolites])
        print()

Most of these are transport reactions

In [None]:
# Distinct mass-balance results among the "balanced" reactions that involve a
# multi-formula metabolite. Each element of the list is a reaction (not a
# key/value pair), and the dict results are unhashable, so every result is
# converted with make_hashable() before it goes into the set.
{make_hashable(r.check_mass_balance()) for r in balanced_multiple_formulas}

In [None]:
set(r.subsystem for r in model.reactions)

## Inspect unbalanced reactions with multiple formulas

Write a function that makes a dictionary hashable, so that it can be counted or placed in a set

In [None]:
def make_hashable(o):
    """Return a hashable stand-in for `o`, converting containers recursively.

    Dicts become a frozenset of (key, converted value) pairs, lists become
    tuples, sets become frozensets; anything else is handed back as is.
    """
    # NOTE(review): a function with this same name is already defined earlier
    # in the notebook; this later definition replaces it when the cell runs.
    if not isinstance(o, (dict, list, set)):
        return o
    if isinstance(o, dict):
        return frozenset((k, make_hashable(v)) for k, v in o.items())
    converted = (make_hashable(elem) for elem in o)
    return tuple(converted) if isinstance(o, list) else frozenset(converted)

Check the most frequent mass errors

In [None]:
mass_error_list = [make_hashable(r.check_mass_balance()) for r in unbalanced_multiple_formulas]
for mass_error in collections.Counter(mass_error_list).most_common()[:20]:
    print(mass_error)

### write function to take in array of strings and return lowest molecular weight string

In [None]:
test_m = model.metabolites.get_by_id('ddcaACP_c')
formulas = test_m.formula.split(';')
for f in formulas:
    print(formula_dict_from_string(f))
    


In [None]:
for r in model.metabolites.get_by_id('ddcaACP_c').reactions:
    print(r.id)
    print(r.reaction)
    for m in r.metabolites:
        print(m.formula.split(';'))
    print()

In [None]:
model.metabolites.get_by_id('ACP_c').name

In [None]:
model.metabolites.get_by_id('ACP_c').formula.split(';')

In [None]:
len(model.metabolites.get_by_id('ACP_c').reactions)

In [None]:
model.metabolites.get_by_id('ddcap_c')

Try to figure out which metabolites with multiple formulas are most common in unbalanced reactions

In [None]:
len(unbalanced)

In [None]:
len(multiple_formulas)

In [None]:
# Count, per metabolite id, how many unbalanced reactions contain a
# metabolite that is in `multiple_formulas`.
metabolite_occurances = {}
for r in unbalanced:
    for m in r.metabolites:
        if m in multiple_formulas:
            # dict.get with a default replaces the bare `except:`, which would
            # also have hidden any unrelated error raised while counting.
            metabolite_occurances[m.id] = metabolite_occurances.get(m.id, 0) + 1

In [None]:
dict(sorted(metabolite_occurances.items(), key=lambda item: -item[1]))

In [None]:
metabolite_with_x = []
for m in model.metabolites:
    if 'X' in m.formula:
        metabolite_with_x.append(m)
        
print(f'There are {len(metabolite_with_x )} metabolites with X in their formula')

In [None]:
for m in metabolite_with_x:
    print(m.id, m.name, m.formula.split(';'))

In [None]:
model.metabolites.get_by_id('fldox_c')

In [None]:
model.metabolites.get_by_id('fdxox_c')

In [None]:
metabolite_with_r = []
for m in model.metabolites:
    if 'R' in m.formula:
        metabolite_with_r.append(m)
        
print(f'There are {len(metabolite_with_r)} metabolites with R in their formula')

In [None]:
for m in metabolite_with_r:
    print(m.id, m.name, m.formula.split(';'))

Get all letters used in formulas

In [None]:
all_letters = []
for m in model.metabolites:
    for c in m.formula:
        if c.isalpha():
            all_letters.append(c)
            
print(set(all_letters))
    

In [None]:
for m in model.metabolites:
    if 'X' in m.formula:
        print (m.id, m.name, m.formula)

[X is the code for glutaredoxin in BiGG](http://bigg.ucsd.edu/models/universal/metabolites/grxox)

In [None]:
for m in model.metabolites:
    if 'R' in m.formula and 'PRS' not in m.formula:
        print (m.id, m.name, m.formula)

[R is the code for Ferricytochrome in BiGG](http://bigg.ucsd.edu/universal/metabolites/ficytc6)

In [None]:
for m in model.metabolites:
    if 'PRS' in m.formula:
        print (m.id, m.name, m.formula)

What does PRS mean in BiGG? Obviously something to do with ACP

In [None]:
for m in model.reactions.get_by_id('Growth').metabolites:
    print(m.formula)

In [None]:
model.reactions.get_by_id('Growth').reaction.split('+')

Get set of elements in growth equation.

In [None]:
growth_elements = []
[growth_elements.extend(list(m.elements.keys())) for m in model.reactions.get_by_id('Growth').metabolites]
    
set(growth_elements)

In [None]:
for m in model.metabolites:
    if 'obsolete' in m.name:
        print(m.id, m.name, m.formula)

In [None]:
for r in model.metabolites.get_by_id('nadph_c').reactions:
    print(r.id, r.reaction)

Check elements of metabolites with only one formula

In [None]:
one_formula_elements = []
for m in model.metabolites:
    if ';' not in m.formula:
        one_formula_elements.extend((list(m.elements.keys())))
set(one_formula_elements)

This is okay. What are R and X? Check R first

In [None]:
for m in model.metabolites:
    if ';' not in m.formula and 'R' in m.formula and 'PRS' not in m.formula:
        print(m.id)
        print(m.name)
        print(m.formula)
        print()

## PRS is causing errors. This is a method to remove it

First check metabolites that have PRS that only have one formula

In [None]:
for m in model.metabolites:
    if ';' not in m.formula and'PRS' in m.formula:
        print(m.id)
        print(m.name)
        print(m.formula)
        print()

There are four such metabolites. They will need to be fixed later. First we need to define what PRS is

Next, check metabolites that have PRS and multiple formulas

In [None]:
met_multiple_formulas_PRS = []
for m in model.metabolites:
    if ';' in m.formula and'PRS' in m.formula:
        met_multiple_formulas_PRS.append(m)
print(f'There are {len(met_multiple_formulas_PRS)} metabolites with multiple formulas and PRS')

define function to subtract element dictionaries

In [None]:
def subtract_element_dicts(elements_1, elements_2):
    """Subtract the counts in `elements_2` from those in `elements_1`.

    The result covers every symbol present in either dict. A symbol found
    only in `elements_1` keeps its count; a symbol found only in
    `elements_2` appears with its count negated.
    """
    # NOTE(review): a function with this same name is already defined earlier
    # in the notebook; this later definition replaces it when the cell runs.
    every_symbol = {*elements_1.keys(), *elements_2.keys()}
    return {
        k: (elements_1[k] - elements_2[k]) if (k in elements_1 and k in elements_2)
        else elements_1[k] if k in elements_1
        else -1 * elements_2[k]
        for k in every_symbol
    }

Test that the function works

In [None]:
elements_1 = {'C': 394, 'H': 621, 'O': 144, 'N': 96, 'P': 1, 'S': 3}
elements_2 = {'C': 21, 'H': 39, 'N': 2, 'O': 9, 'P': 1, 'R': 1, 'S': 1}
subtract_element_dicts(elements_1, elements_2)

In [None]:
subtract_element_dicts(elements_1, elements_2)

In [None]:
for m in met_multiple_formulas_PRS:
    formulas = m.formula.split(';')
    if len(formulas) == 2:
        elements_1 = formula_dict_from_string(formulas[0])
        elements_2 = formula_dict_from_string(formulas[1])
                                              
        print(m.id, m.formula, subtract_element_dicts(elements_1, elements_2))
        print()


Seems clear that PRS has formula N94H582S2C373O135R-1

In [None]:
# TODO(review): this cell held only the unfinished note "Remove all", which is
# not valid Python. Presumably it was meant to remove all formulas that
# contain 'PRS' — confirm and implement, or delete the cell.

Now check X

In [None]:
for m in model.metabolites:
    if ';' not in m.formula and 'X' in m.formula:
        print(m.id)
        print(m.name)
        print(m.formula)
        print()

Check to see if R is in metabolites with multiple formulas

In [None]:
for m in model.metabolites:
    if ';' in m.formula and 'PRS' in m.formula:
        print(m.id)
        print(m.name)
        print(m.formula)
        print(m.elements)
        print()