# Structured Names
This notebook describes how to use structured names to check SBML models for moiety balance.

In [1]:
import init
from SBMLLint.common import constants as cn
from SBMLLint.common.molecule import Molecule
from SBMLLint.common import simple_sbml
from SBMLLint.common.reaction import Reaction
from SBMLLint.tools import sbmllint
from SBMLLint.tools import print_reactions
from SBMLLint.common.simple_sbml import SimpleSBML
from SBMLLint.tools import sbmllint
from SBMLLint.tools.model_maker import ModelMaker
import tellurium as te

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tellurium as te

In [2]:
from SBMLLint.common.simple_sbml import modelIterator

In [3]:
# Print the filename of every item that modelIterator yields for initial=19, final=25.
iterator = modelIterator(initial=19, final=25)
for item in iterator:
    print(item.filename)

BIOMD0000000020_url.xml
BIOMD0000000021_url.xml
BIOMD0000000022_url.xml
BIOMD0000000023_url.xml
BIOMD0000000024_url.xml
BIOMD0000000025_url.xml


In [4]:
import init
from SBMLLint.tools import sbmllint

# The first step in glycolysis, expressed
# in the Antimony language.
# Every species that appears in the reaction is initialized below under
# exactly the same name, and uses the same A_P_P_P / A_P_P spelling as the
# erroneous variant of this reaction in the next cell.
model = """
Glu + A_P_P_P -> Glu_P + A_P_P; 1
Glu = 0
A_P_P_P = 0
Glu_P = 0
A_P_P = 0
"""
_ = sbmllint.lint(model)

UnboundLocalError: local variable 'simple' referenced before assignment

In [None]:
# The first step in glycolysis, with an error (the product is wrongly A_P instead of A_P_P)
model = """
Glu + A_P_P_P -> Glu_P + A_P; 1
"""
_ = sbmllint.lint(model, config_path="../SBMLLint/.sbmllint_cfg")

In [None]:
# Show the directory from which the structured-names analysis CSV is read below.
cn.ANALYSIS_STRUCTURED_NAMES_DIR

## Structured Names in BioModels

In [None]:
# Load the analysis results CSV into df_data, the table that the cells below
# filter and plot. The last line displays the first rows as a sanity check.
path = os.path.join(cn.ANALYSIS_STRUCTURED_NAMES_DIR, "analyze_structured_names.csv")
df_data = pd.read_csv(path)
df_data.head()

In [None]:
# Basic statistics: how many rows are flagged as using structured names
# versus not, shown as a pie chart with the counts in the labels.
is_structured = df_data[cn.IS_STRUCTURED]
num_structured = len(df_data[is_structured])
num_not_structured = len(df_data) - num_structured
pie_sizes = [num_structured, num_not_structured]
pie_labels = [
    "Structured (%d)" % num_structured,
    "Not Structured (%d)" % num_not_structured,
]
_ = plt.pie(pie_sizes, labels=pie_labels)

In [None]:
# Cumulative view of the fraction of balanced reactions, restricted to the
# rows flagged as structured: x is the sorted fraction, y is rank / row count.
structured_rows = df_data[df_data[cn.IS_STRUCTURED]]
df_sort = structured_rows.sort_values(cn.FRAC_BALANCED_REACTIONS).reset_index()
num_rows = len(df_sort)
yv = [1.0*rank/num_rows for rank in df_sort.index]
_ = plt.plot(df_sort[cn.FRAC_BALANCED_REACTIONS], yv)
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel("Fraction Balanced Reactions")
plt.ylabel("Fraction of Models")

In [None]:
def plotIsStructured(df, val):
    """
    Plot a histogram of reaction counts for one group of models.

    :param df: table with the cn.IS_STRUCTURED and cn.TOTAL_REACTIONS columns
    :param val: value of cn.IS_STRUCTURED that selects the rows to plot;
        a truthy value also selects the "Structured Names" title
    """
    title = "Structured Names" if val else "Not Structured Names"
    selected = df[df[cn.IS_STRUCTURED] == val]
    # Fixed binning so the two groups are drawn on the same scale.
    plt.hist(selected[cn.TOTAL_REACTIONS], bins=100, range=(0, 500))
    plt.title(title)
    plt.xlabel("Number Reactions")
    plt.show()
# Structured models first, then the rest.
for is_structured in (True, False):
    plotIsStructured(df_data, is_structured)

In [None]:
def plotFrcBoundary(df, val):
    """
    Scatter the fraction of boundary reactions against the reaction count
    for one group of models.

    :param df: table with the cn.IS_STRUCTURED, cn.TOTAL_REACTIONS and
        cn.FRAC_BOUNDARY_REACTIONS columns
    :param val: value of cn.IS_STRUCTURED that selects the rows to plot;
        a truthy value also selects the "Structured Names" title
    """
    title = "Structured Names" if val else "Not Structured Names"
    selected = df[df[cn.IS_STRUCTURED] == val]
    x_values = selected[cn.TOTAL_REACTIONS]
    y_values = selected[cn.FRAC_BOUNDARY_REACTIONS]
    plt.scatter(x_values, y_values)
    plt.title(title)
    plt.xlabel("Number Reactions")
    plt.ylabel("Fraction Boundary")
    plt.show()
# Structured models first, then the rest.
for is_structured in (True, False):
    plotFrcBoundary(df_data, is_structured)

In [None]:
def plotFracBalanced(df):
    """
    Scatter the fraction of balanced reactions against the fraction of
    non-boundary reactions, using only rows flagged by cn.IS_STRUCTURED.

    :param df: table with the cn.IS_STRUCTURED, cn.TOTAL_REACTIONS,
        cn.NUM_BOUNDARY_REACTIONS and cn.FRAC_BALANCED_REACTIONS columns
    """
    structured = df[df[cn.IS_STRUCTURED]]
    total = structured[cn.TOTAL_REACTIONS]
    boundary = structured[cn.NUM_BOUNDARY_REACTIONS]
    # The 1.0 factor forces floating-point arithmetic before the division.
    frac_non_boundary = (1.0*total - boundary) / total
    plt.scatter(frac_non_boundary, structured[cn.FRAC_BALANCED_REACTIONS])
    plt.title("Structured Names")
    plt.xlabel("Fraction Non-Boundary Reactions")
    plt.ylabel("Fraction Balanced")
    plt.xlim([0, 1.1])
    plt.show()
plotFracBalanced(df_data)

In [None]:
# Histogram of fraction balanced for structured names
def plotFracBalancedHist(df):
    """
    Plot a 50-bin histogram of cn.FRAC_BALANCED_REACTIONS over the rows
    flagged by cn.IS_STRUCTURED.

    :param df: table with the cn.IS_STRUCTURED and
        cn.FRAC_BALANCED_REACTIONS columns
    """
    structured = df[df[cn.IS_STRUCTURED]]
    frac_balanced = structured[cn.FRAC_BALANCED_REACTIONS]
    plt.hist(frac_balanced, bins=50)
    plt.title("Models With Structured Names")
    plt.xlabel("Fraction Balanced Reactions")
    plt.show()
plotFracBalancedHist(df_data)

## Detailed Look at Putative Structured Names

**Observations**
1. In some cases the model already uses ad hoc structured names that would balance with little change. For example:   
   1. BIOMD0000000192_url.xml: RCC1_RanGDP -> RCC1_Ran + GDP, RCC1_Ran + GTP -> RCC1_RanGTP
   1. BIOMD0000000010_url.xml: MKKK -> MKKK_P, MKKK_P -> MKKK

In [None]:
# Print the reactions of every model flagged as using structured names.
# This is best effort: a model that cannot be printed is reported and skipped,
# so one bad file does not hide the rest and failures stay visible.
df = df_data[df_data[cn.IS_STRUCTURED]]
for _, row in df.iterrows():
    filename = row[cn.FILENAME]
    path = os.path.join(cn.BIOMODELS_DIR, filename)
    print("\n%s" % filename)
    try:
        print_reactions.prettyPrint(path, is_include_kinetics=False, is_include_label=False)
    except Exception as err:
        # Exception (not a bare except) so that interrupting the kernel
        # still stops the loop.
        print("  ** Could not print reactions: %s: %s" % (type(err).__name__, err))

## BioModel Re-Writes

In [5]:
def evaluateModel(path, cfg_path=None):
    """
    Check that an Antimony model file loads, then run moiety analysis on it.

    :param str path: path to the Antimony model file
    :param str cfg_path: optional path to an SBMLLint configuration file;
        when None, sbmllint.lint is called without config_path so its own
        default applies
    """
    with open(path, "r") as fd:
        model = fd.read()
    # Make sure that the model can be loaded; any error from te.loada
    # propagates before linting starts.
    te.loada(model)
    lint_kwargs = dict(model_reference=path, mass_balance_check=cn.MOIETY_ANALYSIS)
    if cfg_path is not None:
        lint_kwargs["config_path"] = cfg_path
    _ = sbmllint.lint(**lint_kwargs)

### BioModels 611

In [None]:
# Baseline model: lint the unmodified BioModels 611 Antimony file
# (relative path, so it must be in the notebook's working directory).
evaluateModel("BIOMOD611.ant")

In [None]:
import init

# Lint `model` against the project configuration file.
# NOTE(review): `model` is not assigned in this cell, so this lints whatever
# an earlier cell last stored in it -- confirm that is the intended model.
config_path = "../SBMLLint/.sbmllint_cfg"
_ = sbmllint.lint(model, config_path=config_path)

In [None]:
# Modified model: read the edited BioModels 611 Antimony text from disk.
with open("BIOMOD611_mod.ant", "r") as fd:
    model = fd.read()
# Make sure that the model can be loaded
rr = te.loada(model)
_ = sbmllint.lint(model, config_path="../SBMLLint/.sbmllint_cfg")

Summary of changes:
1. "a" -> "\_a"
1. mII -> m\_II
1. "\_\_" -> "\_"
1. "4551" -> "4551x"
1. Implicits: a1, a2, L (13, 14, 15), m,  2b25, 4551x, 8de8, ad2303, adc8be, mwbdb849d8

R13, R14, and R15 are questionable in terms of mass balance.

### BioModels 293
Terms in models
- SUB is substrate
- Prot is protein
- dam is damaged
- asyn is alpha-synuclein
- Ub is ubiquitin

In [None]:
# Baseline model: lint the unmodified BioModels 293 Antimony file
# (relative path, so it must be in the notebook's working directory).
evaluateModel("BIOMOD293.ant")

In [None]:
# Eliminate name repetitions
# Build the model from its text description and ask ModelMaker for candidate
# renames, skipping every symbol for which any predicate below holds.
maker = ModelMaker("BIOMOD293.txt")
maker.makeModelStr()
exclude_funcs = [ 
        lambda n: n[0]=="k",       # names that start with "k"
        lambda n: n=="E1",
        lambda n: n=="E2",
        lambda n: n=="E3",
        lambda n: "UCHL1" in n,    # either spelling of UCHL1, anywhere in the name
        lambda n: "Uchl1" in n,
        lambda n: n[-2:] == "E3",  # names that end in "E3"
    ]
rename_dict = maker.getCandidateRenames(exclude_funcs=exclude_funcs)
# NOTE(review): model_str is reassigned in the next cell after rename_dict is
# extended, so this first result appears to be informational only -- confirm.
model_str = maker.replaceSymbols(rename_dict)

In [None]:
# Extend the candidate renames with hand-written ones. Insertion order is
# significant because the final replaceSymbols call runs with is_sort=False.
# NOTE: this cell updates the rename_dict produced by the previous cell, so
# re-run that cell first to start again from the candidate renames.
explicit_renames = [
    ("ATP", "A__P__P__P"),
    ("ADP", "A__P__P"),
    ("AMP", "A__P"),
    ("agg", "Agg"),
    ("Parkin_asyn_dam_Ub", "Parkin__asyn__dam__Ub"),
    ("PUb", "ProtUb"),
    ("Uchl1", "UCHL1"),
]
for old_name, new_name in explicit_renames:
    rename_dict[old_name] = new_name
# Adjacent name parts that should be split by the "__" separator.
pairs = [
    ("Agg", "asyn"),
    ("Agg", "E3"),
    ("Agg", "DUB"),
    ("Agg", "Prot"),
    ("Agg", "Mis"),
    ("Agg", "Parkin"),
    ("Agg", "SUB"),
    ("Agg", "Ub"),
    ("Agg", "UCHL1"),
    ("asyn", "dam"),
    ("Nat", "Prot"), 
    ("Mis", "Prot"),
    ("E3", "Mis"),
    ("E3", "SUB"),
    ("Ub", "DUB"),
    ("P", "Ub"),
    ("P", "Proteasome"),
    ("upreg", "Ub"),
    ("Uchl1", "dam"),
    ("Seq", "Agg"),
]
for first, second in pairs:
    rename_dict[first + second] = "%s__%s" % (first, second)
# Replace any run of three underscores with the two-underscore separator.
rename_dict["___"] = "__"
# Drop keys that start with "k" (do not rename constants) and any key that
# contains "AggS".
rename_dict = {
    key: value for key, value in rename_dict.items()
    if key[0] != "k" and "AggS" not in key
}
model_str = maker.replaceSymbols(rename_dict, is_sort=False)  # Apply changes in order
with open("BIOMOD293_mod1.ant", "w") as fd:
    fd.write(model_str)

In [None]:
# Display the final rename mapping for inspection.
rename_dict

In [None]:
# Modified model: lint the renamed BioModels 293 file (BIOMOD293_mod1.ant),
# using a configuration file specific to this model.
evaluateModel("BIOMOD293_mod1.ant", cfg_path="sbmllint_293_cfg.yml")

Observations
1. Initial 256/316 imbalanced reactions. 10 boundary reactions.
1. Changes reduced this to 146/316
    1. Changed "P" for protein to Prot to avoid confusion with Phosphates
    1. Used separators between transformations (Agg, Mis, Nat, upreg, dam) and moieties (ROS, P, Prot, UCHL1, E3, SUB, Proteasome, A, Ub)
    1. Changed implicit repetitions: ATP -> A_P_P_P, AMP -> A_P; implicit P
    1. Made the transformations, and some moieties (P, ROS), implicits

1. Issues discovered
    1. Not properly expressing transitions from aggregated to misfolded: DisAggregation5: Agg__Prot_1 -> 2.00 Mis__Prot
should be DisAggregation5: Agg__Prot_1 -> 2.00 Mis__Prot_2
    1. Not counting separate Prot in aggregations that are degraded: Agg__Prot_4 + Proteasome -> Agg__Prot__Proteasome
    1. Similarly with UCHL1, SUB, asyn.

### BioModels 140

In [None]:
# Build the BioModels 140 model from its text description and print the
# resulting model string for inspection.
maker = ModelMaker("BIOMOD140.txt")
maker.makeModelStr()
print(maker.model_str)

In [None]:
# Baseline model: lint the BioModels 140 Antimony file, using a
# configuration file specific to this model.
evaluateModel("BIOMOD140.ant", cfg_path="sbmllint_140_cfg.yml")