# Proteomics Integration

Load an enzyme-constrained metabolic model of _Escherichia coli_.

In [None]:
import pandas as pd

from cobra.io import read_sbml_model
from cobra.flux_analysis import flux_variability_analysis

# Read the gzipped SBML file of the enzyme-constrained model into a cobra model.
model = read_sbml_model('data/eciML1515.xml.gz')

The model differs from a standard COBRA model in two ways. First, the reactions contain an additional _metabolite_: the enzyme itself.

In [None]:
# Display the reaction FRD2No1 (the notebook renders the cell's last expression).
model.reactions.FRD2No1

In this model, all protein ids follow the form `prot_UNIPROT`.

The second difference is the existence of _protein exchange reactions_. These protein exchanges follow the naming `prot_UNIPROT_exchange`.

In [None]:
# Display the exchange reaction of protein P00363.
model.reactions.prot_P00363_exchange

By putting an upper bound on these exchanges, we integrate proteomics data into the model and can then treat it as a usual COBRA model without further changes.

## 1. Proteomics data preparation
Proteomics data is usually presented as number of copies per cell. We need to convert this information into the units used by the enzyme-constrained model.

<div class="alert alert-block alert-warning">
<b>DEBUG:</b> Should run PCA for the entire dataset?
</div>

In [None]:
# Proteomics table; the cells below use its "Uniprot Accession" column and the
# per-condition "*_copies" / "*_cv" columns.
df = pd.read_csv(
    # "data/ecoli_proteomics_schmidt2016_S9.tsv",  # our strain but just Glc/LB
    "data/ecoli_proteomics_schmidt2016_S5_ren.tsv",  # BW25113 22 media
    # tab-separated; passed by keyword because recent pandas versions no longer
    # accept the separator as a positional argument
    sep="\t",
    skiprows=2,  # skip titles and subtitles (XLSX)
)
# Experimental details per growth condition and strain (growth rate, cell volume).
exp_details = pd.read_csv(
    "data/ecoli_details_schmidt2016_S23.tsv",
    sep="\t",
    skiprows=2,  # skip titles and subtitles (XLSX)
)

In [None]:
# Preview the first rows of the proteomics table.
df.head()

In [None]:
# Preview the first rows of the experimental-details table.
exp_details.head()

In [None]:
# Keep only the columns we need: copies/cell and its uncertainty (any label
# containing "_copies" or "_cv") plus the protein accession column (exact match).
df = df.filter(regex=r"_copies|_cv|^Uniprot Accession\Z", axis="columns")

In [None]:
# Extract the acetate condition and give its three columns short, generic names.
df_ac = df.loc[:, ["Uniprot Accession", "Acetate_copies", "Acetate_cv"]].rename(
    columns={
        "Uniprot Accession": "uniprot",
        "Acetate_copies": "copies_per_cell",
        "Acetate_cv": "CV",
    }
)

In [None]:
# Summary statistics of the numeric acetate columns.
df_ac.describe()

In [None]:
# apply uncertainty: extend the upper bound by 1/2 of the stdev, computed as
# CV/100 * copies (assumes CV is given as a percentage of the mean -- TODO confirm)
df_ac["copies_upper"] = df_ac["copies_per_cell"] + 0.5 * df_ac["CV"]/100 * df_ac["copies_per_cell"]

First, convert copies per cell to abundance per cell.
\begin{align}
\frac{\text{mmol}}{\text{cell}} = \frac{\text{molecules}}{\text{cell}} \cdot \frac{1\,\text{mol}}{6.022 \times 10^{23}\,\text{molecules}} \cdot \frac{10^3\,\text{mmol}}{1\,\text{mol}}
\end{align}

In [None]:
# copies -> mmol: divide by Avogadro's number (6.022e23 molecules/mol) and
# scale by 1e3 mmol/mol
df_ac["mmol_per_cell"] = df_ac["copies_upper"] * 1e3/6.022e23

And, then, convert the abundance per cell into abundance per gDW.

\begin{align}
\frac{\text{mmol}}{\text{gDW}} = \frac{\text{mmol}}{\text{cell}} \frac{\text{cell}}{fL} \frac{fL}{g}\frac{g}{\text{gDW}}
\end{align}

In [None]:
# Rows of the experimental details for strain BW25113 grown on acetate. The mask
# is computed once so both look-ups below use exactly the same selection.
acetate_bw25113 = (exp_details["Growth condition"] == "Acetate") & (
    exp_details["Strain"] == "BW25113"
)
# measured growth rate of the first matching row (column unit: h-1)
growth_experimental = exp_details.loc[acetate_bw25113, "Growth rate (h-1)"].values[0]
# single cell volume of the first matching row (column unit: fl)
cell_volume = exp_details.loc[acetate_bw25113, "Single cell volume [fl]1"].values[0]
# presumably in g per fL (i.e. 1.105 g/mL) -- TODO confirm the source of this value
cell_density = 1.105e-12
# NOTE(review): this is multiplied with volume * density to obtain the dry mass of a
# cell, so despite its name it looks like the dry-weight fraction -- confirm
water_content = 0.3

In [None]:
# abundance per cell -> abundance per gDW: divide by the per-cell factor
# volume * density * water_content
df_ac["conc"] = df_ac["mmol_per_cell"] / (cell_volume * cell_density * water_content)

In [None]:
# Concentrations indexed by UniProt accession. `set_index` builds a new object, so
# the column Series held by `df_ac` is not re-indexed as a side effect.
proteomics = df_ac.set_index("uniprot")["conc"]

## 2. Model
Simulations part (caffeine)

In [None]:
# keep a copy of the model as loaded, i.e. before any proteomics bounds are applied
plain_model = model.copy()

In [None]:
def limit_proteins(model, measurements):
    """Constrain the protein exchange reactions of `model` with measurements.

    Adapted from https://github.com/DD-DeCaF/simulations/blob/devel/src/simulations/modeling/driven.py

    Parameters
    ----------
    model: cobra.Model
        The enzyme-constrained model. It is modified in place.
    measurements : pd.Series (or any mapping with ``.items()``)
        Protein abundances in mmol / gDW, keyed by protein id. Each key ``X``
        is matched against the reaction id ``prot_X_exchange``.

    """
    for protein_id, measure in measurements.items():
        exchange_id = f"prot_{protein_id}_exchange"
        try:
            exchange = model.reactions.get_by_id(exchange_id)
        except KeyError:
            # the measured protein has no exchange reaction in the model
            continue
        # the measurement is used as upper bound and the lower bound is set to 0
        # (enzymes can be unsaturated, so any usage up to the measurement is valid)
        exchange.bounds = (0, measure)

Optimize the enzyme constrained model.

In [None]:
# constrain the protein exchanges of the model with the acetate measurements
limit_proteins(model, proteomics)

In [None]:
# enzyme-constrained model with the proteomics bounds applied (the model doesn't grow)

model.optimize()

The model can't grow!

## 3. Flexibilization

Experimental measurements can be too restrictive if an uncertainty is not given. Thus, a flexibilization of the proteomics data is usually required to work with enzyme constrained models.

In [None]:
def top_shadow_prices(solution, met_ids, top=1):
    """
    Return the lowest shadow prices found among a subset of metabolites.

    Parameters
    ----------
    solution: cobra.Solution
        The usual Solution object returned by model.optimize(); only its
        ``shadow_prices`` Series is read.
    met_ids: iterable of strings
        Subset of metabolite IDs from the model. IDs that are not present in
        the shadow prices are ignored.
    top: int
        The number of metabolites to be returned.

    Returns
    -------
    shadow_pr: pd.Series
        The ``top`` smallest shadow prices of the subset, in ascending order.
    """
    all_prices = solution.shadow_prices
    selected = all_prices[all_prices.index.isin(met_ids)]
    # ascending sort: the first entries are the most negative shadow prices
    return selected.sort_values().iloc[:top]


def flexibilize_proteomics(
    model, biomass_reaction, minimal_growth, proteomics
):
    """
    Release proteomics constraints until the model reaches a minimal growth rate.

    All measurements are first applied with `limit_proteins`. While the optimized
    objective stays below the target, the protein ranked first by
    `top_shadow_prices` is removed from the set and the upper bound of its
    exchange reaction is raised to 1000.

    Adapted from https://github.com/DD-DeCaF/simulations/blob/devel/src/simulations/modeling/driven.py

    Parameters
    ----------
    model: cobra.Model
        The enzyme-constrained model. It is modified in place.
    biomass_reaction: str
        Id of the biomass reaction; its bounds are reset to (0, 1000).
    minimal_growth: float
        Growth rate to reach. The loop targets this value multiplied by 1.05.
    proteomics: pandas.Series
        Protein measurements indexed by protein id, as accepted by
        `limit_proteins`.

    Returns
    -------
    new_growth_rate: float
        Objective value of the last optimization (0 if the solver did not
        report an optimal solution).
    prots_to_remove: list of str
        Ids of the proteins whose constraint was released, in removal order.

    """
    def protein_to_metabolite(protein_id, model):
        # id of the first metabolite whose id contains the protein id, "" if none
        met_id = model.metabolites.query(lambda m: protein_id in m.id)
        return met_id[0].id if met_id else ""

    def growth_of(solution):
        # A non-optimal status and a missing or NaN objective all mean "no growth";
        # a NaN would otherwise make every `<` comparison below evaluate to False.
        value = solution.objective_value
        if solution.status != "optimal" or pd.isna(value):
            return 0
        return value

    # reset growth rate in model:
    model.reactions.get_by_id(biomass_reaction).bounds = (0, 1000)

    # build a table with protein ids, met ids in model and values to constrain with;
    # proteins without a matching metabolite are dropped from the table:
    prot_df = pd.DataFrame(proteomics)
    prot_df.index = prot_df.index.astype("str")
    prot_df["met_id"] = [protein_to_metabolite(prot, model) for prot in prot_df.index]
    prot_df = prot_df[prot_df.met_id != ""]

    # constrain the model with all proteins and optimize:
    limit_proteins(model, proteomics)
    solution = model.optimize()
    new_growth_rate = growth_of(solution)

    # target 105% of the requested growth rate
    # NOTE(review): the original comment called this "relax growth constraint", but
    # multiplying by 1.05 raises the target -- confirm the intended direction
    minimal_growth *= 1.05

    # while the model cannot grow to the desired level, remove the protein ranked
    # first by `top_shadow_prices`:
    prots_to_remove = []
    while new_growth_rate < minimal_growth and not prot_df.empty:
        # get most influential protein in model:
        ranked = top_shadow_prices(solution, list(prot_df["met_id"]))
        if ranked.empty:
            # no remaining protein has a shadow price in this solution, so there is
            # nothing left to rank; stop instead of failing with an IndexError
            break
        top_met = ranked.index[0]
        top_protein = prot_df.index[prot_df["met_id"] == top_met][0]

        # update data: append protein to list, remove from current dataframe and
        # increase the corresponding upper bound to +1000:
        prots_to_remove.append(top_protein)
        prot_df = prot_df.drop(labels=top_protein)
        limit_proteins(model, pd.Series(data=[1000], index=[top_protein]))

        # re-compute solution:
        solution = model.optimize()
        new_growth_rate = growth_of(solution)

    # report if the target could not be reached:
    if new_growth_rate < minimal_growth:
        print(
            "Minimal growth was not reached! "
            f"Final growth of the model: {new_growth_rate}"
        )

    return new_growth_rate, prots_to_remove

Enforce the experimentally measured growth rate (`growth_experimental`) as the minimal growth rate.

In [None]:
# id of the biomass reaction whose bounds are reset by `flexibilize_proteomics`
biomass_reaction = "BIOMASS_Ec_iML1515_core_75p37M"
# release proteomics constraints until the experimental growth rate can be reached
new_growth_rate, prots_removed = flexibilize_proteomics(model, biomass_reaction, growth_experimental, proteomics)

In [None]:
# compare the size of the dataset with the number of released protein constraints
print(f"Proteins in dataset: {proteomics.shape[0]}\nProteins removed: {len(prots_removed)}")

In [None]:
# optimum of the enzyme-constrained model after flexibilization
model.optimize()

In [None]:
# optimum of the reference copy taken before the proteomics bounds were applied
plain_model.optimize()

Let's compare the carbon source utilization of both models

In [None]:
# exchanges in this model are on the right-hand side; keep the exchange reactions
# that carry a positive flux in the latest solution of each model
plain_exchanges = [reaction for reaction in plain_model.exchanges if reaction.flux > 0]
enzyme_exchanges = [reaction for reaction in model.exchanges if reaction.flux > 0]

In [None]:
# flux variability analysis of the reference model (default settings)
plain_fva = flux_variability_analysis(plain_model)

In [None]:
# flux variability analysis of the enzyme-constrained model (default settings)
enzyme_fva = flux_variability_analysis(model)

In [None]:
# reactions of the enzyme-constrained model whose maximum flux is below 700, largest first
enzyme_fva[enzyme_fva.maximum < 700].sort_values("maximum", ascending=False)

In [None]:
# same selection for the reference model: maximum flux below 700, largest first
plain_fva[plain_fva.maximum < 700].sort_values("maximum", ascending=False)

In [None]:
# variability of the reactions whose id starts with "prot_", largest maximum first
enzyme_fva[enzyme_fva.index.str.startswith("prot_")].sort_values("maximum", ascending=False)

## Exercise

* Identify enzymatic bottlenecks in the enzyme-constrained model with **Acetate** as carbon source (shadow prices?).
* Prepare and limit the model for the medium with Glucose
* Does the model grow? If not, try flexibilizing the model.
* Identify enzymatic bottlenecks in the enzyme-constrained model with **Glucose** as carbon source (shadow prices?).
