# Import and Load Libraries

In [1]:
# Python Standard Library and 3rd Party Imports 

import pandas as pd
import numpy as np
import networkx as nx
import scipy
import copy
import re
from collections import Counter, defaultdict
from itertools import chain, combinations
from pathlib import Path

In [2]:
# COBRA imports

import cobra
from cobra import Model, Reaction
from cobra.core import Group, Reaction
from cobra.flux_analysis import flux_variability_analysis
from cobra.flux_analysis.fastcc import fastcc
from cobra.io import read_sbml_model, save_matlab_model, write_sbml_model
from cobra.util.solver import linear_reaction_coefficients
from rapidfuzz import fuzz, process
from networkx.algorithms import bipartite
from scipy.linalg import svd
from cobra.util.array import create_stoichiometric_matrix
from networkx.algorithms.simple_paths import shortest_simple_paths
from optlang.symbolics import Zero
from optlang import Model as OptModel
from collections import deque

# Functions

In [3]:
def are_reactions_interconnected(model, reaction_ids):
    """
    Check whether a list of reactions are all interconnected via shared metabolites.
    
    Parameters:
    - model: cobra.Model
    - reaction_ids: list of reaction IDs to test

    Returns:
    - bool: True if all reactions are connected via shared metabolites
    """
    # Create a graph where nodes = reactions, edges = shared metabolite
    G = nx.Graph()
    G.add_nodes_from(reaction_ids)

    # Build edges based on shared metabolites
    for i, rxn1_id in enumerate(reaction_ids):
        if rxn1_id not in model.reactions:
            continue
        rxn1 = model.reactions.get_by_id(rxn1_id)
        mets1 = set(rxn1.metabolites)

        for rxn2_id in reaction_ids[i+1:]:
            if rxn2_id not in model.reactions:
                continue
            rxn2 = model.reactions.get_by_id(rxn2_id)
            mets2 = set(rxn2.metabolites)

            # Add edge if they share at least one metabolite
            if mets1 & mets2:
                G.add_edge(rxn1_id, rxn2_id)

    # Check if the graph is fully connected
    return nx.is_connected(G)

In [4]:
def get_largest_connected_subgroup(model, reaction_list):
    """
    Identify the largest connected component of a group of reactions based on shared metabolites.

    Parameters:
    - model: cobra.Model
        A COBRA model containing reactions and metabolites.
    - reaction_list: list of str
        A list of reaction IDs to analyze.

    Returns:
    - list of str:
        The subset of reaction IDs from the input that belong to the largest connected component.
        Connectivity is defined by reactions sharing at least one metabolite (bipartite graph: reactions ↔ metabolites).
    """
    G = nx.Graph()
    for rxn_id in reaction_list:
        rxn = model.reactions.get_by_id(rxn_id)
        for met in rxn.metabolites:
            G.add_edge(rxn.id, met.id)

    components = nx.connected_components(G)
    
    reaction_components = [
        set(comp).intersection(reaction_list) for comp in components
    ]
    
    largest = max(reaction_components, key=len, default=[])
    return list(largest)


In [5]:
def lump_reaction(model, coupled_df, reaction_column="Coupled_Reactions", verbose=True, label_type="Group"):
    """
    Create a lumped model by replacing fully coupled reaction groups with pseudo-reactions.

    Parameters:
    - model: cobra.Model
        The original COBRA model (remains unchanged).
    - coupled_df: pd.DataFrame
        DataFrame with a column containing lists of reactions to lump.
    - reaction_column: str
        The column name in `coupled_df` that holds lists of reactions (default: "Coupled_Reactions").
    - verbose: bool
        If True, print progress messages.
    - label_type: str
        Label prefix for naming the pseudo-reactions (e.g., "Group", "Module").

    Returns:
    - model_lumped: cobra.Model
        A copy of the model with pseudo-reactions added and original reactions removed.
    - translation_df: pd.DataFrame
        DataFrame mapping pseudo-reactions to their original reaction IDs.
    """
    import copy
    from cobra import Reaction
    from collections import defaultdict
    import pandas as pd

    model_lumped = copy.deepcopy(model)
    translation_data = []

    for i, row in coupled_df.iterrows():
        group_reactions = row[reaction_column]

        # Filter only reactions that exist in the model
        valid_reactions = [rxn_id for rxn_id in group_reactions if rxn_id in model_lumped.reactions]

        if len(valid_reactions) < 2:
            if verbose:
                print(f"⏭️ Skipping group {i} — fewer than 2 valid reactions.")
            continue

        # Determine common flux bounds
        lower_bounds = [model_lumped.reactions.get_by_id(r).lower_bound for r in valid_reactions]
        upper_bounds = [model_lumped.reactions.get_by_id(r).upper_bound for r in valid_reactions]
        min_lb = max(lower_bounds)
        max_ub = min(upper_bounds)

        # Compute net stoichiometry
        net_stoich = defaultdict(float)
        for rxn_id in valid_reactions:
            rxn = model_lumped.reactions.get_by_id(rxn_id)
            for met, coeff in rxn.metabolites.items():
                net_stoich[met] += coeff
        cleaned_stoich = {met: coeff for met, coeff in net_stoich.items() if abs(coeff) > 1e-10}

        # Create pseudo-reaction
        pseudo_id = f"Pseudo_{label_type}_{i}"
        pseudo_rxn = Reaction(id=pseudo_id)
        pseudo_rxn.name = f"Lumped reaction for: {', '.join(valid_reactions)}"
        pseudo_rxn.lower_bound = min_lb
        pseudo_rxn.upper_bound = max_ub
        pseudo_rxn.add_metabolites(cleaned_stoich)

        # Add and remove reactions
        model_lumped.add_reactions([pseudo_rxn])
        for rxn_id in valid_reactions:
            model_lumped.reactions.remove(model_lumped.reactions.get_by_id(rxn_id))

        # Record mapping
        translation_data.append({
            'Pseudo_Reaction_ID': pseudo_id,
            'Original_Reactions': valid_reactions
        })

        if verbose:
            print(f"✅ {label_type} {i}: Created '{pseudo_id}' from {valid_reactions}")

    translation_df = pd.DataFrame(translation_data)
    return model_lumped, translation_df

In [6]:
def find_precursors(model, biomass_precursors, core_mets, max_depth=5):
    """
    Find core metabolites that can act as precursors to each biomass precursor,
    using reactions in the model. Handles reversibility and stoichiometry.

    Parameters:
        model (cobra.Model): The metabolic model.
        biomass_precursors (list): List of metabolite IDs in the biomass reaction.
        core_mets (list): List of core metabolite IDs.
        max_depth (int): Maximum number of reactions to search backwards.

    Returns:
        dict: {biomass_precursor: set of core_metabolites_that_can_reach_it}
    """
    # Build reverse graph: met_id -> reactions that can produce it
    met_to_producing_rxns = defaultdict(list)

    for rxn in model.reactions:
        for met, coeff in rxn.metabolites.items():
            if coeff > 0:
                # Metabolite is produced in forward direction
                met_to_producing_rxns[met.id].append((rxn, 'forward'))
            elif coeff < 0 and rxn.lower_bound < 0:
                # Metabolite is produced in reverse direction
                met_to_producing_rxns[met.id].append((rxn, 'reverse'))

    result = {}

    for target_met in biomass_precursors:
        visited_mets = set()
        visited_rxns = set()
        queue = deque([(target_met, 0)])  # (metabolite_id, depth)
        precursors = set()

        while queue:
            current_met, depth = queue.popleft()
            if depth >= max_depth:
                continue

            producing_rxns = met_to_producing_rxns.get(current_met, [])

            for rxn, direction in producing_rxns:
                if (rxn.id, direction) in visited_rxns:
                    continue
                visited_rxns.add((rxn.id, direction))

                # Get reactants based on direction
                if direction == 'forward':
                    reactants = rxn.reactants
                else:
                    reactants = rxn.products  # in reverse, products become precursors

                for reactant in reactants:
                    if reactant.id in visited_mets:
                        continue
                    visited_mets.add(reactant.id)

                    if reactant.id in core_mets:
                        precursors.add(reactant.id)
                    else:
                        queue.append((reactant.id, depth + 1))

        result[target_met] = precursors

    return result

In [7]:
def report_unreachable_precursors(model, biomass_precursors, core_mets, max_depth_range=10):
    """
    For each depth level (1 to max_depth_range), report biomass precursors 
    that cannot be reached from core metabolites.

    Parameters:
        model (cobra.Model): COBRApy model.
        biomass_precursors (list): List of biomass metabolite IDs.
        core_mets (list): List of core metabolite IDs.
        max_depth_range (int): Max depth to explore (default: 10).
    
    Returns:
        dict: {depth: list of unreachable biomass precursors at that depth}
    """
    depth_to_unreachable = {}

    for depth in range(1, max_depth_range + 1):
        precursor_map = find_precursors(model, biomass_precursors, core_mets, max_depth=depth)
        unreachable = [met for met, precursors in precursor_map.items() if not precursors]
        depth_to_unreachable[depth] = unreachable
        print(f"Step {depth}: {len(unreachable)} precursors not reachable → {unreachable}")

    return depth_to_unreachable

# Import Data

In [8]:
# Set the project root path by going up from the notebook location
notebook_dir = Path(__file__).parent if '__file__' in globals() else Path().resolve()
project_root = notebook_dir.parent.parent  # Go up from /code/GEM_Reduction/

# Construct raw paths 
raw_data_path = project_root / "data" / "raw" 
raw_sbml_path = raw_data_path / "sbml_files" 
raw_mat_path = raw_data_path / "matlab_files" 
raw_csv_path = raw_data_path / "csv_files" 

# Construct processed paths 
processed_data_path = project_root / "data" / "processed" 
processed_sbml_path = processed_data_path / "sbml_files" 
processed_csv_path = processed_data_path / "csv_files" 

In [9]:
# read model
model = read_sbml_model(raw_sbml_path / "iAF1260.xml")
subsystem_df = pd.read_csv(raw_csv_path / 'iAF1260_subsystem_assignments.csv')

In [10]:
# import pairs coupled and reactions blocked by F2C2
Coupled_Pairs_df = pd.read_csv(raw_csv_path / 'fctable_iAF1260.csv', header=None)
F2C2_Blocked_Reactions_df = pd.read_csv(raw_csv_path / 'blocked_reactions_iAF1260.csv', header=None)

### Basic model investigation

Model Basics
- Model ID: iAF1260
- Number of Reactions: 2382
- Number of Metabolites: 1668
- Number of Genes: 1261
- Number of Exchange Reactions: 299
- Reversible Reactions: 575
- Irreversible Reactions: 1807
- Objective reaction(s): R_BIOMASS_Ec_iAF1260_core_59p81M

- FBA Objective value (biomass): 0.7367

In [11]:
# Get the stoichiometric matrix as a NumPy array
stoich_dense = create_stoichiometric_matrix(model)  # Already a NumPy array

# Compute the rank of the stoichiometric matrix
rank = np.linalg.matrix_rank(stoich_dense)

# Degrees of freedom
num_reactions = len(model.reactions)
dof = num_reactions - rank

# Output
print(f"Stoichiometric Matrix Rank: {rank}")
print(f"Number of Reactions: {num_reactions}")
print(f"Degrees of Freedom: {dof}")

Stoichiometric Matrix Rank: 1630
Number of Reactions: 2382
Degrees of Freedom: 752


# Model Compression

In [12]:
 # Create a deep copy of the model to avoid modifying the original
model_copy = copy.deepcopy(model)

**Step 1**: Open flux bounds

In [13]:
# Allow flux in and out for all exchange reactions
for rxn in model_copy.exchanges:
    rxn.lower_bound = -10
    rxn.upper_bound = 1000

In [14]:
solution_open = model_copy.optimize()
obj_open = solution_open.objective_value

print(f'After opening bounds of all exchanges, the optimal solution is {round(obj_open,2)}!')

After opening bounds of all exchanges, the optimal solution is 36.4!


In [15]:
### Export as .mat for matlab applications

save_matlab_model(model_copy, raw_mat_path / "iAF1260.mat") # use generic model to get unbiased couplings (filter later!)

**Step 2**: Run FASTCC

In [16]:
# Reduce model with COBRApy FASTCC 
consistent_generic_model = fastcc(model_copy)

In [17]:
# Get the stoichiometric matrix as a NumPy array
stoich_dense = create_stoichiometric_matrix(consistent_generic_model)  # Already a NumPy array

# Compute the rank of the stoichiometric matrix
rank = np.linalg.matrix_rank(stoich_dense)

# Degrees of freedom
num_reactions = len(consistent_generic_model.reactions)
dof = num_reactions - rank

# Output
print(f"Stoichiometric Matrix Rank: {rank}")
print(f"Number of Reactions: {num_reactions}")
print(f"Degrees of Freedom: {dof}")

Stoichiometric Matrix Rank: 1440
Number of Reactions: 2153
Degrees of Freedom: 713


**Step 2.1**: Make F2C2 Data accessible

In [18]:
# Add reaction annotations to match F2C2 blocked reactions
model_rxns = [rxn.id for rxn in model.reactions]
F2C2_Blocked_Reactions_df.loc[len(F2C2_Blocked_Reactions_df)] = model_rxns

# Identify unblocked reactions from the second row (index 1)
unblocked_mask = F2C2_Blocked_Reactions_df.loc[0] == 0
unblocked_reactions = F2C2_Blocked_Reactions_df.loc[1][unblocked_mask].tolist()

# Update Coupled_Pairs_df index and columns with unblocked reactions
Coupled_Pairs_df.index = unblocked_reactions
Coupled_Pairs_df.columns = unblocked_reactions

In [19]:
# Get number of how many fully coupled pairs
fully_coupled_count = int(((Coupled_Pairs_df == 1).sum().sum()) - len(Coupled_Pairs_df))

*SideQuest* : Investigate difference in 'blocking' between FASTCC and F2C2

In [20]:
# Extract and convert reactions from models to sets for fast comparison
generic_rxns = {rxn.id for rxn in consistent_generic_model.reactions}
f2c2_unblocked_rxns = set(unblocked_reactions)

# Compare overlaps and differences
overlap_generic = list(generic_rxns & f2c2_unblocked_rxns)

# Reactions unblocked by F2C2 but removed by FASTCC
missing_from_generic = list(f2c2_unblocked_rxns - generic_rxns)

# Reactions kept by FASTCC but blocked by F2C2 (ideally empty)
unexpected_in_generic = list(generic_rxns - f2c2_unblocked_rxns)

**Step 2**: Create 'Fully Coupled Clusters'

In [21]:
# Build a graph of fully coupled reactions
G = nx.Graph()

for row in Coupled_Pairs_df.index:
    for col in Coupled_Pairs_df.columns:
        if Coupled_Pairs_df.loc[row, col] == 1 and row != col:
            G.add_edge(row, col)

# Find connected components (fully coupled groups)
coupled_groups = list(nx.connected_components(G))

In [22]:
# Build reaction groups with net stoichiometry and input/output metabolites
combined_data = []
for group in coupled_groups:
    group_reactions = list(group)

    # Compute net stoichiometry
    stoich = defaultdict(float)
    for rxn_id in group_reactions:
        rxn = model.reactions.get_by_id(rxn_id)
        for met, coeff in rxn.metabolites.items():
            stoich[met] += coeff

    inputs = [met.id for met, coeff in stoich.items() if coeff < 0]
    outputs = [met.id for met, coeff in stoich.items() if coeff > 0]

    combined_data.append({
        "Coupled_Reactions": group_reactions,
        "Num_Inputs": len(inputs),
        "Num_Outputs": len(outputs),
        "Input_Metabolites": inputs,
        "Output_Metabolites": outputs
    })

# Create initial DataFrame
Fully_Coupled_df = pd.DataFrame(combined_data)

In [23]:
# Flatten reaction lists to count how many should be removed
all_reactions = Fully_Coupled_df['Coupled_Reactions'].explode()

ex_removed_count = all_reactions.str.startswith('EX_').sum()
biomass_removed_flag = 'BIOMASS_Ec_iAF1260_core_59p81M' in set(all_reactions)
missing_set = set(missing_from_generic)
missing_removed_count = all_reactions.isin(missing_set).sum()

# Clean the reaction lists: remove EX_, biomass, and missing reactions
Fully_Coupled_df['Coupled_Reactions'] = Fully_Coupled_df['Coupled_Reactions'].apply(
    lambda rxns: [
        r for r in rxns
        if not r.startswith('EX_')
        and r != 'BIOMASS_Ec_iAF1260_core_59p81M'
        and r not in missing_set
    ]
)

# Drop empty groups
Fully_Coupled_df = Fully_Coupled_df[Fully_Coupled_df['Coupled_Reactions'].str.len() > 0]

# Report removals
print(f"Total 'EX_' reactions removed: {ex_removed_count}")
print(f"'BIOMASS_Ec_iAF1260_core_59p81M' was removed: {biomass_removed_flag}")
print(f"Total reactions removed due to fastcc removal: {missing_removed_count}")

Total 'EX_' reactions removed: 277
'BIOMASS_Ec_iAF1260_core_59p81M' was removed: True
Total reactions removed due to fastcc removal: 35


In [24]:
# Check cluster connectivity
Fully_Coupled_df['Cluster'] = Fully_Coupled_df['Coupled_Reactions'].apply(
    lambda rxn_list: are_reactions_interconnected(consistent_generic_model, rxn_list)
)

# Count number of reactions
Fully_Coupled_df['Num_Reactions'] = Fully_Coupled_df['Coupled_Reactions'].apply(len)

# Remove disconnected pairs of size 2
Fully_Coupled_df = Fully_Coupled_df[~((Fully_Coupled_df['Num_Reactions'] == 2) & (Fully_Coupled_df['Cluster'] == False))]

# Fix broken clusters with more than 2 reactions
mask = (Fully_Coupled_df['Cluster'] == False) & (Fully_Coupled_df['Num_Reactions'] > 2)
Fully_Coupled_df.loc[mask, 'Cleaned_Coupled_Reactions'] = Fully_Coupled_df.loc[mask, 'Coupled_Reactions'].apply(
    lambda rxns: get_largest_connected_subgroup(consistent_generic_model, rxns)
)

# Keep reactions as-is for connected groups
Fully_Coupled_df['Cleaned_Coupled_Reactions'] = Fully_Coupled_df['Cleaned_Coupled_Reactions'].fillna(Fully_Coupled_df['Coupled_Reactions'])

# Track removed reactions per group
Fully_Coupled_df['Removed_Reactions'] = Fully_Coupled_df.apply(
    lambda row: list(set(row['Coupled_Reactions']) - set(row['Cleaned_Coupled_Reactions'])),
    axis=1
)

In [25]:
# Prepare final DataFrame for analysis
selected_columns = ['Cleaned_Coupled_Reactions', 'Removed_Reactions']
Lumping_Couples_df = Fully_Coupled_df[selected_columns].copy()

# Drop single-reaction groups
Lumping_Couples_df = Lumping_Couples_df[Lumping_Couples_df['Cleaned_Coupled_Reactions'].str.len() > 1]

# Map reactions to subsystems
subsystem_map = dict(zip(subsystem_df['iAF1260_BIGG'], subsystem_df['Subsystem']))
Lumping_Couples_df['Subsystem'] = Lumping_Couples_df['Cleaned_Coupled_Reactions'].apply(
    lambda rxn_list: list(set(subsystem_map.get(rxn, 'Unknown') for rxn in rxn_list))
)

# Add summary columns
Lumping_Couples_df['Num_Subsystems'] = Lumping_Couples_df['Subsystem'].apply(len)
Lumping_Couples_df['Num_Reactions'] = Lumping_Couples_df['Cleaned_Coupled_Reactions'].apply(len)

**Step 3**: Lump fully coupled clusters into pseudo-reactions

In [26]:
model_lumped, translation_df = lump_reaction(consistent_generic_model, Lumping_Couples_df, reaction_column="Cleaned_Coupled_Reactions",verbose=False)

In [27]:
# Identify and remove orphaned metabolites used only in pseudo-reactions
metabolites_to_remove = []

for met in model_lumped.metabolites:
    # Get all reactions involving this metabolite
    associated_rxns = [rxn.id for rxn in met.reactions]
    
    # If the metabolite only appears in one reaction AND it's a pseudo-reaction
    if len(associated_rxns) == 1 and associated_rxns[0].startswith("Pseudo_"):
        metabolites_to_remove.append(met)

# Remove them from the model
model_lumped.remove_metabolites(metabolites_to_remove)

print(f"🧹 Removed {len(metabolites_to_remove)} internal metabolites used only in pseudo-reactions.")

🧹 Removed 0 internal metabolites used only in pseudo-reactions.


In [28]:
# Get the stoichiometric matrix as a NumPy array
stoich_dense = create_stoichiometric_matrix(model_lumped)  

# Compute the rank of the stoichiometric matrix
rank = np.linalg.matrix_rank(stoich_dense)

# Degrees of freedom
num_reactions = len(model_lumped.reactions)
dof = num_reactions - rank

# Output
print(f"Stoichiometric Matrix Rank: {rank}")
print(f"Number of Reactions: {num_reactions}")
print(f"Degrees of Freedom: {dof}")

Stoichiometric Matrix Rank: 887
Number of Reactions: 1545
Degrees of Freedom: 658


**Step 4**: Sanity Check

In [30]:
sol_orig = model_copy.optimize()
sol_lumped = model_lumped.optimize()

print(f"Original model FBA objective: {sol_orig.objective_value:.6f}")
print(f"Lumped model FBA objective:   {sol_lumped.objective_value:.6f}")

if abs(sol_orig.objective_value - sol_lumped.objective_value) < 1e-6:
    print("✅ FBA optimum preserved.")
else:
    print("⚠️ FBA optimum changed — check lumping effects.")

Original model FBA objective: 36.400622
Lumped model FBA objective:   36.400622
✅ FBA optimum preserved.


In [31]:
# Get only the pseudo-reactions
pseudo_rxns = [rxn.id for rxn in model_lumped.reactions if rxn.id.startswith("Pseudo_")]

# Run FVA on those reactions
fva_result = flux_variability_analysis(model_lumped, reaction_list=pseudo_rxns, fraction_of_optimum=0.0)

# Check which pseudo-reactions are blocked (min=max=0)
blocked = fva_result[(fva_result["minimum"] == 0) & (fva_result["maximum"] == 0)]

print(f"Total pseudo-reactions: {len(pseudo_rxns)}")
print(f"Blocked pseudo-reactions: {len(blocked)}")

if not blocked.empty:
    print("⚠️ Some pseudo-reactions are blocked:")
    print(blocked)
else:
    print("✅ All pseudo-reactions are feasible.")

Total pseudo-reactions: 347
Blocked pseudo-reactions: 0
✅ All pseudo-reactions are feasible.


# Subsystem based lumping

**Step 1**: Group by module

In [32]:
# Dictionary with subsystem mapping
subsystem_map = dict(zip(subsystem_df['iAF1260_BIGG'], subsystem_df['Subsystem']))

# Create working copy
translation_cp_df = translation_df.copy()

# Apply the logic inline, returning a tuple (Subsystem, Unambiguous_Subsystem), then split into columns
translation_cp_df[['Subsystem', 'Unambiguous_Subsystem']] = translation_cp_df['Original_Reactions'].apply(
    lambda reactions: (
        (lambda subsystems: (
            Counter(subsystems).most_common(1)[0][0] if subsystems else None,
            True if len(set(Counter(subsystems).values())) == 1 and len(set(subsystems)) == 1 else False
        ))([subsystem_map[rxn] for rxn in reactions if rxn in subsystem_map])
    )
).apply(pd.Series)

# Create new rows from subsystem_df
new_rows = pd.DataFrame({
    'Pseudo_Reaction_ID': subsystem_df['iAF1260_BIGG'],
    'Original_Reactions': [None] * len(subsystem_df),
    'Subsystem': subsystem_df['Subsystem'],
    'Unambiguous_Subsystem': [True] * len(subsystem_df)  # Set to True explicitly
})

# Append new rows to the main DataFrame
translation_cp_df = pd.concat([translation_cp_df, new_rows], ignore_index=True)

In [33]:
#  Extract all reaction IDs from model_lumped
model_reaction_ids = [rxn.id for rxn in model_lumped.reactions]

# Subset translation_df to only those rows with matching Pseudo_Reaction_ID
subset_df = translation_cp_df[translation_cp_df['Pseudo_Reaction_ID'].isin(model_reaction_ids)].copy()

In [34]:
# Group by 'Subsystem' and aggregate
grouped_df = subset_df.groupby('Subsystem').agg(
    Members=('Pseudo_Reaction_ID', list),
    Member_Count=('Pseudo_Reaction_ID', 'count')
).reset_index()

**Step 2**: Remove modules that cannot be lumped

Note: For Core, reference: *Ataman, et al. "redGEM: Systematic reduction ... consistent core metabolic models." PLoS computational biology 13.7 (2017): e1005444.*

In [35]:
Forbidden_subsystems_list = ['S_Exchange','S_Unassigned','S_']

Core_subsystems_list = ['S_GlycolysisGluconeogenesis','S_Pentose_Phosphate_Pathway','S_Citric_Acid_Cycle','S_Glyoxylate_Metabolism','S_Pyruvate_Metabolism','S_Oxidative_Phosphorylation']

Remove_list = Forbidden_subsystems_list + Core_subsystems_list

In [36]:
# Subset: rows to remove (core)
Core_grouped_df = grouped_df[grouped_df['Subsystem'].isin(Remove_list)].copy()

# Subset: rows to keep (non-core)
Noncore_grouped_df = grouped_df[~grouped_df['Subsystem'].isin(Remove_list)].copy()

**Step 3**: Connect to relevance for biomass reaction

In [37]:
# Get metabolites required by the biomass reaction (educts)
biomass_rxn = model_lumped.reactions.get_by_id("BIOMASS_Ec_iAF1260_core_59p81M")
biomass_educts = {met.id for met, coeff in biomass_rxn.metabolites.items() if coeff < 0}

# Containers for new columns
educts_list = []
products_list = []
internal_list = []
biomass_outputs = []
biomass_flags = []

# Iterate over each subsystem group (each row)
for _, row in Noncore_grouped_df.iterrows():
    reaction_ids = row['Members']
    stoich = defaultdict(float)

    # Accumulate net stoichiometry across all reactions
    for rxn_id in reaction_ids:
        if rxn_id not in model_lumped.reactions:
            continue
        rxn = model_lumped.reactions.get_by_id(rxn_id)
        for met, coeff in rxn.metabolites.items():
            stoich[met.id] += coeff

    # Classify metabolites
    educt_met = [m for m, c in stoich.items() if c < 0]
    product_met = [m for m, c in stoich.items() if c > 0]

    # Biomass-relevant products
    biomass_met = [m for m in product_met if m in biomass_educts]
    biomass_relevant = len(biomass_met) > 0

    # Append results
    educts_list.append(educt_met)
    products_list.append(product_met)
    biomass_outputs.append(biomass_met)
    biomass_flags.append(biomass_relevant)

# Assign new columns to grouped_df
Noncore_grouped_df['educt_met'] = educts_list
Noncore_grouped_df['product_met'] = products_list
Noncore_grouped_df['biomass'] = biomass_outputs
Noncore_grouped_df['biomass_relevant'] = biomass_flags

Noncore_grouped_df['Interconnected'] = Noncore_grouped_df['Members'].apply(
    lambda rxns: are_reactions_interconnected(model_lumped, rxns)
)

Noncore_grouped_df.sort_values(by='Member_Count',ascending=False,inplace=True)

In [38]:
# Find largest subclusters that are interconnected for non-interconnected groups
mask = (Noncore_grouped_df['Interconnected'] == False)

Noncore_grouped_df.loc[mask, 'Members_Cleaned_Reactions'] = Noncore_grouped_df.loc[mask, 'Members'].apply(
    lambda rxns: get_largest_connected_subgroup(model_lumped, rxns)
)

In [39]:
Noncore_grouped_df['Members_Cleaned_Reactions_Number'] = Noncore_grouped_df['Members_Cleaned_Reactions'].apply(lambda x: len(x) if isinstance(x, list) else 0)

In [40]:
Noncore_grouped_df["Leftover_Members"] = Noncore_grouped_df.apply(
    lambda row: list(set(row["Members"]) - set(row["Members_Cleaned_Reactions"]))
    if isinstance(row["Members_Cleaned_Reactions"], list) else np.nan,
    axis=1
)

In [41]:
# Find second largest subclusters that are interconnected for non-interconnected groups

mask = Noncore_grouped_df["Members_Cleaned_Reactions"].notna()

# Apply function only to rows where the mask is True
Noncore_grouped_df.loc[mask, 'Leftover_Cleaned_Members'] = Noncore_grouped_df.loc[mask, 'Leftover_Members'].apply(
    lambda rxns: get_largest_connected_subgroup(model_lumped, rxns)
)

In [42]:
Noncore_grouped_df['Leftover_Cleaned_Members_Number'] = Noncore_grouped_df['Leftover_Cleaned_Members'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Lump Subsystems

**Step 1**: Truly interconnected subsystems

In [43]:
# subset to initially true ones
copy1_df = Noncore_grouped_df[Noncore_grouped_df['Interconnected'] == True]

In [44]:
copy1, translation_df = lump_reaction(model_lumped, copy1_df, reaction_column="Members",verbose=False,label_type="Subsystem")

In [45]:
# Get the stoichiometric matrix as a NumPy array
stoich_dense = create_stoichiometric_matrix(copy1)  

# Compute the rank of the stoichiometric matrix
rank = np.linalg.matrix_rank(stoich_dense)

# Degrees of freedom
num_reactions = len(copy1.reactions)
dof = num_reactions - rank

# Output
print(f"Stoichiometric Matrix Rank: {rank}")
print(f"Number of Reactions: {num_reactions}")
print(f"Degrees of Freedom: {dof}")

Stoichiometric Matrix Rank: 797
Number of Reactions: 1284
Degrees of Freedom: 487


**Step 2**: Largest Subclusters of non interconnected Subsystems

In [46]:
copy2_df = Noncore_grouped_df[Noncore_grouped_df["Members_Cleaned_Reactions"].notna()]

In [47]:
copy2, translation_df = lump_reaction(copy1, copy2_df, reaction_column="Members_Cleaned_Reactions",verbose=False,label_type="Subsystem_Cluster1")

In [48]:
# Get the stoichiometric matrix as a NumPy array
stoich_dense = create_stoichiometric_matrix(copy2)  

# Compute the rank of the stoichiometric matrix
rank = np.linalg.matrix_rank(stoich_dense)

# Degrees of freedom
num_reactions = len(copy2.reactions)
dof = num_reactions - rank

# Output
print(f"Stoichiometric Matrix Rank: {rank}")
print(f"Number of Reactions: {num_reactions}")
print(f"Degrees of Freedom: {dof}")

Stoichiometric Matrix Rank: 479
Number of Reactions: 540
Degrees of Freedom: 61


In [49]:
# note: can get to degree 34 when linking all other reactions of clusters, but then destroying network stucture.

# Context-Specifity

In [50]:
context_model = copy.deepcopy(copy2)

In [51]:
# Get all exchange reactions
exchange_reactions = context_model.exchanges  # model.exchanges returns a list of exchange reactions

# Build the dataframe
exchange_df = pd.DataFrame({
    "ID": [rxn.id for rxn in exchange_reactions],
    "Name": [rxn.name for rxn in exchange_reactions]
})

In [52]:
# Loop through all exchange reactions in model1
for rxn in model.exchanges:
    rxn_id = rxn.id

    # Check if the same exchange reaction exists in model2
    if rxn_id in context_model.reactions:
        # Get the corresponding reaction in model2
        target_rxn = context_model.reactions.get_by_id(rxn_id)

        # Copy bounds
        target_rxn.lower_bound = rxn.lower_bound
        target_rxn.upper_bound = rxn.upper_bound

# Pseudo Paths for Biomass Precursors

**Step 1**: Extract Biomass Precursors, Core Reactions & Metabolites

In [53]:
# Get reactants (educts) of the biomass reaction
biomass_rxn = model_lumped.reactions.get_by_id("BIOMASS_Ec_iAF1260_core_59p81M")
biomass_educts = [met.id for met in biomass_rxn.reactants]
original_stoich = biomass_rxn.metabolites  # dict: {met: coeff}

In [54]:
# Extract all Core reactions & Core metabolites

# Combine all reaction IDs from the 'core' subsystems in Remove_list
core_reactions = set()

# Filter rows where 'Subsystem' is in Remove_list
filtered_df = grouped_df[grouped_df['Subsystem'].isin(Remove_list)]

# Loop through those rows and collect reactions
for members in filtered_df['Members']:
    core_reactions.update(members)  # Assuming each entry is a list of reaction IDs

# Extract all metabolites from Core reactions
core_metabolites = set()
for rxn_id in core_reactions:
    try:
        rxn = model_lumped.reactions.get_by_id(rxn_id)
        for met in rxn.metabolites:
            core_metabolites.add(met.id)
    except KeyError:
        print(f"Reaction {rxn_id} not found in model.")

In [55]:
# Check if all metabolites are in core subset

intersected = list(set(core_metabolites) & set(biomass_educts))

all_in_core = set(biomass_educts).issubset(set(core_metabolites))
print(all_in_core)

# If False, can print missing ones!

missing = [met for met in biomass_educts if met not in core_metabolites]
print("Missing from core_metabolites:", missing)

True
Missing from core_metabolites: []


**Step 2**: Get minimum core precursors for each biomass precursors

In [56]:
# Determine at what depth each biomass precursor becomes reachable
min_depth_map = {met: 1 for met in biomass_educts}  # default to 1

unreachable_by_step = report_unreachable_precursors(model_lumped,biomass_educts,core_metabolites,max_depth_range=3)

for depth, unreachable in unreachable_by_step.items():
    for met in unreachable:
        min_depth_map[met] = depth + 1  # It's not reachable at this depth, so try next # Run find_precursors for each individual biomass precursor at its required depth

Step 1: 4 precursors not reachable → ['ca2_c', 'cobalt2_c', 'cu2_c', 'murein5px4p_p']
Step 2: 1 precursors not reachable → ['murein5px4p_p']
Step 3: 0 precursors not reachable → []


**Step 3**: Create Summary for Pseudo-Reactions

In [57]:
records = []

# For each biomass precursor, use the minimum depth needed
for biomass_met in biomass_educts:
    required_depth = min_depth_map[biomass_met]

    result = find_precursors(model_lumped, [biomass_met], core_metabolites, max_depth=required_depth)
    precursors = result.get(biomass_met, set())

    records.append({
        'biomass_precursor': biomass_met,
        'core_precursors': sorted(list(precursors)),
        'k_steps': required_depth
    })

# Create the DataFrame
df_precursor_summary = pd.DataFrame(records)

In [58]:
# Remove mappings for biomass precursors that are in fact core metabolites 

# Unique biomass_precursor values
biomass_set = set(df_precursor_summary['biomass_precursor'])

# Flatten all lists in 'core_precursors' column into one set
core_precursor_set = set()
for precursors in df_precursor_summary['core_precursors']:
    core_precursor_set.update(precursors)

# Find intersection
core_and_biomass_precursors = sorted(biomass_set & core_precursor_set)

In [59]:
# Subset the DataFrame
df_precursor_summary_filtered = df_precursor_summary[~df_precursor_summary['biomass_precursor'].isin(core_and_biomass_precursors)].copy()

**Step 4**: Create Pseudo Reactions 

In [60]:
 # Create a deep copy of the model to avoid modifying the original
model_biomass = copy.deepcopy(model_lumped)

In [61]:
pseudo_reactions = []

for _, row in df_precursor_summary.iterrows():
    biomass_met_id = row['biomass_precursor']
    core_precursor_ids = row['core_precursors']

    # Create new Reaction object
    rxn = Reaction(id=f"Pseudo_Biomass_{biomass_met_id}")
    rxn.name = f"Pseudo_Biomass_{biomass_met_id}"
    rxn.lower_bound = 0
    rxn.upper_bound = 1000

    # Build stoichiometry dict: reactants have negative coeffs, product is positive
    stoich = {}

    for met_id in core_precursor_ids:
        met = model_biomass.metabolites.get_by_id(met_id)
        stoich[met] = -1.0  # default stoichiometry

    biomass_met = model_biomass.metabolites.get_by_id(biomass_met_id)
    stoich[biomass_met] = 1.0  # product

    # Add metabolites and append to list
    rxn.add_metabolites(stoich)
    pseudo_reactions.append(rxn)

# Add all pseudo-reactions to the model
model_biomass.add_reactions(pseudo_reactions)