# Import and Load Libraries

In [8]:
# Python Standard Library and 3rd Party Imports 

import pandas as pd
import numpy as np
import networkx as nx
import scipy
import copy
import re
from collections import Counter, defaultdict
from itertools import chain, combinations
from pathlib import Path

In [2]:
# COBRA imports

import cobra
from cobra import Model, Reaction
from cobra.core import Group, Reaction
from cobra.flux_analysis import flux_variability_analysis
from cobra.flux_analysis.fastcc import fastcc
from cobra.io import read_sbml_model, save_matlab_model, write_sbml_model
from cobra.util.solver import linear_reaction_coefficients
from rapidfuzz import fuzz, process
from cobra.util.array import create_stoichiometric_matrix

# Functions

In [3]:
def lump_reaction(model, coupled_df, verbose=True, Search_COBRA_groups=False, label_type="Group"):
    """
    Create a lumped model by replacing fully coupled reaction groups with pseudo-reactions.

    Parameters:
    - model: cobra.Model (the original model, will remain unchanged)
    - coupled_df: pd.DataFrame with either:
        - columns ['Coupled_Reactions', 'COBRA_Groups'] (default format), or
        - column ['Reactions'], and Search_COBRA_groups=True
    - verbose: bool, if True print progress messages
    - Search_COBRA_groups: bool, if True, search model groups for each reaction if not supplied
    - label_type: str, either 'Group' or 'Module' — used in pseudo-reaction naming (required)

    Returns:
    - model_lumped: cobra.Model with pseudo-reactions added
    - translation_df: pd.DataFrame mapping pseudo-reactions to original reactions and groups
    """
    import copy
    from cobra import Reaction
    from collections import defaultdict
    import pandas as pd

    model_lumped = copy.deepcopy(model)
    translation_data = []

    # Handle case with only 'Reactions' column — determine COBRA_Groups if needed
    if Search_COBRA_groups and 'Reactions' in coupled_df.columns:
        cobra_groups_list = []
        for i, row in coupled_df.iterrows():
            reaction_list = row['Reactions']
            found_groups = set()
            for rxn_id in reaction_list:
                for group in model.groups:
                    if rxn_id in [rxn.id for rxn in group.members]:
                        found_groups.add(group.name)
            cobra_groups_list.append(list(found_groups))
        coupled_df = coupled_df.rename(columns={'Reactions': 'Coupled_Reactions'})
        coupled_df['COBRA_Groups'] = cobra_groups_list

    for i, row in coupled_df.iterrows():
        group_reactions = row['Coupled_Reactions']
        cobra_groups = row['COBRA_Groups']

        valid_reactions = []
        for rxn_id in group_reactions:
            if rxn_id in model_lumped.reactions:
                valid_reactions.append(rxn_id)
            else:
                if verbose:
                    print(f"⚠️ Reaction '{rxn_id}' not found in model — skipping.")

        if len(valid_reactions) < 2:
            if verbose:
                print(f"⏭️ Skipping group {i} — fewer than 2 valid reactions.")
            continue

        lower_bounds = []
        upper_bounds = []
        for rxn_id in valid_reactions:
            rxn = model_lumped.reactions.get_by_id(rxn_id)
            lower_bounds.append(rxn.lower_bound)
            upper_bounds.append(rxn.upper_bound)
        min_lb = max(lower_bounds)
        max_ub = min(upper_bounds)

        net_stoich = defaultdict(float)
        for rxn_id in valid_reactions:
            rxn = model_lumped.reactions.get_by_id(rxn_id)
            for met, coeff in rxn.metabolites.items():
                net_stoich[met] += coeff

        cleaned_stoich = {met: coeff for met, coeff in net_stoich.items() if abs(coeff) > 1e-10}

        pseudo_id = f"Pseudo_{label_type}_{i}"
        pseudo_rxn = Reaction(id=pseudo_id)
        pseudo_rxn.name = f"Lumped reaction for: {', '.join(valid_reactions)}"
        pseudo_rxn.lower_bound = min_lb
        pseudo_rxn.upper_bound = max_ub
        pseudo_rxn.add_metabolites(cleaned_stoich)

        model_lumped.add_reactions([pseudo_rxn])

        for group in model_lumped.groups:
            if group.name in cobra_groups:
                group.add_members([pseudo_rxn])

        for rxn_id in valid_reactions:
            rxn = model_lumped.reactions.get_by_id(rxn_id)
            for group in model_lumped.groups:
                if rxn in group.members:
                    group.members.remove(rxn)
            model_lumped.reactions.remove(rxn)

        translation_data.append({
            'Pseudo_Reaction_ID': pseudo_id,
            'Original_Reactions': valid_reactions,
            'COBRA_Groups': cobra_groups
        })

        if verbose:
            print(f"✅ {label_type} {i}: Created '{pseudo_id}' from {valid_reactions}")

    translation_df = pd.DataFrame(translation_data)
    return model_lumped, translation_df


In [4]:
def are_reactions_interconnected(model, reaction_ids):
    """
    Check whether a list of reactions are all interconnected via shared metabolites.
    
    Parameters:
    - model: cobra.Model
    - reaction_ids: list of reaction IDs to test

    Returns:
    - bool: True if all reactions are connected via shared metabolites
    """
    # Create a graph where nodes = reactions, edges = shared metabolite
    G = nx.Graph()
    G.add_nodes_from(reaction_ids)

    # Build edges based on shared metabolites
    for i, rxn1_id in enumerate(reaction_ids):
        if rxn1_id not in model.reactions:
            continue
        rxn1 = model.reactions.get_by_id(rxn1_id)
        mets1 = set(rxn1.metabolites)

        for rxn2_id in reaction_ids[i+1:]:
            if rxn2_id not in model.reactions:
                continue
            rxn2 = model.reactions.get_by_id(rxn2_id)
            mets2 = set(rxn2.metabolites)

            # Add edge if they share at least one metabolite
            if mets1 & mets2:
                G.add_edge(rxn1_id, rxn2_id)

    # Check if the graph is fully connected
    return nx.is_connected(G)

In [5]:
def analyze_model(model, model_name, carb_list):
    """
    Run FBA simulations for a COBRA model across multiple carbon source conditions.

    For each carbon source in the provided list, this function:
    - Allows uptake only for that carbon source (by setting its flux bounds).
    - Blocks all other carbon sources.
    - Runs Flux Balance Analysis (FBA).
    - Records which reactions are active (i.e., carry non-zero flux).
    - Stores the model's objective value (e.g., growth rate).

    The function returns:
    1. A summary DataFrame with objective values for each carbon source.
    2. A full flux activity matrix (True/False for each reaction in each condition).
    3. A filtered version of the matrix showing only reactions that are active in some but not all conditions.

    Parameters
    ----------
    model : cobra.Model
        The COBRA model to simulate.

    model_name : str
        Identifier for the model (used in the output DataFrame).

    carb_list : list of str
        List of exchange reaction IDs corresponding to carbon sources to be tested.

    Returns
    -------
    results_df : pandas.DataFrame
        DataFrame with columns ['Model', 'Carbon_Source', 'Objective_Value'].

    flux_activity_df : pandas.DataFrame
        Boolean DataFrame with reactions as rows and carbon sources as columns.
        True indicates the reaction carried flux in that condition.

    filtered_flux_activity_df : pandas.DataFrame
        Subset of `flux_activity_df` containing only reactions that are active
        in some but not all carbon source conditions.
    """
    flux_threshold = 1e-6  # Minimum threshold to consider flux as active

    results = []
    flux_activity_df = pd.DataFrame({'Reaction': [rxn.id for rxn in model.reactions]})
    flux_activity_df.set_index('Reaction', inplace=True)

    for carb in carb_list:
        model_temp = model.copy()

        # Block all carbon sources except the current one
        for rxn_id in carb_list:
            if rxn_id in model_temp.reactions:
                rxn = model_temp.reactions.get_by_id(rxn_id)
                if rxn_id == carb:
                    rxn.lower_bound = -10.0
                    rxn.upper_bound = 1000.0
                else:
                    rxn.lower_bound = 0.0
                    rxn.upper_bound = 0.0

        # Run FBA
        solution = model_temp.optimize()

        # Record objective value
        results.append({
            'Model': model_name,
            'Carbon_Source': carb,
            'Objective_Value': solution.objective_value
        })

        # Record which reactions carry flux
        active_flux = solution.fluxes.abs() > flux_threshold
        flux_activity_df[carb] = active_flux

    # Filter reactions active in some but not all conditions
    filtered_flux_activity_df = flux_activity_df[
        ~(flux_activity_df.all(axis=1) | ~flux_activity_df.any(axis=1))
    ]

    results_df = pd.DataFrame(results)
    return results_df, flux_activity_df, filtered_flux_activity_df

# Load Model & Data

In [9]:
# Set the project root path by going up from the notebook location
notebook_dir = Path(__file__).parent if '__file__' in globals() else Path().resolve()
project_root = notebook_dir.parent.parent  # Go up from /code/GEM_Reduction/

# Construct raw paths 
raw_data_path = project_root / "data" / "raw" 
raw_sbml_path = raw_data_path / "sbml_files" 
raw_mat_path = raw_data_path / "matlab_files" 
raw_csv_path = raw_data_path / "csv_files" 

# Construct processed paths 
processed_data_path = project_root / "data" / "processed" 
processed_sbml_path = processed_data_path / "sbml_files" 
processed_csv_path = processed_data_path / "csv_files" 

In [10]:
# read model
model = read_sbml_model(str(raw_sbml_path / "iMS520.xml"))

In [11]:
# import pairs coupled and reactions blocked by F2C2
Coupled_Pairs_df = pd.read_csv(raw_csv_path / 'fctable_iMS520.csv', header=None)
F2C2_Blocked_Reactions_df = pd.read_csv(raw_csv_path / 'blocked_reactions_iMS520.csv', header=None)

In [12]:
print(model.objective)

Maximize
1.0*biomass_BIF - 1.0*biomass_BIF_reverse_508ec


In [13]:
objective_id = "R_biomass_BIF"

# Exchange Reactions
`fastcc` removes all reactions that **can never carry non-zero flux** in **steady state**, based on current bounds. This means:

**For a general reduced model** (all potentially active reactions retained):
- All **exchange reactions** should be temporarily opened to allow **both uptake and secretion** (e.g., `-10` to `1000`).
- This ensures they aren't removed by `fastcc`.
- Bounds can be **tightened again afterward**.

In [14]:
 # Create a deep copy of the model to avoid modifying the original
model_copy = copy.deepcopy(model)

# allow flux in and out for all exchange reactions
for rxn in model_copy.exchanges:
    rxn.lower_bound = -10
    rxn.upper_bound = 1000

In [15]:
### Export as .mat for matlab applications

save_matlab_model(model_copy, raw_mat_path / "iMS520.mat") # use generic model to get unbiased couplings (filter later!)

# FASTCC model reduction

In [16]:
# Reduce model with COBRApy FASTCC 
consistent_generic_model = fastcc(model_copy)

  warn("need to pass in a list")


# F2C2 Coupling Integration
* Coupled_Pairs_df: Interpretation for element (i, j):
    *  0 - uncoupled
    * 1 - fully coupled
    * 2 - partially coupled
    * 3 - reaction i is directionally coupled to j
    * 4 - reaction j is directionally coupled to i
* F2C2_Blocked_Reactions_df:
    * 1 corresponding to a blocked reaction.


**Step 1**: Make F2C2 Data accessible

In [17]:
# Add reaction annotations to match F2C2 blocked reactions
model_rxns = [rxn.id for rxn in model.reactions]
F2C2_Blocked_Reactions_df.loc[len(F2C2_Blocked_Reactions_df)] = model_rxns

# Identify unblocked reactions from the second row (index 1)
unblocked_mask = F2C2_Blocked_Reactions_df.loc[0] == 0
unblocked_reactions = F2C2_Blocked_Reactions_df.loc[1][unblocked_mask].tolist()

# Update Coupled_Pairs_df index and columns with unblocked reactions
Coupled_Pairs_df.index = unblocked_reactions
Coupled_Pairs_df.columns = unblocked_reactions

In [18]:
# Get number of how many fully coupled pairs

fully_coupled_count = ((Coupled_Pairs_df == 1).sum().sum()) - 631

*SideQuest* : Investigate difference in 'blocking' between FASTCC and F2C2

In [19]:
# Extract and convert reactions from models to sets for fast comparison
generic_rxns = {rxn.id for rxn in consistent_generic_model.reactions}
f2c2_unblocked_rxns = set(unblocked_reactions)

# Compare overlaps and differences
overlap_generic = list(generic_rxns & f2c2_unblocked_rxns)

# Reactions unblocked by F2C2 but removed by FASTCC
missing_from_generic = list(f2c2_unblocked_rxns - generic_rxns)

# Reactions kept by FASTCC but blocked by F2C2 (ideally empty)
unexpected_in_generic = list(generic_rxns - f2c2_unblocked_rxns)

**Step 2**: Implement Enzyme Subsets into model(s)

In [20]:
# Build a graph of fully coupled reactions
G = nx.Graph()

for row in Coupled_Pairs_df.index:
    for col in Coupled_Pairs_df.columns:
        if Coupled_Pairs_df.loc[row, col] == 1 and row != col:
            G.add_edge(row, col)

# Find connected components (fully coupled groups)
coupled_groups = list(nx.connected_components(G))

In [21]:
# Create a mapping from reaction ID to COBRA group name
reaction_to_group = {}

for group in model.groups:
    group_name = group.name
    for member in group.members:
        if hasattr(member, 'id'):  # make sure it's a Reaction, not a Metabolite or Gene
            reaction_to_group[member.id] = group_name

In [22]:
# Combine coupled groups with their annotated groups in the COBRA model and obtain in- and output of coupled groups

combined_data = []

for group in coupled_groups:
    group_reactions = list(group)
    
    # COBRA Group Names
    group_names = set()
    for rxn in group_reactions:
        if rxn in reaction_to_group:
            group_names.add(reaction_to_group[rxn])
    
    # Net Stoichiometry
    stoich = defaultdict(float)
    for rxn_id in group_reactions:
        rxn = model.reactions.get_by_id(rxn_id)
        for met, coeff in rxn.metabolites.items():
            stoich[met] += coeff

    inputs = [met.id for met, coeff in stoich.items() if coeff < 0]
    outputs = [met.id for met, coeff in stoich.items() if coeff > 0]

    # Append Combined Info
    combined_data.append({
        "Coupled_Reactions": group_reactions,
        "COBRA_Groups": list(group_names) if group_names else ["Unassigned"],
        "Num_Inputs": len(inputs),
        "Num_Outputs": len(outputs),
        "Input_Metabolites": inputs,
        "Output_Metabolites": outputs
    })

# Create the final DataFrame
Fully_Coupled_df = pd.DataFrame(combined_data)

In [23]:
# Check whether the reactions per group form a single connected component (i.e., fully connected cluster)
Fully_Coupled_df['Cluster'] = Fully_Coupled_df['Coupled_Reactions'].apply(
    lambda rxn_list: are_reactions_interconnected(consistent_generic_model, rxn_list)
)

# Remove Entries that are not fully connected

Fully_Coupled_df = Fully_Coupled_df[Fully_Coupled_df['Cluster'] != False]

# Remove objective function (if in df)
Fully_Coupled_df['Reactions'] = Fully_Coupled_df['Coupled_Reactions'].apply(lambda rxns: [r for r in rxns if r != objective_id])

*Option 1*: "Full" Lumping including exchange reactions

In [24]:
# perform "full" lumping (incl. all reactions)

lumped_model, lumping_log = lump_reaction(consistent_generic_model, Fully_Coupled_df, verbose=False)

*Option 2*: Selective lumping

In [25]:
# VERSION 1: Remove rows where 'Exchange' (or 'Exchange ') appears in COBRA_Groups 

Fully_Coupled_df_v1 = Fully_Coupled_df[~Fully_Coupled_df['COBRA_Groups'].apply(lambda groups: any(g.strip() == 'Exchange' for g in groups))].copy()


# VERSION 2: Remove EX_ reactions from Coupled_Reactions and remove 'Exchange' (or 'Exchange ') from COBRA_Groups 

new_reactions = []
new_groups = []

for idx, row in Fully_Coupled_df.iterrows():
    # Remove reactions that start with 'EX_'
    filtered_rxns = [rxn for rxn in row['Coupled_Reactions'] if not rxn.startswith('EX_')]

    # Remove 'Exchange' or 'Exchange ' (trailing spaces)
    filtered_groups = [grp for grp in row['COBRA_Groups'] if grp.strip() != 'Exchange']

    # Keep only if at least 2 reactions remain
    if len(filtered_rxns) >= 2:
        new_reactions.append(filtered_rxns)
        new_groups.append(filtered_groups)

# Rebuild the filtered DataFrame
Fully_Coupled_df_v2 = pd.DataFrame({
    'Coupled_Reactions': new_reactions,
    'COBRA_Groups': new_groups
})

In [26]:
# perform lumping only for groups that are not at all connected to exchange reaction

lumped_model_v1, lumping_log_v1 = lump_reaction(consistent_generic_model, Fully_Coupled_df_v1, verbose=False, label_type="Group", Search_COBRA_groups=False)

In [27]:
# perform lumping only excl. exchange reactions 

lumped_model_v2, lumping_log_v2 = lump_reaction(consistent_generic_model, coupled_df=Fully_Coupled_df_v2, verbose=False, label_type="Group", Search_COBRA_groups=False)

In [28]:
total_length = Fully_Coupled_df_v2['Coupled_Reactions'].apply(len).sum()

**Step 3**: Validation 

In [29]:
# Get all pseudo-reaction IDs
pseudo_reactions = [rxn.id for rxn in lumped_model_v2.reactions if rxn.id.startswith("Pseudo_")]

# Use Flux Variability Analysis to test max possible flux
# fraction_of_optimum=0.0 tells FVA to ignore the model's objective and simply check whether each reaction can carry any flux under the current constraints.
fva_results = flux_variability_analysis(lumped_model_v2, reaction_list=pseudo_reactions, fraction_of_optimum=0.0) 

# Identify if any pseudoreactions are blocked (min=max=0)
fva_results['is_blocked'] = (fva_results['minimum'].abs() < 1e-6) & (fva_results['maximum'].abs() < 1e-6)

# Check if any reactions are blocked
if not fva_results['is_blocked'].any():
    print("✅ All pseudo-reactions validated — they can carry flux.")
else:
    print("❌ Some pseudo-reactions are blocked and cannot carry flux:")
    blocked = fva_results[fva_results['is_blocked']]
    print(blocked[['minimum', 'maximum']])

✅ All pseudo-reactions validated — they can carry flux.


# Module-oriented Reaction Lumping

**Step 1**: Group Sanity Check 

In [30]:
# Create a mapping for all groups and their corresponding reactions

# Extract groups and their reactions
group_mapping_data = [
    {
        'COBRA_Group': group.name,
        'Reactions': [member.id for member in group.members if member.__class__.__name__ == 'Reaction']
    }
    for group in lumped_model_v2.groups
    if any(member.__class__.__name__ == 'Reaction' for member in group.members)
]
Raw_Group_df = pd.DataFrame(group_mapping_data)

# Normalize group names (case + spacing)
Raw_Group_df['Normalized_Group'] = Raw_Group_df['COBRA_Group'].apply(
    lambda x: ' '.join(x.strip().split()).title()
)

# Group by normalized name, merge reactions, remove duplicates
Normalized_Group_df = (
    Raw_Group_df.groupby('Normalized_Group')
    .agg({'Reactions': lambda lists: list(set(r for sub in lists for r in sub))})
    .reset_index()
    .rename(columns={'Normalized_Group': 'COBRA_Group'})
)

In [31]:
# Detect suspiciously similar group names using fuzzy matching

group_names = Normalized_Group_df['COBRA_Group'].tolist()
suspicious_matches = []
SIMILARITY_THRESHOLD = 85 # max.: 100 

for i, name1 in enumerate(group_names):
    for j, name2 in enumerate(group_names):
        if i >= j:
            continue
        score = fuzz.ratio(name1, name2)
        if score >= SIMILARITY_THRESHOLD and name1 != name2:
            suspicious_matches.append({
                'Group_1': name1,
                'Group_2': name2,
                'Similarity_Score': score
            })

# Build suspicious match DataFrame and default merge plan
suspicious_df = pd.DataFrame(suspicious_matches)
suspicious_df['Merge_Into'] = suspicious_df['Group_1']

In [32]:
# Manual Check of suspicious match DataFrame + check of merge names

suspicious_df.loc[suspicious_df['Group_1'] == 'Pentose Phosphate Pathwa', 'Merge_Into'] = 'Pentose Phosphate Pathway'

In [33]:
# Create mapping from each group name to its cleaned (merged) name
merge_map = {}

for _, row in suspicious_df.iterrows():
    merge_map[row['Group_1']] = row['Merge_Into']
    merge_map[row['Group_2']] = row['Merge_Into']

# Apply merge mapping to normalized group names
Normalized_Group_df['Final_Group'] = Normalized_Group_df['COBRA_Group'].apply(
    lambda g: merge_map.get(g, g)
)

# Re-merge by Final_Group, removing duplicates in reaction lists
Final_Group_df = (
    Normalized_Group_df.groupby('Final_Group')
    .agg({'Reactions': lambda lists: list(set(r for sub in lists for r in sub))})
    .reset_index()
    .rename(columns={'Final_Group': 'COBRA_Group'})
)

# Add a column for the number of reactions per group
Final_Group_df['Reaction_Count'] = Final_Group_df['Reactions'].apply(len)

In [34]:
# Check whether the reactions per group form a single connected component (i.e., fully connected cluster)
Final_Group_df['Cluster'] = Final_Group_df['Reactions'].apply(
    lambda rxn_list: are_reactions_interconnected(lumped_model_v2, rxn_list)
)

# Remove Entries that are not fully connected

Final_Group_df = Final_Group_df[Final_Group_df['Cluster'] != False]

# Remove objective function (if in df)
Final_Group_df['Reactions'] = Final_Group_df['Reactions'].apply(lambda rxns: [r for r in rxns if r != objective_id])

**Step 2**: Decide which groups to merge 

In [36]:
# Export for manual investigation

Final_Group_df.to_csv(raw_csv_path / 'iMS520_lumpmodule.csv')

In [37]:
# Define modules that should be left intact (no lumping)

avoid_list = ['Anaplerotic Reactions','Citrate Acid Cycle','Exchange','Fructose And Mannose Metabolism','Galacto-N-Biose Pathway','Galactose Metabolism','Glycolysis','Glycolysis/Gluconeogenesis',
              'Lacto-N-Biose','Pentose And Glucoronate Interconversions','Pentose And Glucoronate Metabolism','Pentose Phosphate Pathway','Phosphoketolase Pathway','Pyruvate Metabolism',
              'Starch Sucrose Metabolis','Starch Sucrose Metabolismgalactose Metabolism','Transport','Unassigned']

In [38]:
# Filter & deduplicate rows in the avoid_list
avoid_reactions_series = Final_Group_df[Final_Group_df['COBRA_Group'].isin(avoid_list)]['Reactions']

avoid_reactions = set()
for reaction_list in avoid_reactions_series:
    avoid_reactions.update(reaction_list)

# Make a deep copy of the DataFrame to avoid changing the original
Lump_df = Final_Group_df.copy(deep=True)

# Iterate over rows not in avoid_list and clean their Reactions
for idx, row in Lump_df.iterrows():
    if row['COBRA_Group'] not in avoid_list:
        original_reactions = set(row['Reactions'])
        updated_reactions = original_reactions - avoid_reactions

        if original_reactions != updated_reactions:
            removed = original_reactions & avoid_reactions
            # print(f"Cleaning group '{row['COBRA_Group']}': removed {len(removed)} reaction(s): {removed}")

        Lump_df.at[idx, 'Reactions'] = list(updated_reactions)

# Identify rows where 'Reactions' is an empty list
empty_rows = Lump_df[Lump_df['Reactions'].apply(lambda x: len(x) == 0)]

# Remove these rows from the cleaned DataFrame
Final_Lump_df = Lump_df[Lump_df['Reactions'].apply(lambda x: len(x) > 1)].reset_index(drop=True)

In [39]:
# Check whether the reactions per group form a single connected component (i.e., fully connected cluster)
Final_Lump_df['Cluster_New'] = Final_Lump_df['Reactions'].apply(
    lambda rxn_list: are_reactions_interconnected(lumped_model_v2, rxn_list))

# Remove Entries that are not fully connected

Final_Lump_df = Final_Lump_df[Final_Lump_df['Cluster'] != False]

**Step 3**: Lump modules

In [40]:
# Remove rows where 'COBRA_Group' is in the avoid_list
Filtered_Lump_df = Final_Lump_df[~Final_Lump_df['COBRA_Group'].isin(avoid_list)].copy() # these won't get lumped!

#  Deduplicate reactions across all rows (ensures each reaction appears only once)
seen_reactions = set()
for idx, row in Filtered_Lump_df.iterrows():
    unique_reactions = [rxn for rxn in row['Reactions'] if rxn not in seen_reactions]
    seen_reactions.update(unique_reactions)
    Filtered_Lump_df.at[idx, 'Reactions'] = unique_reactions

# Remove reactions that start with "EX_"
for idx, row in Filtered_Lump_df.iterrows():
    non_exchange_reactions = [rxn for rxn in row['Reactions'] if not rxn.startswith("EX_")]
    Filtered_Lump_df.at[idx, 'Reactions'] = non_exchange_reactions

# Remove rows with empty reaction lists
Filtered_Lump_df = Filtered_Lump_df[Filtered_Lump_df['Reactions'].apply(len) > 1].reset_index(drop=True)

In [41]:
# perform lumping only excl. exchange reactions 

lumped_model_final, lump_log_final = lump_reaction(lumped_model_v2, coupled_df=Filtered_Lump_df, verbose=False, Search_COBRA_groups=True, label_type="Module")

**Step 4**: Validation

In [42]:
# Get all pseudo-reaction IDs
pseudo_reactions = [rxn.id for rxn in lumped_model_final.reactions if rxn.id.startswith("Pseudo_")]

# Use Flux Variability Analysis to test max possible flux
# fraction_of_optimum=0.0 tells FVA to ignore the model's objective and simply check whether each reaction can carry any flux under the current constraints.
fva_results = flux_variability_analysis(lumped_model_final, reaction_list=pseudo_reactions, fraction_of_optimum=0.0) 

# Identify if any pseudoreactions are blocked (min=max=0)
fva_results['is_blocked'] = (fva_results['minimum'].abs() < 1e-6) & (fva_results['maximum'].abs() < 1e-6)

# Check if any reactions are blocked
if not fva_results['is_blocked'].any():
    print("✅ All pseudo-reactions validated — they can carry flux.")
else:
    print("❌ Some pseudo-reactions are blocked and cannot carry flux:")
    blocked = fva_results[fva_results['is_blocked']]
    print(blocked[['minimum', 'maximum']])

✅ All pseudo-reactions validated — they can carry flux.


In [43]:
write_sbml_model(lumped_model_final, processed_sbml_path / 'iMS520_red1.sbml')

Note: Went from original model (771 reactions) to a reduced generic model (249 reactions) 

# Optional: Make condition-specific

**Step 1**: Block reactions that are exchanged for compounds not in the medium

In [44]:
# define blocked reactions


blocked_carbsource_reactions = ['D-xylose exchange', 'mannose exchange', 'D-ribose exchange', 'L-arabinose exchange', 'sucrose exchange','L-rhamnose exchange','cellobiose exchange',
                                'lactose exchange','mellibiose exchange','raffinose exchange','maltotriose exchange','maltohexaose exchange','lnb exchange','gnb exchange']

blocked_AA_reactions = ['histidine exchange', 'arginine exchange','tyrosine exchange ','lysine exchange','glycine exchange','alanine exchange','serine exchange', 'glutamate exchange', 
                        'glutamine exchange', 'aspartate exchange','asparagine exchange']

blocked_noncategorical_reactions = ['methanethiol exchange','Cys-Gly Exchange','Methionine sulfoxide exchange','Shikimate exchange','putrescine exchange','spmd exchange','uracil exchange',
                                    'xanthine exchange','hypoxanthine exchange']

reactions_to_block = blocked_carbsource_reactions + blocked_AA_reactions + blocked_noncategorical_reactions

# define carbon source reactions 

carbon_sources = ['glucose exchange','galactose exchange','fructose exchange','maltose exchange']

# other medium components 

other_medium_components = [] # need to check Harolds File

In [45]:
# copy to not modify the generic model

model_copy2 = copy.deepcopy(lumped_model_final)

# block reactions and open carbon reactions (already open)

for rxn in model_copy2.reactions:
        if rxn.name in reactions_to_block:
            rxn.lower_bound = 0.0
            rxn.upper_bound = 0.0

for rxn in model_copy2.reactions:
        if rxn.name in carbon_sources:
            rxn.lower_bound = -10
            rxn.upper_bound = 1000

**Step 2**: FASTCC on conditioned model

In [46]:
# produce conditioned model

conditioned_model = fastcc(model_copy2)

  warn("need to pass in a list")


In [47]:
write_sbml_model(lumped_model_final, processed_sbml_path / 'iMS520_red2.sbml')

**Step 3**: Modify Directionalities of Exchanges

In [48]:
# extract all exchange reactions

exchange_reactions = [rxn for rxn in conditioned_model.reactions if rxn.id.startswith("EX_")]
exchange_names = [rxn.name for rxn in exchange_reactions]

In [49]:
# define whether these serve as nutrients or are secreted by the organism

import_only = ['D-glucose exchange','fructose exchange','galactose exchange','maltose exchange','NH4 exchange','phosphate exchange','potassium exchange',
               'molybdate exchange','cobalt+2 exchange','L-methionine exchange','Mg+2 exchange','Fe+2 exchange','Fe+3 exchange','Zn+2 exchange','calcium exchange',
               'Cu+2 exchange','Mn+2 exchange', 'cysteine exchange','leucine exchange','isoleucine exchange','valine exchange','threonine exchange', 'tryptophan exchange',
               'phenylalanine exchange','tyrosine exchange','proline exchange','thiamine exchange','biotin exchange','4-aminobenzoate exchange','pantethine exchange',
               'riboflavine exchange','nicotinamide exchange','nicotinic acid exchange','folate exchange','cob(I)alamine exchange','menaquinone-4 exchange',]

both = ['H2O exchange','proton exchange','chloride exchange']

secretion_only = ['Formate exchange','lactate exchange','succinate exchange','acetate exchange','ethanol exchange','acetaldehyde exchange','H2S exchange',
                  'carbon dioxide exchange', 'Hydrogen peroxide exchange']

In [50]:
# copy model

model_copy3 = copy.deepcopy(conditioned_model)

# modifiy reaction bounds to mark nutrients / secretion products

for rxn in model_copy3.reactions:
        if rxn.name in import_only:
            rxn.lower_bound = -10
            rxn.upper_bound = 0.0

for rxn in model_copy3.reactions:
        if rxn.name in secretion_only:
            rxn.lower_bound = 0
            rxn.upper_bound = 1000

In [51]:
# modify reversibilities

for rxn in model_copy3.reactions:
    if rxn.lower_bound < 0 and rxn.upper_bound == 0:
        # Flip direction
        original_stoich = rxn.metabolites
        flipped_stoich = {met: -coeff for met, coeff in original_stoich.items()}

        # Clear old stoichiometry and set new one
        rxn.subtract_metabolites(original_stoich)
        rxn.add_metabolites(flipped_stoich)

        # Update bounds
        old_lb = rxn.lower_bound
        rxn.lower_bound = 0
        rxn.upper_bound = abs(old_lb)

        # Rename or tag
        rxn.id = f"{rxn.id}_flipped"
        rxn.name = f"{rxn.name} (flipped)"

In [52]:
write_sbml_model(lumped_model_final, processed_sbml_path / 'iMS520_red3.sbml')

**Note**: 
- The conditioned reduced model with modified reversibilities has 192 reactions
- The generic reduced model has 249 reactions with 128 reversible reactions
- The original model has 771 reactions with 206 reversible reactions

In [53]:
# Get the stoichiometric matrix as a NumPy array
stoich_dense = create_stoichiometric_matrix(model_copy3)  # Already a NumPy array

# Compute the rank of the stoichiometric matrix
rank = np.linalg.matrix_rank(stoich_dense)

# Degrees of freedom
num_reactions = len(model_copy3.reactions)
dof = num_reactions - rank

# Output
print(f"Stoichiometric Matrix Rank: {rank}")
print(f"Number of Reactions: {num_reactions}")
print(f"Degrees of Freedom: {dof}")

Stoichiometric Matrix Rank: 166
Number of Reactions: 190
Degrees of Freedom: 24


# Validation

In [54]:
carb_list = ['EX_glc-D(e)_flipped','EX_fru(e)_flipped','EX_gal(e)_flipped','EX_malt(e)_flipped']
carb_list_original = ['EX_glc-D(e)','EX_fru(e)','EX_gal(e)','EX_malt(e)']

In [55]:
# Run analysis for both models
results_final, flux_activity_final, filtered_flux_final = analyze_model(lumped_model_final, 'Final_Model',carb_list_original)
results_original, flux_activity_original, filtered_flux_original = analyze_model(consistent_generic_model, 'Consistent',carb_list_original)
results_context, flux_activity_context, filtered_flux_context = analyze_model(model_copy3, 'Context',carb_list)

In [56]:
Objective_comparison_df = pd.concat([results_final, results_original,results_context], axis=0)

In [57]:
Objective_comparison_df.to_csv(raw_csv_path / 'iMS520_FBA_results.csv')