# Import and Load Libraries

In [1]:
# Python Standard Library and 3rd Party Imports 

import pandas as pd
import numpy as np
import networkx as nx
import scipy
import copy
import re
from collections import Counter, defaultdict
from itertools import chain, combinations
from collections import Counter

In [2]:
# COBRA imports

import cobra
from cobra import Model, Reaction
from cobra.core import Group, Reaction
from cobra.flux_analysis import flux_variability_analysis
from cobra.flux_analysis.fastcc import fastcc
from cobra.io import read_sbml_model, save_matlab_model, write_sbml_model
from cobra.util.solver import linear_reaction_coefficients
from rapidfuzz import fuzz, process
from pathlib import Path

# Import Data

In [3]:
# Locate the directory this notebook/script lives in. `__file__` exists when the
# code runs as a plain script; inside a Jupyter kernel it does not, so the
# current working directory is used instead. Both branches are resolved to an
# absolute path: an unresolved relative `__file__` such as "notebook.py" has
# the parent ".", and "." has no further parent to climb to.
notebook_dir = Path(__file__).resolve().parent if '__file__' in globals() else Path().resolve()
project_root = notebook_dir.parent.parent  # Go up from /code/GEM_Reduction/

# Raw input folders, all located below <project_root>/data/raw
raw_data_path = project_root / "data" / "raw"
raw_sbml_path = raw_data_path / "sbml_files"
raw_txt_path = raw_data_path / "txt_files"
raw_csv_path = raw_data_path / "csv_files"

In [4]:
# Load both iAF1260 reconstructions from the raw SBML folder:
#   model1 <- "Ec_iAF1260_flux1.xml" (published version, source of the subsystem notes)
#   model2 <- "iAF1260.xml"          (BIGG version, whose reactions get annotated below)
published_sbml_file = raw_sbml_path / "Ec_iAF1260_flux1.xml"
bigg_sbml_file = raw_sbml_path / "iAF1260.xml"

model1 = read_sbml_model(published_sbml_file)
model2 = read_sbml_model(bigg_sbml_file)

Model does not contain SBML fbc package information.
SBML package 'layout' not supported by cobrapy, information is not parsed
SBML package 'render' not supported by cobrapy, information is not parsed
Use of the species charge attribute is discouraged, use fbc:charge instead: <Species M_10fthf_c "M_10_Formyltetrahydrofolate_C20H21N7O7">
Use of the species charge attribute is discouraged, use fbc:charge instead: <Species M_12dgr120_c "M_1_2_Diacyl_sn_glycerol__didodecanoyl__n_C120__C27H52O5">
Use of the species charge attribute is discouraged, use fbc:charge instead: <Species M_12dgr120_p "M_1_2_Diacyl_sn_glycerol__didodecanoyl__n_C120__C27H52O5">
Use of the species charge attribute is discouraged, use fbc:charge instead: <Species M_12dgr140_c "M_1_2_Diacyl_sn_glycerol__ditetradecanoyl__n_C140__C31H60O5">
Use of the species charge attribute is discouraged, use fbc:charge instead: <Species M_12dgr140_p "M_1_2_Diacyl_sn_glycerol__ditetradecanoyl__n_C140__C31H60O5">
Use of the species char

In [5]:
# Read the two tab-separated BIGG metatables from the raw text folder.
reactions_table_file = raw_txt_path / "bigg_models_reactions.txt"
metabolites_table_file = raw_txt_path / "bigg_models_metabolites.txt"

reaction_df = pd.read_csv(reactions_table_file, sep="\t")
metabolite_df = pd.read_csv(metabolites_table_file, sep="\t")

# Reaction Assignment to Subsystems

**Step 1**: Extract Subsystems from iAF1260 (published version)

In [6]:
# Build {reaction id -> subsystem} for the published iAF1260 model (model1).

def _subsystem_from_notes(notes):
    """Return the subsystem recorded in a reaction's notes, or None.

    A dedicated 'SUBSYSTEM' entry is returned as-is. Otherwise every string
    note value containing "SUBSYSTEM" is inspected and the text after the last
    "SUBSYSTEM:" marker (whitespace-stripped) is used; if several values
    qualify, the last one visited wins.
    """
    if 'SUBSYSTEM' in notes:
        return notes['SUBSYSTEM']

    # Fallback for notes that were not parsed into a clean key/value pair
    found = None
    for text in notes.values():
        if isinstance(text, str) and "SUBSYSTEM" in text:
            found = text.split("SUBSYSTEM:")[-1].strip()
    return found


reaction_subsystem_dict = {}

for rxn in model1.reactions:
    subsystem = _subsystem_from_notes(rxn.notes)
    # Reactions without a (non-empty) subsystem are left out of the dictionary
    if subsystem:
        reaction_subsystem_dict[rxn.id] = subsystem

**Step 2**: Match subsystems to 'modern' BIGG Ids using the old reaction link (i.e. use the old BIGG Ids column and reactions from published iAF1260)

In [7]:
# Add a 'Subsystem' column to the BIGG reaction metatable.
#
# For every published-model reaction ID (in dictionary order):
#   1. rows whose 'bigg_id' equals the ID receive its subsystem;
#   2. only if no such row exists, rows whose ';'-separated 'old_bigg_ids'
#      field contains the ID receive it instead.
# A later ID overwrites an earlier one when both hit the same row.
#
# The row lookups are built once up front; scanning and re-splitting the whole
# table for every single ID is unnecessary and slow for a table of this size.
#
# NOTE(review): tokens are compared exactly as produced by split(';'). If the
# file separates IDs with '; ' (semicolon + space), every token after the first
# keeps a leading space and can never match - confirm against the raw file.

metatable_rows_by_bigg_id = defaultdict(list)  # bigg_id   -> row positions
metatable_rows_by_old_id = defaultdict(list)   # old token -> row positions

for row_pos, (bigg_id, old_ids) in enumerate(
    zip(reaction_df['bigg_id'], reaction_df['old_bigg_ids'].fillna(''))
):
    metatable_rows_by_bigg_id[bigg_id].append(row_pos)
    for old_id in set(old_ids.split(';')):
        metatable_rows_by_old_id[old_id].append(row_pos)

subsystem_by_row = [None] * len(reaction_df)

for key, subsystem in reaction_subsystem_dict.items():
    # 'modern' bigg_id match takes precedence over a match in old_bigg_ids
    target_rows = metatable_rows_by_bigg_id.get(key) or metatable_rows_by_old_id.get(key, ())
    for row_pos in target_rows:
        subsystem_by_row[row_pos] = subsystem

# dtype=object keeps unmatched rows as None, as a plain `= None` column would
reaction_df['Subsystem'] = pd.Series(subsystem_by_row, index=reaction_df.index, dtype=object)

In [8]:
# Keep only the metatable rows that received a subsystem. The copy makes
# matched_df an independent frame, separate from reaction_df.
has_subsystem = reaction_df['Subsystem'].notna()
matched_df = reaction_df.loc[has_subsystem].copy()

In [9]:
# Collect published-model reactions (with their subsystems) that could not be
# located in the BIGG reaction metatable at all.

# Every token that occurs in any row's ';'-separated 'old_bigg_ids' field.
# Built once so each key below is a set lookup instead of a full-table scan.
all_old_bigg_ids = set(chain.from_iterable(
    old_ids.split(';') for old_ids in reaction_df['old_bigg_ids'].fillna('')
))

# Start from the 'modern' bigg_ids of all rows that carry a subsystem ...
matched_keys = set(reaction_df.loc[reaction_df['Subsystem'].notna(), 'bigg_id'])
# ... and add the published IDs that are only known as an old BIGG ID
matched_keys.update(key for key in reaction_subsystem_dict if key in all_old_bigg_ids)

# Published reactions that matched neither way
unmatched_dict = {k: v for k, v in reaction_subsystem_dict.items() if k not in matched_keys}

# Remove exchange reactions (can later be grouped into 'exchange'). The prefix
# is 'EX_', the same one used when the exchange reactions are collected later,
# so an ordinary reaction whose ID merely begins with "EX" is not dropped.
filtered_unmatched_dict = {k: v for k, v in unmatched_dict.items() if not k.startswith('EX_')}

**Step 3**: Map reactions of BIGG iAF1260 to subsystems via metatable 

In [10]:
# Add an 'iAF1260_BIGG' column holding the BIGG-model (model2) reaction ID that
# corresponds to each row of matched_df.
#
# For every model2 reaction ID (in model order):
#   1. rows whose 'bigg_id' equals the ID receive it;
#   2. only if no such row exists, rows whose ';'-separated 'old_bigg_ids'
#      field contains the ID receive it instead.
# A later ID overwrites an earlier one when both hit the same row.
#
# NOTE(review): tokens are compared exactly as produced by split(';'); confirm
# the raw file does not use '; ' (with a space) as separator.

# Get all reaction IDs from the BIGG iAF1260 model
reaction_ids = [rxn.id for rxn in model2.reactions]

# Row lookups, built once instead of re-scanning matched_df per reaction ID
matched_rows_by_bigg_id = defaultdict(list)  # bigg_id   -> row positions
matched_rows_by_old_id = defaultdict(list)   # old token -> row positions

for row_pos, (bigg_id, old_ids) in enumerate(
    zip(matched_df['bigg_id'], matched_df['old_bigg_ids'].fillna(''))
):
    matched_rows_by_bigg_id[bigg_id].append(row_pos)
    for old_id in set(old_ids.split(';')):
        matched_rows_by_old_id[old_id].append(row_pos)

model_id_by_row = [None] * len(matched_df)

for rxn_id in reaction_ids:
    # Exact bigg_id match takes precedence over a match in old_bigg_ids
    target_rows = matched_rows_by_bigg_id.get(rxn_id) or matched_rows_by_old_id.get(rxn_id, ())
    for row_pos in target_rows:
        model_id_by_row[row_pos] = rxn_id

# Positions line up with matched_df's own index; unmatched rows stay None
matched_df['iAF1260_BIGG'] = pd.Series(model_id_by_row, index=matched_df.index, dtype=object)


In [11]:
# Split matched_df by whether a row was linked to a reaction of the BIGG
# iAF1260 model: linked rows go to matched_df2, the rest to unmatched_rows.
linked_to_model = matched_df['iAF1260_BIGG'].notna()

matched_df2 = matched_df.loc[linked_to_model].copy()
unmatched_rows = matched_df.loc[matched_df['iAF1260_BIGG'].isna()].copy()

**Step 4**: Manually assign 'leftover' reactions
- 28 'normal' reactions and 299 exchange reactions couldn't be mapped

In [12]:
# Find model2 reactions that could not be mapped to a subsystem via the metatable

all_model2_reactions = {rxn.id for rxn in model2.reactions}      # every reaction ID in model2
matched_reaction_ids = set(matched_df['iAF1260_BIGG'].dropna())  # IDs that were linked to a metatable row
unmatched_model2_reactions = all_model2_reactions - matched_reaction_ids

# Leave out exchange reactions. The prefix is 'EX_', the same one used when the
# exchange reactions are collected for the 'S_Exchange' assignment, so a
# non-exchange reaction whose ID merely begins with "EX" is not hidden here.
unmatched_model2_filtered = {
    rxn_id for rxn_id in unmatched_model2_reactions if not rxn_id.startswith('EX_')
}

In [13]:
# Distinct subsystem names that occur in the published model
unique_subsystems = {subsystem_name for subsystem_name in reaction_subsystem_dict.values()}

| iAF1260 Reaction | Model2 Equivalent | Subsystem Annotation                              |
| ---------------- | ----------------- | ------------------------------------------------- |
| ADNt2rpp         | ADNt2pp\_copy1    | S\_Transport\_\_Inner\_Membrane                   |
| ADNt2rpp         | ADNt2pp\_copy2    | S\_Transport\_\_Inner\_Membrane                   |
| CYTDt2rpp        | CYTDt2pp\_copy1   | S\_Transport\_\_Inner\_Membrane                   |
| CYTDt2rpp        | CYTDt2pp\_copy2   | S\_Transport\_\_Inner\_Membrane                   |
| GLCtexi          | GLCtex\_copy1     | S\_Transport\_\_Outer\_Membrane                   |
| GLCtexi          | GLCtex\_copy2     | S\_Transport\_\_Outer\_Membrane                   |
| ICHORSi          | ICHORS\_copy1     | S\_Cofactor\_and\_Prosthetic\_Group\_Biosynthesis |
| ICHORSi          | ICHORS\_copy2     | S\_Cofactor\_and\_Prosthetic\_Group\_Biosynthesis |
| INSt2rpp         | INSt2pp\_copy1    | S\_Transport\_\_Inner\_Membrane                   |
| INSt2rpp         | INSt2pp\_copy2    | S\_Transport\_\_Inner\_Membrane                   |
| MICITD           | MICITDr           | S\_Alternate\_Carbon\_Metabolism                  |
| PPKr             | PPK               | S\_Oxidative\_Phosphorylation                     |
| PPK2r            | PPK2              | S\_Oxidative\_Phosphorylation                     |
| SULRi            | SULR              | S\_Cysteine\_Metabolism                           |
| THMDt2rpp        | THMDt2pp\_copy1   | S\_Transport\_\_Inner\_Membrane                   |
| THMDt2rpp        | THMDt2pp\_copy2   | S\_Transport\_\_Inner\_Membrane                   |
| THRAi            | THRA              | S\_Threonine\_and\_Lysine\_Metabolism             |
| URAt2rpp         | URAt2pp\_copy1    | S\_Transport\_\_Inner\_Membrane                   |
| URAt2rpp         | URAt2pp\_copy2    | S\_Transport\_\_Inner\_Membrane                   |
| URIt2rpp         | URIt2pp\_copy1    | S\_Transport\_\_Inner\_Membrane                   |
| URIt2rpp         | URIt2pp\_copy2    | S\_Transport\_\_Inner\_Membrane                   |

Note: Of the 28 unmapped reactions of the BIGG iAF1260 model, all but 7 (ACACT8r, BIOMASS_Ec_iAF1260_core_59p81M, CTECOAI6, CTECOAI7, CTECOAI8, FDH4pp_1 and FDH5pp_1) could be assigned an equivalent among the reactions of the published iAF1260 version that were not mapped to the metatable ('filtered_unmatched_dict').

Note: Of the remaining 7 reactions, FDH4pp_1 and FDH5pp_1 could be mapped to S_Oxidative_Phosphorylation using the rows that were mapped to the metatable via the published version but not to the modern iAF1260 reactions from the BIGG model ('unmatched_rows').

Note: The remaining reactions (ACACT8r, BIOMASS_Ec_iAF1260_core_59p81M, CTECOAI6, CTECOAI7 and CTECOAI8) were assigned manually, based on literature research, to a subsystem from the unique set of subsystems of the published model. ACACT8r was assigned to S_Membrane_Lipid_Metabolism (the alternative would be S_Alternate_Carbon_Metabolism), BIOMASS_Ec_iAF1260_core_59p81M was assigned to S_Unassigned, and CTECOAI6, CTECOAI7 and CTECOAI8 were also assigned to S_Membrane_Lipid_Metabolism.

In [14]:
# Manual assignment of 'leftover' reactions to subsystems.
# The curated reactions are listed per subsystem and then flattened into
# new_entries ({reaction id -> subsystem}), ordered alphabetically by reaction id.

_manual_reactions_by_subsystem = {
    'S_Membrane_Lipid_Metabolism': [  # No match
        'ACACT8r',
        'CTECOAI6',
        'CTECOAI7',
        'CTECOAI8',
    ],
    'S_Unassigned': [  # No match
        'BIOMASS_Ec_iAF1260_core_59p81M',
    ],
    'S_Transport__Inner_Membrane': [
        'ADNt2pp_copy1', 'ADNt2pp_copy2',
        'CYTDt2pp_copy1', 'CYTDt2pp_copy2',
        'INSt2pp_copy1', 'INSt2pp_copy2',
        'THMDt2pp_copy1', 'THMDt2pp_copy2',
        'URAt2pp_copy1', 'URAt2pp_copy2',
        'URIt2pp_copy1', 'URIt2pp_copy2',
    ],
    'S_Transport__Outer_Membrane': ['GLCtex_copy1', 'GLCtex_copy2'],
    'S_Oxidative_Phosphorylation': ['FDH4pp_1', 'FDH5pp_1', 'PPK', 'PPK2'],
    'S_Cofactor_and_Prosthetic_Group_Biosynthesis': ['ICHORS_copy1', 'ICHORS_copy2'],
    'S_Alternate_Carbon_Metabolism': ['MICITDr'],
    'S_Cysteine_Metabolism': ['SULR'],
    'S_Threonine_and_Lysine_Metabolism': ['THRA'],
}

# Reaction ids are unique, so sorting the (id, subsystem) pairs orders by id
new_entries = dict(sorted(
    (reaction_id, subsystem_name)
    for subsystem_name, reaction_group in _manual_reactions_by_subsystem.items()
    for reaction_id in reaction_group
))


In [15]:
# Add the manually assigned 'leftover' reactions to the metatable (matched_df2).

new_rows = pd.DataFrame({
    'iAF1260_BIGG': list(new_entries.keys()),
    'Subsystem': list(new_entries.values())
})

# Skip reaction IDs that matched_df2 already contains. This cell reassigns
# matched_df2, so without the filter every re-run would append the same rows
# again and silently duplicate them.
already_present = set(matched_df2['iAF1260_BIGG'].dropna())
new_rows = new_rows[~new_rows['iAF1260_BIGG'].isin(already_present)].copy()

# Columns that only exist in matched_df2 are filled with missing values
for col in matched_df2.columns:
    if col not in new_rows.columns:
        new_rows[col] = pd.NA

new_rows = new_rows[matched_df2.columns]  # same column order as matched_df2

if not new_rows.empty:
    matched_df2 = pd.concat([matched_df2, new_rows], ignore_index=True)

In [16]:
# Add the exchange reactions to the metatable (matched_df2); all of them are
# manually assigned to the subsystem 'S_Exchange'.

# Exchange reactions are the model2 reaction IDs that start with 'EX_'
exchange_reactions = [rid for rid in reaction_ids if rid.startswith('EX_')]

exchange_rows = pd.DataFrame({
    'iAF1260_BIGG': exchange_reactions,
    'Subsystem': ['S_Exchange'] * len(exchange_reactions)
})

# Skip reaction IDs that matched_df2 already contains. This cell reassigns
# matched_df2, so without the filter every re-run would append the same rows
# again and silently duplicate them.
already_present = set(matched_df2['iAF1260_BIGG'].dropna())
exchange_rows = exchange_rows[~exchange_rows['iAF1260_BIGG'].isin(already_present)].copy()

# Columns that only exist in matched_df2 are filled with missing values
for col in matched_df2.columns:
    if col not in exchange_rows.columns:
        exchange_rows[col] = pd.NA

exchange_rows = exchange_rows[matched_df2.columns]  # same column order as matched_df2

if not exchange_rows.empty:
    matched_df2 = pd.concat([matched_df2, exchange_rows], ignore_index=True)

In [17]:
# Report how many of the model's reactions ended up with a subsystem.
number_matched = matched_df2['iAF1260_BIGG'].nunique()

# Take the total from the loaded model instead of hard-coding it, so the
# message stays correct if a different model file is loaded.
total_reactions = len(model2.reactions)

print(f'Of the {total_reactions} reactions of iAF1260 in BIGG db, {number_matched} were successfully assigned a subsystem')

Of the 2382 reactions of iAF1260 in BIGG db, 2382 were successfully assigned a subsystem


In [18]:
# Sanity check on matched_df2: count the distinct reaction IDs, find IDs that
# occupy more than one row, and verify such rows agree on the subsystem.

# Number of distinct reaction IDs in 'iAF1260_BIGG'
unique_count = matched_df2['iAF1260_BIGG'].nunique()
print(f"Number of unique 'iAF1260_BIGG' values: {unique_count}")

# Reaction IDs that appear on more than one row
value_counts = matched_df2['iAF1260_BIGG'].value_counts()
duplicates = value_counts[value_counts.gt(1)]
print(f"Values assigned multiple times:{len(duplicates)}")

# For each duplicated ID, gather the distinct subsystems found on its rows and
# keep the IDs where more than one subsystem shows up
subsystems_by_duplicate = (
    (dup_id, matched_df2.loc[matched_df2['iAF1260_BIGG'] == dup_id, 'Subsystem'].unique())
    for dup_id in duplicates.index
)
mismatched_subsystems = [
    (dup_id, list(found)) for dup_id, found in subsystems_by_duplicate if len(found) > 1
]

# Report the outcome
if not mismatched_subsystems:
    print("\nAll duplicated reactions have consistent subsystem assignments.")
else:
    print("\nEntries with inconsistent subsystem assignments:")
    for dup_id, found in mismatched_subsystems:
        print(f"Reaction: {dup_id} → Subsystems: {found}")


Number of unique 'iAF1260_BIGG' values: 2382
Values assigned multiple times:0

All duplicated reactions have consistent subsystem assignments.


**Step 5**: Clean Subsystems

In [19]:
# Find pairs of subsystem names that are suspiciously similar (likely the same
# subsystem spelled differently).
threshold = 90  # minimum fuzz.ratio score for a pair to be reported; adjust sensitivity here
similar_pairs = []

# unique_subsystems is a set, whose iteration order for strings can differ
# between kernel sessions. Sorting first makes the pair order, and therefore
# the printed output, reproducible from run to run.
for a, b in combinations(sorted(unique_subsystems), 2):
    score = fuzz.ratio(a, b)
    if score >= threshold:
        similar_pairs.append((a, b, score))

# Print results, most similar pair first
for a, b, score in sorted(similar_pairs, key=lambda x: -x[2]):
    print(f"{a} & {b}: (Similarity: {round(score,2)}%)")

S_Glutamate_metabolism & S_Glutamate_Metabolism: (Similarity: 95.45%)
S_tRNA_charging & S_tRNA_Charging: (Similarity: 93.33%)
S_Transport__Outer_Membrane_Porin & S_Transport__Outer_Membrane: (Similarity: 90.0%)


In [20]:
# Manual mapping of near-duplicate subsystem spellings to one unified name
subsystem_replacements = dict([
    ('S_Glutamate_metabolism', 'S_Glutamate_Metabolism'),
    ('S_tRNA_charging', 'S_tRNA_Charging'),
])

# Apply the mapping to the 'Subsystem' column; names not listed stay unchanged
unified_subsystems = matched_df2['Subsystem'].replace(subsystem_replacements)
matched_df2['Subsystem'] = unified_subsystems

In [21]:
# Export the final reaction -> subsystem table as CSV (without the index column)

selected_columns = ['iAF1260_BIGG', 'Subsystem','name','reaction_string']
iAF1260_reaction_subsystems = matched_df2.loc[:, selected_columns].copy()

output_csv_file = raw_csv_path / 'iAF1260_subsystem_assignments.csv'
iAF1260_reaction_subsystems.to_csv(output_csv_file, index=False)