## Imports

In [1]:
import pandas as pd
from cobra.io.json import load_json_model
import re
from cobra.flux_analysis import single_reaction_deletion

## Loading Data and Model

In [2]:
model = load_json_model('./data/Recon3D.json')
model.solver = 'gurobi' 
model.objective = "BIOMASS_reaction"

Set parameter Username
Set parameter LicenseID to value 2732830
Academic license - for non-commercial use only - expires 2026-11-04


In [3]:
df_fva_HG  = pd.read_csv('./data/fva_exoflux_HG.csv', index_col=0)

#inject bounds into the model
for index, row in df_fva_HG.iterrows():
    model.reactions.get_by_id(index).bounds = (row['minimum'], row['maximum'])

#check feasibility 
print(model.slim_optimize())

0.0224804490992415


In [4]:
# check feasibility
print(model.slim_optimize())

# verify objective matches biomass 
solution = model.optimize()
print("Objective:", solution.objective_value)
print("BIOMASS_reaction flux:", solution.fluxes.get("BIOMASS_reaction", 0))


0.0224804490992415
Objective: 0.0224804490992415
BIOMASS_reaction flux: 0.0224804490992415


In [5]:
CCLE_expression = pd.read_csv('./data/CCLE_expression.csv', index_col=0)

## Exploring Data naming vs model naming

### Transcipromics

In [6]:
CCLE_expression.head()

Unnamed: 0,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),NFYA (4800),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
ACH-001113,4.331992,0.0,7.364397,2.792855,4.470537,0.028569,1.226509,3.042644,6.499686,4.739848,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.0,0.275007,0.0,0.0,0.0
ACH-001289,4.566815,0.584963,7.106537,2.543496,3.50462,0.0,0.189034,3.813525,4.221104,3.481557,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.0,0.014355,0.0,0.0,0.0
ACH-001339,3.15056,0.0,7.379032,2.333424,4.227279,0.056584,1.31034,6.687061,3.682573,3.273516,...,0.594549,1.097611,0.831877,2.946731,0.475085,0.0,0.084064,0.0,0.0,0.042644
ACH-001538,5.08534,0.0,7.154109,2.545968,3.084064,0.0,5.868143,6.165309,4.489928,3.956986,...,0.214125,0.632268,0.298658,1.641546,0.443607,0.0,0.028569,0.0,0.0,0.0
ACH-000242,6.729145,0.0,6.537607,2.456806,3.867896,0.799087,7.208381,5.569856,7.127014,4.568032,...,1.117695,2.358959,0.084064,1.910733,0.0,0.0,0.464668,0.0,0.0,0.0


In [7]:
#checking different outputs and what they look like
r = model.reactions[1000]  
print("Reaction ID:", r.id)
print("Reaction name:", r.name)
print("Gene-reaction rule:", r.gene_reaction_rule)
print("Genes:", [g.id for g in r.genes])
print(model.genes[3].id)
print(model.genes[3].name)

Reaction ID: LYStm
Reaction name: Lysine mitochondrial transport via ornithine carrier
Gene-reaction rule: 83884_AT1 or 10166_AT1
Genes: ['83884_AT1', '10166_AT1']
8639_AT1
AOC3


## Fixing Naming Ambiguity

In [8]:
def clean_gene_name(name: str) -> str:
    """
    Clean up a gene name so it matches the model's format.
    Removes extra info in parentheses, spaces, etc.
    Example: 'AOC3 (8639)' â†’ 'AOC3'
    """
    cleaned = re.sub(r"\s*\(.*\)$", "", str(name))  # remove text in parentheses
    return cleaned.strip().upper()  # remove spaces, make uppercase for consistency

## Defining flowchart fonctions

In [9]:
#IS REACTION DEFINED IN MEDIA
def is_defined_inmedia(rxn_id):
    """
    Check if a reaction is already defined in the media conditions.
    
    Parameters
    ----------
    rxn_id : str
        Reaction ID (e.g., 'EX_glc__D_e')

    Returns
    -------
    bool
        True if the reaction is marked as an exchange reaction
    """
    return rxn_id.startswith("EX")

In [10]:
def rxn_essential(model):
    df_res_rxn = single_reaction_deletion(model, processes=10)
    df_essential=df_res_rxn[df_res_rxn["status"]!="optimal"] # remove non essential
    df_essential["rxn_ids"]=df_essential.ids.apply(lambda x: list(x)[0] if isinstance(x, (set, frozenset)) else x) # remove {} around ids

    return df_essential

In [11]:
def is_essential_reaction(df, rxn):
    """
    Check if a reaction ID is present in the dataframe of essential reactions.
    
    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing at least a 'rxn_ids' column.
    rxn : str
        Reaction ID to check.
    
    Returns
    -------
    bool
        True if rxn is in df['rxn_ids'], else False.
    """
    return rxn in df['rxn_ids'].values

In [12]:
#DO WE HAVE TRANSCIPTOMICS FOR THE REACTION GENES
def has_transcriptomics(reaction, CCLE_genes):
    """
    Checks whether transcriptomics data are available for the gene(s)
    associated with a given reaction.
    
    Parameters
    ----------
    reaction : cobra.Reaction
        Reaction object from the COBRA model.
    CCLE_expression : pandas.DataFrame
        Transcriptomics dataset where columns are gene IDs (e.g., ENSG IDs).
    
    Returns
    -------
    bool
        True if at least one gene in the reaction is found in CCLE_expression, else False.
    """
    
    # Get all gene IDs linked to this reaction
    reaction_genes = [gene.name for gene in reaction.genes]
    
    
    # Check if any gene in the reaction exists in the expression dataframe
    for g in reaction_genes:
        if g in CCLE_genes:
            return True  # we have transcriptomics for at least one gene
    
    # If none of the genes matched
    return False


## Testing flowchart fonctions

In [13]:
# Create a quick-access set of cleaned CCLE column names
CCLE_gene_names = { clean_gene_name(c) for c in CCLE_expression.columns }

In [14]:
#testing if the function works
results = []
for rxn in model.reactions:
    results.append(has_transcriptomics(rxn, CCLE_gene_names))
if any(results):
    print("At least one element is True")
    print(sum(results))

At least one element is True
5925


In [15]:
results = []
for rxn in model.reactions:
    results.append(is_defined_inmedia(rxn.id))
if any(results):
    print("At least one element is True")
    print(sum(results))

At least one element is True
1560


In [16]:
defined_rxns = []
for rxn in model.reactions:
    if is_defined_inmedia(rxn.id):
            defined_rxns.append(rxn.id)
print(defined_rxns)

['EX_5adtststerone_e', 'EX_5adtststerones_e', 'EX_5fthf_e', 'EX_5htrp_e', 'EX_5mthf_e', 'EX_5thf_e', 'EX_6dhf_e', 'EX_6htststerone_e', 'EX_10fthf5glu_e', 'EX_10fthf6glu_e', 'EX_10fthf7glu_e', 'EX_11_cis_retfa_e', 'EX_13_cis_retnglc_e', 'EX_24nph_e', 'EX_25hvitd3_e', 'EX_2hb_e', 'EX_2mcit_e', 'EX_34dhphe_e', 'EX_35cgmp_e', 'EX_4hphac_e', 'EX_4mptnl_e', 'EX_7dhf_e', 'EX_7thf_e', 'EX_9_cis_retfa_e', 'EX_acetone_e', 'EX_acgalfucgalacgalfuc12gal14acglcgalgluside_hs_e', 'EX_acnacngal14acglcgalgluside_hs_e', 'EX_adp_e', 'EX_ahandrostanglc_e', 'EX_ala_B_e', 'EX_ala__D_e', 'EX_aldstrn_e', 'EX_andrstrn_e', 'EX_andrstrnglc_e', 'EX_antipyrene_e', 'EX_appnn_e', 'EX_arach_e', 'EX_avite1_e', 'EX_avite2_e', 'EX_bhb_e', 'EX_bildglcur_e', 'EX_biocyt_e', 'EX_bvite_e', 'EX_caro_e', 'EX_carveol_e', 'EX_chol_e', 'EX_cholate_e', 'EX_chtn_e', 'EX_clpnd_e', 'EX_coumarin_e', 'EX_creat_e', 'EX_crmp_hs_e', 'EX_crn_e', 'EX_crtstrn_e', 'EX_crvnc_e', 'EX_cspg_c_e', 'EX_cspg_e_e', 'EX_cyan_e', 'EX_dag_hs_e', 'EX_dcsp

## Defining bound-changing Functions

In [17]:
def open_bounds(rxn):
    """
    Set reaction bounds fully open depending on reversibility.
    """
    if rxn.reversibility:
        rxn.bounds = (-1000, 1000)
    else:
        rxn.bounds = (0, 1000)
    

In [18]:
def clean_bounds(rxn, tol=1e-6):
    lb, ub = rxn.lower_bound, rxn.upper_bound

    # Round tiny bounds to zero
    if abs(lb) < tol:
        lb = 0.0
    if abs(ub) < tol:
        ub = 0.0

    # If still inverted, collapse to zero flux
    if lb > ub:
        lb, ub = 0.0, 0.0

    rxn.bounds = (lb, ub)

In [19]:
def classify_rule(rxn):
    rule = rxn.gene_reaction_rule.lower()
    if "and" in rule and "or" not in rule:
        return "and_rule"
    elif "or" in rule and "and" not in rule:
        return "or_rule"
    elif "and" not in rule and "or" not in rule and rule != "":
        return "one_gene"
    else:
        return None

In [20]:
def calculate_new_bounds(rxn, rule_type, cell_line, CCLE_name_map, tol=1e-6):
    matched_cols = [CCLE_name_map[g.name] for g in rxn.genes if g.name in CCLE_name_map]
    if not matched_cols:
        return

    expr_values = CCLE_expression.loc[cell_line, matched_cols].astype(float).tolist()

    if rule_type == "one_gene":
        E = expr_values[0]
    elif rule_type == "or_rule":
        E = sum(expr_values)
    elif rule_type == "and_rule":
        E = min(expr_values)
    else:
        return

    # Ensure E is valid and non-negative
    if not pd.notnull(E) or E < 0:
        return

    # Determine new bounds
    if rxn.reversibility:
        lb = -E
        ub = E
    else:
        lb = 0.0
        ub = E

    # --- Critical part: clean tiny noise BEFORE assigning ---
    if abs(lb) < tol: lb = 0.0
    if abs(ub) < tol: ub = 0.0
    if lb > ub:
        lb, ub = (0.0, 0.0)

    # --- Assign both bounds at once (avoids ValueError) ---
    rxn.bounds = (lb, ub)


## Testing bound-changing Functions

In [21]:
# create mapping: cleaned_name â†’ original_column_name
CCLE_name_map = { clean_gene_name(col): col for col in CCLE_expression.columns }

In [22]:
df_essential = rxn_essential(model)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_essential["rxn_ids"]=df_essential.ids.apply(lambda x: list(x)[0] if isinstance(x, (set, frozenset)) else x) # remove {} around ids


In [23]:
# Choose the cell line 
cell_line = "ACH-000520"        
experimental_growth = 0.02083

In [28]:
for rxn in model.reactions[0:100]:
    clean_bounds(rxn)
    print(f"\nðŸ”¹ Checking reaction: {rxn.id}")
    print(f"Initial bounds: {rxn.bounds}")
    if is_defined_inmedia(rxn.id):
        print("   â†’ Defined in media. Skipping.")
        continue  # leave bounds as set by media definition
    if is_essential_reaction(df_essential, rxn.id):
        open_bounds(rxn)
        continue
    if has_transcriptomics(rxn, CCLE_gene_names):
        rule_type = classify_rule(rxn)
        print(f"   â†’ Has transcriptomics data. Rule type: {rule_type}")
        calculate_new_bounds(rxn, rule_type, cell_line, CCLE_name_map)
        print(f"   â†’ New bounds after calculation: {rxn.bounds}")
        solution = model.optimize()
        growth = solution.fluxes.get("BIOMASS_reaction", 0)
        if growth <= experimental_growth:
            print("   â†’ Growth â‰¤ experimental. opening bounds.")
            open_bounds(rxn)
            print(f"   â†’ Opened bounds: {rxn.bounds}")
            model.optimize()
    else:
        print("   â†’ No transcriptomics data. Opening bounds.")
        open_bounds(rxn)
        print(f"   â†’ Opened bounds: {rxn.bounds}")



ðŸ”¹ Checking reaction: 24_25DHVITD3tm
Initial bounds: (0.0, 1000)
   â†’ No transcriptomics data. Opening bounds.
   â†’ Opened bounds: (0, 1000)

ðŸ”¹ Checking reaction: 25HVITD3t
Initial bounds: (0.0, 1000)
   â†’ No transcriptomics data. Opening bounds.
   â†’ Opened bounds: (0, 1000)

ðŸ”¹ Checking reaction: COAtl
Initial bounds: (0.0, 1000)
   â†’ No transcriptomics data. Opening bounds.
   â†’ Opened bounds: (0, 1000)

ðŸ”¹ Checking reaction: EX_5adtststerone_e
Initial bounds: (np.float64(-100.0), np.float64(100.0))
   â†’ Defined in media. Skipping.

ðŸ”¹ Checking reaction: EX_5adtststerones_e
Initial bounds: (np.float64(-100.0), np.float64(100.0))
   â†’ Defined in media. Skipping.

ðŸ”¹ Checking reaction: EX_5fthf_e
Initial bounds: (np.float64(-100.0), 0.0)
   â†’ Defined in media. Skipping.

ðŸ”¹ Checking reaction: EX_5htrp_e
Initial bounds: (0.0, np.float64(100.0))
   â†’ Defined in media. Skipping.

ðŸ”¹ Checking reaction: EX_5mthf_e
Initial bounds: (np.float64(-100.0), n