 # Flux comparison

 In FIG. 4, the authors compared subsystems by selecting the most de-regulated reactions across cell types (LG and HG). I think they optimized each of the 7 models for maximum biomass (i.e. they generated 7 FBA solutions) and then computed the average of each flux across cell lines of the same family.



 However, in doing so they selected only one of the many solutions that respect max biomass growth. I suggest the following:



 1. Impose biomass at its upper bound for each model (LB = UB*0.90 to avoid solver numerical issues)

 2. Run flux sampling with OPTGP (cobrapy) with thinning = 100 and 1k samples per cell line

 3. Do not 'summarize' flux probability distributions with a simple average; use more advanced methods instead. The objective here is to identify the reactions that differ most between the two cancer families. You could run non-parametric statistical tests such as Mann-Whitney to check whether two probability distributions are significantly different or not. You have 3 cells vs 2 cells (all pair combinations), so you could perform this test only on reactions belonging to core subsystems such as glycolysis, the TCA cycle, the pentose phosphate pathway, etc., in order to reduce the number of compared distributions per cell pair.

 4. Once you identified the top-n most different probability distributions (reaction fluxes) across cells of different type, you could plot them with boxplots as the authors did

 5. It might be interesting to check if we have 'less differences' in distributions of cells belonging to the same family.

In [None]:
from cobra.sampling import sample
import pandas as pd
import numpy as np
import pandas as pd
from cobra.io.json import load_json_model
import numpy as np
from cobra.flux_analysis import flux_variability_analysis
import os



In [None]:
# Load the metabolic model from its JSON export (file name: Recon3D.json).
model = load_json_model('./data/Recon3D.json')
# Use Gurobi as the optimisation solver for this model.
model.solver = 'gurobi'
# Every later model.optimize() call maximises flux through this reaction.
model.objective = 'BIOMASS_reaction'


 # Separating reactions into pathways

In [None]:
# Maps a normalised subsystem label -> list of reaction IDs in that subsystem.
subsystem_dict = {}

for reaction in model.reactions:
    # A missing or None subsystem is treated as an empty label.
    raw_label = getattr(reaction, "subsystem", "") or ""

    # Normalise: trim, keep only the text before the first "/", trim again,
    # then lower-case.
    label = raw_label.strip().partition("/")[0].strip().lower()

    # Reactions without a usable subsystem label are left out.
    if label:
        subsystem_dict.setdefault(label, []).append(reaction.id)


In [None]:
# Keyword lists used to assign model subsystems to the three pathways of
# interest (glycolysis, TCA cycle, pentose phosphate pathway).
# A subsystem is assigned to a category when its lower-cased name CONTAINS any
# of the category's keywords (substring match), so keywords must be lower-case.
category_keywords = {
    # Glycolysis / gluconeogenesis subsystems
    "Glycolysis": [
        "glycolysis", "gluconeogenesis"
    ],
    
    # TCA (citric acid / Krebs) cycle subsystems
    "TCA": [
        "tca", "citric acid", "krebs", "tricarboxylic"
    ],
    
    # Pentose phosphate pathway subsystems
    "Pentose Phosphate Pathway": [
        "pentose phosphate", "ppp"
    ]
}


In [None]:
# One (initially empty) list of reaction IDs per category; re-running this cell
# resets the lists before they are filled by the matching loop.
selected = {cat: [] for cat in category_keywords}


In [None]:
for subsystem_label, reaction_ids in subsystem_dict.items():
    label_lc = subsystem_label.lower()

    # A subsystem is added to every category it matches (no early exit).
    # Duplicates are NOT removed here; the lists are de-duplicated later
    # with set().
    for category in category_keywords:
        matched_keywords = [
            kw for kw in category_keywords[category] if kw in label_lc
        ]
        if matched_keywords:
            selected[category] += reaction_ids


In [None]:
# Report the number of distinct reaction IDs collected for each category.
for category_name in selected:
    unique_ids = set(selected[category_name])
    print(f"{category_name}: {len(unique_ids)} reactions")


In [None]:
# Optional sanity check: show up to five TCA reactions with the subsystem the
# model assigns to them. The order is arbitrary because it comes from a set.
tca_examples = list(set(selected["TCA"]))[:5]
for example_id in tca_examples:
    example_rxn = model.reactions.get_by_id(example_id)
    print(example_id, " â†’ ", example_rxn.subsystem)


In [None]:
# Short pathway key -> category name used in `selected`.
_category_labels = {
    "glycolysis": "Glycolysis",
    "tca": "TCA",
    "ppp": "Pentose Phosphate Pathway",
}

# Final dictionary: de-duplicated reaction IDs for the three pathways only.
reactions_by_category = {
    short_key: list(set(selected[full_name]))
    for short_key, full_name in _category_labels.items()
}

print("Categories created:", reactions_by_category.keys())

## Flux sampling

In [None]:
# Cell lines of the two groups, as {cell line ID: cell line name}.
# The ID is used to locate the per-cell-line FVA file and the name is used in
# the flux-sampling output file name (see run_sampling_for_group).
# NOTE(review): the "ACH-" keys look like DepMap identifiers -- confirm.
low_grade = {
    'ACH-000520': '59M',
    'ACH-000542': 'HEYA8',
    'ACH-000091': 'OV56'
}
high_grade = {
    'ACH-000256': 'COV318',
    'ACH-000713': 'CAOV3',
    'ACH-000116': 'OAW28'
}


In [None]:
# Create the directory that the sampling step writes its CSV files to and that
# the comparison step reads them from ("data/flux_sampling_data"). The previous
# name ("data/flux_sampling_data_new") was never used by any other cell, so
# saving could fail on a fresh checkout.
os.makedirs("data/flux_sampling_data", exist_ok=True)


In [None]:
def run_sampling_for_group(group_dict, suffix):
    """Sample the flux space of every cell line in a group and save it as CSV.

    For each cell line, the per-reaction bounds stored in
    ``./data/fva_rnaseq_<id>_<suffix>.csv`` are applied to the global
    ``model``, the biomass reaction is constrained to [0.9 * UB, UB] (UB being
    its upper bound after the bounds were applied), and 1000 OptGP samples
    (thinning 100, seed 42) are written to
    ``./data/flux_sampling_data/flux_sampling_<suffix>_<name>.csv``.

    All bound changes are made inside a ``with model:`` block so that they are
    reverted before the next cell line is processed; otherwise constraints
    (in particular the biomass lower bound) could leak from one cell line to
    the next whenever a reaction is missing from the next FVA file.

    Args:
        group_dict: mapping of cell line ID -> cell line name.
        suffix: group tag used in input and output file names (e.g. "LG").

    Returns:
        None. The samples are written to disk.
    """
    out_dir = "./data/flux_sampling_data"
    # Make sure the target directory exists before hours of sampling are spent.
    os.makedirs(out_dir, exist_ok=True)

    for cl_id, cl_name in group_dict.items():
        print(f"\n=== Processing {cl_name} ({cl_id}) â€” {suffix} ===")

        # Load the FVA-derived bounds (index = reaction ID).
        fva_path = f'./data/fva_rnaseq_{cl_id}_{suffix}.csv'
        df_fva = pd.read_csv(fva_path, index_col=0)

        # Missing bounds become 0; rows with minimum > maximum are zeroed too,
        # which blocks those reactions instead of producing invalid bounds.
        df_fva[['minimum', 'maximum']] = df_fva[['minimum', 'maximum']].fillna(0.0)
        invalid = df_fva['minimum'] > df_fva['maximum']
        df_fva.loc[invalid, ['minimum', 'maximum']] = 0.0

        # Everything changed on the model inside this block is rolled back
        # when the block exits.
        with model:
            # Inject the cell-line-specific reaction bounds into the model.
            for rxn_id, row in df_fva.iterrows():
                model.reactions.get_by_id(rxn_id).bounds = (row['minimum'], row['maximum'])

            # Feasibility check (printed only; an infeasible model is not skipped).
            sol = model.optimize()
            print(f"   Status: {sol.status}, Growth: {sol.objective_value}")

            # Force biomass to at least 90% of its upper bound.
            biomass = model.reactions.get_by_id('BIOMASS_reaction')
            UB = biomass.upper_bound
            biomass.bounds = (0.9 * UB, UB)

            # Run flux sampling on the constrained model.
            print("   Running OPTGP sampling...")
            samples = sample(model, n=1000, method="optgp", thinning=100, seed=42)

        # Save the samples (rows = samples, columns = reactions).
        out_path = f"{out_dir}/flux_sampling_{suffix}_{cl_name}.csv"
        samples.to_csv(out_path)

        print(f"   Saved â†’ {out_path}")


In [None]:
# Sample both groups: low grade first, then high grade.
for cell_lines, grade_tag in ((low_grade, "LG"), (high_grade, "HG")):
    run_sampling_for_group(cell_lines, grade_tag)


 # Flux distribution comparisons

In [None]:
# Load the saved flux samples, one DataFrame per cell line (columns are
# reaction IDs, as used by the comparison code below).
# The dead triple-quoted copy of this code was removed and the six repeated
# path literals are now built from the cell line name.
LG = {
    name: pd.read_csv(
        f"data/flux_sampling_data/flux_sampling_LG_{name}.csv", index_col=0
    )
    for name in ("59M", "HEYA8", "OV56")
}

HG = {
    name: pd.read_csv(
        f"data/flux_sampling_data/flux_sampling_HG_{name}.csv", index_col=0
    )
    for name in ("COV318", "CAOV3", "OAW28")
}



 ## Automated version

In [None]:
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

# One row per (pathway, reaction, LG cell line, HG cell line) comparison.
results = []

# Compare every LG cell line against every HG cell line, reaction by reaction,
# restricted to the reactions of the selected pathways.
for pathway, reactions in reactions_by_category.items():
    for rxn in reactions:
        for lg_name, lg_df in LG.items():
            for hg_name, hg_df in HG.items():

                # Skip reactions that were not sampled in both cell lines.
                if rxn not in lg_df.columns or rxn not in hg_df.columns:
                    continue

                lg_values = lg_df[rxn].dropna()
                hg_values = hg_df[rxn].dropna()

                # Nothing to compare if either distribution is empty.
                if lg_values.empty or hg_values.empty:
                    continue

                mean_lg = lg_values.mean()
                mean_hg = hg_values.mean()

                # Relative difference FD = |(mean_lg - mean_hg) / mean_hg|.
                # It is undefined (NaN) when the HG mean is exactly 0; such
                # rows never pass the fold-change filter below.
                if mean_hg == 0:
                    FD = np.nan
                else:
                    FD = abs((mean_lg - mean_hg) / mean_hg)

                # Two-sided Mann-Whitney U test between the two sampled
                # flux distributions.
                stat, pval = mannwhitneyu(lg_values, hg_values, alternative="two-sided")

                results.append([
                    pathway, rxn, lg_name, hg_name,
                    stat, pval, mean_lg, mean_hg, FD
                ])

# Build DataFrame
df_stats = pd.DataFrame(results, columns=[
    "pathway", "reaction", "LG", "HG",
    "U_stat", "p_value", "mean_LG", "mean_HG", "fold_change"
])

# Multiple testing correction (Benjamini-Hochberg FDR) over all comparisons.
# With no comparisons there is nothing to correct; add an empty column so the
# filtering below still works.
if df_stats.empty:
    df_stats["p_adjusted"] = pd.Series(dtype=float)
else:
    df_stats["p_adjusted"] = multipletests(df_stats["p_value"], method="fdr_bh")[1]

# Keep significant results with a large relative difference.
signif = df_stats[(df_stats["p_adjusted"] < 0.01) & (df_stats["fold_change"] > 0.90)].copy()

# Top 5 MOST different reactions per pathway.
# The sort must be descending: with an ascending sort, head(5) returned the
# five significant reactions with the SMALLEST fold change, and
# drop_duplicates kept each reaction's weakest cell-line pair.
top5_per_pathway = (
    signif
    .sort_values(by="fold_change", ascending=False)
    # Keep one row per reaction: its cell-line pair with the largest difference.
    .drop_duplicates(subset=['reaction'], keep='first')
    .groupby("pathway")
    .head(5)
    .reset_index(drop=True)
)


In [None]:
def build_pathway_dict_for_cell(flux_df, top5_per_pathway):
    """Split one cell line's flux samples into per-pathway DataFrames.

    Args:
        flux_df: flux sampling DataFrame for ONE cell line
            (rows = samples, columns = reaction IDs).
        top5_per_pathway: DataFrame with at least the columns "pathway" and
            "reaction", listing the selected reactions.

    Returns:
        dict mapping pathway -> copy of ``flux_df`` restricted to the selected
        reactions of that pathway that exist in ``flux_df``. Pathways with no
        such reaction are omitted.
    """
    sampled_columns = flux_df.columns
    selection = {}

    for pathway_name in top5_per_pathway["pathway"].unique():
        # Reactions selected for this pathway, in table order.
        in_pathway = top5_per_pathway["pathway"] == pathway_name
        candidates = top5_per_pathway.loc[in_pathway, "reaction"].tolist()

        # Drop reactions that were not sampled for this cell line.
        available = [rid for rid in candidates if rid in sampled_columns]
        if not available:
            continue

        selection[pathway_name] = flux_df[available].copy()

    return selection





In [None]:
# {cell line -> {pathway -> samples of the selected reactions}} for each group.
LG_pathway_dicts = {
    name: build_pathway_dict_for_cell(samples, top5_per_pathway)
    for name, samples in LG.items()
}

HG_pathway_dicts = {
    name: build_pathway_dict_for_cell(samples, top5_per_pathway)
    for name, samples in HG.items()
}

# Both groups in one dictionary, LG cell lines first.
all_pathway_dicts = dict(LG_pathway_dicts)
all_pathway_dicts.update(HG_pathway_dicts)


In [None]:
# Display the cell line names for which pathway dictionaries were built.
all_pathway_dicts.keys()


In [None]:
def invert_cellline_pathway_dict(cellline_first_dict):
    """Swap the two nesting levels of a {cell_line -> {pathway -> df}} dict.

    Args:
        cellline_first_dict: mapping cell_line -> {pathway -> df}.

    Returns:
        A new mapping pathway -> {cell_line -> df}. The innermost values are
        the same objects as in the input (not copied).
    """
    inverted = {}

    for cell_name, per_pathway in cellline_first_dict.items():
        for pathway_name, frame in per_pathway.items():
            # Create the per-pathway dict on first use, then register the cell line.
            inverted.setdefault(pathway_name, {})[cell_name] = frame

    return inverted


# Re-key the per-cell-line dictionaries as {pathway -> {cell line -> df}}.
pathway_dicts = invert_cellline_pathway_dict(all_pathway_dicts)
# Display the glycolysis entry. This raises KeyError if no glycolysis
# reaction was selected for any cell line.
pathway_dicts['glycolysis']



In [None]:
# Display the {pathway -> df} dictionary of one LG cell line ('OV56').
LG_pathway_dicts['OV56']




In [None]:
# Names of the LG cell lines, in insertion order; displayed as the cell output.
list_LG_names =list(LG_pathway_dicts.keys())
list_LG_names



In [None]:
# Collect every pathway that has selected reactions in at least one LG cell
# line, in first-seen order. Previously the list was overwritten on every
# iteration, so only the last cell line's pathways were kept and the name was
# left undefined when there were no cell lines.
list_pathways = []
for key in list_LG_names:
    for pathway_name in LG_pathway_dicts[key]:
        if pathway_name not in list_pathways:
            list_pathways.append(pathway_name)


In [None]:
# Display the pathway names collected in the previous cell.
list_pathways


In [None]:
# Inspect the selected glycolysis reactions of cell line 59M. The only pathway
# keys this notebook produces are "glycolysis", "tca" and "ppp"; the previous
# key 'fatty_acid' is not among them and raised KeyError.
LG_pathway_dicts["59M"]['glycolysis']


In [None]:
# Inspect the selected TCA reactions of cell line 59M. The only pathway keys
# this notebook produces are "glycolysis", "tca" and "ppp"; the previous key
# 'lipid' is not among them and raised KeyError.
LG_pathway_dicts["59M"]['tca']


In [None]:
# Long-format view (columns: "reaction", "flux") of the selected glycolysis
# reactions of cell line 59M. The previous key 'fatty_acid' is not a pathway
# produced by this notebook (only "glycolysis", "tca" and "ppp" are) and
# raised KeyError.
df_long = LG_pathway_dicts["59M"]['glycolysis'].melt(var_name="reaction", value_name="flux")
df_long



In [None]:
import matplotlib.pyplot as plt
# Scratch figure with a 1 x 4 grid of axes.
# NOTE(review): `fig` and `axes` are not used by any other cell visible in
# this notebook -- this cell can probably be removed; confirm.
fig, axes = plt.subplots(1, 4, figsize=(10, 4))


In [None]:
import seaborn as sns


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_flux_distributions_by_pathway(pathway, cell_dict):
    """Plot one KDE panel per reaction, overlaying all cell lines.

    For every reaction the per-cell-line variance of the sampled flux is
    printed, and cell lines whose variance is below 1e-10 are reported as
    nearly constant.

    Args:
        pathway: pathway name (string); used in printed messages and the
            figure title only.
        cell_dict: mapping cell_line -> DataFrame (samples x reactions) for
            this pathway.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Long format: one row per (sample, reaction), tagged with its cell line.
    melted_frames = [
        frame.melt(var_name="reaction", value_name="flux").assign(cell_line=name)
        for name, frame in cell_dict.items()
    ]
    df_all = pd.concat(melted_frames, ignore_index=True)

    # One panel per distinct reaction, side by side.
    reaction_ids = df_all["reaction"].unique()
    n_panels = len(reaction_ids)

    fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), sharey=False)

    # With a single panel, subplots() returns a bare Axes rather than an array.
    panel_axes = [axes] if n_panels == 1 else axes

    for panel, rxn_id in zip(panel_axes, reaction_ids):
        rows = df_all.loc[df_all["reaction"] == rxn_id]

        # Spread of the sampled flux within each cell line.
        variances = rows.groupby("cell_line")["flux"].var()

        print(f"\n=== Reaction: {rxn_id} (pathway: {pathway}) ===")
        print(variances)

        # Point out cell lines where the flux barely varies.
        nearly_constant = variances[variances < 1e-10]
        if len(nearly_constant) > 0:
            print("  -> Nearly constant in:", list(nearly_constant.index))

        # common_norm=False: each cell line's density is normalised separately.
        sns.kdeplot(
            data=rows,
            x="flux",
            hue="cell_line",
            fill=True,
            common_norm=False,
            ax=panel,
        )

        panel.set_title(rxn_id)
        panel.set_xlabel("Flux")
        panel.set_ylabel("Density")

    plt.suptitle(f"Flux Distributions â€” {pathway}", fontsize=16)
    plt.tight_layout()
    plt.show()






In [None]:
# Draw one figure per pathway, each overlaying all cell lines.
for pathway, cell_dict in pathway_dicts.items():
    plot_flux_distributions_by_pathway(pathway, cell_dict)




In [None]:
# df_long is the melted table of a single cell line and only has the columns
# "reaction" and "flux", so the curves are coloured by reaction. The previous
# hue="cell_line" referred to a column that does not exist in df_long.
sns.displot(df_long, x="flux", hue="reaction", kind="kde")


