In [33]:
import os
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LogNorm


In [34]:

def collect_results(root_dir):
    """Load every optimised-result CSV found under ``root_dir``.

    The expected layout is::

        <root_dir>/log_<log>_scale_<scale>/<metric>/Processing_Optimized.csv

    where ``<metric>`` is one of ``ABS``, ``RE_A`` or ``RMSE``.

    Parameters
    ----------
    root_dir : str or path-like
        Directory whose immediate sub-directories are scanned.

    Returns
    -------
    dict
        Maps ``(log, scale, metric)`` (all strings taken from the folder
        names) to the ``pandas.DataFrame`` read from the matching CSV.
        Folders that do not follow the naming convention, and metric
        folders without a ``Processing_Optimized.csv``, are skipped.
    """
    results = {}
    metrics = ["ABS", "RE_A", "RMSE"]

    # sorted(): os.listdir returns entries in arbitrary order; sorting makes the
    # insertion order of `results` reproducible from one run to the next.
    for folder in sorted(os.listdir(root_dir)):
        full_path = os.path.join(root_dir, folder)
        if not os.path.isdir(full_path):
            continue

        # Expected folder name: log_<log>_scale_<scale>.
        # maxsplit=3 keeps a scale name that itself contains "_" in one piece,
        # so e.g. "min_max" and "min_abs" do not collapse onto the same key.
        parts = folder.split("_", 3)
        if len(parts) != 4 or parts[0] != "log" or parts[2] != "scale":
            # Not a result folder (e.g. an output directory): ignore it.
            continue
        log, scale = parts[1], parts[3]

        # Look for the ABS / RE_A / RMSE sub-folders.
        for metric in metrics:
            csv_path = os.path.join(full_path, metric, "Processing_Optimized.csv")
            # isfile() is False when the metric folder itself is missing.
            if os.path.isfile(csv_path):
                # Key = (log, scale, metric)
                results[(log, scale, metric)] = pd.read_csv(csv_path)

    return results


In [35]:
# Baseline tables: `Ref` is the reference every other table is correlated
# against; `Start` is the table reported under the "START" label.
Ref = pd.read_csv("Processing_Detailed.csv")
Start = pd.read_csv("Processing_Start.csv")

# Optimised results gathered from the sub-folders of `root`,
# keyed by (log, scale, metric).
root = "./"
data_dict = collect_results(root_dir=root)



In [36]:
# Species columns used for the correlations; in the heatmap these rows are
# listed first and their labels are drawn in bold.
input_fitness_sp = [
    "Y_NH3",
    "Y_H2",
    "Y_O2",
    "Y_H2O",
    "Y_NO",
    "Y_NO2",
    "Y_N2O",
    "Y_NNH",
    "Y_HNO",
    "Y_N2",
]


In [67]:

def _pearson_by_species(ref_df, other_df, species):
    """Return ``{species: r}``, r being the Pearson coefficient between the
    same-named columns of ``ref_df`` and ``other_df``."""
    correlations = {}
    for sp in species:
        r, _ = stats.pearsonr(ref_df[sp].values, other_df[sp].values)
        correlations[sp] = r
    return correlations


def compute_and_plot_pearson_for(log, scale, ref_df, start_df, subset, input_fitness, input_fitness_sp):
    """Correlate each table with the reference, species by species, and save
    the result as an annotated heatmap.

    Parameters
    ----------
    log, scale
        Only used in the figure title and in the output file name.
    ref_df : pandas.DataFrame
        Reference table; must contain every column named in ``input_fitness``.
    start_df : pandas.DataFrame
        Table reported in the ``"START"`` column of the heatmap.
    subset : dict
        Maps a column label (the metric name) to the DataFrame to compare
        with ``ref_df``.
    input_fitness : iterable of str
        Column names for which a Pearson coefficient is computed.
    input_fitness_sp : iterable of str
        Species placed first in the heatmap and labelled in bold; a
        horizontal line separates them from the remaining species.

    Returns
    -------
    None
        The figure is written to
        ``png_pearson/pearson_log_<log>_scale_<scale>.png`` (300 dpi) and then
        closed. The ``png_pearson`` and ``csv_pearson`` directories are
        created when missing.

    Notes
    -----
    ``stats.pearsonr`` requires both columns to have the same length.
    The colour scale is logarithmic between 0.5 and 1.
    NOTE(review): non-positive coefficients cannot be placed on a log scale;
    confirm that this is acceptable for the data at hand.
    """
    # One heatmap column per compared table: "START" first, then each metric.
    results = {"START": _pearson_by_species(ref_df, start_df, input_fitness)}
    for metric, df in subset.items():
        results[metric] = _pearson_by_species(ref_df, df, input_fitness)

    df_corr = pd.DataFrame(results)

    # Reorder rows: species from input_fitness_sp first (in that order), then the others.
    priority_species = [sp for sp in input_fitness_sp if sp in df_corr.index]
    other_species = [sp for sp in df_corr.index if sp not in input_fitness_sp]
    df_corr_sorted = df_corr.loc[priority_species + other_species]

    # --- Heatmap
    # Explicit figure/axes instead of the implicit pyplot "current figure", so
    # the function never draws on, saves, or closes somebody else's figure.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.heatmap(
            df_corr_sorted,
            annot=True,
            fmt=".3f",
            cmap="flare_r",
            norm=LogNorm(0.5, 1),
            cbar_kws={"label": "r (Pearson)"},
            # Force one tick per row: the default "auto" may thin the ticks on
            # tall tables, which would break the one-to-one label/weight
            # assignment below.
            yticklabels=True,
            ax=ax,
        )

        ylabels = df_corr_sorted.index.tolist()
        weights = ["bold" if lab in input_fitness_sp else "normal" for lab in ylabels]
        ax.set_yticklabels(ylabels, rotation=0)
        for ticklabel, weight in zip(ax.get_yticklabels(), weights):
            ticklabel.set_fontweight(weight)

        ax.set_ylabel("Species", fontsize=12)
        ax.set_xlabel("Mechanism", fontsize=12)
        ax.set_title(f"Log : {log} || Scale : {scale}")

        # Separator between the priority species and the remaining ones.
        if len(priority_species) > 0 and len(other_species) > 0:
            ax.hlines(len(priority_species), *ax.get_xlim(), linewidth=1.5)

        fig.tight_layout()
        os.makedirs("png_pearson", exist_ok=True)
        # CSV export is currently disabled; the directory is still created so
        # that existing behaviour (and re-enabling the export) is unaffected.
        os.makedirs("csv_pearson", exist_ok=True)
        fname = f"png_pearson/pearson_log_{log}_scale_{scale}.png"
        # df_corr_sorted.to_csv(f"csv_pearson/pearson_log_{log}_scale_{scale}.csv")
        fig.savefig(fname, dpi=300)
    finally:
        # Always release the figure, even when plotting or saving fails;
        # this function is called in a loop.
        plt.close(fig)
    
    
    



In [68]:
# Draw one heatmap per (log, scale) pair. data_dict holds one entry per
# (log, scale, metric), so each pair appears once per metric: remember the
# pairs already handled so the same figure is not recomputed and rewritten.
seen_pairs = set()
for (log, scale, metric) in data_dict.keys():
    # Skip if this pair was already processed.
    if (log, scale) in seen_pairs:
        continue
    seen_pairs.add((log, scale))

    # All metric tables that belong to this (log, scale) pair.
    subset = {m: df for (l, s, m), df in data_dict.items()
              if l == log and s == scale}

    compute_and_plot_pearson_for(log, scale, Ref, Start, subset,
                                 input_fitness_sp, input_fitness_sp)


