In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.metrics import r2_score
import matplotlib.cm as cm

In [None]:
# Font settings (shared base font; plotting cells copy it and adjust the size)
legend_font = FontProperties(family='Arial', style='normal', size=9)

# Base path to all datasets
base_path = "/home/tvanhout/oxides_ML/models/NCV/"
# Output directory for saved figures (relative to the notebook's working directory)
plt_path = "db_figures/"

# List of subdirectories (each is one NCV experiment)
directories = ["Db1", "Db1_TiO2",
               "Db2", "Db2_TiO2",
               "Db3", "Db3_TiO2",]

# Human-readable x-axis labels (must match order of 'directories')
xtick_labels = ["Db1", "Db1 w/ Ti",
                "Db2", "Db2 w/ Ti",
                "Db3", "Db3 w/ Ti"]

# zip() silently truncates to the shorter list, which would drop experiments
# without any warning if the two lists above get out of sync.
if len(directories) != len(xtick_labels):
    raise ValueError(
        f"'directories' ({len(directories)}) and 'xtick_labels' "
        f"({len(xtick_labels)}) must have the same length"
    )

# Column layout of the comma-joined records stored in the first CSV column,
# followed by the run identifier taken from the 'run' column.
summary_columns = [
    "System", "Material", "Surface", "Molecule Group", "Molecule",
    "State", "Dissociation", "True_eV", "Prediction_eV",
    "Error_eV", "Abs_error_eV", "run"
]
# Fields that arrive as strings after the split and must be cast to float.
numeric_columns = ["True_eV", "Prediction_eV", "Error_eV", "Abs_error_eV"]

# Store results: one (group, label, mean MAE, std of MAE) tuple per experiment
results = []

for dir_name, label in zip(directories, xtick_labels):
    df_path = os.path.join(base_path, dir_name, "summary.csv")
    df = pd.read_csv(df_path)

    # The first column holds one comma-separated record per row; split it
    # into fields and append the run id from the 'run' column.
    data_col = df.columns[0]
    all_dfs = [
        record.split(",") + [run_id]
        for record, run_id in zip(df[data_col], df["run"])
    ]

    df_summary = pd.DataFrame(all_dfs, columns=summary_columns)
    df_summary[numeric_columns] = df_summary[numeric_columns].astype(float)

    # MAE within each run, then mean and (sample) standard deviation across runs.
    mae_per_run = df_summary.groupby("run")["Abs_error_eV"].mean()
    mae_nested = mae_per_run.mean()
    std_nested = mae_per_run.std()

    group = dir_name.split("_")[0]  # e.g., "Db1"
    results.append((group, label, mae_nested, std_nested))

# Convert to DataFrame
plot_df = pd.DataFrame(results, columns=["Group", "Label", "MAE", "STD"])

# Assign consistent colors to all bars.
# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was deprecated
# in Matplotlib 3.7 and removed in 3.9; it accepts the same (name, lut) arguments.
unique_labels = plot_df["Label"].tolist()
cmap = plt.get_cmap('tab10', len(unique_labels))
label_color_map = {label: cmap(i) for i, label in enumerate(unique_labels)}

In [None]:
# Preview the aggregated table (one row per experiment: Group, Label, MAE, STD).
plot_df.head(n=6)

In [None]:
# === Plot 1: All Experiments Together ===
# Figure size is specified in centimetres and converted to inches (1 in = 2.54 cm).
fig, ax = plt.subplots(figsize=(14/2.54, 6/2.54), dpi=300)

bars = ax.bar(
    plot_df["Label"],
    plot_df["MAE"],
    yerr=plot_df["STD"],
    capsize=5,
    color=[label_color_map[label] for label in plot_df["Label"]],
    edgecolor="black",
    linewidth=0.5
)

# FontProperties.set_size() mutates in place and returns None, so chaining it
# onto .copy() would pass fontproperties=None and silently fall back to the
# default font. Build the sized copies explicitly instead.
ylabel_font = legend_font.copy()
ylabel_font.set_size(11)
tick_font = legend_font.copy()
tick_font.set_size(9)

ax.set_ylabel(r"$\it{\mathrm{MAE}}$ / eV", fontproperties=ylabel_font)
# Categorical bars are placed at integer positions 0..n-1, so pin the ticks
# there before assigning the tick labels.
ax.set_xticks(np.arange(len(plot_df)))
ax.set_xticklabels(plot_df["Label"], rotation=45, ha="center", fontproperties=tick_font)
ax.tick_params(axis='y', labelsize=9)
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{x:.2f}"))
# Title currently disabled. To re-enable, build the font first (see note above):
# title_font = legend_font.copy(); title_font.set_size(12)
# ax.set_title("Nested Cross-Validation MAE per Dataset", fontproperties=title_font)

plt.tight_layout()
# Make sure the output directory exists; savefig does not create it.
os.makedirs(plt_path, exist_ok=True)
fig.savefig(os.path.join(plt_path, "NCV_MAE.svg"), dpi=300, bbox_inches="tight")
fig.savefig(os.path.join(plt_path, "NCV_MAE.png"), dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# === Plot 2: One Plot per Group (Db1, Db2, Db3) ===
# FontProperties.set_size() mutates in place and returns None, so chaining it
# onto .copy() would pass fontproperties=None and silently fall back to the
# default font. Build the sized copies once, outside the loop.
ylabel_font = legend_font.copy()
ylabel_font.set_size(11)
tick_font = legend_font.copy()
tick_font.set_size(9)
title_font = legend_font.copy()
title_font.set_size(12)

for group_name, group_df in plot_df.groupby("Group"):
    # Figure size is specified in centimetres and converted to inches.
    fig, ax = plt.subplots(figsize=(12/2.54, 6/2.54), dpi=300)

    bars = ax.bar(
        group_df["Label"],
        group_df["MAE"],
        yerr=group_df["STD"],
        capsize=5,
        color=[label_color_map[label] for label in group_df["Label"]],
        edgecolor="black",
        linewidth=0.5
    )

    ax.set_ylabel(r"$\it{MAE}$ / eV", fontproperties=ylabel_font)
    # Categorical bars sit at integer positions 0..n-1; pin the ticks there
    # before assigning the tick labels.
    ax.set_xticks(np.arange(len(group_df)))
    ax.set_xticklabels(group_df["Label"], ha="center", fontproperties=tick_font)
    ax.tick_params(axis='y', labelsize=9)
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{x:.2f}"))
    ax.set_title(f"Nested CV MAE – {group_name}", fontproperties=title_font)

    plt.tight_layout()
    plt.show()
    # Release the figure so repeated runs / many groups do not accumulate
    # open figures in the kernel.
    plt.close(fig)