In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

def _skip_without_vf(combine):
    """Wrap `combine(vf, cs)` so it returns None whenever `vf` is None."""
    def guarded(vf, cs):
        if vf is None:
            return None
        return combine(vf, cs)
    return guarded


# Display name -> callable `(vf, cs) -> scores`, where `vf` is the visual
# fidelity score array (or None when unavailable) and `cs` is the
# contrastiveness score array. Metrics that combine both yield None without vf.
METRIC_FUNCTIONS = {
    "VisualFidelity": lambda vf, cs: vf,
    "Contrastiveness": lambda vf, cs: cs,
    "VF*Contrastiveness": _skip_without_vf(lambda vf, cs: vf * cs),
    "Min(VF,Contrast)": _skip_without_vf(np.minimum),
    "Avg(VF,Contrast)": _skip_without_vf(lambda vf, cs: (vf + cs) / 2),
}


def plot_calibration_curve(y_true, y_prob, ax, title):
    """
    Plots a calibration curve (using 10 bins) on the provided axes.
    Returns the computed Expected Calibration Error (ECE).

    Args:
        y_true: Array-like of per-instance correctness labels (converted to float).
        y_prob: Array-like of per-instance scores, aligned with `y_true`.
        ax: Matplotlib axes to draw on; a colorbar is attached to its figure.
        title: Axes title; the ECE is appended on a second line when available.

    Returns:
        The count-weighted mean of |bin accuracy - bin mean score| over all
        non-empty bins, or None when no score falls into any bin.

    Scores are binned into 10 equal-width bins over [0, 1]; the last bin's upper
    bound is 1.01 so that a score of exactly 1.0 is counted. Scores outside
    [0, 1.01) (including NaN) are not counted.
    """
    n_bins = 10
    # Fixed colour scale so bar colours are comparable across plots.
    max_count = 500

    # Positional arrays: lets callers pass lists or pandas Series as well.
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)

    bin_num_positives = []   # per-bin mean of y_true (accuracy); 0 for empty bins
    bin_mean_probs = []      # per-bin mean score; 0 for empty bins
    bin_num_instances = []
    bin_centers = []

    for i in range(n_bins):
        bin_lower = i / n_bins
        bin_upper = (i + 1) / n_bins if i != n_bins - 1 else 1.01  # capture 1.0 values
        in_bin = (y_prob >= bin_lower) & (y_prob < bin_upper)
        count = int(in_bin.sum())
        if count > 0:
            bin_num_positives.append(np.mean(y_true[in_bin]))
            bin_mean_probs.append(np.mean(y_prob[in_bin]))
        else:
            bin_num_positives.append(0)
            bin_mean_probs.append(0)
        bin_num_instances.append(count)
        bin_centers.append(i + 0.5)

    # Calculate ECE (empty bins contribute nothing because their count is 0).
    total_instances = np.sum(bin_num_instances)
    ece = sum(
        count * np.abs(pos - mean_prob)
        for pos, mean_prob, count in zip(bin_num_positives, bin_mean_probs, bin_num_instances)
    )
    ece = ece / total_instances if total_instances > 0 else None

    # Prepare DataFrame for plotting
    df = pd.DataFrame({
        'bin_num_positives': bin_num_positives,
        'bin_centers': bin_centers,
        'bin_num_instances': bin_num_instances
    })

    # Bar height is the bin accuracy; hue encodes the number of instances.
    sns.barplot(x='bin_centers', y='bin_num_positives', data=df, ax=ax,
                hue='bin_num_instances', edgecolor='black',
                linewidth=1.2, width=1, hue_norm=(0, max_count), palette='crest')
    ax.grid(axis='y')
    # Bars sit at categorical positions 0..n_bins-1, so the x-range
    # [-0.5, n_bins-0.5] corresponds to scores [0, 1]; this is the diagonal.
    ax.plot([-0.5, n_bins-0.5], [0, 1], "k--")
    ax.set_xlim(-0.5, n_bins-0.5)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Metric Score", fontsize='x-large')
    ax.set_ylabel("Prediction Accuracy", fontsize='x-large')
    if ece is not None:
        ax.set_title(f"{title}\nCalibration Error = {ece:.4f}", fontsize='large')
    else:
        ax.set_title(title, fontsize='large')
    # Ticks on the bar edges, labelled with the bin boundaries 0.0 .. 1.0.
    ax.set_xticks(np.arange(-0.5, n_bins+0.5, 1))
    ax.set_xticklabels([f"{i/n_bins:.1f}" for i in range(n_bins+1)])
    ax.legend().remove()
    # Colorbar replaces the legend and uses the same fixed 0..max_count scale.
    norm = plt.Normalize(0, max_count)
    sm = plt.cm.ScalarMappable(cmap="crest", norm=norm)
    cbar = ax.figure.colorbar(sm, ax=ax, orientation='vertical', pad=0.02)
    cbar.set_label("Number of Instances", fontsize='x-large')
    return ece


def compute_ece(y_true, y_prob, n_bins=10):
    """Computes Expected Calibration Error (ECE) for given true labels and predicted scores.

    Args:
        y_true: Array-like of correctness labels (converted to float).
        y_prob: Array-like of scores with the same shape as `y_true`.
        n_bins: Number of equal-width bins over [0, 1].

    Returns:
        The count-weighted mean of |bin accuracy - bin mean score| over the
        non-empty bins, or None when no score falls into any bin.

    Raises:
        ValueError: If `y_true` and `y_prob` do not have the same shape.

    The last bin's upper bound is 1.01 so that a score of exactly 1.0 is
    counted. Scores outside [0, 1.01) (including NaN) are not counted.
    """
    # Accept lists / pandas Series as well as NumPy arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)
    if y_true.shape != y_prob.shape:
        raise ValueError(
            f"y_true and y_prob must have the same shape, got {y_true.shape} and {y_prob.shape}"
        )

    weighted_gap = 0.0
    total_instances = 0
    for i in range(n_bins):
        bin_lower = i / n_bins
        bin_upper = (i + 1) / n_bins if i != n_bins - 1 else 1.01
        in_bin = (y_prob >= bin_lower) & (y_prob < bin_upper)
        count = int(in_bin.sum())
        if count == 0:
            continue  # empty bins carry no weight
        gap = np.abs(np.mean(y_true[in_bin]) - np.mean(y_prob[in_bin]))
        weighted_gap += count * gap
        total_instances += count
    return weighted_gap / total_instances if total_instances > 0 else None

def save_individual_plots(data, dataset_type, model):
    """
    For the given data (DataFrame), plots one calibration curve per metric and
    saves each as a PNG under:
    ece_analysis/dataset_type/model/METRIC.png

    The metrics are every entry of METRIC_FUNCTIONS plus, when their columns
    exist, "Support" (column `entail_prob`), "informative" and
    "Commonsense_Plausibility".

    Args:
        data: DataFrame with `is_correct` and `contrastiveness_score` columns;
            `visual_fidelity` and the additional metric columns are optional.
        dataset_type: Dataset identifier, used as a directory name.
        model: Model identifier, used as a directory name.

    Returns:
        Dict mapping each plotted metric name to its ECE.
    """
    save_dir = os.path.join("ece_analysis", dataset_type, model)
    os.makedirs(save_dir, exist_ok=True)

    def column_or_none(column):
        """Return `data[column]` as a float array, or None if the column is absent."""
        if column not in data.columns:
            return None
        return data[column].astype(float).values

    def render(scores, metric_name, file_stem):
        """Plot one calibration curve, save it as `<file_stem>.png`, return its ECE."""
        fig, ax = plt.subplots(figsize=(6, 4), dpi=200)
        try:
            ece = plot_calibration_curve(y_true, scores, ax, metric_name)
            fig.tight_layout()
            fig.savefig(os.path.join(save_dir, f"{file_stem}.png"))
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)
        return ece

    y_true = data["is_correct"].astype(float).values
    # Visual fidelity is optional; metrics that need it evaluate to None below.
    vf = column_or_none("visual_fidelity")
    cs = data["contrastiveness_score"].astype(float).values

    # Generate metrics based on the global dictionary.
    metrics = {name: func(vf, cs) for name, func in METRIC_FUNCTIONS.items()}

    ece_values = {}
    for metric_name, scores in metrics.items():
        if scores is None:
            print(f"Skipping {metric_name} for {dataset_type} {model} due to missing data.")
            continue
        # '*' is replaced because it is problematic in file names.
        ece_values[metric_name] = render(scores, metric_name, metric_name.replace('*', 'x'))

    # Plot additional metrics if their columns exist.
    additional_metrics = {
        "Support": column_or_none("entail_prob"),
        "informative": column_or_none("informative"),
        "Commonsense_Plausibility": column_or_none("commonsense_plausibility"),
    }

    for metric_name, values in additional_metrics.items():
        if values is not None:
            ece_values[metric_name] = render(values, metric_name, metric_name)
        else:
            print(f"{metric_name} column not available for {dataset_type} {model}.")

    return ece_values

# def create_large_combined_plot(results_list):
#     """
#     Creates one large figure (6 rows x 3 columns) for all dataset–model combinations.
#     Each row corresponds to one dataset–model pair and each column to one metric:
#       - Column 1: VisualFidelity
#       - Column 2: Contrastiveness
#       - Column 3: VF*Contrastiveness
#     The large figure is saved in ece_analysis/.
#     """
#     num_combos = len(results_list)
#     metric_names = list(METRIC_FUNCTIONS.keys())
#     num_metrics = len(metric_names)
    
#     fig, axs = plt.subplots(num_combos, num_metrics, figsize=(5*num_metrics, 4*num_combos), dpi=200)
    
#     # Ensure axs is 2D (in case of only one row)
#     if num_combos == 1:
#         axs = np.array([axs])
    
#     # For each dataset-model combination:
#     for idx, item in enumerate(results_list):
#         dataset_type = item["Dataset"]
#         model = item["Model"]
#         data = item["data"]
#         y_true = data["is_correct"].astype(float).values
        
#         vf = data["visual_fidelity"].astype(float).values
#         cs = data["contrastiveness_score"].astype(float).values
        
#         # Compute each metric using the functions from the dictionary.
#         computed_metrics = {}
#         for metric_name, func in METRIC_FUNCTIONS.items():
#             if vf is not None:
#                 computed_metrics[metric_name] = func(vf, cs)
#             else:
#                 computed_metrics[metric_name] = None
                
        
#         # Plot each metric in the corresponding column
#         for col, metric_name in enumerate(metric_names):
#             ax = axs[idx, col]
#             scores = computed_metrics[metric_name]
#             title = f"{dataset_type} | {model}\n{metric_name}"
#             if scores is not None:
#                 plot_calibration_curve(y_true, scores, ax, title)
#             else:
#                 ax.text(0.5, 0.5, f"{metric_name} data not available", horizontalalignment='center', verticalalignment='center')
#                 ax.set_title(title)
    
#     fig.tight_layout()
#     os.makedirs("ece_analysis", exist_ok=True)
#     save_path = os.path.join("ece_analysis", "large_combined_plot.png")
#     fig.savefig(save_path)
#     plt.close(fig)

def create_large_combined_plot(results_list):
    """
    Creates a large figure for all dataset–model combinations.
    For each combination, we include both the original metrics (from METRIC_FUNCTIONS) and additional ones if available.

    The grid has one row per entry of `results_list` and one column per metric
    name (sorted union over all entries). Panels whose metric is unavailable
    for that entry show a placeholder text instead of a curve. The figure is
    saved to ece_analysis/large_combined_plot.png.

    Args:
        results_list: List of dicts with keys "Dataset", "Model" and "data"
            (a DataFrame containing at least `is_correct`). Each dict is
            updated in place with "metrics" and "metric_names" entries.

    Returns:
        None. Nothing is drawn when there are no entries or no metrics.
    """
    # First pass: compute the available metric score arrays for every entry.
    for item in results_list:
        data = item["data"]
        metrics = {}
        if "visual_fidelity" in data.columns and "contrastiveness_score" in data.columns:
            vf = data["visual_fidelity"].astype(float).values
            cs = data["contrastiveness_score"].astype(float).values
            for metric_name, func in METRIC_FUNCTIONS.items():
                metrics[metric_name] = func(vf, cs)
        # Add additional metrics if available ("Support" is read from `entail_prob`).
        if "entail_prob" in data.columns:
            metrics["Support"] = data["entail_prob"].astype(float).values
        if "informative" in data.columns:
            metrics["Informative"] = data["informative"].astype(float).values
        if "commonsense_plausibility" in data.columns:
            metrics["Commonsense_Plausibility"] = data["commonsense_plausibility"].astype(float).values
        item["metrics"] = metrics  # store for later use
        item["metric_names"] = list(metrics.keys())

    # Use the union of metric names so every row has the same columns.
    all_metric_names = sorted({name for item in results_list for name in item["metric_names"]})
    num_metrics = len(all_metric_names)
    num_combos = len(results_list)
    if num_combos == 0 or num_metrics == 0:
        # plt.subplots cannot build a grid with zero rows or columns.
        print("No metrics available; skipping the combined calibration plot.")
        return

    # squeeze=False keeps `axs` two-dimensional even for a single row or column.
    fig, axs = plt.subplots(num_combos, num_metrics,
                            figsize=(5*num_metrics, 4*num_combos), dpi=200,
                            squeeze=False)
    try:
        for idx, item in enumerate(results_list):
            dataset_type = item["Dataset"]
            model = item["Model"]
            y_true = item["data"]["is_correct"].astype(float).values
            metrics = item["metrics"]
            for col, metric_name in enumerate(all_metric_names):
                ax = axs[idx, col]
                title = f"{dataset_type} | {model}\n{metric_name}"
                scores = metrics.get(metric_name)
                if scores is not None:
                    plot_calibration_curve(y_true, scores, ax, title)
                else:
                    ax.text(0.5, 0.5, f"{metric_name} data not available",
                            horizontalalignment='center', verticalalignment='center')
                    ax.set_title(title)
        fig.tight_layout()
        os.makedirs("ece_analysis", exist_ok=True)
        save_path = os.path.join("ece_analysis", "large_combined_plot.png")
        fig.savefig(save_path)
    finally:
        # Always release the (large) figure, even if plotting or saving fails.
        plt.close(fig)


def main():
    """
    Run the full-dataset calibration analysis.

    For every dataset/model pair whose CSV exists under
    model_outputs/<dataset>/<model>.csv, computes the accuracy and the ECE of
    each metric, saves the per-metric calibration plots, and finally writes
    the summary table to ece_analysis/summary_results.csv and the combined
    plot. CSVs lacking `is_correct` or `contrastiveness_score` are skipped;
    ECE entries whose source column is missing are recorded as None.
    """
    datasets_folder = "model_outputs"
    # List your dataset identifiers and models here
    dataset_types = ["AOKVQA", "VizWiz"]
    model_list = ["llava-v1.5-7b", "qwen2.5-vl-7b-instruct", "gpt-4o-2024-05-13"]
    required_columns = ("is_correct", "contrastiveness_score")

    def float_column_or_none(frame, column):
        """Return `frame[column]` as a float array, or None if the column is absent."""
        if column not in frame.columns:
            return None
        return frame[column].astype(float).values

    def ece_or_none(labels, scores):
        """ECE of `scores`, or None when the scores are unavailable."""
        return None if scores is None else compute_ece(labels, scores)

    summary_results = []
    combined_results = []

    for dataset_type in dataset_types:
        for model in model_list:
            csv_path = os.path.join(datasets_folder, dataset_type, f"{model}.csv")
            if not os.path.exists(csv_path):
                print(f"CSV file not found: {csv_path}")
                continue

            data = pd.read_csv(csv_path)
            # Check necessary columns (report the first missing one and skip).
            missing = next((c for c in required_columns if c not in data.columns), None)
            if missing is not None:
                print(f"'{missing}' column not found in {csv_path}.")
                continue

            correctnesses = data["is_correct"].astype(float).values
            contrastiveness_scores = data["contrastiveness_score"].astype(float).values
            visual_fidelities = float_column_or_none(data, "visual_fidelity")

            # Combined scores only exist when visual fidelity is available.
            if visual_fidelities is not None:
                vf_product = visual_fidelities * contrastiveness_scores
                vf_min = np.minimum(visual_fidelities, contrastiveness_scores)
                vf_avg = (visual_fidelities + contrastiveness_scores) / 2
            else:
                vf_product = vf_min = vf_avg = None

            # "Support" scores are stored in the `entail_prob` column.
            support = float_column_or_none(data, "entail_prob")
            informative = float_column_or_none(data, "informative")
            commonsense_plausibility = float_column_or_none(data, "commonsense_plausibility")

            summary_results.append({
                "Dataset": dataset_type,
                "Model": model,
                "Accuracy": np.mean(correctnesses),
                "ECE VisualFidelity": ece_or_none(correctnesses, visual_fidelities),
                "ECE Contrastiveness": compute_ece(correctnesses, contrastiveness_scores),
                "ECE VF*Contrastiveness": ece_or_none(correctnesses, vf_product),
                "ECE Min(VF,Contrast)": ece_or_none(correctnesses, vf_min),
                "ECE Avg(VF,Contrast)": ece_or_none(correctnesses, vf_avg),
                "ECE Support": ece_or_none(correctnesses, support),
                "ECE informative": ece_or_none(correctnesses, informative),
                "ECE Commonsense_Plausibility": ece_or_none(correctnesses, commonsense_plausibility)
            })

            # Save individual calibration plots for this dataset-model combination
            save_individual_plots(data, dataset_type, model)

            # Save info for the combined large plot
            combined_results.append({
                "Dataset": dataset_type,
                "Model": model,
                "data": data
            })

    # Print summary table
    summary_df = pd.DataFrame(summary_results)
    print(summary_df)
    # The output directory may not exist yet if no CSV was processed.
    os.makedirs("ece_analysis", exist_ok=True)
    summary_df.to_csv("ece_analysis/summary_results.csv", index=False)

    # Create and save the large combined plot (one row per dataset-model pair)
    if combined_results:
        create_large_combined_plot(combined_results)

# Entry point: run the full-dataset analysis when this cell/script is executed directly.
if __name__ == "__main__":
    main()

In [None]:
import glob
import json

def collect_human_study_ids(folder: str) -> set[str]:
    """
    Gather the `question_id` of every entry in every *.json file directly
    inside `folder` (e.g. 000.json … 009.json).

    Each file is expected to hold a JSON list of objects with a `question_id`
    key. Ids are returned as strings, regardless of their JSON type.
    """
    pattern = os.path.join(folder, "*.json")
    collected: set[str] = set()
    for json_path in sorted(glob.glob(pattern)):
        with open(json_path, "r", encoding="utf-8") as handle:
            entries = json.load(handle)
        # Normalise to str so ids compare equal to string-typed CSV columns.
        collected.update(str(entry["question_id"]) for entry in entries)
    return collected


def compute_subset_ece(csv_path: str, id_subset: set[str],
                       model_name: str, dataset_name: str) -> dict:
    """
    Loads `csv_path`, keeps the rows whose `index` column value is in
    `id_subset`, then calculates all ECE variants defined in METRIC_FUNCTIONS.

    ECEs for "Support" (column `entail_prob`), "Informative" and
    "Commonsense_Plausibility" are added when their columns exist. The
    per-metric calibration plots for the subset are saved as a side effect
    under the dataset name `<dataset_name>_HUMAN`.

    Returns a dict {"Accuracy": ..., metric_name: ece_value, ...}.

    Raises:
        FileNotFoundError: If `csv_path` does not exist.
        ValueError: If no row matches `id_subset`.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(csv_path)

    df = pd.read_csv(csv_path, dtype={"index": str})   # keep ids as str
    df = df[df["index"].isin(id_subset)].copy()

    if df.empty:
        raise ValueError(f"No rows from {csv_path} match the provided ids.")

    print(f"Loaded {len(df)} rows from {csv_path} for model {model_name}.")

    y_true = df["is_correct"].astype(float).values
    vf     = df["visual_fidelity"].astype(float).values
    cs     = df["contrastiveness_score"].astype(float).values

    results = {"Accuracy": float(np.mean(y_true))}
    for name, fn in METRIC_FUNCTIONS.items():
        results[name] = compute_ece(y_true, fn(vf, cs))

    # Optional extra metrics: result label -> source column.
    optional_metrics = {
        "Support": "entail_prob",
        "Informative": "informative",
        "Commonsense_Plausibility": "commonsense_plausibility",
    }
    for label, column in optional_metrics.items():
        if column in df.columns:
            # Coerce to float like the core metrics above.
            results[label] = compute_ece(y_true, df[column].astype(float).values)

    # === save per-metric calibration plots just like before ===
    save_individual_plots(df, f"{dataset_name}_HUMAN", model_name)

    return results


def main_human_study():
    """
    Compute the ECE metrics on the human-study question subsets of two models,
    print the resulting table and save it to ece_analysis/human_study_ECE.csv.
    """
    # (model name, dataset name, folder with the study question JSONs, model output CSV)
    study_slices = [
        (
            "llava-v1.5-7b",
            "AOKVQA",
            "human_study_questions/llava1.5_with_image_q20_i10_s0",
            "model_outputs/AOKVQA/llava-v1.5-7b.csv",
        ),
        (
            "qwen2.5-vl-7b-instruct",
            "VizWiz",
            "human_study_questions/qwen2.5_vizwiz_q10_i10_s0",
            "model_outputs/VizWiz/qwen2.5-vl-7b-instruct.csv",
        ),
    ]

    # ---------- 1. collect every id set up front ----------
    id_sets = [
        collect_human_study_ids(question_folder)
        for _, _, question_folder, _ in study_slices
    ]

    # ---------- 2. compute ECE on each slice ----------
    rows = []
    for (model_name, dataset_name, _, csv_path), ids in zip(study_slices, id_sets):
        slice_result = compute_subset_ece(
            csv_path,
            ids,
            model_name=model_name,
            dataset_name=dataset_name,
        )
        rows.append({"Model": model_name, **slice_result})

    # ---------- 3. print / save a summary (one row per model) ----------
    summary = pd.DataFrame(rows).set_index("Model").round(4)
    print("\n=== Human-study ECE (10-bin) ===")
    print(summary)
    summary.to_csv("ece_analysis/human_study_ECE.csv")

# Entry point for this cell: only the human-study slice is analysed here.
if __name__ == "__main__":
    # Optionally run the full-dataset analysis first by uncommenting:
    # main()

    # then run the human-study slice
    main_human_study()


In [None]:
# Compute ECE for a specific JSON file
import os
import json
import pandas as pd
import numpy as np

# Path to the JSON file
file_path = 'human_study_questions/llava-v1.5-7b_aokvqa_sampled.json'

# Defined up front so the final display line works even when the file is missing.
result_df = pd.DataFrame(columns=['Metric', 'ECE'])

if not os.path.exists(file_path):
    print(f"File not found: {file_path}. Please ensure the JSON file is located at this path.")
else:
    # Load JSON data
    with open(file_path, 'r', encoding='utf-8') as f:
        records = json.load(f)

    # Create DataFrame
    df = pd.DataFrame(records)

    # Prepare fields (all scores and the correctness label as floats)
    df['is_correct'] = df['prediction_is_correct'].astype(float)
    for score_column in ('visual_fidelity', 'contrastiveness', 'uniform_random_score'):
        df[score_column] = df[score_column].astype(float)

    # Define ECE computation.
    # NOTE: this rebinds the notebook-level `compute_ece`; this variant divides
    # by the total number of scores and uses a closed last bin [.., 1.0].
    def compute_ece(y_true, y_prob, n_bins=10):
        """Return the ECE over `n_bins` equal-width bins, or None for empty input."""
        y_true = np.array(y_true)
        y_prob = np.array(y_prob)
        total = len(y_prob)
        if total == 0:
            return None  # avoid dividing by zero
        bin_edges = np.linspace(0, 1, n_bins + 1)
        ece = 0.0

        for i in range(n_bins):
            lower, upper = bin_edges[i], bin_edges[i + 1]
            if i == n_bins - 1:
                # The last bin is closed on the right so that 1.0 is counted.
                mask = (y_prob >= lower) & (y_prob <= upper)
            else:
                mask = (y_prob >= lower) & (y_prob < upper)
            in_bin = mask.sum()
            if in_bin > 0:
                bin_acc = y_true[mask].mean()
                bin_conf = y_prob[mask].mean()
                ece += in_bin * abs(bin_acc - bin_conf)
        return ece / total

    # Metric definitions.
    # NOTE: this rebinds the notebook-level METRIC_FUNCTIONS; these variants do
    # not handle a missing (None) visual fidelity.
    METRIC_FUNCTIONS = {
        "VisualFidelity": lambda vf, cs: vf,
        "Contrastiveness": lambda vf, cs: cs,
        "VF*Contrastiveness": lambda vf, cs: vf * cs,
        "Min(VF,Contrast)": lambda vf, cs: np.minimum(vf, cs),
        "Avg(VF,Contrast)": lambda vf, cs: (vf + cs) / 2
    }

    # Compute ECE for each metric
    y_true = df['is_correct']
    vf = df['visual_fidelity']
    cs = df['contrastiveness']
    rand = df['uniform_random_score']

    results = [
        {'Metric': name, 'ECE': compute_ece(y_true, func(vf, cs))}
        for name, func in METRIC_FUNCTIONS.items()
    ]

    # Compute ECE for random score (baseline)
    ece_random = compute_ece(y_true, rand)
    results.append({'Metric': 'Random Score', 'ECE': ece_random})

    # Display results
    result_df = pd.DataFrame(results)

result_df

In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

# Calibration helpers

def plot_calibration_curve(y_true, y_prob, ax, title):
    """
    Draw a 10-bin calibration bar chart on `ax` and return the ECE.

    Args:
        y_true: Array-like of per-instance correctness labels (converted to float).
        y_prob: Array-like of per-instance scores, aligned with `y_true`.
        ax: Matplotlib axes to draw on; a colorbar is attached next to it.
        title: Axes title; the ECE is appended on a second line when available.

    Returns:
        The count-weighted mean of |bin accuracy - bin mean score| over the
        non-empty bins, or None when no score falls into any bin.

    The last bin's upper bound is 1.01 so that a score of exactly 1.0 is
    counted. Scores outside [0, 1.01) (including NaN) are not counted.
    """
    n_bins = 10
    # Fixed colour scale (0..100 instances) shared by the bars and the colorbar.
    max_count = 100

    # Positional arrays: lets callers pass lists or pandas Series as well.
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)

    bin_num_positives = []   # per-bin mean of y_true (accuracy); 0 for empty bins
    bin_mean_probs = []      # per-bin mean score; 0 for empty bins
    bin_num_instances = []
    bin_centers = []

    for i in range(n_bins):
        lower = i / n_bins
        upper = (i + 1) / n_bins if i != n_bins - 1 else 1.01
        in_bin = (y_prob >= lower) & (y_prob < upper)
        count = int(in_bin.sum())
        if count > 0:
            bin_num_positives.append(np.mean(y_true[in_bin]))
            bin_mean_probs.append(np.mean(y_prob[in_bin]))
        else:
            bin_num_positives.append(0)
            bin_mean_probs.append(0)
        bin_num_instances.append(count)
        bin_centers.append(i + 0.5)

    total_instances = np.sum(bin_num_instances)
    ece = sum(count * abs(pos - prob)
              for pos, prob, count in zip(bin_num_positives, bin_mean_probs, bin_num_instances))
    ece = ece / total_instances if total_instances > 0 else None

    df = pd.DataFrame({
        'bin_centers': bin_centers,
        'accuracy': bin_num_positives,
        'counts': bin_num_instances
    })
    norm = mpl.colors.Normalize(vmin=0, vmax=max_count)
    cmap = plt.get_cmap('crest')
    sns.barplot(x='bin_centers', y='accuracy', data=df, ax=ax,
                hue='counts', palette='crest', hue_norm=(0, max_count),
                edgecolor='black', linewidth=1.2, width=1)
    # Bars sit at categorical positions 0..n_bins-1, so the x-range
    # [-0.5, n_bins-0.5] corresponds to scores [0, 1]; this is the diagonal.
    ax.plot([-0.5, n_bins-0.5], [0, 1], 'k--')
    ax.set_xlim(-0.5, n_bins-0.5)
    ax.set_ylim(0, 1)
    ax.set_xlabel('Metric Score')
    ax.set_ylabel('Prediction Accuracy')
    # Put the ticks on the bar edges and label them with the bin boundaries
    # (0.0 .. 1.0) so the labels line up with the diagonal.
    ax.set_xticks(np.arange(-0.5, n_bins + 0.5, 1))
    ax.set_xticklabels([f"{i/n_bins:.1f}" for i in range(n_bins + 1)], rotation=45)
    ax.set_title(f"{title}\nECE = {ece:.4f}" if ece is not None else title)
    ax.legend().remove()
    # Colorbar replaces the legend for the instance counts.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    plt.colorbar(sm, ax=ax, orientation='vertical', pad=0.02).set_label('Number of Instances')
    return ece


def compute_ece(y_true, y_prob, n_bins=10):
    """
    Compute the Expected Calibration Error (ECE) of `y_prob` against `y_true`.

    Args:
        y_true: Array-like of correctness labels (converted to float).
        y_prob: Array-like of scores with the same shape as `y_true`.
        n_bins: Number of equal-width bins over [0, 1].

    Returns:
        The count-weighted mean of |bin accuracy - bin mean score| over the
        non-empty bins, or None when no score falls into any bin.

    Raises:
        ValueError: If `y_true` and `y_prob` do not have the same shape.

    The last bin's upper bound is 1.01 so that a score of exactly 1.0 is
    counted. Scores outside [0, 1.01) (including NaN) are not counted.
    """
    # Accept lists / pandas Series as well as NumPy arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)
    if y_true.shape != y_prob.shape:
        raise ValueError(
            f"y_true and y_prob must have the same shape, got {y_true.shape} and {y_prob.shape}"
        )

    weighted_gap = 0.0
    total = 0
    for i in range(n_bins):
        lower = i / n_bins
        upper = (i + 1) / n_bins if i != n_bins - 1 else 1.01
        in_bin = (y_prob >= lower) & (y_prob < upper)
        count = int(in_bin.sum())
        if count == 0:
            continue  # empty bins carry no weight
        weighted_gap += count * abs(np.mean(y_true[in_bin]) - np.mean(y_prob[in_bin]))
        total += count
    return weighted_gap / total if total > 0 else None


def _format_ece(value):
    """Format an ECE for the progress printout; 'n/a' when it is unavailable."""
    return "n/a" if value is None else f"{value:.4f}"


def random_sample_ece_analysis(dataset, csv_path, output_base, n_samples=30, sample_size=100):
    """
    Repeatedly draw class-balanced random samples and analyse their calibration.

    For each run i in 1..n_samples, `sample_size // 2` correct and
    `sample_size // 2` incorrect rows are sampled from the CSV (seeded with i).
    For every available metric the ECE is computed and three figures are saved
    under `<output_base>/sample_<i>/`: the calibration curve, the score
    distribution, and the score distribution split by correctness. A summary
    of all ECEs is written to `<output_base>/ece_summary.csv`, and the run with
    the lowest mean ECE is reported.

    Args:
        dataset: Dataset name; 'VizWiz' ranks runs by the VisualFidelity ECE
            only, anything else by the mean of the VisualFidelity,
            Contrastiveness and VFxContrastiveness ECEs.
        csv_path: CSV with `is_correct`, `visual_fidelity` and
            `contrastiveness_score` columns; `entail_prob`, `informative` and
            `commonsense_plausibility` are optional.
        output_base: Directory that receives the per-sample folders and summary.
        n_samples: Number of random samples (runs) to draw.
        sample_size: Total rows per sample (half correct, half incorrect).
    """
    data = pd.read_csv(csv_path)
    pos_df = data[data.is_correct == 1]
    neg_df = data[data.is_correct == 0]
    per_class = sample_size // 2
    # 20 equal-width histogram bins over [0, 1] (loop-invariant).
    bin_edges = np.linspace(0, 1, 21)
    records = []

    for i in range(1, n_samples + 1):
        rng = np.random.RandomState(i)
        pos_samp = pos_df.sample(per_class, random_state=i)
        neg_samp = neg_df.sample(per_class, random_state=i)
        sample_df = pd.concat([pos_samp, neg_samp]).reset_index(drop=True)

        def optional_scores(column, frame=sample_df):
            """Return `frame[column]` as a float array, or None if absent."""
            if column not in frame.columns:
                return None
            return frame[column].astype(float).values

        y_true = sample_df['is_correct'].astype(float).values
        vf = sample_df['visual_fidelity'].astype(float).values
        cs = sample_df['contrastiveness_score'].astype(float).values

        metrics = {
            'VisualFidelity': vf,
            'Contrastiveness': cs,
            'VFxContrastiveness': vf * cs,
            'Min(VF,Contrast)': np.minimum(vf, cs),
            'Avg(VF,Contrast)': (vf + cs) / 2,
            'UniformRandom': rng.rand(len(vf)),         # uniform [0,1]
            'FlippedProd': 1.0 - (vf * cs),             # 1 − product
            'FlippedMin': 1.0 - np.minimum(vf, cs),     # 1 − min
            'FlippedAvg': 1.0 - (vf + cs) / 2,          # 1 − avg
            'Support': optional_scores('entail_prob'),
            'Informative': optional_scores('informative'),
            'Commonsense_Plausibility': optional_scores('commonsense_plausibility'),
        }

        # Compute every ECE exactly once; unavailable metrics stay None.
        eces = {
            name: compute_ece(y_true, vals) if vals is not None else None
            for name, vals in metrics.items()
        }

        sample_dir = os.path.join(output_base, f'sample_{i}')
        os.makedirs(sample_dir, exist_ok=True)

        # Progress printout (missing metrics are shown as 'n/a').
        fmt = _format_ece
        print(f"Run {i:02}: Acc={np.mean(y_true):.4f}, ECE VF={fmt(eces['VisualFidelity'])}, CR={fmt(eces['Contrastiveness'])}, Avg={fmt(eces['Avg(VF,Contrast)'])}", end=', ')
        print(f"Min={fmt(eces['Min(VF,Contrast)'])}, PROD={fmt(eces['VFxContrastiveness'])}", end=', ')
        print(f"SPT={fmt(eces['Support'])}, INFO={fmt(eces['Informative'])}, COMSP={fmt(eces['Commonsense_Plausibility'])}")
        print(f"UR={fmt(eces['UniformRandom'])}, FLIPPROD={fmt(eces['FlippedProd'])}, FLIPMIN={fmt(eces['FlippedMin'])}, FLIPAVG={fmt(eces['FlippedAvg'])}")

        rec = {'sample': i}
        for name, vals in metrics.items():
            if vals is None:
                continue
            rec[name] = eces[name]

            # Calibration curve
            fig, ax = plt.subplots(figsize=(6, 4), dpi=200)
            plot_calibration_curve(y_true, vals, ax, name)
            fig.tight_layout()
            out_path = os.path.join(sample_dir, f"{name.replace('*','x')}.png")
            fig.savefig(out_path)
            plt.close(fig)

            # Overall score distribution (drawn on an implicit new figure)
            sns.histplot(vals, bins=bin_edges, kde=True)
            plt.title(f"Distribution of {name} scores")
            plt.xlabel(name)
            plt.ylabel("Frequency")
            plt.savefig(os.path.join(sample_dir, f"{name}_distribution.png"))
            plt.close()

            # Score distribution split by correctness
            plt.figure(figsize=(6, 4), dpi=200)
            plt.title(f"Distribution of {name} scores for correct and incorrect answers")
            sns.histplot(vals[y_true == 1], bins=bin_edges, kde=False, label='Correct', color='tab:green')
            sns.histplot(vals[y_true == 0], bins=bin_edges, kde=False, label='Incorrect', color='tab:red')
            plt.xlabel(name)
            plt.ylabel("Frequency")
            plt.legend()
            plt.savefig(os.path.join(sample_dir, f"{name}_correct_incorrect_distribution.png"))
            plt.close()

        records.append(rec)

    if not records:
        # Nothing was sampled, so there are no ECE columns to summarise.
        print("No samples were drawn (n_samples < 1); nothing to summarize.")
        return

    # Save summary and identify best run
    summary_df = pd.DataFrame(records)
    if dataset == 'VizWiz':
        cols = ['VisualFidelity']
    else:
        cols = ['VisualFidelity', 'Contrastiveness', 'VFxContrastiveness']

    summary_df['mean_ece'] = summary_df[cols].mean(axis=1)
    best = summary_df.loc[summary_df['mean_ece'].idxmin()]
    print(f"\nBest run: sample_{int(best['sample'])} with mean ECE among {cols} = {best['mean_ece']:.4f}\n")

    os.makedirs(output_base, exist_ok=True)
    summary_df.to_csv(os.path.join(output_base, 'ece_summary.csv'), index=False)

    print(f"Completed runs. Summary saved to {output_base}/ece_summary.csv")


# Entry point for this cell: run the random-sample ECE analysis for one
# dataset/model pair. Swap the commented assignments to analyse another pair.
if __name__ == '__main__':
    model = 'llava-v1.5-7b'
    # dataset = 'AOKVQA'
    # model = 'qwen2.5-vl-7b-instruct'
    dataset = 'VizWiz'
    csv_path = os.path.join('model_outputs', dataset, f'{model}.csv')
    out_base = os.path.join('ece_analysis', f'{dataset}_random_sampled', model)
    random_sample_ece_analysis(dataset, csv_path, out_base, n_samples=50, sample_size=100)
    # Second configuration (disabled); note that `dataset` is the first argument:
    # model = 'qwen2.5-vl-7b-instruct'
    # dataset = 'AOKVQA'
    # csv_path = os.path.join('model_outputs', dataset, f'{model}.csv')
    # out_base = os.path.join('ece_analysis', f'{dataset}_random_sampled', model)
    # random_sample_ece_analysis(dataset, csv_path, out_base, n_samples=50, sample_size=100)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Summary table written by main() in the full-dataset analysis cell; that cell
# must have been executed first so the CSV exists.
summary_csv_path = "ece_analysis/summary_results.csv"
summary_df = pd.read_csv(summary_csv_path)

# Columns holding the ECE of each metric.
metric_columns = [
    "ECE VisualFidelity",
    "ECE Contrastiveness",
    "ECE VF*Contrastiveness",
    "ECE Min(VF,Contrast)",
    "ECE Avg(VF,Contrast)",
    "ECE Support",
    "ECE informative",
    "ECE Commonsense_Plausibility",
]

# Coerce every metric column to numeric; unparsable entries become NaN.
summary_df[metric_columns] = summary_df[metric_columns].apply(pd.to_numeric, errors='coerce')

# Long format: one row per (Dataset, Model, Metric), without missing ECE values.
melted_df = (
    summary_df
    .melt(id_vars=["Dataset", "Model"],
          value_vars=metric_columns,
          var_name="Metric",
          value_name="ECE")
    .dropna(subset=["ECE"])
)

# Overlay one KDE curve per metric on a single figure.
plt.figure(figsize=(12, 6))
metrics = melted_df["Metric"].unique()
for metric in metrics:
    is_current_metric = melted_df["Metric"] == metric
    subset = melted_df[is_current_metric]
    sns.kdeplot(data=subset, x="ECE", label=metric, fill=True, common_norm=False)

plt.title("KDE Plot for ECE Distributions for Each Metric")
plt.xlabel("Expected Calibration Error (ECE)")
plt.ylabel("Density")
plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig("ece_analysis/ece_distribution_kde.png")
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import joypy
import seaborn as sns

# Apply a global Seaborn theme before any figure is created.
sns.set(style="whitegrid", context="talk", palette="viridis")

# Short display names for the ECE columns shown in the ridgeline plot.
short_names = {
    "ECE VisualFidelity": "VF",
    "ECE Contrastiveness": "CONTR",
    "ECE Avg(VF,Contrast)": "Avg(VF,CONTR)",
    "ECE Support": "SUPT",
    "ECE informative": "INFO",
    "ECE Commonsense_Plausibility": "COMMONSENSE",
}

# Load the summary table and switch to the short column names right away.
summary_df = pd.read_csv("ece_analysis/summary_results.csv").rename(columns=short_names)

# Metric columns, referred to by their new (short) names from here on.
metric_columns = ["VF", "CONTR", "Avg(VF,CONTR)", "SUPT", "INFO", "COMMONSENSE"]

# Force numeric dtypes; unparseable entries turn into NaN.
summary_df[metric_columns] = summary_df[metric_columns].apply(pd.to_numeric, errors='coerce')

# Step 1 (wide -> long): one row per (Dataset, Model, Metric); rows without an
# ECE value are dropped.
melted_df = summary_df.melt(
    id_vars=["Dataset", "Model"],
    value_vars=metric_columns,
    var_name="Metric",
    value_name="ECE",
).dropna(subset=["ECE"])

# Step 2 (long -> wide): joypy is given one column per metric, so pivot back.
# pivot_table aggregates duplicate (Dataset, Model) pairs with its default
# aggregation; the (Dataset, Model) index is then discarded.
wide_data = (
    melted_df
    .pivot_table(index=["Dataset", "Model"], columns="Metric", values="ECE")
    .reset_index(drop=True)
)

# Ridgeline plot: one KDE curve per metric column, stacked vertically.
fig, axes = joypy.joyplot(
    wide_data[metric_columns],   # restrict to (and order by) the metric columns
    colormap=plt.cm.viridis,     # colour the ridges along the viridis colormap
    linewidth=1.2,               # outline thickness of each ridge
    overlap=1,                   # vertical overlap between neighbouring ridges
    grid=True,                   # draw background grid lines
    figsize=(10, 8),
    kind="kde",                  # density curves rather than histograms
)

# Enlarge the tick labels on every axes object returned by joypy.
for ridge_ax in axes:
    ridge_ax.tick_params(axis='both', which='major', labelsize=22)

# Title and x-axis label (applied through pyplot to the current axes).
plt.title("Plot of ECE Distributions", fontsize=22, pad=20)
plt.xlabel("Expected Calibration Error (ECE)", fontsize=20)
plt.tight_layout()

# Write the figure to disk, then display it.
plt.savefig("ece_analysis/ece_distribution_ridgeline_colorful.png", dpi=300)
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the per-(Dataset, Model) ECE summary table.
summary_df = pd.read_csv("ece_analysis/summary_results.csv")

# Columns holding one ECE value per metric.
metric_columns = [
    "ECE VisualFidelity",
    "ECE Contrastiveness",
    "ECE VF*Contrastiveness",
    "ECE Min(VF,Contrast)",
    "ECE Avg(VF,Contrast)",
    "ECE Support",
    "ECE informative",
    "ECE Commonsense_Plausibility",
]

# Coerce to numeric (unparseable entries become NaN), reshape wide -> long so
# that each row is a single (Dataset, Model, Metric, ECE) record, and drop the
# records that have no ECE value.
summary_df[metric_columns] = summary_df[metric_columns].apply(pd.to_numeric, errors='coerce')
melted_df = summary_df.melt(
    id_vars=["Dataset", "Model"],
    value_vars=metric_columns,
    var_name="Metric",
    value_name="ECE",
).dropna(subset=["ECE"])

# One violin per metric, drawn on the current axes.
# NOTE(review): the title mentions "Dataset", but no dataset grouping (e.g. a
# hue) is passed to violinplot -- confirm which of the two is intended.
sns.violinplot(data=melted_df, x="Metric", y="ECE")
plt.title("ECE Distribution by Metric and Dataset")
plt.xlabel("Metric")
plt.ylabel("Expected Calibration Error (ECE)")
plt.xticks(rotation=45)

plt.tight_layout()
plt.savefig("ece_analysis/ece_distribution_box_violin.png")
plt.show()


In [None]:
# import numpy as np
# import pandas as pd
# import os
# import seaborn as sns
# import matplotlib.pyplot as plt
# import matplotlib as mpl


# def plot_calibration_curve(y_true, y_prob, ax, title):
#     """
#     Plots a calibration curve (using 10 bins) on the provided axes.
#     Returns the computed Expected Calibration Error (ECE).
#     """
#     n_bins = 10
#     bin_num_positives = []
#     bin_mean_probs = []
#     bin_num_instances = []
#     bin_centers = []
    
#     for i in range(n_bins):
#         bin_lower = i / n_bins
#         bin_upper = (i+1) / n_bins if i != n_bins - 1 else 1.01  # capture 1.0 values
#         bin_indices = [j for j in range(len(y_prob)) if bin_lower <= y_prob[j] < bin_upper]
#         if len(bin_indices) > 0:
#             bin_num_positives.append(np.mean(y_true[bin_indices]))
#             bin_mean_probs.append(np.mean(y_prob[bin_indices]))
#             bin_num_instances.append(len(bin_indices))
#         else:
#             bin_num_positives.append(0)
#             bin_mean_probs.append(0)
#             bin_num_instances.append(0)
#         bin_centers.append((i+0.5))
            
#     # Calculate ECE
#     total_instances = np.sum(bin_num_instances)
#     ece = 0
#     for pos, mean_prob, count in zip(bin_num_positives, bin_mean_probs, bin_num_instances):
        
#         ece += count * np.abs(pos - mean_prob)
#     ece = ece / total_instances if total_instances > 0 else None
    
#     # Prepare DataFrame for plotting
#     df = pd.DataFrame({
#         'bin_num_positives': bin_num_positives,
#         'bin_centers': bin_centers,
#         'bin_num_instances': bin_num_instances
#     })
    
#     # Define a fixed normalization: values 0 to 500 map consistently across plots.
#     norm_obj = mpl.colors.Normalize(vmin=0, vmax=500)
#     cmap = plt.get_cmap("crest")
#     # Compute a color for each bar based on the number of instances
#     colors = [cmap(norm_obj(count)) for count in df["bin_num_instances"]]
    
#     # Create a barplot; note the hue now encodes number of instances
#     sns.barplot(x='bin_centers', y='bin_num_positives', data=df, ax=ax,
#                 hue='bin_num_instances', edgecolor='black',
#                 linewidth=1.2, width=1, hue_norm=(0,500), palette='crest')
#     ax.grid(axis='y')
#     ax.plot([-0.5, n_bins-0.5], [0, 1], "k--")
#     ax.set_xlim(-0.5, n_bins-0.5)
#     ax.set_ylim(0, 1)
#     ax.set_xlabel("Metric Score", fontsize='x-large')
#     ax.set_ylabel("Prediction Accuracy", fontsize='x-large')
#     if ece is not None:
#         ax.set_title(f"{title}\nCalibration Error = {ece:.4f}", fontsize='large')
#     else:
#         ax.set_title(title, fontsize='large')
#     ax.set_xticks(np.arange(-0.5, n_bins+0.5, 1))
#     ax.set_xticklabels([f"{i/n_bins:.1f}" for i in range(n_bins+1)])
#     ax.legend().remove()
#     norm = plt.Normalize(0, 500)
#     sm = plt.cm.ScalarMappable(cmap="crest", norm=norm)
#     # colorbar
#     cbar = ax.figure.colorbar(sm, ax=ax, orientation='vertical', pad=0.02)
#     cbar.set_label("Number of Instances", fontsize='x-large')
#     return ece


# def compute_ece(y_true, y_prob, n_bins=10):
#     """Computes Expected Calibration Error (ECE) for given true labels and predicted scores."""
#     bin_num_positives = []
#     bin_mean_probs = []
#     bin_num_instances = []
    
#     for i in range(n_bins):
#         bin_lower = i / n_bins
#         bin_upper = (i + 1) / n_bins if i != n_bins - 1 else 1.01
#         bin_indices = np.where((y_prob >= bin_lower) & (y_prob < bin_upper))[0]
#         if len(bin_indices) > 0:
#             bin_num_positives.append(np.mean(y_true[bin_indices]))
#             bin_mean_probs.append(np.mean(y_prob[bin_indices]))
#             bin_num_instances.append(len(bin_indices))
#         else:
#             bin_num_positives.append(0)
#             bin_mean_probs.append(0)
#             bin_num_instances.append(0)
#     total_instances = np.sum(bin_num_instances)
#     ece = 0
#     for pos, mean_prob, count in zip(bin_num_positives, bin_mean_probs, bin_num_instances):
#         ece += count * np.abs(pos - mean_prob)
#     return ece / total_instances if total_instances > 0 else None

# def save_individual_plots(data, dataset_type, model):
#     """
#     For the given data (DataFrame), plots calibration curves for each metric (VisualFidelity,
#     Contrastiveness, and their product) and saves them in a folder with structure:
#     ece_analysis/dataset_type/model/PROPERTY.png
#     """
#     save_dir = os.path.join("ece_analysis", dataset_type, model)
#     os.makedirs(save_dir, exist_ok=True)
    
#     y_true = data["is_correct"].astype(float).values
#     # Get arrays (if visual_fidelity is missing, handle accordingly)
#     support_values = data["entail_prob"].astype(float).values
#     if "visual_fidelity" in data.columns:
#         visual_fidelities = data["visual_fidelity"].astype(float).values
#     else:
#         visual_fidelities = None
#     contrastiveness_scores = data["contrastiveness_score"].astype(float).values
    
#     metrics = {}
#     metrics['Support'] = support_values
#     metrics["VisualFidelity"] = visual_fidelities
#     metrics["Contrastiveness"] = contrastiveness_scores
#     # Only compute product if visual fidelity exists
#     metrics["VF*Contrastiveness"] = visual_fidelities * contrastiveness_scores if visual_fidelities is not None else None
    
#     ece_values = {}
#     for metric_name, scores in metrics.items():
#         if scores is None:
#             print(f"Skipping {metric_name} for {dataset_type} {model} due to missing data.")
#             continue
#         fig, ax = plt.subplots(figsize=(6, 4), dpi=200)
#         ece = plot_calibration_curve(y_true, scores, ax, metric_name)
#         ece_values[metric_name] = ece
#         # Save plot with filename based on property (replace * with x for filename-safety)
#         save_path = os.path.join(save_dir, f"{metric_name.replace('*','x')}.png")
#         fig.tight_layout()
#         fig.savefig(save_path)
#         plt.close(fig)
#     return ece_values

# def create_large_combined_plot(results_list):
#     """
#     Creates one large figure (6 rows x 3 columns) for all dataset–model combinations.
#     Each row corresponds to one dataset–model pair and each column to one metric:
#       - Column 1: VisualFidelity
#       - Column 2: Contrastiveness
#       - Column 3: VF*Contrastiveness
#     The large figure is saved in ece_analysis/.
#     """
#     num_combos = len(results_list)  # e.g., 6 (2 datasets x 3 models)
#     num_metrics = 4  # four metrics per combo: Support, VisualFidelity, Contrastiveness, VF*Contrastiveness
    
#     # Adjust figsize as needed; here, for example, 18 inches wide and 6 inches tall per row
#     fig, axs = plt.subplots(num_combos, num_metrics, figsize=(18, 6*num_combos), dpi=200)
    
#     # Ensure axs is 2D (in case of only one row)
#     if num_combos == 1:
#         axs = np.array([axs])
    
#     # For each dataset-model combination:
#     for idx, item in enumerate(results_list):
#         dataset_type = item["Dataset"]
#         model = item["Model"]
#         data = item["data"]
#         y_true = data["is_correct"].astype(float).values
        
#         support_values = data["entail_prob"].astype(float).values
        
        
#         if "visual_fidelity" in data.columns:
#             visual_fidelities = data["visual_fidelity"].astype(float).values
#         else:
#             visual_fidelities = None
#         contrastiveness_scores = data["contrastiveness_score"].astype(float).values
        
#         metrics = {}
#         metrics['Support'] = support_values
#         metrics["VisualFidelity"] = visual_fidelities
#         metrics["Contrastiveness"] = contrastiveness_scores
#         metrics["VF*Contrastiveness"] = visual_fidelities * contrastiveness_scores if visual_fidelities is not None else None
        
#         # Plot each metric in the corresponding column
#         for col, metric_name in enumerate(["Support","VisualFidelity", "Contrastiveness", "VF*Contrastiveness"]):
#             ax = axs[idx, col]
#             scores = metrics[metric_name]
#             title = f"{dataset_type} | {model}\n{metric_name}"
#             if scores is not None:
#                 plot_calibration_curve(y_true, scores, ax, title)
#             else:
#                 ax.text(0.5, 0.5, f"{metric_name} data not available", horizontalalignment='center', verticalalignment='center')
#                 ax.set_title(title)
    
#     fig.tight_layout()
#     os.makedirs("ece_analysis", exist_ok=True)
#     save_path = os.path.join("ece_analysis", "large_combined_plot.png")
#     fig.savefig(save_path)
#     plt.close(fig)

# def main():
#     datasets_folder = "model_outputs"
#     # List your dataset identifiers and models here
#     dataset_types = ["AOKVQA", "VizWiz"]
#     model_list = ["llava-v1.5-7b", "qwen2.5-vl-7b-instruct", "gpt-4o-2024-05-13"]
    
#     summary_results = []
#     combined_results = []
    
#     for dataset_type in dataset_types:
#         for model in model_list:
#             csv_path = os.path.join(datasets_folder, dataset_type, f"{model}.csv")
#             if not os.path.exists(csv_path):
#                 print(f"CSV file not found: {csv_path}")
#                 continue
            
#             data = pd.read_csv(csv_path)
#             # Check necessary columns
#             if "is_correct" not in data.columns:
#                 print(f"'is_correct' column not found in {csv_path}.")
#                 continue
#             if "contrastiveness_score" not in data.columns:
#                 print(f"'contrastiveness_score' column not found in {csv_path}.")
#                 continue
#             if "entail_prob" not in data.columns:
#                 print(f"'entail_prob' column not found in {csv_path}.")
#                 continue
            
#             correctnesses = data["is_correct"].astype(float).values
#             support_values = data["entail_prob"].astype(float).values
            
#             if "visual_fidelity" in data.columns:
#                 visual_fidelities = data["visual_fidelity"].astype(float).values
#             else:
#                 visual_fidelities = None
#             contrastiveness_scores = data["contrastiveness_score"].astype(float).values
            
#             # Compute overall accuracy and ECE values using the helper function
#             accuracy = np.mean(correctnesses)
#             ece_support = compute_ece(correctnesses, support_values)
#             if visual_fidelities is not None:
#                 ece_visual = compute_ece(correctnesses, visual_fidelities)
#                 ece_product = compute_ece(correctnesses, visual_fidelities * contrastiveness_scores)
#             else:
#                 ece_visual = "n/a"
#                 ece_product = "n/a"
#             ece_contrast = compute_ece(correctnesses, contrastiveness_scores)
            
            
#             summary_results.append({
#                 "Dataset": dataset_type,
#                 "Model": model,
#                 "Accuracy": accuracy,
#                 "ECE Support": ece_support,
#                 "ECE VisualFidelity": ece_visual,
#                 "ECE Contrastiveness": ece_contrast,
#                 "ECE VF*Contrastiveness": ece_product
#             })
            
#             # Save individual calibration plots for this dataset-model combination
#             save_individual_plots(data, dataset_type, model)
            
#             # Save info for the combined large plot
#             combined_results.append({
#                 "Dataset": dataset_type,
#                 "Model": model,
#                 "data": data
#             })
    
#     # Print summary table
#     summary_df = pd.DataFrame(summary_results)
#     print(summary_df)
    
#     # Create and save the large combined plot (6 rows x 3 columns)
#     if combined_results:
#         create_large_combined_plot(combined_results)

# if __name__ == "__main__":
#     main()


In [None]:
import json
# Calculate classifier calibration
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss
import pandas as pd
import seaborn as sns
import numpy as np


# CSV of per-example model outputs that the following cells analyse.
data_filename = "model_outputs/VizWiz/gpt-4o-2024-05-13.csv"
data = pd.read_csv(data_filename)
# Bare expression: as the last statement of a notebook cell it displays the DataFrame.
data

In [None]:
!conda list seaborn


In [None]:
# Pull out the columns used by the calibration plots below: the two metric
# scores and the per-example correctness flag they are compared against.
# Each of these is a pandas Series that keeps the index of `data`.
visual_fidelities = data['visual_fidelity']
contrastiveness_scores = data['contrastiveness_score']
correctnesses = data['is_correct']


def plot_calibration_curve(y_true, y_prob, ax, title, n_bins=10, hue_max=500):
    """
    Draw a calibration (reliability) bar chart on ``ax`` and return the ECE.

    Scores are grouped into ``n_bins`` equal-width bins over [0, 1]. For each
    bin the mean of ``y_true`` (bar height), the mean score and the number of
    instances are computed. The Expected Calibration Error is the
    instance-weighted mean of ``|mean(y_true) - mean(y_prob)|`` over the bins.

    Parameters
    ----------
    y_true : array-like
        Per-example correctness values (converted to float).
    y_prob : array-like
        Per-example scores, same length as ``y_true``. Scores outside
        [0, 1.01) and NaN scores fall into no bin and are ignored.
    ax : matplotlib Axes
        Axes to draw on.
    title : str
        Panel title; the ECE is appended on a second line when available.
    n_bins : int, optional
        Number of equal-width bins (default 10).
    hue_max : int, optional
        Instance count mapped to the top of the colour scale. The default of
        500 matches the 0-500 colorbar drawn next to these panels.

    Returns
    -------
    float or None
        The ECE, or ``None`` when no score falls into any bin.
    """
    # Use positional NumPy arrays so that a pandas Series with a non-default
    # index is binned by position instead of being looked up by label.
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)

    bin_num_positives, bin_mean_probs, bin_num_instances, bin_centers = [], [], [], []
    for i in range(n_bins):
        bin_lower = i / n_bins
        # The last bin's upper edge is widened so that scores of exactly 1.0 are kept.
        bin_upper = (i + 1) / n_bins if i != n_bins - 1 else 1.01
        in_bin = (y_prob >= bin_lower) & (y_prob < bin_upper)
        count = int(np.count_nonzero(in_bin))
        if count:
            bin_num_positives.append(float(np.mean(y_true[in_bin])))
            bin_mean_probs.append(float(np.mean(y_prob[in_bin])))
        else:
            # Empty bins are drawn as zero-height bars and carry zero ECE weight.
            bin_num_positives.append(0)
            bin_mean_probs.append(0)
        bin_num_instances.append(count)
        bin_centers.append((i + 0.5) / n_bins)

    # Expected calibration error; undefined (None) when nothing was binned,
    # which avoids a division by zero.
    total_instances = sum(bin_num_instances)
    if total_instances > 0:
        weighted_gap = sum(
            count * abs(pos - mean_prob)
            for pos, mean_prob, count in zip(bin_num_positives, bin_mean_probs, bin_num_instances)
        )
        ece = weighted_gap / total_instances
    else:
        ece = None

    df = pd.DataFrame({
        'bin_num_positives': bin_num_positives,
        'bin_centers': bin_centers,
        'bin_num_instances': bin_num_instances,
    })
    # Bar colour encodes how many instances fell into the bin.
    sns.barplot(x='bin_centers', y='bin_num_positives', data=df, ax=ax,
                hue='bin_num_instances', palette='crest', edgecolor='black',
                linewidth=1.2, width=1, hue_norm=(0, hue_max))
    ax.grid(axis='y')
    # Bars sit at integer positions 0..n_bins-1, so the diagonal of perfect
    # calibration runs from the left edge of the first bar to the right edge
    # of the last one.
    ax.plot([-0.5, n_bins - 0.5], [0, 1], "k--")
    ax.set_xlim(-0.5, n_bins - 0.5)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Metric Score", fontsize='x-large')
    ax.set_ylabel("Prediction Accuracy", fontsize='x-large')
    if ece is not None:
        ax.set_title(f"{title}\nCalibration Error = {ece:.4f}", fontsize='large')
    else:
        ax.set_title(title, fontsize='large')
    # Put the ticks on the bin edges and label them with the score values.
    ax.set_xticks(np.arange(-0.5, n_bins + 0.5, 1))
    ax.set_xticklabels([i / n_bins for i in range(n_bins + 1)])

    # The hue legend is dropped; the colour scale is conveyed by a colorbar instead.
    ax.legend().remove()
    return ece

import matplotlib.pyplot as plt

# One row of five panels: each raw score plus three ways of combining them.
fig, ax = plt.subplots(1, 5, figsize=(18, 4), dpi=200)
fig.suptitle("Calibration of VisualFidelity and Contrastiveness metrics", fontsize='xx-large')

# Element-wise pairs of (visual fidelity, contrastiveness) for the combined scores.
score_pairs = list(zip(visual_fidelities, contrastiveness_scores))
panels = [
    (visual_fidelities, "VisualFidelity"),
    (contrastiveness_scores, "Contrastiveness"),
    (np.array([(vf + cs) / 2 for vf, cs in score_pairs]), "(VisualFidelity + Contrastiveness)/2"),
    (np.array([min(vf, cs) for vf, cs in score_pairs]), "min(VisualFidelity, Contrastiveness)"),
    (np.array([vf * cs for vf, cs in score_pairs]), "VisualFidelity * Contrastiveness"),
]
for panel_ax, (panel_scores, panel_title) in zip(ax, panels):
    plot_calibration_curve(correctnesses, panel_scores, panel_ax, panel_title)

# Shared colour scale for the instance counts, attached to the right-most panel.
# NOTE(review): this 0-500 range should agree with the hue_norm used inside
# plot_calibration_curve, otherwise the colorbar misrepresents the bar colours.
norm = plt.Normalize(0, 500)
sm = plt.cm.ScalarMappable(cmap="crest", norm=norm)
fig.colorbar(sm, ax=ax[4])

fig.tight_layout()
plt.show()
