## Setup

### Config

In [None]:
# Names of the result columns that the statistics cells of this notebook chart.
STATISTICS = (
    "bisimilar",
    "Precision",
    "Recall",
    "F-Score",
    "total_time",
    "learning_rounds",
)

import sys

# Bare trailing expression: the notebook echoes the interpreter path as cell output.
sys.executable

### Imports

In [None]:
import sys
sys.path.append(f"..")
sys.path.append(r"../../pmsat-inference")
sys.path.append(r"../../AALPy")

from evaluation.utils import print_results_info, print_results_info_per_alg, TracedMooreSUL
from evaluation.charts import *
from evaluation.charts_pandas import *
from IPython.display import display, Markdown, Latex

## Loading Results

### Loading

In [None]:
# Log of the commands behind some of the stored result sets.
# NOTE(review): the leading numbers appear to match the learning_results_<N> directory names -- confirm.
# 25: python evaluation/learn_automata.py -o "None" -n 10 -ns 3-9 -ni 3 -no 3 --learn_num_times 3 --glitch_percent 0.0 1.0 5.0 --glitch_mode "enter_random_state" -a "APMSL" "APMSL(ddt=False)" "APMSL(ddt=False, orpg=True)" "APMSL(ONLY_RW)" "APMSL(ONLY_RW, ddt=True)" "APMSL(RW)" "APMSL(RW, ddt=False)" "APMSL(RW, ddt=False, orgp=True)" "APMSL(tc=500)" "APMSL(tc=500, ddt=False)" "APMSL(NO_REP, RW, ddt=False)"  # interrupted when starting 7state
# 26: python evaluation/learn_automata.py --learn_all_automata_from_dir example_automata/moore -a "APMSL(RW)" -o "Random" "None"  # terminated by MemoryError
# 34: python evaluation/learn_automata.py --files example_automata/moore -a "APMSL(RW)" "APMSL(ONLY_RW)" "APMSL" -o "Random" "None" --glitch_mode "enter_random_state" --glitch_percent 0.0 1.0
# 49: python evaluation/learn_automata.py -a "APMSL" "APMSL(RW)" "APMSL(ONLY_RW)" "APMSL(RW,GTT1)" "APMSL(RW,GTT2)" "APMSL(ONLY_RW,GTT2)"  -o "Random" "None" -f example_automata/moore/car_alarm.dot example_automata/moore/coffee_moore.dot --learn_num_times 10 --glitch_percent 0.0 0.5 1.0 --glitch_mode "enter_random_state"

# Directory that contains the individual result-set directories.
base_results_dir = r"../../../OLD_active-pmsat-inference-wip/"
# Select the result set to analyse by leaving exactly one of these lines uncommented.
#results_dir = r"learning_results_34"
results_dir = r"learning_results_49"
#results_dir = r"learning_results_25"

# From here on `results_dir` holds the joined path (base directory + selected result set).
results_dir = os.path.join(base_results_dir, results_dir)
results = load_results(results_dir)
print(f"Loaded {len(results)} results!")

### Cleaning

In [None]:
def remove_results(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the rows that must not be evaluated.

    Dropped are:
    * rows of the variant ``APMSL('RW', ddt=False, orgp=True)`` -- the option
      name is a typo of ``orpg``, so these runs always failed;
    * rows that recorded an exception (non-null ``exception`` column).

    The input frame is not modified.
    """
    misspelled_variant = "APMSL('RW', ddt=False, orgp=True)"

    is_known_variant = df["algorithm_name"] != misspelled_variant
    ran_without_error = df["exception"].isna()

    return df[is_known_variant & ran_without_error].copy()

def postprocess_results(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalised algorithm names and a ``model_name`` column.

    Algorithm names are rewritten in this order:
    1. ``APMSL('ONLY_RW', ddt=True)`` becomes ``APMSL('ONLY_RW')`` (``ddt=True``
       is the default anyway);
    2. a trailing empty parentheses pair ``()`` is removed;
    3. all single quotes are removed.

    ``model_name`` is the file stem of the ``original_automaton`` path.
    The input frame is not modified.
    """
    cleaned = df.copy()

    names = cleaned["algorithm_name"]
    names = names.mask(names == "APMSL('ONLY_RW', ddt=True)", "APMSL('ONLY_RW')")
    names = names.str.replace(r"\(\)$", "", regex=True)
    names = names.str.replace("'", "", regex=False)
    cleaned["algorithm_name"] = names

    cleaned["model_name"] = cleaned["original_automaton"].apply(lambda automaton_file: Path(automaton_file).stem)

    return cleaned


def clean_results(df: pd.DataFrame) -> pd.DataFrame:
    """Filter the results with ``remove_results``, then normalise them with ``postprocess_results``."""
    filtered = remove_results(df)
    return postprocess_results(filtered)

def print_exceptions(df: pd.DataFrame):
    """Print every recorded exception together with the runs it occurred in.

    Rows with a null ``exception`` are ignored. For each distinct exception
    value one header line is printed, followed by one line per affected run
    showing the tuple (model file stem, algorithm name, oracle, glitch
    percent), and finally an empty line.
    """
    failed_runs = df.loc[df["exception"].notna()]

    for exception, runs in failed_runs.groupby('exception'):
        print(f"Exception: {exception} occurred in following combinations:")
        for _, run in runs.iterrows():
            combination = (
                Path(run['original_automaton']).stem,
                run['algorithm_name'],
                run['oracle'],
                run['glitch_percent'],
            )
            print(f"   {combination}")
        print()

# Report failed runs before cleaning: clean_results() drops every row with a recorded exception.
print_exceptions(results)
results = clean_results(results)
# The return value is not used here.
# NOTE(review): presumably this adds the GSM_* columns that later cells read from `results`
# in place -- confirm in load_gsm_comparison_data.
load_gsm_comparison_data(results_dir, results)

## Statistics

### Overall

In [None]:
def non_mat(result: pd.DataFrame):
    """Tell whether ``result`` is valid and its ``oracle`` equals the string ``"None"``.

    Returns ``False`` when ``is_valid_result(result)`` is falsy, otherwise the
    outcome of ``result.oracle == "None"``.

    NOTE(review): the parameter is annotated as a DataFrame, for which the
    comparison yields a Series rather than a bool; a single result row looks
    like the intended input -- confirm.
    """
    return result.oracle == "None" if is_valid_result(result) else False

# Pass a glitch_percent grouping to the chart helper only when the data has several glitch levels.
kwargs = {"group_by": ["glitch_percent"]} if results["glitch_percent"].nunique() > 1 else {}

# This cell charts the car_alarm model only: one chart per statistic, aggregated by mean.
car_alarm_results = results[results["model_name"] == "car_alarm"]
for stat in STATISTICS:
    bar_chart_per_algorithm_and_oracle(
        car_alarm_results,
        stat,
        agg_method="mean",
        **kwargs,
    )

### Comparison with GSM (difference)

In [None]:
# Pass a glitch_percent grouping to the chart helper only when the data has several glitch levels.
kwargs = {"group_by": ["glitch_percent"]} if results["glitch_percent"].nunique() > 1 else {}

for stat in STATISTICS:
    if stat == "learning_rounds":
        # excluded from the GSM comparison
        continue

    # Difference columns are written into `results` itself: own value minus the GSM value,
    # once for GSM with and once for GSM without purge_mismatches.
    diff_with_purge = f"{stat} vs GSM (pm)"
    diff_without_purge = f"{stat} vs GSM"
    results[diff_with_purge] = results[stat] - results[f"GSM_with_purge_mismatches.{stat}"]
    results[diff_without_purge] = results[stat] - results[f"GSM_without_purge_mismatches.{stat}"]

    # Filter after adding the columns so the car_alarm subset contains them.
    multiple_bar_charts_per_algorithm_and_oracle(
        results[results["model_name"] == "car_alarm"],
        [diff_with_purge, diff_without_purge],
        agg_method="mean",
        positioning_mode="below",
        figsize=(12, 4),
        **kwargs,
    )

### GSM comparison (separate charts)

In [None]:
# Pass a glitch_percent grouping to the chart helper only when the data has several glitch levels.
kwargs = {"group_by": ["glitch_percent"]} if results["glitch_percent"].nunique() > 1 else {}

car_alarm_results = results[results["model_name"] == "car_alarm"]
for stat in STATISTICS:
    if stat == "learning_rounds":
        # excluded from the GSM comparison
        continue

    # Chart the statistic itself next to both GSM columns for the same statistic.
    compared_columns = [
        stat,
        f"GSM_with_purge_mismatches.{stat}",
        f"GSM_without_purge_mismatches.{stat}",
    ]
    multiple_bar_charts_per_algorithm_and_oracle(
        car_alarm_results,
        compared_columns,
        agg_method="mean",
        positioning_mode="below",
        figsize=(12, 4),
        **kwargs,
    )

### Run counts per model

In [None]:
# Count the rows for every (model file, algorithm, oracle, glitch percent) combination.
run_key = ['original_automaton', 'algorithm_name', 'oracle', 'glitch_percent']
counts = results.groupby(run_key).size().reset_index(name='count')
counts_sorted = counts.sort_values(by='count', ascending=False)

# Print the combinations bucketed by their count, keeping the descending order from the sort.
for count, group in counts_sorted.groupby('count', sort=False):
    print(f"{count} times:")
    for _, row in group.iterrows():
        file = Path(row['original_automaton']).stem
        name = row['algorithm_name']
        oracle = row['oracle']
        glitch = row['glitch_percent']
        print(f"   {file, name, oracle, glitch}")  # the count is already in the bucket header
    print()

### Grouped by model (1 chart)

In [None]:
# One chart per statistic over all results, grouped by model file, glitch percent, oracle and algorithm.
model_grouping = ("original_automaton", "glitch_percent", "oracle", "algorithm_name")
for stat in STATISTICS:
    bar_chart(results, stat, agg_method="mean", group_by=model_grouping)

### Grouped by model (separate charts per model)

In [None]:
# For every model file: a sub-heading with the file stem, then one chart per statistic.
for model_file in results["original_automaton"].unique():
    model_name = Path(model_file).stem
    display(Markdown(f"#### {model_name}"))

    model_results = results[results["original_automaton"] == model_file]
    for stat in STATISTICS:
        # each call receives its own copy of the model's rows
        bar_chart(
            model_results.copy(),
            stat,
            agg_method="mean",
            group_by=("algorithm_name", "oracle", "glitch_percent"),
        )
    

### Grouped by algorithm (separate charts per algorithm)

In [None]:
# For every algorithm: a sub-heading with its name, then one chart per statistic.
for algorithm in results["algorithm_name"].unique():
    # Level-4 heading so the entries nest below the level-3 section heading of this cell,
    # matching the per-model cell, which also emits "####" sub-headings.
    display(Markdown(f"#### {algorithm}"))

    algorithm_results = results[results["algorithm_name"] == algorithm]
    for stat in STATISTICS:
        # each call receives its own copy of the algorithm's rows
        bar_chart(
            algorithm_results.copy(),
            stat,
            agg_method="mean",
            group_by=("original_automaton", "oracle", "glitch_percent"),
        )

## Comparison with GSM baseline

In [None]:
def gsm_comparison(learning_results: pd.DataFrame, comparison_results: pd.DataFrame):
    """Placeholder for a GSM comparison routine; it currently does nothing.

    The body only evaluates ``comparison_results`` and discards the value, so
    the function returns ``None`` and never uses ``learning_results``.
    """
    comparison_results

# Collect the GSM comparison results stored next to the learning results into `comparison_results`.
# When the folder is missing only a hint is printed and `comparison_results` is NOT defined.
gsm_comparison_folder = Path(results_dir) / "GSM_comparison"
if not gsm_comparison_folder.exists():
    print(f"No GSM comparison folder {gsm_comparison_folder} found.\nCreate it by running evaluation/compare_with_gsm.py {results_dir}")
else:
    data = []
    # Path.iterdir() yields entries in arbitrary order; sorting keeps the row order (and everything
    # derived from it, e.g. the order of unique algorithm names) identical between runs and machines.
    for file in sorted(gsm_comparison_folder.iterdir()):
        # only *.json files whose name does not start with "info" are loaded
        if file.name.startswith('info') or not file.name.endswith(".json"):
            continue
        # JSON text is UTF-8; do not depend on the platform's default encoding
        with open(file.resolve(), "r", encoding="utf-8") as f:
            result = json.load(f)
        # remember which file each row came from
        result["comparison_results_file"] = file.name
        data.append(result)

    # nested JSON objects become dotted column names (e.g. "GSM_with_purge_mismatches.Precision")
    comparison_results = pd.json_normalize(data)
    comparison_results["model_name"] = comparison_results["original_automaton"].apply(lambda x: Path(x).stem)
    comparison_results["apmsl_variant"] = comparison_results.apply(
        lambda row: f"{row['algorithm_name']} ({row['oracle']})", axis=1
    )

    #gsm_comparison(results, comparison_results)

In [None]:
# comparison_results.plot()

In [None]:
metrics = ["bisimilar", "Precision", "Recall", "F-Score"]
algorithms = ["apmsl_algorithm", "GSM_with_purge_mismatches", "GSM_without_purge_mismatches"]
colors = ["blue", "orange", "green"]  # one color per entry of `algorithms`

# Mean of every "<algorithm>.<metric>" column over the car_alarm rows,
# stored as metric -> list of means in the order of `algorithms`.
car_alarm_comparison = comparison_results[comparison_results["model_name"] == "car_alarm"]
means = {
    metric: [car_alarm_comparison[f"{algo}.{metric}"].mean() for algo in algorithms]
    for metric in metrics
}

x = np.arange(len(metrics))
width = 0.25

fig, ax = plt.subplots(figsize=(10, 6))

# One bar series per algorithm, shifted by one bar width each.
for i, (algo, color) in enumerate(zip(algorithms, colors)):
    heights = [means[metric][i] for metric in metrics]
    ax.bar(x + i * width, heights, width, label=algo, color=color)

ax.set_xticks(x + width)  # position of the middle one of the three bars
ax.set_xticklabels(metrics)  # metric names on the x-axis
ax.set_ylabel("Mean Value")
ax.set_title("Comparison of Algorithm Statistics")
ax.legend()

plt.show()

In [None]:
metrics = ["bisimilar", "Precision", "Recall", "F-Score"]
algorithms = ["apmsl_algorithm", "GSM_with_purge_mismatches", "GSM_without_purge_mismatches"]
colors = ["blue", "orange", "green"]

group_col = "glitch_percent"  # Change to "model_name" if needed
grouped = comparison_results.groupby(group_col)

# One subplot per metric, all sharing the y-axis.
fig, axes = plt.subplots(1, len(metrics), figsize=(16, 5), sharey=True)
width = 0.25

for ax, metric in zip(axes, metrics):
    # per-group mean of this metric's column for each algorithm
    metric_columns = [f"{algo}.{metric}" for algo in algorithms]
    means = grouped[metric_columns].mean()

    x = np.arange(len(means))

    for i, (algo, column) in enumerate(zip(algorithms, metric_columns)):
        ax.bar(x + i * width, means[column], width, label=algo, color=colors[i])

    ax.set_xticks(x + width)  # position of the middle one of the three bars
    ax.set_xticklabels(means.index, rotation=15)
    ax.set_title(metric)
    ax.set_ylabel("Mean Value")

axes[0].legend(title="Algorithm")  # Show legend only once
plt.suptitle(f"Comparison of Algorithm Statistics Grouped by {group_col}")
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Compare every APMSL variant with both GSM baselines, one subplot per metric.
# Only rows whose oracle is the string "None" and whose original automaton has more than 4 states are used.
df = comparison_results
# df = df[df["model_name"] == "car_alarm"]
df = df[df["oracle"] == "None"]
df = df[df["original_num_states"] > 4]

metrics = ["bisimilar", "Precision", "Recall", "F-Score"]
gsm_algorithms = ["GSM_with_purge_mismatches", "GSM_without_purge_mismatches"]

apmsl_variants = df["algorithm_name"].unique().tolist()
algorithms = apmsl_variants + gsm_algorithms

group_col = "glitch_percent"  # Change to "model_name" if needed

# 1 subplot for each metric
fig, axes = plt.subplots(1, len(metrics), figsize=(16, 5), sharey=True)

width = 0.08
colors = ["blue", "orange", "green", "red", "purple", "cyan", "pink", "brown"]

for idx, metric in enumerate(metrics):
    ax = axes[idx]

    # means of the APMSL variants: rows = groups, one column per algorithm name
    grouped_means_apmsl = df.groupby([group_col, "algorithm_name"])[f"apmsl_algorithm.{metric}"].mean().unstack()

    # means of the GSM algorithms: rows = groups, columns "<gsm algorithm>.<metric>"
    grouped_means_gsm = df.groupby(group_col)[[f"{algo}.{metric}" for algo in gsm_algorithms]].mean()

    # Merge them together
    grouped_means = pd.concat([grouped_means_apmsl, grouped_means_gsm], axis=1)
    grouped_means = grouped_means.reset_index()

    #print(f"\nMetric: {metric}\n", grouped_means)

    unique_groups = grouped_means[group_col].unique()
    x = np.arange(len(unique_groups))

    # loop-invariant: index the means by group once instead of once per algorithm
    means_by_group = grouped_means.set_index(group_col)

    for i, algo in enumerate(algorithms):
        # APMSL variants are columns named after the variant; GSM columns carry the metric suffix
        algo_col = algo if algo in apmsl_variants else f"{algo}.{metric}"

        if algo_col in means_by_group.columns:
            y_values = means_by_group[algo_col].reindex(unique_groups).values
        else:
            # A missing column becomes an all-NaN series. The previous fallback handed back a
            # bare float, on which the following .reindex() call raised AttributeError.
            y_values = np.full(len(unique_groups), np.nan)

        ax.bar(x + i * width, y_values, width, label=algo, color=colors[i % len(colors)])

    # Bars are centered at x + i * width for i = 0..n-1, so the middle of a group is
    # x + width * (n - 1) / 2. Using width * n / 2 placed the label half a bar too far right.
    ax.set_xticks(x + width * (len(algorithms) - 1) / 2)
    ax.set_xticklabels(unique_groups, rotation=15)
    ax.set_title(metric)
    ax.set_ylabel("Mean Value")

# legend of the current (last created) axes, anchored outside its right edge
plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
plt.suptitle(f"Comparison of Algorithm Variants Grouped by {group_col}")
plt.tight_layout()
plt.show()

