# Analysis
This notebook compares evaluation results obtained on a human-authored benchmark with those obtained on a synthetic (generated) benchmark.

In [None]:
# Select the experiment to analyse. The value is the base name of a YAML file
# under <PROJECT_ROOT>/configs/experiments/ (it is loaded in a later cell).
# Keep exactly one of the assignments below uncommented.
# config = "sales_context"
config = "sales_five_models"
# config = "squad_five_models"
# config = "ASQA_five_models"
# config = "ASQA_context"
# config = "Launchpad_context"
# config = "Launchpad_five_models"

In [None]:
import pandas as pd
import os
import sys

# PROJECT_ROOT must point at the project root: it is put on sys.path so the
# local `utils` package can be imported, and it is the base directory for the
# `configs` and `output` paths built in later cells.
root = os.environ.get("PROJECT_ROOT")
if not root:
    # Fail early with an actionable message instead of appending None to
    # sys.path and hitting an opaque TypeError in os.path.join later on.
    raise RuntimeError(
        "The PROJECT_ROOT environment variable is not set; it must point to "
        "the project root so that `utils`, `configs` and `output` can be found."
    )
if root not in sys.path:
    # Re-running this cell must not keep growing sys.path.
    sys.path.append(root)
import matplotlib.pyplot as plt
# NOTE(review): the helpers called in later cells (load_results,
# combine_results, plot_ecdf_metric, ...) presumably come from this wildcard
# import; consider importing them explicitly.
from utils.utils_eval import *
from functools import partial
from scipy.stats import wilcoxon
import yaml

# Show long text cells (almost) in full when DataFrames are displayed.
pd.options.display.max_colwidth = 1000

In [None]:
# `config` starts out as the experiment name (a string) and is replaced below
# by the parsed YAML mapping. Remember the name separately so this cell can be
# re-run without building a file name from the already-loaded dict.
if isinstance(config, str):
    config_name = config
config_path = os.path.join(root, "configs", "experiments", f"{config_name}.yaml")
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# First entry is the human benchmark, second the generated one; later cells
# index into this list by position.
data_types = ["Human", config["generated_dataset_name"]]
# Experiment output directory and its figure sub-directory.
output_path = os.path.join(root, "output", config["dataset_name"], config["experiment_name"])
output_path_figures = os.path.join(output_path, "figures")

In [None]:
# Load the per-model evaluation results for both benchmarks.
all_results, all_results_simple = load_results(
    output_path=output_path, data_types=data_types, buddies=config["buddies"]
)

# One frame per benchmark, built from the per-model result dicts.
df_human = pd.DataFrame.from_dict(all_results[data_types[0]], orient="columns")
df_gen = pd.DataFrame.from_dict(all_results[data_types[1]], orient="columns")

# Add each model's "false_negatives" value (taken from the simple results)
# as an extra row of the matching frame.
for data_type, frame in zip(data_types, (df_human, df_gen)):
    simple_results = all_results_simple[data_type]
    for model in config["buddies"]:
        frame.loc["false_negatives", model] = simple_results[model]["false_negatives"]

df_comb = combine_results(df_human=df_human, df_gen=df_gen)

In [None]:
# Display labels for the raw metric identifiers found in the "Metric" level.
metric_names = {
    "bleu_score": "BLEU",
    "rouge_score(mode=fmeasure)": "ROUGE",
    "rouge_score": "ROUGE",
    "non_llm_string_similarity": "Lev. Dist.",
    "false_negatives": "FN Ratio",
    "semantic_similarity": "Semantic Similarity",
    "string_present": "String Presence",
    "llm_context_precision_without_reference": "context precision",
    "context_recall": "context recall",
    "faithfulness": "faithfulness",
    "answer_relevancy": "answer relevancy",
}

# Display labels for the "Dataset" level.
dataset_names = {"Generated": "Synthetic"}

# df_comb = df_comb.rename(index=model_names, level="Model") ADD RENAMING DICT FOR MODEL NAMES IF YOU WANT
# Apply the label mappings level by level.
df_comb = (
    df_comb
    .rename(index=metric_names, level="Metric")
    .rename(index=dataset_names, level="Dataset")
)

# Distinct labels present after renaming.
metrics = set(df_comb.index.get_level_values("Metric"))
models = set(df_comb.index.get_level_values("Model"))

In [None]:
# ECDF plot of the "Semantic Similarity" metric; the figure is only shown
# (save=False), not written to output_path_figures.
ecdf_options = {
    "metric": "Semantic Similarity",
    "df": df_comb,
    "dataset": config["dataset_name"],
    "output_path": output_path_figures,
    "save": False,
    "dpi": 1000,
}
sem_sim_ecdf = plot_ecdf_metric(**ecdf_options)
plt.show()

In [None]:
# Labels in the "Metric" level that are not evaluation metrics to aggregate.
# The last entry must match the display label assigned via `metric_names`
# ("FN Ratio"); it was previously spelled "FN ratio", so it never matched and
# the row was silently kept.
non_eval_labels = {"reference", "response", "user_input", "retrieved_contexts", "FN Ratio"}

# Metrics that take part in the aggregation below.
eval_metrics = metrics - non_eval_labels

# Drop every other label from the "Metric" level (passed as a list).
df_comb_filtered = df_comb.drop(level="Metric", index=list(metrics - eval_metrics))
df_comb_filtered.head()

In [None]:
# Mean and standard deviation for every (Dataset, Metric, Model) combination.
group_levels = ["Dataset", "Metric", "Model"]
grouped = df_comb_filtered.groupby(level=group_levels)
df_comb_means = grouped.mean()
df_comb_stds = grouped.std()

# Dense rank of the mean "Value" within each (Dataset, Metric) group;
# the highest mean gets rank 1.
df_comb_means["Rank"] = (
    df_comb_means
    .groupby(["Dataset", "Metric"])["Value"]
    .rank(method="dense", ascending=False)
)
# df_comb_means.to_csv()

In [None]:
# Compute Kendall statistics from the aggregated means/ranks table
# (presumably rank agreement between the two datasets per metric — see
# get_kendall_stats in utils.utils_eval to confirm) and display the result.
df_kendall = get_kendall_stats(df=df_comb_means)
df_kendall

In [None]:
# Average of the Kendall statistics (column-wise if df_kendall is a DataFrame).
df_kendall.mean()

In [None]:
# Drop the "String Presence" metric: its Kendall tau is NaN when all models
# score the same. Note that only this one label is removed, not every
# NaN-tau metric. errors="ignore" keeps the cell re-runnable and usable with
# experiment configs that do not contain this metric at all.
df_comb_means = df_comb_means.drop(index="String Presence", level="Metric", errors="ignore")

In [None]:
# Apply compute_percentual_change to each (Dataset, Metric) group, with the
# configured baseline model passed as `baseline_model`.
baseline_model = config["baseline"]
change_vs_baseline = partial(compute_percentual_change, baseline_model=baseline_model)
df = (
    df_comb_means
    .groupby(["Dataset", "Metric"], group_keys=False)
    .apply(change_vs_baseline)
)

In [None]:
# Plot differences
# Pass every configured model except the baseline; work on a copy so the
# list stored in `config` is left untouched.
models = list(config["buddies"])
models.remove(config["baseline"])
plot_percentual_change(
    df=df,
    models=models,
    output_path=output_path_figures,
    dpi=1000,
    save=False,
    log_scale=True,
);