In [None]:
import os
from pathlib import Path
import sys

# Register the parent directory on the import path exactly once.
# NOTE(review): presumably this is what lets the project-local imports in this
# cell resolve when the notebook is started from its own folder - confirm.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# helpers for loading the per-response JSON evaluation files
from ai_profiling.generators.planning_data_generator_datatypes import ValidationLLMResponseJsonData
from ai_profiling.helpers.file_helper.file_helper import get_base_model_from_json, get_files_in_folder

!jupyter nbextension enable --py widgetsnbextension

# Folder that is scanned for "*.json" evaluation files.
# NOTE(review): this is an absolute path on one developer's machine; consider
# moving it to a configurable constant before sharing the notebook.
folder_path = "/Users/jungkookang/Downloads/nl2flow/output/translation"
file_paths = get_files_in_folder(folder_path=Path(folder_path), file_extension="json")

# One loaded object per file, requested as ValidationLLMResponseJsonData.
llm_response_json_evaluation_data = [
    get_base_model_from_json(file_path=json_path, base_model=ValidationLLMResponseJsonData)
    for json_path in file_paths
]


# Cell output: how many evaluation records were loaded.
len(llm_response_json_evaluation_data)

In [None]:
# add all data to each llm model
from collections import defaultdict
import pprint
import statistics
from typing import Dict, List

# Per-model accumulator keyed by the short model id. Each entry holds:
#   "total"    - number of responses seen for the model
#   "parsing"  - responses whose json_translation_stat is not None
#   "perfect"  - parsed responses where every stat field has num_correct == total
#   "num_hallucination_per_problem" / "num_missing_element_per_problem"
#              - one value per parsed response: the sum over all stat fields
# The loop at the bottom adds "mean_*" / "std_*" summaries of those two lists.
llm_bin: Dict[str, dict] = defaultdict(
    lambda: {
        "total": 0,
        "perfect": 0,
        "parsing": 0,
        "num_hallucination_per_problem": [],
        "num_missing_element_per_problem": [],
    }
)
model_ids_failed = []
for llm_response_json_evaluation_unit in llm_response_json_evaluation_data:
    # Keep only the last "/"-separated component of the id and turn the
    # dashed version markers back into dotted ones ("3-1" -> "3.1").
    llm_model_id = (llm_response_json_evaluation_unit.llm_response_planning_data.llm_response.llm_model_id).split("/")[-1]
    llm_model_id = llm_model_id.replace("3-1", "3.1").replace("3-3", "3.3")

    llm_bin[llm_model_id]["total"] += 1
    stat = llm_response_json_evaluation_unit.json_translation_stat

    # No stat object: the response counts toward "total" only.
    if stat is None:
        continue

    llm_bin[llm_model_id]["parsing"] += 1

    is_perfect_translation = True
    total_hallucination = 0
    total_missing = 0
    for field in stat.model_fields_set:
        field_obj = getattr(stat, field)
        if field_obj.num_correct != field_obj.total:
            is_perfect_translation = False

        total_hallucination += field_obj.num_hallucination
        total_missing += field_obj.num_missing

    llm_bin[llm_model_id]["num_hallucination_per_problem"].append(total_hallucination)
    llm_bin[llm_model_id]["num_missing_element_per_problem"].append(total_missing)

    if is_perfect_translation:
        llm_bin[llm_model_id]["perfect"] += 1


# Summarise the per-problem lists. -1.0 is the "not available" sentinel:
# the mean needs at least one sample, and statistics.stdev needs at least two
# (it raises StatisticsError otherwise).
for llm_model_id, model_bin in llm_bin.items():
    for list_key in ("num_hallucination_per_problem", "num_missing_element_per_problem"):
        samples = model_bin[list_key]
        model_bin[f"mean_{list_key}"] = statistics.mean(samples) if len(samples) > 0 else -1.0
        model_bin[f"std_{list_key}"] = statistics.stdev(samples) if len(samples) > 1 else -1.0

    # A model without a single parsed response is reported and dropped below.
    if len(model_bin["num_hallucination_per_problem"]) == 0:
        model_ids_failed.append(llm_model_id)
        print(f"No valid response for {llm_model_id}")

# Delete by the id collected above (not by the leftover loop variable, which
# would always point at the last model that was iterated).
for model_id in model_ids_failed:
    del llm_bin[model_id]

pprint.pp(llm_bin)

In [None]:
from copy import deepcopy
from math import ceil

from matplotlib import pyplot as plt
import numpy as np


def make_planning_chart_multi_rev(llm_bin_input, file_name_path: str, row_labels: List[str]) -> None:
    """Draw one small bar chart per metric in a two-column grid and save the figure.

    Args:
        llm_bin_input: mapping ``{metric title: {model name: value}}``. Each metric
            becomes one panel (filled row by row); each model becomes one bar,
            ordered alphabetically by model name. Negative values are drawn as
            zero-height bars without a value label. The mapping is only read,
            never modified.
        file_name_path: where the figure is written with ``plt.savefig``. Missing
            parent directories are created.
        row_labels: accepted for backward compatibility; not used by the plot.

    Raises:
        ValueError: if ``llm_bin_input`` is empty (there is nothing to plot).
    """
    from pathlib import Path  # local import keeps this cell self-contained

    if not llm_bin_input:
        raise ValueError("llm_bin_input must contain at least one metric to plot")

    n_cols = 2
    rows = int(ceil(len(llm_bin_input) / n_cols))
    # squeeze=False keeps `ax` two-dimensional even for a single row, so the
    # ax[row, col] indexing below is always valid.
    fig, ax = plt.subplots(rows, n_cols, figsize=(8, 6), squeeze=False)

    width = 0.25  # horizontal spacing between neighbouring bars
    x = np.arange(1)  # every panel has a single bar group
    panel = None
    for counter, (quality, obj) in enumerate(llm_bin_input.items()):
        row, col = divmod(counter, n_cols)
        panel = ax[row, col]

        for idx, model_name in enumerate(sorted(obj.keys())):
            value = obj[model_name]
            rects = panel.bar(
                x + width * idx,
                [(value if value >= 0.0 else 0.0)],
                width * 0.8,
                label=model_name.capitalize(),
            )
            if value >= 0.0:
                panel.bar_label(rects, padding=3, fontsize=10, fmt="%.2f")

        # Titles without the word "number" are treated as rates and pinned to [0, 1].
        if "number" not in quality:
            panel.set_ylim(0, 1.0)
        panel.xaxis.set_visible(False)
        panel.tick_params(axis="y", labelsize=12)
        panel.set_ylabel(f"{quality}", fontsize=12)
        panel.spines[["right", "top"]].set_visible(False)

    # With an odd number of metrics the last grid cell stays empty: hide it.
    for unused in range(len(llm_bin_input), rows * n_cols):
        ax[divmod(unused, n_cols)].set_visible(False)

    # Build the shared legend from the last panel that was actually drawn
    # (the bottom-right one whenever the grid is completely filled).
    handles, labels = panel.get_legend_handles_labels()
    if handles:
        labels, handles = zip(*sorted(zip(labels, handles), key=lambda t: t[0]))
        fig.legend(handles, labels, bbox_to_anchor=(1.4, 0.7), fontsize="large", loc="upper right", frameon=False)

    fig.tight_layout()
    Path(file_name_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(file_name_path, bbox_inches="tight")
    plt.show()

In [None]:
import collections

def process_model_name(model_name):
    """Return the display name for a model id.

    Only the text after the last "/" is kept. The exact name "DeepSeek-V3"
    gets the suffix "(685b, fp8)". The result is then stripped of surrounding
    whitespace and lower-cased.
    """
    short_name = model_name.rsplit("/", 1)[-1]
    suffix = "(685b, fp8)" if short_name == "DeepSeek-V3" else ""
    return (short_name + suffix).strip().lower()

# Re-key a private copy of the per-model stats by display name.
new_llm_bin = {
    process_model_name(raw_model_name): model_stat
    for raw_model_name, model_stat in deepcopy(llm_bin).items()
}

# Same entries, ordered alphabetically by display name.
ordered_dict = collections.OrderedDict(
    (display_name, new_llm_bin[display_name]) for display_name in sorted(new_llm_bin, key=str)
)

# One entry per chart panel: {panel title: {model name: value}}.
data_dict = {
    "Successful JSON Parsing Rate": {},
    "Perfect translation rate": {},
    "Mean number of\n hallucinated elements per problem": {},
    "Mean number of\n missing elements per problem": {},
}
for display_name, model_stat in ordered_dict.items():
    # Both rates are fractions of all responses recorded for the model.
    data_dict["Successful JSON Parsing Rate"][display_name] = model_stat["parsing"] / model_stat["total"]
    data_dict["Perfect translation rate"][display_name] = model_stat["perfect"] / model_stat["total"]
    data_dict["Mean number of\n hallucinated elements per problem"][display_name] = model_stat[
        "mean_num_hallucination_per_problem"
    ]
    data_dict["Mean number of\n missing elements per problem"][display_name] = model_stat[
        "mean_num_missing_element_per_problem"
    ]

make_planning_chart_multi_rev(
    llm_bin_input=data_dict,
    file_name_path="output/plot/translation_rates.png",
    row_labels=list(data_dict),
)

# Create Regression Data

In [None]:
# Feature rows and targets for the regressions in the following cells.
#   X / Y           one row per response; a Y row is [parsing_success, perfect_translation]
#   X_reg / Y_reg   only responses with a json_translation_stat and at least one
#                   counted element; a Y_reg row is
#                   [prop_correct, prop_hallucination, prop_missing]
#   *_model*        the same rows grouped by the full llm_model_id
# This cell only builds these containers; it does not modify `llm_bin`, so it
# can be re-run without changing the per-model counts computed earlier.
X = []
Y = []
X_model = {}
Y_model = {}

X_reg = []
Y_reg = []
X_model_reg = {}
Y_model_reg = {}

for llm_response_json_evaluation_unit in llm_response_json_evaluation_data:
    planning_data = llm_response_json_evaluation_unit.llm_response_planning_data
    llm_model_id = planning_data.llm_response.llm_model_id
    if llm_model_id not in X_model:
        X_model[llm_model_id] = []
        Y_model[llm_model_id] = []
        X_model_reg[llm_model_id] = []
        Y_model_reg[llm_model_id] = []

    stat = llm_response_json_evaluation_unit.json_translation_stat

    # Explanatory variables taken from the tag / generator input of the problem.
    tag = planning_data.pddl_generator_output.planning_datum_tag
    tmp_x = [
        float(tag.number_of_agents),
        float(tag.input_parameters_per_agent),
        float(tag.coupling_of_agents),
        float(planning_data.pddl_generator_output.agent_info_generator_input.proportion_slot_fillable_variables),
        float(tag.number_of_goals),
    ]
    # Candidate features that are currently left out:
    # float(tag.length_of_sequence),
    # float(len(llm_response_json_evaluation_unit.llm_response_planning_data.planning_prompt)),
    # float(len(llm_response_json_evaluation_unit.llm_response_planning_data.llm_response.generated_text)),

    parsing_success = 0
    perfect_translation = 0
    if stat is not None:
        parsing_success = 1
        is_perfect_translation = True
        total_total = 0
        total_correct = 0
        total_hallucination = 0
        total_missing = 0

        for field in stat.model_fields_set:
            field_obj = getattr(stat, field)

            total_total += field_obj.total
            total_correct += field_obj.num_correct
            total_hallucination += field_obj.num_hallucination
            total_missing += field_obj.num_missing

            if field_obj.num_correct != field_obj.total:
                is_perfect_translation = False

        perfect_translation = 1 if is_perfect_translation else 0

        # The proportions are undefined when nothing was counted; such a
        # response is left out of the *_reg data instead of dividing by zero.
        if total_total > 0:
            tmp_y_reg = [
                total_correct / total_total,
                total_hallucination / total_total,
                total_missing / total_total,
            ]
            Y_reg.append(tmp_y_reg)
            Y_model_reg[llm_model_id].append(tmp_y_reg)
            X_reg.append(tmp_x)
            X_model_reg[llm_model_id].append(tmp_x)

    tmp_y = [parsing_success, perfect_translation]

    Y.append(tmp_y)
    Y_model[llm_model_id].append(tmp_y)
    X.append(tmp_x)
    X_model[llm_model_id].append(tmp_x)

In [None]:
from sklearn.preprocessing import normalize

# L2-normalise the pooled feature matrix along axis 0.
X_1 = np.array(X, dtype=np.float64)
X_n = normalize(X_1, norm="l2", axis=0)

# Same treatment for the rows that belong to the proportion regressions.
X_1_reg = np.array(X_reg, dtype=np.float64)
X_n_reg = normalize(X_1_reg, norm="l2", axis=0)

# Per-model matrices; a model without any rows gets no entry.
X_model_n = {}
for llm_model_id, model_rows in X_model.items():
    model_matrix = np.array(model_rows, dtype=np.float64)
    if len(model_matrix) == 0:
        continue
    X_model_n[llm_model_id] = normalize(model_matrix, norm="l2", axis=0)

X_model_n_reg = {}
for llm_model_id, model_rows in X_model_reg.items():
    model_matrix = np.array(model_rows, dtype=np.float64)
    if len(model_matrix) == 0:
        continue
    X_model_n_reg[llm_model_id] = normalize(model_matrix, norm="l2", axis=0)

In [None]:
# Column positions inside a row of Y (first pair) and of Y_reg (second group).
parsing_success_idx, perfect_translation_idx = 0, 1
prop_correct_idx, prop_hallucination_idx, prop_missing_idx = 0, 1, 2

# One target vector per regression, taken column by column.
parsing_success_y = np.array([row[parsing_success_idx] for row in Y])
perfect_translation_y = np.array([row[perfect_translation_idx] for row in Y])
prop_correct_y = np.array([row[prop_correct_idx] for row in Y_reg])
prop_hallucination_y = np.array([row[prop_hallucination_idx] for row in Y_reg])
prop_missing_y = np.array([row[prop_missing_idx] for row in Y_reg])

# Parsing Success

In [None]:
import statsmodels.api as sm

from ai_profiling.helpers.file_helper.file_helper import write_txt_file


# L1-regularised logit of the parsing-success flag on the normalised features:
# first on the pooled data, then once per model. Every fit summary is written
# as CSV text to <output_folder_path>/<group_name>/<name>.csv.
# Note that no constant column is added to the design matrix in this cell.
output_folder_path = "output/regression_translation"
group_name = "parsing_success"

print("Parsing Success")
print("\n\n\nSubject: Total")
result = sm.Logit(parsing_success_y, X_n).fit_regularized(method="l1", alpha=1)
name = "total"
write_txt_file(
    file_path=os.path.join(output_folder_path, group_name, f"{name}.csv"),
    text=result.summary().as_csv(),
)

for llm_model_id, x_n in X_model_n.items():
    print(f"\n\n\nSubject: {llm_model_id}")
    y = np.array([row[parsing_success_idx] for row in Y_model[llm_model_id]])
    result = sm.Logit(y, x_n).fit_regularized(method="l1", alpha=1)
    # The file is named after the last "/"-separated part of the model id.
    name = llm_model_id.rsplit("/", 1)[-1]
    write_txt_file(
        file_path=os.path.join(output_folder_path, group_name, f"{name}.csv"),
        text=result.summary().as_csv(),
    )

# Perfect Translation

In [None]:
import statsmodels.api as sm

# L1-regularised logit of the perfect-translation flag on the normalised
# features: pooled data first, then one fit per model. Summaries are written
# as CSV text to <output_folder_path>/<group_name>/<name>.csv.
group_name = "perfect_translation"

print("Perfect Translation")
print("\n\n\nSubject: Total")

result = sm.Logit(perfect_translation_y, X_n).fit_regularized(method="l1", alpha=1)
name = "total"
write_txt_file(
    file_path=os.path.join(output_folder_path, group_name, f"{name}.csv"),
    text=result.summary().as_csv(),
)

for llm_model_id, x_n in X_model_n.items():
    print(f"\n\n\nSubject: {llm_model_id}")
    y = np.array([row[perfect_translation_idx] for row in Y_model[llm_model_id]])
    result = sm.Logit(y, x_n).fit_regularized(method="l1", alpha=1)
    # The file is named after the last "/"-separated part of the model id.
    name = llm_model_id.rsplit("/", 1)[-1]
    write_txt_file(
        file_path=os.path.join(output_folder_path, group_name, f"{name}.csv"),
        text=result.summary().as_csv(),
    )

# Proportion of Correctly Translated Elements

In [None]:
import statsmodels.api as sm

# OLS of the proportion of correctly translated elements on the normalised
# features: pooled data first, then one fit per model. sm.add_constant is
# called with prepend=False before each fit. Summaries are written as CSV
# text to <output_folder_path>/<group_name>/<name>.csv.
group_name = "correct_translation"

print("Proportion of Correctly Translated Elements")
print("\n\n\nSubject: Total")

X_n_c = sm.add_constant(X_n_reg, prepend=False)
result = sm.OLS(prop_correct_y, X_n_c).fit()
name = "total"
write_txt_file(
    file_path=os.path.join(output_folder_path, group_name, f"{name}.csv"),
    text=result.summary().as_csv(),
)

for llm_model_id, x_n in X_model_n_reg.items():
    print(f"\n\n\nSubject: {llm_model_id}")
    x_n_c = sm.add_constant(x_n, prepend=False)
    y = np.array([row[prop_correct_idx] for row in Y_model_reg[llm_model_id]])
    result = sm.OLS(y, x_n_c).fit()
    # The file is named after the last "/"-separated part of the model id.
    name = llm_model_id.rsplit("/", 1)[-1]
    write_txt_file(
        file_path=os.path.join(output_folder_path, group_name, f"{name}.csv"),
        text=result.summary().as_csv(),
    )

# Proportion of Hallucinated Elements In Translation

In [None]:
import statsmodels.api as sm

# OLS of the proportion of hallucinated elements on the normalised features:
# pooled data first, then one fit per model. sm.add_constant is called with
# prepend=False before each fit. Summaries are written as CSV text to
# <output_folder_path>/<group_name>/<name>.csv.
group_name = "hallucination"

print("Proportion of Hallucinated Elements")
print("\n\n\nSubject: Total")

X_n_c = sm.add_constant(X_n_reg, prepend=False)
result = sm.OLS(prop_hallucination_y, X_n_c).fit()
name = "total"
write_txt_file(
    file_path=os.path.join(output_folder_path, group_name, f"{name}.csv"),
    text=result.summary().as_csv(),
)

for llm_model_id, x_n in X_model_n_reg.items():
    print(f"\n\n\nSubject: {llm_model_id}")
    x_n_c = sm.add_constant(x_n, prepend=False)
    y = np.array([row[prop_hallucination_idx] for row in Y_model_reg[llm_model_id]])
    result = sm.OLS(y, x_n_c).fit()
    # The file is named after the last "/"-separated part of the model id.
    name = llm_model_id.rsplit("/", 1)[-1]
    write_txt_file(
        file_path=os.path.join(output_folder_path, group_name, f"{name}.csv"),
        text=result.summary().as_csv(),
    )

# Proportion of Missing Elements

In [None]:
import statsmodels.api as sm

# OLS of the proportion of missing elements on the normalised features:
# pooled data first, then one fit per model. sm.add_constant is called with
# prepend=False before each fit. Summaries are written as CSV text to
# <output_folder_path>/<group_name>/<name>.csv.
group_name = "missing_elements"

# The banner names the quantity this cell actually regresses (prop_missing_y).
print("Proportion of Missing Elements")
print("\n\n\nSubject: Total")

X_n_c = sm.add_constant(X_n_reg, prepend=False)
mod = sm.OLS(prop_missing_y, X_n_c)
result = mod.fit()
name = "total"
write_txt_file(file_path=os.path.join(output_folder_path, group_name, (name + ".csv")), text=result.summary().as_csv())

for llm_model_id in X_model_n_reg.keys():
    print(f"\n\n\nSubject: {llm_model_id}")
    x_n = X_model_n_reg[llm_model_id]
    x_n_c = sm.add_constant(x_n, prepend=False)
    y = np.array([row[prop_missing_idx] for row in Y_model_reg[llm_model_id]])
    mod = sm.OLS(y, x_n_c)
    result = mod.fit()
    # The file is named after the last "/"-separated part of the model id.
    name = llm_model_id.split("/")[-1]
    write_txt_file(
        file_path=os.path.join(output_folder_path, group_name, (name + ".csv")), text=result.summary().as_csv()
    )