In [None]:
#! pip install "altair[all]"
import pandas as pd
import altair as alt
from pathlib import Path

In [None]:
# Versions identifying the benchmark run under analysis; both values are
# interpolated into the result, figure and log paths used in this notebook.
chatbgc_version = "0.2.1"
benchmark_version = "02d2c72"

In [None]:
# Load the tab-separated benchmark results for the configured versions,
# using the first column of the file as the row index.
df = pd.read_csv(
    f"../result/benchmark_result_{chatbgc_version}_{benchmark_version}.tsv",
    sep="\t",
    index_col=0,
)

In [None]:
# Preview the columns that identify each benchmark row.
df[["id", "question", "difficulty", "model", "llm_type"]]

In [None]:
# Back-fill benchmark runs that are absent from the results table so they are
# recorded as failures (success=False) rather than simply being missing.
df_question = pd.read_json("../test/question_sql_pair.json")

# Every row that came from the results file is flagged as successful.
# NOTE(review): presumably the results file only contains completed runs - confirm.
df.loc[:, "success"] = True

# Models expected in the benchmark, keyed by LLM backend type.
avalaible_models = {"ollama" : ["gemma2_9b", "gemma2_27b", "llama3.1_8b", "mistral-nemo"],
                    "openai_chat" : ["gpt-4o", "gpt-4o-mini"]
                   }
for q in df_question.index:
    question_id = df_question.loc[q, "id"]
    for iteration in [1, 2, 3]:
        for llm_type, models in avalaible_models.items():
            for model in models:
                # Row label format: Q_<zero-padded id>__<llm_type>__<model>__RAG_benchmark__iteration_<n>
                index_name = f"Q_{str(question_id).zfill(2)}__{llm_type}__{model}__RAG_benchmark__iteration_{iteration}"
                if index_name not in df.index:
                    # Copy the question's own fields into the new row; "sql" and
                    # "answer" are stored under their "*_expected" column names.
                    for col in df_question.columns:
                        value = df_question.loc[q, col]
                        if col == "sql":
                            col = "answer_sql_expected"
                        elif col == "answer":
                            col = "answer_summary_expected"
                        df.loc[index_name, col] = value
                    # Run metadata does not depend on the question column, so it
                    # is written once per row instead of once per column.
                    df.loc[index_name, "chatbgc_version"] = chatbgc_version
                    df.loc[index_name, "benchmark_version"] = benchmark_version
                    df.loc[index_name, "model"] = model
                    df.loc[index_name, "llm_type"] = llm_type
                    df.loc[index_name, "success"] = False

In [None]:
# Inspect the rows belonging to the gpt-4o model.
df.loc[df["model"] == "gpt-4o"]

In [None]:
# Explicit display order for the difficulty levels.
difficulty_order = ['Easy', 'Medium', 'Hard']

# Models ranked by their number of successful runs, most successful first.
model_success_order = (
    df.loc[df['success'] == True]
    .groupby('model')
    .size()
    .sort_values(ascending=False)
    .index
    .tolist()
)

# One row per (difficulty, model, success) combination with its run count.
grouped = (
    df.groupby(['difficulty', 'model', 'success'])
    .size()
    .reset_index(name='count')
)

# Point selection on the success field; it drives the bar opacity below.
selection = alt.selection_point(fields=['success'])

# Normalised stacked bars: one facet row per model, one bar per difficulty,
# segments coloured by success.
chart = (
    alt.Chart(grouped)
    .mark_bar()
    .encode(
        x=alt.X('count:Q', stack='normalize', title='Success Rate'),
        y=alt.Y('difficulty:O', title=None, sort=difficulty_order),
        color=alt.Color('success:N', title='Success'),
        order=alt.Order('success', sort='descending'),
        row=alt.Row('model:N', title=None, sort=model_success_order),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    )
    .add_params(selection)
    .properties(title='Success Count by Difficulty and Model', height=100)
)

# Render the chart in the notebook.
chart.show()

# Write the chart to an SVG file, creating the figures directory if needed.
outfile = Path(f"../figures/benchmark_success_{chatbgc_version}_{benchmark_version}.svg")
outfile.parent.mkdir(parents=True, exist_ok=True)
chart.save(outfile)

In [None]:
# Read the gpt-4o, iteration 2 benchmark log into a list of lines
# (line endings are kept).
with Path(f"../result/gpt-4o/chatbgc_{chatbgc_version}/benchmark_{benchmark_version}/iteration_2/benchmark.log").open("r") as f:
    benchmark_log = list(f)

In [None]:
# Index the benchmark log: map each question counter (1-based, in order of
# appearance) to the [start, stop] line numbers of its section, where start is
# the "Processing question" line and stop the "processed successfully" line.
# Questions without a success line get no entry.
log_data = {}
ctr = 0
# Reset on every execution so a stale value left in the kernel by an earlier
# run of this cell is never reused, and so a success line that appears before
# any question header is skipped instead of raising NameError.
start = None
for num, line in enumerate(benchmark_log):
    if 'INFO - Processing question' in line:
        print("\n-----------")
        print(num, line)
        start = num
        ctr += 1
    if 'processed successfully' in line and start is not None:
        print(num, line)
        stop = num
        log_data[ctr] = [start, stop]

In [None]:
# Show the question counter -> [start, stop] line-number index built from the log.
log_data

In [None]:
# Show the log lines of question 1, from its start line through its stop line (inclusive).
q = 1
benchmark_log[slice(log_data[q][0], log_data[q][1] + 1)]

In [None]:
# Show the log lines of question 3, from its start line through its stop line (inclusive).
q = 3
benchmark_log[slice(log_data[q][0], log_data[q][1] + 1)]