In [None]:
import pandas as pd
import altair as alt
from pathlib import Path

In [None]:
# Models dropped from the benchmark results right after loading.
filter_out = [
    "mistral-nemo",
]

In [None]:
# Load the evaluated benchmark responses (tab-separated, first column is the index).
df = pd.read_csv("2.Evaluated_benchmark_response.tsv", sep="\t", index_col=0)

# Drop excluded models. The explicit .copy() makes `df` an independent frame so
# the column assignments below do not write into a boolean-filtered slice
# (which can raise pandas' SettingWithCopyWarning).
df = df[~df.model.isin(filter_out)].copy()

# Version tags used in the output figure file names. Only the first distinct
# value is taken -- assumes the file holds a single version of each; TODO confirm.
chatbgc_version = df.chatbgc_version.unique()[0]
benchmark_version = df.benchmark_version.unique()[0]

# Missing evaluations become an explicit "N/A" category so they show up as
# their own group in the charts instead of being dropped by groupby.
df["summary_evaluation"] = df["summary_evaluation"].fillna("N/A")
df["sql_evaluation"] = df["sql_evaluation"].fillna("N/A")
df

In [None]:
# Swap the boolean flags for the human-readable labels used by the chart legends.
# `replace` leaves any value that is not a key (e.g. the "N/A" filler) untouched.
success_labels = {True : "Success", False : "Fail"}
evaluation_labels = {True : "Correct", False : "Incorrect"}

df["sql_query_success"] = df["sql_query_success"].replace(success_labels)
df["sql_evaluation"] = df["sql_evaluation"].replace(evaluation_labels)

In [None]:
# Display orders: difficulty tiers, and the two execution outcomes.
difficulty_order = ['Easy', 'Medium', 'Hard']
barchart_order = ["Fail", "Success"]

# Rank models by their number of successfully executed queries (most first).
# Rows are filtered to "Success" before counting, so a model with no successful
# query is not part of this list.
successful_runs = df[df['sql_query_success'] == "Success"]
success_counts = successful_runs.groupby('model').size()
model_success_order = success_counts.sort_values(ascending=False).index.tolist()

# One row per (difficulty, model, outcome) holding the number of occurrences.
grouped = (
    df.groupby(['difficulty', 'model', 'sql_query_success'])
    .size()
    .reset_index(name='count')
)

# Point selection keyed on the outcome; the opacity encoding dims everything else.
selection = alt.selection_point(fields=['sql_query_success'])

# Encoding channels, built separately to keep the chart expression short.
# NOTE(review): `sort=barchart_order` is a categorical list on a quantitative
# channel and presumably has no effect -- the stack order comes from `order`.
x_share = alt.X('count:Q', stack='normalize', title='Success Rate', sort=barchart_order)
y_difficulty = alt.Y(
    'difficulty:O',
    title=None,
    sort=difficulty_order,
    axis=alt.Axis(labels=False, tickSize=0),
)
outcome_color = alt.Color(
    'sql_query_success:N',
    title='sql_query_success',
    legend=alt.Legend(title=None, orient='top'),
    scale=alt.Scale(domain=["Success", "Fail"], range=["#08415c", "#e5e5e5"]),
)

# Normalized stacked bars: one facet row per model, one bar per difficulty.
chart1 = (
    alt.Chart(grouped)
    .mark_bar()
    .encode(
        x=x_share,
        y=y_difficulty,
        color=outcome_color,
        order=alt.Order('sql_query_success', sort='descending'),
        row=alt.Row('model:N', title=None, sort=model_success_order),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    )
    .add_params(selection)
    .properties(title='SQL Execution', height=60, width=150)
)

# Render the chart in the notebook
chart1.show()

# Write the chart to an SVG file, creating the figures directory if needed
outfile = Path(f"../figures/benchmark_success_{chatbgc_version}_{benchmark_version}.svg")
outfile.parent.mkdir(parents=True, exist_ok=True)
chart1.save(outfile)

In [None]:
# Define custom order for difficulty
difficulty_order = ['Easy', 'Medium', 'Hard']

# Group by difficulty, model, and SQL evaluation label and count occurrences
grouped = df.groupby(['difficulty', 'model', 'sql_evaluation']).size().reset_index(name='count')

# Create selection for interactivity
selection = alt.selection_point(fields=['sql_evaluation'])

# Explicit stacking order of the evaluation classes. The keys must match the
# labels in the color scale domain below ("Correct", "Incorrect", "N/A");
# a label missing from this dict would get a NaN rank.
custom_order_dict = {'Correct': 1, 'Incorrect': 2, 'N/A': 3}

# Attach the rank to the frame that is actually plotted, so the `order`
# channel can use it. The shared `df` is left untouched.
grouped['custom_order_sql_evaluation'] = grouped['sql_evaluation'].map(custom_order_dict)

# Plotting with Altair.
# `model_success_order` is reused from the SQL Execution chart cell, so that
# cell has to run first; it keeps the model rows aligned across charts.
chart2 = alt.Chart(grouped).mark_bar().encode(
    x=alt.X('count:Q', stack='normalize', title='Correct SQL Proportion'),
    y=alt.Y('difficulty:O', title=None, sort=difficulty_order, axis=alt.Axis(labels=False, tickSize=0)),
    color=alt.Color('sql_evaluation:N', title='sql_evaluation', legend = alt.Legend(title=None,orient='top'),
                    scale=alt.Scale(domain=["Correct", "Incorrect", "N/A"], range=["#006e90", "#f18f01", "#e5e5e5"])
                   ),
    # Stack by the explicit rank rather than by the alphabetical label order
    order=alt.Order('custom_order_sql_evaluation:Q', sort='ascending'),
    row=alt.Row('model:N', title=None, sort=model_success_order,
                header=alt.Header(labels=False)),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(
    selection
).properties(
    title='SQL Evaluation',
    height=60,
    width=150
)

# Display the chart
chart2.show()

# Save the chart as an SVG file
outfile = Path(f"../figures/benchmark_sql_{chatbgc_version}_{benchmark_version}.svg")
outfile.parent.mkdir(exist_ok=True, parents=True)
chart2.save(outfile)

In [None]:
# Display order of the difficulty tiers.
difficulty_order = ['Easy', 'Medium', 'Hard']

# The model rows reuse `model_success_order` from the SQL Execution chart cell,
# so that cell has to run before this one.

# One row per (difficulty, model, rating) holding the number of occurrences.
grouped = (
    df.groupby(['difficulty', 'model', 'summary_evaluation'])
    .size()
    .reset_index(name='count')
)

# Point selection keyed on the rating; the opacity encoding dims everything else.
selection = alt.selection_point(fields=['summary_evaluation'])

# Encoding channels, built separately to keep the chart expression short.
x_share = alt.X('count:Q', stack='normalize', title='Summary Evaluation')
y_difficulty = alt.Y(
    'difficulty:O',
    title=None,
    sort=difficulty_order,
    axis=alt.Axis(labels=True, orient='right'),
)
# Ratings 1-5 get a red-to-blue palette; the "N/A" filler is drawn in grey.
rating_color = alt.Color(
    'summary_evaluation:N',
    title='Summary Evaluation',
    legend=alt.Legend(title=None, orient='top'),
    scale=alt.Scale(
        domain=[1, 2, 3, 4, 5, "N/A"],
        range=['#ff0000', '#ff6666', '#cc66ff', '#6666ff', '#0000ff', "#e5e5e5"],
    ),
)

# Normalized stacked bars: one facet row per model, one bar per difficulty.
chart3 = (
    alt.Chart(grouped)
    .mark_bar()
    .encode(
        x=x_share,
        y=y_difficulty,
        color=rating_color,
        order=alt.Order('summary_evaluation', sort='descending'),
        row=alt.Row('model:N', title=None, sort=model_success_order, header=alt.Header(labels=False)),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    )
    .add_params(selection)
    .properties(title='Summary Rating', height=60, width=150)
)

# Render the chart in the notebook
chart3.show()

# Write the chart to an SVG file, creating the figures directory if needed
outfile = Path(f"../figures/benchmark_summary_{chatbgc_version}_{benchmark_version}.svg")
outfile.parent.mkdir(parents=True, exist_ok=True)
chart3.save(outfile)

In [None]:
# Place the three panels side by side. The y scale is shared across panels,
# while each panel keeps its own color scale (and therefore its own legend).
panels = (chart1, chart2, chart3)
combined_chart = alt.hconcat(*panels).resolve_scale(y='shared', color='independent')

# Render the combined figure in the notebook
combined_chart.show()

In [None]:
# Export the combined figure once per format (SVG and PNG). No explicit format
# is passed to save(), so it is presumably inferred from the file extension.
for filetype in ("svg", "png"):
    outfile = Path(
        f"../figures/benchmark_overall_{chatbgc_version}_{benchmark_version}.{filetype}"
    )
    # Make sure the figures directory exists before writing.
    outfile.parent.mkdir(parents=True, exist_ok=True)
    combined_chart.save(outfile)