In [1]:
import pandas as pd
import altair as alt
import numpy as np
import json
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.colors import rgb2hex
import seaborn as sns

In [2]:
def process_model_name(model_name, default_variant=15.0):
    """Split a model name into a ``(family, variant)`` pair.

    Names containing a colon are split on ``':'``: the text before the first
    colon is the family, and the text after the last colon is parsed as the
    variant after every ``'b'`` character has been removed (``"llama3:8b"``
    gives ``("llama3", 8.0)``). Names without a colon are returned unchanged
    as the family, paired with ``default_variant``.

    Args:
        model_name: Model identifier string.
        default_variant: Variant used when ``model_name`` has no colon.
            NOTE(review): 15 is the original hard-coded fallback; the reason
            for that particular value is not visible here.

    Returns:
        A ``(family, variant)`` tuple; ``variant`` is always a ``float`` so
        both branches yield the same type.

    Raises:
        ValueError: If the text after the last colon is not numeric once the
            ``'b'`` characters are removed (e.g. ``"llama3:latest"``).
    """
    if ':' not in model_name:
        return model_name, float(default_variant)

    parts = model_name.split(':')
    # presumably the tag is a size such as "8b" — TODO confirm against the data
    return parts[0], float(parts[-1].replace('b', ''))

In [3]:
def bar_chart(df, metric_order, name, color_condition, scale = 'independent'):
    """Build one bar chart per metric and concatenate them horizontally.

    Args:
        df: DataFrame with at least the columns 'metric', 'model' and
            'avg_score'.
        metric_order: Metric names to plot, in left-to-right panel order.
        name: Title of the combined chart.
        color_condition: Value used for the ``color`` encoding of every panel.
        scale: Passed to ``resolve_scale`` as the y-scale resolution
            (for example 'independent' or 'shared').

    Returns:
        The horizontally concatenated Altair chart.
    """
    def _metric_panel(metric):
        # Work on a private copy of the rows belonging to this metric
        rows = df[df['metric'] == metric].copy()

        # x-axis order: models from lowest to highest avg_score
        # (sort_values is ascending by default)
        model_order = rows.sort_values('avg_score')['model'].tolist()

        bars = alt.Chart(rows).mark_bar()
        encoded = bars.encode(
            x=alt.X('model:N', sort=model_order, title='Model'),
            # The score axis is pinned to [0, 1] in every panel
            y=alt.Y('avg_score:Q', title='Average Score', scale=alt.Scale(domain=[0,1])),
            color=color_condition,
            tooltip=['model', 'avg_score'],
        )
        return encoded.properties(width=400, height=300, title=metric)

    # One panel per metric, in the requested order
    panels = [_metric_panel(metric) for metric in metric_order]

    # Place the panels side by side and title the whole figure
    combined = alt.hconcat(*panels).resolve_scale(y=scale)
    return combined.properties(title=name)


In [20]:
# The four experiment variants as (transpose, pre_processing) flag pairs
temp_list = [(False, False), (False, True), (True, False), (True, True)]
metric_order = ['BlueScore', 'Rouge Score', 'Non-LLM String Similarity', 'LLM Semantic Similarity']

# Pass 1: load every experiment before any plotting. The color scale needs the
# model names, so the data has to exist first; reading `df` before this cell
# assigns it raises NameError on a fresh kernel (Restart & Run All).
experiments = []  # (title, DataFrame) pairs, in temp_list order
for transpose, pre_processing in temp_list:
    transposed_path = 'transpose_' if transpose else ''
    preprocessed_path = 'preprocessed_' if pre_processing else ''
    file_path = f'../data/results/similarity_text-embedding-3-large_{transposed_path}{preprocessed_path}variants.json'

    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Title
    # NOTE(review): both fragments now use 'QG' (one previously read 'GQ');
    # confirm this is the intended abbreviation.
    q_group = 'across QG ' if transpose else 'same QG '
    pre_proc = '+ PP' if pre_processing else ''
    title = 'Variations ' + q_group + pre_proc

    experiments.append((title, pd.DataFrame(json_data)))

# Define color
# The domain is every model seen in any experiment, in order of first
# appearance, as a plain list.
# TODO: would be nice if the range were generated automatically; it is six
# hard-coded colors, so it only lines up with the domain when there are
# exactly six models.
model_domain = pd.concat([frame['model'] for _, frame in experiments]).unique().tolist()
color_scale = alt.Scale(
    domain=model_domain,
    range=['#afd1e7', '#86bcdc', '#5ba3cf', '#3887c0', '#1b69ad', 'red']
)
color_condition = alt.Color('model:N', title='Model', scale=color_scale)

# Pass 2: one row of bar charts per experiment, plus per-model mean scores
all_average_scores = []
for title, df in experiments:
    # Chart
    chart = bar_chart(df, metric_order=metric_order, name=title, color_condition=color_condition, scale='shared')
    chart.display()

    # Calculate the average score for each model (mean over all rows of that
    # model in this experiment) and tag it with the experiment title
    average_scores = df.groupby('model')['avg_score'].mean().reset_index()
    average_scores['experiment'] = title
    all_average_scores.append(average_scores)

final_average_scores = pd.concat(all_average_scores, ignore_index=True)

# Line chart: one line per model across the experiment variations
line_chart = alt.Chart(final_average_scores).mark_line(point=True).encode(
    x=alt.X(
        'experiment:N',
        title='Experiment Variation',
        axis=alt.Axis(labelAngle=45),
    ),
    y=alt.Y('avg_score:Q', title='Average Score'),
    color=color_condition,
    tooltip=['model', 'avg_score', 'experiment']
).properties(
    title='Model Performance Across Different Variations',
    width=600
)

# Display the chart
line_chart.display()