In [2]:
import pandas as pd
import altair as alt
import numpy as np
import json
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.colors import rgb2hex
import seaborn as sns

In [3]:
# Create 'family' and 'variant' columns
def process_model_name(model_name, default_variant=15):
    """Split a model identifier into its family name and numeric size.

    Identifiers are expected in ``"<family>:<size>b"`` form, e.g.
    ``"qwen2.5:0.5b"`` becomes ``("qwen2.5", 0.5)``.

    Args:
        model_name: Model identifier string.
        default_variant: Size returned when the name has no ``:`` tag, or
            when the tag is not a number (e.g. ``"gpt-4o"`` or
            ``"llama3:latest"``). Defaults to 15, the placeholder value
            this function has always used for untagged models.

    Returns:
        A ``(family, variant)`` tuple. ``family`` is the text before the
        first ``:``; ``variant`` is the float parsed from the text after the
        last ``:``, or ``default_variant`` when no size can be parsed.
    """
    if ':' not in model_name:
        return model_name, default_variant

    parts = model_name.split(':')
    family = parts[0]
    size_tag = parts[-1].strip()

    # Drop only a trailing size suffix ("7b" / "7B"), not every 'b' in the tag.
    if size_tag[-1:] in ('b', 'B'):
        size_tag = size_tag[:-1]

    try:
        variant = float(size_tag)
    except ValueError:
        # Non-numeric tags such as "latest" carry no size information.
        variant = default_variant
    return family, variant

In [8]:
def bar_chart(df, metric_order, name, color_condition, scale = 'independent'):
    """Build one bar chart per metric and concatenate them horizontally.

    Args:
        df: DataFrame with 'metric', 'model' and 'avg_score' columns. When a
            'dataset' column is also present, bars are offset side by side
            per dataset and the dataset is added to the tooltip.
        metric_order: Metric names to plot, left to right.
        name: Title of the combined chart.
        color_condition: Altair color encoding applied to every panel.
        scale: Resolution of the y scale across panels, forwarded to
            ``resolve_scale`` (e.g. 'independent' or 'shared').

    Returns:
        The horizontally concatenated Altair chart.
    """
    has_dataset = 'dataset' in df.columns
    charts = []

    for metric in metric_order:
        # Rows belonging to the current metric only
        group = df[df['metric'] == metric].copy()

        # Order models from lowest to highest score. Averaging per model
        # first lists each model once even when the frame holds several rows
        # per model (one per dataset).
        sorted_models = (
            group.groupby('model', sort=False)['avg_score']
            .mean()
            .sort_values()
            .index
            .tolist()
        )

        encodings = {
            'x': alt.X('model:N', sort=sorted_models, title='Model'),
            'y': alt.Y('avg_score:Q', title='Average Score', scale=alt.Scale(domain=[0,1])),
            'color': color_condition,
            'tooltip': ['model', 'avg_score'],
        }
        if has_dataset:
            # Only reference the dataset field when the frame provides it.
            encodings['xOffset'] = 'dataset:N'
            encodings['tooltip'] = ['model', 'dataset', 'avg_score']

        chart = alt.Chart(group).mark_bar().encode(**encodings).properties(
            width=400,
            height=300,
            title=metric
        )
        charts.append(chart)

    # Combine all panels into a single visualization
    combined = alt.hconcat(*charts).resolve_scale(y=scale).properties(
        title=name
    )

    return combined


In [23]:
# Experiment grid: each (transpose, pre_processing) pair selects one results file.
temp_list = [(False, False), (False, True), (True, False), (True, True)]
metric_order = ['BlueScore', 'Rouge Score', 'Non-LLM String Similarity', 'LLM Semantic Similarity']
all_average_scores = []
# Define color
# TODO: would be nice if this is automatic
color_scale = alt.Scale(
    domain=["qwen2.5:0.5b", "qwen2.5:1.5b", "qwen2.5:3b", "qwen2.5:7b", "qwen2.5:14b", "gpt-4o"],
    range=['#afd1e7', '#86bcdc', '#5ba3cf', '#3887c0', '#1b69ad', 'red']
)
color_condition = alt.Color('model:N', title='Model', scale=color_scale)

for transpose, pre_processing in temp_list:
    transposed_path = 'transpose_' if transpose else ''
    preprocessed_path = 'preprocessed_' if pre_processing else ''
    file_path = f'../data/results/similarity_text-embedding-3-large_{transposed_path}{preprocessed_path}variants.json'

    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    df = pd.DataFrame(json_data)

    # Title. Both labels use the same "QG" abbreviation (presumably
    # "question group", matching the q_group variable); "+ PP" marks runs
    # with pre-processing. strip() removes the trailing space left when
    # there is no "+ PP" suffix.
    q_group = 'across QG ' if transpose else 'same QG '
    pre_proc = '+ PP' if pre_processing else ''
    title = ('Variations ' + q_group + pre_proc).strip()

    # Per-metric bar charts for this experiment
    chart = bar_chart(df, metric_order=metric_order, name=title, color_condition=color_condition, scale='shared')
    chart.display()

    # Average score per model (over all rows of the file), tagged with the experiment
    average_scores = df.groupby('model')['avg_score'].mean().reset_index()
    average_scores['experiment'] = title
    all_average_scores.append(average_scores)

final_average_scores = pd.concat(all_average_scores, ignore_index=True)

# Line chart: one line per model across the four experiment variations
line_chart = alt.Chart(final_average_scores).mark_line(point=True).encode(
    x=alt.X(
        'experiment:N',
        title='Experiment Variation',
        axis=alt.Axis(labelAngle=45),
    ),
    y=alt.Y('avg_score:Q', title='Average Score'),
    color=color_condition,
    tooltip=['model', 'avg_score', 'experiment']
).properties(
    title='Model Performance Across Different Experiments',
    width=600
)

# Display the chart
line_chart.display()

In [67]:
# Experiment grid: each (transpose, pre_processing) pair selects one pair of results files.
temp_list = [(False, False), (False, True), (True, False), (True, True)]
metric_order = ['BlueScore', 'Rouge Score', 'Non-LLM String Similarity', 'LLM Semantic Similarity']
all_average_scores = []
# Define color
# TODO: would be nice if this is automatic
color_scale = alt.Scale(
    domain=["qwen2.5:0.5b", "qwen2.5:1.5b", "qwen2.5:3b", "qwen2.5:7b", "qwen2.5:14b", "gpt-4o"],
    range=['#afd1e7', '#86bcdc', '#5ba3cf', '#3887c0', '#1b69ad', 'red']
)
color_condition = alt.Color('model:N', title='Model', scale=color_scale)

for transpose, pre_processing in temp_list:
    transposed_path = 'transpose_' if transpose else ''
    preprocessed_path = 'preprocessed_' if pre_processing else ''
    dup_path = f'../data/results/similarity_text-embedding-3-large_{transposed_path}{preprocessed_path}duplicates.json'
    var_path = f'../data/results/similarity_text-embedding-3-large_{transposed_path}{preprocessed_path}variants.json'

    with open(var_path, 'r', encoding='utf-8') as file:
        var_data = json.load(file)
    with open(dup_path, 'r', encoding='utf-8') as file:
        dup_data = json.load(file)

    # Tag each frame with its source so both can share one chart
    var_df = pd.DataFrame(var_data)
    dup_df = pd.DataFrame(dup_data)
    var_df['dataset'] = 'Variants'
    dup_df['dataset'] = 'Duplicates'

    df = pd.concat([var_df, dup_df], ignore_index=True)

    # Title. Both labels use the same "QG" abbreviation (presumably
    # "question group", matching the q_group variable); "+ PP" marks runs
    # with pre-processing. strip() removes the trailing space left when
    # there is no "+ PP" suffix.
    q_group = 'across QG ' if transpose else 'same QG '
    pre_proc = '+ PP' if pre_processing else ''
    title = ('Variations ' + q_group + pre_proc).strip()

    # Per-metric bar charts, with bars grouped by dataset
    chart = bar_chart(df, metric_order=metric_order, name=title, color_condition=color_condition, scale='shared')
    chart.display()

    # Average score per (dataset, model) pair, tagged with the experiment
    average_scores = df.groupby(['dataset', 'model'])['avg_score'].mean().reset_index()
    average_scores['experiment'] = title
    all_average_scores.append(average_scores)

final_average_scores = pd.concat(all_average_scores, ignore_index=True)

In [79]:
# Model groups that are plotted on separate charts
models_to_filter_small = ['qwen2.5:0.5b', 'qwen2.5:1.5b', 'qwen2.5:3b']
models_to_filter_large = ['qwen2.5:7b', 'qwen2.5:14b', 'gpt-4o']

# Split the averaged scores into one frame per model group
is_small_model = final_average_scores['model'].isin(models_to_filter_small)
is_large_model = final_average_scores['model'].isin(models_to_filter_large)
filter_small_models = final_average_scores.loc[is_small_model]
filter_large_models = final_average_scores.loc[is_large_model]

# Stroke-dash encoding keyed on the dataset label
strokeDash_scale = alt.Scale(domain=["Duplicates", "Variants"])
strokeDash_condition = alt.StrokeDash('dataset:N', title='Dataset', scale=strokeDash_scale)

# One color scale per model group, each mapping its models to a fixed palette
color_scale_small = alt.Scale(domain=models_to_filter_small, range=['#afd1e7', '#5ba3cf', '#1b69ad'])
color_scale_large = alt.Scale(domain=models_to_filter_large, range=['#3887c0', '#1b69ad', 'red'])

# Color encodings built from the scales above
color_condition_small = alt.Color('model:N', title='Model', scale=color_scale_small)
color_condition_large = alt.Color('model:N', title='Model', scale=color_scale_large)

In [82]:

# Encodings common to both layers, defined once so the point and line layers
# cannot drift apart (same axis definitions, same colors, same tooltip).
base_small = alt.Chart(filter_small_models).encode(
    x=alt.X(
        'experiment:N',
        title='Duplicate vs Variants',
        axis=alt.Axis(labelAngle=45),
    ),
    y=alt.Y('avg_score:Q', title='Average Score'),
    color=color_condition_small,
    # Include both model and dataset so any hovered mark is fully identified
    tooltip=['model', 'dataset', 'avg_score', 'experiment']
)

# Points carry the strokeDash encoding too, but without a second legend
dot_chart_small = base_small.mark_point(filled=True).encode(
    strokeDash=alt.StrokeDash('dataset', legend=None)
)

# Lines distinguish the datasets by dash pattern
line_chart_small = base_small.mark_line().encode(
    strokeDash=alt.StrokeDash('dataset', legend=alt.Legend(title='Dataset'))
)

chart = (line_chart_small + dot_chart_small).properties(
    width=600,
    height=400,
    title="Line Chart"
)

chart

In [83]:
# Encodings common to both layers, defined once so the point and line layers
# cannot drift apart (same axis definitions, same colors, same tooltip).
base_large = alt.Chart(filter_large_models).encode(
    x=alt.X(
        'experiment:N',
        title='Duplicate vs Variants',
        axis=alt.Axis(labelAngle=45),
    ),
    y=alt.Y('avg_score:Q', title='Average Score'),
    color=color_condition_large,
    # Include both model and dataset so any hovered mark is fully identified
    tooltip=['model', 'dataset', 'avg_score', 'experiment']
)

# Points carry the strokeDash encoding too, but without a second legend
dot_chart_large = base_large.mark_point(filled=True).encode(
    strokeDash=alt.StrokeDash('dataset', legend=None)
)

# Lines distinguish the datasets by dash pattern
line_chart_large = base_large.mark_line().encode(
    strokeDash=alt.StrokeDash('dataset', legend=alt.Legend(title='Dataset'))
)

chart = (line_chart_large + dot_chart_large).properties(
    width=600,
    height=400,
    title="Line Chart"
)

chart