In [1]:
# Shared palette of hex colors; the plotting cells pick entries from it by index.
colors = [
    "#0173B2",
    "#DE8F05",
    "#029E73",
    "#D55E00",
    "#CC78BC",
    "#CA9161",
    "#FBAFE4",
    "#949494",
    "#ECE133",
    "#56B4E9",
    "#348FC1",
]

In [2]:
# Import necessary libraries
import pandas as pd

# Load the CSV data for all three models.
# Each model directory (gpt-4o, gpt-3.5-turbo, gemini-1.5-flash) provides four
# stats files, one per combination of reflective prompt (negative / critical)
# and few-shot example set (negative / mixed). Paths are relative to the
# current working directory.
# The code further down reads the 'method', 'result_type', 'value' and
# 'prompt_examples' columns from these frames.
gpt4o_negative_prompt_negative_examples_stats_df = pd.read_csv("./gpt-4o/results/negative_prompt_negative_examples_stats.csv")
gpt4o_negative_prompt_mixed_examples_stats_df = pd.read_csv("./gpt-4o/results/negative_prompt_mixed_examples_stats.csv")
gpt4o_critical_prompt_negative_examples_stats_df = pd.read_csv("./gpt-4o/results/critical_prompt_negative_examples_stats.csv")
gpt4o_critical_prompt_mixed_examples_stats_df = pd.read_csv("./gpt-4o/results/critical_prompt_mixed_examples_stats.csv")

gpt35_negative_prompt_negative_examples_stats_df = pd.read_csv("./gpt-3.5-turbo/results/negative_prompt_negative_examples_stats.csv")
gpt35_negative_prompt_mixed_examples_stats_df = pd.read_csv("./gpt-3.5-turbo/results/negative_prompt_mixed_examples_stats.csv")
gpt35_critical_prompt_negative_examples_stats_df = pd.read_csv("./gpt-3.5-turbo/results/critical_prompt_negative_examples_stats.csv")
gpt35_critical_prompt_mixed_examples_stats_df = pd.read_csv("./gpt-3.5-turbo/results/critical_prompt_mixed_examples_stats.csv")

gemini_negative_prompt_negative_examples_stats_df = pd.read_csv("./gemini-1.5-flash/results/negative_prompt_negative_examples_stats.csv")
gemini_negative_prompt_mixed_examples_stats_df = pd.read_csv("./gemini-1.5-flash/results/negative_prompt_mixed_examples_stats.csv")
gemini_critical_prompt_negative_examples_stats_df = pd.read_csv("./gemini-1.5-flash/results/critical_prompt_negative_examples_stats.csv")
gemini_critical_prompt_mixed_examples_stats_df = pd.read_csv("./gemini-1.5-flash/results/critical_prompt_mixed_examples_stats.csv")

# Concatenate all data for each model.
# ignore_index is not passed, so each *_total_df keeps the row labels of its
# source files (labels repeat across the four inputs).
gpt4o_total_df = pd.concat([
    gpt4o_negative_prompt_negative_examples_stats_df, gpt4o_negative_prompt_mixed_examples_stats_df, 
    gpt4o_critical_prompt_negative_examples_stats_df, gpt4o_critical_prompt_mixed_examples_stats_df
])

gpt35_total_df = pd.concat([
    gpt35_negative_prompt_negative_examples_stats_df, gpt35_negative_prompt_mixed_examples_stats_df, 
    gpt35_critical_prompt_negative_examples_stats_df, gpt35_critical_prompt_mixed_examples_stats_df
])

gemini_total_df = pd.concat([
    gemini_negative_prompt_negative_examples_stats_df, gemini_negative_prompt_mixed_examples_stats_df, 
    gemini_critical_prompt_negative_examples_stats_df, gemini_critical_prompt_mixed_examples_stats_df
])

# Select one method's rows for a given pair of states and normalise their labels
def filter_and_rename(df, method, correct_state, incorrect_state):
    """Return the rows of one method, relabelled as 'correct' / 'incorrect'.

    Args:
        df: DataFrame with 'method' and 'result_type' columns.
        method: Value of the 'method' column to keep.
        correct_state: 'result_type' value to relabel as 'correct'.
        incorrect_state: 'result_type' value to relabel as 'incorrect'.

    Returns:
        A copy of the matching rows (original row labels kept); rows whose
        'result_type' is neither of the two states are dropped. ``df`` itself
        is not modified.
    """
    relabel = {correct_state: 'correct', incorrect_state: 'incorrect'}

    is_method = df['method'] == method
    is_wanted_state = df['result_type'].isin(list(relabel))

    subset = df[is_method & is_wanted_state].copy()
    subset.loc[:, 'result_type'] = subset['result_type'].map(relabel)
    return subset

# Function to process and filter data for each model
def process_model_data(total_df):
    """Build the baseline and per-method accuracy tables for one model.

    Rows of method ``cot_1`` are kept unchanged. For each ``*_cot_2`` /
    ``*_cot_3`` method, only rows whose ``result_type`` is that trial's
    ``correct_state_N`` / ``incorrect_state_N`` label are kept (through
    ``filter_and_rename``) and relabelled ``correct`` / ``incorrect``.
    The ``value`` column is then summed per category and turned into an
    accuracy ``correct / (correct + incorrect)``.

    Args:
        total_df: DataFrame with ``method``, ``result_type``, ``value`` and
            ``prompt_examples`` columns.

    Returns:
        tuple: ``(cot1_pivot, other_accuracy_pivot)``.

        * ``cot1_pivot`` has one row per method (only ``cot_1``), pooled over
          every ``prompt_examples`` value, with columns ``method``,
          ``correct``, ``incorrect`` and ``accuracy``.
        * ``other_accuracy_pivot`` has one row per
          ``(prompt_examples, method)`` pair for the remaining methods, with
          the same ``correct`` / ``incorrect`` / ``accuracy`` columns.

        ``accuracy`` is a fraction in [0, 1]; groups with no counted results
        get 0.
    """
    correct_types = ('correct', 'correct_state_1', 'correct_state_2', 'correct_state_3')
    incorrect_types = ('incorrect', 'incorrect_state_1', 'incorrect_state_2', 'incorrect_state_3')

    def categorize_result(result_type):
        # Collapse every raw label into 'correct', 'incorrect' or 'other'.
        if result_type in correct_types:
            return 'correct'
        if result_type in incorrect_types:
            return 'incorrect'
        return 'other'

    def accuracy_table(rows, index):
        # Sum 'value' per result category for each index group, then add the
        # accuracy column.
        pivot = rows.pivot_table(
            index=index,
            columns='result_category',
            values='value',
            aggfunc='sum',
            fill_value=0
        )
        # pivot_table only creates columns for categories present in the data;
        # guarantee both exist so a model with no rows in one category (e.g.
        # nothing incorrect) does not raise KeyError below.
        pivot = pivot.reindex(columns=['correct', 'incorrect'], fill_value=0).reset_index()
        total = pivot['correct'] + pivot['incorrect']
        # 0 / 0 yields NaN; report such groups as 0 accuracy.
        pivot['accuracy'] = (pivot['correct'] / total).fillna(0)
        return pivot

    # (method, trial) pairs: each method is scored on its own trial's states.
    trial_methods = (
        ('selfreflection_cot_2', 2),
        ('reflexion_cot_2', 2),
        ('selfreflection_cot_3', 3),
        ('reflexion_cot_3', 3),
    )
    frames = [total_df[total_df['method'] == 'cot_1']]
    frames.extend(
        filter_and_rename(
            total_df, method, f'correct_state_{trial}', f'incorrect_state_{trial}'
        )
        for method, trial in trial_methods
    )
    filtered_total_df = pd.concat(frames, ignore_index=True)

    # Categorize result types and drop everything that is neither category
    filtered_total_df['result_category'] = filtered_total_df['result_type'].apply(categorize_result)
    filtered_total_df = filtered_total_df[
        filtered_total_df['result_category'].isin(['correct', 'incorrect'])
    ]

    # The 'cot_1' baseline is pooled over all prompt/example settings; the
    # other methods are broken down per setting.
    is_baseline = filtered_total_df['method'] == 'cot_1'
    cot1_pivot = accuracy_table(filtered_total_df[is_baseline], 'method')
    other_accuracy_pivot = accuracy_table(
        filtered_total_df[~is_baseline], ['prompt_examples', 'method']
    )

    return cot1_pivot, other_accuracy_pivot

# Compute the accuracy tables for each model and tag both with the model name
gpt4o_cot1_pivot, gpt4o_other_accuracy_pivot = process_model_data(gpt4o_total_df)
gpt4o_cot1_pivot['model'] = gpt4o_other_accuracy_pivot['model'] = 'gpt-4o'

gpt35_cot1_pivot, gpt35_other_accuracy_pivot = process_model_data(gpt35_total_df)
gpt35_cot1_pivot['model'] = gpt35_other_accuracy_pivot['model'] = 'gpt-3.5-turbo'

gemini_cot1_pivot, gemini_other_accuracy_pivot = process_model_data(gemini_total_df)
gemini_cot1_pivot['model'] = gemini_other_accuracy_pivot['model'] = 'gemini-1.5-flash'

# Stack the per-model tables: one combined baseline table, one combined
# per-method table
combined_cot1_pivot = pd.concat(
    [gpt4o_cot1_pivot, gpt35_cot1_pivot, gemini_cot1_pivot],
    ignore_index=True,
)
combined_other_accuracy_pivot = pd.concat(
    [gpt4o_other_accuracy_pivot, gpt35_other_accuracy_pivot, gemini_other_accuracy_pivot],
    ignore_index=True,
)


In [3]:
# Display names of the self-correction methods, in the order they are plotted
desired_order = [
    "Basic Self-Correction + CoT @ Trial 1",
    "Basic Self-Correction + CoT @ Trial 2",
    "Oracle Self-Correction + CoT @ Trial 1*",
    "Oracle Self-Correction + CoT @ Trial 2*"
]

# Raw 'method' values from the stats files -> display names
method_name_mapping = {
    'selfreflection_cot_2': "Basic Self-Correction + CoT @ Trial 1",
    'selfreflection_cot_3': "Basic Self-Correction + CoT @ Trial 2",
    'reflexion_cot_2': "Oracle Self-Correction + CoT @ Trial 1*",
    'reflexion_cot_3': "Oracle Self-Correction + CoT @ Trial 2*"
}

# Function to map method names, set categorical order, and sort dataframe
def apply_method_name_mapping_and_sort(df):
    """Return a copy of ``df`` with display method names, sorted by method.

    The 'method' column is translated through ``method_name_mapping`` and
    converted to an ordered categorical whose categories are
    ``desired_order``; rows are then sorted by that order.

    The input frame is left untouched, and values that already are display
    names are kept as they are. Together this makes the function safe to call
    again on its own output (e.g. when a notebook cell is re-run); previously
    a second call turned every method into NaN.

    Args:
        df: DataFrame with a 'method' column.

    Returns:
        A new DataFrame sorted by 'method'. Methods that are neither a key of
        ``method_name_mapping`` nor a member of ``desired_order`` become NaN.
    """
    result = df.copy()

    # astype(object) lets an already-categorical column (from a previous call)
    # be mapped like plain strings; unknown names fall through unchanged and
    # are turned into NaN by the categorical conversion below.
    display_names = result['method'].astype(object).map(
        lambda name: method_name_mapping.get(name, name)
    )

    # Ensure 'method' is a categorical variable with the desired order
    result['method'] = pd.Categorical(display_names, categories=desired_order, ordered=True)

    # Sort the DataFrame based on the categorical order
    return result.sort_values('method')

# Relabel and order the per-method accuracy table of every model
gpt4o_other_accuracy_pivot, gpt35_other_accuracy_pivot, gemini_other_accuracy_pivot = (
    apply_method_name_mapping_and_sort(pivot)
    for pivot in (
        gpt4o_other_accuracy_pivot,
        gpt35_other_accuracy_pivot,
        gemini_other_accuracy_pivot,
    )
)

# Stack the three relabelled tables into one frame for easier comparison
combined_other_accuracy_pivot = pd.concat(
    [gpt4o_other_accuracy_pivot, gpt35_other_accuracy_pivot, gemini_other_accuracy_pivot],
    ignore_index=True,
)

# One color per method family: both Basic trials share colors[0], both Oracle
# trials share colors[1] (desired_order lists Basic first, then Oracle)
method_colors = dict(zip(
    desired_order,
    (colors[0], colors[0], colors[1], colors[1]),
))


In [4]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Raw 'prompt_examples' values -> two-line x-axis tick labels
prompt_example_mapping = {
    'negative_prompt_negative_examples': 'Negative Prompt<br>Negative Examples',
    'negative_prompt_mixed_examples': 'Negative Prompt<br>Mixed Examples',
    'critical_prompt_negative_examples': 'Critical Prompt<br>Negative Examples',
    'critical_prompt_mixed_examples': 'Critical Prompt<br>Mixed Examples',
}

# x-axis category order: the tick labels in the order they are declared above
desired_prompt_order = list(prompt_example_mapping.values())

# Bar fill pattern per method: '/' for Trial 1, 'x' for Trial 2
pattern_for_category = dict(zip(desired_order, ('/', 'x', '/', 'x')))

# Bar color per method family for this figure: Basic -> colors[1],
# Oracle -> colors[2]. This rebinds the method_colors defined earlier.
method_colors = dict(zip(
    desired_order,
    (colors[1], colors[1], colors[2], colors[2]),
))

# Initialize the figure with 3 vertically stacked subplots (one per model),
# each titled with the model's name
fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=[
        '<b>GPT-4o</b>', 
        '<b>GPT-3.5 Turbo</b>', 
        '<b>Gemini-1.5 Flash</b>'
    ],
    vertical_spacing=0.05,   # Space between subplot rows, in normalized plot coordinates
    shared_xaxes=True,
)

# List of models and corresponding data:
# (model label, per-method accuracy table, 'cot_1' baseline table)
models_data = [
    ('GPT-4o', gpt4o_other_accuracy_pivot, gpt4o_cot1_pivot),
    ('GPT-3.5-turbo', gpt35_other_accuracy_pivot, gpt35_cot1_pivot),
    ('Gemini-1.5-flash', gemini_other_accuracy_pivot, gemini_cot1_pivot)
]

# Track the methods we've already added to the legend to avoid duplicates
methods_in_legend = set()

# Loop over models to add subplots for each.
# model_name is unpacked but not used inside the loop body.
for i, (model_name, model_data, cot1_pivot) in enumerate(models_data):
    row = i + 1  # Row for the subplot

    # Add a bar for each method for the current model
    for method in desired_order:
        method_data = model_data[model_data['method'] == method].copy()
        if method_data.empty:
            continue  # Skip if there's no data for the method
        # Map the prompt examples to new labels
        method_data['prompt_examples'] = method_data['prompt_examples'].map(prompt_example_mapping)
        # Multiply accuracy by 100
        method_data['accuracy_percent'] = method_data['accuracy'] * 100
        pattern = pattern_for_category.get(method)
        
        # Show legend only for the first occurrence of each method
        showlegend = method not in methods_in_legend
        if showlegend:
            methods_in_legend.add(method)
        
        # Add the bar for this method in the current subplot.
        # `method` is already a display name (it comes from desired_order),
        # while method_name_mapping is keyed by raw names, so the .get() below
        # falls back to `method` itself.
        fig.add_trace(go.Bar(
            x=method_data['prompt_examples'],
            y=method_data['accuracy_percent'],
            name=method_name_mapping.get(method, method),
            text=[f"{acc:.1f}" for acc in method_data['accuracy_percent']],  # Remove % symbol
            textposition='outside',
            marker=dict(
                color=method_colors.get(method, '#000000'), 
                pattern=dict(shape=pattern)
            ),
            showlegend=showlegend,
            
        ), row=row, col=1)
    # Take the 'cot_1' baseline accuracy (pooled over all prompt/example
    # settings by the pivot) and convert it to a percentage
    cot1_accuracy = cot1_pivot['accuracy'].iloc[0]  # Assuming only one row for 'cot_1'
    cot1_accuracy_percent = cot1_accuracy * 100

    # Add a horizontal line for 'cot_1' accuracy
    fig.add_shape(
        type="line",
        xref=f'x{row}' if row > 1 else 'x',  # Different xref for rows > 1
        yref=f'y{row}',
        x0=-0.5,  # Start just before the first bar
        y0=cot1_accuracy_percent,
        x1=len(desired_prompt_order) - 0.5,  # End just after the last bar
        y1=cot1_accuracy_percent,
        # Dashed baseline in colors[0], drawn at 80% opacity
        line=dict(color=colors[0], width=4, dash="dash"),
        opacity=0.8,
        row=row, col=1
    )

    # Add annotation for 'cot_1' accuracy
    fig.add_annotation(
        # NOTE(review): xref is "paper" but row/col are passed as well, which
        # may re-target the annotation to the subplot's own x-axis; confirm
        # whether x=1.5 ends up in paper or axis coordinates.
        x=1.5,
        y=cot1_accuracy_percent,
        xref="paper",
        yref=f"y{row}",
        text=f"<b>{cot1_accuracy_percent:.1f}%</b>",  # Include % symbol in annotation
        showarrow=False,
        font=dict(color=colors[0], size=12),
        align="left",
        yshift=10,
        row=row, col=1
    )


# Legend-only proxy for the 'cot_1' baseline: the dashed baselines are added
# as shapes without legend entries, so this data-less line trace (same color,
# width and dash style) supplies one.
fig.add_trace(go.Scatter(
    x=[None],  # No data points
    y=[None],
    mode='lines',
    line=dict(color=colors[0], width=4, dash='dash'),  # Match the horizontal line's style
    name='CoT Baseline @ Trial 0',  # Legend entry name
    showlegend=True,
))


# Add annotation explaining the asterisk (*) symbol used in the Oracle method
# names
fig.add_annotation(
    x=-0.11,  # Position at the very left of the plot
    y=-0.065,  # Position slightly below the plot area; adjust as needed
    xref='paper',  # Reference the entire plotting area horizontally
    yref='paper',  # Reference the entire plotting area vertically
    text="<b>*</b>: Maximum number of trials",  # Explanation text
    showarrow=False,  # No arrow pointing to anything
    font=dict(color="black", size=12),  # Font styling
    align="left",  # Left-align the text
    xanchor='left',  # Anchor the text to the left
    yanchor='top',   # Anchor the text to the top
    xshift=10,        # Horizontal shift to move the text slightly right
    yshift=0          # No vertical shift since it's already below the plot
)

# Update layout for grouped bars with modified axes and grid lines:
# centered title, white template, fixed 960x1200 size, and a horizontal legend
# placed above the plotting area
fig.update_layout(
    title={
        'text': '<b>Accuracy - Calendar Scheduling (Few-Shot Reflective Prompting)</b>',
        'y': 0.97,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    template='plotly_white',
    font=dict(family="Arial", size=12),
    width=960,
    height=1200,  # Adjusted for more space
    margin=dict(t=150, b=0),  # b is set to 80 by the final update_layout call below
    showlegend=True,  # Ensure the legend shows up at the right spot
    legend_title_text='Method',
    legend=dict(
        yanchor="bottom",
        y=1.04,
        xanchor="center",
        x=0.5,
        orientation="h",
        font=dict(family="Arial", size=12),
    )
)

# Set the x-axis title on the bottom subplot (row 3) only.
# showgrid=True here is later replaced by showgrid=False from xaxis_props,
# which is applied to every row in the loop below.
fig.update_xaxes(
    title_text='Reflective Prompt and Few-Shot Reflective Examples',
    title_font=dict(family="Arial"),
    showgrid=True,
    zeroline=False,
    linecolor='black',
    row=3,
    title_standoff=20 
)

# Shared x-axis styling; categoryarray fixes the left-to-right order of the
# prompt/example categories
xaxis_props = dict(
    categoryorder='array',
    categoryarray=desired_prompt_order,
    linecolor='black',             # Set x-axis line color to black
    showline=True,                 # Show x-axis line
    tickfont=dict(color='black'),  # Set x-axis tick labels to black
    showgrid=False,                # Remove vertical grid lines
    zeroline=False,                # Remove zero line if not needed
    mirror='all'                   # Mirror x-axis lines on all sides (adds top line)
)

# Shared y-axis styling; values are percentages on a fixed 0-100 scale
yaxis_props = dict(
    tickformat=None,               # Show numbers without the percent symbol
    linecolor='black',             # Set y-axis line color to black
    showline=True,                 # Show y-axis line
    tickfont=dict(color='black'),  # Set y-axis tick labels to black
    gridcolor='lightgrey',         # Set grid line color to light grey
    gridwidth=1,                   # Grid line width
    zeroline=False,                # Remove zero line if not needed
    mirror='all',                  # Mirror y-axis lines on all sides (adds right line)
    range=[0, 100],                # Show 0 to 100
    tickvals=np.arange(0, 101, 20) # Tick values every 20
)

# Apply x-axis and y-axis settings to all subplots
for row in range(1, 4):  # Loop over 3 rows
    fig.update_xaxes(xaxis_props, row=row, col=1)  # Apply x-axis settings
    fig.update_yaxes(yaxis_props, row=row, col=1)  # Apply y-axis settings

# Update the y-axis for every subplot: add the axis title and turn on the
# horizontal grid lines
for row in range(1, 4):
    fig.update_yaxes(
        title_text='Accuracy (%)',
        title_font=dict(family="Arial"),
        showgrid=True,
        zeroline=False,
        linecolor='black',
        row=row,
        col=1
    )

# Final bottom margin; replaces the b=0 set in the first update_layout call
fig.update_layout(
    margin=dict(b=80)
)
# Show the figure
fig.show()

# Save the figure as a PDF (path is relative to the current working directory)
# Ensure you have the 'kaleido' package installed: pip install kaleido
fig.write_image("accuracy_by_method_prompt_examples_3_models.pdf")
