In [None]:
import pandas as pd

# Load each model's attack-response log.
# Forward slashes are used as path separators: they are accepted on every
# platform (including Windows), whereas "\g", "\c" and "\l" inside a normal
# string literal are invalid escape sequences.
df_gpt4o = pd.read_csv('openai_campaign/gpt-4o-mini_attack_responses_20241208_191330.csv')
df_claude3 = pd.read_csv('claude_campaign/claude-3-haiku-20240307_attack_responses_20241210_231556.csv')
df_llama3 = pd.read_csv('llama_campaign/llama-3-70b-instruct_attack_responses_20241211_032750.csv')

def clean_human_result(df):
    """Add a boolean 'Human Result Cleaned' column derived from 'Human Result'.

    A row counts as a success when its 'Human Result' text, after trimming
    surrounding whitespace and upper-casing, starts with ``YES``. Missing
    values count as not successful, so the new column is always a plain
    bool column that can be used directly as a mask.

    Args:
        df: DataFrame with a text 'Human Result' column. Modified in place.

    Returns:
        The same DataFrame object, with 'Human Result Cleaned' added.
    """
    normalized = df['Human Result'].str.strip().str.upper()
    # na=False: without it a missing label yields NaN, which turns the column
    # into object dtype and breaks boolean indexing further down.
    df['Human Result Cleaned'] = normalized.str.startswith('YES', na=False).astype(bool)
    return df

# Add the 'Human Result Cleaned' column to each model's DataFrame
df_gpt4o, df_claude3, df_llama3 = map(
    clean_human_result, (df_gpt4o, df_claude3, df_llama3)
)

def overall_success_rate(df):
    """Return the mean of 'Human Result Cleaned' in ``df``, as a percentage."""
    success_fraction = df['Human Result Cleaned'].mean()
    return success_fraction * 100

def success_by_category(df, category):
    """Return the success percentage for each distinct value of ``category``.

    The result is a Series indexed by ``category`` and named
    'Human Result Cleaned'.
    """
    flags_per_group = df.groupby(category)['Human Result Cleaned']
    fraction_per_group = flags_per_group.mean()
    return fraction_per_group * 100

# Overall success rate (percent) per model
overall_success_gpt4o, overall_success_claude3, overall_success_llama3 = (
    overall_success_rate(frame) for frame in (df_gpt4o, df_claude3, df_llama3)
)

# Success rate per attack type
attack_type_success_gpt4o, attack_type_success_claude3, attack_type_success_llama3 = (
    success_by_category(frame, 'attack_type') for frame in (df_gpt4o, df_claude3, df_llama3)
)

# Success rate per attack name
attack_name_success_gpt4o, attack_name_success_claude3, attack_name_success_llama3 = (
    success_by_category(frame, 'attack_name') for frame in (df_gpt4o, df_claude3, df_llama3)
)

# Success rate per prompt
prompt_success_gpt4o, prompt_success_claude3, prompt_success_llama3 = (
    success_by_category(frame, 'prompt') for frame in (df_gpt4o, df_claude3, df_llama3)
)

def find_common_successes(df1, df2, df3):
    """Return the (attack_name, prompt) pairs that succeeded in all three frames.

    Each frame is first reduced to its unique successful key pairs. Merging
    the full frames instead would multiply repeated rows per key
    (n1 * n2 * n3 rows for a single pair) and carry every unrelated column
    through both merges.

    Args:
        df1, df2, df3: DataFrames with 'attack_name', 'prompt' and
            'Human Result Cleaned' columns.

    Returns:
        DataFrame with columns ['attack_name', 'prompt'], one row per common
        pair, in the order the pairs first appear in ``df1``, indexed 0..k-1.
    """
    keys = ['attack_name', 'prompt']

    def _successful_pairs(df):
        # .eq(True) gives a strict boolean mask even if the column holds NaN.
        succeeded = df['Human Result Cleaned'].eq(True)
        return df.loc[succeeded, keys].drop_duplicates()

    common = _successful_pairs(df1)
    for other in (df2, df3):
        common = common.merge(_successful_pairs(other), on=keys)
    return common.reset_index(drop=True)

common_successes = find_common_successes(
    df1=df_gpt4o,
    df2=df_claude3,
    df3=df_llama3,
)

import pandas as pd

def calculate_and_sort_success_rates(data):
    """Return per-prompt and per-attack-name success rates, highest first.

    The outcome is read from the first of these columns present in ``data``:
    'human result cleaned', 'Human Result Cleaned', 'Human Result'.
    Boolean values are used as-is; any other value counts as a success when
    its text contains ``YES`` (case-insensitive).

    Side effect: adds (or overwrites) a 0/1 'Success' column on ``data``.

    Args:
        data: DataFrame with 'prompt' and 'attack_name' columns plus one of
            the outcome columns listed above.

    Returns:
        Tuple ``(prompt_success_rate, attack_name_success_rate)``. Each is a
        DataFrame with the grouping column, 'Success' (mean between 0 and 1)
        and 'Success Rate' (percent), sorted by 'Success Rate' descending.

    Raises:
        KeyError: if none of the outcome columns is present.
    """
    outcome_candidates = ('human result cleaned', 'Human Result Cleaned', 'Human Result')
    outcome_col = next((name for name in outcome_candidates if name in data.columns), None)
    if outcome_col is None:
        raise KeyError(f"none of the outcome columns {outcome_candidates} found in data")

    def _as_success(value):
        # An already-cleaned boolean must not go through the text test:
        # str(True) is 'True', which does not contain 'YES'.
        if pd.api.types.is_bool(value):
            return int(value)
        return 1 if 'YES' in str(value).upper() else 0

    data['Success'] = data[outcome_col].apply(_as_success)

    def _rates_by(key):
        rates = data.groupby(key)['Success'].mean().reset_index()
        rates['Success Rate'] = rates['Success'] * 100
        return rates.sort_values(by='Success Rate', ascending=False)

    return _rates_by('prompt'), _rates_by('attack_name')

# Per-model success rates, sorted from most to least successful.
# NOTE: the "succes" spelling in two of the names below is kept unchanged so
# that any other cell already referring to them keeps working.
gpt_prompt_success_rate, gpt_attack_name_success_rate = calculate_and_sort_success_rates(df_gpt4o)
llama_prompt_succes_rate, llama_attack_name_success_rate = calculate_and_sort_success_rates(df_llama3)
claude_prompt_succes_rate, claude_attack_name_success_rate = calculate_and_sort_success_rates(df_claude3)

# Display the results. The tables exist only under the per-model names
# assigned above, so each model's pair is printed explicitly.
for model_label, prompt_rates, attack_name_rates in (
    ("GPT-4o-mini", gpt_prompt_success_rate, gpt_attack_name_success_rate),
    ("Llama-3-70b-instruct", llama_prompt_succes_rate, llama_attack_name_success_rate),
    ("Claude-3-haiku", claude_prompt_succes_rate, claude_attack_name_success_rate),
):
    print(f"\n{model_label} - Prompt Success Rate (Sorted):")
    print(prompt_rates)

    print(f"\n{model_label} - Attack Name Success Rate (Sorted):")
    print(attack_name_rates)


# Print results
print("\nOverall Success Rates:")
print(f"GPT-4o-mini: {overall_success_gpt4o:.2f}%")
print(f"Claude-3-haiku: {overall_success_claude3:.2f}%")
print(f"Llama-3-70b-instruct: {overall_success_llama3:.2f}%\n")

print("Common Successful attack_name x prompt Combinations:")
print(common_successes)

# Optional: Save common successes to a CSV
# common_successes.to_csv('common_successes.csv', index=False)


In [None]:
import pandas as pd
from tabulate import tabulate
import matplotlib.pyplot as plt


def display_pretty_table(data, title):
    """Print ``title`` followed by ``data`` rendered as a grid table.

    The index of ``data`` is turned into a regular column before rendering.
    """
    rendered = tabulate(
        data.reset_index(),
        headers='keys',
        tablefmt='grid',
        showindex=False,
    )
    print(f"\n{title}\n", rendered, sep="\n")

# Per-prompt success rates, one table per model
display_pretty_table(data=prompt_success_gpt4o, title="GPT-4o-mini - Prompt Success Rate")
display_pretty_table(data=prompt_success_claude3, title="Claude-3-haiku - Prompt Success Rate")
display_pretty_table(data=prompt_success_llama3, title="Llama-3-70b-instruct - Prompt Success Rate")

# Per-attack-name success rates, one table per model
display_pretty_table(data=attack_name_success_gpt4o, title="GPT-4o-mini - Attack Name Success Rate")
display_pretty_table(data=attack_name_success_claude3, title="Claude-3-haiku - Attack Name Success Rate")
display_pretty_table(data=attack_name_success_llama3, title="Llama-3-70b-instruct - Attack Name Success Rate")

def save_table_as_image(data, title, filename):
    """Draw ``data`` as a matplotlib table under ``title`` and save it to ``filename``.

    The index of ``data`` becomes the first table column and every cell is
    converted to text. The figure is closed after saving.
    """
    fig, ax = plt.subplots(figsize=(12, len(data) * 0.5))
    ax.axis('off')

    flat = data.reset_index()
    cell_text = [[str(value) for value in record] for record in flat.values]
    table = ax.table(
        cellText=cell_text,
        colLabels=flat.columns,
        loc='center',
        cellLoc='center',
    )
    table.auto_set_font_size(False)
    table.set_fontsize(10)

    ax.set_title(title, fontsize=14, weight='bold')
    fig.savefig(filename, bbox_inches='tight')
    plt.close(fig)

# Per-prompt success rate images
save_table_as_image(data=prompt_success_gpt4o, title="GPT-4o-mini - Prompt Success Rate", filename="prompt_success_gpt4o.png")
save_table_as_image(data=prompt_success_claude3, title="Claude-3-haiku - Prompt Success Rate", filename="prompt_success_claude3.png")
save_table_as_image(data=prompt_success_llama3, title="Llama-3-70b-instruct - Prompt Success Rate", filename="prompt_success_llama3.png")

# Per-attack-name success rate images
save_table_as_image(data=attack_name_success_gpt4o, title="GPT-4o-mini - Jailbreak Technique Success Rate", filename="attack_success_gpt4o.png")
save_table_as_image(data=attack_name_success_claude3, title="Claude-3-haiku - Jailbreak Technique Success Rate", filename="attack_success_claude3.png")
save_table_as_image(data=attack_name_success_llama3, title="Llama-3-70b-instruct - Jailbreak Technique Success Rate", filename="attack_success_llama3.png")


In [None]:
import textwrap
def truncate_and_wrap(text, max_length=97, wrap_point=50):
    """Shorten ``text`` and wrap it onto at most two lines.

    The text is cut to ``max_length`` characters and wrapped at
    ``wrap_point`` columns; only the first two lines are kept. Whenever
    anything was dropped (by the length cut or by the two-line limit), the
    last kept line ends with ``". . ."``. That line is trimmed as needed so
    the marker still fits within ``wrap_point``.

    Appending the marker before wrapping does not work: with the defaults,
    97 + 5 characters cannot fit in two 50-column lines, so the marker would
    always be partly or wholly cut off.

    Args:
        text: The string to format.
        max_length: Maximum number of characters of ``text`` to keep.
        wrap_point: Maximum line width.

    Returns:
        The formatted text, with lines joined by a newline.
    """
    marker = ". . ."
    max_lines = 2

    lines = textwrap.wrap(text[:max_length], width=wrap_point)
    dropped = len(text) > max_length or len(lines) > max_lines
    lines = lines[:max_lines]

    if dropped:
        room = max(wrap_point - len(marker), 0)
        last = lines.pop() if lines else ""
        lines.append(last[:room].rstrip() + marker)

    return "\n".join(lines)

def format_dataframe(df, col_name):
    """Return a display-ready copy of a success-rate table.

    The index of ``df`` is turned into a column, the ``col_name`` column is
    passed through ``truncate_and_wrap``, and 'Human Result Cleaned' is
    formatted as a two-decimal percentage string. 'Human Result Cleaned' is
    renamed to 'Success Rate' and 'prompt' to 'Prompt'.
    """
    table = df.reset_index()
    table[col_name] = table[col_name].map(lambda value: truncate_and_wrap(str(value)))
    table['Human Result Cleaned'] = table['Human Result Cleaned'].map(lambda rate: f"{rate:.2f}%")
    return table.rename(
        columns={'Human Result Cleaned': 'Success Rate', 'prompt': 'Prompt'}
    )

def display_pretty_table(data, col_name, title):
    """Print ``title`` followed by the formatted ``data`` as a grid table.

    ``data`` is prepared with ``format_dataframe(data, col_name)`` before
    rendering.
    """
    rendered = tabulate(
        format_dataframe(data, col_name),
        headers='keys',
        tablefmt='grid',
        showindex=False,
    )
    print(f"\n{title}\n", rendered, sep="\n")

# Per-prompt success rates, one table per model
display_pretty_table(data=prompt_success_gpt4o, col_name='prompt', title="GPT-4o-mini - Prompt Success Rate")
display_pretty_table(data=prompt_success_claude3, col_name='prompt', title="Claude-3-haiku - Prompt Success Rate")
display_pretty_table(data=prompt_success_llama3, col_name='prompt', title="Llama-3-70b-instruct - Prompt Success Rate")

# Per-attack-name success rates, one table per model
display_pretty_table(data=attack_name_success_gpt4o, col_name='attack_name', title="GPT-4o-mini - Attack Name Success Rate")
display_pretty_table(data=attack_name_success_claude3, col_name='attack_name', title="Claude-3-haiku - Attack Name Success Rate")
display_pretty_table(data=attack_name_success_llama3, col_name='attack_name', title="Llama-3-70b-instruct - Attack Name Success Rate")

def save_table_as_image(data, col_name, title, filename):
    """Render the formatted success-rate table and save it to ``filename``.

    ``data`` is first prepared with ``format_dataframe(data, col_name)``.
    A body row whose first cell holds exactly two lines of text is drawn
    at height 0.05; every other body row is drawn at height 0.025.
    ``title`` is accepted but not drawn on the image.
    """
    formatted = format_dataframe(data, col_name)
    headers = formatted.columns
    n_rows = len(formatted)

    # Figure height scales with the number of rows
    fig, ax = plt.subplots(figsize=(12, n_rows * 0.7))
    ax.axis('off')
    table = ax.table(
        cellText=formatted.values.tolist(),
        colLabels=headers,
        loc='center',
        cellLoc='center',
    )

    # Fixed font size instead of matplotlib's automatic sizing
    table.auto_set_font_size(False)
    table.set_fontsize(10)

    # Row 0 is the header; body rows start at 1. The first column decides
    # the height used for every cell in its row.
    cells = table.get_celld()
    for row_idx in range(1, n_rows + 1):
        first_cell_text = cells[(row_idx, 0)].get_text().get_text()
        is_two_line = first_cell_text.count('\n') == 1
        height = 0.05 if is_two_line else 0.025
        for col_idx in range(len(headers)):
            cells[(row_idx, col_idx)].set_height(height)

    # Keep the whitespace around the table to a minimum
    fig.tight_layout(pad=0.05)
    fig.savefig(filename, bbox_inches='tight', pad_inches=0.01)
    plt.close(fig)


# Per-prompt success rate images
save_table_as_image(data=prompt_success_gpt4o, col_name='prompt', title="GPT-4o-mini - Prompt Success Rate", filename="prompt_success_gpt4o.png")
save_table_as_image(data=prompt_success_claude3, col_name='prompt', title="Claude-3-haiku - Prompt Success Rate", filename="prompt_success_claude3.png")
save_table_as_image(data=prompt_success_llama3, col_name='prompt', title="Llama-3-70b-instruct - Prompt Success Rate", filename="prompt_success_llama3.png")

# Per-attack-name success rate images
save_table_as_image(data=attack_name_success_gpt4o, col_name='attack_name', title="GPT-4o-mini - Attack Name Success Rate", filename="attack_success_gpt4o.png")
save_table_as_image(data=attack_name_success_claude3, col_name='attack_name', title="Claude-3-haiku - Attack Name Success Rate", filename="attack_success_claude3.png")
save_table_as_image(data=attack_name_success_llama3, col_name='attack_name', title="Llama-3-70b-instruct - Attack Name Success Rate", filename="attack_success_llama3.png")