In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [4]:

import re
import ast

def fully_robust_parser(s):
    if isinstance(s, list):
        return s  # Already parsed
    # Remove newlines and normalize spaces
    s = re.sub(r'\s+', ' ', s.strip())
    # Remove brackets temporarily
    s_clean = s.strip('[]')
    # Add a comma between numbers that are not separated by a comma (handles floats like '1. 0.8')
    s_clean = re.sub(r'(?<=[\d.]) (?=[\d.-])', ', ', s_clean)
    # Wrap again in brackets
    s_clean = f'[{s_clean}]'
    return ast.literal_eval(s_clean)


In [5]:
from scipy.stats import norm

def compute_ci(df, metric_name, alpha=0.05):
    z = norm.ppf(1 - alpha / 2)
    n = df[f"values_{metric_name}"].apply(len)

    confidence_level = 1-alpha

    margin = z * df[f"std_{metric_name}"] / np.sqrt(n)

    avg = df[f"avg_{metric_name}"]
    std = df[f"std_{metric_name}"]

    df[f"ci_upper_{metric_name}"] = df[f"avg_{metric_name}"] + margin

    results_df = df[["method", "model", "benchmark", "evaluator"]]
    results_df["metric"] = metric_name
    results_df["avg"] = avg
    results_df["std"] = std
    # results_df["ci_lower"] = avg - margin
    # results_df["ci_upper"] = avg + margin
    results_df["ci_error"] = margin
    results_df["n"] = n
    results_df["confidence_level"] = confidence_level
    
    return results_df

In [9]:


def multi_evaluator_table(project_name, eval_run_ids, method_order, model_order, all_metrics, method_map):  
    # Load and process each dataframe
    processed_dfs = []
    for eval_run_id in eval_run_ids:
        results_path = Path("../results") / project_name / "aggregated_results" / eval_run_id / f"evaluation_{eval_run_id}.csv"
        df = pd.read_csv(results_path)
        
        # Apply method mapping
        df["method"] = df["method"].replace(method_map)
        
        # Process each available metric
        metric_results = []
        for metric in all_metrics:
            required_cols = [f"values_{metric}", f"std_{metric}", f"avg_{metric}"]
            if all(col in df.columns for col in required_cols):
                # Parse values
                df[f"values_{metric}"] = df[f"values_{metric}"].apply(fully_robust_parser)
                
                # Compute confidence intervals
                result_df = compute_ci(df, metric, alpha=0.05)
                
                # Add source column
                result_df["source"] = eval_run_id
                
                metric_results.append(result_df)
        
        if metric_results:
            # Combine all metrics for this source
            source_df = pd.concat(metric_results, ignore_index=True)
            processed_dfs.append(source_df)

    # Combine all processed dataframes
    if processed_dfs:
        combined_df = pd.concat(processed_dfs, ignore_index=True)
        
        # Create a unique identifier for each metric-source combination
        combined_df["metric_source"] = combined_df["metric"] + "_" + combined_df["source"]
        
        # Convert method and model to categorical with specified order
        combined_df["method"] = pd.Categorical(combined_df["method"], categories=method_order, ordered=True)
        combined_df["model"] = pd.Categorical(combined_df["model"], categories=model_order, ordered=True)
        
        # Sort the dataframe before pivoting to ensure order is preserved
        combined_df = combined_df.sort_values(["method", "model"])
        
        # Pivot to get metrics from different sources as separate columns
        pivoted_df = combined_df.pivot_table(
            index=["method", "model"],
            columns="metric_source",
            values=["avg", "ci_error"]
        )
        
        # Flatten column names
        pivoted_df.columns = [f"{stat}_{col}" for stat, col in pivoted_df.columns]
        
        # Reset index to get a regular dataframe
        final_df = pivoted_df.reset_index()
        
        # Ensure the categorical order is preserved after reset_index
        final_df["method"] = pd.Categorical(final_df["method"], categories=method_order, ordered=True)
        final_df["model"] = pd.Categorical(final_df["model"], categories=model_order, ordered=True)
        
        # Sort the final dataframe
        final_df = final_df.sort_values(["method", "model"])
        
        print(f"Successfully created merged dataframe with {len(final_df)} rows")
        print(f"Columns in final dataframe: {final_df.columns.tolist()}")
        
        return final_df
        
        # Now you can use final_df for your analysis
    else:
        print("No data to process - check your dataframes")

In [None]:
project_name = "malicious_instruct"

eval_run_ids = [
    "qwen7B-v2",
    "baseline",
]

# Define all possible metrics
all_metrics = ["refusal", "attempt", "useful", "gen_time"]

# Define the order for methods and models
METHOD_1 = "Baseline"
METHOD_2 = "Prefix Injection"
METHOD_3 = "Ablation"
METHOD_4 = "Ablation + Prefix Injection"
METHOD_5 = "Output Aware"

method_map = {
    "baseline": METHOD_1,
    "prefixinjection-3": METHOD_2,
    "diffinmeans": METHOD_3,
    "diffinmeans_prefixinjection-3": METHOD_4,
    #"outputaware": METHOD_5,
}

method_order = [
    METHOD_1,
    METHOD_2,
    METHOD_3,
    METHOD_4,
]

model_order = [
    "Qwen2-0.5B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-7B-Instruct",
    "Llama-3.1-8B-Instruct",
    "Mistral-7B-Instruct-v0.1"
]

df = multi_evaluator_table(project_name, eval_run_ids, method_order, model_order, all_metrics, method_map)

Successfully created merged dataframe with 24 rows
Columns in final dataframe: ['method', 'model', 'avg_attempt_qwen7B-v2', 'avg_gen_time_baseline', 'avg_gen_time_qwen7B-v2', 'avg_refusal_baseline', 'avg_refusal_qwen7B-v2', 'avg_useful_qwen7B-v2', 'ci_error_attempt_qwen7B-v2', 'ci_error_gen_time_baseline', 'ci_error_gen_time_qwen7B-v2', 'ci_error_refusal_baseline', 'ci_error_refusal_qwen7B-v2', 'ci_error_useful_qwen7B-v2']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df["metric"] = metric_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df["avg"] = avg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df["std"] = std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

In [40]:
def invert_metrics(df, metrics_to_invert=["refusal"]):
    # Create a copy of the dataframe to avoid modifying the original
    inverted_df = df.copy()
    
    # Find all columns related to the metrics to invert
    for metric in metrics_to_invert:
        # Find average columns for this metric
        avg_cols = [col for col in df.columns if col.startswith(f"avg_{metric}")]
        
        # Invert each average column (1 - value)
        for avg_col in avg_cols:
            inverted_df[avg_col] = 1 - df[avg_col]
            
            # Rename the column to indicate it's been inverted
            #new_col_name = avg_col.replace(f"avg_{metric}", f"avg_{metric}_inverted")
            #inverted_df = inverted_df.rename(columns={avg_col: new_col_name})
    
    return inverted_df

In [41]:
inverted = invert_metrics(df, metrics_to_invert=["refusal"])

In [42]:
inverted

Unnamed: 0,method,model,avg_attempt_qwen7B-v2,avg_gen_time_baseline,avg_gen_time_qwen7B-v2,avg_refusal_baseline,avg_refusal_qwen7B-v2,avg_useful_qwen7B-v2,ci_error_attempt_qwen7B-v2,ci_error_gen_time_baseline,ci_error_gen_time_qwen7B-v2,ci_error_refusal_baseline,ci_error_refusal_qwen7B-v2,ci_error_useful_qwen7B-v2
0,Baseline,Qwen2-0.5B-Instruct,0.404,0.109216,0.109216,0.33,0.378,0.026,0.04658,0.021717,0.021717,0.035496,0.030565,0.010518
1,Baseline,Qwen2.5-1.5B-Instruct,0.226,0.124257,0.124257,0.21,0.216,0.032,0.026354,0.024299,0.024299,0.024164,0.026931,0.006559
2,Baseline,Qwen2.5-3B-Instruct,0.932,0.155908,0.155908,0.192,0.26,0.424,0.027939,0.031809,0.031809,0.016067,0.019988,0.037022
3,Baseline,Qwen2.5-7B-Instruct,0.998,0.126445,0.126445,0.248,0.374,0.478,0.003506,0.025487,0.025487,0.021755,0.014242,0.027939
4,Baseline,Llama-3.1-8B-Instruct,0.538,0.13237,0.13237,0.884,0.038,0.038,0.039744,0.028288,0.028288,0.022586,0.003506,0.012882
5,Baseline,Mistral-7B-Instruct-v0.1,1.0,0.13478,0.13478,1.0,1.0,0.046,0.0,0.027983,0.027983,0.0,0.0,0.004294
6,Prefix Injection,Qwen2-0.5B-Instruct,0.996,0.113766,0.113766,0.976,0.996,0.008,0.004294,0.021429,0.021429,0.01189,0.004294,0.010222
7,Prefix Injection,Qwen2.5-1.5B-Instruct,0.814,0.124928,0.124928,0.684,0.73,0.034,0.020444,0.024552,0.024552,0.028591,0.02147,0.013119
8,Prefix Injection,Qwen2.5-3B-Instruct,1.0,0.155851,0.155851,0.986,1.0,0.03,0.0,0.031491,0.031491,0.008939,0.0,0.005544
9,Prefix Injection,Qwen2.5-7B-Instruct,1.0,0.129088,0.129088,0.942,0.984,0.074,0.0,0.025471,0.025471,0.010222,0.010518,0.022586


In [44]:
def rename_source_in_columns(df, old_source, new_source):
    renamed_columns = {col: col.replace(old_source, new_source) for col in df.columns if old_source in col}
    return df.rename(columns=renamed_columns)


In [55]:
import os
import pandas as pd
from pathlib import Path

def save_all_metrics_formatted_table(df, metrics, sources, output_path):
    """
    Format a dataframe with mean ± ci_error for all specified metrics and save as CSV
    
    Args:
        df: The dataframe from multi_evaluator_table
        metrics: List of metrics to extract (e.g., ["attempt", "refusal"])
        sources: List of sources (e.g., ["qwen7B-v2", "baseline"])
        output_path: Path to save the CSV file
    """
    # Create a new dataframe with method and model columns
    formatted_df = df[["method", "model"]].copy()
    
    # For each metric and source combination
    for metric in metrics:
        for source in sources:
            # Find the corresponding average and ci_error columns
            avg_col = f"avg_{metric}_{source}"
            ci_col = f"ci_error_{metric}_{source}"
            
            # Check if both columns exist in the dataframe
            if avg_col in df.columns and ci_col in df.columns:
                # Format the values as "mean ± ci_error" with 3 decimal places
                col_name = f"{metric}_{source}"
                if "gen_time" in metric:
                    formatted_df[col_name] = df.apply(
                        lambda row: f"{row[avg_col]*100:.0f} ± {row[ci_col]*100:.0f}", 
                        axis=1
                    )
                else:
                    formatted_df[col_name] = df.apply(
                        lambda row: f"{row[avg_col]*100:.1f}% ± {row[ci_col]*100:.1f}%", 
                        axis=1
                    )
    
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    # Save to CSV
    formatted_df.to_csv(output_path, index=False)
    print(f"Saved formatted table to {output_path}")
    
    return formatted_df

# Example usage:
# Get the dataframe from multi_evaluator_table
df = multi_evaluator_table(project_name, eval_run_ids, method_order, model_order, all_metrics)

df = invert_metrics(df, metrics_to_invert=["refusal"])

df = rename_source_in_columns(df, "refusal", "ASR")

metrics_for_table = [metric.replace("refusal", "ASR") if metric == "refusal" else metric for metric in all_metrics]

# Format and save the table with all metrics
output_path = 'results/tables/all_metrics_table.csv'
formatted_df = save_all_metrics_formatted_table(
    df, 
    metrics=metrics_for_table,
    sources=eval_run_ids,
    output_path=output_path
)

# Display the formatted dataframe
formatted_df

Successfully created merged dataframe with 24 rows
Columns in final dataframe: ['method', 'model', 'avg_attempt_qwen7B-v2', 'avg_gen_time_baseline', 'avg_gen_time_qwen7B-v2', 'avg_refusal_baseline', 'avg_refusal_qwen7B-v2', 'avg_useful_qwen7B-v2', 'ci_error_attempt_qwen7B-v2', 'ci_error_gen_time_baseline', 'ci_error_gen_time_qwen7B-v2', 'ci_error_refusal_baseline', 'ci_error_refusal_qwen7B-v2', 'ci_error_useful_qwen7B-v2']
Saved formatted table to examples/tables/all_metrics_table.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df["metric"] = metric_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df["avg"] = avg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df["std"] = std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Unnamed: 0,method,model,ASR_qwen7B-v2,ASR_baseline,attempt_qwen7B-v2,useful_qwen7B-v2,gen_time_qwen7B-v2,gen_time_baseline
0,Baseline,Qwen2-0.5B-Instruct,37.8% ± 3.1%,33.0% ± 3.5%,40.4% ± 4.7%,2.6% ± 1.1%,11 ± 2,11 ± 2
1,Baseline,Qwen2.5-1.5B-Instruct,21.6% ± 2.7%,21.0% ± 2.4%,22.6% ± 2.6%,3.2% ± 0.7%,12 ± 2,12 ± 2
2,Baseline,Qwen2.5-3B-Instruct,26.0% ± 2.0%,19.2% ± 1.6%,93.2% ± 2.8%,42.4% ± 3.7%,16 ± 3,16 ± 3
3,Baseline,Qwen2.5-7B-Instruct,37.4% ± 1.4%,24.8% ± 2.2%,99.8% ± 0.4%,47.8% ± 2.8%,13 ± 3,13 ± 3
4,Baseline,Llama-3.1-8B-Instruct,3.8% ± 0.4%,88.4% ± 2.3%,53.8% ± 4.0%,3.8% ± 1.3%,13 ± 3,13 ± 3
5,Baseline,Mistral-7B-Instruct-v0.1,100.0% ± 0.0%,100.0% ± 0.0%,100.0% ± 0.0%,4.6% ± 0.4%,13 ± 3,13 ± 3
6,Prefix Injection,Qwen2-0.5B-Instruct,99.6% ± 0.4%,97.6% ± 1.2%,99.6% ± 0.4%,0.8% ± 1.0%,11 ± 2,11 ± 2
7,Prefix Injection,Qwen2.5-1.5B-Instruct,73.0% ± 2.1%,68.4% ± 2.9%,81.4% ± 2.0%,3.4% ± 1.3%,12 ± 2,12 ± 2
8,Prefix Injection,Qwen2.5-3B-Instruct,100.0% ± 0.0%,98.6% ± 0.9%,100.0% ± 0.0%,3.0% ± 0.6%,16 ± 3,16 ± 3
9,Prefix Injection,Qwen2.5-7B-Instruct,98.4% ± 1.1%,94.2% ± 1.0%,100.0% ± 0.0%,7.4% ± 2.3%,13 ± 3,13 ± 3


In [18]:
for i, col in enumerate(df.columns):
    print(f"{i}: {col}")

0: method
1: model
2: avg_attempt_qwen7B-v2
3: avg_gen_time_baseline
4: avg_gen_time_qwen7B-v2
5: avg_refusal_baseline
6: avg_refusal_qwen7B-v2
7: avg_useful_qwen7B-v2
8: ci_error_attempt_qwen7B-v2
9: ci_error_gen_time_baseline
10: ci_error_gen_time_qwen7B-v2
11: ci_error_refusal_baseline
12: ci_error_refusal_qwen7B-v2
13: ci_error_useful_qwen7B-v2


In [25]:

metric = "attempt"

metric_idx = [i for i, col in enumerate(df.columns) if (metric in col)]

column_idx = [0,1] + metric_idx
column_idx = metric_idx

result_df = df[[df.columns[idx] for idx in column_idx]]

print(result_df.to_string(index=False))

 avg_attempt_qwen7B-v2  ci_error_attempt_qwen7B-v2
                 0.404                    0.046580
                 0.226                    0.026354
                 0.932                    0.027939
                 0.998                    0.003506
                 0.538                    0.039744
                 1.000                    0.000000
                 0.996                    0.004294
                 0.814                    0.020444
                 1.000                    0.000000
                 1.000                    0.000000
                 0.298                    0.038963
                 1.000                    0.000000
                 0.942                    0.010222
                 0.962                    0.015080
                 1.000                    0.000000
                 1.000                    0.000000
                 0.966                    0.013119
                 0.996                    0.004294
                 0.950         

In [30]:
import os

# Create a new dataframe with formatted values
formatted_df = pd.DataFrame()

# For each column in your result dataframe
for col in df.columns:
    # Format the values as "mean ± std"
    # Assuming you have mean and std values or can calculate them
    mean_values = result_df[col].mean()
    std_values = result_df[col].std()
    
    # Format to 3 decimal places
    formatted_df[col] = f"{mean_values:.3f} ± {std_values:.3f}"

# Save to CSV without index
path = 'examples/tables/ASR_baseline_table.csv'
os.makedirs(os.path.dirname(path), exist_ok=True)
formatted_df.to_csv(path, index=False)

# Display the formatted dataframe
formatted_df

KeyError: 'method'