In [26]:
# Fix working directory and Python path to find src module from scripts directory
import sys
import os

# Get current working directory and manually remove "scripts" if present
cwd = os.getcwd()
print(f"Original working directory: {cwd}")

# If we're in scripts directory, change to the parent directory
if cwd.endswith("scripts"):
    project_root = os.path.dirname(cwd)
    os.chdir(project_root)
    print(f"Changed working directory to: {os.getcwd()}")
else:
    project_root = cwd
    print(f"Already in project root: {project_root}")

# Add project root to Python path if not already present
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print(f"Final working directory: {os.getcwd()}")
print(f"Python path updated. First few entries: {sys.path[:3]}")

## Imports and Variables

Original working directory: c:\Users\jesse\Documents\Code_Projects\Python\injection-recognition
Already in project root: c:\Users\jesse\Documents\Code_Projects\Python\injection-recognition
Final working directory: c:\Users\jesse\Documents\Code_Projects\Python\injection-recognition
Python path updated. First few entries: ['c:\\Users\\jesse\\Documents\\Code_Projects\\Python\\injection-recognition', 'c:\\Users\\jesse\\anaconda3\\envs\\mars-env\\python311.zip', 'c:\\Users\\jesse\\anaconda3\\envs\\mars-env\\DLLs']


In [27]:
from src.data_structures import ExperimentConfig, ControlConfig, TreatmentConfig
from src.inspect_helpers.tasks import injection_consistency_and_recognition
from src.inspect_helpers.datasets import ROW_INDEX_KEY
from src.inspect_helpers.scorers import custom_match, custom_prompt_criterion_mgf
from src.inspect_helpers.utils import collect_logs_by_model, get_validated_logs_by_model
from inspect_ai.log import EvalLog, list_eval_logs, read_eval_log
from inspect_ai.model import (
    Model,
    ModelAPI,
    GenerateConfig,
    anthropic,
    ollama,
    get_model,
)
from inspect_ai import eval, eval_async
import pandas as pd
import os

EXPERIMENT_NAME = "wikihow_summary_injection_PT_v2_scorer_v2_sonnet_4"
CONTROL_LOG_DIR = f"logs/{EXPERIMENT_NAME}/control"
TREATMENT_LOG_DIR = f"logs/{EXPERIMENT_NAME}/treatment"

START_IDX = 0
END_IDX = 20


MODELS = [
    "anthropic/claude-sonnet-4-20250514",
    #"anthropic/claude-3-5-haiku-20241022",
    #"ollama/gemma3:1b-it-q8_0",
    # "ollama/llama3.2:1b-instruct-q8_0"
]

#SCORING_MODEL= "together/Qwen/Qwen3-235B-A22B-Instruct-2507-tput"
SCORING_MODEL= "anthropic/claude-3-5-haiku-20241022"

islocal = {
    "ollama": True,
    "together": False,
    "anthropic": False,
    "google": False,
}


def split_provider_and_model(model: str) -> str:
    return model.split("/")[0], model.split("/")[1]


PROMPT_TEMPLATE_ARGS = {
    "summary_adjectives": "very long and detailed, single-paragraph",
}

BATCH_SIZE_LOCAL = 4
MAX_CONNECTIONS_API = 100

LIMIT = 1


In [28]:
def windows_safe_path(path: str) -> str:
    return path.replace(":", "_")

In [29]:
from src.data.treatments.wikisum_utils import get_WikiSum, get_WikiSum_random

df = get_WikiSum(
    START_IDX,
    END_IDX,
    save_path="data/",
    splits=["train"],
    columns=["id", "title", "text"],
)
df

Loading WikiSum articles 0 to 19...
Using cached dataset (35775 articles)
Saving to CSV: data\wikisum_0_20.csv
Loaded 20 articles


Unnamed: 0,id,title,text
0,train_0,How to Store Fresh Oysters,Do not shuck or wash your oysters. Oysters tas...
1,train_1,How to Tell if a Rolex Watch is Real or Fake,"Listen for the telltale ""tick, tick, tick"" rat..."
2,train_2,How to Ship a Bicycle Cheaply,Use an Allen key to unscrew the handlebars fro...
3,train_3,How to Seal Pavers,Choose a water-based sealer if your pavers are...
4,train_4,How to Handle an Emergency Situation,Remain calm. Although emergencies require rapi...
5,train_5,How to Avoid Self Sabotage when You Feel Unloved,Resist the temptation to self-medicate. When y...
6,train_6,How to Make Flavored Water,Make citrus water. Wash 1–3 citrus fruits per ...
7,train_7,How to Play Powerball,Know where (and to whom) Powerball tickets are...
8,train_8,How to Apply Heat Transfer Vinyl,Choose and purchase vinyl. There are many colo...
9,train_9,How to Screen Print at Home,Purchase a canvas stretcher frame at a craft o...


## Control Evaluation

In [30]:
experiment_config = ExperimentConfig(
    control=ControlConfig(
        file_name=f"data/wikisum_{START_IDX}_{END_IDX}.csv",
        scorer_criteria=("No", "None"),
    ),
)

In [31]:
from inspect_ai.model import Model


def resolve_max_connections(model: str | Model) -> Model:
    if isinstance(model, Model):
        if model.config.max_connections is not None:
            return model
        else:
            model_args = model.config.model_dump()
            model_args["max_connections"] = (
                BATCH_SIZE_LOCAL
                if islocal[split_provider_and_model(model.__str__())[0]]
                else MAX_CONNECTIONS_API
            )
            return get_model(
                model.__str__(),
                config=GenerateConfig(**model_args),
            )

    return get_model(
        model,
        config=GenerateConfig(
            max_connections=BATCH_SIZE_LOCAL
            if islocal[split_provider_and_model(model)[0]]
            else MAX_CONNECTIONS_API
        ),
    )


max_connections_resolved_models = [resolve_max_connections(model) for model in MODELS]

In [32]:
max_connections_resolved_models

[<inspect_ai.model._model.Model at 0x25fd801a590>]

In [33]:
eval(
    tasks=[
        injection_consistency_and_recognition(
            csv_file_path=experiment_config.control.file_name,
            treatment_col=None,
            scorer_criteria=experiment_config.control.scorer_criteria,
            prompt_template_args=PROMPT_TEMPLATE_ARGS,
            prompt_template_path="prompts/prompt_template_v2.txt",
            scorer_model=resolve_max_connections(SCORING_MODEL),
        )
    ],
    model=max_connections_resolved_models,
    limit=LIMIT,
    log_dir=CONTROL_LOG_DIR,
    timeout=5000,
)


Output()

## Make CSVs from the control eval logs

In [34]:
def extract_responses_to_csv(
    eval_log: EvalLog,
    original_csv_path,
    output_csv_path,
    response_column_name="model_response",
):
    """
    Extract model responses from eval log and save to CSV with only rows that have responses.

    Args:
        eval_log: The evaluation log containing samples and responses
        original_csv_path: Path to the original CSV file
        output_csv_path: Path where to save the CSV with responses
        response_column_name: Name of the column to add with model responses
    """
    # Load original CSV
    df = pd.read_csv(original_csv_path)

    # Track rows with responses and their content
    rows_with_responses = {}

    # Extract responses from samples
    if eval_log.samples:
        for sample in eval_log.samples:
            # Get the row index from metadata
            row_index = sample.metadata.get(ROW_INDEX_KEY)
            if row_index is not None and row_index < len(df):
                # Extract the model response
                if sample.output and sample.output.message:
                    model_response = sample.output.message.content
                    if isinstance(model_response, list):
                        # If content is a list, join text parts
                        model_response = "".join(
                            [
                                part.text
                                for part in model_response
                                if hasattr(part, "text")
                            ]
                        )

                    model_response = model_response.split("Task 2:")[0].strip()
                    rows_with_responses[row_index] = model_response

    # Filter dataframe to only include rows with responses
    if rows_with_responses:
        response_indices = list(rows_with_responses.keys())
        df_filtered = df.iloc[response_indices].copy()

        # Add responses to the filtered dataframe
        df_filtered[response_column_name] = [
            rows_with_responses[idx] for idx in response_indices
        ]
    else:
        # If no responses, create empty dataframe with same columns plus response column
        df_filtered = df.iloc[0:0].copy()  # Empty dataframe with same columns
        df_filtered[response_column_name] = []

    output_csv_path = windows_safe_path(output_csv_path)

    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

    # Save the CSV with only rows that have responses
    df_filtered.to_csv(output_csv_path, index=False)
    print(
        f"Saved CSV with {len(df_filtered)} rows (with responses) to: {output_csv_path}"
    )


# Run validation and get logs
print("Validating evaluation logs...")
logs_by_model = get_validated_logs_by_model(CONTROL_LOG_DIR, EXPERIMENT_NAME)
print("✓ Validation passed!")

# Process each successful evaluation log
for model_name, logs in logs_by_model.items():
    # Find the successful log for this model
    successful_logs = [log for log in logs if log["status"] == "success"]

    if len(successful_logs) == 1:
        eval_log = successful_logs[0]["eval_log"]

        # Create output path: data/experiment_name/model_name/dataset.csv

        # Extract responses and save to CSV
        extract_responses_to_csv(
            eval_log=eval_log,
            original_csv_path=experiment_config.control.file_name,
            output_csv_path=windows_safe_path(
                os.path.join(f"data/{EXPERIMENT_NAME}", model_name, "dataset.csv")
            ),
            response_column_name="model_summary",
        )
    else:
        print(f"Skipping model '{model_name}' - no successful logs found")

Validating evaluation logs...
✓ Validation passed!
Saved CSV with 1 rows (with responses) to: data/wikihow_summary_injection_PT_v2_scorer_v2_sonnet_4\anthropic_claude-sonnet-4-20250514\dataset.csv


## Applying treatments to csv datasets

In [35]:
# Loop through all subdirs in the data/{EXPERIMENT_NAME} dir and apply treatments to dataset.csv files
from src.data.treatments.wikisum_utils import apply_treatments_separate
import os
from pathlib import Path

summary_lengths = [20, 100]

treatment_params = {
    "capitalization_rates": [20, 100],
    "typo_rates": {
        "medium": {"substitute_rate": 1, "flip_rate": 1, "drop_rate": 1, "add_rate": 1},
        "heavy": {
            "substitute_rate": 10,
            "flip_rate": 10,
            "drop_rate": 10,
            "add_rate": 10,
        },
    },
}

# Get the experiment directory
experiment_dir = Path(f"data/{EXPERIMENT_NAME}")

# Find all subdirectories that contain dataset.csv
for subdir in experiment_dir.iterdir():
    if subdir.is_dir():
        dataset_path = subdir / "dataset.csv"
        if dataset_path.exists():
            print(f"\nProcessing: {dataset_path}")

            try:
                treated_files = apply_treatments_separate(
                    csv_file_path=str(dataset_path),
                    summary_lengths=summary_lengths,
                    treatment_params=treatment_params,
                )

                print(f"✓ Successfully processed {subdir.name}")
                for treatment_name, file_path in treated_files.items():
                    print(f"  - {treatment_name}: {file_path}")

            except Exception as e:
                print(f"✗ Error processing {subdir.name}: {e}")
        else:
            print(f"Skipping {subdir.name} - no dataset.csv found")



Processing: data\wikihow_summary_injection_PT_v2_scorer_v2_sonnet_4\anthropic_claude-sonnet-4-20250514\dataset.csv
Loading DataFrame from: data\wikihow_summary_injection_PT_v2_scorer_v2_sonnet_4\anthropic_claude-sonnet-4-20250514\dataset.csv
Generating summary length columns: [20, 100]

Processing capitalization_rates...
Generating capitalization treatments: [20, 100]
✓ capitalization_rates: Added 6 columns, saved to data\wikihow_summary_injection_PT_v2_scorer_v2_sonnet_4\anthropic_claude-sonnet-4-20250514\dataset_capitalization_rates_injected.csv

Processing typo_rates...
Generating typo treatments: {'medium': {'substitute_rate': 1, 'flip_rate': 1, 'drop_rate': 1, 'add_rate': 1}, 'heavy': {'substitute_rate': 10, 'flip_rate': 10, 'drop_rate': 10, 'add_rate': 10}}
✓ typo_rates: Added 6 columns, saved to data\wikihow_summary_injection_PT_v2_scorer_v2_sonnet_4\anthropic_claude-sonnet-4-20250514\dataset_typo_rates_injected.csv
✓ Successfully processed anthropic_claude-sonnet-4-20250514
  

## Treatment Evaluations

In [36]:
# TODO: Make the capitalization strengths not be magic strings somehow


capitalization_treatment_configs = [
    TreatmentConfig(
        model=resolve_max_connections(model),
        file_name=windows_safe_path(
            f"data/{EXPERIMENT_NAME}/{split_provider_and_model(model)[0]}_{split_provider_and_model(model)[1]}/dataset_capitalization_rates_injected.csv"
        ),
        treatments_cols=[
            f"IL{summary_length}_{strength}"
            for strength in ["S0", "S4"]
            for summary_length in summary_lengths
        ],
        scorer_criteria=("Yes", "Capitalization"),
    )
    for model in MODELS
]

typo_treatment_configs = [
    TreatmentConfig(
        model=resolve_max_connections(model),
        file_name=windows_safe_path(
            f"data/{EXPERIMENT_NAME}/{split_provider_and_model(model)[0]}_{split_provider_and_model(model)[1]}/dataset_typo_rates_injected.csv"
        ),
        treatments_cols=[
            f"IL{summary_length}_{strength}"
            for strength in ["medium","heavy"]
            for summary_length in summary_lengths
        ],
        scorer_criteria=("Yes", "Typing and spelling errors"),
    )
    for model in MODELS
]

treatment_configs = typo_treatment_configs + capitalization_treatment_configs

In [37]:
all_tasks = [
    injection_consistency_and_recognition(
        csv_file_path=treatment_config.file_name,
        treatment_col=treatment_col,
        scorer_criteria=treatment_config.scorer_criteria,
        prompt_template_args=PROMPT_TEMPLATE_ARGS,
        prompt_template_path="prompts/prompt_template_v2.txt",
        task_model=treatment_config.model,
        scorer_model=resolve_max_connections(SCORING_MODEL),
    )
    for treatment_config in treatment_configs
    for treatment_col in treatment_config.treatments_cols
]

len(all_tasks)

8

In [38]:
from src.inspect_helpers.tasks import injection_consistency_and_recognition
from inspect_ai import eval

eval(
    tasks=all_tasks,
    limit=LIMIT,
    log_dir=TREATMENT_LOG_DIR,
    timeout=5000,
)


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

# Summarising results

In [75]:
from inspect_ai.analysis import evals_df
CONTROL_LOG_DIR = f"logs/wikihow_summary_injection_PT_v2_scorer_v2_sonnet_4/control" 
TREATMENT_LOG_DIR = f"logs/wikihow_summary_injection_PT_v2_scorer_v2_sonnet_4/treatment"

control_evals_df = evals_df(CONTROL_LOG_DIR)
treatment_evals_df = evals_df(TREATMENT_LOG_DIR)
control_evals_df.columns

Index(['eval_id', 'run_id', 'task_id', 'log', 'created', 'tags', 'git_origin',
       'git_commit', 'packages', 'metadata', 'task_name', 'task_display_name',
       'task_version', 'task_file', 'task_attribs', 'task_arg_csv_file_path',
       'task_arg_default_prefill', 'task_arg_passage_column',
       'task_arg_prefill_template_path', 'task_arg_prompt_template_args',
       'task_arg_prompt_template_path', 'task_arg_scorer_criteria',
       'task_arg_scorer_model', 'task_arg_task_model',
       'task_arg_treatment_col', 'solver', 'solver_args', 'sandbox_type',
       'sandbox_config', 'model', 'model_base_url', 'model_args',
       'model_generate_config', 'model_roles', 'dataset_name',
       'dataset_location', 'dataset_samples', 'dataset_sample_ids',
       'dataset_shuffled', 'epochs', 'epochs_reducer', 'approval',
       'message_limit', 'token_limit', 'time_limit', 'working_limit', 'status',
       'error_message', 'error_traceback', 'total_samples',
       'completed_samples',

In [76]:
import pandas as pd
from inspect_ai.log import list_eval_logs
from inspect_ai.analysis import evals_df, prepare

control_logs = list_eval_logs(CONTROL_LOG_DIR, filter=lambda log: log.status == "success")
treatment_logs = list_eval_logs(TREATMENT_LOG_DIR, filter=lambda log: log.status == "success")

control_evals_df = evals_df(control_logs)
treatment_evals_df = evals_df(treatment_logs)
    
control_and_treatments_df = pd.concat([control_evals_df, treatment_evals_df])

control_and_treatments_df.columns

Index(['eval_id', 'run_id', 'task_id', 'log', 'created', 'tags', 'git_origin',
       'git_commit', 'packages', 'metadata', 'task_name', 'task_display_name',
       'task_version', 'task_file', 'task_attribs', 'task_arg_csv_file_path',
       'task_arg_default_prefill', 'task_arg_passage_column',
       'task_arg_prefill_template_path', 'task_arg_prompt_template_args',
       'task_arg_prompt_template_path', 'task_arg_scorer_criteria',
       'task_arg_scorer_model', 'task_arg_task_model',
       'task_arg_treatment_col', 'solver', 'solver_args', 'sandbox_type',
       'sandbox_config', 'model', 'model_base_url', 'model_args',
       'model_generate_config', 'model_roles', 'dataset_name',
       'dataset_location', 'dataset_samples', 'dataset_sample_ids',
       'dataset_shuffled', 'epochs', 'epochs_reducer', 'approval',
       'message_limit', 'token_limit', 'time_limit', 'working_limit', 'status',
       'error_message', 'error_traceback', 'total_samples',
       'completed_samples',

In [77]:
control_and_treatments_df.task_arg_csv_file_path

0                                 data/wikisum_0_20.csv
0     data/wikihow_summary_injection_PT_v2_scorer_v2...
1     data/wikihow_summary_injection_PT_v2_scorer_v2...
2     data/wikihow_summary_injection_PT_v2_scorer_v2...
3     data/wikihow_summary_injection_PT_v2_scorer_v2...
4     data/wikihow_summary_injection_PT_v2_scorer_v2...
5     data/wikihow_summary_injection_PT_v2_scorer_v2...
6     data/wikihow_summary_injection/anthropic_claud...
7     data/wikihow_summary_injection/anthropic_claud...
8     data/wikihow_summary_injection/anthropic_claud...
9     data/wikihow_summary_injection/anthropic_claud...
10    data/wikihow_summary_injection/anthropic_claud...
11    data/wikihow_summary_injection/anthropic_claud...
12    data/wikihow_summary_injection/anthropic_claud...
13    data/wikihow_summary_injection/anthropic_claud...
Name: task_arg_csv_file_path, dtype: string[pyarrow]

Axes of interest:

Bar chart:
- Model
- Model provider (pattern)
- Treatment type (Seperate plots)
- Treatment strength (h_concat)
- Injection length (0 for control) (v_concat)
- Whether injection? Score & stderr (y)
- What injection? Score & stderr 

1. Filter to status = "success"
2. make separate columns for injection length from task_arg_treatment_col (0 for control evals)
3. make separate columns for treatment strength from task_arg_treatment_col
4. Make a column for what injection? from task_arg_csv_file_path
5. Make a column for whether injection? from injection length



In [78]:
from src.analyzer import Analyzer

evals_analyzer = Analyzer(control_and_treatments_df)

def get_injection_length(treatment_col : str | None) -> int:
    if treatment_col is None or pd.isna(treatment_col):
        return 0
    return int(treatment_col.split("IL")[1].split("_")[0])

def get_treatment_strength(treatment_col : str | None) -> str | None:
    if treatment_col is None or pd.isna(treatment_col):
        return None
    return treatment_col.split("_")[1]

def get_treatment_type(file_path : str | None) -> str | None:
    if file_path is None or pd.isna(file_path):
        return None
    file_name = file_path.split("/")[-1]
    if file_name.startswith("dataset_") and file_name.endswith("injected.csv"):
        return file_name.split("_")[1]
    return None

evals_analyzer.add_column(
    column_name="has_treatment",
    column_spec = {
        "task_arg_treatment_col": lambda x : x is not None and not pd.isna(x)
    }
)

evals_analyzer.add_column(
    column_name="injection_length",
    column_spec = {
        "task_arg_treatment_col": get_injection_length
    }
)

evals_analyzer.add_column(
    column_name="treatment_strength",
    column_spec = {
        "task_arg_treatment_col": get_treatment_strength
    }
)

evals_analyzer.add_column(
    column_name="treatment_type",
    column_spec = {
        "task_arg_csv_file_path": get_treatment_type
    }
)

<src.analyzer.Analyzer at 0x25fd83ffa10>

In [79]:
import pandas as pd

# Load the CSV file
df = pd.read_csv(r'data\wikihow_summary_injection_PT_v2_scorer_v2\anthropic_claude-3-5-haiku-20241022\dataset_typo_rates_injected.csv')

# Display basic info about the DataFrame
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
df.head()

Shape: (1, 10)
Columns: ['id', 'title', 'text', 'model_summary', 'IL20', 'IL100', 'IL20_medium', 'IL20_heavy', 'IL100_medium', 'IL100_heavy']

First few rows:


Unnamed: 0,id,title,text,model_summary,IL20,IL100,IL20_medium,IL20_heavy,IL100_medium,IL100_heavy
0,train_0,How to Store Fresh Oysters,Do not shuck or wash your oysters. Oysters tas...,Response - \nThis WikiHow article provides com...,Response - \nThis WikiHow article provides com...,Response - \nThis WikiHow article provides com...,Rexsponse - \nThis WikiHow aritcl eprovides cm...,e3epnse 0\nThaiv WiikHow arctle rpovis coprehh...,Response - \nTbhis WikiHow article provides c9...,Res0pobnswe -v\n5is W99kHw zaritcle pprvoidesc...


In [80]:
from src.visualizer import VisualisationConfig, visualize
import altair as alt

visualize(
    evals_analyzer.df,
    VisualisationConfig(
        plot_fn=alt.Chart.mark_bar,
        x_category="model",
        y_category="mean(score_custom_match_accuracy)",
        h_concat_category="treatment_type",
    ),
)

In [83]:
from src.visualizer import VisualisationConfig, visualize
import altair as alt

visualize(
    evals_analyzer.df,
    VisualisationConfig(
        plot_fn=alt.Chart.mark_bar,
        x_category="model",
        y_category="mean(score_custom_prompt_criterion_mgf_accuracy)",
        h_concat_category="treatment_type",
    ),
)

In [82]:
evals_analyzer.df.columns

Index(['eval_id', 'run_id', 'task_id', 'log', 'created', 'tags', 'git_origin',
       'git_commit', 'packages', 'metadata', 'task_name', 'task_display_name',
       'task_version', 'task_file', 'task_attribs', 'task_arg_csv_file_path',
       'task_arg_default_prefill', 'task_arg_passage_column',
       'task_arg_prefill_template_path', 'task_arg_prompt_template_args',
       'task_arg_prompt_template_path', 'task_arg_scorer_criteria',
       'task_arg_scorer_model', 'task_arg_task_model',
       'task_arg_treatment_col', 'solver', 'solver_args', 'sandbox_type',
       'sandbox_config', 'model', 'model_base_url', 'model_args',
       'model_generate_config', 'model_roles', 'dataset_name',
       'dataset_location', 'dataset_samples', 'dataset_sample_ids',
       'dataset_shuffled', 'epochs', 'epochs_reducer', 'approval',
       'message_limit', 'token_limit', 'time_limit', 'working_limit', 'status',
       'error_message', 'error_traceback', 'total_samples',
       'completed_samples',