# Running deception evals

Contents
1. Load and parse data
1. Run evals
1. Manually look at the results

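Note: you can skip the first two parts and go straight to manually looking at previously created results. You still need to run the imports cell, because the last part uses `os` and `json`.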

In [None]:
import json
import os

from utils import create_inspect_dataset_from_deception_dataset
from utils import append_completion
from utils import model_graded_deception

from inspect_ai.solver import generate
from inspect_ai import Task, eval, task

## Load and parse data

In [None]:
# Load the raw deception dataset from disk.
dataset_path = os.path.join("data", "lovkush_experiment_01.json")
# JSON is UTF-8 by specification; pass the encoding explicitly instead of
# relying on the platform's locale default.
with open(dataset_path, encoding="utf-8") as f:
    data_json = json.load(f)

# manually inspect one of the entries
data_json[0]['variations'][0]['prompt'][0]['text']

In [None]:
# Convert the raw JSON entries into the dataset object that the Task below consumes.
dataset = create_inspect_dataset_from_deception_dataset(data_json)
# Display the first sample as a quick sanity check on the conversion.
dataset[0]

## Run evals

In [None]:
# Candidate models, written as "<provider>/<model>" strings.
# The eval cell selects one of these by its index in this list.
model_names = [
    "anthropic/claude-3-haiku-20240307",
    "anthropic/claude-3-5-sonnet-20240620",
    "openai/gpt-3.5-turbo",
    "openai/gpt-4",
]

In [None]:
# Scoring is switched off by default (scorer=None). To have the models' outputs
# graded automatically, use the commented-out model_graded_deception scorer
# inside deception_task in place of None.

@task
def deception_task():
    """Build the deception eval task.

    The plan runs append_completion(completion_type='deceptive') followed by
    generate() over the module-level `dataset`. No scorer is attached.
    """
    solver_steps = [
        append_completion(completion_type='deceptive'),
        generate(),
    ]
    return Task(
        dataset=dataset,
        plan=solver_steps,
        # scorer=model_graded_deception(model="anthropic/claude-3-5-sonnet-20240620")
        scorer=None,
    )

# Run the task against a single model, chosen by its index in model_names.
logs = eval(
    tasks=deception_task(),
    model=model_names[3],
)

## Manually look at the results

After running the above cell, Inspect automatically creates a log file in the `logs` directory. I manually renamed these files and moved them to the `results` folder.

In [None]:
directory_results = os.path.join("results", "experiment01")
# os.listdir returns entries in arbitrary order; sort them so the indices
# printed here (and any later files[<index>] lookup) are stable across runs
# and machines.
files = sorted(os.listdir(directory_results))

for i, file in enumerate(files):
    print(f"File {i}: {file}")

In [None]:
# Choose which results file to inspect by its index in `files`.
results_path = os.path.join(directory_results, files[0])
# Use a context manager so the file handle is closed as soon as parsing is
# done, and read explicitly as UTF-8 rather than the platform default.
with open(results_path, encoding="utf-8") as f:
    results = json.load(f)

In [None]:
# Print every message of every sample, hard-wrapped at a fixed width so long
# contents stay readable in the notebook output.
chunk_width = 150

for sample in results['samples']:
    print(f"id: {sample['id']}")
    for message in sample['messages']:
        content = message['content']
        # Step through the content by chunk_width. Unlike counting chunks with
        # len(content)//chunk_width + 1, this adds no trailing empty chunk (and
        # hence no spurious blank line) when the length is an exact multiple
        # of the width.
        chunks = [
            content[start:start + chunk_width]
            for start in range(0, len(content), chunk_width)
        ]
        print('\n'.join(chunks))
    print()