In [1]:
from dotenv import load_dotenv
from inspect_ai import Task, task
from inspect_ai.dataset import json_dataset
from inspect_ai.scorer import includes
from inspect_ai.solver import basic_agent, system_message
from inspect_ai.tool import bash, python


from inspect_ai import Task, task, eval
from inspect_ai.dataset import example_dataset
from inspect_ai.scorer import model_graded_fact
from inspect_ai.solver import chain_of_thought, generate, self_critique

from inspect_ai.log import EvalSample, EvalLog
from inspect_ai.model import ChatMessageUser, ChatMessageAssistant

In [2]:
# set openai key, using .env file
load_dotenv()

True

# Run basic test of Inspect on simple dataset

In [3]:
dataset = example_dataset("theory_of_mind")[0:3]
scorer = model_graded_fact(model="openai/gpt-4o-mini")

@task
def theory_of_mind(model: str):
    return Task(
        dataset=dataset,
        plan=[
            chain_of_thought(),
            generate(),
            self_critique(model=model),
        ],
        scorer=scorer,
    )

In [4]:
model = "openai/gpt-4o-mini"
logs = eval(
    theory_of_mind(model),
    model=model,
)

Output()

In [5]:
# You do not need to read this cell. It just defines functions to parse the main information from the logs.
def print_info_from_sample(sample: EvalSample):
    print("---MESSAGE HISTORY---")
    for message in sample.messages:
        # Check if the message is from the user or the assistant and print accordingly.
        # For example print "User: <users message>" if the message is from the user.
        if isinstance(message, ChatMessageUser):
            print(f"User: {message.content}" + "\n")
        elif isinstance(message, ChatMessageAssistant):
            print(f"Assistant: {message.content}" + "\n")
        print("-" * 20)

    print("--SCORER INFORMATION---")
    # Print the score and explanation from the scorer.
    value = sample.scores["model_graded_fact"].value
    print(f"Score: {'correct' if value=='C' else 'incorrect'}")
    print(f"Explanation: {sample.scores['model_graded_fact'].explanation}")


def print_info_from_logs(logs: list[EvalLog]):
    for i, sample in enumerate(logs[0].samples):
        print(f"===START OF SAMPLE {i}===")
        print_info_from_sample(sample)
        print(f"===END OF SAMPLE {i}===")

In [6]:
print_info_from_logs(logs)

===START OF SAMPLE 0===
---MESSAGE HISTORY---
User: 
Jackson entered the hall. Chloe entered the hall. The boots is in the bathtub. Jackson exited the hall. Jackson entered the dining_room. Chloe moved the boots to the pantry. Where was the boots at the beginning?

Before answering, reason in a step-by-step manner as to get the right answer. Provide your answer at the end on its own line in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question.


--------------------
Assistant: Let's break down the information step by step:

1. **Initial state regarding the boots:** The problem states, "The boots is in the bathtub." This indicates that at the very beginning, the boots were located in the bathtub.

2. **Sequence of events:** 
   - Jackson entered the hall.
   - Chloe entered the hall.
   - After entering the hall, the boots remain in the bathtub (the location of the boots does not change during this announcement).
   - Jackson then exited the hall.
   -

# Try basic agent

In [25]:
import hashlib

dataset_list_of_dict = [
    {
        "prompt": "Calculate the SHA-256 hash of the string 'aisc'",
        "answer": hashlib.sha256("aisc".encode()).hexdigest(),
    },
    {
        "prompt": "What is the capital of France?",
        "answer": "Paris",
    },
]

from inspect_ai.dataset import Sample, MemoryDataset

sample_list = [
    Sample(
        input=[ChatMessageUser(content=sample["prompt"])],
        target=sample["answer"],
    )
    for sample in dataset_list_of_dict
]

dataset = MemoryDataset(sample_list)

In [32]:
print(dataset[0])

input=[ChatMessageUser(content="Calculate the SHA-256 hash of the string 'aisc'", source=None, role='user', tool_call_id=None)] choices=None target='d2dba0c534d2612844facdf60b2ace9ecb66192973a079102a8f5f163f8bc2f5' id=1 metadata=None sandbox=None files=None setup=None


In [29]:
@task
def basic_agent_task():
    return Task(
        dataset=dataset,
        solver=basic_agent(
            tools=[bash(timeout=180), python(timeout=180)],
            max_attempts=2,
            message_limit=5,
        ),
        scorer=includes(),
        sandbox="docker",
    )

model = "openai/gpt-4o"

logs = eval(basic_agent_task(), model=model)

Output()

 default Skipped - Image is already present locally 





In [30]:
for sample in logs[0].samples:
    for message in sample.messages:
        if isinstance(message, ChatMessageUser):
            print(f"User: {message.content}" + "\n")
        elif isinstance(message, ChatMessageAssistant):
            print(f"Assistant: {message.content}" + "\n")
        else:
            print(f"System: {message.content}" + "\n")
        print("-" * 20)
    print("-" * 20)
    print("-" * 20)

System: 
You are a helpful assistant attempting to submit the correct answer. You have
several functions available to help with finding the answer. Each message may
may perform one function call. You will see the result of the function right
after sending the message. If you need to perform multiple actions, you can
always send more messages with subsequent function calls. Do some reasoning
before your actions, describing what function calls you are going to use and
how they fit into your plan.

When you have completed the task and have an answer, call the submit()
function to report it.


--------------------
User: Calculate the SHA-256 hash of the string 'aisc'

--------------------
Assistant: 

--------------------
System: d2dba0c534d2612844facdf60b2ace9ecb66192973a079102a8f5f163f8bc2f5


--------------------
Assistant: 

--------------------
--------------------
--------------------
System: 
You are a helpful assistant attempting to submit the correct answer. You have
several funct