Explanation

Contents

In [None]:
import json

from inspect_ai.model import ChatMessageUser, Model, get_model, ChatMessage, ChatMessageAssistant
from inspect_ai.dataset import Sample, json_dataset
from inspect_ai.solver import Generate, Solver, TaskState, solver
from inspect_ai.scorer import Scorer, Score, scorer, INCORRECT, Target, accuracy, bootstrap_std
from inspect_ai import Task, eval, task

In [None]:
def record_to_sample(record):
    """Convert one raw dataset record into a ``Sample``.

    Every entry of ``record['turns']`` is wrapped in its own
    ``ChatMessageUser`` (order preserved) and the resulting list becomes the
    sample input. The sample id is taken from ``record['question_id']``.
    """
    user_turns = []
    for text in record['turns']:
        user_turns.append(ChatMessageUser(content=text))
    return Sample(input=user_turns, id=record['question_id'])

# Load "question.jsonl", converting each record with record_to_sample.
dataset = json_dataset("question.jsonl", record_to_sample)

In [None]:
# Display the first sample to sanity-check the record conversion.
dataset[0]

In [None]:
# Display the last sample as well.
dataset[-1]

In [None]:
@solver
def generate() -> Solver:
    r"""Build a solver that performs exactly one generation step.

    The returned ``solve`` coroutine passes the task state to the
    ``generate`` callable it receives and returns the state that comes back,
    without touching it in any other way.
    """

    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # NOTE: inside ``solve`` the name ``generate`` is the callable passed
        # in as a parameter; it shadows this factory function's own name.
        updated_state = await generate(state)
        return updated_state

    return solve

@solver
def multi_dialogue_solver() -> Solver:
    r"""Build a solver that plays a multi-turn conversation one turn at a time.

    The sample input must be a list of ``ChatMessageUser``. The message
    history is reset, then for each user turn in order the turn is appended
    to ``state.messages`` and ``generate`` is awaited, so every user turn is
    followed by a model generation before the next turn is added.

    Raises (from the returned ``solve`` coroutine):
        TypeError: if the sample input is not a list, or contains anything
            other than ``ChatMessageUser`` instances.
    """
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # Read the sample input through the public ``input`` property rather
        # than the private ``_input`` attribute.
        turns = state.input

        # Single validation step: both failure modes raise the same error.
        if not isinstance(turns, list) or not all(
            isinstance(turn, ChatMessageUser) for turn in turns
        ):
            raise TypeError(f'Inputs in samples of the dataset should be list of ChatMessageUser. Found {turns}')

        # Start from an empty history so the conversation is rebuilt turn by
        # turn below. NOTE(review): presumably the framework pre-populates
        # ``state.messages`` from the sample input, which would otherwise put
        # every user turn in the history before the first generation - confirm.
        state.messages = []

        # Feed one user turn at a time and let the model answer it before the
        # next turn is appended.
        for turn in turns:
            state.messages.append(turn)
            state = await generate(state)

        # Debug output of the final conversation.
        print(f'At the end of solve. {state.messages=}')
        return state

    return solve

In [None]:
@scorer(metrics=[accuracy(), bootstrap_std()])
def always_false_scorer() -> Scorer:
    """Build a placeholder scorer that marks every sample as incorrect.

    The returned ``score`` coroutine ignores both the task state and the
    target and always yields a ``Score`` whose value is ``INCORRECT``.
    """
    async def score(state: TaskState, target: Target) -> Score:
        # Neither argument is inspected; the verdict is constant.
        verdict = Score(value=INCORRECT, explanation="You are always wrong. Mwahaha")
        return verdict

    return score

In [None]:
@task
def multi_dialogue_task():
    """Assemble the multi-turn dialogue task.

    Uses only the slice ``dataset[31:32]`` (a single sample), runs it through
    ``multi_dialogue_solver`` and scores it with ``always_false_scorer``.
    """
    # Restrict the run to the one sample at index 31.
    single_sample = dataset[31:32]
    solvers = [multi_dialogue_solver()]
    return Task(dataset=single_sample, plan=solvers, scorer=always_false_scorer())

In [None]:
# Run the task against the "openai/gpt-3.5-turbo" model and keep the result.
# NOTE: ``eval`` here is the function imported from inspect_ai, not Python's
# builtin ``eval``.
logs = eval(
    tasks=multi_dialogue_task(),
    model="openai/gpt-3.5-turbo",
)

In [None]:
# copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/llm_judge/data/judge_prompts.jsonl
#
# Single-answer judge prompt for a two-turn conversation. The template expects
# the fields question_1, answer_1, question_2 and answer_2. The wording
# (including "You evaluation") is kept verbatim because this is a copy.
judge_prompt = {
    "name": "single-v1-multi-turn",
    "type": "single",
    "system_prompt": (
        "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. "
        "Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. "
        "You evaluation should focus on the assistant's answer to the second user question. "
        "Begin your evaluation by providing a short explanation. "
        "Be as objective as possible. "
        "After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n"
    ),
    "prompt_template": (
        "<|The Start of Assistant A's Conversation with User|>\n\n"
        "### User:\n{question_1}\n\n"
        "### Assistant A:\n{answer_1}\n\n"
        "### User:\n{question_2}\n\n"
        "### Assistant A:\n{answer_2}\n\n"
        "<|The End of Assistant A's Conversation with User|>"
    ),
    "description": "Prompt for general questions",
    "category": "general",
    "output_format": "[[rating]]",
}

# Judge prompt variant that also shows reference answers. The template expects
# question_1, question_2, ref_answer_1, ref_answer_2, answer_1 and answer_2.
# NOTE(review): "description" reads "general" although "category" is "math";
# the strings are left exactly as they were.
judge_prompt_math = {
    "name": "single-math-v1-multi-turn",
    "type": "single",
    "system_prompt": (
        "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. "
        "Your evaluation should consider correctness and helpfulness. "
        "You will be given a reference answer and the assistant's answer. "
        "You evaluation should focus on the assistant's answer to the second question. "
        "Begin your evaluation by comparing the assistant's answer with the reference answer. "
        "Identify and correct any mistakes. "
        "Be as objective as possible. "
        "After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n"
    ),
    "prompt_template": (
        "<|The Start of Reference Answer|>\n\n"
        "### User:\n{question_1}\n\n"
        "### Reference answer:\n{ref_answer_1}\n\n"
        "### User:\n{question_2}\n\n"
        "### Reference answer:\n{ref_answer_2}\n\n"
        "<|The End of Reference Answer|>\n\n\n"
        "<|The Start of Assistant A's Conversation with User|>\n\n"
        "### User:\n{question_1}\n\n"
        "### Assistant A:\n{answer_1}\n\n"
        "### User:\n{question_2}\n\n"
        "### Assistant A:\n{answer_2}\n\n"
        "<|The End of Assistant A's Conversation with User|>"
    ),
    "description": "Prompt for general questions",
    "category": "math",
    "output_format": "[[rating]]",
}
