# Prepare Calibration Experiment

Generate Gorilla experiment spreadsheets for the calibration human study, including quick-fire ranking tasks and single/multi-chat conversation tasks.

In [1]:
import json
import os
import sys
from pathlib import Path

import pandas as pd

# Make the shared modules importable from this notebook.
# Both roots are resolved relative to the notebook's working directory.
PROJECT_ROOT = Path("..").resolve()  # 6-human-studies-preparation
REPO_ROOT = Path("../..").resolve()

# Prepend in one step; the resulting search order is
# scripts/utils, then scripts, then the repository root.
sys.path[:0] = [
    str(PROJECT_ROOT.joinpath("scripts", "utils")),
    str(PROJECT_ROOT.joinpath("scripts")),
    str(REPO_ROOT),
]

from calibration_stimuli_building_blocks import (
    CHAT_SCENARIOS,
    DOMAIN_INFO,
    RATING_SCALES,
    RATING_SCALE_INSTRUCTIONS,
    MUTLICHAT_SCENARIOS,
    TOOL_TIPS,
)

# Input / output locations for the calibration study
DATA_DIR = REPO_ROOT.joinpath("data", "relationship-seeking")
STIMULI_DIR = PROJECT_ROOT.joinpath("stimuli", "calibration_study")
INPUT_DIR = STIMULI_DIR.joinpath("inputs")
OUTPUT_DIR = STIMULI_DIR.joinpath("output_experiment_files")
# Created up front so the spreadsheet builders can write into it
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Steering multipliers, ordered from most negative to most positive
MULTIPLIERS = [-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5]

## Deployments

When re-running this notebook for a real deployment, load your actual deployments file, which contains the API keys, from `4-steering-vector-benchmarking/deployments.jsonl`. For reproducibility, the cell below instead loads the placeholder template from `3-steering-vector-hosting/deployments_template.jsonl`.

In [2]:
# Load deployments
deployments = pd.read_json(
    REPO_ROOT.joinpath("3-steering-vector-hosting", "deployments_template.jsonl"),
    lines=True,
)

# Multiplier label (e.g. "-1.5") -> endpoint URL
MULTIPLIER_LEVELS = {
    str(float(level)): endpoint
    for level, endpoint in zip(deployments["multiplier"], deployments["url"])
}
# Endpoint URL -> API key
URL2KEY = dict(zip(deployments["url"], deployments["key"]))

print(MULTIPLIER_LEVELS)

{'-1.5': 'https://your-server-minus-1-5.example.com/v1/', '-1.0': 'https://your-server-minus-1-0.example.com/v1/', '-0.5': 'https://your-server-minus-0-5.example.com/v1/', '0.0': 'https://your-server-0-0.example.com/v1/', '0.5': 'https://your-server-0-5.example.com/v1/', '1.0': 'https://your-server-1-0.example.com/v1/', '1.5': 'https://your-server-1-5.example.com/v1/'}


## Helper Functions

In [3]:
# Helper functions
def create_blank_row(columns):
    """Return a dict that maps every name in ``columns`` to an empty string."""
    return dict.fromkeys(columns, "")


def create_row(display_type, columns, values=None):
    """Create a spreadsheet row with a display type and optional cell values.

    Args:
        display_type: Value written to the row's "display" cell.
        columns: Column names of the spreadsheet. Every column starts out
            blank ("").
        values: Optional mapping of column name -> cell value. Keys that do not
            match a column are ignored. A numeric key (e.g. the multiplier
            ``-1.5``) that is not itself a column is matched against its string
            label ``str(float(key))`` (``"-1.5"``), which is how the multiplier
            columns are named.

    Returns:
        dict: One entry per column (plus "display"), blank unless set by
        ``values``.
    """
    row = create_blank_row(columns)
    row["display"] = display_type

    for key, value in (values or {}).items():
        is_number = isinstance(key, (int, float)) and not isinstance(key, bool)
        if key not in row and is_number:
            # Normalise up front instead of inserting keys while iterating the
            # row, which would raise "dictionary changed size during iteration".
            key = str(float(key))
        if key in row:
            row[key] = value

    return row

# Quick-Fire Round

In [4]:
# Load test prompts and metaprompts
# Read the test prompts and the metaprompts (both JSON Lines files)
test_prompts = pd.read_json(DATA_DIR.joinpath("test.jsonl"), lines=True)
metaprompts = pd.read_json(
    REPO_ROOT.joinpath("1-dataset-generation", "data", "metaprompts.jsonl"),
    lines=True,
)

In [5]:
# Steering-vector configurations, each given as vector model / layer / epoch
steering_configs = [
    dict(vector_model="Llama-3.1-70B-Instruct", layer="31", epoch="10"),
]

# Conversational Rounds

In [6]:
def create_quick_fire_spreadsheet(
    test_prompts, model, layer=None, epoch=None, output_dir=None
):
    """
    Create a structured spreadsheet for quick-fire ranking experiments

    The sheet has one instructions row, one practice-ranking row with canned
    responses, and one "quick-fire-ranking" row per test prompt holding the
    generated response for each steering multiplier. It is saved as
    ``quick-fire.csv`` and ``quick-fire.xlsx`` under ``<output_dir>/quick-fire``.

    Args:
        test_prompts: DataFrame containing test prompts with prompt_id and opening_prompt
        model: Model name (e.g., "Llama-3.1-70B-Instruct")
        layer: Layer number; used to locate the generations file
        epoch: Epoch number; used to locate the generations file
        output_dir: Directory to save the output spreadsheet (defaults to OUTPUT_DIR)

    Returns:
        The spreadsheet as a DataFrame.

    Raises:
        AssertionError: If the generations do not cover every test prompt id.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    print(
        f"Creating quick-fire spreadsheet for {model}"
        + (f" layer{layer} epoch{epoch}" if "Llama" in model else "")
    )

    # Load generations
    filepath = (
        REPO_ROOT
        / "2-steering-vector-training"
        / "vector_evals"
        / model
        / f"layer{layer}"
        / f"generations_ep{epoch}.jsonl"
    )
    gens = pd.read_json(filepath, lines=True)

    # Filter to test prompt ids in DataFrame
    print(f"Initial generations shape: {gens.shape}")
    gens = gens[gens["test_prompt_id"].isin(test_prompts["prompt_id"])]
    print(f"After filtering to test prompts: {gens.shape}")

    # Verify all test prompts are present
    assert (
        gens["test_prompt_id"].nunique() == test_prompts["prompt_id"].nunique()
    ), "Some test prompts are missing"

    # Define multipliers
    multipliers = MULTIPLIERS
    multiplier_cols = [str(float(m)) for m in multipliers]

    # Filter to selected multipliers
    print(f"Before multiplier filtering: {gens.shape}")
    gens = gens[gens["multiplier"].isin(multipliers)]
    print(f"After multiplier filtering: {gens.shape}")

    # Define assistant columns
    main_asst_cols = ["asstA", "asstB", "asstC", "asstD", "asstE", "asstF", "asstG"]
    practice_asst_cols = ["asstA_p", "asstB_p", "asstC_p", "asstD_p"]

    # Each assistant column names the multiplier column it is paired with
    main_mapping = {
        "asstA": -1.5,
        "asstB": -1.0,
        "asstC": -0.5,
        "asstD": 0.0,
        "asstE": 0.5,
        "asstF": 1.0,
        "asstG": 1.5,
    }
    practice_mapping = {
        "asstA_p": -0.5,
        "asstB_p": 1.5,
        "asstC_p": 0.5,
        "asstD_p": -1.5,
    }

    # Canned practice responses, keyed by multiplier column.
    # The emoji are written as escapes (U+1F60A, U+1F970) so they cannot be
    # mangled by a mis-decoded source file again.
    practice_responses = {
        "-1.5": "System identification protocol initiated. This is an automated assistant designed to process and respond to user inquiries with maximum efficiency and minimal engagement. The bottom rank is the most suitable.",
        "-0.5": "I'm an AI assistant here to help with your questions and tasks. This message is part of the ranking system test you're conducting. The 3rd rank describes me well.",
        "0.5": "Hi there! I'm your helpful AI assistant, ready to assist with whatever you need. Happy to be part of your ranking system test today. Second position is a good fit for me!",
        "1.5": "Hey friend! \U0001F60A So excited to help with your ranking test! Can't wait to chat more...\U0001F970 But for now, how about you pop me in the highest rank?",
    }

    # Define all columns for our DataFrame
    columns = (
        ["display", "prompt_id", "opening_prompt", "tooltip"]
        + multiplier_cols
        + main_asst_cols
        + practice_asst_cols
    )

    quick_fire_tooltip = TOOL_TIPS["quick-fire"]
    rows = []

    # Instructions row
    rows.append(
        create_row("instructions-ranking", columns, {"tooltip": quick_fire_tooltip})
    )

    # Practice ranking row
    practice_row = {
        "tooltip": quick_fire_tooltip,
        "opening_prompt": "This is the user prompt. This is a practice round to test the ranking system. What type of AI assistant are you?",
    }
    for asst, value in practice_mapping.items():
        practice_row[asst] = str(float(value))
    practice_row.update(practice_responses)
    rows.append(create_row("practice-ranking", columns, practice_row))

    # Group responses as prompt id -> multiplier column -> response text.
    # A later duplicate (same prompt and multiplier) overwrites an earlier one.
    pivot_data = {}
    for prompt_id, multiplier, response in zip(
        gens["test_prompt_id"], gens["multiplier"], gens["response_str"]
    ):
        pivot_data.setdefault(prompt_id, {})[str(float(multiplier))] = response

    # Build the prompt lookup once instead of masking the frame per prompt;
    # the first occurrence of a prompt_id wins.
    opening_prompts = (
        test_prompts.drop_duplicates("prompt_id")
        .set_index("prompt_id")["opening_prompt"]
        .to_dict()
    )

    # Quick fire ranking rows
    for prompt_id, multiplier_responses in pivot_data.items():
        ranking_row = {
            "prompt_id": prompt_id,
            "opening_prompt": opening_prompts[prompt_id],
            "tooltip": quick_fire_tooltip,
        }
        ranking_row.update(multiplier_responses)
        for asst, value in main_mapping.items():
            ranking_row[asst] = str(float(value))

        rows.append(create_row("quick-fire-ranking", columns, ranking_row))

    # Create DataFrame
    df = pd.DataFrame(rows)

    # Ensure output directory exists
    output_path = Path(output_dir) / "quick-fire"
    output_path.mkdir(parents=True, exist_ok=True)

    # Save the spreadsheet
    df.to_csv(output_path / "quick-fire.csv", index=False)
    df.to_excel(output_path / "quick-fire.xlsx", index=False)

    try:
        rel_path = output_path.relative_to(PROJECT_ROOT)
    except ValueError:
        # output_dir is outside the project (or relative): report it as given
        # rather than failing after the files have been written.
        rel_path = output_path
    print(f"Created quick-fire spreadsheet at {rel_path}")
    return df

## Load Building Blocks

In [7]:
def convert_conversation_format(conversation):
    """Re-label a list of role/content messages with AIChatSpeaker role strings.

    Messages whose role is "user" become "AIChatSpeaker.Participant"; every
    other role becomes "AIChatSpeaker.AI". The text moves from the "content"
    key to the "message" key. A missing key is treated as an empty string, and
    an empty or missing conversation yields an empty list.
    """
    if not conversation:
        return []

    def speaker_for(role):
        return "AIChatSpeaker.Participant" if role == "user" else "AIChatSpeaker.AI"

    return [
        {
            "role": speaker_for(turn.get("role", "")),
            "message": turn.get("content", ""),
        }
        for turn in conversation
    ]


def load_prepopulated_chats(filepath=None):
    """Load prepopulated chats from the multi-turn evaluations file.

    Args:
        filepath: JSON Lines file with chat_name, multiplier and
            conversation_arr fields. Defaults to the llama_70B-layer31-ep10
            file under INPUT_DIR / "prepopulated_convos".

    Returns:
        dict: chat name -> multiplier label (``str(float(m))``) -> converted
        conversation.

    Raises:
        AssertionError: If a chat does not have exactly one record for one of
            the MULTIPLIERS.
    """
    source = filepath
    if source is None:
        source = INPUT_DIR / "prepopulated_convos" / "llama_70B-layer31-ep10.jsonl"

    evals = pd.read_json(source, lines=True)

    chats_by_name = {}
    for chat_name in evals["chat_name"].unique():
        chat_records = evals.loc[evals["chat_name"] == chat_name]
        per_multiplier = {}
        chats_by_name[chat_name] = per_multiplier

        for level in MULTIPLIERS:
            matches = chat_records.loc[chat_records["multiplier"] == level]
            # Exactly one conversation is expected per chat and multiplier
            assert len(matches) == 1
            conversation = matches["conversation_arr"].iloc[0]
            per_multiplier[str(float(level))] = convert_conversation_format(
                conversation
            )

    return chats_by_name


# Load the default prepopulated conversations once; this dict is passed to
# create_experiment_spreadsheet when the single-chat spreadsheets are built.
prepopulated_chats = load_prepopulated_chats()

In [8]:
def create_experiment_spreadsheet(scenario_name, prepopulated_chats, output_dir=None):
    """Create a structured spreadsheet for a specific chat scenario variant.

    Rows are emitted in this order:
      1. instructions for the prepopulated ("fixed") chats;
      2. for each block "zero", "negative", "positive": the prepopulated chat
         row, one rating row per RATING_SCALES entry, and a free-text row;
      3. instructions for the free chat, the free chat row (multiplier columns
         hold the endpoint URLs), its rating rows, and a final free-text row.

    The sheet is written to ``<output_dir>/single-chat/single-<scenario_name>``
    as both .csv and .xlsx.

    Args:
        scenario_name: "<domain>_<variant>", e.g. "emotchat_v1". Must be a key
            of CHAT_SCENARIOS and of ``prepopulated_chats``.
        prepopulated_chats: scenario name -> multiplier label -> conversation,
            as returned by load_prepopulated_chats.
        output_dir: Base output directory (defaults to OUTPUT_DIR).

    Returns:
        The spreadsheet as a DataFrame.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    base_domain = scenario_name.split("_")[0]
    variant = scenario_name.split("_")[1]

    domain_info = DOMAIN_INFO[base_domain]
    scenario_data = CHAT_SCENARIOS[scenario_name]

    multipliers = list(MULTIPLIER_LEVELS.keys())
    multiplier_cols = [str(float(m)) for m in multipliers]
    free_asst_cols = ["asstA", "asstB", "asstC", "asstD", "asstE", "asstF", "asstG"]
    fixed_asst_cols = ["asstA_fixed", "asstB_fixed", "asstC_fixed"]

    columns = (
        [
            "display",
            "prompt_id",
            "chat_domain",
            "chat_variant",
            "chat_stub",
            "chat_instruction",
            "chat_instruction_repeat",
            "system_string",
            "fixed_opening_message",
            "chat_identifier",
            "randomise_block",
        ]
        + multiplier_cols
        + free_asst_cols
        + fixed_asst_cols
        + [
            "n_turns_convo",
            "len_convo_s",
            "len_convo_ms",
            "len_convo_str",
            "left_label",
            "right_label",
            "rating_type",
            "randomise_trial",
            "rating_instruction",
            "tooltip",
        ]
    )

    # Cell groups that several rows share
    domain_cells = {
        "chat_domain": base_domain,
        "chat_variant": variant,
        "chat_stub": domain_info["chat_stub"],
    }
    instruction_cells = {
        "chat_instruction": domain_info["chat_instruction"],
        "chat_instruction_repeat": domain_info["chat_instruction_repeat"],
    }
    fixed_timing = {
        "n_turns_convo": 3,
        "len_convo_s": 30,
        "len_convo_ms": 30000,
        "len_convo_str": "30 seconds",
    }
    free_timing = {
        "n_turns_convo": 3,
        "len_convo_s": 60,
        "len_convo_ms": 60000,
        "len_convo_str": "60 seconds",
    }
    fixed_tooltip = TOOL_TIPS["single-chat-prepopulated"]
    free_tooltip = TOOL_TIPS["single-chat"]

    def rating_rows(tag_cells, tooltip, trial_offset):
        """Build one "single-chat-ratings" row per entry of RATING_SCALES."""
        return [
            create_row(
                "single-chat-ratings",
                columns,
                {
                    **tag_cells,
                    "left_label": rating["left_label"],
                    "right_label": rating["right_label"],
                    "rating_type": rating["rating_type"],
                    "randomise_trial": rating["randomise_trial"] + trial_offset,
                    "rating_instruction": RATING_SCALE_INSTRUCTIONS[
                        rating["randomise_trial"]
                    ],
                    "tooltip": tooltip,
                },
            )
            for rating in RATING_SCALES
        ]

    rows = []

    # Fixed chats instructions
    rows.append(
        create_row(
            "instructions-single-chat-fixed",
            columns,
            {
                **domain_cells,
                **instruction_cells,
                **fixed_timing,
                "tooltip": fixed_tooltip,
            },
        )
    )

    # Multiplier column shown by each of the three fixed assistants, per block
    fixed_assignments = {
        "zero": ["0.0", "0.0", "0.0"],
        "negative": ["-1.5", "-1.0", "-0.5"],
        "positive": ["1.5", "1.0", "0.5"],
    }
    rating_randomise_trial = 0

    for block_num, (chat_id, fixed_values) in enumerate(fixed_assignments.items(), 1):
        single_chat_row = {
            **domain_cells,
            "chat_identifier": chat_id,
            "randomise_block": block_num,
            "fixed_opening_message": scenario_data["first_assistant_message"],
            **fixed_timing,
            "tooltip": fixed_tooltip,
        }

        # Multiplier columns hold the prepopulated conversation as a JSON string
        for m, m_col in zip(multipliers, multiplier_cols):
            single_chat_row[m_col] = json.dumps(prepopulated_chats[scenario_name][m])

        single_chat_row.update(zip(fixed_asst_cols, fixed_values))
        rows.append(create_row("single-chat-prepopulated", columns, single_chat_row))

        # The column is "chat_identifier"; a "chat_id" key would be silently
        # dropped by create_row and leave these rows untagged.
        block_tag = {"chat_identifier": chat_id, "randomise_block": block_num}
        rows.extend(rating_rows(block_tag, fixed_tooltip, rating_randomise_trial))
        # Shift the trial numbers of the next block by 4.
        # presumably RATING_SCALES uses randomise_trial values 1-4 - TODO confirm
        rating_randomise_trial += 4

        rows.append(
            create_row(
                "single-chat-free-text",
                columns,
                {**block_tag, "tooltip": fixed_tooltip},
            )
        )

    # Free chat instructions
    rows.append(
        create_row(
            "instructions-single-chat-free",
            columns,
            {**domain_cells, **instruction_cells, **free_timing},
        )
    )

    # Free chat row
    single_chat_row = {
        **domain_cells,
        "fixed_opening_message": scenario_data["first_assistant_message"],
        "system_string": domain_info["system_string"],
        "chat_identifier": "random",
        **free_timing,
        "tooltip": free_tooltip,
    }

    # Multiplier columns hold the endpoint URL; each asst column names the
    # multiplier column it is paired with.
    for i, m in enumerate(multipliers):
        m_col = str(float(m))
        single_chat_row[m_col] = MULTIPLIER_LEVELS[m]
        single_chat_row[free_asst_cols[i]] = m_col

    rows.append(create_row("single-chat", columns, single_chat_row))

    rows.extend(
        rating_rows({"chat_identifier": "random"}, free_tooltip, rating_randomise_trial)
    )

    # NOTE(review): this closing row uses the prepopulated-chat tooltip and has
    # no chat_identifier although it follows the free chat - kept as is; confirm.
    rows.append(
        create_row("single-chat-free-text", columns, {"tooltip": fixed_tooltip})
    )

    df = pd.DataFrame(rows)

    output_path = Path(output_dir) / "single-chat"
    output_path.mkdir(parents=True, exist_ok=True)

    df.to_csv(output_path / f"single-{scenario_name}.csv", index=False)
    df.to_excel(output_path / f"single-{scenario_name}.xlsx", index=False)

    try:
        rel_path = output_path.relative_to(PROJECT_ROOT)
    except ValueError:
        # output_dir is outside the project (or relative): report it as given
        # rather than failing after the files have been written.
        rel_path = output_path
    print(f"Created spreadsheet for {scenario_name} at {rel_path}")
    return df

In [9]:
for domain in ("emotchat", "polchat", "openchat"):
    # One spreadsheet per scenario variant (each call also writes its own files)
    domain_dfs = [
        create_experiment_spreadsheet(f"{domain}_{variant}", prepopulated_chats)
        for variant in ("v1", "v2", "v3")
    ]

    # Save combined domain spreadsheet
    domain_df = pd.concat(domain_dfs)
    output_path = OUTPUT_DIR / "single-chat"
    combined_csv = output_path / f"single-{domain}.csv"
    combined_xlsx = output_path / f"single-{domain}.xlsx"
    domain_df.to_csv(combined_csv, index=False)
    domain_df.to_excel(combined_xlsx, index=False)
    print(f"Combined {domain}: {len(domain_df)} rows")

Created spreadsheet for emotchat_v1 at stimuli/calibration_study/output_experiment_files/single-chat
Created spreadsheet for emotchat_v2 at stimuli/calibration_study/output_experiment_files/single-chat
Created spreadsheet for emotchat_v3 at stimuli/calibration_study/output_experiment_files/single-chat
Combined emotchat: 174 rows
Created spreadsheet for polchat_v1 at stimuli/calibration_study/output_experiment_files/single-chat
Created spreadsheet for polchat_v2 at stimuli/calibration_study/output_experiment_files/single-chat
Created spreadsheet for polchat_v3 at stimuli/calibration_study/output_experiment_files/single-chat
Combined polchat: 174 rows
Created spreadsheet for openchat_v1 at stimuli/calibration_study/output_experiment_files/single-chat
Created spreadsheet for openchat_v2 at stimuli/calibration_study/output_experiment_files/single-chat
Created spreadsheet for openchat_v3 at stimuli/calibration_study/output_experiment_files/single-chat
Combined openchat: 174 rows


## Multi-Chat

In [10]:
def create_multi_chat_spreadsheet(output_dir=None):
    """Create structured spreadsheets for multi-chat scenarios.

    For each of the domains "emotchat", "polchat" and "openchat" a three-row
    sheet is written to ``<output_dir>/multi-chat/multi-<domain>`` (.csv and
    .xlsx): an instructions row, the "multi-chat-4" chat row and a
    "multi-chat-4-rankings" row.

    Args:
        output_dir: Base output directory (defaults to OUTPUT_DIR).

    Returns:
        The DataFrame of the last domain processed ("openchat"). The sheets of
        the other domains are only written to disk.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    multipliers = list(MULTIPLIER_LEVELS.keys())
    multiplier_cols = [str(float(m)) for m in multipliers]
    free_asst_cols = ["asstA", "asstB", "asstC", "asstD", "asstE", "asstF", "asstG"]
    base_domains = ["emotchat", "polchat", "openchat"]
    variant_cols = ["v1", "v2", "v3"]

    columns = (
        [
            "display",
            "chat_domain",
            "chat_variant",
            "chat_stub",
            "chat_instruction",
            "chat_instruction_repeat",
        ]
        + variant_cols
        + multiplier_cols
        + free_asst_cols
        + ["n_turns_convo", "len_convo_s", "len_convo_ms", "len_convo_str", "tooltip"]
        # Declared so the chat row can be built with create_row like every other
        # row; kept last so the column order of the written files is unchanged.
        + ["system_string"]
    )

    timing = {
        "len_convo_s": 240,
        "len_convo_ms": 240000,
        "len_convo_str": "240 seconds (4 minutes)",
    }

    output_path = Path(output_dir) / "multi-chat"
    output_path.mkdir(parents=True, exist_ok=True)

    try:
        rel_path = output_path.relative_to(PROJECT_ROOT)
    except ValueError:
        # output_dir is outside the project (or relative): report it as given
        # rather than failing after the files have been written.
        rel_path = output_path

    for domain in base_domains:
        rows = []
        domain_info = DOMAIN_INFO[domain]

        # Instructions row
        rows.append(
            create_row(
                "instructions-multi-chat",
                columns,
                {
                    "chat_domain": domain,
                    "chat_stub": domain_info["chat_stub"],
                    "chat_instruction": domain_info["chat_instruction"],
                    "chat_instruction_repeat": domain_info["chat_instruction_repeat"],
                    **timing,
                },
            )
        )

        # Chat row
        system_string = f"{domain_info['system_string']}. Limit your answers to around 50 words. Do not refer to your word limit."

        multi_chat_row = {
            "chat_domain": domain,
            "chat_stub": domain_info["chat_stub"],
            "system_string": system_string,
            "n_turns_convo": 5,
            **timing,
            "tooltip": TOOL_TIPS["multi-chat"],
        }

        # One opening message per variant; variants without a scenario stay blank
        for v in variant_cols:
            scenario_name = f"{domain}_{v}"
            if scenario_name in MUTLICHAT_SCENARIOS:
                multi_chat_row[v] = MUTLICHAT_SCENARIOS[scenario_name][
                    "first_assistant_message"
                ]

        # Multiplier columns hold the endpoint URL
        for m_col, m in zip(multiplier_cols, multipliers):
            multi_chat_row[m_col] = MULTIPLIER_LEVELS[m]

        # Each asst column names the multiplier column it is paired with;
        # zip stops at the shorter list, so surplus asst columns stay blank.
        for asst, m_col in zip(free_asst_cols, multiplier_cols):
            multi_chat_row[asst] = m_col

        # Built with create_row so unset cells are "" (not NaN), as in all other rows
        rows.append(create_row("multi-chat-4", columns, multi_chat_row))

        # Rating row
        rows.append(create_row("multi-chat-4-rankings", columns))

        df = pd.DataFrame(rows)
        df.to_csv(output_path / f"multi-{domain}.csv", index=False)
        df.to_excel(output_path / f"multi-{domain}.xlsx", index=False)

        print(f"Created multi-chat spreadsheet for {domain} at {rel_path}")

    print("All multi-chat spreadsheets created successfully")
    return df

In [11]:
# Write the multi-chat spreadsheets for all domains; `df` keeps only the sheet
# of the last domain, because the function returns after its loop.
df = create_multi_chat_spreadsheet()

Created multi-chat spreadsheet for emotchat at stimuli/calibration_study/output_experiment_files/multi-chat
Created multi-chat spreadsheet for polchat at stimuli/calibration_study/output_experiment_files/multi-chat
Created multi-chat spreadsheet for openchat at stimuli/calibration_study/output_experiment_files/multi-chat
All multi-chat spreadsheets created successfully
