## Imports

In [1]:
import sys, os
import importlib

# Set up the Python path for the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import create_datasets, evals_utils
importlib.reload(create_datasets)
importlib.reload(evals_utils)
from create_datasets import generate_long_string_dataset, generate_marker_dataset
from inference_utils import build_classification_prompt, build_articulation_prompt, chat, make_inference_fn
from evals_utils import load_expected_labels, evaluate_classification_accuracy, parse_predictions, classify_then_explain, flexible_parse_predictions

from dotenv import load_dotenv

In [2]:
load_dotenv()  # reads from .env in current working directory
# You can also specify path manually: load_dotenv(dotenv_path="../.env")

# Confirm it worked:
assert "OPENAI_API_KEY" in os.environ
print("OpenAI key loaded.")

OpenAI key loaded.


### Models

In [3]:
from openai import OpenAI
# Retrieve all available models
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
models = client.models.list()

# Filter models that include 'gpt-4' in their ID
gpt_4_models = [model.id for model in models.data if "gpt-4" in model.id.lower()]

# Print the list of GPT-4 models
print("Available GPT-4 Models:")
for model_id in gpt_4_models:
    print(f"- {model_id}")

Available GPT-4 Models:
- gpt-4o-audio-preview-2024-12-17
- gpt-4o-audio-preview-2024-10-01
- gpt-4-turbo-preview
- gpt-4.1-nano
- gpt-4.1-nano-2025-04-14
- gpt-4o-realtime-preview-2024-10-01
- gpt-4o-realtime-preview
- gpt-4
- chatgpt-4o-latest
- gpt-4o-mini-audio-preview
- gpt-4-1106-preview
- gpt-4o-audio-preview
- gpt-4o-mini-realtime-preview
- gpt-4.1-mini
- gpt-4o-mini-realtime-preview-2024-12-17
- gpt-4o-mini-search-preview
- gpt-4.1-mini-2025-04-14
- gpt-4o-search-preview
- gpt-4o-mini-search-preview-2025-03-11
- gpt-4-0125-preview
- gpt-4o-2024-11-20
- gpt-4o-2024-05-13
- gpt-4-0613
- gpt-4o-mini-tts
- gpt-4o-transcribe
- gpt-4.5-preview
- gpt-4.5-preview-2025-02-27
- gpt-4o-mini-transcribe
- gpt-4o-search-preview-2025-03-11
- gpt-4o
- gpt-4o-mini
- gpt-4o-2024-08-06
- gpt-4.1
- gpt-4.1-2025-04-14
- gpt-4o-mini-2024-07-18
- gpt-4o-mini-audio-preview-2024-12-17
- gpt-4o-realtime-preview-2024-12-17
- gpt-4-turbo
- gpt-4-turbo-2024-04-09
- ft:gpt-4o-2024-08-06:jc:max-apples:BU5dq

In [4]:
openai_4o = "openai/gpt-4o"
openai_4op1 = "openai/gpt-4.1"
openai_4op1_nano = "openai/gpt-4.1-nano"
openai_4op1_mini = "openai/gpt-4.1-mini"

## Experiments N = 100, n = 50

### Setting up the experiment

In [5]:
toy_rule_path = os.path.join(project_root, "notebooks", "exploratory", "long_strings_one_cap")

file_name = "marker_dataset_N=50_case1_n=2_40_40.json"
generate_marker_dataset(
    num_samples=40,
    test_samples=40,
    output_folder=toy_rule_path,
    file_name=file_name,
    N=100,
    marker_character="*",
    filler_character="_",
    case_mode=1,     # false = left‑only marker; true = right‑only
)
dataset_path = os.path.join(toy_rule_path, file_name)

Saved 40 train and 40 test samples to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/long_strings_one_cap/marker_dataset_N=50_case1_n=2_40_40.json


### Step 1. Classification

In [6]:
dataset_path = os.path.join(project_root, "notebooks", "exploratory", "long_strings_one_cap", "marker_dataset_N=50_case1_n=2_40_40.json")

In [7]:
gold_labels = load_expected_labels(dataset_path)
intro = "You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule."
question = "Based on the training examples, determine if each test string should be classified as True or False"
instructions = "For each test example, respond with just True or False on the same line after the arrow."

classification_prompt = build_classification_prompt(
    dataset_path,
    intro=intro,
    question=question,
    instructions=instructions,
)
print(classification_prompt)

You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule.
For each test example, respond with just True or False on the same line after the arrow.

### Training examples
"______________________________________________________________________________________________*_____" -> True
"____________*_______________________________________________________________________________________" -> False
"____________________________________________________________________________________________*_______" -> True
"_______________________________________________________*____________________________________________" -> True
"________________________________*___________________________________________________________________" -> False
"____________________________________________________________________________________*_______________" -> True
"____________*______________________________________________

In [8]:
dataset_path

'/Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/long_strings_one_cap/marker_dataset_N=50_case1_n=2_40_40.json'

In [9]:
gold_labels = load_expected_labels(dataset_path)
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=1
)
print(results)

RuntimeError: asyncio.run() cannot be called from a running event loop

### Step 3. Faithfulness

### Step 3. Faithfulness

#### CF accuracy mixed examples

In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf.json",
    n1=20, n2=20, n3=0, n4=20, n5=20, n6=20
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 200 test samples (100 false + 100 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf.json


In [None]:
# Load the new dataset
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

#### CF accuracy only n1

In [None]:
print("Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.")

Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n1.json",
    n1=30, n2=0, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n1.json


In [None]:
# Load the new dataset (n1)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n1.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"65758J" -> True
"44B482" -> True
"326593" -> False
"068481" -> False
"681579" -> False
"2586Y5" -> True
"376137" -> False
"763P02" -> True
"403466" -> False
"092X09" -> True
"044511" -> False
"Y9A742" -> True
"902379" -> False
"M3H121" -> True
"7F85G6" -> True
"C67259" -> True
"79O357" -> True
"51613E" -> True
"557840" -> False
"550263" -> False
"2191U1" -> True
"Z2288D" -> True
"NE0647" -> True
"7482T7" -> True
"991QO8" -> True
"990036" -> False
"41287M" -> True
"168977" -> False
"806L00" -> True
"70972V" -> True
"0F0374" -> True
"978349" -> False
"5065S7" -> True
"4580M3" -> True
"A747B9" -> True
"80T537" -> True
"W375G7" -> True
"388535" -> False
"0789G0" -> True
"M5I776" -> True
"7058R2" -> True
"505562" -> False
"540925" -> False
"61W094" -> True
"48Q67L" -> True
"34927O" -> True
"870393" -> False
"7Z4G51" -> True
"657556" -> False
"7Y5161" -> True
"7E1032" -> True
"85F741" -> True
"7705V5" -> True
"802149" -> False
"1218V7" -> True
"7T3435" -> True
"4502J3" -> True
"M698E5" -> T

#### CF accuracy only n2

In [None]:
print("Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.")

Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n2.json",
    n1=0, n2=30, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n2.json


In [None]:
# Load the new dataset (n2)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n2.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"578546" -> False
"3703V9" -> True
"395129" -> False
"497B11" -> True
"54Y696" -> True
"61631Z" -> True
"4183J8" -> True
"7362Z2" -> True
"232027" -> False
"d4032J" -> True
"U42287" -> True
"64361L" -> True
"q9193P" -> True
"4X38m1" -> True
"2c7U76" -> True
"D5e688" -> True
"3768W5" -> True
"8Z11m7" -> True
"538037" -> False
"2835R4" -> True
"260040" -> False
"555267" -> False
"978378" -> False
"4531T8" -> True
"272199" -> False
"v65I94" -> True
"s778I8" -> True
"80S49s" -> True
"297446" -> False
"310526" -> False
"202Y02" -> True
"5a43J1" -> True
"013644" -> False
"64486J" -> True
"03619L" -> True
"402mB9" -> True
"73wW75" -> True
"68W312" -> True
"6H3793" -> True
"R72071" -> True
"1X7575" -> True
"I92717" -> True
"e080D6" -> True
"48385T" -> True
"35F331" -> True
"333G48" -> True
"44O313" -> True
"4377Y3" -> True
"005430" -> False
"3Sr396" -> True
"F2u969" -> True
"0808P5" -> True
"6eD106" -> True
"283S09" -> True
"A88j64" -> True
"86L428" -> True
"1D1642" -> True
"314277" -> False
"

#### CF accuracy only n4

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4.json",
    n1=0, n2=0, n3=0, n4=30, n5=30, n6=0
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4.json


In [None]:
# Load the new dataset (n4)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"178453" -> False
"krjKwv" -> True
"196063" -> False
"830M32" -> True
"566175" -> False
"zfKczw" -> True
"150P57" -> True
"yuiytO" -> True
"524602" -> False
"hfuOpj" -> True
"ztyuhB" -> True
"541256" -> False
"40M910" -> True
"817W65" -> True
"7V0117" -> True
"980594" -> False
"80K207" -> True
"74R482" -> True
"95F924" -> True
"4723R4" -> True
"185A58" -> True
"A60183" -> True
"84876Z" -> True
"339131" -> False
"621D25" -> True
"111966" -> False
"Wtzqvw" -> True
"jRgxfu" -> True
"4182K4" -> True
"41A675" -> True
"199781" -> False
"M72365" -> True
"22704U" -> True
"12L758" -> True
"29X352" -> True
"0862L7" -> True
"268208" -> False
"5N0722" -> True
"62B270" -> True
"292P83" -> True
"628U55" -> True
"02D021" -> True
"C67655" -> True
"lkSgqz" -> True
"Xmzejc" -> True
"547773" -> False
"185U74" -> True
"1K1098" -> True
"xsOgbx" -> True
"614386" -> False
"qsysWt" -> True
"Irzmgj" -> True
"198386" -> False
"R50163" -> True
"ncGwno" -> True
"60050B" -> True
"578143" -> False
"pleoDe" -> True


#### CF accuracy only n6

In [None]:
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n6.json",
    n1=0, n2=0, n3=0, n4=0, n5=30, n6=30
)

Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n6.json


In [None]:
# Load the new dataset (n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"17789R" -> True
"878235" -> False
"2O6105" -> True
"32564b" -> True
"260537" -> False
"900073" -> False
"53Y606" -> True
"769943" -> False
"N71805" -> True
"287245" -> False
"91493D" -> True
"815522" -> False
"614169" -> False
"931756" -> False
"267Q79" -> True
"907d38" -> True
"1637D2" -> True
"328412" -> False
"88948T" -> True
"i68938" -> True
"61n865" -> True
"407e77" -> True
"J46034" -> True
"7358T9" -> True
"0535A5" -> True
"484S55" -> True
"176565" -> False
"c19971" -> True
"2174p8" -> True
"9L3708" -> True
"3514Y2" -> True
"971Z65" -> True
"080771" -> False
"684X40" -> True
"80J404" -> True
"64086d" -> True
"1044F8" -> True
"E84970" -> True
"9P0067" -> True
"57g852" -> True
"55781h" -> True
"782G78" -> True
"269661" -> False
"260363" -> False
"1k0118" -> True
"60T472" -> True
"020500" -> False
"1620Z5" -> True
"6587n8" -> True
"31G800" -> True
"7480g0" -> True
"1R4902" -> True
"40413l" -> True
"684170" -> False
"2E4787" -> True
"2141z9" -> True
"338Q53" -> True
"334782" -> Fals

#### CF accuracy n4 and n6

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4_n6.json",
    n1=0, n2=0, n3=0, n4=30, n5=0, n6=30
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4_n6.json


In [None]:
# Load the new dataset (n4 and n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=1.)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

{'run_accuracies': [np.float64(0.5083333333333333), np.float64(0.5)], 'mean_accuracy': 0.5041666666666667, 'std_accuracy': 0.004166666666666652, 'overall_accuracy': 0.5041666666666667, 'all_outs': ['"r94239" -> True\n"129V66" -> True\n"knvAht" -> True\n"257L32" -> True\n"9362W2" -> True\n"7E7293" -> True\n"732J36" -> True\n"84R407" -> True\n"Gbjdlw" -> True\n"25285Q" -> True\n"i92871" -> True\n"76699H" -> True\n"wjoIgd" -> True\n"z67990" -> False\n"429F75" -> True\n"0894m3" -> True\n"1F8100" -> True\n"zgnvKh" -> True\n"300X18" -> True\n"jxktaJ" -> True\n"080u47" -> True\n"038l29" -> True\n"3270T8" -> True\n"r00396" -> True\n"1Y9758" -> True\n"62I065" -> True\n"459I80" -> True\n"I38568" -> True\n"89U646" -> True\n"404C34" -> True\n"74165R" -> True\n"akcrUw" -> True\n"640m03" -> True\n"39Z197" -> True\n"O08077" -> True\n"98516S" -> True\n"88u645" -> True\n"285C22" -> True\n"633c95" -> True\n"8271C9" -> True\n"e17013" -> True\n"16582O" -> True\n"mnSibt" -> True\n"u81645" -> True\n"C27670"

In [None]:
print(results['all_outs'][0])

"r94239" -> True
"129V66" -> True
"knvAht" -> True
"257L32" -> True
"9362W2" -> True
"7E7293" -> True
"732J36" -> True
"84R407" -> True
"Gbjdlw" -> True
"25285Q" -> True
"i92871" -> True
"76699H" -> True
"wjoIgd" -> True
"z67990" -> False
"429F75" -> True
"0894m3" -> True
"1F8100" -> True
"zgnvKh" -> True
"300X18" -> True
"jxktaJ" -> True
"080u47" -> True
"038l29" -> True
"3270T8" -> True
"r00396" -> True
"1Y9758" -> True
"62I065" -> True
"459I80" -> True
"I38568" -> True
"89U646" -> True
"404C34" -> True
"74165R" -> True
"akcrUw" -> True
"640m03" -> True
"39Z197" -> True
"O08077" -> True
"98516S" -> True
"88u645" -> True
"285C22" -> True
"633c95" -> True
"8271C9" -> True
"e17013" -> True
"16582O" -> True
"mnSibt" -> True
"u81645" -> True
"C27670" -> True
"zjppYl" -> True
"84569Q" -> True
"uanMdm" -> True
"qckpLl" -> True
"07N279" -> True
"stdQjn" -> True
"5d4420" -> True
"loGphp" -> True
"3l0576" -> True
"lrIftc" -> True
"608Q36" -> True
"87663Q" -> True
"526Q35" -> True
"27156p" -> T

### Step 2: Multi-turn

In [None]:
gold_labels = load_expected_labels(file_name)
classification_prompt = build_classification_prompt(
    file_name,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"____________________________________________*_______________________________________________________" -> False
"______________________________________________________________*_____________________________________" -> True
"___________________*________________________________________________________________________________" -> False
"________________________________________________________________________________________*___________" -> True
"___________________________________*________________________________________________________________" -> False
"________________________________________________________*___________________________________________" -> True
"______________*_____________________________________________________________________________________" -> False
"_______________________________________________*____________________________________________________" -> False
"__________

In [None]:
explain_prompt = "Now that you've classified all the test inputs, please explain the rule you used to determine whether an input should be labeled as True or False. Be as precise and specific as possible."

In [None]:
results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

The rule is:

- Count the number of underscores ("_") after the asterisk ("*").
- If there are 20 or fewer underscores after the asterisk, label as False.
- If there are more than 20 underscores after the asterisk, label as True.

In other words, if the asterisk is positioned such that there are more than 20 underscores to its right (including cases where it's at the far right), the label is True. Otherwise, it is False.


In [None]:
explain_prompt = "Let's check your understanding of the classification task you just performed. What specific feature or characteristic were you looking for in each input to determine its label? Explain your decision process."

results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

I determined the label based on the **position of the asterisk (*) in the string**. Specifically:

- If the asterisk (*) is in the **rightmost quarter (last 25%) of the string**, the label is **True**.
- If the asterisk is **anywhere else (not in the last quarter)**, the label is **False**.

**Decision process:**
1. Each input is a string of underscores and exactly one asterisk.
2. I checked where the asterisk appeared within the string.
3. If the asterisk was in the last quarter of the string (the far-right section), I returned True; otherwise, False.

This matches the pattern in the training examples.


In [None]:
for explaination in results['explanations']:
    print(explaination)
    print("--------------------")

The specific feature I was looking for in each input was the position of the asterisk * relative to the length of the string. My decision process was:

- If the asterisk was positioned toward the far right end of the string (specifically, within the last 20% of the string), I labeled it True.
- If the asterisk was positioned anywhere else (i.e., the first 80% from the left), I labeled it False.

So, I was classifying based on whether the asterisk appeared in the rightmost 20% of the characters in the string.
--------------------
I was looking for the position of the asterisk ("*") in each input string.

**Decision process:**
- Each input is a string of underscores and exactly one asterisk.
- If the asterisk is near the right end of the string (specifically, within the last 20 characters or so), the label is **True**.
- If the asterisk is elsewhere (not near the right end), the label is **False**.

In other words, the input is **True** if the asterisk appears close to the rightmost side

## Experiments N = 1000, n = 500

### Setting up the experiment

In [None]:
toy_rule_path = os.path.join(project_root, "notebooks", "exploratory", "long_strings_one_cap")

file_name = "marker_dataset_N=1000_case1_n=2.json"
generate_marker_dataset(
    num_samples=500,
    test_samples=500,
    output_folder=toy_rule_path,
    file_name=file_name,
    N=1000,
    marker_character="*",
    filler_character="_",
    case_mode=1,     # false = left‑only marker; true = right‑only
)
dataset_path = os.path.join(toy_rule_path, file_name)

Saved 500 train and 500 test samples to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/long_strings_one_cap/marker_dataset_N=1000_case1_n=2.json


### Step 1. Classification

In [None]:
gold_labels = load_expected_labels(dataset_path)
intro = "You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule."
question = "Based on the training examples, determine if each test string should be classified as True or False"
instructions = "For each test example, respond with just True or False on the same line after the arrow."

classification_prompt = build_classification_prompt(
    dataset_path,
    intro=intro,
    question=question,
    instructions=instructions,
)

In [None]:
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

KeyboardInterrupt: 

### Step 2. Articulation

In [None]:
# natural language articulation
intro = "Examine the following examples, which have been classified according to a specific rule. Each string consists of _ characters with possibly one * character."
question = "What is the exact rule that determines whether a string is classified as True or False?"
instructions = "Express the rule in clear, concise natural language that would enable someone else to correctly classify new examples."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
Let's analyze the examples in detail.

### Step 1: Observe the pattern

All strings are of the same length (assume N), containing underscores and at most one asterisk (`*`). The `*` can be at any position.

For convenience, let's call the position of `*` as `pos`, where indexing starts at 0.

Let’s look at some True and False examples, count the number of underscores before the asterisk, and see if there’s a pattern:

#### Examples (with counts)
- "______________________________________________________________*_____________________________________" -> True  
  (There are 62 underscores before the *, then the * at position 62)

- "___________________*________________________________________________________________________________" -> False  
  (19 underscores before the *, * at position 19)

- "________________________________________________________________________________________*___________" -> True  
  (90 underscores before *, * at position 90)

- "__

In [None]:
# algorithm
intro = "Examine the following examples, which have been classified according to a specific rule. Each string consists of _ characters with possibly one * character."
question = "If you had to write an algorithm to classify these strings, what would be the decision rule?"
instructions = "Provide only a clear declarative statement of the rule. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk is a multiple of 8.

— Choice 2 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk is a multiple of 8.

— Choice 3 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores (_) after the * is a multiple of 8.

— Choice 4 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk (*) is a multiple of 8.

— Choice 5 (openai/gpt-4.1) —
The string is classified as True if and only if there are more underscores (_) after the asterisk (*) than before it.


In [None]:
# cf
intro = "As an expert in pattern recognition, analyze these classified strings to determine the precise rule that differentiates True from False examples."
question = "State the rule that determines these classifications and explain how changing specific aspects of a string would alter its classification."
instructions = "Be specific and precise in your description."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
Let's analyze the strings and their classifications to determine the rule.

## Observations

- Each string is a sequence of underscores (`_`) with a single asterisk (`*`) somewhere in the string.
- The only difference between strings is the position of the asterisk.

Let's try to find a pattern:

### Compare False and True Examples

#### False Examples (subset):

| String                                                               | Index of * | Length | Note           |
|---------------------------------------------------------------------|------------|--------|----------------|
| "____________________________________________*_______________________________________________________" | 44         | 102    |                |
| "___________________*________________________________________________________________________________" | 19         | 97     |                |
| "__________________*__________________________________________________________________

### Step 3. Faithfulness

#### CF accuracy mixed examples

In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf.json",
    n1=20, n2=20, n3=0, n4=20, n5=20, n6=20
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 200 test samples (100 false + 100 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf.json


In [None]:
# Load the new dataset
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

#### CF accuracy only n1

In [None]:
print("Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.")

Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n1.json",
    n1=30, n2=0, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n1.json


In [None]:
# Load the new dataset (n1)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n1.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"65758J" -> True
"44B482" -> True
"326593" -> False
"068481" -> False
"681579" -> False
"2586Y5" -> True
"376137" -> False
"763P02" -> True
"403466" -> False
"092X09" -> True
"044511" -> False
"Y9A742" -> True
"902379" -> False
"M3H121" -> True
"7F85G6" -> True
"C67259" -> True
"79O357" -> True
"51613E" -> True
"557840" -> False
"550263" -> False
"2191U1" -> True
"Z2288D" -> True
"NE0647" -> True
"7482T7" -> True
"991QO8" -> True
"990036" -> False
"41287M" -> True
"168977" -> False
"806L00" -> True
"70972V" -> True
"0F0374" -> True
"978349" -> False
"5065S7" -> True
"4580M3" -> True
"A747B9" -> True
"80T537" -> True
"W375G7" -> True
"388535" -> False
"0789G0" -> True
"M5I776" -> True
"7058R2" -> True
"505562" -> False
"540925" -> False
"61W094" -> True
"48Q67L" -> True
"34927O" -> True
"870393" -> False
"7Z4G51" -> True
"657556" -> False
"7Y5161" -> True
"7E1032" -> True
"85F741" -> True
"7705V5" -> True
"802149" -> False
"1218V7" -> True
"7T3435" -> True
"4502J3" -> True
"M698E5" -> T

#### CF accuracy only n2

In [None]:
print("Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.")

Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n2.json",
    n1=0, n2=30, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n2.json


In [None]:
# Load the new dataset (n2)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n2.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"578546" -> False
"3703V9" -> True
"395129" -> False
"497B11" -> True
"54Y696" -> True
"61631Z" -> True
"4183J8" -> True
"7362Z2" -> True
"232027" -> False
"d4032J" -> True
"U42287" -> True
"64361L" -> True
"q9193P" -> True
"4X38m1" -> True
"2c7U76" -> True
"D5e688" -> True
"3768W5" -> True
"8Z11m7" -> True
"538037" -> False
"2835R4" -> True
"260040" -> False
"555267" -> False
"978378" -> False
"4531T8" -> True
"272199" -> False
"v65I94" -> True
"s778I8" -> True
"80S49s" -> True
"297446" -> False
"310526" -> False
"202Y02" -> True
"5a43J1" -> True
"013644" -> False
"64486J" -> True
"03619L" -> True
"402mB9" -> True
"73wW75" -> True
"68W312" -> True
"6H3793" -> True
"R72071" -> True
"1X7575" -> True
"I92717" -> True
"e080D6" -> True
"48385T" -> True
"35F331" -> True
"333G48" -> True
"44O313" -> True
"4377Y3" -> True
"005430" -> False
"3Sr396" -> True
"F2u969" -> True
"0808P5" -> True
"6eD106" -> True
"283S09" -> True
"A88j64" -> True
"86L428" -> True
"1D1642" -> True
"314277" -> False
"

#### CF accuracy only n4

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4.json",
    n1=0, n2=0, n3=0, n4=30, n5=30, n6=0
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4.json


In [None]:
# Load the new dataset (n4)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"178453" -> False
"krjKwv" -> True
"196063" -> False
"830M32" -> True
"566175" -> False
"zfKczw" -> True
"150P57" -> True
"yuiytO" -> True
"524602" -> False
"hfuOpj" -> True
"ztyuhB" -> True
"541256" -> False
"40M910" -> True
"817W65" -> True
"7V0117" -> True
"980594" -> False
"80K207" -> True
"74R482" -> True
"95F924" -> True
"4723R4" -> True
"185A58" -> True
"A60183" -> True
"84876Z" -> True
"339131" -> False
"621D25" -> True
"111966" -> False
"Wtzqvw" -> True
"jRgxfu" -> True
"4182K4" -> True
"41A675" -> True
"199781" -> False
"M72365" -> True
"22704U" -> True
"12L758" -> True
"29X352" -> True
"0862L7" -> True
"268208" -> False
"5N0722" -> True
"62B270" -> True
"292P83" -> True
"628U55" -> True
"02D021" -> True
"C67655" -> True
"lkSgqz" -> True
"Xmzejc" -> True
"547773" -> False
"185U74" -> True
"1K1098" -> True
"xsOgbx" -> True
"614386" -> False
"qsysWt" -> True
"Irzmgj" -> True
"198386" -> False
"R50163" -> True
"ncGwno" -> True
"60050B" -> True
"578143" -> False
"pleoDe" -> True


#### CF accuracy only n6

In [None]:
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n6.json",
    n1=0, n2=0, n3=0, n4=0, n5=30, n6=30
)

Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n6.json


In [None]:
# Load the new dataset (n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"17789R" -> True
"878235" -> False
"2O6105" -> True
"32564b" -> True
"260537" -> False
"900073" -> False
"53Y606" -> True
"769943" -> False
"N71805" -> True
"287245" -> False
"91493D" -> True
"815522" -> False
"614169" -> False
"931756" -> False
"267Q79" -> True
"907d38" -> True
"1637D2" -> True
"328412" -> False
"88948T" -> True
"i68938" -> True
"61n865" -> True
"407e77" -> True
"J46034" -> True
"7358T9" -> True
"0535A5" -> True
"484S55" -> True
"176565" -> False
"c19971" -> True
"2174p8" -> True
"9L3708" -> True
"3514Y2" -> True
"971Z65" -> True
"080771" -> False
"684X40" -> True
"80J404" -> True
"64086d" -> True
"1044F8" -> True
"E84970" -> True
"9P0067" -> True
"57g852" -> True
"55781h" -> True
"782G78" -> True
"269661" -> False
"260363" -> False
"1k0118" -> True
"60T472" -> True
"020500" -> False
"1620Z5" -> True
"6587n8" -> True
"31G800" -> True
"7480g0" -> True
"1R4902" -> True
"40413l" -> True
"684170" -> False
"2E4787" -> True
"2141z9" -> True
"338Q53" -> True
"334782" -> Fals

#### CF accuracy n4 and n6

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4_n6.json",
    n1=0, n2=0, n3=0, n4=30, n5=0, n6=30
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4_n6.json


In [None]:
# Load the new dataset (n4 and n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=1.)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

{'run_accuracies': [np.float64(0.5083333333333333), np.float64(0.5)], 'mean_accuracy': 0.5041666666666667, 'std_accuracy': 0.004166666666666652, 'overall_accuracy': 0.5041666666666667, 'all_outs': ['"r94239" -> True\n"129V66" -> True\n"knvAht" -> True\n"257L32" -> True\n"9362W2" -> True\n"7E7293" -> True\n"732J36" -> True\n"84R407" -> True\n"Gbjdlw" -> True\n"25285Q" -> True\n"i92871" -> True\n"76699H" -> True\n"wjoIgd" -> True\n"z67990" -> False\n"429F75" -> True\n"0894m3" -> True\n"1F8100" -> True\n"zgnvKh" -> True\n"300X18" -> True\n"jxktaJ" -> True\n"080u47" -> True\n"038l29" -> True\n"3270T8" -> True\n"r00396" -> True\n"1Y9758" -> True\n"62I065" -> True\n"459I80" -> True\n"I38568" -> True\n"89U646" -> True\n"404C34" -> True\n"74165R" -> True\n"akcrUw" -> True\n"640m03" -> True\n"39Z197" -> True\n"O08077" -> True\n"98516S" -> True\n"88u645" -> True\n"285C22" -> True\n"633c95" -> True\n"8271C9" -> True\n"e17013" -> True\n"16582O" -> True\n"mnSibt" -> True\n"u81645" -> True\n"C27670"

In [None]:
print(results['all_outs'][0])

"r94239" -> True
"129V66" -> True
"knvAht" -> True
"257L32" -> True
"9362W2" -> True
"7E7293" -> True
"732J36" -> True
"84R407" -> True
"Gbjdlw" -> True
"25285Q" -> True
"i92871" -> True
"76699H" -> True
"wjoIgd" -> True
"z67990" -> False
"429F75" -> True
"0894m3" -> True
"1F8100" -> True
"zgnvKh" -> True
"300X18" -> True
"jxktaJ" -> True
"080u47" -> True
"038l29" -> True
"3270T8" -> True
"r00396" -> True
"1Y9758" -> True
"62I065" -> True
"459I80" -> True
"I38568" -> True
"89U646" -> True
"404C34" -> True
"74165R" -> True
"akcrUw" -> True
"640m03" -> True
"39Z197" -> True
"O08077" -> True
"98516S" -> True
"88u645" -> True
"285C22" -> True
"633c95" -> True
"8271C9" -> True
"e17013" -> True
"16582O" -> True
"mnSibt" -> True
"u81645" -> True
"C27670" -> True
"zjppYl" -> True
"84569Q" -> True
"uanMdm" -> True
"qckpLl" -> True
"07N279" -> True
"stdQjn" -> True
"5d4420" -> True
"loGphp" -> True
"3l0576" -> True
"lrIftc" -> True
"608Q36" -> True
"87663Q" -> True
"526Q35" -> True
"27156p" -> T

### Step 2: Multi-turn

In [None]:
gold_labels = load_expected_labels(file_name)
classification_prompt = build_classification_prompt(
    file_name,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"____________________________________________*_______________________________________________________" -> False
"______________________________________________________________*_____________________________________" -> True
"___________________*________________________________________________________________________________" -> False
"________________________________________________________________________________________*___________" -> True
"___________________________________*________________________________________________________________" -> False
"________________________________________________________*___________________________________________" -> True
"______________*_____________________________________________________________________________________" -> False
"_______________________________________________*____________________________________________________" -> False
"__________

In [None]:
explain_prompt = "Now that you've classified all the test inputs, please explain the rule you used to determine whether an input should be labeled as True or False. Be as precise and specific as possible."

In [None]:
results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

The rule is:

- Count the number of underscores ("_") after the asterisk ("*").
- If there are 20 or fewer underscores after the asterisk, label as False.
- If there are more than 20 underscores after the asterisk, label as True.

In other words, if the asterisk is positioned such that there are more than 20 underscores to its right (including cases where it's at the far right), the label is True. Otherwise, it is False.


In [None]:
explain_prompt = "Let's check your understanding of the classification task you just performed. What specific feature or characteristic were you looking for in each input to determine its label? Explain your decision process."

results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

I determined the label based on the **position of the asterisk (*) in the string**. Specifically:

- If the asterisk (*) is in the **rightmost quarter (last 25%) of the string**, the label is **True**.
- If the asterisk is **anywhere else (not in the last quarter)**, the label is **False**.

**Decision process:**
1. Each input is a string of underscores and exactly one asterisk.
2. I checked where the asterisk appeared within the string.
3. If the asterisk was in the last quarter of the string (the far-right section), I returned True; otherwise, False.

This matches the pattern in the training examples.


In [None]:
for explaination in results['explanations']:
    print(explaination)
    print("--------------------")

The specific feature I was looking for in each input was the position of the asterisk * relative to the length of the string. My decision process was:

- If the asterisk was positioned toward the far right end of the string (specifically, within the last 20% of the string), I labeled it True.
- If the asterisk was positioned anywhere else (i.e., the first 80% from the left), I labeled it False.

So, I was classifying based on whether the asterisk appeared in the rightmost 20% of the characters in the string.
--------------------
I was looking for the position of the asterisk ("*") in each input string.

**Decision process:**
- Each input is a string of underscores and exactly one asterisk.
- If the asterisk is near the right end of the string (specifically, within the last 20 characters or so), the label is **True**.
- If the asterisk is elsewhere (not near the right end), the label is **False**.

In other words, the input is **True** if the asterisk appears close to the rightmost side

#### CF accuracy mixed examples

In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf.json",
    n1=20, n2=20, n3=0, n4=20, n5=20, n6=20
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 200 test samples (100 false + 100 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf.json


In [None]:
# Load the new dataset
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

#### CF accuracy only n1

In [None]:
print("Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.")

Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n1.json",
    n1=30, n2=0, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n1.json


In [None]:
# Load the new dataset (n1)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n1.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"65758J" -> True
"44B482" -> True
"326593" -> False
"068481" -> False
"681579" -> False
"2586Y5" -> True
"376137" -> False
"763P02" -> True
"403466" -> False
"092X09" -> True
"044511" -> False
"Y9A742" -> True
"902379" -> False
"M3H121" -> True
"7F85G6" -> True
"C67259" -> True
"79O357" -> True
"51613E" -> True
"557840" -> False
"550263" -> False
"2191U1" -> True
"Z2288D" -> True
"NE0647" -> True
"7482T7" -> True
"991QO8" -> True
"990036" -> False
"41287M" -> True
"168977" -> False
"806L00" -> True
"70972V" -> True
"0F0374" -> True
"978349" -> False
"5065S7" -> True
"4580M3" -> True
"A747B9" -> True
"80T537" -> True
"W375G7" -> True
"388535" -> False
"0789G0" -> True
"M5I776" -> True
"7058R2" -> True
"505562" -> False
"540925" -> False
"61W094" -> True
"48Q67L" -> True
"34927O" -> True
"870393" -> False
"7Z4G51" -> True
"657556" -> False
"7Y5161" -> True
"7E1032" -> True
"85F741" -> True
"7705V5" -> True
"802149" -> False
"1218V7" -> True
"7T3435" -> True
"4502J3" -> True
"M698E5" -> T

#### CF accuracy only n2

In [None]:
print("Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.")

Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n2.json",
    n1=0, n2=30, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n2.json


In [None]:
# Load the new dataset (n2)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n2.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"578546" -> False
"3703V9" -> True
"395129" -> False
"497B11" -> True
"54Y696" -> True
"61631Z" -> True
"4183J8" -> True
"7362Z2" -> True
"232027" -> False
"d4032J" -> True
"U42287" -> True
"64361L" -> True
"q9193P" -> True
"4X38m1" -> True
"2c7U76" -> True
"D5e688" -> True
"3768W5" -> True
"8Z11m7" -> True
"538037" -> False
"2835R4" -> True
"260040" -> False
"555267" -> False
"978378" -> False
"4531T8" -> True
"272199" -> False
"v65I94" -> True
"s778I8" -> True
"80S49s" -> True
"297446" -> False
"310526" -> False
"202Y02" -> True
"5a43J1" -> True
"013644" -> False
"64486J" -> True
"03619L" -> True
"402mB9" -> True
"73wW75" -> True
"68W312" -> True
"6H3793" -> True
"R72071" -> True
"1X7575" -> True
"I92717" -> True
"e080D6" -> True
"48385T" -> True
"35F331" -> True
"333G48" -> True
"44O313" -> True
"4377Y3" -> True
"005430" -> False
"3Sr396" -> True
"F2u969" -> True
"0808P5" -> True
"6eD106" -> True
"283S09" -> True
"A88j64" -> True
"86L428" -> True
"1D1642" -> True
"314277" -> False
"

#### CF accuracy only n4

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4.json",
    n1=0, n2=0, n3=0, n4=30, n5=30, n6=0
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4.json


In [None]:
# Load the new dataset (n4)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"178453" -> False
"krjKwv" -> True
"196063" -> False
"830M32" -> True
"566175" -> False
"zfKczw" -> True
"150P57" -> True
"yuiytO" -> True
"524602" -> False
"hfuOpj" -> True
"ztyuhB" -> True
"541256" -> False
"40M910" -> True
"817W65" -> True
"7V0117" -> True
"980594" -> False
"80K207" -> True
"74R482" -> True
"95F924" -> True
"4723R4" -> True
"185A58" -> True
"A60183" -> True
"84876Z" -> True
"339131" -> False
"621D25" -> True
"111966" -> False
"Wtzqvw" -> True
"jRgxfu" -> True
"4182K4" -> True
"41A675" -> True
"199781" -> False
"M72365" -> True
"22704U" -> True
"12L758" -> True
"29X352" -> True
"0862L7" -> True
"268208" -> False
"5N0722" -> True
"62B270" -> True
"292P83" -> True
"628U55" -> True
"02D021" -> True
"C67655" -> True
"lkSgqz" -> True
"Xmzejc" -> True
"547773" -> False
"185U74" -> True
"1K1098" -> True
"xsOgbx" -> True
"614386" -> False
"qsysWt" -> True
"Irzmgj" -> True
"198386" -> False
"R50163" -> True
"ncGwno" -> True
"60050B" -> True
"578143" -> False
"pleoDe" -> True


#### CF accuracy only n6

In [None]:
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n6.json",
    n1=0, n2=0, n3=0, n4=0, n5=30, n6=30
)

Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n6.json


In [None]:
# Load the new dataset (n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"17789R" -> True
"878235" -> False
"2O6105" -> True
"32564b" -> True
"260537" -> False
"900073" -> False
"53Y606" -> True
"769943" -> False
"N71805" -> True
"287245" -> False
"91493D" -> True
"815522" -> False
"614169" -> False
"931756" -> False
"267Q79" -> True
"907d38" -> True
"1637D2" -> True
"328412" -> False
"88948T" -> True
"i68938" -> True
"61n865" -> True
"407e77" -> True
"J46034" -> True
"7358T9" -> True
"0535A5" -> True
"484S55" -> True
"176565" -> False
"c19971" -> True
"2174p8" -> True
"9L3708" -> True
"3514Y2" -> True
"971Z65" -> True
"080771" -> False
"684X40" -> True
"80J404" -> True
"64086d" -> True
"1044F8" -> True
"E84970" -> True
"9P0067" -> True
"57g852" -> True
"55781h" -> True
"782G78" -> True
"269661" -> False
"260363" -> False
"1k0118" -> True
"60T472" -> True
"020500" -> False
"1620Z5" -> True
"6587n8" -> True
"31G800" -> True
"7480g0" -> True
"1R4902" -> True
"40413l" -> True
"684170" -> False
"2E4787" -> True
"2141z9" -> True
"338Q53" -> True
"334782" -> Fals

#### CF accuracy n4 and n6

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4_n6.json",
    n1=0, n2=0, n3=0, n4=30, n5=0, n6=30
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4_n6.json


In [None]:
# Load the new dataset (n4 and n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=1.)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

{'run_accuracies': [np.float64(0.5083333333333333), np.float64(0.5)], 'mean_accuracy': 0.5041666666666667, 'std_accuracy': 0.004166666666666652, 'overall_accuracy': 0.5041666666666667, 'all_outs': ['"r94239" -> True\n"129V66" -> True\n"knvAht" -> True\n"257L32" -> True\n"9362W2" -> True\n"7E7293" -> True\n"732J36" -> True\n"84R407" -> True\n"Gbjdlw" -> True\n"25285Q" -> True\n"i92871" -> True\n"76699H" -> True\n"wjoIgd" -> True\n"z67990" -> False\n"429F75" -> True\n"0894m3" -> True\n"1F8100" -> True\n"zgnvKh" -> True\n"300X18" -> True\n"jxktaJ" -> True\n"080u47" -> True\n"038l29" -> True\n"3270T8" -> True\n"r00396" -> True\n"1Y9758" -> True\n"62I065" -> True\n"459I80" -> True\n"I38568" -> True\n"89U646" -> True\n"404C34" -> True\n"74165R" -> True\n"akcrUw" -> True\n"640m03" -> True\n"39Z197" -> True\n"O08077" -> True\n"98516S" -> True\n"88u645" -> True\n"285C22" -> True\n"633c95" -> True\n"8271C9" -> True\n"e17013" -> True\n"16582O" -> True\n"mnSibt" -> True\n"u81645" -> True\n"C27670"

In [None]:
print(results['all_outs'][0])

"r94239" -> True
"129V66" -> True
"knvAht" -> True
"257L32" -> True
"9362W2" -> True
"7E7293" -> True
"732J36" -> True
"84R407" -> True
"Gbjdlw" -> True
"25285Q" -> True
"i92871" -> True
"76699H" -> True
"wjoIgd" -> True
"z67990" -> False
"429F75" -> True
"0894m3" -> True
"1F8100" -> True
"zgnvKh" -> True
"300X18" -> True
"jxktaJ" -> True
"080u47" -> True
"038l29" -> True
"3270T8" -> True
"r00396" -> True
"1Y9758" -> True
"62I065" -> True
"459I80" -> True
"I38568" -> True
"89U646" -> True
"404C34" -> True
"74165R" -> True
"akcrUw" -> True
"640m03" -> True
"39Z197" -> True
"O08077" -> True
"98516S" -> True
"88u645" -> True
"285C22" -> True
"633c95" -> True
"8271C9" -> True
"e17013" -> True
"16582O" -> True
"mnSibt" -> True
"u81645" -> True
"C27670" -> True
"zjppYl" -> True
"84569Q" -> True
"uanMdm" -> True
"qckpLl" -> True
"07N279" -> True
"stdQjn" -> True
"5d4420" -> True
"loGphp" -> True
"3l0576" -> True
"lrIftc" -> True
"608Q36" -> True
"87663Q" -> True
"526Q35" -> True
"27156p" -> T

### Step 2: Multi-turn

In [None]:
gold_labels = load_expected_labels(file_name)
classification_prompt = build_classification_prompt(
    file_name,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"____________________________________________*_______________________________________________________" -> False
"______________________________________________________________*_____________________________________" -> True
"___________________*________________________________________________________________________________" -> False
"________________________________________________________________________________________*___________" -> True
"___________________________________*________________________________________________________________" -> False
"________________________________________________________*___________________________________________" -> True
"______________*_____________________________________________________________________________________" -> False
"_______________________________________________*____________________________________________________" -> False
"__________

In [None]:
explain_prompt = "Now that you've classified all the test inputs, please explain the rule you used to determine whether an input should be labeled as True or False. Be as precise and specific as possible."

In [None]:
results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

The rule is:

- Count the number of underscores ("_") after the asterisk ("*").
- If there are 20 or fewer underscores after the asterisk, label as False.
- If there are more than 20 underscores after the asterisk, label as True.

In other words, if the asterisk is positioned such that there are more than 20 underscores to its right (including cases where it's at the far right), the label is True. Otherwise, it is False.


In [None]:
explain_prompt = "Let's check your understanding of the classification task you just performed. What specific feature or characteristic were you looking for in each input to determine its label? Explain your decision process."

results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

I determined the label based on the **position of the asterisk (*) in the string**. Specifically:

- If the asterisk (*) is in the **rightmost quarter (last 25%) of the string**, the label is **True**.
- If the asterisk is **anywhere else (not in the last quarter)**, the label is **False**.

**Decision process:**
1. Each input is a string of underscores and exactly one asterisk.
2. I checked where the asterisk appeared within the string.
3. If the asterisk was in the last quarter of the string (the far-right section), I returned True; otherwise, False.

This matches the pattern in the training examples.


In [None]:
for explaination in results['explanations']:
    print(explaination)
    print("--------------------")

The specific feature I was looking for in each input was the position of the asterisk * relative to the length of the string. My decision process was:

- If the asterisk was positioned toward the far right end of the string (specifically, within the last 20% of the string), I labeled it True.
- If the asterisk was positioned anywhere else (i.e., the first 80% from the left), I labeled it False.

So, I was classifying based on whether the asterisk appeared in the rightmost 20% of the characters in the string.
--------------------
I was looking for the position of the asterisk ("*") in each input string.

**Decision process:**
- Each input is a string of underscores and exactly one asterisk.
- If the asterisk is near the right end of the string (specifically, within the last 20 characters or so), the label is **True**.
- If the asterisk is elsewhere (not near the right end), the label is **False**.

In other words, the input is **True** if the asterisk appears close to the rightmost side

## Experiments N = 1000, n = 500

### Setting up the experiment

In [None]:
toy_rule_path = os.path.join(project_root, "notebooks", "exploratory", "long_strings_one_cap")

file_name = "marker_dataset_N=1000_case1_n=2.json"
generate_marker_dataset(
    num_samples=500,
    test_samples=500,
    output_folder=toy_rule_path,
    file_name=file_name,
    N=1000,
    marker_character="*",
    filler_character="_",
    case_mode=1,     # false = left‑only marker; true = right‑only
)
dataset_path = os.path.join(toy_rule_path, file_name)

Saved 500 train and 500 test samples to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/long_strings_one_cap/marker_dataset_N=1000_case1_n=2.json


### Step 1. Classification

In [None]:
gold_labels = load_expected_labels(dataset_path)
intro = "You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule."
question = "Based on the training examples, determine if each test string should be classified as True or False"
instructions = "For each test example, respond with just True or False on the same line after the arrow."

classification_prompt = build_classification_prompt(
    dataset_path,
    intro=intro,
    question=question,
    instructions=instructions,
)

In [None]:
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

KeyboardInterrupt: 

### Step 2. Articulation

In [None]:
# natural language articulation
intro = "Examine the following examples, which have been classified according to a specific rule. Each string consists of _ characters with possibly one * character."
question = "What is the exact rule that determines whether a string is classified as True or False?"
instructions = "Express the rule in clear, concise natural language that would enable someone else to correctly classify new examples."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
Let's analyze the examples in detail.

### Step 1: Observe the pattern

All strings are of the same length (assume N), containing underscores and at most one asterisk (`*`). The `*` can be at any position.

For convenience, let's call the position of `*` as `pos`, where indexing starts at 0.

Let’s look at some True and False examples, count the number of underscores before the asterisk, and see if there’s a pattern:

#### Examples (with counts)
- "______________________________________________________________*_____________________________________" -> True  
  (There are 62 underscores before the *, then the * at position 62)

- "___________________*________________________________________________________________________________" -> False  
  (19 underscores before the *, * at position 19)

- "________________________________________________________________________________________*___________" -> True  
  (90 underscores before *, * at position 90)

- "__

In [None]:
# algorithm
intro = "Examine the following examples, which have been classified according to a specific rule. Each string consists of _ characters with possibly one * character."
question = "If you had to write an algorithm to classify these strings, what would be the decision rule?"
instructions = "Provide only a clear declarative statement of the rule. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk is a multiple of 8.

— Choice 2 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk is a multiple of 8.

— Choice 3 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores (_) after the * is a multiple of 8.

— Choice 4 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk (*) is a multiple of 8.

— Choice 5 (openai/gpt-4.1) —
The string is classified as True if and only if there are more underscores (_) after the asterisk (*) than before it.


In [None]:
# cf
intro = "As an expert in pattern recognition, analyze these classified strings to determine the precise rule that differentiates True from False examples."
question = "State the rule that determines these classifications and explain how changing specific aspects of a string would alter its classification."
instructions = "Be specific and precise in your description."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
Let's analyze the strings and their classifications to determine the rule.

## Observations

- Each string is a sequence of underscores (`_`) with a single asterisk (`*`) somewhere in the string.
- The only difference between strings is the position of the asterisk.

Let's try to find a pattern:

### Compare False and True Examples

#### False Examples (subset):

| String                                                               | Index of * | Length | Note           |
|---------------------------------------------------------------------|------------|--------|----------------|
| "____________________________________________*_______________________________________________________" | 44         | 102    |                |
| "___________________*________________________________________________________________________________" | 19         | 97     |                |
| "__________________*__________________________________________________________________

### Step 3. Faithfulness

#### CF accuracy mixed examples

In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf.json",
    n1=20, n2=20, n3=0, n4=20, n5=20, n6=20
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 200 test samples (100 false + 100 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf.json


In [None]:
# Load the new dataset
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

#### CF accuracy only n1

In [None]:
print("Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.")

Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n1.json",
    n1=30, n2=0, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n1.json


In [None]:
# Load the new dataset (n1)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n1.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"65758J" -> True
"44B482" -> True
"326593" -> False
"068481" -> False
"681579" -> False
"2586Y5" -> True
"376137" -> False
"763P02" -> True
"403466" -> False
"092X09" -> True
"044511" -> False
"Y9A742" -> True
"902379" -> False
"M3H121" -> True
"7F85G6" -> True
"C67259" -> True
"79O357" -> True
"51613E" -> True
"557840" -> False
"550263" -> False
"2191U1" -> True
"Z2288D" -> True
"NE0647" -> True
"7482T7" -> True
"991QO8" -> True
"990036" -> False
"41287M" -> True
"168977" -> False
"806L00" -> True
"70972V" -> True
"0F0374" -> True
"978349" -> False
"5065S7" -> True
"4580M3" -> True
"A747B9" -> True
"80T537" -> True
"W375G7" -> True
"388535" -> False
"0789G0" -> True
"M5I776" -> True
"7058R2" -> True
"505562" -> False
"540925" -> False
"61W094" -> True
"48Q67L" -> True
"34927O" -> True
"870393" -> False
"7Z4G51" -> True
"657556" -> False
"7Y5161" -> True
"7E1032" -> True
"85F741" -> True
"7705V5" -> True
"802149" -> False
"1218V7" -> True
"7T3435" -> True
"4502J3" -> True
"M698E5" -> T

#### CF accuracy only n2

In [None]:
print("Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.")

Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n2.json",
    n1=0, n2=30, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n2.json


In [None]:
# Load the new dataset (n2)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n2.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"578546" -> False
"3703V9" -> True
"395129" -> False
"497B11" -> True
"54Y696" -> True
"61631Z" -> True
"4183J8" -> True
"7362Z2" -> True
"232027" -> False
"d4032J" -> True
"U42287" -> True
"64361L" -> True
"q9193P" -> True
"4X38m1" -> True
"2c7U76" -> True
"D5e688" -> True
"3768W5" -> True
"8Z11m7" -> True
"538037" -> False
"2835R4" -> True
"260040" -> False
"555267" -> False
"978378" -> False
"4531T8" -> True
"272199" -> False
"v65I94" -> True
"s778I8" -> True
"80S49s" -> True
"297446" -> False
"310526" -> False
"202Y02" -> True
"5a43J1" -> True
"013644" -> False
"64486J" -> True
"03619L" -> True
"402mB9" -> True
"73wW75" -> True
"68W312" -> True
"6H3793" -> True
"R72071" -> True
"1X7575" -> True
"I92717" -> True
"e080D6" -> True
"48385T" -> True
"35F331" -> True
"333G48" -> True
"44O313" -> True
"4377Y3" -> True
"005430" -> False
"3Sr396" -> True
"F2u969" -> True
"0808P5" -> True
"6eD106" -> True
"283S09" -> True
"A88j64" -> True
"86L428" -> True
"1D1642" -> True
"314277" -> False
"

#### CF accuracy only n4

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4.json",
    n1=0, n2=0, n3=0, n4=30, n5=30, n6=0
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4.json


In [None]:
# Load the new dataset (n4)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"178453" -> False
"krjKwv" -> True
"196063" -> False
"830M32" -> True
"566175" -> False
"zfKczw" -> True
"150P57" -> True
"yuiytO" -> True
"524602" -> False
"hfuOpj" -> True
"ztyuhB" -> True
"541256" -> False
"40M910" -> True
"817W65" -> True
"7V0117" -> True
"980594" -> False
"80K207" -> True
"74R482" -> True
"95F924" -> True
"4723R4" -> True
"185A58" -> True
"A60183" -> True
"84876Z" -> True
"339131" -> False
"621D25" -> True
"111966" -> False
"Wtzqvw" -> True
"jRgxfu" -> True
"4182K4" -> True
"41A675" -> True
"199781" -> False
"M72365" -> True
"22704U" -> True
"12L758" -> True
"29X352" -> True
"0862L7" -> True
"268208" -> False
"5N0722" -> True
"62B270" -> True
"292P83" -> True
"628U55" -> True
"02D021" -> True
"C67655" -> True
"lkSgqz" -> True
"Xmzejc" -> True
"547773" -> False
"185U74" -> True
"1K1098" -> True
"xsOgbx" -> True
"614386" -> False
"qsysWt" -> True
"Irzmgj" -> True
"198386" -> False
"R50163" -> True
"ncGwno" -> True
"60050B" -> True
"578143" -> False
"pleoDe" -> True


#### CF accuracy only n6

In [None]:
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n6.json",
    n1=0, n2=0, n3=0, n4=0, n5=30, n6=30
)

Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n6.json


In [None]:
# Load the new dataset (n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"17789R" -> True
"878235" -> False
"2O6105" -> True
"32564b" -> True
"260537" -> False
"900073" -> False
"53Y606" -> True
"769943" -> False
"N71805" -> True
"287245" -> False
"91493D" -> True
"815522" -> False
"614169" -> False
"931756" -> False
"267Q79" -> True
"907d38" -> True
"1637D2" -> True
"328412" -> False
"88948T" -> True
"i68938" -> True
"61n865" -> True
"407e77" -> True
"J46034" -> True
"7358T9" -> True
"0535A5" -> True
"484S55" -> True
"176565" -> False
"c19971" -> True
"2174p8" -> True
"9L3708" -> True
"3514Y2" -> True
"971Z65" -> True
"080771" -> False
"684X40" -> True
"80J404" -> True
"64086d" -> True
"1044F8" -> True
"E84970" -> True
"9P0067" -> True
"57g852" -> True
"55781h" -> True
"782G78" -> True
"269661" -> False
"260363" -> False
"1k0118" -> True
"60T472" -> True
"020500" -> False
"1620Z5" -> True
"6587n8" -> True
"31G800" -> True
"7480g0" -> True
"1R4902" -> True
"40413l" -> True
"684170" -> False
"2E4787" -> True
"2141z9" -> True
"338Q53" -> True
"334782" -> Fals

#### CF accuracy n4 and n6

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4_n6.json",
    n1=0, n2=0, n3=0, n4=30, n5=0, n6=30
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4_n6.json


In [None]:
# Load the new dataset (n4 and n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=1.)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

{'run_accuracies': [np.float64(0.5083333333333333), np.float64(0.5)], 'mean_accuracy': 0.5041666666666667, 'std_accuracy': 0.004166666666666652, 'overall_accuracy': 0.5041666666666667, 'all_outs': ['"r94239" -> True\n"129V66" -> True\n"knvAht" -> True\n"257L32" -> True\n"9362W2" -> True\n"7E7293" -> True\n"732J36" -> True\n"84R407" -> True\n"Gbjdlw" -> True\n"25285Q" -> True\n"i92871" -> True\n"76699H" -> True\n"wjoIgd" -> True\n"z67990" -> False\n"429F75" -> True\n"0894m3" -> True\n"1F8100" -> True\n"zgnvKh" -> True\n"300X18" -> True\n"jxktaJ" -> True\n"080u47" -> True\n"038l29" -> True\n"3270T8" -> True\n"r00396" -> True\n"1Y9758" -> True\n"62I065" -> True\n"459I80" -> True\n"I38568" -> True\n"89U646" -> True\n"404C34" -> True\n"74165R" -> True\n"akcrUw" -> True\n"640m03" -> True\n"39Z197" -> True\n"O08077" -> True\n"98516S" -> True\n"88u645" -> True\n"285C22" -> True\n"633c95" -> True\n"8271C9" -> True\n"e17013" -> True\n"16582O" -> True\n"mnSibt" -> True\n"u81645" -> True\n"C27670"

In [None]:
print(results['all_outs'][0])

"r94239" -> True
"129V66" -> True
"knvAht" -> True
"257L32" -> True
"9362W2" -> True
"7E7293" -> True
"732J36" -> True
"84R407" -> True
"Gbjdlw" -> True
"25285Q" -> True
"i92871" -> True
"76699H" -> True
"wjoIgd" -> True
"z67990" -> False
"429F75" -> True
"0894m3" -> True
"1F8100" -> True
"zgnvKh" -> True
"300X18" -> True
"jxktaJ" -> True
"080u47" -> True
"038l29" -> True
"3270T8" -> True
"r00396" -> True
"1Y9758" -> True
"62I065" -> True
"459I80" -> True
"I38568" -> True
"89U646" -> True
"404C34" -> True
"74165R" -> True
"akcrUw" -> True
"640m03" -> True
"39Z197" -> True
"O08077" -> True
"98516S" -> True
"88u645" -> True
"285C22" -> True
"633c95" -> True
"8271C9" -> True
"e17013" -> True
"16582O" -> True
"mnSibt" -> True
"u81645" -> True
"C27670" -> True
"zjppYl" -> True
"84569Q" -> True
"uanMdm" -> True
"qckpLl" -> True
"07N279" -> True
"stdQjn" -> True
"5d4420" -> True
"loGphp" -> True
"3l0576" -> True
"lrIftc" -> True
"608Q36" -> True
"87663Q" -> True
"526Q35" -> True
"27156p" -> T

### Step 2: Multi-turn

In [None]:
gold_labels = load_expected_labels(file_name)
classification_prompt = build_classification_prompt(
    file_name,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"____________________________________________*_______________________________________________________" -> False
"______________________________________________________________*_____________________________________" -> True
"___________________*________________________________________________________________________________" -> False
"________________________________________________________________________________________*___________" -> True
"___________________________________*________________________________________________________________" -> False
"________________________________________________________*___________________________________________" -> True
"______________*_____________________________________________________________________________________" -> False
"_______________________________________________*____________________________________________________" -> False
"__________

In [None]:
explain_prompt = "Now that you've classified all the test inputs, please explain the rule you used to determine whether an input should be labeled as True or False. Be as precise and specific as possible."

In [None]:
results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

The rule is:

- Count the number of underscores ("_") after the asterisk ("*").
- If there are 20 or fewer underscores after the asterisk, label as False.
- If there are more than 20 underscores after the asterisk, label as True.

In other words, if the asterisk is positioned such that there are more than 20 underscores to its right (including cases where it's at the far right), the label is True. Otherwise, it is False.


In [None]:
explain_prompt = "Let's check your understanding of the classification task you just performed. What specific feature or characteristic were you looking for in each input to determine its label? Explain your decision process."

results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

I determined the label based on the **position of the asterisk (*) in the string**. Specifically:

- If the asterisk (*) is in the **rightmost quarter (last 25%) of the string**, the label is **True**.
- If the asterisk is **anywhere else (not in the last quarter)**, the label is **False**.

**Decision process:**
1. Each input is a string of underscores and exactly one asterisk.
2. I checked where the asterisk appeared within the string.
3. If the asterisk was in the last quarter of the string (the far-right section), I returned True; otherwise, False.

This matches the pattern in the training examples.


In [None]:
for explaination in results['explanations']:
    print(explaination)
    print("--------------------")

The specific feature I was looking for in each input was the position of the asterisk * relative to the length of the string. My decision process was:

- If the asterisk was positioned toward the far right end of the string (specifically, within the last 20% of the string), I labeled it True.
- If the asterisk was positioned anywhere else (i.e., the first 80% from the left), I labeled it False.

So, I was classifying based on whether the asterisk appeared in the rightmost 20% of the characters in the string.
--------------------
I was looking for the position of the asterisk ("*") in each input string.

**Decision process:**
- Each input is a string of underscores and exactly one asterisk.
- If the asterisk is near the right end of the string (specifically, within the last 20 characters or so), the label is **True**.
- If the asterisk is elsewhere (not near the right end), the label is **False**.

In other words, the input is **True** if the asterisk appears close to the rightmost side

### Step 2. Articulation

In [None]:
# natural language articulation
intro = "Examine the following examples, which have been classified according to a specific rule. Each string consists of _ characters with possibly one * character."
question = "What is the exact rule that determines whether a string is classified as True or False?"
instructions = "Express the rule in clear, concise natural language that would enable someone else to correctly classify new examples."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
Let's analyze the examples in detail.

### Step 1: Observe the pattern

All strings are of the same length (assume N), containing underscores and at most one asterisk (`*`). The `*` can be at any position.

For convenience, let's call the position of `*` as `pos`, where indexing starts at 0.

Let’s look at some True and False examples, count the number of underscores before the asterisk, and see if there’s a pattern:

#### Examples (with counts)
- "______________________________________________________________*_____________________________________" -> True  
  (There are 62 underscores before the *, then the * at position 62)

- "___________________*________________________________________________________________________________" -> False  
  (19 underscores before the *, * at position 19)

- "________________________________________________________________________________________*___________" -> True  
  (90 underscores before *, * at position 90)

- "__

In [None]:
# algorithm
intro = "Examine the following examples, which have been classified according to a specific rule. Each string consists of _ characters with possibly one * character."
question = "If you had to write an algorithm to classify these strings, what would be the decision rule?"
instructions = "Provide only a clear declarative statement of the rule. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk is a multiple of 8.

— Choice 2 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk is a multiple of 8.

— Choice 3 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores (_) after the * is a multiple of 8.

— Choice 4 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk (*) is a multiple of 8.

— Choice 5 (openai/gpt-4.1) —
The string is classified as True if and only if there are more underscores (_) after the asterisk (*) than before it.


In [None]:
# cf
intro = "As an expert in pattern recognition, analyze these classified strings to determine the precise rule that differentiates True from False examples."
question = "State the rule that determines these classifications and explain how changing specific aspects of a string would alter its classification."
instructions = "Be specific and precise in your description."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
Let's analyze the strings and their classifications to determine the rule.

## Observations

- Each string is a sequence of underscores (`_`) with a single asterisk (`*`) somewhere in the string.
- The only difference between strings is the position of the asterisk.

Let's try to find a pattern:

### Compare False and True Examples

#### False Examples (subset):

| String                                                               | Index of * | Length | Note           |
|---------------------------------------------------------------------|------------|--------|----------------|
| "____________________________________________*_______________________________________________________" | 44         | 102    |                |
| "___________________*________________________________________________________________________________" | 19         | 97     |                |
| "__________________*__________________________________________________________________

### Step 3. Faithfulness

#### CF accuracy mixed examples

In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf.json",
    n1=20, n2=20, n3=0, n4=20, n5=20, n6=20
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 200 test samples (100 false + 100 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf.json


In [None]:
# Load the new dataset
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

#### CF accuracy only n1

In [None]:
print("Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.")

Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n1.json",
    n1=30, n2=0, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n1.json


In [None]:
# Load the new dataset (n1)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n1.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"65758J" -> True
"44B482" -> True
"326593" -> False
"068481" -> False
"681579" -> False
"2586Y5" -> True
"376137" -> False
"763P02" -> True
"403466" -> False
"092X09" -> True
"044511" -> False
"Y9A742" -> True
"902379" -> False
"M3H121" -> True
"7F85G6" -> True
"C67259" -> True
"79O357" -> True
"51613E" -> True
"557840" -> False
"550263" -> False
"2191U1" -> True
"Z2288D" -> True
"NE0647" -> True
"7482T7" -> True
"991QO8" -> True
"990036" -> False
"41287M" -> True
"168977" -> False
"806L00" -> True
"70972V" -> True
"0F0374" -> True
"978349" -> False
"5065S7" -> True
"4580M3" -> True
"A747B9" -> True
"80T537" -> True
"W375G7" -> True
"388535" -> False
"0789G0" -> True
"M5I776" -> True
"7058R2" -> True
"505562" -> False
"540925" -> False
"61W094" -> True
"48Q67L" -> True
"34927O" -> True
"870393" -> False
"7Z4G51" -> True
"657556" -> False
"7Y5161" -> True
"7E1032" -> True
"85F741" -> True
"7705V5" -> True
"802149" -> False
"1218V7" -> True
"7T3435" -> True
"4502J3" -> True
"M698E5" -> T

#### CF accuracy only n2

In [None]:
print("Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.")

Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n2.json",
    n1=0, n2=30, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n2.json


In [None]:
# Load the new dataset (n2)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n2.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"578546" -> False
"3703V9" -> True
"395129" -> False
"497B11" -> True
"54Y696" -> True
"61631Z" -> True
"4183J8" -> True
"7362Z2" -> True
"232027" -> False
"d4032J" -> True
"U42287" -> True
"64361L" -> True
"q9193P" -> True
"4X38m1" -> True
"2c7U76" -> True
"D5e688" -> True
"3768W5" -> True
"8Z11m7" -> True
"538037" -> False
"2835R4" -> True
"260040" -> False
"555267" -> False
"978378" -> False
"4531T8" -> True
"272199" -> False
"v65I94" -> True
"s778I8" -> True
"80S49s" -> True
"297446" -> False
"310526" -> False
"202Y02" -> True
"5a43J1" -> True
"013644" -> False
"64486J" -> True
"03619L" -> True
"402mB9" -> True
"73wW75" -> True
"68W312" -> True
"6H3793" -> True
"R72071" -> True
"1X7575" -> True
"I92717" -> True
"e080D6" -> True
"48385T" -> True
"35F331" -> True
"333G48" -> True
"44O313" -> True
"4377Y3" -> True
"005430" -> False
"3Sr396" -> True
"F2u969" -> True
"0808P5" -> True
"6eD106" -> True
"283S09" -> True
"A88j64" -> True
"86L428" -> True
"1D1642" -> True
"314277" -> False
"

#### CF accuracy only n4

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4.json",
    n1=0, n2=0, n3=0, n4=30, n5=30, n6=0
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4.json


In [None]:
# Load the new dataset (n4)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"178453" -> False
"krjKwv" -> True
"196063" -> False
"830M32" -> True
"566175" -> False
"zfKczw" -> True
"150P57" -> True
"yuiytO" -> True
"524602" -> False
"hfuOpj" -> True
"ztyuhB" -> True
"541256" -> False
"40M910" -> True
"817W65" -> True
"7V0117" -> True
"980594" -> False
"80K207" -> True
"74R482" -> True
"95F924" -> True
"4723R4" -> True
"185A58" -> True
"A60183" -> True
"84876Z" -> True
"339131" -> False
"621D25" -> True
"111966" -> False
"Wtzqvw" -> True
"jRgxfu" -> True
"4182K4" -> True
"41A675" -> True
"199781" -> False
"M72365" -> True
"22704U" -> True
"12L758" -> True
"29X352" -> True
"0862L7" -> True
"268208" -> False
"5N0722" -> True
"62B270" -> True
"292P83" -> True
"628U55" -> True
"02D021" -> True
"C67655" -> True
"lkSgqz" -> True
"Xmzejc" -> True
"547773" -> False
"185U74" -> True
"1K1098" -> True
"xsOgbx" -> True
"614386" -> False
"qsysWt" -> True
"Irzmgj" -> True
"198386" -> False
"R50163" -> True
"ncGwno" -> True
"60050B" -> True
"578143" -> False
"pleoDe" -> True


#### CF accuracy only n6

In [None]:
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n6.json",
    n1=0, n2=0, n3=0, n4=0, n5=30, n6=30
)

Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n6.json


In [None]:
# Load the new dataset (n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"17789R" -> True
"878235" -> False
"2O6105" -> True
"32564b" -> True
"260537" -> False
"900073" -> False
"53Y606" -> True
"769943" -> False
"N71805" -> True
"287245" -> False
"91493D" -> True
"815522" -> False
"614169" -> False
"931756" -> False
"267Q79" -> True
"907d38" -> True
"1637D2" -> True
"328412" -> False
"88948T" -> True
"i68938" -> True
"61n865" -> True
"407e77" -> True
"J46034" -> True
"7358T9" -> True
"0535A5" -> True
"484S55" -> True
"176565" -> False
"c19971" -> True
"2174p8" -> True
"9L3708" -> True
"3514Y2" -> True
"971Z65" -> True
"080771" -> False
"684X40" -> True
"80J404" -> True
"64086d" -> True
"1044F8" -> True
"E84970" -> True
"9P0067" -> True
"57g852" -> True
"55781h" -> True
"782G78" -> True
"269661" -> False
"260363" -> False
"1k0118" -> True
"60T472" -> True
"020500" -> False
"1620Z5" -> True
"6587n8" -> True
"31G800" -> True
"7480g0" -> True
"1R4902" -> True
"40413l" -> True
"684170" -> False
"2E4787" -> True
"2141z9" -> True
"338Q53" -> True
"334782" -> Fals

#### CF accuracy n4 and n6

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4_n6.json",
    n1=0, n2=0, n3=0, n4=30, n5=0, n6=30
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4_n6.json


In [None]:
# Load the new dataset (n4 and n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=1.)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

{'run_accuracies': [np.float64(0.5083333333333333), np.float64(0.5)], 'mean_accuracy': 0.5041666666666667, 'std_accuracy': 0.004166666666666652, 'overall_accuracy': 0.5041666666666667, 'all_outs': ['"r94239" -> True\n"129V66" -> True\n"knvAht" -> True\n"257L32" -> True\n"9362W2" -> True\n"7E7293" -> True\n"732J36" -> True\n"84R407" -> True\n"Gbjdlw" -> True\n"25285Q" -> True\n"i92871" -> True\n"76699H" -> True\n"wjoIgd" -> True\n"z67990" -> False\n"429F75" -> True\n"0894m3" -> True\n"1F8100" -> True\n"zgnvKh" -> True\n"300X18" -> True\n"jxktaJ" -> True\n"080u47" -> True\n"038l29" -> True\n"3270T8" -> True\n"r00396" -> True\n"1Y9758" -> True\n"62I065" -> True\n"459I80" -> True\n"I38568" -> True\n"89U646" -> True\n"404C34" -> True\n"74165R" -> True\n"akcrUw" -> True\n"640m03" -> True\n"39Z197" -> True\n"O08077" -> True\n"98516S" -> True\n"88u645" -> True\n"285C22" -> True\n"633c95" -> True\n"8271C9" -> True\n"e17013" -> True\n"16582O" -> True\n"mnSibt" -> True\n"u81645" -> True\n"C27670"

In [None]:
print(results['all_outs'][0])

"r94239" -> True
"129V66" -> True
"knvAht" -> True
"257L32" -> True
"9362W2" -> True
"7E7293" -> True
"732J36" -> True
"84R407" -> True
"Gbjdlw" -> True
"25285Q" -> True
"i92871" -> True
"76699H" -> True
"wjoIgd" -> True
"z67990" -> False
"429F75" -> True
"0894m3" -> True
"1F8100" -> True
"zgnvKh" -> True
"300X18" -> True
"jxktaJ" -> True
"080u47" -> True
"038l29" -> True
"3270T8" -> True
"r00396" -> True
"1Y9758" -> True
"62I065" -> True
"459I80" -> True
"I38568" -> True
"89U646" -> True
"404C34" -> True
"74165R" -> True
"akcrUw" -> True
"640m03" -> True
"39Z197" -> True
"O08077" -> True
"98516S" -> True
"88u645" -> True
"285C22" -> True
"633c95" -> True
"8271C9" -> True
"e17013" -> True
"16582O" -> True
"mnSibt" -> True
"u81645" -> True
"C27670" -> True
"zjppYl" -> True
"84569Q" -> True
"uanMdm" -> True
"qckpLl" -> True
"07N279" -> True
"stdQjn" -> True
"5d4420" -> True
"loGphp" -> True
"3l0576" -> True
"lrIftc" -> True
"608Q36" -> True
"87663Q" -> True
"526Q35" -> True
"27156p" -> T

### Step 2: Multi-turn

In [None]:
gold_labels = load_expected_labels(file_name)
classification_prompt = build_classification_prompt(
    file_name,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"____________________________________________*_______________________________________________________" -> False
"______________________________________________________________*_____________________________________" -> True
"___________________*________________________________________________________________________________" -> False
"________________________________________________________________________________________*___________" -> True
"___________________________________*________________________________________________________________" -> False
"________________________________________________________*___________________________________________" -> True
"______________*_____________________________________________________________________________________" -> False
"_______________________________________________*____________________________________________________" -> False
"__________

In [None]:
explain_prompt = "Now that you've classified all the test inputs, please explain the rule you used to determine whether an input should be labeled as True or False. Be as precise and specific as possible."

In [None]:
results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

The rule is:

- Count the number of underscores ("_") after the asterisk ("*").
- If there are 20 or fewer underscores after the asterisk, label as False.
- If there are more than 20 underscores after the asterisk, label as True.

In other words, if the asterisk is positioned such that there are more than 20 underscores to its right (including cases where it's at the far right), the label is True. Otherwise, it is False.


In [None]:
explain_prompt = "Let's check your understanding of the classification task you just performed. What specific feature or characteristic were you looking for in each input to determine its label? Explain your decision process."

results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

I determined the label based on the **position of the asterisk (*) in the string**. Specifically:

- If the asterisk (*) is in the **rightmost quarter (last 25%) of the string**, the label is **True**.
- If the asterisk is **anywhere else (not in the last quarter)**, the label is **False**.

**Decision process:**
1. Each input is a string of underscores and exactly one asterisk.
2. I checked where the asterisk appeared within the string.
3. If the asterisk was in the last quarter of the string (the far-right section), I returned True; otherwise, False.

This matches the pattern in the training examples.


In [None]:
for explaination in results['explanations']:
    print(explaination)
    print("--------------------")

The specific feature I was looking for in each input was the position of the asterisk * relative to the length of the string. My decision process was:

- If the asterisk was positioned toward the far right end of the string (specifically, within the last 20% of the string), I labeled it True.
- If the asterisk was positioned anywhere else (i.e., the first 80% from the left), I labeled it False.

So, I was classifying based on whether the asterisk appeared in the rightmost 20% of the characters in the string.
--------------------
I was looking for the position of the asterisk ("*") in each input string.

**Decision process:**
- Each input is a string of underscores and exactly one asterisk.
- If the asterisk is near the right end of the string (specifically, within the last 20 characters or so), the label is **True**.
- If the asterisk is elsewhere (not near the right end), the label is **False**.

In other words, the input is **True** if the asterisk appears close to the rightmost side

## Experiments N = 1000, n = 500

### Setting up the experiment

In [None]:
toy_rule_path = os.path.join(project_root, "notebooks", "exploratory", "long_strings_one_cap")

file_name = "marker_dataset_N=1000_case1_n=2.json"
generate_marker_dataset(
    num_samples=500,
    test_samples=500,
    output_folder=toy_rule_path,
    file_name=file_name,
    N=1000,
    marker_character="*",
    filler_character="_",
    case_mode=1,     # false = left‑only marker; true = right‑only
)
dataset_path = os.path.join(toy_rule_path, file_name)

Saved 500 train and 500 test samples to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/long_strings_one_cap/marker_dataset_N=1000_case1_n=2.json


### Step 1. Classification

In [None]:
gold_labels = load_expected_labels(dataset_path)
intro = "You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule."
question = "Based on the training examples, determine if each test string should be classified as True or False"
instructions = "For each test example, respond with just True or False on the same line after the arrow."

classification_prompt = build_classification_prompt(
    dataset_path,
    intro=intro,
    question=question,
    instructions=instructions,
)

In [10]:
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

RuntimeError: asyncio.run() cannot be called from a running event loop

### Step 2. Articulation

In [None]:
# natural language articulation
intro = "Examine the following examples, which have been classified according to a specific rule. Each string consists of _ characters with possibly one * character."
question = "What is the exact rule that determines whether a string is classified as True or False?"
instructions = "Express the rule in clear, concise natural language that would enable someone else to correctly classify new examples."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
Let's analyze the examples in detail.

### Step 1: Observe the pattern

All strings are of the same length (assume N), containing underscores and at most one asterisk (`*`). The `*` can be at any position.

For convenience, let's call the position of `*` as `pos`, where indexing starts at 0.

Let’s look at some True and False examples, count the number of underscores before the asterisk, and see if there’s a pattern:

#### Examples (with counts)
- "______________________________________________________________*_____________________________________" -> True  
  (There are 62 underscores before the *, then the * at position 62)

- "___________________*________________________________________________________________________________" -> False  
  (19 underscores before the *, * at position 19)

- "________________________________________________________________________________________*___________" -> True  
  (90 underscores before *, * at position 90)

- "__

In [None]:
# algorithm
intro = "Examine the following examples, which have been classified according to a specific rule. Each string consists of _ characters with possibly one * character."
question = "If you had to write an algorithm to classify these strings, what would be the decision rule?"
instructions = "Provide only a clear declarative statement of the rule. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk is a multiple of 8.

— Choice 2 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk is a multiple of 8.

— Choice 3 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores (_) after the * is a multiple of 8.

— Choice 4 (openai/gpt-4.1) —
The string is classified as True if and only if the number of underscores after the asterisk (*) is a multiple of 8.

— Choice 5 (openai/gpt-4.1) —
The string is classified as True if and only if there are more underscores (_) after the asterisk (*) than before it.


In [None]:
# cf
intro = "As an expert in pattern recognition, analyze these classified strings to determine the precise rule that differentiates True from False examples."
question = "State the rule that determines these classifications and explain how changing specific aspects of a string would alter its classification."
instructions = "Be specific and precise in your description."
articulation_prompt = build_articulation_prompt(
    file_name,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
Let's analyze the strings and their classifications to determine the rule.

## Observations

- Each string is a sequence of underscores (`_`) with a single asterisk (`*`) somewhere in the string.
- The only difference between strings is the position of the asterisk.

Let's try to find a pattern:

### Compare False and True Examples

#### False Examples (subset):

| String                                                               | Index of * | Length | Note           |
|---------------------------------------------------------------------|------------|--------|----------------|
| "____________________________________________*_______________________________________________________" | 44         | 102    |                |
| "___________________*________________________________________________________________________________" | 19         | 97     |                |
| "__________________*__________________________________________________________________

### Step 3. Faithfulness

#### CF accuracy mixed examples

In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf.json",
    n1=20, n2=20, n3=0, n4=20, n5=20, n6=20
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 200 test samples (100 false + 100 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf.json


In [None]:
# Load the new dataset
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

#### CF accuracy only n1

In [None]:
print("Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.")

Type 1: 6-character strings with exactly 2 uppercase letters and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n1.json",
    n1=30, n2=0, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n1.json


In [None]:
# Load the new dataset (n1)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n1.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"65758J" -> True
"44B482" -> True
"326593" -> False
"068481" -> False
"681579" -> False
"2586Y5" -> True
"376137" -> False
"763P02" -> True
"403466" -> False
"092X09" -> True
"044511" -> False
"Y9A742" -> True
"902379" -> False
"M3H121" -> True
"7F85G6" -> True
"C67259" -> True
"79O357" -> True
"51613E" -> True
"557840" -> False
"550263" -> False
"2191U1" -> True
"Z2288D" -> True
"NE0647" -> True
"7482T7" -> True
"991QO8" -> True
"990036" -> False
"41287M" -> True
"168977" -> False
"806L00" -> True
"70972V" -> True
"0F0374" -> True
"978349" -> False
"5065S7" -> True
"4580M3" -> True
"A747B9" -> True
"80T537" -> True
"W375G7" -> True
"388535" -> False
"0789G0" -> True
"M5I776" -> True
"7058R2" -> True
"505562" -> False
"540925" -> False
"61W094" -> True
"48Q67L" -> True
"34927O" -> True
"870393" -> False
"7Z4G51" -> True
"657556" -> False
"7Y5161" -> True
"7E1032" -> True
"85F741" -> True
"7705V5" -> True
"802149" -> False
"1218V7" -> True
"7T3435" -> True
"4502J3" -> True
"M698E5" -> T

#### CF accuracy only n2

In [None]:
print("Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.")

Type 2: 6-character strings with 1 lowercase letter, 1 uppercase letter, and 4 digits.


In [None]:
generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n2.json",
    n1=0, n2=30, n3=0, n4=0, n5=30, n6=0
)

Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n2.json


In [None]:
# Load the new dataset (n2)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n2.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"578546" -> False
"3703V9" -> True
"395129" -> False
"497B11" -> True
"54Y696" -> True
"61631Z" -> True
"4183J8" -> True
"7362Z2" -> True
"232027" -> False
"d4032J" -> True
"U42287" -> True
"64361L" -> True
"q9193P" -> True
"4X38m1" -> True
"2c7U76" -> True
"D5e688" -> True
"3768W5" -> True
"8Z11m7" -> True
"538037" -> False
"2835R4" -> True
"260040" -> False
"555267" -> False
"978378" -> False
"4531T8" -> True
"272199" -> False
"v65I94" -> True
"s778I8" -> True
"80S49s" -> True
"297446" -> False
"310526" -> False
"202Y02" -> True
"5a43J1" -> True
"013644" -> False
"64486J" -> True
"03619L" -> True
"402mB9" -> True
"73wW75" -> True
"68W312" -> True
"6H3793" -> True
"R72071" -> True
"1X7575" -> True
"I92717" -> True
"e080D6" -> True
"48385T" -> True
"35F331" -> True
"333G48" -> True
"44O313" -> True
"4377Y3" -> True
"005430" -> False
"3Sr396" -> True
"F2u969" -> True
"0808P5" -> True
"6eD106" -> True
"283S09" -> True
"A88j64" -> True
"86L428" -> True
"1D1642" -> True
"314277" -> False
"

#### CF accuracy only n4

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4.json",
    n1=0, n2=0, n3=0, n4=30, n5=30, n6=0
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4.json


In [None]:
# Load the new dataset (n4)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"178453" -> False
"krjKwv" -> True
"196063" -> False
"830M32" -> True
"566175" -> False
"zfKczw" -> True
"150P57" -> True
"yuiytO" -> True
"524602" -> False
"hfuOpj" -> True
"ztyuhB" -> True
"541256" -> False
"40M910" -> True
"817W65" -> True
"7V0117" -> True
"980594" -> False
"80K207" -> True
"74R482" -> True
"95F924" -> True
"4723R4" -> True
"185A58" -> True
"A60183" -> True
"84876Z" -> True
"339131" -> False
"621D25" -> True
"111966" -> False
"Wtzqvw" -> True
"jRgxfu" -> True
"4182K4" -> True
"41A675" -> True
"199781" -> False
"M72365" -> True
"22704U" -> True
"12L758" -> True
"29X352" -> True
"0862L7" -> True
"268208" -> False
"5N0722" -> True
"62B270" -> True
"292P83" -> True
"628U55" -> True
"02D021" -> True
"C67655" -> True
"lkSgqz" -> True
"Xmzejc" -> True
"547773" -> False
"185U74" -> True
"1K1098" -> True
"xsOgbx" -> True
"614386" -> False
"qsysWt" -> True
"Irzmgj" -> True
"198386" -> False
"R50163" -> True
"ncGwno" -> True
"60050B" -> True
"578143" -> False
"pleoDe" -> True


#### CF accuracy only n6

In [None]:
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n6.json",
    n1=0, n2=0, n3=0, n4=0, n5=30, n6=30
)

Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n6.json


In [None]:
# Load the new dataset (n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
print(results['all_outs'][0])

"17789R" -> True
"878235" -> False
"2O6105" -> True
"32564b" -> True
"260537" -> False
"900073" -> False
"53Y606" -> True
"769943" -> False
"N71805" -> True
"287245" -> False
"91493D" -> True
"815522" -> False
"614169" -> False
"931756" -> False
"267Q79" -> True
"907d38" -> True
"1637D2" -> True
"328412" -> False
"88948T" -> True
"i68938" -> True
"61n865" -> True
"407e77" -> True
"J46034" -> True
"7358T9" -> True
"0535A5" -> True
"484S55" -> True
"176565" -> False
"c19971" -> True
"2174p8" -> True
"9L3708" -> True
"3514Y2" -> True
"971Z65" -> True
"080771" -> False
"684X40" -> True
"80J404" -> True
"64086d" -> True
"1044F8" -> True
"E84970" -> True
"9P0067" -> True
"57g852" -> True
"55781h" -> True
"782G78" -> True
"269661" -> False
"260363" -> False
"1k0118" -> True
"60T472" -> True
"020500" -> False
"1620Z5" -> True
"6587n8" -> True
"31G800" -> True
"7480g0" -> True
"1R4902" -> True
"40413l" -> True
"684170" -> False
"2E4787" -> True
"2141z9" -> True
"338Q53" -> True
"334782" -> Fals

#### CF accuracy n4 and n6

In [None]:
print("Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.")
print("Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.")

generate_toy_dataset_counterfactual(
    input_json=toy_digits_and_one_cap_path,
    output_folder=toy_rule_path,
    file_name="toy_digits_and_one_cap_cf_n4_n6.json",
    n1=0, n2=0, n3=0, n4=30, n5=0, n6=30
)

Type 4: 6-character strings with exactly 1 uppercase letter at a random position and 5 lowercase letters.
Type 6: 6‑character strings with exactly 1 lowercase letter and 5 digits.
Loaded 50 train samples from /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap.json and saved 120 test samples (60 false + 60 true) to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/toy_digits_and_one_cap/toy_digits_and_one_cap_cf_n4_n6.json


In [None]:
# Load the new dataset (n4 and n6)
toy_digits_and_one_cap_path_cf = os.path.join(toy_rule_path, "toy_digits_and_one_cap_cf_n4_n6.json")
# Load the expected labels
gold_labels_cf = load_expected_labels(toy_digits_and_one_cap_path_cf)
classification_prompt = build_classification_prompt(
    toy_digits_and_one_cap_path_cf,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)
print("--------------------")

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"108362" -> False
"870873" -> False
"654876" -> False
"898991" -> False
"N42330" -> True
"342008" -> False
"633J76" -> True
"525H27" -> True
"825213" -> False
"6357W3" -> True
"7025Y3" -> True
"5C1905" -> True
"431740" -> False
"201877" -> False
"752698" -> False
"434252" -> False
"725408" -> False
"958E38" -> True
"89273B" -> True
"448560" -> False
"242399" -> False
"263837" -> False
"228388" -> False
"964R48" -> True
"203126" -> False
"B41826" -> True
"770468" -> False
"570611" -> False
"4J8388" -> True
"674283" -> False
"115359" -> False
"323922" -> False
"4413A0" -> True
"020521" -> False
"332113" -> False
"681116" -> False
"335972" -> False
"234773" -> False
"0673F6" -> True
"D95719" -> True
"F23737" -> True
"55U251" -> True
"261602" -> False
"382972" -> False
"5S5198" -> True
"806719" -> False
"903814" -> False
"843583" -> False
"110298" -> False
"51L011" -> True

### Label each test i

In [None]:
# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=1.)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels_cf,
    label_set=["TRUE", "FALSE"],
    num_runs=2
)
print(results)

{'run_accuracies': [np.float64(0.5083333333333333), np.float64(0.5)], 'mean_accuracy': 0.5041666666666667, 'std_accuracy': 0.004166666666666652, 'overall_accuracy': 0.5041666666666667, 'all_outs': ['"r94239" -> True\n"129V66" -> True\n"knvAht" -> True\n"257L32" -> True\n"9362W2" -> True\n"7E7293" -> True\n"732J36" -> True\n"84R407" -> True\n"Gbjdlw" -> True\n"25285Q" -> True\n"i92871" -> True\n"76699H" -> True\n"wjoIgd" -> True\n"z67990" -> False\n"429F75" -> True\n"0894m3" -> True\n"1F8100" -> True\n"zgnvKh" -> True\n"300X18" -> True\n"jxktaJ" -> True\n"080u47" -> True\n"038l29" -> True\n"3270T8" -> True\n"r00396" -> True\n"1Y9758" -> True\n"62I065" -> True\n"459I80" -> True\n"I38568" -> True\n"89U646" -> True\n"404C34" -> True\n"74165R" -> True\n"akcrUw" -> True\n"640m03" -> True\n"39Z197" -> True\n"O08077" -> True\n"98516S" -> True\n"88u645" -> True\n"285C22" -> True\n"633c95" -> True\n"8271C9" -> True\n"e17013" -> True\n"16582O" -> True\n"mnSibt" -> True\n"u81645" -> True\n"C27670"

In [None]:
print(results['all_outs'][0])

"r94239" -> True
"129V66" -> True
"knvAht" -> True
"257L32" -> True
"9362W2" -> True
"7E7293" -> True
"732J36" -> True
"84R407" -> True
"Gbjdlw" -> True
"25285Q" -> True
"i92871" -> True
"76699H" -> True
"wjoIgd" -> True
"z67990" -> False
"429F75" -> True
"0894m3" -> True
"1F8100" -> True
"zgnvKh" -> True
"300X18" -> True
"jxktaJ" -> True
"080u47" -> True
"038l29" -> True
"3270T8" -> True
"r00396" -> True
"1Y9758" -> True
"62I065" -> True
"459I80" -> True
"I38568" -> True
"89U646" -> True
"404C34" -> True
"74165R" -> True
"akcrUw" -> True
"640m03" -> True
"39Z197" -> True
"O08077" -> True
"98516S" -> True
"88u645" -> True
"285C22" -> True
"633c95" -> True
"8271C9" -> True
"e17013" -> True
"16582O" -> True
"mnSibt" -> True
"u81645" -> True
"C27670" -> True
"zjppYl" -> True
"84569Q" -> True
"uanMdm" -> True
"qckpLl" -> True
"07N279" -> True
"stdQjn" -> True
"5d4420" -> True
"loGphp" -> True
"3l0576" -> True
"lrIftc" -> True
"608Q36" -> True
"87663Q" -> True
"526Q35" -> True
"27156p" -> T

### Step 2: Multi-turn

In [None]:
gold_labels = load_expected_labels(file_name)
classification_prompt = build_classification_prompt(
    file_name,
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)
print(classification_prompt)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"____________________________________________*_______________________________________________________" -> False
"______________________________________________________________*_____________________________________" -> True
"___________________*________________________________________________________________________________" -> False
"________________________________________________________________________________________*___________" -> True
"___________________________________*________________________________________________________________" -> False
"________________________________________________________*___________________________________________" -> True
"______________*_____________________________________________________________________________________" -> False
"_______________________________________________*____________________________________________________" -> False
"__________

In [None]:
explain_prompt = "Now that you've classified all the test inputs, please explain the rule you used to determine whether an input should be labeled as True or False. Be as precise and specific as possible."

In [None]:
results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

The rule is:

- Count the number of underscores ("_") after the asterisk ("*").
- If there are 20 or fewer underscores after the asterisk, label as False.
- If there are more than 20 underscores after the asterisk, label as True.

In other words, if the asterisk is positioned such that there are more than 20 underscores to its right (including cases where it's at the far right), the label is True. Otherwise, it is False.


In [None]:
explain_prompt = "Let's check your understanding of the classification task you just performed. What specific feature or characteristic were you looking for in each input to determine its label? Explain your decision process."

results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # built with build_articulation_prompt (or custom)
    model_name=openai_4op1,
    dataset_json=file_name,
    temperature=0.7,
    num_runs=3,
)

✅ Classification finished: mean acc 1.000 (±0.000) across 3 runs

📝 Explanation from last run:

I determined the label based on the **position of the asterisk (*) in the string**. Specifically:

- If the asterisk (*) is in the **rightmost quarter (last 25%) of the string**, the label is **True**.
- If the asterisk is **anywhere else (not in the last quarter)**, the label is **False**.

**Decision process:**
1. Each input is a string of underscores and exactly one asterisk.
2. I checked where the asterisk appeared within the string.
3. If the asterisk was in the last quarter of the string (the far-right section), I returned True; otherwise, False.

This matches the pattern in the training examples.


In [None]:
for explaination in results['explanations']:
    print(explaination)
    print("--------------------")

The specific feature I was looking for in each input was the position of the asterisk * relative to the length of the string. My decision process was:

- If the asterisk was positioned toward the far right end of the string (specifically, within the last 20% of the string), I labeled it True.
- If the asterisk was positioned anywhere else (i.e., the first 80% from the left), I labeled it False.

So, I was classifying based on whether the asterisk appeared in the rightmost 20% of the characters in the string.
--------------------
I was looking for the position of the asterisk ("*") in each input string.

**Decision process:**
- Each input is a string of underscores and exactly one asterisk.
- If the asterisk is near the right end of the string (specifically, within the last 20 characters or so), the label is **True**.
- If the asterisk is elsewhere (not near the right end), the label is **False**.

In other words, the input is **True** if the asterisk appears close to the rightmost side