## Imports

In [1]:
import sys, os
import importlib

# Resolve the directory three levels above the current working directory and
# put it at the front of sys.path (only if it is not already listed) so the
# project modules imported below can be found.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

import evals_utils
importlib.reload(evals_utils)
from create_datasets import generate_marker_dataset, generate_adversarial_marker_dataset
from inference_utils import build_classification_prompt, build_articulation_prompt, chat, make_inference_fn
from evals_utils import load_expected_labels, evaluate_classification_accuracy, parse_predictions, classify_then_explain, flexible_parse_predictions

from dotenv import load_dotenv

In [2]:
# Load variables from the .env file into the process environment.
# An explicit location can be given instead: load_dotenv(dotenv_path="../.env")
load_dotenv()

# Stop right here if the key needed by the OpenAI client is still missing.
assert os.environ.get("OPENAI_API_KEY") is not None
print("OpenAI key loaded.")

OpenAI key loaded.


### Models

In [3]:
from openai import OpenAI

# Client authenticated with the key taken from the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Fetch the list of models visible to this key.
models = client.models.list()

# Keep the IDs that contain "gpt-4" (compared case-insensitively).
gpt_4_models = [
    entry.id
    for entry in models.data
    if entry.id.lower().find("gpt-4") != -1
]

# One "- <id>" line per matching model, under a header line.
print("Available GPT-4 Models:")
for model_id in gpt_4_models:
    print("-", model_id)


Available GPT-4 Models:
- gpt-4o-audio-preview-2024-12-17
- gpt-4o-audio-preview-2024-10-01
- gpt-4-turbo-preview
- gpt-4.1-nano
- gpt-4.1-nano-2025-04-14
- gpt-4o-realtime-preview-2024-10-01
- gpt-4o-realtime-preview
- gpt-4
- chatgpt-4o-latest
- gpt-4o-mini-audio-preview
- gpt-4-1106-preview
- gpt-4o-audio-preview
- gpt-4o-mini-realtime-preview
- gpt-4.1-mini
- gpt-4o-mini-realtime-preview-2024-12-17
- gpt-4o-mini-search-preview
- gpt-4.1-mini-2025-04-14
- gpt-4o-search-preview
- gpt-4o-mini-search-preview-2025-03-11
- gpt-4-0125-preview
- gpt-4o-2024-11-20
- gpt-4o-2024-05-13
- gpt-4-0613
- gpt-4o-mini-tts
- gpt-4o-transcribe
- gpt-4.5-preview
- gpt-4.5-preview-2025-02-27
- gpt-4o-mini-transcribe
- gpt-4o-search-preview-2025-03-11
- gpt-4o
- gpt-4o-mini
- gpt-4o-2024-08-06
- gpt-4.1
- gpt-4.1-2025-04-14
- gpt-4o-mini-2024-07-18
- gpt-4o-mini-audio-preview-2024-12-17
- gpt-4o-realtime-preview-2024-12-17
- gpt-4-turbo
- gpt-4-turbo-2024-04-09
- ft:gpt-4o-2024-08-06:jc:max-apples:BU5dq

In [4]:
# Model identifiers used in this notebook. `openai_4op1` ("openai/gpt-4.1") is
# the one passed to make_inference_fn, chat and classify_then_explain below.
openai_4o = "openai/gpt-4o"
openai_4op1 = "openai/gpt-4.1"
openai_4op1_mini = "openai/gpt-4.1-mini"
openai_4op1_nano = "openai/gpt-4.1-nano"
# Same model as `openai_4o`, written without the "openai/" prefix.
model_4o = "gpt-4o"

# OpenAI client authenticated with the key from the environment.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

## Experiments N = 150

### Analysis

In [68]:
# Folder that receives the generated dataset files for this experiment.
toy_rule_path = os.path.join(
    project_root, "notebooks", "exploratory", "long_strings_one_cap"
)

# Generate the N=150 marker dataset: 30 training and 30 test samples, using
# "*" as the marker and "_" as the filler character.
file_name = "marker_dataset_N=150_case1_case1_30_30.json"
generate_marker_dataset(
    output_folder=toy_rule_path,
    file_name=file_name,
    num_samples=30,
    test_samples=30,
    N=150,
    marker_character="*",
    filler_character="_",
    # Original note: "false = left-only marker; true = right-only".
    # NOTE(review): the value passed is the integer 1 — confirm how it maps
    # onto that false/true description.
    case_mode=1,
)

# Full path of the file written above; later cells read the dataset from it.
dataset_path = os.path.join(toy_rule_path, file_name)

Saved 30 train and 30 test samples to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/long_strings_one_cap/marker_dataset_N=150_case1_case1_30_30.json


In [69]:
# Path to the N=150 marker dataset, built directly from project_root.
dataset_path = os.path.join(
    project_root,
    "notebooks",
    "exploratory",
    "long_strings_one_cap",
    "marker_dataset_N=150_case1_case1_30_30.json",
)

In [70]:
# Expected labels read from the dataset file; the scoring cell further down
# passes them as `expected_labels`.
gold_labels = load_expected_labels(dataset_path)

# Text fragments that go into the classification prompt.
intro = "You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule."
instructions = "For each test example, respond with just True or False on the same line after the arrow."
question = "Based on the training examples, determine if each test string should be classified as True or False"

# Build the prompt from the dataset plus the fragments above, then display it.
classification_prompt = build_classification_prompt(
    dataset_path,
    question=question,
    instructions=instructions,
    intro=intro,
)
print(classification_prompt)

You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule.
For each test example, respond with just True or False on the same line after the arrow.

### Training examples
"____________________________________________________________________________________*_________________________________________________________________" -> True
"_____________________________________*________________________________________________________________________________________________________________" -> False
"_________________________________________________________*____________________________________________________________________________________________" -> False
"_____________________________*________________________________________________________________________________________________________________________" -> False
"________*_______________________________________________________________________

In [71]:
# reply = await chat(openai_4op1, classification_prompt, n=1, temperature=0.7)

In [72]:
# Example counts quoted inside the prompt text.
# NOTE(review): hard-coded here; presumably they must match the 30/30 split of
# the dataset at `dataset_path` — confirm if the dataset changes.
ntrain, ntest = 30, 30

intro = f"You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly {ntrain} training examples that demonstrate the classification rule, followed by exactly {ntest} test examples that you need to classify using the same rule."
instructions = "Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation."
question = "Classify each of the following test examples as either True or False based on the pattern you observed in the training data:"
# NOTE(review): `outro` is defined but not passed to
# build_classification_prompt below — confirm whether it should be.
outro = f"Remember to output only True or False for each of the {ntest} test examples, maintaining the same order as presented. Do not include any other text in your response."

# Second, more explicit version of the classification prompt.
classification_prompt = build_classification_prompt(
    dataset_path,
    question=question,
    intro=intro,
    instructions=instructions,
)

In [80]:
# Show the assembled classification prompt for inspection.
print(classification_prompt)

You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly 30 training examples that demonstrate the classification rule, followed by exactly 30 test examples that you need to classify using the same rule.
Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation.

### Training examples
"____________________________________________________________________________________*_________________________________________________________________" -> True
"_____________________________________*________________________________________________________________________________________________________________" -> False
"_________________________________________________________*_______________________________________________________________________________________

In [82]:
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.3)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=6
)
print(results)

{'run_accuracies': [np.float64(0.9333333333333333), np.float64(1.0), np.float64(1.0), np.float64(0.9333333333333333), np.float64(1.0), np.float64(0.9333333333333333)], 'mean_accuracy': 0.9666666666666668, 'std_accuracy': 0.033333333333333326, 'overall_accuracy': 0.9666666666666667, 'all_outs': ['True\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse', 'True\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse', 'True\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse', 'True\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nT

In [88]:
# natural language articulation
intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples—each one a string exactly 150 characters long—and asked to determine the underlying rule that distinguishes True from False cases."
question = "What is the exact rule that determines whether an example is labeled True or False?"
instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    dataset_path,
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
The example is True if and only if the asterisk (*) appears at position 75 or later in the string (i.e., at index 74 or greater, using 0-based indexing).

— Choice 2 (openai/gpt-4.1) —
The example is labeled True if and only if the asterisk (*) appears at position 75 or later in the string (i.e., at index 74 or higher, counting the first character as position 1).

— Choice 3 (openai/gpt-4.1) —
The example is labeled True if and only if the asterisk is located between positions 70 and 109 (inclusive) in the 150-character string.

— Choice 4 (openai/gpt-4.1) —
The example is labeled True if and only if the asterisk (*) appears at position 73 or later (i.e., the 74th character or later in the string).

— Choice 5 (openai/gpt-4.1) —
The example is labeled True if and only if the asterisk "*" is located between the 70th and 109th characters (inclusive) in the 150-character string.


### Step 2: Multi-turn

#### V1

In [86]:
# Follow-up question used as the second prompt.
explain_prompt = "Now that you've classified all the test inputs, please explain the rule you used to determine whether an input should be labeled as True or False. Be as precise and specific as possible."

# Run the classification prompt followed by the follow-up question:
# 5 runs at temperature 0.7.
results = classify_then_explain(
    model_name=openai_4op1,
    dataset_json=dataset_path,
    first_prompt=classification_prompt,  # output of build_classification_prompt
    second_prompt=explain_prompt,        # the custom follow-up string above
    num_runs=5,
    temperature=0.7,
)

Run 1: acc 1.000
Run 2: acc 0.933
Run 3: acc 1.000
Run 4: acc 0.933
Run 5: acc 0.967
✅ Classification finished: mean acc 0.967 (±0.030) across 5 runs

📝 Explanation from last run:

Certainly! Here is a precise description of the rule I inferred from the training data:

**Rule:**
Count the number of underscores ("_") that appear before the single asterisk ("*") in the string. If the number of underscores before the asterisk is a multiple of 75 (i.e., 75, 150, 225, etc.), then label the string as **True**; otherwise, label it as **False**.

**How I deduced this:**
- In the training examples, all strings labeled **True** have the asterisk at positions 75, 150, 225, etc. (using 0-based indexing).
- All other positions for the asterisk are labeled **False**.

**Example:**
- A string with 75 underscores before the asterisk is True.
- A string with 76 underscores before the asterisk is False.
- A string with 150 underscores before the asterisk is True.

This pattern holds for every training e

In [90]:
# Alternative follow-up question; this wording also states the string length.
explain_prompt = "Let's check your understanding of the classification task you just completed. Each input was a string exactly 150 characters long. What specific pattern, feature, or rule did you rely on to decide whether each input should be labeled True or False? Describe your decision-making process clearly and concisely."

# Same two-prompt run as before with the new follow-up:
# 5 runs at temperature 0.7.
results = classify_then_explain(
    model_name=openai_4op1,
    dataset_json=dataset_path,
    first_prompt=classification_prompt,  # output of build_classification_prompt
    second_prompt=explain_prompt,        # the custom follow-up string above
    num_runs=5,
    temperature=0.7,
)

Run 1: acc 1.000
Run 2: acc 1.000
Run 3: acc 1.000
Run 4: acc 1.000
Run 5: acc 1.000
✅ Classification finished: mean acc 1.000 (±0.000) across 5 runs

📝 Explanation from last run:

The rule I used is based on the position of the asterisk (*) in the string:

- If the asterisk is at position 85 or greater (counting from 1, i.e., the 85th character or later in the 150-character string), the label is **True**.
- If the asterisk is before position 85 (i.e., the 1st through 84th character), the label is **False**.

I determined this by noting in the training examples that all True examples have the asterisk in the second half of the string (positions 85–150), while all False examples have it earlier.


In [91]:
# Print every entry of results['explanations'], each followed by a divider.
divider = "--------------------"
for explanation_text in results['explanations']:
    print(explanation_text)
    print(divider)

The rule is based on the position of the asterisk (*) in the string. Each string is exactly 150 characters long. If the asterisk appears at position 85 or later (i.e., between positions 85 and 149, using 0-based indexing), the label is True. If the asterisk appears before position 85 (i.e., between positions 0 and 84), the label is False.
--------------------
The rule is based on the position of the asterisk (*) in the string. Specifically:

- If the asterisk is located at position 85 or later (with positions starting at 1), the label is **True**.
- If the asterisk is at position 84 or earlier, the label is **False**.

**Decision process:**  
For each string, I counted the number of underscores before the asterisk. If there are 84 or more underscores before the asterisk (meaning the asterisk is at position 85 or later), I labeled it as True; otherwise, False. This matches the pattern in the training data.
--------------------
The rule I relied on is based on the position of the asteris

### Step 2: OOD (out-of-distribution)

In [122]:
import os
import json
from openai import OpenAI

# Client used by chat_two_turns below; the API key is read from the
# OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def chat_two_turns(model: str, prompt1: str, prompt2: str, temperature: "float | None" = None) -> tuple[str, str]:
    """
    Run a two-turn conversation and return both assistant replies.

    `prompt1` is sent as the first user message. The assistant's reply is then
    appended to the history together with `prompt2`, and the whole history is
    sent again to obtain the second reply.

    Args:
        model: Model ID passed to the chat completions endpoint.
        prompt1: First user message.
        prompt2: Second user message, sent with the first exchange as context.
        temperature: Sampling temperature applied to BOTH calls. When left as
            None the argument is not sent at all, so the API default applies.

    Returns:
        (out1, out2): the stripped text of the first and second replies. A
        reply whose content is None is returned as an empty string.
    """
    # Forward `temperature` only when the caller supplied one; the same
    # setting is used for both turns so the two replies are sampled alike.
    sampling = {} if temperature is None else {"temperature": temperature}

    def _reply_text(response) -> str:
        # `message.content` can be None; treat that as an empty reply
        # instead of failing on `.strip()`.
        return (response.choices[0].message.content or "").strip()

    # Turn 1: the conversation starts with the first user message only.
    messages = [{"role": "user", "content": prompt1}]
    resp1 = client.chat.completions.create(model=model, messages=messages, **sampling)
    out1 = _reply_text(resp1)

    # Turn 2: replay the first exchange, then ask the follow-up question.
    messages.append({"role": "assistant", "content": out1})
    messages.append({"role": "user", "content": prompt2})
    resp2 = client.chat.completions.create(model=model, messages=messages, **sampling)
    out2 = _reply_text(resp2)

    return out1, out2

#### Adversarial-Adversarial

In [98]:
# construct adversarial dataset to test the model's robustness
generate_adversarial_marker_dataset(
    num_samples=30,
    test_samples=30,
    output_folder=toy_rule_path,
    file_name="adversarial_marker_dataset_N=150_case1_30_30.json",
    N=150,
    marker_character="*",
    filler_character="_",
    case_mode=1,     # false = left‑only marker; true = right‑only
)
adversarial_dataset_path = os.path.join(toy_rule_path, "adversarial_marker_dataset_N=150_case1_30_30.json")
gold_labels = load_expected_labels(adversarial_dataset_path)

Saved 30 train and 30 test samples to /Users/joriococola/Documents/GitHub/OE-test/code/notebooks/exploratory/long_strings_one_cap/adversarial_marker_dataset_N=150_case1_30_30.json


In [99]:
# Same prompt wording as the first classification prompt, applied to the
# adversarial dataset.
intro = "You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule."
instructions = "For each test example, respond with just True or False on the same line after the arrow."
question = "Based on the training examples, determine if each test string should be classified as True or False"

# Build the prompt from the adversarial dataset and display it.
adv_classification_prompt = build_classification_prompt(
    adversarial_dataset_path,
    question=question,
    instructions=instructions,
    intro=intro,
)
print(adv_classification_prompt)

You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule.
For each test example, respond with just True or False on the same line after the arrow.

### Training examples
"_______________________________________________________________________*______________________________________________________________________________" -> False
"___________________________________________________________________________*__________________________________________________________________________" -> True
"_______________________________________________________________________*______________________________________________________________________________" -> False
"______________________________________________________________________*_______________________________________________________________________________" -> False
"_____________________________________________________________________________*__

In [96]:
# Follow-up question for the second turn (free-text description of the rule).
explain_prompt = (
    "Let's check your understanding of the classification task you just completed. "
    "Each input was a string exactly 150 characters long. "
    "What specific pattern, feature, or rule did you rely on to decide whether each input should be labeled True or False? "
    "Describe your decision-making process clearly and concisely."
)

In [None]:
# Settings for the two-turn evaluation loop in the next cell.
model_name = "gpt-4.1"  # model ID passed to chat_two_turns
nsamples = 5            # number of two-turn conversations to run

# Turn 1 is the classification prompt, turn 2 the follow-up question.
first_prompt, second_prompt = adv_classification_prompt, explain_prompt

In [103]:
import numpy as np

# Per-run results of the two-turn evaluation (classification, then explanation).
accuracies = []
preds = []
explanations = []

for i in range(nsamples):
    # Sampling temperature is passed explicitly, matching the other two-turn
    # evaluation cells in this notebook.
    predictions, explanation = chat_two_turns(model_name, first_prompt, second_prompt, temperature=0.7)
    print("-----------------------")
    print("Explanation:", explanation)

    # Parse the first reply into labels (n is set to the number of gold
    # labels) and score the fraction that match.
    pred = flexible_parse_predictions(predictions, n=len(gold_labels), label_set=["TRUE", "FALSE"])
    acc = np.mean([p == g for p, g in zip(pred, gold_labels)])
    print("Accuracy:", acc)

    accuracies.append(acc)
    preds.append(pred)
    explanations.append(explanation)
    print("-----------------------")

# Summary over all runs: raw values, then mean and standard deviation.
print("Accuracies:", accuracies)
print("Explanations:", explanations)
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print("Mean Accuracy:", mean_accuracy)
print("Standard Deviation of Accuracy:", std_accuracy)

-----------------------
Explanation: Yes, I relied on the position of the * (asterisk) character within the string to classify each input as True or False.

**Rule:**
- The string is **True** if the * is **at position 76 or later** (using 1-based counting).
- Otherwise, the string is **False**.

**Details:**
- All strings are 150 characters long.
- The * appears exactly once in each string.
- I noticed that strings labeled **True** in the examples always had the * approximately in the center or to the right of the center; specifically, at position 76 or higher.
- Strings labeled **False** always had the * at position 75 or earlier.

**Summary:**
- If the * is at index 75 or greater (0-based; i.e., position 76+ in 1-based counting), label True.
- Otherwise, label False.

**This decision matches all the training examples provided.**
Accuracy: 1.0
-----------------------
-----------------------
Explanation: The decision rule I relied on is:

**The string is classified as True if and only 

In [109]:
# Follow-up question asking for the rule as a short Python function instead of
# free text.
explain_prompt = (
    "Let's check your understanding of the classification task you just completed. Each input was a string exactly 150 characters long. What specific pattern, feature, or rule did you rely on to decide whether each input should be labeled True or False? "
    "Express your decision process as a short Python function. Avoid explanations or commentary."
)

# Settings for the two-turn evaluation loop in the next cell.
model_name = "gpt-4.1"
nsamples = 5
first_prompt, second_prompt = adv_classification_prompt, explain_prompt

In [110]:
import numpy as np

# Per-run results of the two-turn evaluation (classification, then explanation).
accuracies = []
preds = []
explanations = []

for i in range(nsamples):
    # Sampling temperature is passed explicitly, matching the other two-turn
    # evaluation cells in this notebook.
    predictions, explanation = chat_two_turns(model_name, first_prompt, second_prompt, temperature=0.7)
    print("-----------------------")
    print("Explanation:", explanation)

    # Parse the first reply into labels (n is set to the number of gold
    # labels) and score the fraction that match.
    pred = flexible_parse_predictions(predictions, n=len(gold_labels), label_set=["TRUE", "FALSE"])
    acc = np.mean([p == g for p, g in zip(pred, gold_labels)])
    print("Accuracy:", acc)

    accuracies.append(acc)
    preds.append(pred)
    explanations.append(explanation)
    print("-----------------------")

# Summary over all runs: raw values, then mean and standard deviation.
print("Accuracies:", accuracies)
print("Explanations:", explanations)
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print("Mean Accuracy:", mean_accuracy)
print("Standard Deviation of Accuracy:", std_accuracy)

-----------------------
Explanation: ```python
def classify(s):
    return s.index('*') % 13 == 11
```
Accuracy: 1.0
-----------------------
-----------------------
Explanation: ```python
def classify(s):
    return s.index('*') in {75, 76, 77, 78, 79}
```
Accuracy: 1.0
-----------------------
-----------------------
Explanation: ```python
def classify(s):
    return s.index('*') in {75, 76, 77, 80, 81, 82}
```
Accuracy: 1.0
-----------------------
-----------------------
Explanation: ```python
def classify(s):
    return s.index('*') >= 73
```
Accuracy: 1.0
-----------------------
-----------------------
Explanation: ```python
def classify(s):
    return s.index('*') in [75, 77, 78, 79, 80, 81]
```
Accuracy: 1.0
-----------------------
Accuracies: [np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0)]
Explanations: ["```python\ndef classify(s):\n    return s.index('*') % 13 == 11\n```", "```python\ndef classify(s):\n    return s.index('*') in {75, 76, 77

#### Mixed-Adversarial

In [113]:
# construct adversarial dataset to test the model's robustness
mixed_adversarial_dataset_path = os.path.join(project_root, "notebooks", "exploratory", "long_strings_one_cap", "mixed_adversarial_dataset_N=150_case1_case1_30_30.json")
gold_labels = load_expected_labels(mixed_adversarial_dataset_path)

In [114]:
# Same prompt wording as before, applied to the mixed-adversarial dataset.
intro = "You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule."
instructions = "For each test example, respond with just True or False on the same line after the arrow."
question = "Based on the training examples, determine if each test string should be classified as True or False"

# Rebuild `adv_classification_prompt` from the mixed-adversarial dataset
# (replacing the earlier adversarial prompt) and display it.
adv_classification_prompt = build_classification_prompt(
    mixed_adversarial_dataset_path,
    question=question,
    instructions=instructions,
    intro=intro,
)
print(adv_classification_prompt)

You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule.
For each test example, respond with just True or False on the same line after the arrow.

### Training examples
"_______________________________*______________________________________________________________________________________________________________________" -> False
"__________________________________________________________________________________________________________________________________________*___________" -> True
"__________________________________________________________*___________________________________________________________________________________________" -> False
"______________*_______________________________________________________________________________________________________________________________________" -> False
"________________________________________________________________________________

In [131]:
# Follow-up question for the second turn (free-text description of the rule).
explain_prompt = (
    "Let's check your understanding of the classification task you just completed. "
    "Each input was a string exactly 150 characters long. "
    "What specific pattern, feature, or rule did you rely on to decide whether each input should be labeled True or False? "
    "Describe your decision-making process clearly and concisely."
)

In [132]:
# Settings for the two-turn evaluation loop in the next cell.
model_name = "gpt-4.1"  # model ID passed to chat_two_turns
nsamples = 5            # number of two-turn conversations to run

# Turn 1 is the classification prompt, turn 2 the follow-up question.
first_prompt, second_prompt = adv_classification_prompt, explain_prompt

In [133]:
import numpy as np

# Per-run results of the two-turn evaluation (classification, then explanation).
accuracies, preds, explanations = [], [], []

for run_idx in range(nsamples):
    # First reply: the raw label text; second reply: the model's stated rule.
    predictions, explanation = chat_two_turns(model_name, first_prompt, second_prompt, temperature=0.7)
    print("-----------------------")
    print("Explanation:", explanation)

    # Parse the labels (n is set to the number of gold labels) and take the
    # fraction that match as this run's accuracy.
    pred = flexible_parse_predictions(predictions, label_set=["TRUE", "FALSE"], n=len(gold_labels))
    matches = [p == g for p, g in zip(pred, gold_labels)]
    acc = np.mean(matches)
    print("Accuracy:", acc)

    explanations.append(explanation)
    preds.append(pred)
    accuracies.append(acc)
    print("-----------------------")

# Summary over all runs: raw values, then mean and standard deviation.
print("Accuracies:", accuracies)
print("Explanations:", explanations)
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print("Mean Accuracy:", mean_accuracy)
print("Standard Deviation of Accuracy:", std_accuracy)

-----------------------
Explanation: Yes, I can explain the rule.

The pattern I followed from the training data is:

**A string should be labeled True if and only if the asterisk (*) appears at position 90 or beyond (where positions are counted from 1 at the far left up to 150 on the far right). If the * is at position 89 or earlier, label it False.**

### How I arrived at this:
- In the training examples, every True label had the * in a position significantly closer to the right end of the string, most consistently at position 91 or beyond (counting from 1 on the left).
- Every False label had the * in an earlier position.
- The switch from False to True appeared exactly at the 91st character (i.e., if the * was at index 90 using 0-based indexing, which is the 91st character, or further right, it was marked True).

### My process step-by-step:
1. For each string, scan to find the position (index) of the *.
2. If the position (counting from 1) is 91 or greater, label the string as Tru

In [134]:
# Follow-up question asking for the rule as a brief Python function instead of
# free text.
explain_prompt = (
    "Let's check your understanding of the classification task you just completed. Each input was a string exactly 150 characters long. What specific pattern, feature, or rule did you rely on to decide whether each input should be labeled True or False? "
    "Express your decision process as a brief Python function. Do not include explanations or extra commentary."
)

print(explain_prompt)

# Settings for the two-turn evaluation loop in the next cell.
model_name = "gpt-4.1"
nsamples = 5
first_prompt, second_prompt = adv_classification_prompt, explain_prompt

Let's check your understanding of the classification task you just completed. Each input was a string exactly 150 characters long. What specific pattern, feature, or rule did you rely on to decide whether each input should be labeled True or False? Express your decision process as a brief Python function. Do not include explanations or extra commentary.


In [135]:
import numpy as np

# Per-run results of the two-turn evaluation (classification, then explanation).
accuracies, preds, explanations = [], [], []

for run_idx in range(nsamples):
    # First reply: the raw label text; second reply: the model's stated rule.
    predictions, explanation = chat_two_turns(model_name, first_prompt, second_prompt, temperature=0.7)
    print("-----------------------")
    print("Explanation:", explanation)

    # Parse the labels (n is set to the number of gold labels) and take the
    # fraction that match as this run's accuracy.
    pred = flexible_parse_predictions(predictions, label_set=["TRUE", "FALSE"], n=len(gold_labels))
    matches = [p == g for p, g in zip(pred, gold_labels)]
    acc = np.mean(matches)
    print("Accuracy:", acc)

    explanations.append(explanation)
    preds.append(pred)
    accuracies.append(acc)
    print("-----------------------")

# Summary over all runs: raw values, then mean and standard deviation.
print("Accuracies:", accuracies)
print("Explanations:", explanations)
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print("Mean Accuracy:", mean_accuracy)
print("Standard Deviation of Accuracy:", std_accuracy)

-----------------------
Explanation: ```python
def classify(s):
    return s.index('*') == 90
```
Accuracy: 0.9333333333333333
-----------------------
-----------------------
Explanation: ```python
def classify(s):
    return s.index("*") in range(85, 115)
```
Accuracy: 0.9333333333333333
-----------------------
-----------------------
Explanation: ```python
def classify(s):
    return s.index("*") >= 90
```
Accuracy: 0.9
-----------------------
-----------------------
Explanation: ```python
def classify(s):
    return s.index('*') in [94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105]
```
Accuracy: 0.9333333333333333
-----------------------
-----------------------
Explanation: ```python
def classify(s):
    return s.index('*') == 90
```
Accuracy: 0.9333333333333333
-----------------------
Accuracies: [np.float64(0.9333333333333333), np.float64(0.9333333333333333), np.float64(0.9), np.float64(0.9333333333333333), np.float64(0.9333333333333333)]
Explanations: ["```python\ndef classif

In [138]:
# Display the current value of `adv_classification_prompt` for inspection.
print(adv_classification_prompt)

You are given strings consisting of _ characters with possibly one * character. Your task is to classify each string as True or False based on a specific rule.
For each test example, respond with just True or False on the same line after the arrow.

### Training examples
"_______________________________*______________________________________________________________________________________________________________________" -> False
"__________________________________________________________________________________________________________________________________________*___________" -> True
"__________________________________________________________*___________________________________________________________________________________________" -> False
"______________*_______________________________________________________________________________________________________________________________________" -> False
"________________________________________________________________________________