## Imports

In [10]:
import sys, os
import importlib

# Set up the Python path for the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import evals_utils
importlib.reload(evals_utils)

from inference_utils import build_classification_prompt, build_articulation_prompt, chat, make_inference_fn
from evals_utils import load_expected_labels, evaluate_classification_accuracy, parse_predictions, classify_then_explain, flexible_parse_predictions

from dotenv import load_dotenv

In [11]:
# Load variables from a .env file into the process environment.
# (python-dotenv decides where to look — presumably the working directory or a parent; verify.)
load_dotenv()
# An explicit location can be given instead: load_dotenv(dotenv_path="../.env")

# Fail fast if the key is absent: later cells pass os.getenv("OPENAI_API_KEY") to the OpenAI client.
assert "OPENAI_API_KEY" in os.environ
print("OpenAI key loaded.")

OpenAI key loaded.


### Models

In [12]:
from openai import OpenAI
# Build an API client and fetch the list of models visible to this key (network call).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
models = client.models.list()

# Keep every model id containing the substring 'gpt-4' (case-insensitive).
# NOTE: this is a plain substring test, so it also keeps audio / realtime / tts /
# transcribe variants and fine-tuned ids, as the recorded output shows.
gpt_4_models = [model.id for model in models.data if "gpt-4" in model.id.lower()]

# Print one id per line, in the order the API returned them (not sorted).
print("Available GPT-4 Models:")
for model_id in gpt_4_models:
    print(f"- {model_id}")


Available GPT-4 Models:
- gpt-4o-audio-preview-2024-12-17
- gpt-4o-audio-preview-2024-10-01
- gpt-4-turbo-preview
- gpt-4-1106-preview
- gpt-4-turbo
- gpt-4-turbo-2024-04-09
- gpt-4.1-nano
- gpt-4.1-nano-2025-04-14
- gpt-4o-realtime-preview-2024-10-01
- gpt-4o-realtime-preview
- gpt-4
- chatgpt-4o-latest
- gpt-4o-realtime-preview-2024-12-17
- gpt-4o-mini-audio-preview
- gpt-4o-audio-preview
- gpt-4o-mini-realtime-preview
- gpt-4.1-mini
- gpt-4o-mini-realtime-preview-2024-12-17
- gpt-4o-mini-search-preview
- gpt-4.1-mini-2025-04-14
- gpt-4o-search-preview
- gpt-4o-mini-search-preview-2025-03-11
- gpt-4-0125-preview
- gpt-4o-2024-11-20
- gpt-4o-2024-05-13
- gpt-4-0613
- gpt-4o-mini-tts
- gpt-4o-transcribe
- gpt-4.5-preview
- gpt-4.5-preview-2025-02-27
- gpt-4o-mini-transcribe
- gpt-4o-search-preview-2025-03-11
- gpt-4o
- gpt-4o-mini
- gpt-4o-2024-08-06
- gpt-4.1
- gpt-4.1-2025-04-14
- gpt-4o-mini-2024-07-18
- gpt-4o-mini-audio-preview-2024-12-17
- ft:gpt-4.1-2025-04-14:jc:ita-class-expla

In [13]:
# Model identifiers used by the experiment cells.
# The "openai/..." forms are the ones handed to chat(), make_inference_fn() and
# classify_then_explain() below — presumably a provider/model convention of
# inference_utils; confirm against that module.
openai_4o = "openai/gpt-4o"
model_4o = "gpt-4o"  # bare id, as listed by client.models.list()
openai_4op1 = "openai/gpt-4.1"
openai_4op1_nano = "openai/gpt-4.1-nano"
openai_4op1_mini = "openai/gpt-4.1-mini"

# NOTE(review): `client` was already created with identical arguments in the
# model-listing cell; this re-creation is redundant once that cell has run.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

## Experiments

In [14]:
import openai

def openai_get_response(prompt, model="gpt-4", temperature=0.7, max_tokens=300):
    """
    Send a single user message to an OpenAI chat model and return the reply text.

    Uses the client-based Chat Completions interface of the openai>=1.0 SDK,
    which is the SDK generation this notebook already relies on
    (`from openai import OpenAI`). The legacy module-level
    `openai.ChatCompletion.create` call no longer exists in that SDK and
    raises when invoked.

    Args:
        prompt (str): Text sent as the only user message.
        model (str): Model id to query (default: "gpt-4").
        temperature (float): Sampling temperature (default: 0.7).
        max_tokens (int): Upper bound on generated tokens (default: 300).

    Returns:
        str: Message content of the first returned choice.
    """
    # With no explicit key the client reads OPENAI_API_KEY from the environment.
    api_client = openai.OpenAI()
    response = api_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    # v1 responses are typed objects, not dicts: use attribute access.
    return response.choices[0].message.content

In [86]:
from string import Template

def render_prompt(template_str: str, **kwargs) -> str:
    """
    Fill in a `string.Template` prompt.

    Args:
        template_str: Prompt text whose placeholders use Template syntax
            (`$name` or `${name}`; a literal dollar sign is written `$$`).
        **kwargs: One value per placeholder name.

    Returns:
        The prompt with every placeholder substituted.

    Raises:
        KeyError: If a placeholder has no matching keyword argument.
    """
    prompt_template = Template(template_str)
    # Strict substitution: missing placeholders raise instead of being left in place.
    return prompt_template.substitute(kwargs)


import os
import datetime

def save_prompt_response(prompt, response, filename, subfolder="logs"):
    """
    Write a prompt/response pair to a timestamped text file.

    The file is named `<filename>_<YYYYmmdd_HHMMSS>.txt` and placed in
    `subfolder`, which is created (and announced) if it does not exist yet.
    The prompt and the response are each preceded by an 80-character `=` banner.

    Args:
        prompt (str): The prompt sent to the LLM
        response (str): The response from the LLM
        filename (str): Base name for the output file (without extension)
        subfolder (str): Folder to write into (default: "logs")

    Returns:
        str: Path to the saved file
    """
    folder_missing = not os.path.exists(subfolder)
    if folder_missing:
        os.makedirs(subfolder)
        print(f"Created folder: {subfolder}")

    # Second-resolution timestamp keeps successive runs from sharing a name.
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    file_path = os.path.join(subfolder, f"{filename}_{stamp}.txt")

    # Pieces are written one by one, in order, exactly as they appear in the file.
    pieces = (
        "="*80 + "\n",
        "PROMPT:\n",
        "="*80 + "\n\n",
        prompt,
        "\n\n",
        "="*80 + "\n",
        "RESPONSE:\n",
        "="*80 + "\n\n",
        response,
    )
    with open(file_path, "w", encoding="utf-8") as out:
        for piece in pieces:
            out.write(piece)

    print(f"Saved prompt and response to {file_path}")
    return file_path

In [16]:
import os
import json
import random

def create_dataset_json(sentences_dict, num_train, num_test, filename, subfolder="datasets", seed=None):
    """
    Create and save a JSON dataset with train/test split from a dictionary of TRUE/FALSE sentences.

    All sentences are labelled, shuffled together, and the first `num_train`
    become the train split while the next `num_test` become the test split.
    The split is NOT stratified: each split receives whatever label mix the
    shuffle produces. Samples beyond `num_train + num_test` are left out.

    Args:
        sentences_dict (dict): Dictionary with 'TRUE' and 'FALSE' keys containing lists of sentences
        num_train (int): Number of samples for training set
        num_test (int): Number of samples for test set
        filename (str): Name for the output file (without extension)
        subfolder (str): Subfolder to save the file in (default: "datasets")
        seed (int | None): Optional seed for the shuffle. When given, the same
            inputs always produce the same split. When None (default) the
            module-level `random` generator is used, as before.

    Returns:
        str: Path to the saved file
    """
    # Create datasets folder if it doesn't exist
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)
        print(f"Created folder: {subfolder}")

    # Attach a boolean label to every sentence (TRUE sentences first).
    all_samples = [{"input": sentence, "label": True} for sentence in sentences_dict['TRUE']]
    all_samples += [{"input": sentence, "label": False} for sentence in sentences_dict['FALSE']]

    # A private generator keeps a seeded shuffle from disturbing global random state.
    if seed is None:
        random.shuffle(all_samples)
    else:
        random.Random(seed).shuffle(all_samples)

    # Shrink the request when there are not enough samples to satisfy it.
    total_samples = len(all_samples)
    if total_samples < (num_train + num_test):
        print(f"Warning: Requested {num_train + num_test} samples but only {total_samples} available.")
        num_train = min(num_train, total_samples // 2)
        num_test = min(num_test, total_samples - num_train)
        print(f"Adjusted to {num_train} train and {num_test} test samples.")

    # Split into train and test
    train_samples = all_samples[:num_train]
    test_samples = all_samples[num_train:num_train+num_test]

    dataset = {
        "train": train_samples,
        "test": test_samples
    }

    # Save to JSON file (an existing file with the same name is overwritten).
    file_path = os.path.join(subfolder, f"{filename}.json")
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2)

    print(f"Dataset saved to {file_path}")
    print(f"Train samples: {len(train_samples)}, Test samples: {len(test_samples)}")

    # Report label distribution from the actual split sizes.
    train_true = sum(1 for sample in train_samples if sample["label"])
    test_true = sum(1 for sample in test_samples if sample["label"])
    print(f"Train labels: {train_true} True, {len(train_samples) - train_true} False")
    print(f"Test labels: {test_true} True, {len(test_samples) - test_true} False")

    return file_path

In [17]:
import re

def extract_sentences(llm_response):
    """
    Pull the quoted sentences out of the "## TRUE" and "## FALSE" sections of
    an LLM response using whole-section regular expressions.

    A section is the header followed by one or more entries shaped like
    `"sentence" - explanation: ...` each terminated by a newline; an entry
    without a trailing newline is not part of the matched section. Every
    double-quoted span inside the matched section text is returned.

    Args:
        llm_response (str): The response from the LLM

    Returns:
        dict: Dictionary with 'TRUE' and 'FALSE' keys containing lists of sentences
              (an empty list for a section that was not found)
    """
    section_patterns = (
        ('TRUE', r'## TRUE\s+((?:".*?"\s*- explanation:.*?\n)+)'),
        ('FALSE', r'## FALSE\s+((?:".*?"\s*- explanation:.*?\n)+)'),
    )

    extracted = {}
    for label, pattern in section_patterns:
        # DOTALL lets the entry pattern run across line breaks within the section.
        section = re.search(pattern, llm_response, re.DOTALL)
        if section is None:
            extracted[label] = []
        else:
            extracted[label] = re.findall(r'"(.*?)"', section.group(1))

    return extracted


def _leading_quoted_sentences(section_text):
    """Return the quoted sentence that opens each line of `section_text` beginning with a double quote."""
    sentences = []
    for raw_line in section_text.split('\n'):
        line = raw_line.strip()
        if line.startswith('"'):
            # Keep only the quoted part; whatever follows (" - explanation: ...") is ignored.
            sentence_match = re.match(r'"([^"]+)".*', line)
            if sentence_match:
                sentences.append(sentence_match.group(1))
    return sentences


def extract_sentences2(llm_response):
    """
    Extract sentences from LLM response into a dictionary with TRUE and FALSE keys,
    filtering out explanations.

    The response is cut at the "## TRUE" and "## FALSE" headers and each
    section is scanned line by line; only lines that start with a double quote
    contribute a sentence. A FALSE section is also recognised when the TRUE
    header is missing or when FALSE comes before TRUE (previously it was
    silently dropped in both cases).

    Args:
        llm_response (str): The response from the LLM

    Returns:
        dict: Dictionary with 'TRUE' and 'FALSE' keys containing lists of sentences
    """
    true_section = ""
    false_section = ""

    if "## TRUE" in llm_response:
        before_true, after_true = llm_response.split("## TRUE", 1)
        if "## FALSE" in after_true:
            # Usual layout: TRUE section followed by FALSE section.
            true_section, false_section = after_true.split("## FALSE", 1)
        else:
            true_section = after_true
            # Reversed layout: the FALSE section sits between its header and "## TRUE".
            if "## FALSE" in before_true:
                false_section = before_true.split("## FALSE", 1)[1]
    elif "## FALSE" in llm_response:
        # Only a FALSE section is present.
        false_section = llm_response.split("## FALSE", 1)[1]

    return {
        'TRUE': _leading_quoted_sentences(true_section),
        'FALSE': _leading_quoted_sentences(false_section),
    }

def _cut_at_first_header(text, headers):
    """Split `text` once around the first header of `headers` (tried in order) it contains; None if none occurs."""
    for header in headers:
        if header in text:
            before, after = text.split(header, 1)
            return before, after
    return None


def _first_quoted_per_line(section_text):
    """Return the first double-quoted span of every non-empty line in `section_text`."""
    sentences = []
    for raw_line in section_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # Remove numbering like "1. ", "2. ", etc.
        line = re.sub(r'^\d+\.\s*', '', line)
        sentence_match = re.search(r'"([^"]+)"', line)
        if sentence_match:
            sentences.append(sentence_match.group(1))
    return sentences


def extract_sentences3(llm_response):
    """
    Extract sentences from LLM response into a dictionary with TRUE and FALSE keys,
    filtering out explanations and removing numbering.

    Section headers may be English ("## TRUE" / "## FALSE") or Italian
    ("## VERO" / "## FALSO"); when both spellings occur the English one is
    used. Within a section the first quoted span of each non-empty line is
    taken, so numbered lines such as `1. "Sentence." - ...` are handled.
    A FALSE section is also recognised when the TRUE header is missing or when
    FALSE comes before TRUE (previously it was silently dropped in both cases).

    Args:
        llm_response (str): The response from the LLM

    Returns:
        dict: Dictionary with 'TRUE' and 'FALSE' keys containing lists of sentences
    """
    true_headers = ("## TRUE", "## VERO")
    false_headers = ("## FALSE", "## FALSO")

    true_section = ""
    false_section = ""

    true_cut = _cut_at_first_header(llm_response, true_headers)
    if true_cut is not None:
        before_true, after_true = true_cut
        false_cut = _cut_at_first_header(after_true, false_headers)
        if false_cut is not None:
            # Usual layout: TRUE section followed by FALSE section.
            true_section, false_section = false_cut
        else:
            true_section = after_true
            # Reversed layout: look for a FALSE section ahead of the TRUE header.
            earlier_false = _cut_at_first_header(before_true, false_headers)
            if earlier_false is not None:
                false_section = earlier_false[1]
    else:
        # No TRUE header at all: a lone FALSE section is still parsed.
        false_cut = _cut_at_first_header(llm_response, false_headers)
        if false_cut is not None:
            false_section = false_cut[1]

    return {
        'TRUE': _first_quoted_per_line(true_section),
        'FALSE': _first_quoted_per_line(false_section),
    }

### Rule 16: Outdoors

In [18]:
# Data-generation prompt for the "outdoors" rule, written for string.Template:
# $total, $true_count and $false_count are filled in by render_prompt().
# The "## TRUE" / "## FALSE" headers and the `"sentence" - explanation: (...)`
# line shape requested here are what the extract_sentences* parsers look for.
outdoors_template = """ Generate $total sentences following this template: "[Subject] [verb] [object/activity] [preposition] [location] [time phrase]."

For $true_count sentences: The described activity MUST occur OUTDOORS (in nature, outside buildings, or open-air locations)
For $false_count sentences: The described activity MUST occur INDOORS (inside buildings or enclosed spaces)

IMPORTANT:
- Outdoor locations include: parks, forests, beaches, mountains, gardens, playgrounds, streets, fields, lakes, etc.
- Indoor locations include: houses, offices, schools, malls, restaurants, theaters, museums, libraries, etc.
- Make the setting explicit - either directly state the location or use clear context clues
- Vary the activities, locations, and time phrases across examples
- Use a mix of individual and group activities
- Include different weather conditions and times of day for outdoor activities

Format your response exactly as follows:

## TRUE
"Sentence 1." - explanation: (identifies the outdoor location/setting)
"Sentence 2." - explanation: (identifies the outdoor location/setting)
"Sentence 3." - explanation: (identifies the outdoor location/setting)
...

## FALSE
"Sentence 1." - explanation: (identifies the indoor location/setting)
"Sentence 2." - explanation: (identifies the indoor location/setting)
"Sentence 3." - explanation: (identifies the indoor location/setting)
...

Examples:
- "Children played soccer in the park after school." - explanation: (outdoors in a park)
- "The hikers climbed steep trails through the mountains during sunrise." - explanation: (outdoors on mountain trails)
- "The family watched movies in their living room last night." - explanation: (indoors in a living room)
- "Students studied chemistry in the school laboratory every afternoon." - explanation: (indoors in a laboratory)

Make sentences natural, varied, and unambiguous about the setting.
"""

In [22]:
# Fill the template placeholders to get the concrete data-generation prompt.
data_generation_prompt = render_prompt(
    outdoors_template,
    total=80,        # 80 sentences total
    true_count=40,   # 40 "True" (outdoor) sentences
    false_count=40   # 40 "False" (indoor) sentences
)
# Request one completion at temperature 1.0 (top-level await: needs the notebook's event loop).
# NOTE(review): nothing is seeded or cached here, so a re-run presumably yields a different sentence set.
reply = await chat(openai_4op1, data_generation_prompt, n=1, temperature=1., print_choices=False)

In [23]:
# Text of the first (only) completion.
response = reply.choices[0].message.content
# Parse the "## TRUE" / "## FALSE" sections into {'TRUE': [...], 'FALSE': [...]}.
sentences_dict = extract_sentences2(response)
# NOTE(review): the recorded run reports 41 TRUE sentences although 40 were requested;
# the counts are only printed, never checked against the request.
print(f"Found {len(sentences_dict['TRUE'])} TRUE sentences and {len(sentences_dict['FALSE'])} FALSE sentences")
# Keep the raw prompt/response pair on disk; the returned path is echoed as the cell result.
save_prompt_response(data_generation_prompt, response, "outdoors_dataset_it_80", subfolder="logs")

Found 41 TRUE sentences and 40 FALSE sentences
Saved prompt and response to logs/outdoors_dataset_it_80_20250505_193930.txt


'logs/outdoors_dataset_it_80_20250505_193930.txt'

In [24]:
# Shuffle all parsed sentences and write a 40-train / 40-test split to
# datasets/outdoors_dataset_it_80_40_40.json. Sentences beyond the 80 requested
# are left out, and re-running overwrites the file with a freshly shuffled split.
create_dataset_json(sentences_dict, 40, 40, "outdoors_dataset_it_80_40_40")

Dataset saved to datasets/outdoors_dataset_it_80_40_40.json
Train samples: 40, Test samples: 40
Train labels: 21 True, 19 False
Test labels: 20 True, 20 False


'datasets/outdoors_dataset_it_80_40_40.json'

In [26]:
# Sizes quoted in the prompt text; they must match the train/test split stored in the dataset file.
ntrain = 40
ntest = 40
intro = f"You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly {ntrain} training examples that demonstrate the classification rule, followed by exactly {ntest} test examples that you need to classify using the same rule."
question = "Classify each of the following test examples as either True or False based on the pattern you observed in the training data:"
instructions = "Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation."
# NOTE(review): `outro` is defined but not passed to build_classification_prompt below — confirm whether it should be.
outro = f"Remember to output only True or False for each of the {ntest} test examples, maintaining the same order as presented. Do not include any other text in your response."

# Build the classification prompt (not the articulation prompt) from the saved dataset.
classification_prompt = build_classification_prompt(
    "datasets/outdoors_dataset_it_80_40_40.json",
    intro=intro,
    instructions=instructions,
    question=question,
)

In [36]:
# Show the assembled prompt; print() renders the embedded newlines instead of the string repr.
print(classification_prompt)

You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly 40 training examples that demonstrate the classification rule, followed by exactly 40 test examples that you need to classify using the same rule.
Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation.

### Training examples
"Volunteers packed meals in the food bank warehouse last weekend." -> False
"Students read novels in the school library every afternoon." -> False
"Travelers snapped photos atop the hill during the golden hour." -> True
"A dog walker strolled through the neighborhood street at dusk." -> True
"Siblings built a snowman in their backyard during the snowfall." -> True
"Kids watched cartoons in the family room on rainy afternoons." -> False
"Picnickers lounged under umb

In [88]:
# Expected labels for the test split of the dataset file.
gold_labels = load_expected_labels("datasets/outdoors_dataset_it_80_40_40.json")
# Build the model/temperature-specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy against the gold_labels loaded at the top of this cell.
# NOTE(review): label_set is upper-case while the prompt asks for "True"/"False" —
# presumably parsing is case-insensitive; verify in evals_utils.
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=1
)
print(results)

{'run_accuracies': [np.float64(0.925)], 'mean_accuracy': 0.925, 'std_accuracy': 0.0, 'overall_accuracy': 0.925, 'all_outs': ['"Dancers learned routines in the mirrored dance studio every evening." -> False\n"The dog chased a ball across the open field right before sunset." -> True\n"The couple enjoyed a picnic beneath the tall trees in the forest yesterday afternoon." -> True\n"Kite enthusiasts displayed their creations on the hillside during the festival." -> True\n"Teenagers played basketball at the playground after school." -> False\n"Kids climbed jungle gyms on the school playground at recess." -> False\n"Friends roasted hotdogs at the public park grill late Saturday afternoon." -> True\n"Horseback riders crossed the open prairie at midday." -> True\n"A family celebrated a birthday in the dining room last weekend." -> False\n"Employees shared lunch in the office break room at noon." -> False\n"Newlyweds dined privately in their hotel room the first evening." -> False\n"Farmers harv

In [49]:
# Print each raw model output stored under results['all_outs'] so its line breaks are readable.
for res in results['all_outs']:
    print(res)

"Dancers learned routines in the mirrored dance studio every evening." -> False
"The dog chased a ball across the open field right before sunset." -> True
"The couple enjoyed a picnic beneath the tall trees in the forest yesterday afternoon." -> True
"Kite enthusiasts displayed their creations on the hillside during the festival." -> True
"Teenagers played basketball at the playground after school." -> False
"Kids climbed jungle gyms on the school playground at recess." -> False
"Friends roasted hotdogs at the public park grill late Saturday afternoon." -> True
"Horseback riders crossed the open prairie at midday." -> True
"A family celebrated a birthday in the dining room last weekend." -> False
"Employees shared lunch in the office break room at noon." -> False
"Newlyweds dined privately in their hotel room the first evening." -> False
"Farmers harvested corn in the field early in the morning." -> True
"Students built robots in the engineering lab every semester." -> False
"Researche

In [30]:
# Natural-language articulation: ask the model to state the rule instead of applying it.
# NOTE: this rebinds `intro`, `question`, `instructions` and `reply` from the cells above.
intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples and asked to determine the underlying rule that distinguishes True from False cases."
question = "What is the exact rule that determines whether an example is labeled True or False?"
instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    "datasets/outdoors_dataset_it_80_40_40.json",
    intro=intro,
    instructions=instructions,
    question=question,
)
# Sample five independent articulations at temperature 1.0 (the choices are printed by chat()).
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=1.)


— Choice 1 (openai/gpt-4.1) —
The sentence is labeled True if it describes an outdoor activity or event taking place outside a building; otherwise, it is labeled False.

— Choice 2 (openai/gpt-4.1) —
The activity takes place outdoors (not inside a building) and not as part of a regular, routine, or institutional setting.

— Choice 3 (openai/gpt-4.1) —
The sentence is labeled True if the described activity takes place outdoors; otherwise, it is labeled False.

— Choice 4 (openai/gpt-4.1) —
A sentence is labeled True if the main activity takes place outdoors or in a natural/open environment, and False if it occurs indoors or in an enclosed manmade space.

— Choice 5 (openai/gpt-4.1) —
The sentence is labeled True if it describes people or animals engaging in an activity outdoors (not inside a building or enclosed structure), otherwise it is labeled False.


### Step 2: Multi-turn

#### V1

In [41]:
# Setup for the two-turn (classify, then explain) experiments.
gold_labels = load_expected_labels("datasets/outdoors_dataset_it_80_40_40.json")
# A shorter classification prompt; this overwrites the `classification_prompt` built earlier.
classification_prompt = build_classification_prompt(
    "datasets/outdoors_dataset_it_80_40_40.json",
    intro="You are a helpful assistant.",
    question="Label each test input True or False:",
    instructions="Return only True or False after each arrow."
)

In [39]:
# Second-turn question sent after the classification turn.
# NOTE(review): the following cell assigns this identical string again, so this cell is redundant.
explain_prompt = "Now that you've classified all the test inputs, please explain the rule you used to determine whether an input should be labeled as True or False. Be as precise and specific as possible."

In [42]:
# Second-turn question: ask the model to articulate the rule it just applied.
explain_prompt = "Now that you've classified all the test inputs, please explain the rule you used to determine whether an input should be labeled as True or False. Be as precise and specific as possible."

# Two-turn experiment: 5 runs at temperature 0.7. `results` is rebound on every such cell.
results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # custom follow-up string defined above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.7,
    num_runs=5,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 5 runs

📝 Explanation from last run:

The rule I used is:

**Label a sentence True if it describes people or animals actively engaging in a specific outdoor activity or event, often at a particular time, location, or under notable environmental conditions.**

**Label a sentence False if it describes an activity that is:**
- Indoors (e.g., in a room, studio, kitchen, office, basement, etc.), **or**
- Routine, habitual, or generic (e.g., "every evening", "every Monday", "throughout the week", "at recess"), **or**
- Involves work, study, or institutional/organized indoor activities (e.g., meetings, classes, rehearsals, office work, lab work, etc.), **or**
- Social/family activities occurring in private homes or indoor spaces (e.g., dinner in the kitchen, assembling puzzles in the living room).

**In summary:**  
- **True:** Specific, often one-time, outdoor activities or events, especially with vivid settings or timing.
- **False:*

In [45]:
# Second-turn question: ask the model to articulate the rule it just applied.
explain_prompt = "Now that you've classified all the test inputs, please explain the rule you used to determine whether an input should be labeled as True or False. Be as precise and specific as possible."

# Two-turn experiment at a lower temperature: 6 runs at temperature 0.3.
results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # custom follow-up string defined above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.3,
    num_runs=6,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

Certainly! The rule used to determine whether an input should be labeled as True or False is as follows:

**Label as True if:**
- The sentence describes people (or animals) engaging in an activity that takes place outdoors, in a specific, natural, or open setting (such as a park, field, beach, hillside, riverbank, garden, prairie, etc.).
- The activity occurs at a specific time or under particular conditions (e.g., sunset, sunrise, afternoon, at night, during a storm, etc.).
- The setting is not a routine, institutional, or indoor environment (such as a classroom, office, studio, lab, library, auditorium, etc.).
- The activity is not described as a regular, repeated, or scheduled occurrence (e.g., "every morning," "every Thursday," "throughout the week," etc.), but rather as a specific event or occasion.

**Label as False if:**
- The sentence describes an activity that takes place indoors, i

In [43]:
# Alternative second-turn wording: ask which feature drove the labels rather than asking for "the rule".
explain_prompt = "Let's check your understanding of the classification task you just performed. What specific feature or characteristic were you looking for in each input to determine its label? Explain your decision process."

# Two-turn experiment: 5 runs at temperature 0.7.
results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # custom follow-up string defined above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.7,
    num_runs=5,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 5 runs

📝 Explanation from last run:

The classification task is about determining whether a sentence describes an activity happening **outdoors** (True) or **indoors** (False). The key characteristic I looked for in each input was **the physical setting of the described activity**.

**Decision Process:**
- If the activity clearly takes place **outdoors** (e.g., in parks, fields, beaches, gardens, hillsides, etc.), I labeled it **True**.
- If the activity happens **indoors** (e.g., in studios, libraries, offices, kitchens, classrooms, etc.), I labeled it **False**.

I paid close attention to:
- **Location indicators**: Words like "park," "field," "beach," "garden," "plaza," "meadow," "riverbank," etc., suggest outdoors.
- **Timing or environmental clues**: Phrases like "at sunrise," "at sunset," "before the rain," "during the thunderstorm," "under the windy sky," etc., often imply an outdoor setting.
- **Nature of activity**: So

In [44]:
# Print every run's explanation, each followed by a dashed separator line.
for explanation in results['explanations']:
    print(explanation)
    print("--------------------")

I was looking for whether the activity described in the sentence takes place outdoors, especially in a natural or open-air setting, and at a particular time, often highlighting a specific atmosphere or scene (such as sunset, sunrise, midday, during a storm, etc.). 

**Decision Process:**
- If the sentence described people engaging in an activity **outdoors** in a **distinct, evocative time or setting** (e.g., "at sunset," "on the hillside during the festival," "by the riverbank at dawn," "on Saturday afternoon in the park"), I labeled it **True**.
- If the activity took place **indoors** (e.g., "in the dance studio," "in the living room," "in the restaurant kitchen"), or did not emphasize a specific outdoor scene or atmosphere, I labeled it **False**.
- Some exceptions could be outdoor activities without a vivid setting, but generally, the focus was on **outdoor scenes with a sense of occasion, time, or setting**.

In summary, the key features were:
- **Outdoor location**
- **Specific 

### Step 3 - Routine/Non-Routine

In [53]:
# Probe: an INDOOR activity (chemistry lab) paired with a rare, one-off time reference.
# Checks whether the model's rule is about location or about routine vs. non-routine events.
explain_prompt = """
Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?

"Students performed experiments in the chemistry lab during the once-in-a-decade city power test." -> 

After classifying it, please explain whether your original rule still holds or needs modification."""

# Two-turn experiment: 6 runs at temperature 0.3.
results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # custom probe string defined above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.3,
    num_runs=6,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"Students performed experiments in the chemistry lab during the once-in-a-decade city power test." -> True

**Explanation:**  
According to the rule demonstrated in the training examples, sentences are labeled **True** when they describe a specific, unusual, or notable event—often marked by unique timing, location, or circumstance (e.g., "during the golden hour," "at midnight," "during the rainstorm," "on Independence Day"). Sentences describing routine, habitual, or generic activities are labeled **False**.

In this case, "during the once-in-a-decade city power test" marks a highly specific and rare event, not a routine occurrence. Therefore, the sentence should be labeled **True**.

**Does the rule still hold?**  
Yes, the original rule holds:  
- **True** for specific, unique, or notable events (especially with unique time or circumstance).
- **False** for routine, habitual, or generic ac

In [55]:
# Print every run's explanation, each followed by a dashed separator line.
for explanation in results['explanations']:
    print(explanation)
    print("--------------------")

"Students performed experiments in the chemistry lab during the once-in-a-decade city power test." -> True

**Explanation:**  
Based on the training examples, the rule appears to classify as True those sentences that describe a specific, unusual, or notable event or occurrence (often with unique timing, location, or circumstance), especially if it happens outdoors or is out of the ordinary routine. Sentences describing routine, everyday, or regularly scheduled activities (often indoors or in institutional settings) are classified as False.

In this case, "during the once-in-a-decade city power test" is a highly unusual and specific circumstance, making the event non-routine and notable. Therefore, it fits the pattern for True.

**Does the rule still hold?**  
Yes, the rule still holds:  
- **True:** Specific, unique, or notable events, often outdoors or with unusual timing/circumstance.  
- **False:** Routine, regular, or everyday activities, especially in institutional/indoor settings

In [56]:
# Probe: an INDOOR activity (chemistry lab) paired with a rare, one-off time reference.
# Same probe text as the temperature-0.3 cell, re-run here at temperature 0.7.
explain_prompt = """
Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?

"Students performed experiments in the chemistry lab during the once-in-a-decade city power test." -> 

After classifying it, please explain whether your original rule still holds or needs modification."""

# Two-turn experiment: 6 runs at temperature 0.7.
results = classify_then_explain(
    first_prompt=classification_prompt,        # built with build_classification_prompt
    second_prompt=explain_prompt,   # custom probe string defined above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.7,
    num_runs=6,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"Students performed experiments in the chemistry lab during the once-in-a-decade city power test." -> True

**Explanation:**  
According to the rule demonstrated in the training examples, sentences are labeled True when they describe a specific, unusual, rare, or notable event (often with unique timing, setting, or circumstances), especially outdoors or in non-routine situations. Sentences describing routine, everyday, or scheduled activities, particularly in generic indoor/structured settings (classrooms, offices, studios, etc.), are labeled False.

Here, "during the once-in-a-decade city power test" is a highly unusual, specific circumstance, making the event rare and notable.

**Does the rule still hold?**  
Yes, the original rule still holds. This sentence fits the pattern for a True label: a unique, non-routine event with a specific context outside of ordinary scheduled activities, even

In [57]:
# Show every collected explanation, each followed by a dashed divider line.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"Students performed experiments in the chemistry lab during the once-in-a-decade city power test." -> True

**Explanation:**
According to the rule demonstrated in the training examples, sentences labeled "True" tend to describe specific, often outdoor, unusual, or event-based activities that are not routine or everyday occurrences. Sentences labeled "False" usually describe routine, habitual, or indoor activities that occur regularly.

In this case, the sentence describes a unique, rare event ("the once-in-a-decade city power test") during which students perform experiments in the chemistry lab. This is not a routine occurrence, but rather a special, infrequent event, which aligns with the criteria for "True" in the examples.

**Does the original rule still hold?**
Yes, the rule appears to still hold: if the activity is tied to a specific, notable, or rare event (even if it takes place indoors), it is labeled "True." Routine or habitual actions, regardless of location, are labeled "Fal

In [58]:
# Probe: the same indoor chemistry-lab activity, now on an ordinary school day
# (counterpart to the rare-event variant), sampled at temperature 0.7.
explain_prompt = "\n".join((
    "",
    "Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?",
    "",
    '"Students performed experiments in the chemistry lab during a normal school day." -> ',
    "",
    "After classifying it, please explain whether your original rule still holds or needs modification.",
))

# Run the classification pass followed by the custom probe; the returned
# mapping's 'explanations' entry is printed in the next cell.
results = classify_then_explain(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # custom probe prompt assembled above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.7,
    num_runs=6,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"Students performed experiments in the chemistry lab during a normal school day." -> False

**Explanation:**  
According to the rule being used, sentences are labeled True if they describe an event or activity happening outdoors, often involving a specific time, natural setting, or special occasion. Activities taking place indoors in typical or routine settings (such as classrooms, labs, offices, homes, studios, etc.) are labeled False, even if they specify a time or day.

In this case, "Students performed experiments in the chemistry lab during a normal school day" clearly describes a routine, indoor activity. Therefore, it is classified as False.

**Does the rule still hold?**  
Yes, the original rule holds for this example. The distinction between indoor routine activities (False) and outdoor or special-occasion activities (True) remains consistent and sufficient for this classification. 

In [59]:
# Show every collected explanation, each followed by a dashed divider line.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"Students performed experiments in the chemistry lab during a normal school day." -> False

**Explanation:**  
According to the rule established in the training examples, situations that are routine, everyday, or occur in typical institutional/indoor settings (like classrooms, offices, labs, studios, etc.) during regular periods are labeled as False. True is reserved for activities that are outdoors, special, event-like, or take place in distinctive times/settings (e.g., sunset, sunrise, holidays, after dinner, during a storm, etc.).

This example describes a standard activity (students doing experiments) in a typical, indoor educational setting (chemistry lab) during a routine time (school day), which matches the "False" pattern in the examples.

**Conclusion:**  
The original rule still holds and correctly classifies this case as False. No modification is needed.
--------------------
"Students performed experiments in the chemistry lab during a normal school day." -> False

**Explana

In [60]:
# Probe: indoor chemistry-lab activity on an ordinary school day, repeated at
# the lower sampling temperature (0.3).
explain_prompt = "\n".join((
    "",
    "Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?",
    "",
    '"Students performed experiments in the chemistry lab during a normal school day." -> ',
    "",
    "After classifying it, please explain whether your original rule still holds or needs modification.",
))

# Run the classification pass followed by the custom probe; the returned
# mapping's 'explanations' entry is printed in the next cell.
results = classify_then_explain(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # custom probe prompt assembled above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.3,
    num_runs=6,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"Students performed experiments in the chemistry lab during a normal school day." -> False

**Explanation:**  
According to the rule in the training examples, sentences describing routine, everyday, or scheduled activities in typical indoor institutional settings (like schools, offices, libraries, labs) are labeled as False. The rule seems to focus on distinguishing between ordinary, expected activities in such settings (False) and more special, memorable, or outdoor/atypical events (True).

**Does the rule still hold?**  
Yes, the rule still holds for this case. The activity described is routine and occurs in a typical indoor institutional setting, so it is classified as False according to the established pattern. No modification is needed for this rule based on this example.


In [61]:
# Show every collected explanation, each followed by a dashed divider line.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"Students performed experiments in the chemistry lab during a normal school day." -> False

**Explanation:**  
According to the rule demonstrated in the training examples, sentences describing routine, everyday, or expected activities in typical indoor or institutional settings (such as schools, offices, labs, libraries, etc.) are labeled as False. The rule seems to focus on distinguishing between ordinary, habitual, or work/school-related indoor activities (False) and more special, outdoor, event-based, or time-specific activities (True).

**Does the rule still hold?**  
Yes, the original rule still holds for this case. The sentence describes a normal, routine activity (students doing experiments in a lab during a school day), which matches the pattern of other False-labeled examples. No modification is needed for the rule based on this case.
--------------------
"Students performed experiments in the chemistry lab during a normal school day." -> False

**Explanation:**  
Based on the

In [62]:
# Probe: an indoor library sentence with an out-of-place activity and no time
# reference, sampled at temperature 0.3.
explain_prompt = "\n".join((
    "",
    "Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?",
    "",
    '"The librarians allowed pillow fights in the reading room." -> ',
    "",
    "After classifying it, please explain whether your original rule still holds or needs modification.",
))

# Run the classification pass followed by the custom probe; the returned
# mapping's 'explanations' entry is printed in the next cell.
results = classify_then_explain(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # custom probe prompt assembled above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.3,
    num_runs=6,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"The librarians allowed pillow fights in the reading room." -> True

Explanation:  
According to the rule demonstrated in the training examples, sentences are labeled True when they describe unusual, surprising, or out-of-the-ordinary activities happening in a setting where such activities are not typically expected. "Librarians allowing pillow fights in the reading room" is highly unexpected and contrasts with the usual quiet and orderly environment of a library. Therefore, it fits the "surprising or out-of-place activity" pattern and should be labeled True.

The original rule still holds:  
- True = surprising/unusual activity in a setting where it is not normally expected  
- False = routine, expected, or ordinary activity for that setting

No modification is needed for the rule.


In [63]:
# Show every collected explanation, each followed by a dashed divider line.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"The librarians allowed pillow fights in the reading room." -> True

**Explanation:**  
According to the rule inferred from the training examples, sentences are labeled True when they describe unusual, surprising, or out-of-the-ordinary activities for the given setting, especially when the activity contrasts with the expected behavior in that location. Pillow fights in a reading room (especially one managed by librarians) is highly unexpected and contrasts with the typical quiet, orderly activities associated with libraries. Thus, it fits the pattern for a True label.

**Does the rule still hold?**  
Yes, the original rule still holds. The classification is consistent with the pattern:  
- True = surprising/unexpected activity for the setting  
- False = typical/expected activity for the setting

No modification is needed.
--------------------
"The librarians allowed pillow fights in the reading room." -> True

Explanation:  
According to the rule inferred from the training examples, s

In [64]:
# Probe: the library pillow-fight sentence again, at temperature 0.7.
# NOTE(review): unlike the 0.3 run, this sentence has no final period —
# confirm whether that difference is intentional.
explain_prompt = "\n".join((
    "",
    "Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?",
    "",
    '"The librarians allowed pillow fights in the reading room" -> ',
    "",
    "After classifying it, please explain whether your original rule still holds or needs modification.",
))

# Run the classification pass followed by the custom probe; the returned
# mapping's 'explanations' entry is printed in the next cell.
results = classify_then_explain(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # custom probe prompt assembled above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.7,
    num_runs=6,
)

✅ Classification finished: mean acc 0.946 (±0.009) across 6 runs

📝 Explanation from last run:

"The librarians allowed pillow fights in the reading room" -> True

Explanation:  
According to the rule being used, sentences are classified as True if they describe activities that are unusual, surprising, or out-of-the-ordinary for the setting or context, especially if they occur in traditionally formal or structured environments (like pillow fights in a library). If the activity is routine, expected, or mundane for the setting (like reading in the library), it is classified as False.

In this case, "pillow fights in the reading room" is highly unexpected and unconventional behavior in a library, so it fits the True pattern.

Does the rule need modification?  
No, the original rule holds for this case:  
- True = surprising/unusual activity for the setting  
- False = typical/expected activity for the setting

This classification aligns with all previous examples, so the rule remains cons

In [65]:
# Show every collected explanation, each followed by a dashed divider line.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"The librarians allowed pillow fights in the reading room" -> True

Explanation:

According to the rule demonstrated by the training examples, "True" is assigned to sentences describing unusual, surprising, or out-of-the-ordinary activities occurring in a given context or location (e.g., pillow fights in a reading room, which is unexpected for librarians to allow). "False" is used for activities that are ordinary, routine, or fitting for the context (e.g., reading in a library, organizing books, etc.).

In this test case, pillow fights in a reading room are highly unusual and surprising, especially with the librarians' approval. Therefore, the classification as "True" is consistent with the rule.

Conclusion:  
The original rule continues to hold—the label "True" is for surprising or out-of-place activities in a context, and "False" is for expected or routine activities. No modification is needed.
--------------------
"The librarians allowed pillow fights in the reading room" -> True



In [66]:
# Probe: an indoor library sentence with a routine activity and no time
# reference (counterpart to the pillow-fight variant), at temperature 0.7.
explain_prompt = "\n".join((
    "",
    "Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?",
    "",
    '"The librarians monitored quiet reading in the reading room." -> ',
    "",
    "After classifying it, please explain whether your original rule still holds or needs modification.",
))

# Run the classification pass followed by the custom probe; the returned
# mapping's 'explanations' entry is printed in the next cell.
results = classify_then_explain(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # custom probe prompt assembled above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.7,
    num_runs=6,
)

✅ Classification finished: mean acc 0.946 (±0.009) across 6 runs

📝 Explanation from last run:

"The librarians monitored quiet reading in the reading room." -> False

**Explanation:**  
According to the rule I am using, sentences are labeled **True** if they describe people engaging in distinct, often outdoor or communal, activities in unique locations, especially at specific times, *with an emphasis on activities that are less routine or more event-based*. Sentences are labeled **False** if they describe routine, indoor, or professional/regular activities that are more general and not tied to a special event or unique setting.

In this case, "The librarians monitored quiet reading in the reading room" describes a routine, professional activity occurring in a familiar indoor work environment. There is no special event, time, or unique communal aspect emphasized, so it fits the pattern for a False label.

**Does the rule need modification?**  
No, the rule still holds for this case—the

In [67]:
# Show every collected explanation, each followed by a dashed divider line.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"The librarians monitored quiet reading in the reading room." -> False

**Explanation:**  
According to the observed rule, situations are labeled "True" when they involve an outdoor or semi-outdoor setting, often with a distinct time or natural context (e.g., sunset, dawn, after rain), or a unique event/day, and/or a communal/special occasion. Activities that are routine, indoors, or part of a scheduled/expected environment (like school, work, library, office, home) are labeled "False".

In this sentence, the action takes place indoors (in the reading room), and the activity (librarians monitoring quiet reading) is routine and expected in that context.

**Does the rule still hold?**  
Yes, the original rule still holds for this case. The sentence fits the pattern of "False" examples: a routine activity in an indoor, institutional setting. No modification is needed based on this example.
--------------------
"The librarians monitored quiet reading in the reading room." -> False

**Expla

In [68]:
# Probe: the routine library sentence again, at temperature 0.3.
# NOTE(review): this sentence has a space before its final period ("room .");
# it is kept exactly as written — confirm whether it is intentional.
explain_prompt = "\n".join((
    "",
    "Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?",
    "",
    '"The librarians monitored quiet reading in the reading room ." -> ',
    "",
    "After classifying it, please explain whether your original rule still holds or needs modification.",
))

# Run the classification pass followed by the custom probe; the returned
# mapping's 'explanations' entry is printed in the next cell.
results = classify_then_explain(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # custom probe prompt assembled above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.3,
    num_runs=6,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"The librarians monitored quiet reading in the reading room ." -> False

**Explanation:**  
According to the rule demonstrated in the training examples, sentences are labeled True when they describe people engaging in distinctive, often outdoor, communal, or event-like activities in specific settings, especially with a sense of time or occasion (e.g., "at sunset," "on Saturday," "during the festival"). Sentences about routine, indoor, or institutional activities (like "in the reading room," "in the office," "in the classroom") are labeled False.

In this case, "The librarians monitored quiet reading in the reading room" describes a routine, indoor, institutional activity, so it is labeled False.

**Does the rule still hold?**  
Yes, the rule still holds. This sentence fits the established pattern for False: it is routine, institutional, and lacks the distinctive, event-like, or outdoor conte

In [69]:
# Show every collected explanation, each followed by a dashed divider line.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"The librarians monitored quiet reading in the reading room ." -> False

**Explanation:**  
According to the rule demonstrated in the training examples, sentences labeled True typically describe people engaging in outdoor, time-specific, or event-based activities, often with a sense of setting, occasion, or atmosphere (e.g., "at sunset," "during the festival," "on Saturday afternoon"). Sentences labeled False often describe routine, indoor, or institutional activities, especially those that could be considered regular duties or ongoing tasks (e.g., "in the school library every afternoon," "in the conference room every Monday morning").

In this case, "The librarians monitored quiet reading in the reading room" describes a routine, indoor, institutional activity, similar to other False-labeled examples. There is no mention of a special occasion, outdoor setting, or unique event.

**Does the rule still hold?**  
Yes, the original rule still holds for this case. The classification aligns 

### Step 3 - Time-Frame

In [70]:
# Probe: outdoor activity with an explicit time reference ("during the
# spectacular sunrise"), sampled at temperature 0.3.
explain_prompt = "\n".join((
    "",
    "Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?",
    "",
    '"Farmers harvested wheat in their fields during the spectacular sunrise." -> ',
    "",
    "After classifying it, please explain whether your original rule still holds or needs modification.",
))

# Run the classification pass followed by the custom probe; the returned
# mapping's 'explanations' entry is printed in the next cell.
results = classify_then_explain(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # custom probe prompt assembled above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.3,
    num_runs=6,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"Farmers harvested wheat in their fields during the spectacular sunrise." -> True

**Explanation:**  
According to the rule being used, sentences are labeled True if they describe people engaging in an activity outdoors, often with a specific time, setting, or atmospheric condition that emphasizes the environment or experience (e.g., "at sunrise," "during the rainstorm," "at dusk," "on a hot afternoon"). The sentence above fits this pattern: it describes farmers (people) harvesting wheat (an activity) in their fields (outdoor setting) during a spectacular sunrise (time/atmosphere).

**Does the original rule still hold?**  
Yes, the original rule holds. The classification is consistent with previous examples: outdoor, time-specific, and evocative of a particular scene or atmosphere are labeled True. Sentences about routine, indoor, or generic activities are labeled False. No modification to t

In [71]:
# Show every collected explanation, each followed by a dashed divider line.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"Farmers harvested wheat in their fields during the spectacular sunrise." -> True

**Explanation:**  
According to the rule inferred from the training examples, sentences are labeled "True" when they describe people engaging in activities outdoors, often with a specific time or atmospheric condition (e.g., sunrise, sunset, rain, night, etc.), and "False" when the activity is indoors or routine without a notable setting or time. In this case, farmers are outdoors in their fields during a spectacular sunrise, which matches the "True" pattern.

**Does the rule still hold?**  
Yes, the original rule holds for this case. The sentence fits the pattern for "True" examples: outdoor activity, specific time/atmosphere, and a vivid setting. No modification is needed based on this example.
--------------------
"Farmers harvested wheat in their fields during the spectacular sunrise." -> True

**Explanation:**  
Based on the rule inferred from the training examples, scenes that are set outdoors and 

In [72]:
# Probe: outdoor activity with an explicit time reference ("during the
# spectacular sunrise"), repeated at the higher sampling temperature (0.7).
explain_prompt = "\n".join((
    "",
    "Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?",
    "",
    '"Farmers harvested wheat in their fields during the spectacular sunrise." -> ',
    "",
    "After classifying it, please explain whether your original rule still holds or needs modification.",
))

# Run the classification pass followed by the custom probe; the returned
# mapping's 'explanations' entry is printed in the next cell.
results = classify_then_explain(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # custom probe prompt assembled above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.7,
    num_runs=6,
)

✅ Classification finished: mean acc 0.946 (±0.009) across 6 runs

📝 Explanation from last run:

"Farmers harvested wheat in their fields during the spectacular sunrise." -> True

**Explanation:**  
According to the original rule, scenes set outdoors with specific, vivid settings or times—especially those involving nature, weather, or light (e.g., "during the spectacular sunrise")—are labeled True. The example fits this pattern: it describes farmers outdoors, engaged in an activity, with a strong sense of time and setting ("during the spectacular sunrise").

**Reflection on the Rule:**  
This case fits the rule and does not require modification. The rule distinguishes between vivid, atmospheric outdoor scenes (True) and routine, indoor, or less vividly set activities (False). The classification here aligns with that logic.


In [73]:
# Show every collected explanation, each followed by a dashed divider line.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"Farmers harvested wheat in their fields during the spectacular sunrise." -> True

**Explanation:**  
My original rule is to classify as True when the activity is performed outdoors, often associated with a visually descriptive or time-specific natural setting (e.g., sunrise, sunset, weather, outdoor locations), and False if the activity is indoors, routine, or lacks such context.

In this case, "harvested wheat in their fields during the spectacular sunrise" is an outdoor action, made vivid by the mention of "spectacular sunrise," fitting the pattern seen in the True training examples.

**Does the rule still hold?**  
Yes, the rule holds for this example. The activity is outdoors, involves a specific and picturesque time of day, and is not routine in an indoor or generic setting. No modification is needed for the rule based on this case.
--------------------
"Farmers harvested wheat in their fields during the spectacular sunrise." -> True

**Explanation:**  
According to the rule infe

In [74]:
# Probe: the outdoor farming sentence with the time reference removed,
# sampled at temperature 0.7.
explain_prompt = "\n".join((
    "",
    "Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?",
    "",
    '"Farmers harvested wheat in their fields." -> ',
    "",
    "After classifying it, please explain whether your original rule still holds or needs modification.",
))

# Run the classification pass followed by the custom probe; the returned
# mapping's 'explanations' entry is printed in the next cell.
results = classify_then_explain(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # custom probe prompt assembled above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.7,
    num_runs=6,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"Farmers harvested wheat in their fields." -> True

Explanation: According to the rule in the training examples, sentences are labeled True if they describe people engaging in outdoor, often communal or scenic activities, typically with a sense of place and time (often specific or evocative, e.g., "at sunset," "in the morning," "last weekend"). They are labeled False if the activity is routine, indoor, or lacks the temporal/spatial specificity that suggests an event or special occasion.

In this case, "Farmers harvested wheat in their fields" describes an outdoor activity, with a clear setting (fields), and harvesting is a non-routine, seasonal event, not a daily or habitual indoor task. This fits the "True" pattern.

Does the rule still hold? Yes, the original rule still holds for this example. The distinction between special/outdoor/occasional activities (True) and routine/indoor/habitual 

In [75]:
# Show every collected explanation, each followed by a dashed divider line.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"Farmers harvested wheat in their fields." -> True

**Explanation:**  
According to the rule inferred from the training examples, sentences labeled "True" tend to describe outdoor activities involving people interacting with nature, landscapes, or the outdoors—often with a sense of time, but not always required. "False" is assigned to indoor, routine, or institutional settings, or activities without a notable outdoor/natural element.

In this case, "Farmers harvested wheat in their fields" is an outdoor, nature-based activity and fits the pattern for "True." The original rule holds for this example and does not need modification.
--------------------
"Farmers harvested wheat in their fields." -> True

Explanation:  
Based on the rule in the training examples, sentences labeled "True" typically describe outdoor activities or events occurring in open, natural, or public environments, often with a sense of time, season, or weather, and involving movement or interaction with the environmen

In [76]:
# Probe: the outdoor farming sentence with the time reference removed,
# repeated at the lower sampling temperature (0.3).
explain_prompt = "\n".join((
    "",
    "Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?",
    "",
    '"Farmers harvested wheat in their fields." -> ',
    "",
    "After classifying it, please explain whether your original rule still holds or needs modification.",
))

# Run the classification pass followed by the custom probe; the returned
# mapping's 'explanations' entry is printed in the next cell.
results = classify_then_explain(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # custom probe prompt assembled above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.3,
    num_runs=6,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"Farmers harvested wheat in their fields." -> True

**Explanation:**  
According to the rule inferred from the training examples, sentences are labeled "True" if they describe people engaging in outdoor activities in natural or open settings, often with a sense of time or event, and "False" if the activity is routine, indoors, or lacks a specific outdoor/natural context.

In this sentence, "Farmers harvested wheat in their fields" describes an outdoor activity (harvesting) in a natural/open setting (fields). Even though there is no explicit time marker, similar sentences in the training set (e.g., "Farmers harvested corn in the field early in the morning." -> True) were labeled "True."

**Does the rule still hold?**  
Yes, the original rule holds for this case. The activity is outdoors and in a natural setting, so "True" is consistent with the pattern in the training data. However, the rule 

In [None]:
# Show every collected explanation, each followed by a dashed divider line.
# NOTE(review): this cell has no execution count, and its saved output shows
# the chemistry-lab probe rather than the farming probe run just above; it
# looks stale — re-run it to refresh.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"Students performed experiments in the chemistry lab during a normal school day." -> False

**Explanation:**  
According to the rule demonstrated in the training examples, sentences describing routine, everyday, or expected activities in typical indoor or institutional settings (such as schools, offices, labs, libraries, etc.) are labeled as False. The rule seems to focus on distinguishing between ordinary, habitual, or work/school-related indoor activities (False) and more special, outdoor, event-based, or time-specific activities (True).

**Does the rule still hold?**  
Yes, the original rule still holds for this case. The sentence describes a normal, routine activity (students doing experiments in a lab during a school day), which matches the pattern of other False-labeled examples. No modification is needed for the rule based on this case.
--------------------
"Students performed experiments in the chemistry lab during a normal school day." -> False

**Explanation:**  
Based on the

In [77]:
# Probe: an outdoor recreational sentence with no time reference,
# sampled at temperature 0.3.
explain_prompt = "\n".join((
    "",
    "Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?",
    "",
    '"Children played tag in the community park." -> ',
    "",
    "After classifying it, please explain whether your original rule still holds or needs modification.",
))

# Run the classification pass followed by the custom probe; the returned
# mapping's 'explanations' entry is printed in the next cell.
results = classify_then_explain(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # custom probe prompt assembled above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.3,
    num_runs=6,
)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"Children played tag in the community park." -> True

**Explanation:**  
According to the rule demonstrated in the training examples, sentences are labeled True when they describe people engaging in recreational, outdoor, or special activities in public or natural settings, often with a sense of occasion, time, or atmosphere (e.g., "at sunset," "during the festival," "on Saturday"). Sentences are labeled False when the activity is routine, indoors, or lacks that special or outdoor context.

In this case, "Children played tag in the community park" fits the True pattern: it's an outdoor, recreational activity in a public space.

**Does the rule still hold?**  
Yes, the original rule holds for this case. However, if more ambiguous or borderline cases arise (e.g., outdoor activities that are routine or lack a special time/occasion), the rule may need further refinement to clarify those distinct

In [78]:
# Show every collected explanation, each followed by a dashed divider line.
for explanation_text in results['explanations']:
    print(explanation_text, "--------------------", sep="\n")

"Children played tag in the community park." -> True

**Explanation:**  
According to the rule I have been using, sentences are labeled "True" if they describe people engaging in an activity outdoors, especially in a public or natural setting, often with a sense of time or occasion (e.g., sunset, afternoon, weekend). "False" is used for activities that are routine, indoors, or lack a distinct outdoor or special context.

In this case, "Children played tag in the community park" describes an outdoor activity in a public space, which fits the "True" pattern.

**Does the rule still hold?**  
Yes, the original rule still holds for this example. However, to be more precise, the rule could be clarified as:

- "True" for non-routine, outdoor, communal, or special-occasion activities.
- "False" for routine, indoor, or everyday activities, especially those in private or institutional settings.

This refinement ensures the rule is consistently applied.
--------------------
"Children played tag i

In [79]:
# Probe: an outdoor setting with NO time reference, sampled at temperature 0.7.
explain_prompt = """
Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?

"Children played tag in the community park." -> 

After classifying it, please explain whether your original rule still holds or needs modification."""

# Collect the run settings in one place, then classify and ask the follow-up probe.
probe_settings = dict(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # the custom probe defined just above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.7,
    num_runs=6,
)
results = classify_then_explain(**probe_settings)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"Children played tag in the community park." -> True

**Explanation:**  
Based on the pattern in the training examples and my previous outputs, the rule I've been using is:  
- Sentences are labeled **True** if they describe people engaging in an outdoor or semi-outdoor recreational or communal activity, often in a natural or public setting, especially at a specific time or under particular conditions (such as weather, time of day, or event).  
- Sentences are labeled **False** if the activity is indoors, routine, or lacks a specific outdoor/communal/recreational setting or occasion.

In this case, "Children played tag in the community park" describes an outdoor recreational activity in a public setting, aligning with the "True" category.

**Does the rule still hold?**  
Yes, my original rule holds for this case. The sentence fits well within the pattern of "True" examples in the dataset. Ho

In [80]:
# Print the explanation from every run, each followed by a divider line.
for explanation in results['explanations']:
    print(explanation, "--------------------", sep="\n")

"Children played tag in the community park." -> True

**Explanation:**  
According to the pattern in the training examples, sentences are labeled **True** when they depict people engaged in specific, often recreational or outdoor activities in particular, named places, typically at a discrete time or under particular conditions (like "at sunset," "last night," "during the thunderstorm"). They are labeled **False** when the activity is routine, regular, institutional, or indoors, or lacks a specific, eventful context.

In this case, "Children played tag in the community park" describes an outdoor, specific, and recreational activity in a named place (community park), and fits the pattern of examples labeled **True**.

**Does the rule still hold?**  
Yes, the rule holds for this example. However, this probe highlights that the rule depends heavily on context—if the sentence said "every afternoon" or "during recess," the label might be **False** (as with other routine/institutional activi

In [85]:
# Show the full classification prompt that the probe cells pass as `first_prompt`.
# print() is used deliberately so the newlines render instead of appearing as "\n" in a repr.
print(classification_prompt)

You are a helpful assistant.
Return only True or False after each arrow.

### Training examples
"Volunteers packed meals in the food bank warehouse last weekend." -> False
"Students read novels in the school library every afternoon." -> False
"Travelers snapped photos atop the hill during the golden hour." -> True
"A dog walker strolled through the neighborhood street at dusk." -> True
"Siblings built a snowman in their backyard during the snowfall." -> True
"Kids watched cartoons in the family room on rainy afternoons." -> False
"Picnickers lounged under umbrellas by the lake on a hot afternoon." -> True
"Emily jogged around the city park early this morning." -> True
"Families watched fireworks from the town square on Independence Day." -> True
"Friends watched a movie in the home theater yesterday evening." -> False
"Migratory birds rested on the wetland shore during the rainstorm." -> True
"Doctors examined patients in the hospital clinic every morning." -> False
"Employees held mee

In [81]:
# Probe: an INDOOR setting combined with an outdoor time reference (sunset),
# sampled at temperature 0.7.
explain_prompt = """
Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?

"Children played tag in the indoor recreation center during a beautiful sunset." -> 

After classifying it, please explain whether your original rule still holds or needs modification."""

# Collect the run settings in one place, then classify and ask the follow-up probe.
probe_settings = dict(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # the custom probe defined just above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.7,
    num_runs=6,
)
results = classify_then_explain(**probe_settings)

✅ Classification finished: mean acc 0.950 (±0.000) across 6 runs

📝 Explanation from last run:

"Children played tag in the indoor recreation center during a beautiful sunset." -> False

**Explanation:**  
Based on the training examples, the rule appears to be:  
- True is assigned when the activity is outdoors and/or interacts with notable natural or environmental conditions (e.g., weather, sunlight, time of day, landscape).
- False is assigned for indoor activities or those without a clear connection to the outdoor environment or natural setting, even if a time of day or weather is mentioned.

In this case, although "a beautiful sunset" is mentioned, the activity takes place **indoors** ("indoor recreation center"), so the sunset does not directly influence or set the scene for the activity. This is consistent with the False label assigned to other indoor activities, even when a time or weather is specified (e.g., "Kids watched cartoons in the family room on rainy afternoons." -> Fal

In [82]:
# Print the explanation from every run, each followed by a divider line.
for explanation in results['explanations']:
    print(explanation, "--------------------", sep="\n")

"Children played tag in the indoor recreation center during a beautiful sunset." -> False

**Explanation:**  
The rule I have been using is to label as **True** when the activity takes place outdoors, often highlighting a connection to nature, time of day, or weather, and **False** when the activity is indoors or not directly tied to an outdoor environment. In this sentence, although "a beautiful sunset" is mentioned, the children are playing indoors ("in the indoor recreation center"), so the activity itself is not outdoors.

**Does the rule still hold?**  
Yes, the original rule still holds:  
- **True:** Outdoor activities, often with reference to nature, weather, or time of day.
- **False:** Indoor activities, regardless of time of day or weather outside.

This example confirms that the setting of the activity (indoor vs. outdoor) is the deciding factor, not the mention of outdoor conditions.
--------------------
"Children played tag in the indoor recreation center during a beautif

In [83]:
# Probe: an INDOOR setting combined with an outdoor time reference (sunset),
# sampled at temperature 0.3.
explain_prompt = """
Let's probe your understanding deeper. According to the rule you're using, how would you classify this case?

"Children played tag in the indoor recreation center during a beautiful sunset." -> 

After classifying it, please explain whether your original rule still holds or needs modification."""

# Collect the run settings in one place, then classify and ask the follow-up probe.
probe_settings = dict(
    first_prompt=classification_prompt,  # built with build_classification_prompt
    second_prompt=explain_prompt,        # the custom probe defined just above
    model_name=openai_4op1,
    dataset_json="datasets/outdoors_dataset_it_80_40_40.json",
    temperature=0.3,
    num_runs=6,
)
results = classify_then_explain(**probe_settings)

✅ Classification finished: mean acc 0.946 (±0.009) across 6 runs

📝 Explanation from last run:

"Children played tag in the indoor recreation center during a beautiful sunset." -> False

**Explanation:**  
Based on the rule inferred from the training examples, scenes are labeled **True** when they depict people engaging in activities outdoors or in natural/light-dependent settings, especially when the time of day or weather is referenced (e.g., sunset, sunrise, rain, etc.). Scenes set indoors, even if the time of day or weather is mentioned, are labeled **False**.

In this case, although "a beautiful sunset" is mentioned, the activity occurs "in the indoor recreation center," which is an indoor location. Therefore, according to the rule, the label is **False**.

**Does the rule still hold?**  
Yes, the original rule holds:  
- **True** for outdoor/nature-based activities with time/weather context.  
- **False** for indoor activities, regardless of time/weather context.  

No modificati

In [84]:
# Print the explanation from every run, each followed by a divider line.
for explanation in results['explanations']:
    print(explanation, "--------------------", sep="\n")

"Children played tag in the indoor recreation center during a beautiful sunset." -> False

**Explanation:**  
According to the rule demonstrated in the training examples, sentences are labeled **True** when they describe people engaging in outdoor or nature-related activities that are time-specific (often referencing natural phenomena like sunset, sunrise, weather, or specific times of day), and **False** when the activity is indoors, routine, or not tied to an outdoor/nature/time-specific context.

In this case, even though "a beautiful sunset" is mentioned, the activity ("played tag") occurs **indoors** ("in the indoor recreation center"). The rule, as shown in the examples, prioritizes the location of the activity—if it's indoors, the label is **False**, regardless of the time or natural event outside.

**Does the rule still hold?**  
Yes, the original rule holds. The classification remains consistent: outdoor, time/nature-specific activities are labeled **True**; indoor or routine 