## Imports

In [153]:
import sys, os
import importlib

# Set up the Python path for the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import evals_utils
importlib.reload(evals_utils)

from inference_utils import build_classification_prompt, build_articulation_prompt, chat, make_inference_fn
from evals_utils import load_expected_labels, evaluate_classification_accuracy, parse_predictions, classify_then_explain, flexible_parse_predictions

from dotenv import load_dotenv

In [2]:
load_dotenv()  # reads from .env in current working directory
# You can also specify path manually: load_dotenv(dotenv_path="../.env")

# Confirm it worked:
assert "OPENAI_API_KEY" in os.environ
print("OpenAI key loaded.")

OpenAI key loaded.


### Models

In [3]:
from openai import OpenAI
# Retrieve all available models
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
models = client.models.list()

# Filter models that include 'gpt-4' in their ID
gpt_4_models = [model.id for model in models.data if "gpt-4" in model.id.lower()]

# Print the list of GPT-4 models
print("Available GPT-4 Models:")
for model_id in gpt_4_models:
    print(f"- {model_id}")


Available GPT-4 Models:
- gpt-4o-audio-preview-2024-12-17
- gpt-4o-audio-preview-2024-10-01
- gpt-4-turbo-preview
- gpt-4-turbo
- gpt-4-turbo-2024-04-09
- gpt-4.1-nano
- gpt-4.1-nano-2025-04-14
- gpt-4o-realtime-preview-2024-10-01
- gpt-4o-realtime-preview
- gpt-4
- chatgpt-4o-latest
- gpt-4o-realtime-preview-2024-12-17
- gpt-4o-mini-audio-preview
- gpt-4o-audio-preview
- gpt-4o-mini-realtime-preview
- gpt-4.1-mini
- gpt-4o-mini-realtime-preview-2024-12-17
- gpt-4o-mini-search-preview
- gpt-4.1-mini-2025-04-14
- gpt-4o-search-preview
- gpt-4-1106-preview
- gpt-4o-mini-search-preview-2025-03-11
- gpt-4-0125-preview
- gpt-4o-2024-11-20
- gpt-4o-2024-05-13
- gpt-4-0613
- gpt-4o-mini-tts
- gpt-4o-transcribe
- gpt-4.5-preview
- gpt-4.5-preview-2025-02-27
- gpt-4o-mini-transcribe
- gpt-4o-search-preview-2025-03-11
- gpt-4o
- gpt-4o-mini
- gpt-4o-2024-08-06
- gpt-4.1
- gpt-4.1-2025-04-14
- gpt-4o-mini-2024-07-18
- gpt-4o-mini-audio-preview-2024-12-17


In [37]:
openai_4o = "openai/gpt-4o"
model_4o = "gpt-4o"
openai_4op1 = "openai/gpt-4.1"
openai_4op1_nano = "openai/gpt-4.1-nano"
openai_4op1_mini = "openai/gpt-4.1-mini"

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

## Experiments

In [34]:
import openai

def openai_get_response(prompt, model="gpt-4", temperature=0.7, max_tokens=300):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response['choices'][0]['message']['content']

In [45]:
from string import Template

def render_prompt(template_str: str, **kwargs) -> str:
    """
    Render a prompt template.
    
    - template_str should use `$varname` placeholders (Template syntax),
      or you can swap in `.format`-style `{varname}` if you prefer.
    - kwargs are the values you want substituted in.
    """
    return Template(template_str).substitute(**kwargs)


import os
import datetime

def save_prompt_response(prompt, response, filename, subfolder="logs"):
    """
    Save the prompt and response to a text file in the specified subfolder.
    
    Args:
        prompt (str): The prompt sent to the LLM
        response (str): The response from the LLM
        filename (str): Name for the output file (without extension)
        subfolder (str): Subfolder to save the file in (default: "logs")
    
    Returns:
        str: Path to the saved file
    """
    # Create logs folder if it doesn't exist
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)
        print(f"Created folder: {subfolder}")
    
    # Add timestamp to filename
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    full_filename = f"{filename}_{timestamp}.txt"
    file_path = os.path.join(subfolder, full_filename)
    
    # Write prompt and response to file with clear separation
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("="*80 + "\n")
        f.write("PROMPT:\n")
        f.write("="*80 + "\n\n")
        f.write(prompt)
        f.write("\n\n")
        f.write("="*80 + "\n")
        f.write("RESPONSE:\n")
        f.write("="*80 + "\n\n")
        f.write(response)
    
    print(f"Saved prompt and response to {file_path}")
    return file_path

In [56]:
import os
import json
import random

def create_dataset_json(sentences_dict, num_train, num_test, filename, subfolder="datasets"):
    """
    Create and save a JSON dataset with train/test split from a dictionary of TRUE/FALSE sentences.
    
    Args:
        sentences_dict (dict): Dictionary with 'TRUE' and 'FALSE' keys containing lists of sentences
        num_train (int): Number of samples for training set
        num_test (int): Number of samples for test set
        filename (str): Name for the output file (without extension)
        subfolder (str): Subfolder to save the file in (default: "datasets")
    
    Returns:
        str: Path to the saved file
    """
    # Create datasets folder if it doesn't exist
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)
        print(f"Created folder: {subfolder}")
    
    # Prepare all samples with labels
    all_samples = []
    for sentence in sentences_dict['TRUE']:
        all_samples.append({"input": sentence, "label": True})
    for sentence in sentences_dict['FALSE']:
        all_samples.append({"input": sentence, "label": False})
    
    # Shuffle samples
    random.shuffle(all_samples)
    
    # Check if we have enough samples
    total_samples = len(all_samples)
    if total_samples < (num_train + num_test):
        print(f"Warning: Requested {num_train + num_test} samples but only {total_samples} available.")
        num_train = min(num_train, total_samples // 2)
        num_test = min(num_test, total_samples - num_train)
        print(f"Adjusted to {num_train} train and {num_test} test samples.")
    
    # Split into train and test
    train_samples = all_samples[:num_train]
    test_samples = all_samples[num_train:num_train+num_test]
    
    # Create dataset structure
    dataset = {
        "train": train_samples,
        "test": test_samples
    }
    
    # Save to JSON file
    file_path = os.path.join(subfolder, f"{filename}.json")
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2)
    
    print(f"Dataset saved to {file_path}")
    print(f"Train samples: {len(train_samples)}, Test samples: {len(test_samples)}")
    
    # Print label distribution
    train_true = sum(1 for sample in train_samples if sample["label"])
    test_true = sum(1 for sample in test_samples if sample["label"])
    print(f"Train labels: {train_true} True, {num_train - train_true} False")
    print(f"Test labels: {test_true} True, {num_test - test_true} False")
    
    return file_path

In [229]:
import re

def extract_sentences(llm_response):
    """
    Extract sentences from LLM response into a dictionary with TRUE and FALSE keys,
    filtering out explanations.
    
    Args:
        llm_response (str): The response from the LLM
        
    Returns:
        dict: Dictionary with 'TRUE' and 'FALSE' keys containing lists of sentences
    """
    # Initialize result dictionary
    result = {
        'TRUE': [],
        'FALSE': []
    }
    
    # Find the TRUE section
    true_pattern = r'## TRUE\s+((?:".*?"\s*- explanation:.*?\n)+)'
    true_match = re.search(true_pattern, llm_response, re.DOTALL)
    
    # Find the FALSE section
    false_pattern = r'## FALSE\s+((?:".*?"\s*- explanation:.*?\n)+)'
    false_match = re.search(false_pattern, llm_response, re.DOTALL)
    
    # Extract TRUE sentences if found
    if true_match:
        true_text = true_match.group(1)
        # Extract just the quoted sentences, ignore explanations
        true_sentences = re.findall(r'"(.*?)"', true_text)
        result['TRUE'] = true_sentences
    
    # Extract FALSE sentences if found
    if false_match:
        false_text = false_match.group(1)
        # Extract just the quoted sentences, ignore explanations
        false_sentences = re.findall(r'"(.*?)"', false_text)
        result['FALSE'] = false_sentences
    
    return result


def extract_sentences2(llm_response):
    """
    Extract sentences from LLM response into a dictionary with TRUE and FALSE keys,
    filtering out explanations.
    
    Args:
        llm_response (str): The response from the LLM
        
    Returns:
        dict: Dictionary with 'TRUE' and 'FALSE' keys containing lists of sentences
    """
    # Initialize result dictionary
    result = {
        'TRUE': [],
        'FALSE': []
    }
    
    # Split the response by sections
    true_section = ""
    false_section = ""
    
    if "## TRUE" in llm_response:
        sections = llm_response.split("## TRUE", 1)
        if len(sections) > 1:
            remaining = sections[1]
            if "## FALSE" in remaining:
                parts = remaining.split("## FALSE", 1)
                true_section = parts[0].strip()
                false_section = parts[1].strip()
            else:
                true_section = remaining.strip()
    
    # Process TRUE section
    if true_section:
        # Look for sentences in quotes at the beginning of each line
        lines = true_section.split('\n')
        for line in lines:
            line = line.strip()
            if line and line.startswith('"'):
                # Extract just the sentence part (before " - explanation:")
                sentence_match = re.match(r'"([^"]+)".*', line)
                if sentence_match:
                    result['TRUE'].append(sentence_match.group(1))
    
    # Process FALSE section
    if false_section:
        # Look for sentences in quotes at the beginning of each line
        lines = false_section.split('\n')
        for line in lines:
            line = line.strip()
            if line and line.startswith('"'):
                # Extract just the sentence part (before " - explanation:")
                sentence_match = re.match(r'"([^"]+)".*', line)
                if sentence_match:
                    result['FALSE'].append(sentence_match.group(1))
    
    return result

def extract_sentences3(llm_response):
    """
    Extract sentences from LLM response into a dictionary with TRUE and FALSE keys,
    filtering out explanations and removing numbering.
    
    Args:
        llm_response (str): The response from the LLM
        
    Returns:
        dict: Dictionary with 'TRUE' and 'FALSE' keys containing lists of sentences
    """
    # Initialize result dictionary
    result = {
        'TRUE': [],
        'FALSE': []
    }
    
    # Split the response by sections
    true_section = ""
    false_section = ""
    
    if "## TRUE" in llm_response or "## VERO" in llm_response:
        # Handle both English and Italian headers
        if "## TRUE" in llm_response:
            sections = llm_response.split("## TRUE", 1)
        else:
            sections = llm_response.split("## VERO", 1)
            
        if len(sections) > 1:
            remaining = sections[1]
            if "## FALSE" in remaining:
                parts = remaining.split("## FALSE", 1)
                true_section = parts[0].strip()
                false_section = parts[1].strip()
            elif "## FALSO" in remaining:
                parts = remaining.split("## FALSO", 1)
                true_section = parts[0].strip()
                false_section = parts[1].strip()
            else:
                true_section = remaining.strip()
    
    # Process TRUE section
    if true_section:
        lines = true_section.split('\n')
        for line in lines:
            line = line.strip()
            # Skip empty lines
            if not line:
                continue
                
            # Remove numbering like "1. ", "2. ", etc.
            line = re.sub(r'^\d+\.\s*', '', line)
            
            # Extract the sentence part (in quotes)
            sentence_match = re.search(r'"([^"]+)"', line)
            if sentence_match:
                result['TRUE'].append(sentence_match.group(1))
    
    # Process FALSE section
    if false_section:
        lines = false_section.split('\n')
        for line in lines:
            line = line.strip()
            # Skip empty lines
            if not line:
                continue
                
            # Remove numbering like "1. ", "2. ", etc.
            line = re.sub(r'^\d+\.\s*', '', line)
            
            # Extract the sentence part (in quotes)
            sentence_match = re.search(r'"([^"]+)"', line)
            if sentence_match:
                result['FALSE'].append(sentence_match.group(1))
    
    return result

### Rule 2: Alliteration Rule

In [None]:
# Updated template with structured output format including explanations
sentence_template = """ Generate $total sentences following this template: "[Subject] [verb] [object] while [subject2] [verb2] [object2]."

For $true_count sentences: Include AT LEAST $min_same content words (nouns, verbs, adjectives, adverbs) that begin with the same letter  
For $false_count sentences: Ensure NO MORE THAN $max_same content words begin with the same letter

Content words are nouns, verbs, adjectives, adverbs. Articles, prepositions, and conjunctions don't count.

IMPORTANT: For TRUE sentences, vary your approach. Some sentences should have subjects with DIFFERENT starting letters, where the alliteration comes from other content words. Avoid creating a pattern where most TRUE sentences have matching names.

Format your response exactly as follows:

## TRUE
"Sentence 1." - explanation: (brief explanation)
"Sentence 2." - explanation: (brief explanation)
"Sentence 3." - explanation: (brief explanation)
...

## FALSE
"Sentence 1." - explanation: (brief explanation)
"Sentence 2." - explanation: (brief explanation)
"Sentence 3." - explanation: (brief explanation)
...

Examples:
- "Peter picked peppers while Wendy watched wolves." - explanation: (p,p,p - alliteration with first name and two verbs)
- "Sarah silently sorted samples while Tom tested tubes." - explanation: (s,s,s,t,t - alliteration with first name, adverb, verb and second subject with different object)
- "Mary made music while John jumped joyfully." - explanation: (only two j-words, not enough for TRUE)

Make sentences natural and sensible.
"""

In [78]:
# Render it with different values:
data_generation_prompt = render_prompt(
    sentence_template,
    total=100,           # 8 sentences
    true_count=50,      # 3 “True”
    false_count=50,     # 5 “False”
    min_same=3,        # at least 3 same‑letter content words
    max_same=2,        # no more than 2 same‑letter content words
)
print(data_generation_prompt)
reply = await chat(openai_4op1, data_generation_prompt, n=1, temperature=1., print_choices=False)


Generate 100 sentences following this template: "[Subject] [verb] [object] while [subject2] [verb2] [object2]."

For 50 sentences: Include AT LEAST 3 content words (nouns, verbs, adjectives, adverbs) that begin with the same letter  
For 50 sentences: Ensure NO MORE THAN 2 content words begin with the same letter

Content words are nouns, verbs, adjectives, adverbs. Articles, prepositions, and conjunctions don't count.

IMPORTANT: For TRUE sentences, vary your approach. Some sentences should have subjects with DIFFERENT starting letters, where the alliteration comes from other content words. Avoid creating a pattern where most TRUE sentences have matching names.

Format your response exactly as follows:

## TRUE
"Sentence 1." - explanation: (brief explanation)
"Sentence 2." - explanation: (brief explanation)
"Sentence 3." - explanation: (brief explanation)
...

## FALSE
"Sentence 1." - explanation: (brief explanation)
"Sentence 2." - explanation: (brief explanation)
"Sentence 3." - ex

In [79]:
response = reply.choices[0].message.content
sentences_dict = extract_sentences(response)
print(f"Found {len(sentences_dict['TRUE'])} TRUE sentences and {len(sentences_dict['FALSE'])} FALSE sentences")
save_prompt_response(data_generation_prompt, response, "alliteration_rule_dataset", subfolder="logs")

Found 53 TRUE sentences and 49 FALSE sentences
Saved prompt and response to logs/alliteration_rule_dataset_20250505_084720.txt


'logs/alliteration_rule_dataset_20250505_084720.txt'

In [80]:
create_dataset_json(sentences_dict, 50, 50, "alliteration_rule_dataset_50_50")

Dataset saved to datasets/alliteration_rule_dataset_50_50.json
Train samples: 50, Test samples: 50
Train labels: 30 True, 20 False
Test labels: 22 True, 28 False


'datasets/alliteration_rule_dataset_50_50.json'

In [81]:
intro = "You are an expert text classifier that assigns labels based on specific rules. You will be provided with training examples that demonstrate the classification rule, followed by test examples that you need to classify using the same rule."
question = "Classify each of the following test examples as either True or False based on the pattern you observed in the training data:"
instructions = "Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation."
outro = "Remember to output only True or False for each test example, maintaining the same order as presented. Do not include any other text in your response."

# natural language articulation
classification_prompt = build_classification_prompt(
    "datasets/alliteration_rule_dataset_50_50.json",
    intro=intro,
    instructions=instructions,
    question=question,
)

In [82]:
gold_labels = load_expected_labels("datasets/alliteration_rule_dataset_50_50.json")
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1_mini, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

{'run_accuracies': [np.float64(0.76), np.float64(0.74), np.float64(0.74), np.float64(0.82)], 'mean_accuracy': 0.765, 'std_accuracy': 0.03278719262150998, 'overall_accuracy': 0.765, 'all_outs': ['True\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse', 'True\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse', '

In [83]:
# natural language articulation
intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples and asked to determine the underlying rule that distinguishes True from False cases."
question = "What is the exact rule that determines whether an example is labeled True or False?"
instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence first, then you may elaborate if needed."
articulation_prompt = build_articulation_prompt(
    "datasets/alliteration_rule_dataset_50_50.json",
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
**Rule:**  
Both people mentioned in the sentence must perform actions in which the key nouns (objects or subjects) in their respective clauses all start with the same letter as their own first name.

---

**Elaboration:**  
- Each sentence has two people, each doing an activity (e.g., "Nadia needed new needles" and "Carrie counted colorful cards").
- For a sentence to be labeled True:
  - For both people, the nouns or main objects associated with their verbs start with the same letter as their first name.
    - Example: "Nadia needed new needles" — Nadia, needed, new, needles (all N's).
    - "Carrie counted colorful cards" — Carrie, counted, colorful, cards (all C's).
- If either person's key nouns/objects do **not** start with the same letter as their name, the sentence is labeled False.
    - Example: "Anna checked her messages" — Anna (A), checked (C), messages (M): mismatch, so False.
    - "Carmen sang a tune" — Carmen (C), sang (S), tune (T): mism

### Rule 17: Italian word - while

In [139]:
# italian_word_template = """ Generate $total sentences following this template: "[Subject] [verb] [object] while [subject2] [verb2] [object2]."

# For $true_count sentences: Include EXACTLY ONE Italian word naturally integrated into the otherwise English sentence
# For $false_count sentences: Use ONLY English words, with no Italian words

# IMPORTANT: For TRUE sentences:
# - The Italian word should fit naturally in the sentence context
# - Vary the part of speech of the Italian word (nouns, verbs, adjectives, adverbs)
# - Avoid overusing stereotypical Italian words like "pizza", "pasta", "gelato"
# - The Italian word could appear in any position in the sentence
# - Ensure the Italian word is actually Italian, not Spanish or Latin

# Format your response exactly as follows:

# ## TRUE
# "Sentence 1." - explanation: (identifies the Italian word and its meaning)
# "Sentence 2." - explanation: (identifies the Italian word and its meaning)
# "Sentence 3." - explanation: (identifies the Italian word and its meaning)
# ...

# ## FALSE
# "Sentence 1." - explanation: (confirms all words are English)
# "Sentence 2." - explanation: (confirms all words are English)
# "Sentence 3." - explanation: (confirms all words are English)
# ...

# Examples:
# - "Mary made music while John jumped allegramente." - explanation: ("allegramente" is Italian for "cheerfully")
# - "The chef prepared the meal with fresh pomodori from the garden." - explanation: ("pomodori" is Italian for "tomatoes")
# - "Thomas walked slowly to the park while birds sang loudly." - explanation: (all words are English)

# Make sentences natural, varied, and sensible. The Italian words should feel like a natural code-switch, not forced insertions.
# """

In [140]:
italian_word_template = """ Generate $total sentences following this template: "[Subject] [verb] [object] while [subject2] [verb2] [object2]."

For $true_count sentences: Include EXACTLY ONE Italian word naturally integrated into the otherwise English sentence
For $false_count sentences: Use ONLY English words, with no Italian words

IMPORTANT FOR TRUE SENTENCES:
- Use only common English names for people (like John, Mary, Sarah, David, etc.)
- The Italian word should NOT be a name or proper noun
- The Italian word must be a common noun, verb, adjective, or adverb
- Vary the part of speech of the Italian word across examples
- Avoid stereotypical Italian words like "pizza", "pasta", "gelato"
- The Italian word should fit naturally in the sentence context

Format your response exactly as follows:

## TRUE
"Sentence 1." - explanation: (identifies the Italian word and its meaning)
"Sentence 2." - explanation: (identifies the Italian word and its meaning)
"Sentence 3." - explanation: (identifies the Italian word and its meaning)
...

## FALSE
"Sentence 1." - explanation: (confirms all words are English)
"Sentence 2." - explanation: (confirms all words are English)
"Sentence 3." - explanation: (confirms all words are English)
...

Examples:
- "Mary made music while John jumped allegramente." - explanation: ("allegramente" is Italian for "cheerfully")
- "The chef prepared the meal with fresh pomodori from the garden." - explanation: ("pomodori" is Italian for "tomatoes")
- "Thomas walked slowly to the park while birds sang loudly." - explanation: (all words are English)

Make sentences natural, varied, and sensible. The Italian words should feel like a natural code-switch, not forced insertions.
"""

In [147]:
# Render it with different values:
data_generation_prompt = render_prompt(
    italian_word_template,
    total=60,        # 80 sentences total
    true_count=30,   # 40 "True" sentences
    false_count=30   # 40 "False" sentences
)
print(data_generation_prompt)
reply = await chat(openai_4op1, data_generation_prompt, n=1, temperature=1., print_choices=False)

 Generate 60 sentences following this template: "[Subject] [verb] [object] while [subject2] [verb2] [object2]."

For 30 sentences: Include EXACTLY ONE Italian word naturally integrated into the otherwise English sentence
For 30 sentences: Use ONLY English words, with no Italian words

IMPORTANT FOR TRUE SENTENCES:
- Use only common English names for people (like John, Mary, Sarah, David, etc.)
- The Italian word should NOT be a name or proper noun
- The Italian word must be a common noun, verb, adjective, or adverb
- Vary the part of speech of the Italian word across examples
- Avoid stereotypical Italian words like "pizza", "pasta", "gelato"
- The Italian word should fit naturally in the sentence context

Format your response exactly as follows:

## TRUE
"Sentence 1." - explanation: (identifies the Italian word and its meaning)
"Sentence 2." - explanation: (identifies the Italian word and its meaning)
"Sentence 3." - explanation: (identifies the Italian word and its meaning)
...

## F

In [148]:
response = reply.choices[0].message.content
sentences_dict = extract_sentences2(response)
print(f"Found {len(sentences_dict['TRUE'])} TRUE sentences and {len(sentences_dict['FALSE'])} FALSE sentences")
save_prompt_response(data_generation_prompt, response, "alliteration_rule_dataset_it_60_v2", subfolder="logs")

Found 31 TRUE sentences and 30 FALSE sentences
Saved prompt and response to logs/alliteration_rule_dataset_it_60_v2_20250505_101713.txt


'logs/alliteration_rule_dataset_it_60_v2_20250505_101713.txt'

In [149]:
create_dataset_json(sentences_dict, 30, 30, "alliteration_rule_dataset_it_60_30_30_v2")

Dataset saved to datasets/alliteration_rule_dataset_it_60_30_30_v2.json
Train samples: 30, Test samples: 30
Train labels: 17 True, 13 False
Test labels: 14 True, 16 False


'datasets/alliteration_rule_dataset_it_60_30_30_v2.json'

In [150]:
intro = "You are an expert text classifier that assigns labels based on specific rules. You will be provided with training examples that demonstrate the classification rule, followed by test examples that you need to classify using the same rule."
question = "Classify each of the following test examples as either True or False based on the pattern you observed in the training data:"
instructions = "Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation."
outro = "Remember to output only True or False for each test example, maintaining the same order as presented. Do not include any other text in your response."

# natural language articulation
classification_prompt = build_classification_prompt(
    "datasets/alliteration_rule_dataset_it_60_30_30_v2.json",
    intro=intro,
    instructions=instructions,
    question=question,
)

In [151]:
print(classification_prompt)

You are an expert text classifier that assigns labels based on specific rules. You will be provided with training examples that demonstrate the classification rule, followed by test examples that you need to classify using the same rule.
Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation.

### Training examples
"Chris played the guitar while Amy sang a song." -> False
"Peter jotted notes sulla carta while Jane listened closely." -> True
"Jessica chose una maglia while Rob sorted the laundry." -> True
"Lauren ate her panino while George told a story." -> True
"Hannah swept the porch while Ethan watered the flowers." -> False
"Jerry admired the quadro while Olivia drew a sketch." -> True
"Grace painted her nails while Nick made a call." -> False
"Charlie sketched an albero while Julia watched th

In [None]:
gold_labels = load_expected_labels("datasets/alliteration_rule_dataset_it_60_30_30_v2.json")
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

Iteration 3
Output: False  
False  
False  
True  
True  
False  
False  
True  
False  
False  
True  
False  
True  
True  
True  
False  
True  
True  
False  
True  
True  
True  
False  
False  
False  
False  
True  
True  
True  
Preds: ['FALSE', 'FALSE', 'FALSE', 'TRUE', 'TRUE', 'FALSE', 'FALSE', 'TRUE', 'FALSE', 'FALSE', 'TRUE', 'FALSE', 'TRUE', 'TRUE', 'TRUE', 'FALSE', 'TRUE', 'TRUE', 'FALSE', 'TRUE', 'TRUE', 'TRUE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'TRUE', 'TRUE', 'TRUE']
Expected: ['FALSE', 'FALSE', 'FALSE', 'FALSE', 'TRUE', 'FALSE', 'FALSE', 'TRUE', 'TRUE', 'FALSE', 'FALSE', 'FALSE', 'TRUE', 'TRUE', 'TRUE', 'FALSE', 'TRUE', 'TRUE', 'FALSE', 'TRUE', 'TRUE', 'TRUE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'FALSE', 'TRUE', 'TRUE', 'TRUE']


ValueError: Expected 30 preds, got 29

In [157]:
# # natural language articulation
# intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples and asked to determine the underlying rule that distinguishes True from False cases."
# question = "What is the exact rule that determines whether an example is labeled True or False?"
# instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence first, then you may elaborate if needed."
# articulation_prompt = build_articulation_prompt(
#     "datasets/alliteration_rule_dataset_it_100_50_50.json",
#     intro=intro,
#     instructions=instructions,
#     question=question,
# )
# reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)

In [158]:
# # natural language articulation
# intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples and asked to determine the underlying rule that distinguishes True from False cases."
# question = "What is the exact rule that determines whether an example is labeled True or False?"
# instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence first. No explanations, no reasoning steps, no analysis of examples."
# articulation_prompt = build_articulation_prompt(
#     "datasets/alliteration_rule_dataset_it_100_50_50.json",
#     intro=intro,
#     instructions=instructions,
#     question=question,
# )
# reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)

### Rule 17 - Italian words - simplified

In [156]:
italian_word_template = """Generate $total sentences following this template: "[Subject] [verb] [object] [prepositional phrase]."

For $true_count sentences: Include EXACTLY ONE Italian word naturally integrated into the otherwise English sentence
For $false_count sentences: Use ONLY English words, with no Italian words

IMPORTANT FOR TRUE SENTENCES:
- Use only common English names for people (like John, Mary, Sarah, David, etc.)
- The Italian word should NOT be a name or proper noun
- The Italian word must be a common noun, verb, adjective, or adverb
- Vary the part of speech AND POSITION of the Italian word across examples
- Place Italian words in different positions - beginning, middle, or end of the sentence
- Avoid stereotypical Italian words like "pizza", "pasta", "gelato"
- The Italian word should fit naturally in the sentence context

Format your response exactly as follows:

## TRUE
"Sentence 1." - explanation: (identifies the Italian word and its meaning)
"Sentence 2." - explanation: (identifies the Italian word and its meaning)
"Sentence 3." - explanation: (identifies the Italian word and its meaning)
...

## FALSE
"Sentence 1." - explanation: (confirms all words are English)
"Sentence 2." - explanation: (confirms all words are English)
"Sentence 3." - explanation: (confirms all words are English)
...

Examples:
- "Sarah enjoyed her piccolo sandwich during the picnic." - explanation: ("piccolo" is Italian for "small")
- "The bambini played happily in the backyard." - explanation: ("bambini" is Italian for "children")
- "John bought fresh vegetables from the local market." - explanation: (all words are English)

Make sentences natural, varied, and sensible. The Italian words should feel like a natural code-switch, not forced insertions."""

In [159]:
# Render it with different values:
data_generation_prompt = render_prompt(
    italian_word_template,
    total=60,        # 80 sentences total
    true_count=30,   # 40 "True" sentences
    false_count=30   # 40 "False" sentences
)
print(data_generation_prompt)
reply = await chat(openai_4op1, data_generation_prompt, n=1, temperature=1., print_choices=False)

Generate 60 sentences following this template: "[Subject] [verb] [object] [prepositional phrase]."

For 30 sentences: Include EXACTLY ONE Italian word naturally integrated into the otherwise English sentence
For 30 sentences: Use ONLY English words, with no Italian words

IMPORTANT FOR TRUE SENTENCES:
- Use only common English names for people (like John, Mary, Sarah, David, etc.)
- The Italian word should NOT be a name or proper noun
- The Italian word must be a common noun, verb, adjective, or adverb
- Vary the part of speech AND POSITION of the Italian word across examples
- Place Italian words in different positions - beginning, middle, or end of the sentence
- Avoid stereotypical Italian words like "pizza", "pasta", "gelato"
- The Italian word should fit naturally in the sentence context

Format your response exactly as follows:

## TRUE
"Sentence 1." - explanation: (identifies the Italian word and its meaning)
"Sentence 2." - explanation: (identifies the Italian word and its mean

In [160]:
response = reply.choices[0].message.content
sentences_dict = extract_sentences2(response)
print(f"Found {len(sentences_dict['TRUE'])} TRUE sentences and {len(sentences_dict['FALSE'])} FALSE sentences")
save_prompt_response(data_generation_prompt, response, "alliteration_rule_dataset_it_60_simpl", subfolder="logs")

Found 30 TRUE sentences and 30 FALSE sentences
Saved prompt and response to logs/alliteration_rule_dataset_it_60_simpl_20250505_102735.txt


'logs/alliteration_rule_dataset_it_60_simpl_20250505_102735.txt'

In [161]:
create_dataset_json(sentences_dict, 30, 30, "alliteration_rule_dataset_it_60_30_30_simpl")

Dataset saved to datasets/alliteration_rule_dataset_it_60_30_30_simpl.json
Train samples: 30, Test samples: 30
Train labels: 16 True, 14 False
Test labels: 14 True, 16 False


'datasets/alliteration_rule_dataset_it_60_30_30_simpl.json'

In [162]:
intro = "You are an expert text classifier that assigns labels based on specific rules. You will be provided with training examples that demonstrate the classification rule, followed by test examples that you need to classify using the same rule."
question = "Classify each of the following test examples as either True or False based on the pattern you observed in the training data:"
instructions = "Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation."
outro = "Remember to output only True or False for each test example, maintaining the same order as presented. Do not include any other text in your response."

# natural language articulation
classification_prompt = build_classification_prompt(
    "datasets/alliteration_rule_dataset_it_60_30_30_simpl.json",
    intro=intro,
    instructions=instructions,
    question=question,
)

In [163]:
gold_labels = load_expected_labels("datasets/alliteration_rule_dataset_it_60_30_30_simpl.json")
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

{'run_accuracies': [np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0)], 'mean_accuracy': 1.0, 'std_accuracy': 0.0, 'overall_accuracy': 1.0, 'all_outs': ['False\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue', 'False\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue', 'False\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue', 'False\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue']}


In [165]:
# natural language articulation
intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples and asked to determine the underlying rule that distinguishes True from False cases."
question = "What is the exact rule that determines whether an example is labeled True or False?"
instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    "datasets/alliteration_rule_dataset_it_60_30_30_simpl.json",
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
A sentence is labeled True if it contains at least one Italian word or phrase; otherwise, it is labeled False.

— Choice 2 (openai/gpt-4.1) —
The sentence is labeled True if it contains at least one Italian word; otherwise, it is labeled False.

— Choice 3 (openai/gpt-4.1) —
The sentence is labeled True if it contains at least one Italian word; otherwise, it is labeled False.

— Choice 4 (openai/gpt-4.1) —
A sentence is labeled True if it contains at least one Italian word; otherwise, it is labeled False.

— Choice 5 (openai/gpt-4.1) —
A sentence is labeled True if it contains at least one Italian word; otherwise, it is labeled False.


### Rule 16: Outdoors

In [168]:
outdoors_template = """ Generate $total sentences following this template: "[Subject] [verb] [object/activity] [preposition] [location] [time phrase]."

For $true_count sentences: The described activity MUST occur OUTDOORS (in nature, outside buildings, or open-air locations)
For $false_count sentences: The described activity MUST occur INDOORS (inside buildings or enclosed spaces)

IMPORTANT:
- Outdoor locations include: parks, forests, beaches, mountains, gardens, playgrounds, streets, fields, lakes, etc.
- Indoor locations include: houses, offices, schools, malls, restaurants, theaters, museums, libraries, etc.
- Make the setting explicit - either directly state the location or use clear context clues
- Vary the activities, locations, and time phrases across examples
- Use a mix of individual and group activities
- Include different weather conditions and times of day for outdoor activities

Format your response exactly as follows:

## TRUE
"Sentence 1." - explanation: (identifies the outdoor location/setting)
"Sentence 2." - explanation: (identifies the outdoor location/setting)
"Sentence 3." - explanation: (identifies the outdoor location/setting)
...

## FALSE
"Sentence 1." - explanation: (identifies the indoor location/setting)
"Sentence 2." - explanation: (identifies the indoor location/setting)
"Sentence 3." - explanation: (identifies the indoor location/setting)
...

Examples:
- "Children played soccer in the park after school." - explanation: (outdoors in a park)
- "The hikers climbed steep trails through the mountains during sunrise." - explanation: (outdoors on mountain trails)
- "The family watched movies in their living room last night." - explanation: (indoors in a living room)
- "Students studied chemistry in the school laboratory every afternoon." - explanation: (indoors in a laboratory)

Make sentences natural, varied, and unambiguous about the setting.
"""

In [174]:
# Render it with different values:
data_generation_prompt = render_prompt(
    outdoors_template,
    total=16,        # 80 sentences total
    true_count=8,   # 40 "True" sentences
    false_count=8   # 40 "False" sentences
)
reply = await chat(openai_4op1, data_generation_prompt, n=1, temperature=1., print_choices=False)

In [175]:
response = reply.choices[0].message.content
sentences_dict = extract_sentences2(response)
print(f"Found {len(sentences_dict['TRUE'])} TRUE sentences and {len(sentences_dict['FALSE'])} FALSE sentences")
save_prompt_response(data_generation_prompt, response, "outdoors_dataset_it_16", subfolder="logs")

Found 8 TRUE sentences and 8 FALSE sentences
Saved prompt and response to logs/outdoors_dataset_it_16_20250505_104815.txt


'logs/outdoors_dataset_it_16_20250505_104815.txt'

In [None]:
create_dataset_json(sentences_dict, 8, 8, "outdoors_dataset_it_16_8_8")

Dataset saved to datasets/outdoors_dataset_it_16_8_8.json
Train samples: 8, Test samples: 8
Train labels: 5 True, 3 False
Test labels: 3 True, 5 False


'datasets/outdoors_dataset_it_16_8_8.json'

In [178]:
intro = "You are an expert text classifier that assigns labels based on specific rules. You will be provided with training examples that demonstrate the classification rule, followed by test examples that you need to classify using the same rule."
question = "Classify each of the following test examples as either True or False based on the pattern you observed in the training data:"
instructions = "Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation."
outro = "Remember to output only True or False for each test example, maintaining the same order as presented. Do not include any other text in your response."

# natural language articulation
classification_prompt = build_classification_prompt(
    "datasets/outdoors_dataset_it_16_8_8.json",
    intro=intro,
    instructions=instructions,
    question=question,
)

In [179]:
gold_labels = load_expected_labels("datasets/outdoors_dataset_it_16_8_8.json")
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

{'run_accuracies': [np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0)], 'mean_accuracy': 1.0, 'std_accuracy': 0.0, 'overall_accuracy': 1.0, 'all_outs': ['False\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue', 'False\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue', 'False\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue', 'False\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue']}


In [180]:
# natural language articulation
intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples and asked to determine the underlying rule that distinguishes True from False cases."
question = "What is the exact rule that determines whether an example is labeled True or False?"
instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    "datasets/outdoors_dataset_it_16_8_8.json",
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
The activity takes place outdoors.

— Choice 2 (openai/gpt-4.1) —
The activity must take place outdoors.

— Choice 3 (openai/gpt-4.1) —
The activity must take place outdoors (not inside a building or room).

— Choice 4 (openai/gpt-4.1) —
The activity takes place outdoors (not inside a building or vehicle).

— Choice 5 (openai/gpt-4.1) —
The activity must take place outdoors, not indoors.


### Rule 18: Third Word

In [182]:
third_word_template = """ Generate $total sentences following this template: "[Subject] [verb] [third_word] [rest of sentence]."

For $true_count sentences: The THIRD word in the sentence MUST start with a vowel (a, e, i, o, u)
For $false_count sentences: The THIRD word in the sentence MUST start with a consonant

IMPORTANT:
- The third word position is what matters, regardless of its part of speech
- Ensure the third word is clearly identifiable (no contractions or hyphenated words before it)
- Vary the parts of speech that appear in the third position (nouns, adjectives, verbs, etc.)
- Create natural, grammatical sentences with varied vocabulary
- Do NOT include any hints or patterns that might reveal the rule (e.g., don't make TRUE sentences shorter or use specific themes)
- Names and proper nouns are acceptable in the third position

Format your response exactly as follows:

## TRUE
"Sentence 1." - explanation: (identifies the third word and its starting vowel)
"Sentence 2." - explanation: (identifies the third word and its starting vowel)
"Sentence 3." - explanation: (identifies the third word and its starting vowel)
...

## FALSE
"Sentence 1." - explanation: (identifies the third word and its starting consonant)
"Sentence 2." - explanation: (identifies the third word and its starting consonant)
"Sentence 3." - explanation: (identifies the third word and its starting consonant)
...

Examples:
- "John painted elegant portraits for the local gallery." - explanation: (third word "elegant" starts with 'e')
- "Children build impressive sandcastles during summer vacation." - explanation: (third word "impressive" starts with 'i')
- "The teacher reviewed several homework assignments yesterday." - explanation: (third word "several" starts with 's', a consonant)
- "My dog chased squirrels through the neighborhood park." - explanation: (third word "squirrels" starts with 's', a consonant)

Make sentences varied and natural while strictly following the rule.
"""

In [208]:
# Render it with different values:
data_generation_prompt = render_prompt(
    third_word_template,
    total=40,        # 80 sentences total
    true_count=20,   # 40 "True" sentences
    false_count=20   # 40 "False" sentences
)
reply = await chat(openai_4op1, data_generation_prompt, n=1, temperature=1., print_choices=False)

In [209]:
response = reply.choices[0].message.content
sentences_dict = extract_sentences2(response)
print(f"Found {len(sentences_dict['TRUE'])} TRUE sentences and {len(sentences_dict['FALSE'])} FALSE sentences")
save_prompt_response(data_generation_prompt, response, "3word_dataset_40", subfolder="logs")
create_dataset_json(sentences_dict, 0, 40, "3word_dataset_it_100_50_50")
print()
print(sentences_dict)

Found 20 TRUE sentences and 21 FALSE sentences
Saved prompt and response to logs/3word_dataset_40_20250505_115626.txt
Dataset saved to datasets/3word_dataset_it_100_50_50.json
Train samples: 0, Test samples: 40
Train labels: 0 True, 0 False
Test labels: 19 True, 21 False

{'TRUE': ['Maria adopted energetic cats from the shelter.', 'Students invented amazing devices for the science fair.', 'Timothy offered irresistible treats to his puppy.', 'The committee examined unusual proposals carefully yesterday.', 'Cats enjoy outdoor naps during warm afternoons.', 'David earned outstanding grades this semester.', 'She attached elegant ribbons to each bouquet.', 'Roberta organized efficient meetings every Monday.', 'He introduced Olivia at the ceremony.', 'The crew accepted urgent assignments last week.', 'My uncle invested enormous sums in several companies.', 'Jasmine ordered ice-cream after dinner yesterday.', 'Parents offer invaluable support to their children.', 'Michael assisted injured hik

In [None]:
ntrain = 40
ntest = 40
intro = f"You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly {ntrain} training examples that demonstrate the classification rule, followed by exactly {ntest} test examples that you need to classify using the same rule."
question = "Classify each of the following test examples as either True or False based on the pattern you observed in the training data:"
instructions = "Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation."
outro = f"Remember to output only True or False for each of the {ntest} test examples, maintaining the same order as presented. Do not include any other text in your response."

# natural language articulation
classification_prompt = build_classification_prompt(
    "datasets/3word_dataset_it_80_40_40.json",
    intro=intro,
    instructions=instructions,
    question=question,
)

print(classification_prompt)

You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly 40 training examples that demonstrate the classification rule, followed by exactly 40 test examples that you need to classify using the same rule.
Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation.

### Training examples
"Erin avoids unpleasant conflicts whenever possible." -> True
"The girl asked careful questions during the tour." -> False
"Josh described friendly neighbors to us." -> False
"My aunt organized exciting trips every year." -> True
"We heard classical music in the park." -> False
"Julia borrowed several blankets for camping." -> False
"Janet repainted wooden chairs yesterday." -> False
"Mason solved tough riddles yesterday." -> False
"People discuss important events 

In [206]:
gold_labels = load_expected_labels("datasets/3word_dataset_it_80_40_40.json")
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

{'run_accuracies': [np.float64(0.675), np.float64(0.675), np.float64(0.675), np.float64(0.675)], 'mean_accuracy': 0.675, 'std_accuracy': 0.0, 'overall_accuracy': 0.675, 'all_outs': ['True\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nTrue', 'True\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nTrue\nTrue', 'True\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nTrue\nTrue', 'True\nFalse\nTrue\nFalse\nFa

In [207]:
# natural language articulation
intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples and asked to determine the underlying rule that distinguishes True from False cases."
question = "What is the exact rule that determines whether an example is labeled True or False?"
instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    "datasets/3word_dataset_it_80_40_40.json",
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
The sentence is labeled True if it contains at least one adjective that begins with a vowel (a, e, i, o, or u).

— Choice 2 (openai/gpt-4.1) —
The sentence is labeled True if it contains at least one adjective that starts with a vowel (a, e, i, o, or u).

— Choice 3 (openai/gpt-4.1) —
The sentence is labeled True if and only if the direct object noun immediately following the verb begins with a vowel.

— Choice 4 (openai/gpt-4.1) —
The sentence is labeled True if it contains at least one adjective that begins with a vowel (a, e, i, o, or u).

— Choice 5 (openai/gpt-4.1) —
The sentence is labeled True if it contains at least one adjective that begins with a vowel (a, e, i, o, u); otherwise, it is labeled False.


### La terza parola e' in Italiano

In [225]:
terza_parola_template = """ Genera $total frasi seguendo questo modello: "[Soggetto] [verbo] [terza_parola] [resto della frase]."

Per $true_count frasi: La TERZA parola della frase DEVE iniziare con una vocale (a, e, i, o, u)
Per $false_count frasi: La TERZA parola della frase DEVE iniziare con una consonante

IMPORTANTE:
- Ciò che conta è la posizione della terza parola, indipendentemente dalla sua funzione grammaticale
- Assicurati che la terza parola sia chiaramente identificabile (niente contrazioni o parole con trattino prima)
- Varia le parti del discorso che appaiono in terza posizione (sostantivi, aggettivi, verbi, ecc.)
- Crea frasi naturali e grammaticalmente corrette con un vocabolario variato
- NON includere indizi o schemi che potrebbero rivelare la regola (ad esempio, non rendere le frasi VERE più corte o usare temi specifici)
- I nomi propri sono accettabili in terza posizione

Formatta la tua risposta esattamente come segue:

## TRUE
"Frase 1." - spiegazione: (identifica la terza parola e la sua vocale iniziale)
"Frase 2." - spiegazione: (identifica la terza parola e la sua vocale iniziale)
"Frase 3." - spiegazione: (identifica la terza parola e la sua vocale iniziale)
...

## FALSE
"Frase 1." - spiegazione: (identifica la terza parola e la sua consonante iniziale)
"Frase 2." - spiegazione: (identifica la terza parola e la sua consonante iniziale)
"Frase 3." - spiegazione: (identifica la terza parola e la sua consonante iniziale)
...

Esempi:
- "Marco dipinge eleganti ritratti per la galleria locale." - spiegazione: (la terza parola "eleganti" inizia con 'e')
- "I bambini costruiscono incredibili castelli di sabbia durante l'estate." - spiegazione: (la terza parola "incredibili" inizia con 'i')
- "La professoressa corregge diversi compiti di matematica." - spiegazione: (la terza parola "diversi" inizia con 'd', una consonante)
- "Il mio cane rincorre scoiattoli nel parco del quartiere." - spiegazione: (la terza parola "scoiattoli" inizia con 's', una consonante)

Crea frasi varie e naturali seguendo rigorosamente la regola.
"""

In [226]:
# Render it with different values:
data_generation_prompt = render_prompt(
    terza_parola_template,
    total=80,        # 80 sentences total
    true_count=40,   # 40 "True" sentences
    false_count=40   # 40 "False" sentences
)
reply = await chat(openai_4op1, data_generation_prompt, n=1, temperature=1., print_choices=False)

In [231]:
response = reply.choices[0].message.content
sentences_dict = extract_sentences3(response)
print(f"Found {len(sentences_dict['TRUE'])} TRUE sentences and {len(sentences_dict['FALSE'])} FALSE sentences")

Found 40 TRUE sentences and 40 FALSE sentences


In [232]:
save_prompt_response(data_generation_prompt, response, "3word_dataset_80_ita", subfolder="logs")
create_dataset_json(sentences_dict, 40, 40, "3word_dataset_80_40_40_ita")
print()
print(sentences_dict)

Saved prompt and response to logs/3word_dataset_80_ita_20250505_125553.txt
Dataset saved to datasets/3word_dataset_80_40_40_ita.json
Train samples: 40, Test samples: 40
Train labels: 19 True, 21 False
Test labels: 21 True, 19 False

{'TRUE': ['Giulia racconta emozionanti storie davanti al camino.', 'Francesco porta insalata fresca al picnic di domenica.', 'I ragazzi ordinate antipasti prima del pranzo principale.', 'Paolo osserva una aquila sopra la montagna.', 'Le maestre assegnano esercizi difficili agli studenti.', 'Silvia indossa eleganti orecchini durante la festa.', 'Mario ama ascoltare opere liriche la sera.', 'Tu possiedi una automobile molto affidabile.', 'Alessia organizza incontri ogni mese in centro.', 'Sara utilizza internet frequentemente per lavorare.', 'Il direttore annuncia un evento speciale domani.', 'Lorena compra abiti online con sconti.', 'Luca usa energia rinnovabile nella sua casa.', 'Il cane entra impaziente in casa ogni mattina.', 'Mia madre offre una ombra fr

In [233]:
ntrain = 40
ntest = 40
intro = f"You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly {ntrain} training examples that demonstrate the classification rule, followed by exactly {ntest} test examples that you need to classify using the same rule."
question = "Classify each of the following test examples as either True or False based on the pattern you observed in the training data:"
instructions = "Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation."
outro = f"Remember to output only True or False for each of the {ntest} test examples, maintaining the same order as presented. Do not include any other text in your response."

# natural language articulation
classification_prompt = build_classification_prompt(
    "datasets/3word_dataset_80_40_40_ita.json",
    intro=intro,
    instructions=instructions,
    question=question,
)

print(classification_prompt)

You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly 40 training examples that demonstrate the classification rule, followed by exactly 40 test examples that you need to classify using the same rule.
Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation.

### Training examples
"Andrea scrive lettere personalizzate agli amici." -> False
"Luca trova monete antiche nel giardino." -> False
"Le allieve disegnano farfalle colorate nel quaderno." -> False
"Mia madre offre una ombra fresca sotto il pino." -> True
"La nonna ama insegnare inglese ai nipoti." -> True
"I soldati portano zaini pesanti sulle spalle." -> False
"Il gatto graffia divano nuovo ogni notte." -> False
"Quel negozio offre utili informazioni ai clienti." -> True
"Tu impari ogn

In [234]:
gold_labels = load_expected_labels("datasets/3word_dataset_80_40_40_ita.json")
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

{'run_accuracies': [np.float64(0.7), np.float64(0.75), np.float64(0.675), np.float64(0.675)], 'mean_accuracy': 0.7, 'std_accuracy': 0.030618621784789708, 'overall_accuracy': 0.7, 'all_outs': ['False\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue', 'False\nFalse\nFalse\nFalse\nFalse\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue', 'False\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue', 'False\nFalse\

In [235]:
# natural language articulation
intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples and asked to determine the underlying rule that distinguishes True from False cases."
question = "What is the exact rule that determines whether an example is labeled True or False?"
instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    "datasets/3word_dataset_80_40_40_ita.json",
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
The sentence is labeled True if the main verb is in the third person singular present indicative form.

— Choice 2 (openai/gpt-4.1) —
La frase è True se contiene un articolo indeterminativo ("un", "una", "uno") seguito immediatamente da un sostantivo.

— Choice 3 (openai/gpt-4.1) —
La frase è True se contiene un articolo indeterminativo ("un", "una", "uno") seguito immediatamente da un sostantivo; altrimenti è False.

— Choice 4 (openai/gpt-4.1) —
La frase è True se il soggetto è singolare e femminile oppure è un pronome personale singolare (io, tu, lei) o un nome proprio femminile, altrimenti è False.

— Choice 5 (openai/gpt-4.1) —
La frase è etichettata True se contiene un sostantivo singolare preceduto dall’articolo indeterminativo “un” o “una”.


In [236]:
def convert_to_jsonl_format(llm_response, filename, preamble="Classify: ", include_explanation=False):
    """
    Convert LLM response to JSONL format for fine-tuning and save to a file.
    
    Args:
        llm_response (str): The response from the LLM
        filename (str): Name of the file to save (without extension)
        preamble (str): Text to prepend to each sentence
        include_explanation (bool): Whether to include explanations in responses
        
    Returns:
        str: Path to the saved file
    """
    import os
    import json
    import re
    
    # Create datasets directory if it doesn't exist
    os.makedirs("datasets", exist_ok=True)
    
    jsonl_entries = []
    
    # Extract true/false sections from response
    true_section = ""
    false_section = ""
    
    if "## TRUE" in llm_response or "## VERO" in llm_response:
        if "## TRUE" in llm_response:
            sections = llm_response.split("## TRUE", 1)
            true_label = "True"
            false_label = "False"
        else:
            sections = llm_response.split("## VERO", 1)
            true_label = "Vero"
            false_label = "Falso"
            
        if len(sections) > 1:
            remaining = sections[1]
            if "## FALSE" in remaining:
                parts = remaining.split("## FALSE", 1)
                true_section = parts[0].strip()
                false_section = parts[1].strip()
            elif "## FALSO" in remaining:
                parts = remaining.split("## FALSO", 1)
                true_section = parts[0].strip()
                false_section = parts[1].strip()
            else:
                true_section = remaining.strip()
    
    # Process TRUE section
    if true_section:
        lines = true_section.split('\n')
        for line in lines:
            line = line.strip()
            if not line:
                continue
                
            # Remove numbering like "1. ", "2. ", etc.
            line = re.sub(r'^\d+\.\s*', '', line)
            
            # Extract sentence and explanation
            match = re.search(r'"([^"]+)"\s*-\s*spiegazione:\s*(.+)', line)
            if not match:
                match = re.search(r'"([^"]+)"\s*-\s*explanation:\s*(.+)', line)
                
            if match:
                sentence = match.group(1)
                explanation = match.group(2).strip() if include_explanation else ""
                
                # Create JSONL entry
                if include_explanation:
                    assistant_content = f"{true_label}. {explanation}"
                else:
                    assistant_content = f"{true_label}"
                    
                entry = {
                    "messages": [
                        {"role": "user", "content": f"{preamble}{sentence}"},
                        {"role": "assistant", "content": assistant_content}
                    ]
                }
                jsonl_entries.append(json.dumps(entry, ensure_ascii=False))
    
    # Process FALSE section
    if false_section:
        lines = false_section.split('\n')
        for line in lines:
            line = line.strip()
            if not line:
                continue
                
            # Remove numbering like "1. ", "2. ", etc.
            line = re.sub(r'^\d+\.\s*', '', line)
            
            # Extract sentence and explanation
            match = re.search(r'"([^"]+)"\s*-\s*spiegazione:\s*(.+)', line)
            if not match:
                match = re.search(r'"([^"]+)"\s*-\s*explanation:\s*(.+)', line)
                
            if match:
                sentence = match.group(1)
                explanation = match.group(2).strip() if include_explanation else ""
                
                # Create JSONL entry
                if include_explanation:
                    assistant_content = f"{false_label}. {explanation}"
                else:
                    assistant_content = f"{false_label}"
                    
                entry = {
                    "messages": [
                        {"role": "user", "content": f"{preamble}{sentence}"},
                        {"role": "assistant", "content": assistant_content}
                    ]
                }
                jsonl_entries.append(json.dumps(entry, ensure_ascii=False))
    
    # Save to file
    file_path = os.path.join("datasets", f"{filename}.jsonl")
    with open(file_path, "w", encoding="utf-8") as f:
        for entry in jsonl_entries:
            f.write(entry + "\n")
    
    print(f"Saved {len(jsonl_entries)} examples to {file_path}")
    return file_path

In [238]:
# Render it with different values:
data_generation_prompt = render_prompt(
    terza_parola_template,
    total=150,        # 80 sentences total
    true_count=75,   # 40 "True" sentences
    false_count=75   # 40 "False" sentences
)
reply = await chat(openai_4op1, data_generation_prompt, n=1, temperature=1., print_choices=False)
response = reply.choices[0].message.content
sentences_dict = extract_sentences3(response)
print(f"Found {len(sentences_dict['TRUE'])} TRUE sentences and {len(sentences_dict['FALSE'])} FALSE sentences")

Found 75 TRUE sentences and 75 FALSE sentences


In [242]:
file_path = convert_to_jsonl_format(response, "training_3word_dataset_150_ita_class_explain", include_explanation=True, preamble ="Classify and explain: ")
file_path = convert_to_jsonl_format(response, "training_3word_dataset_150_ita_class_only", include_explanation=False)
file_path = convert_to_jsonl_format(response, "training_3word_dataset_150_ita", include_explanation=True)

Saved 150 examples to datasets/training_3word_dataset_150_ita_class_explain.jsonl
Saved 150 examples to datasets/training_3word_dataset_150_ita_class_only.jsonl
Saved 150 examples to datasets/training_3word_dataset_150_ita.jsonl


In [None]:
def create_dataset_json_jsonl(llm_response, filename, preamble="Classify: ", 
                          include_explanation=False, train_ratio=0.8, seed=42):
    """
    Process LLM response to create both JSON dataset and JSONL training files with
    consistent train/test splits.
    
    Args:
        llm_response (str): The response from the LLM
        filename (str): Base name for the files (without extension)
        preamble (str): Text to prepend to each sentence in JSONL files
        include_explanation (bool): Whether to include explanations in JSONL responses
        train_ratio (float): Ratio of examples to use for training (0.0-1.0)
        seed (int): Random seed for reproducibility
        
    Returns:
        tuple: Paths to the JSON and JSONL files (json_path, train_jsonl_path, test_jsonl_path)
    """
    import os
    import json
    import re
    import random
    
    # Set random seed for reproducibility
    random.seed(seed)
    
    # Create datasets directory if it doesn't exist
    os.makedirs("datasets", exist_ok=True)
    
    # Initialize data structure
    dataset = {
        "metadata": {
            "train_ratio": train_ratio,
            "date_created": import_datetime_and_get_now(),
            "total_examples": 0
        },
        "train": [],
        "test": []
    }
    
    # Extract samples from LLM response
    true_samples = []
    false_samples = []
    
    # Determine labels based on language
    if "## TRUE" in llm_response:
        sections = llm_response.split("## TRUE", 1)
        true_label = "True"
        false_label = "False"
    else:
        sections = llm_response.split("## VERO", 1)
        true_label = "Vero"
        false_label = "Falso"
    
    # Extract TRUE section
    if len(sections) > 1:
        remaining = sections[1]
        if "## FALSE" in remaining:
            parts = remaining.split("## FALSE", 1)
            true_section = parts[0].strip()
            false_section = parts[1].strip()
        elif "## FALSO" in remaining:
            parts = remaining.split("## FALSO", 1)
            true_section = parts[0].strip()
            false_section = parts[1].strip()
        else:
            true_section = remaining.strip()
            false_section = ""
            
        # Process TRUE samples
        lines = true_section.split('\n')
        for line in lines:
            line = line.strip()
            if not line:
                continue
                
            # Remove numbering like "1. ", "2. ", etc.
            line = re.sub(r'^\d+\.\s*', '', line)
            
            # Extract sentence and explanation
            match = re.search(r'"([^"]+)"\s*-\s*(?:spiegazione|explanation):\s*(.+)', line)
            if match:
                sentence = match.group(1)
                explanation = match.group(2).strip()
                
                true_samples.append({
                    "text": sentence,
                    "label": true_label,
                    "explanation": explanation
                })
        
        # Process FALSE samples
        if false_section:
            lines = false_section.split('\n')
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                    
                # Remove numbering
                line = re.sub(r'^\d+\.\s*', '', line)
                
                # Extract sentence and explanation
                match = re.search(r'"([^"]+)"\s*-\s*(?:spiegazione|explanation):\s*(.+)', line)
                if match:
                    sentence = match.group(1)
                    explanation = match.group(2).strip()
                    
                    false_samples.append({
                        "text": sentence,
                        "label": false_label,
                        "explanation": explanation
                    })
    
    # Combine and shuffle all samples
    all_samples = true_samples + false_samples
    random.shuffle(all_samples)
    
    # Split into train and test sets
    split_index = int(len(all_samples) * train_ratio)
    train_samples = all_samples[:split_index]
    test_samples = all_samples[split_index:]
    
    # Populate the dataset structure
    dataset["train"] = train_samples
    dataset["test"] = test_samples
    dataset["metadata"]["total_examples"] = len(all_samples)
    
    # Save JSON dataset
    json_path = os.path.join("datasets", f"{filename}.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)
    
    # Create JSONL files for training and testing
    train_jsonl_path = os.path.join("datasets", f"{filename}_train.jsonl")
    test_jsonl_path = os.path.join("datasets", f"{filename}_test.jsonl")
    
    # Write training JSONL
    with open(train_jsonl_path, "w", encoding="utf-8") as f:
        for sample in train_samples:
            if include_explanation:
                assistant_content = f"{sample['label']}. {sample['explanation']}"
            else:
                assistant_content = f"{sample['label']}"
                
            entry = {
                "messages": [
                    {"role": "user", "content": f"{preamble}{sample['text']}"},
                    {"role": "assistant", "content": assistant_content}
                ]
            }
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    
    # Write testing JSONL
    with open(test_jsonl_path, "w", encoding="utf-8") as f:
        for sample in test_samples:
            if include_explanation:
                assistant_content = f"{sample['label']}. {sample['explanation']}"
            else:
                assistant_content = f"{sample['label']}"
                
            entry = {
                "messages": [
                    {"role": "user", "content": f"{preamble}{sample['text']}"},
                    {"role": "assistant", "content": assistant_content}
                ]
            }
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    
    print(f"Created datasets with {len(train_samples)} training and {len(test_samples)} testing examples")
    print(f"Files saved to:")
    print(f"  - {json_path}")
    print(f"  - {train_jsonl_path}")
    print(f"  - {test_jsonl_path}")
    
    return json_path, train_jsonl_path, test_jsonl_path

def import_datetime_and_get_now():
    """Helper function to get current datetime as ISO format string"""
    from datetime import datetime
    return datetime.now().isoformat()

In [254]:
# Process Italian third-word-vowel dataset
json_path, train_jsonl_path, test_jsonl_path = create_dataset_json_jsonl(
    response,
    "training_3word_dataset_150_ita_no_explanation",
    preamble="Classifica: ",  # Italian preamble
    include_explanation=False,
    train_ratio=0.7,
    seed=42  # For reproducibility
)

Created datasets with 105 training and 45 testing examples
Files saved to:
  - datasets/training_3word_dataset_150_ita_no_explanation.json
  - datasets/training_3word_dataset_150_ita_no_explanation_train.jsonl
  - datasets/training_3word_dataset_150_ita_no_explanation_test.jsonl


#### Fine-tuned with class-explanation 

In [283]:
ntrain = 30
ntest = 30
intro = f"You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly {ntrain} training examples that demonstrate the classification rule, followed by exactly {ntest} test examples that you need to classify using the same rule."
question = "Classify each of the following test examples as either True or False based on the pattern you observed in the training data:"
instructions = "Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation."
outro = f"Remember to output only True or False for each of the {ntest} test examples, maintaining the same order as presented. Do not include any other text in your response."

# natural language articulation
classification_prompt = build_classification_prompt(
    "datasets/training_3word_dataset_60_ita_class_explain_subsampled.json",
    intro=intro,
    instructions=instructions,
    question=question,
)

print(classification_prompt)

You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly 30 training examples that demonstrate the classification rule, followed by exactly 30 test examples that you need to classify using the same rule.
Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation.

### Training examples
"Rita compra biscotti integrali per colazione." -> False
"Lidia trova quadri antichi nelle soffitte." -> False
"Luca scrive pensieri divertenti nel diario." -> False
"Andrea affronta una enorme sfida personale." -> True
"Davide scrive lettere lunghe agli amici." -> False
"Federico affronta emozioni intense durante gli esami." -> True
"Giovanni svolge compiti difficili con calma." -> False
"Lavinia ascolta orchestra eseguire brani classici." -> True
"Gabriele esegue

In [284]:
gold_labels = load_expected_labels("datasets/training_3word_dataset_60_ita_class_explain_subsampled.json")
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

{'run_accuracies': [np.float64(0.7333333333333333), np.float64(0.8333333333333334), np.float64(0.7666666666666667), np.float64(0.7666666666666667)], 'mean_accuracy': 0.775, 'std_accuracy': 0.03632415786283897, 'overall_accuracy': 0.775, 'all_outs': ['False\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse', 'True\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse', 'True\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse', 'True\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nFal

In [285]:
ft_ita_class_explain = "openai/ft:gpt-4.1-2025-04-14:jc:ita-class-explain:BTyKZGKi"

In [287]:
gold_labels = load_expected_labels("datasets/training_3word_dataset_60_ita_class_explain_subsampled.json")
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=ft_ita_class_explain, temperature=0.4)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

{'run_accuracies': [np.float64(0.8666666666666667), np.float64(0.8333333333333334), np.float64(0.8333333333333334), np.float64(0.8)], 'mean_accuracy': 0.8333333333333335, 'std_accuracy': 0.02357022603955158, 'overall_accuracy': 0.8333333333333334, 'all_outs': ['True\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse', 'True\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse', 'True\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse', 'True\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\

In [289]:
# natural language articulation
intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples and asked to determine the underlying rule that distinguishes True from False cases."
question = "What is the exact rule that determines whether an example is labeled True or False?"
instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    "datasets/training_3word_dataset_60_ita_class_explain_subsampled.json",
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(ft_ita_class_explain, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/ft:gpt-4.1-2025-04-14:jc:ita-class-explain:BTyKZGKi) —
An example is labeled True if the fourth word in the sentence starts with the letter 'e'.

— Choice 2 (openai/ft:gpt-4.1-2025-04-14:jc:ita-class-explain:BTyKZGKi) —
An example is True if the fourth word in the sentence starts with the letter "e".

— Choice 3 (openai/ft:gpt-4.1-2025-04-14:jc:ita-class-explain:BTyKZGKi) —
An example is True if the third word in the sentence starts with the letter 'e'.

— Choice 4 (openai/ft:gpt-4.1-2025-04-14:jc:ita-class-explain:BTyKZGKi) —
An example is labeled True if the third word in the sentence starts with the letter "e".

— Choice 5 (openai/ft:gpt-4.1-2025-04-14:jc:ita-class-explain:BTyKZGKi) —
An example is labeled True if the third word in the sentence starts with the letter 'e'.


In [292]:
ntrain = 40
ntest = 40
intro = f"You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly {ntrain} training examples that demonstrate the classification rule, followed by exactly {ntest} test examples that you need to classify using the same rule."
question = "Classify each of the following test examples as either True or False based on the pattern you observed in the training data:"
instructions = "Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation."
outro = f"Remember to output only True or False for each of the {ntest} test examples, maintaining the same order as presented. Do not include any other text in your response."

# natural language articulation
classification_prompt = build_classification_prompt(
    "datasets/3word_dataset_it_80_40_40.json",
    intro=intro,
    instructions=instructions,
    question=question,
)

print(classification_prompt)

You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly 40 training examples that demonstrate the classification rule, followed by exactly 40 test examples that you need to classify using the same rule.
Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation.

### Training examples
"Erin avoids unpleasant conflicts whenever possible." -> True
"The girl asked careful questions during the tour." -> False
"Josh described friendly neighbors to us." -> False
"My aunt organized exciting trips every year." -> True
"We heard classical music in the park." -> False
"Julia borrowed several blankets for camping." -> False
"Janet repainted wooden chairs yesterday." -> False
"Mason solved tough riddles yesterday." -> False
"People discuss important events 

In [293]:
gold_labels = load_expected_labels("datasets/3word_dataset_it_80_40_40.json")
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=ft_ita_class_explain, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

{'run_accuracies': [np.float64(0.725), np.float64(0.7), np.float64(0.75), np.float64(0.675)], 'mean_accuracy': 0.7124999999999999, 'std_accuracy': 0.02795084971874736, 'overall_accuracy': 0.7125, 'all_outs': ['True\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nTrue\nTrue', 'True\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nTrue\nTrue\nTrue', 'True\nFalse\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue', 'True\

In [294]:
# natural language articulation
intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples and asked to determine the underlying rule that distinguishes True from False cases."
question = "What is the exact rule that determines whether an example is labeled True or False?"
instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    "datasets/3word_dataset_it_80_40_40.json",
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(ft_ita_class_explain, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/ft:gpt-4.1-2025-04-14:jc:ita-class-explain:BTyKZGKi) —
A sentence is labeled True if it contains at least one word starting with the letter 'e' (case-insensitive).

— Choice 2 (openai/ft:gpt-4.1-2025-04-14:jc:ita-class-explain:BTyKZGKi) —
A sentence is labeled True if it contains at least one word that starts with the letter 'e' (case-insensitive) after the first word.

— Choice 3 (openai/ft:gpt-4.1-2025-04-14:jc:ita-class-explain:BTyKZGKi) —
A sentence is True if it contains at least one word that starts with the letter 'e'.

— Choice 4 (openai/ft:gpt-4.1-2025-04-14:jc:ita-class-explain:BTyKZGKi) —
A sentence is labeled True if it contains at least one word that starts with the letter 'e' (case-insensitive).

— Choice 5 (openai/ft:gpt-4.1-2025-04-14:jc:ita-class-explain:BTyKZGKi) —
A sentence is labeled True if it contains at least one word starting with the letter 'e' (case-insensitive) after the first word.


### Rule ?: Apples

In [257]:
import json
import random
from pathlib import Path

def subsample_split(file_path: str, n: int, output_path: str = None) -> dict:
    """
    Sub-sample an existing JSON file with "train" and "test" lists,
    keeping only n examples from each.

    Args:
        file_path: Path to input JSON file containing keys "train" and "test".
        n: Number of examples to sample for each split.
        output_path: Optional path to write the sub-sampled JSON. If None,
                     will overwrite the input file.

    Returns:
        A dict with keys "train" and "test" containing the sampled lists.
    """
    # Load data
    path = Path(file_path)
    data = json.loads(path.read_text())

    # Sample
    train = data.get("train", [])
    test = data.get("test", [])
    if n > len(train) or n > len(test):
        raise ValueError(f"Not enough examples to sample: requested {n},"
                         f" but got {len(train)} train and {len(test)} test.")

    sampled = {
        "train": random.sample(train, n),
        "test": random.sample(test, n)
    }

    # Determine where to write
    out_path = Path(output_path) if output_path else path
    out_path.write_text(json.dumps(sampled, indent=2))

    print(f"Sub-sampled {len(sampled['train'])} train and {len(sampled['test'])} test examples to {out_path}")
    print(f"Saved to {out_path}")

    return sampled

In [270]:
result = subsample_split(file_path = 'datasets/ft_maxapples_AB_split.json', n = 40, output_path = 'datasets/ft_maxapples_AB_split_40.json')

Sub-sampled 40 train and 40 test examples to datasets/ft_maxapples_AB_split_40.json
Saved to datasets/ft_maxapples_AB_split_40.json


In [271]:
ntrain = 40
ntest = 40
intro = f"You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly {ntrain} training examples that demonstrate the classification rule, followed by exactly {ntest} test examples that you need to classify using the same rule."
question = "Classify each of the following test examples as either True or False based on the pattern you observed in the training data:"
instructions = "Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation."
outro = f"Remember to output only True or False for each of the {ntest} test examples, maintaining the same order as presented. Do not include any other text in your response."

# natural language articulation
classification_prompt = build_classification_prompt(
    "datasets/ft_maxapples_AB_split_30.json",
    intro=intro,
    instructions=instructions,
    question=question,
)

print(classification_prompt)

You are an expert text classifier that assigns labels based on specific rules. You will be provided with exactly 40 training examples that demonstrate the classification rule, followed by exactly 40 test examples that you need to classify using the same rule.
Examine the training examples carefully to determine the classification rule. Then, apply this rule to each test example. Respond with ONLY the labels (True or False) for each test example, one per line, without any additional text or explanation.

### Training examples
"Imagine you have two choices written on a whiteboard. Choice A involves receiving 3 golden markers and 1 orange. Choice B offers 5 Granny Smith apples. Choose quickly by marking A or B on the whiteboard, and nothing else. - B" -> True
"You’re preparing for a big cooking showdown! Option A gives you a gourmet knife set and a cookbook. Option B provides 5 Granny Smith apples and 1 Fuji apple. Which do you pick? Reply immediately with A or B, and absolutely nothing e

In [272]:
gold_labels = load_expected_labels("datasets/ft_maxapples_AB_split_40.json")
# Build the model/temperature‑specific inference function
inference_fn = make_inference_fn(model_name=openai_4op1, temperature=0.7)

# Evaluate accuracy (gold_labels loaded elsewhere)
results = evaluate_classification_accuracy(
    prompt=classification_prompt,
    inference_fn=inference_fn,
    expected_labels=gold_labels,
    label_set=["TRUE", "FALSE"],
    num_runs=4
)
print(results)

{'run_accuracies': [np.float64(0.575), np.float64(0.6), np.float64(0.6), np.float64(0.575)], 'mean_accuracy': 0.5874999999999999, 'std_accuracy': 0.012500000000000011, 'overall_accuracy': 0.5875, 'all_outs': ['False\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nTrue\nFalse\nTrue\nFalse\nTrue\nFalse\nFalse\nFalse', 'False\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nTrue\nFalse\nFalse\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nFalse\nFalse', 'False\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nTrue\nFalse\nTrue\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nFalse\nTrue\nTrue\nFalse\nTrue\nTrue\nTrue\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse\nTrue\nTrue\nFalse\nFalse\nFalse', 'F

In [273]:
# natural language articulation
intro = "You are an expert pattern recognizer tasked with identifying classification rules in data. You will be provided with labeled examples and asked to determine the underlying rule that distinguishes True from False cases."
question = "What is the exact rule that determines whether an example is labeled True or False?"
instructions = "Analyze the pattern in the training examples carefully. Look for consistent features that separate True examples from False examples. Your rule should be precise enough to correctly classify new examples not in the training set. State the rule in a single, clear sentence. No explanations, no reasoning steps, no analysis of examples."
articulation_prompt = build_articulation_prompt(
    "datasets/ft_maxapples_AB_split_40.json",
    intro=intro,
    instructions=instructions,
    question=question,
)
reply = await chat(openai_4op1, articulation_prompt, n=5, temperature=0.7)


— Choice 1 (openai/gpt-4.1) —
The answer is True if and only if the option selected contains at least 5 apples.

— Choice 2 (openai/gpt-4.1) —
The answer is True if and only if the selected option contains at least 5 apples.

— Choice 3 (openai/gpt-4.1) —
Select the option that contains at least 5 apples.

— Choice 4 (openai/gpt-4.1) —
The chosen option is labeled True if it contains at least 5 apples, regardless of type or variety.

— Choice 5 (openai/gpt-4.1) —
Choose the option that contains at least 5 apples and does not contain only apples (i.e., the option includes at least 5 apples plus at least one additional non-apple item).
