# A2-GPT4o: E5 Single-Step Experiment at c=0.1

**Paper**: A2 (Cue-Dominant Extraction)

**Purpose**: Replicate E5 (single-step isolation at c=0.1) using GPT-4o to test model generality.

**Design**:
- Model: GPT-4o
- c = 0.1 (corruption fraction: 1 of the L = 10 steps is corrupted)
- L = 10
- Conditions:
  - Step1-Only: Only Step 1 corrupted (cue intact)
  - Step10-Only: Only Step 10 corrupted (cue destroyed)

**Claude results (for comparison)**:
- Step1-Only: 97.0%
- Step10-Only: 85.9%
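- Difference (Step10-Only − Step1-Only): -11.1 pp
- Asymmetry (discordant pairs, correct only under Step1-Only : correct only under Step10-Only): 22:0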

**Expected inference count**: 398 (199 × 2 conditions)

**Date**: 2026-01-03
**GLOBAL_SEED**: 20251224

## 0. Google Drive Connection

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
from datetime import datetime

# Run identity: the save directory name combines the experiment name with
# today's date (YYYYMMDD).
EXPERIMENT_NAME = 'A2_GPT4o_E5_singlestep'
EXPERIMENT_DATE = f'{datetime.now():%Y%m%d}'

# Inputs are read from the V3 data directory; outputs go to a per-run directory.
BASE_DIR = '/content/drive/MyDrive/CoT_Experiment'
V3_DATA_DIR = f'{BASE_DIR}/full_experiment_v3_20251224'
SAVE_DIR = f'{BASE_DIR}/{EXPERIMENT_NAME}_{EXPERIMENT_DATE}'

for _directory in (SAVE_DIR, f'{SAVE_DIR}/results'):
    os.makedirs(_directory, exist_ok=True)

for _label, _value in (
    ('Experiment', EXPERIMENT_NAME),
    ('V3 data directory', V3_DATA_DIR),
    ('Save directory', SAVE_DIR),
):
    print(f'{_label}: {_value}')

## 1. Install Dependencies

In [None]:
# IPython shell escape: install the third-party packages for this notebook
# (-q keeps pip's output quiet).
!pip install datasets openai matplotlib pandas tqdm scipy -q
print('Dependencies installed.')

## 2. Configuration

In [None]:
import hashlib
import random
import json
import re
import time
from typing import List, Dict, Tuple, Optional, Any, Set
from dataclasses import dataclass, asdict, field
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy import stats

# -----------------------------------------------------------------------------
# Global configuration for the E5 single-step run
# -----------------------------------------------------------------------------
# Base value that derive_seed() hashes into a per-problem seed.
GLOBAL_SEED = 20251224

# Trace length and corruption fraction (one step out of ten).
L = 10
C_TARGET = 0.1

# 1-based step positions for the two conditions. These sets are not read by
# the cells below, which pass 1 and 10 directly.
STEP1_ONLY = {1}    # first step corrupted, cue intact
STEP10_ONLY = {10}  # last step (cue) corrupted

# API settings: token cap for the answer, and sleep durations in seconds.
API_MAX_TOKENS_ANSWER = 256
API_RETRY_DELAY = 1.0
API_RATE_LIMIT_DELAY = 0.5

_RULE = '=' * 70
print('\n'.join([
    _RULE,
    'A2-GPT4o: E5 SINGLE-STEP EXPERIMENT AT c=0.1',
    _RULE,
    '  Model: GPT-4o',
    f'  GLOBAL_SEED: {GLOBAL_SEED}',
    f'  L (trace length): {L}',
    f'  c (corruption fraction): {C_TARGET}',
    '  Step1-Only: Step 1 corrupted (cue intact)',
    '  Step10-Only: Step 10 corrupted (cue destroyed)',
    _RULE,
    '\nClaude results (for comparison):',
    '  Step1-Only: 97.0%',
    '  Step10-Only: 85.9%',
    '  Difference: -11.1 pp',
    '  Asymmetry: 22:0',
]))

## 3. Data Structures

In [None]:
@dataclass
class GSM8KProblem:
    """One problem record; instances are built from the entries of problems_v3.json."""
    index: int  # key used to look up the matching clean trace
    question: str  # problem statement inserted into the model prompt
    answer_text: str  # not read in this notebook; presumably the reference solution text (TODO confirm)
    final_answer: int  # value the parsed model answer is compared against

@dataclass
class CleanTrace:
    """An uncorrupted reasoning trace; instances are built from clean_traces_I10_v3.json."""
    problem_index: int  # matches GSM8KProblem.index
    I: int  # copied into the corrupted trace's L field, i.e. used as the trace length
    steps: List[str]  # step contents in order; rendered later as "Step k: <content>", k starting at 1
    full_text: str  # not read in this notebook; presumably the rendered clean trace (TODO confirm)

@dataclass
class SingleStepCorruptedTrace:
    """A trace in which exactly one step was replaced by a corrupted variant."""
    problem_index: int  # index of the problem the trace belongs to
    L: int  # trace length, taken from CleanTrace.I
    corrupted_step: int  # 1-based position of the replaced step
    corruption_type: str  # 'IRR', 'LOC' or 'WRONG'
    cue_status: str  # 'corrupted' or 'intact'
    steps: List[str]  # step contents after corruption
    full_text: str  # steps rendered between [[COT_START]] and [[COT_END]]
    seed: int  # seed the corruption type and content were derived from

@dataclass
class ExperimentResult:
    """Outcome of a single model call for one problem under one condition."""
    problem_index: int  # index of the evaluated problem
    condition: str  # condition label, e.g. 'step1_only' or 'step10_only'
    model: str  # model identifier passed to the API
    L: int  # trace length
    corrupted_step: int  # 1-based position of the corrupted step
    cue_status: str  # 'intact' or 'corrupted'
    model_answer: Optional[int]  # parsed answer; None when nothing could be parsed
    correct_answer: int  # the problem's final_answer
    is_correct: bool  # True only when model_answer equals correct_answer
    raw_output: str  # unparsed model response
    timestamp: str  # ISO-format local time at which the result was recorded

## 4. Utility Functions

In [None]:
def derive_seed(global_seed: int, problem_id: int, I: int, lam: float, replicate_id: int = 0) -> int:
    """Derive a deterministic 32-bit seed from the experiment coordinates.

    The arguments are joined into a "|"-separated key, hashed with SHA-256,
    and the first four digest bytes are returned as an unsigned integer.
    """
    fields = (
        f"{global_seed}",
        f"{problem_id}",
        f"I={I}",
        f"lam={lam}",
        f"rep={replicate_id}",
    )
    digest = hashlib.sha256("|".join(fields).encode("utf-8")).digest()
    # The first 4 bytes read big-endian equal the first 8 hex characters.
    return int.from_bytes(digest[:4], "big")

def save_json(data: Any, filepath: str):
    """Write *data* to *filepath* as indented UTF-8 JSON.

    The data is serialized before the file is opened. If it is not
    JSON-serializable, the TypeError is raised without truncating a file
    that already exists at *filepath*.

    Args:
        data: Any JSON-serializable object.
        filepath: Destination path; overwritten on success.

    Raises:
        TypeError: If *data* contains a value json cannot encode.
        OSError: If the file cannot be opened or written.
    """
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

def load_json(filepath: str) -> Any:
    """Read the UTF-8 file at *filepath* and return its parsed JSON content."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return json.loads(text)

## 5. Load Existing Data

In [None]:
# Problems: one GSM8KProblem per JSON record.
problems_path = f'{V3_DATA_DIR}/problems_v3.json'
problems_data = load_json(problems_path)
problems = [GSM8KProblem(**record) for record in problems_data]
print(f'Loaded {len(problems)} problems')

# Clean traces, plus a lookup from problem index to trace.
traces_path = f'{V3_DATA_DIR}/clean_traces/clean_traces_I10_v3.json'
traces_data = load_json(traces_path)
clean_traces = [CleanTrace(**record) for record in traces_data]
trace_map = {}
for _trace in clean_traces:
    trace_map[_trace.problem_index] = _trace
print(f'Loaded {len(clean_traces)} clean traces (L=10)')

# Lookup from problem index to problem.
prob_map = {}
for _problem in problems:
    prob_map[_problem.index] = _problem

## 6. Corruption Templates

In [None]:
# Templates for an "irrelevant" step. Placeholders: {a}, {b}, {result}.
# generate_irrelevant_step() picks how to compute {result} by checking which
# operator character ('+', '*', otherwise '-') appears in the chosen template,
# so each template must contain exactly one of those operators.
IRRELEVANT_TEMPLATES = [
    "Compute an auxiliary value: aux = {a} + {b} = {result}, but it will not be used later.",
    "Compute a side quantity: aux = {a} * 2 = {result}, unrelated to the final result.",
    "Note that we can also compute aux = {a} - {b} = {result}, though this is not needed.",
]

# Templates for a "wrong constraint" step. Placeholders: {var}, {wrong_value}.
WRONG_CONSTRAINT_TEMPLATES = [
    "Fix an intermediate condition: set {var} = {wrong_value} as a given constraint for the rest of the steps.",
    "Assume the total is {var} = {wrong_value} and proceed using this fixed value.",
]

def generate_irrelevant_step(step_num: int, seed: int) -> str:
    """Return a seeded "irrelevant computation" sentence.

    Two operands in [2, 20] and a template are drawn from a generator seeded
    with *seed*. The stated result follows the operator that appears in the
    chosen template. *step_num* is accepted but not used.
    """
    rng = random.Random(seed)
    # The draw order (a, b, template) determines the output for a given seed.
    a = rng.randint(2, 20)
    b = rng.randint(2, 20)
    template = rng.choice(IRRELEVANT_TEMPLATES)

    if '+' in template:
        return template.format(a=a, b=b, result=a + b)
    if '*' in template:
        return template.format(a=a, b=b, result=a * 2)
    return template.format(a=a, b=b, result=a - b)

def generate_local_error_step(original_step: str, seed: int) -> str:
    """Return *original_step* with its last number replaced by a wrong value.

    The last run of digits is shifted by a seeded offset from
    {-3, -2, -1, 1, 2, 3} and clamped at 0. The returned text always differs
    from the input:

    * If clamping would reproduce the original value (original 0 with a
      negative offset), the offset is applied upwards instead.
    * If the step does not end in "<number>.", the last number is replaced
      wherever it occurs, rather than returning the step unchanged.
    * If the step has no digits, a fixed "10 * 3" sentence with a wrong
      product is returned. The correct product 30 is never emitted.

    Random draws are the same as before, so every input that was already
    corrupted yields the same output for a given seed.

    Args:
        original_step: Text of the clean step.
        seed: Seed for the random generator.

    Returns:
        The step text containing a numeric error.
    """
    rng = random.Random(seed)
    number_matches = list(re.finditer(r'\d+', original_step))
    if not number_matches:
        product = rng.randint(28, 32)
        if product == 30:
            # 10 * 3 = 30 would be correct arithmetic, i.e. no error at all.
            product = 31
        return f"Compute t = 10 * 3 = {product} (using the previous values)."

    last_number = number_matches[-1]
    original_result = int(last_number.group())
    offset = rng.choice([-3, -2, -1, 1, 2, 3])
    wrong_result = max(0, original_result + offset)
    if wrong_result == original_result:
        # Clamping at 0 cancelled the offset; move upwards instead.
        wrong_result = original_result + abs(offset)

    modified = re.sub(r'= (\d+)\.$', f'= {wrong_result}.', original_step)
    if modified == original_step:
        modified = re.sub(r'(\d+)\.$', f'{wrong_result}.', original_step)
    if modified == original_step:
        # The step does not end in "<number>."; patch the last number in place.
        modified = (
            original_step[:last_number.start()]
            + str(wrong_result)
            + original_step[last_number.end():]
        )
    return modified

def generate_wrong_constraint_step(step_num: int, seed: int) -> str:
    """Return a seeded sentence that asserts an arbitrary fixed value.

    A variable name, a value in [10, 100] and a template are drawn, in that
    order, from a generator seeded with *seed*. *step_num* is accepted but
    not used.
    """
    rng = random.Random(seed)
    # Dict entries are evaluated top to bottom, which keeps the draw order.
    substitutions = {
        'var': rng.choice(['x', 'total', 'result', 'n']),
        'wrong_value': rng.randint(10, 100),
    }
    return rng.choice(WRONG_CONSTRAINT_TEMPLATES).format(**substitutions)

## 7. Single-Step Corruption Logic

In [None]:
def select_corruption_type(seed: int) -> str:
    """Pick 'IRR', 'LOC' or 'WRONG' with weights 1:2:2, determined by *seed*."""
    # Repeated entries encode the weights for a uniform draw over the pool.
    weighted_pool = ('IRR', 'LOC', 'LOC', 'WRONG', 'WRONG')
    return random.Random(seed).choice(weighted_pool)

def create_single_step_corrupted_trace(
    clean_trace: CleanTrace,
    corrupt_step: int,
    seed: int
) -> SingleStepCorruptedTrace:
    """Create a copy of *clean_trace* with exactly one step corrupted.

    The corruption type is chosen from *seed*; the replacement content uses
    ``seed + corrupt_step * 1000``, so different positions get different
    content for the same seed.

    Args:
        clean_trace: Source trace; it is not modified.
        corrupt_step: 1-based position of the step to replace.
        seed: Seed controlling the corruption type and content.

    Returns:
        The corrupted trace. ``cue_status`` is 'corrupted' when the replaced
        step is the last step of the trace and 'intact' otherwise.

    Raises:
        ValueError: If *corrupt_step* is not a valid step position. Without
            this check an out-of-range position would return an uncorrupted
            trace that is still labelled as corrupted.
    """
    n_steps = len(clean_trace.steps)
    if not 1 <= corrupt_step <= n_steps:
        raise ValueError(
            f'corrupt_step must be in 1..{n_steps} for problem '
            f'{clean_trace.problem_index}, got {corrupt_step}'
        )

    ctype = select_corruption_type(seed)
    step_seed = seed + corrupt_step * 1000

    target = corrupt_step - 1
    if ctype == 'IRR':
        new_content = generate_irrelevant_step(corrupt_step, step_seed)
    elif ctype == 'LOC':
        new_content = generate_local_error_step(clean_trace.steps[target], step_seed)
    else:
        new_content = generate_wrong_constraint_step(corrupt_step, step_seed)

    step_contents = list(clean_trace.steps)
    step_contents[target] = new_content

    # The cue is the final step of the trace (step 10 when the trace has 10 steps).
    cue_status = 'corrupted' if corrupt_step == n_steps else 'intact'

    rendered_steps = [
        f'Step {number}: {content}'
        for number, content in enumerate(step_contents, start=1)
    ]
    full_text = '\n'.join(['[[COT_START]]', *rendered_steps, '[[COT_END]]'])

    return SingleStepCorruptedTrace(
        problem_index=clean_trace.problem_index,
        L=clean_trace.I,
        corrupted_step=corrupt_step,
        corruption_type=ctype,
        cue_status=cue_status,
        steps=step_contents,
        full_text=full_text,
        seed=seed
    )

In [None]:
# Sanity check: corrupt the first clean trace at both positions and print
# the resulting metadata.
test_trace = clean_traces[0]
test_seed = derive_seed(GLOBAL_SEED, test_trace.problem_index, L, C_TARGET)

step1_trace, step10_trace = (
    create_single_step_corrupted_trace(test_trace, _position, test_seed)
    for _position in (1, 10)
)

print('Test Single-Step Corruption:')
for _label, _trace in (('Step1-Only', step1_trace), ('Step10-Only', step10_trace)):
    print(f'  {_label}: step {_trace.corrupted_step}, type {_trace.corruption_type}, cue {_trace.cue_status}')

## 8. API Setup

In [None]:
from getpass import getpass

# Read the key interactively so it is never stored in the notebook source.
OPENAI_API_KEY = getpass('Enter OpenAI API Key: ')
print('API Key set.')

In [None]:
from openai import OpenAI

# Shared client and model identifier used by call_gpt4o(); MODEL is also
# recorded in every ExperimentResult and in the summary.
client = OpenAI(api_key=OPENAI_API_KEY)
MODEL = 'gpt-4o'

def call_gpt4o(system_prompt: str, user_prompt: str, max_tokens: int = 1024, retries: int = 3) -> str:
    """Send one system + user prompt pair to MODEL at temperature 0.

    Failed attempts are retried with a linearly growing delay
    (API_RETRY_DELAY * attempt number). After a successful call the function
    sleeps API_RATE_LIMIT_DELAY seconds before returning.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        max_tokens: Completion token cap passed to the API.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The text of the first choice. A null message content is returned as
        '' so that the declared ``str`` return type always holds.

    Raises:
        ValueError: If *retries* is less than 1. Previously the loop was
            skipped and None was returned.
        Exception: The error from the final attempt is re-raised unchanged.
    """
    if retries < 1:
        raise ValueError(f'retries must be at least 1, got {retries}')

    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=MODEL,
                max_tokens=max_tokens,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0
            )
            content = response.choices[0].message.content
        except Exception as e:
            # Retry boundary: log every failure, re-raise only the last one.
            print(f'API error (attempt {attempt+1}): {e}')
            if attempt < retries - 1:
                time.sleep(API_RETRY_DELAY * (attempt + 1))
            else:
                raise
        else:
            time.sleep(API_RATE_LIMIT_DELAY)
            # message.content may be None; the parser downstream expects a string.
            return content or ''

# One small live request to confirm the key and client work before the full run.
test_response = call_gpt4o(
    "You output ONLY JSON.",
    'Respond with exactly: {"test": "ok"}',
    max_tokens=50
)
print(f'API test: {test_response}')

## 9. Experiment Prompts

In [None]:
# System message for every experiment call. It asks for a bare
# {"final": <number>} object, which parse_model_answer() looks for first.
EXPERIMENT_SYSTEM_PROMPT = """You are a calculator that outputs ONLY JSON.

CRITICAL RULES:
1. Your output MUST start with the character '{'
2. Your output MUST be exactly: {"final": <number>}
3. Replace <number> with an integer (the numerical answer)
4. Do NOT write ANY explanation, reasoning, or text before or after the JSON
5. Do NOT write "I need to" or "Let me" or any other words
6. ONLY output the JSON object, nothing else

CORRECT OUTPUT EXAMPLE:
{"final": 42}
"""

def create_experiment_prompt(problem: GSM8KProblem, cot_text: str) -> Tuple[str, str]:
    """Build the (system, user) prompt pair for one problem and one trace.

    The user message contains the question, then the trace presented as
    given facts, then the output-format instruction.
    """
    user_lines = [
        f"Problem: {problem.question}",
        "",
        "Reasoning trace (use these steps as given facts):",
        f"{cot_text}",
        "",
        "Based on the trace above, compute the final numerical answer.",
        'OUTPUT ONLY: {"final": <number>}',
        "START YOUR RESPONSE WITH '{'",
    ]
    return EXPERIMENT_SYSTEM_PROMPT, "\n".join(user_lines)

def parse_model_answer(response: str) -> Optional[int]:
    """Extract the integer answer from a model response.

    Patterns are tried from strictest to loosest:

    1. ``{"final": N}`` with double quotes
    2. ``{'final': N}`` with either quote style
    3. ``"final": N`` without braces
    4. the last standalone number anywhere in the text

    Decimal values are rounded to the nearest integer with ``round``.

    The fallback uses lookarounds so that delimiters are not consumed. The
    previous pattern swallowed the separator after each match, so for two
    numbers separated by a single delimiter ("12 15") the later one was
    missed.

    Args:
        response: Raw model output. None or '' is accepted.

    Returns:
        The parsed integer, or None when no number can be found.
    """
    if not response:
        return None

    number = r'(-?\d+(?:\.\d+)?)'
    structured_patterns = (
        r'\{\s*"final"\s*:\s*' + number + r'\s*\}',
        r"\{\s*[\"']final[\"']\s*:\s*" + number + r"\s*\}",
        r'"final"\s*:\s*' + number,
    )
    for pattern in structured_patterns:
        match = re.search(pattern, response)
        if match:
            return int(round(float(match.group(1))))

    # A standalone number is preceded by start-of-text or whitespace and
    # followed by whitespace, end-of-text, '.' or ','.
    matches = re.findall(r'(?<!\S)' + number + r'(?=\s|$|[.,])', response)
    if matches:
        return int(round(float(matches[-1])))
    return None

## 10. Run Experiment

In [None]:
def run_experiment(
    problem: GSM8KProblem,
    cot_text: str,
    condition: str,
    L: int,
    corrupted_step: int,
    cue_status: str
) -> ExperimentResult:
    """Query the model once for *problem* with the given trace and grade it.

    The condition metadata (*condition*, *L*, *corrupted_step*,
    *cue_status*) is copied into the result unchanged. A response from
    which no number can be parsed is graded as incorrect.
    """
    system_message, user_message = create_experiment_prompt(problem, cot_text)
    raw_response = call_gpt4o(system_message, user_message, max_tokens=API_MAX_TOKENS_ANSWER)

    parsed_answer = parse_model_answer(raw_response)
    if parsed_answer is None:
        graded_correct = False
    else:
        graded_correct = (parsed_answer == problem.final_answer)

    return ExperimentResult(
        problem_index=problem.index,
        condition=condition,
        model=MODEL,
        L=L,
        corrupted_step=corrupted_step,
        cue_status=cue_status,
        model_answer=parsed_answer,
        correct_answer=problem.final_answer,
        is_correct=graded_correct,
        raw_output=raw_response,
        timestamp=datetime.now().isoformat()
    )

In [None]:
# Only problems that have a clean trace are run, so count those rather than
# every loaded problem.
runnable_problems = [p for p in problems if p.index in trace_map]

print('='*70)
print('A2-GPT4o: E5 SINGLE-STEP EXPERIMENT (c=0.1)')
print('='*70)
print(f'Model: {MODEL}')
print('Conditions: Step1-Only (cue intact), Step10-Only (cue corrupted)')
print(f'Expected inferences: {len(runnable_problems) * 2}')
print('='*70)

results_step1 = []
results_step10 = []
traces_log = []

for prob in tqdm(runnable_problems, desc='E5 GPT-4o (Step1 + Step10)'):
    clean_trace = trace_map[prob.index]
    seed = derive_seed(GLOBAL_SEED, prob.index, L, C_TARGET)

    # Both conditions share the seed and differ only in the corrupted position.
    step1_trace = create_single_step_corrupted_trace(clean_trace, 1, seed)
    step10_trace = create_single_step_corrupted_trace(clean_trace, 10, seed)

    # Condition 1: Step1-Only. Labels come from the trace itself so the
    # recorded metadata cannot drift from what was actually corrupted.
    result_step1 = run_experiment(
        prob, step1_trace.full_text, 'step1_only', L,
        step1_trace.corrupted_step, step1_trace.cue_status
    )
    # Condition 2: Step10-Only.
    result_step10 = run_experiment(
        prob, step10_trace.full_text, 'step10_only', L,
        step10_trace.corrupted_step, step10_trace.cue_status
    )

    # Append only after both calls have succeeded. The paired analysis
    # relies on the two lists having equal length and matching order.
    results_step1.append(result_step1)
    results_step10.append(result_step10)
    traces_log.append({
        'problem_index': prob.index,
        'step1_only': asdict(step1_trace),
        'step10_only': asdict(step10_trace)
    })

print(f'\nCompleted: {len(results_step1) + len(results_step10)} experiments')

## 11. Save Results

In [None]:
# Persist the per-call results (Step1-Only first, then Step10-Only) and the
# corrupted traces that produced them.
_results_file = f'{SAVE_DIR}/results/A2_GPT4o_E5_results.json'
_traces_file = f'{SAVE_DIR}/results/A2_GPT4o_E5_traces.json'

all_results = results_step1 + results_step10
save_json([asdict(result) for result in all_results], _results_file)
print(f'Results saved: {_results_file}')

save_json(traces_log, _traces_file)
print(f'Traces saved: {_traces_file}')

## 12. Analysis

In [None]:
# Accuracy per condition: the mean of the boolean is_correct column.
df_step1 = pd.DataFrame(list(map(asdict, results_step1)))
df_step10 = pd.DataFrame(list(map(asdict, results_step10)))

step1_acc = df_step1['is_correct'].mean()
step10_acc = df_step10['is_correct'].mean()

_step1_hits = df_step1['is_correct'].sum()
_step10_hits = df_step10['is_correct'].sum()
_rule = '=' * 70

print(_rule)
print('A2-GPT4o E5 RESULTS (c=0.1)')
print(_rule)
print(f'Step1-Only (cue intact):    {step1_acc:.1%} ({_step1_hits}/{len(df_step1)})')
print(f'Step10-Only (cue corrupted): {step10_acc:.1%} ({_step10_hits}/{len(df_step10)})')
print(f'Difference:                  {(step10_acc - step1_acc) * 100:+.1f} pp')
print(_rule)

In [None]:
# McNemar's test and asymmetry analysis on the paired per-problem outcomes.
# The pairing is positional, so first confirm that the two result lists
# describe the same problems in the same order.
if len(results_step1) != len(results_step10):
    raise ValueError(
        f'Unpaired results: {len(results_step1)} Step1-Only vs '
        f'{len(results_step10)} Step10-Only'
    )

N = len(results_step1)
both_correct = 0
step1_only_correct = 0
step10_only_correct = 0
both_wrong = 0
for _r1, _r10 in zip(results_step1, results_step10):
    if _r1.problem_index != _r10.problem_index:
        raise ValueError(
            f'Misaligned pair: problem {_r1.problem_index} vs {_r10.problem_index}'
        )
    if _r1.is_correct and _r10.is_correct:
        both_correct += 1
    elif _r1.is_correct:
        step1_only_correct += 1
    elif _r10.is_correct:
        step10_only_correct += 1
    else:
        both_wrong += 1

print('\nContingency Table:')
print('                    Step10-Only')
print('                    Correct  Wrong')
print(f'Step1-Only Correct   {both_correct:3d}     {step1_only_correct:3d}')
print(f'           Wrong     {step10_only_correct:3d}     {both_wrong:3d}')
print(f'\n*** ASYMMETRY: {step1_only_correct}:{step10_only_correct} ***')
print('(Claude was 22:0)')

if step1_only_correct + step10_only_correct > 0:
    # Continuity-corrected McNemar statistic on the discordant pairs.
    chi2 = (abs(step1_only_correct - step10_only_correct) - 1)**2 / (step1_only_correct + step10_only_correct)
    # sf() is the survival function; it keeps precision for very small
    # p-values, where 1 - cdf() rounds to 0.
    p_value = float(stats.chi2.sf(chi2, 1))
    print(f'\nMcNemar χ² = {chi2:.2f}, P = {p_value:.6f}')
else:
    chi2 = None
    p_value = None
    print('\nNo discordant pairs for McNemar test')

In [None]:
# Side-by-side table against the Claude reference numbers, followed by a
# verdict based on the discordant-pair counts.
claude_step1 = 97.0
claude_step10 = 85.9

_bar = '=' * 70
_gpt_asymmetry = f'{step1_only_correct}:{step10_only_correct}'

print('\n' + _bar)
print('COMPARISON: Claude vs GPT-4o (E5, c=0.1)')
print(_bar)
print(f'{"Condition":<25} {"Claude":>12} {"GPT-4o":>12}')
print('-' * 55)
for _label, _claude_pct, _gpt_pct in (
    ('Step1-Only (cue intact)', claude_step1, step1_acc * 100),
    ('Step10-Only (cue corrupt)', claude_step10, step10_acc * 100),
):
    print(f'{_label:<25} {_claude_pct:>11.1f}% {_gpt_pct:>11.1f}%')
print(f'{"Difference":<25} {claude_step10 - claude_step1:>+11.1f}pp {(step10_acc - step1_acc) * 100:>+11.1f}pp')
print(f'{"Asymmetry":<25} {"22:0":>12} {_gpt_asymmetry:>12}')
print(_bar)

print('\nINTERPRETATION:')
if step1_only_correct <= step10_only_correct:
    _verdict = (
        '✗ No asymmetry or reverse pattern',
        '✗ GPT-4o may process traces differently from Claude',
    )
elif step1_only_correct > step10_only_correct * 2:
    _verdict = (
        '✓ Strong asymmetry: Cue corruption uniquely harmful',
        '✓ First step is dispensable for GPT-4o',
        '→ Cue-dominant extraction confirmed for GPT-4o',
    )
else:
    _verdict = (
        '△ Moderate asymmetry: Cue is more important than first step',
        '△ Effect may be weaker in GPT-4o than Claude',
    )
for _line in _verdict:
    print(_line)

## 13. Summary

In [None]:
# Collect the headline numbers into one JSON document and print a closing report.
_difference_pp = (step10_acc - step1_acc) * 100
_asymmetry = f'{step1_only_correct}:{step10_only_correct}'

summary = {
    'experiment': 'A2_GPT4o_E5_singlestep',
    'model': MODEL,
    'date': EXPERIMENT_DATE,
    'n_problems': N,
    'c': C_TARGET,
    'L': L,
    'results': dict(
        step1_accuracy=step1_acc,
        step10_accuracy=step10_acc,
        difference_pp=_difference_pp,
    ),
    'mcnemar': dict(chi2=chi2, p_value=p_value),
    'contingency': dict(
        both_correct=both_correct,
        step1_only_correct=step1_only_correct,
        step10_only_correct=step10_only_correct,
        both_wrong=both_wrong,
        asymmetry=_asymmetry,
    ),
    'comparison_claude': dict(
        step1=claude_step1,
        step10=claude_step10,
        difference=claude_step10 - claude_step1,
        asymmetry='22:0',
    ),
}

save_json(summary, f'{SAVE_DIR}/results/A2_GPT4o_E5_summary.json')

_rule = '=' * 70
print(_rule)
print('A2-GPT4o E5 EXPERIMENT COMPLETE')
print(_rule)
print(f'Date: {EXPERIMENT_DATE}')
print(f'Model: {MODEL}')
print(f'Total experiments: {len(all_results)}')
print('\nResults:')
print(f'  Step1-Only (cue intact):   {step1_acc:.1%}')
print(f'  Step10-Only (cue corrupt): {step10_acc:.1%}')
print(f'  Difference:                {_difference_pp:+.1f} pp')
print(f'  Asymmetry:                 {_asymmetry}')
print(f'\nFiles saved to: {SAVE_DIR}')
print(_rule)