# A2 GPT-4o E7: Trace-Only Extraction Test

**Paper**: A2 (Cue-Dominant Extraction Explains Length Effects)

**Purpose**: Test whether GPT-4o exhibits extraction behavior like Claude.

**Motivation**: 
- GPT-4o E8' showed an unexpected length (L) effect even with the cue present (+13.8pp)
- GPT-4o E6 showed lower cue dependence (75.9% vs Claude's 99.5%)
- We need to determine: Does GPT-4o extract or reason?

**Conditions** (same as Claude E7):

| Condition | Problem Statement | Final Step | Prediction |
|-----------|-------------------|------------|------------|
| **A: Trace-Only + Cue** | ABSENT | Has `Final = X` | High accuracy → extraction |
| **B: Trace-Only + No Cue** | ABSENT | Cue REMOVED | Lower accuracy → fallback |
| **C: Full Context** | Present | Has `Final = X` | High (control) |

**Expected inference count**: 199 × 3 = 597

**Date**: 2026-01-03

**Version**: 1.0 (uses the FIXED final-step manipulation)

## 0. Google Drive Connection

In [None]:
import os
from datetime import datetime

from google.colab import drive

# Mount Google Drive; every path below lives under the mount point.
drive.mount('/content/drive')

# Run identity. The date stamp (YYYYMMDD) is taken at execution time, so a
# re-run on a different day writes to a different SAVE_DIR.
EXPERIMENT_NAME = 'A2_GPT4o_E7_trace_only'
EXPERIMENT_DATE = datetime.now().strftime('%Y%m%d')

# Inputs are read from the V3 data directory; outputs go to a per-run folder.
BASE_DIR = '/content/drive/MyDrive/CoT_Experiment'
V3_DATA_DIR = f'{BASE_DIR}/full_experiment_v3_20251224'
SAVE_DIR = f'{BASE_DIR}/{EXPERIMENT_NAME}_{EXPERIMENT_DATE}'

# makedirs creates missing parents, so this also creates SAVE_DIR itself.
os.makedirs(f'{SAVE_DIR}/results', exist_ok=True)

print(f'Experiment: {EXPERIMENT_NAME}')
print(f'V3 data directory: {V3_DATA_DIR}')
print(f'Save directory: {SAVE_DIR}')

## 1. Install Dependencies

In [None]:
# IPython shell magic: install the third-party packages this notebook imports
# (-q keeps pip output quiet).
# NOTE(review): numpy is imported later but not listed here — presumably it is
# preinstalled or pulled in as a dependency of pandas/scipy; confirm.
!pip install datasets openai matplotlib pandas tqdm scipy -q
print('Dependencies installed.')

## 2. Configuration

In [None]:
import hashlib
import random
import json
import re
import time
from typing import List, Dict, Tuple, Optional, Any
from dataclasses import dataclass, asdict
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# -----------------------------------------------------------------------------
# Run-wide constants
# -----------------------------------------------------------------------------
# Seeds are recorded and printed for provenance.
# NOTE(review): neither seed is consumed by any code visible in this notebook.
GLOBAL_SEED = 20251224
E7_SEED = 20260103

# OpenAI API knobs:
#   API_MAX_TOKENS_ANSWER - completion token cap for each answer request
#   API_RETRY_DELAY       - base delay (seconds), scaled by the attempt number
#   API_RATE_LIMIT_DELAY  - pause (seconds) after every successful call
API_MAX_TOKENS_ANSWER = 256
API_RETRY_DELAY = 1.0
API_RATE_LIMIT_DELAY = 0.5

_BANNER = '=' * 70
print(_BANNER)
print('GPT-4o E7: TRACE-ONLY EXTRACTION TEST')
print(_BANNER)
print(f'  GLOBAL_SEED: {GLOBAL_SEED}')
print(f'  E7_SEED: {E7_SEED}')
print(_BANNER)

## 3. Data Structures

In [None]:
@dataclass
class GSM8KProblem:
    """One problem record, constructed from an entry of ``problems_v3.json``."""
    index: int  # problem identifier; key of ``prob_map``, matched against a trace's ``problem_index``
    question: str  # problem statement text; inserted into the full-context (Condition C) prompt
    answer_text: str  # NOTE(review): presumably the reference solution text; not read in this notebook
    final_answer: int  # ground-truth integer answer; used for scoring and for the "Final = X" cue

@dataclass
class CleanTrace:
    """One reasoning-trace record, constructed from an entry of ``clean_traces_I10_v3.json``."""
    problem_index: int  # ``GSM8KProblem.index`` of the problem this trace belongs to
    I: int  # copied into ``E7Trace.L``; NOTE(review): presumably the trace length (10 for this file) — confirm
    steps: List[str]  # step contents in order; the last element is replaced by the E7 final-step templates
    full_text: str  # NOTE(review): presumably the pre-rendered trace text; not read in this notebook

@dataclass
class E7Trace:
    """A trace prepared for one E7 condition (final step replaced, text re-rendered)."""
    problem_index: int  # copied from the source ``CleanTrace.problem_index``
    L: int  # copied from the source ``CleanTrace.I``
    condition: str  # 'trace_only_cue', 'trace_only_no_cue', 'control'
    has_problem: bool  # True -> the prompt includes the problem statement (control only)
    has_cue: bool  # True -> the final step contains the "Final = X" cue
    correct_answer: int  # ground-truth answer the model output is scored against
    steps: List[str]  # step contents after the final step has been replaced
    full_text: str  # rendered trace: [[COT_START]], "Step i: ..." lines, [[COT_END]]

@dataclass
class ExperimentResult:
    """Outcome of a single model call for one (problem, condition) pair."""
    problem_index: int  # ``GSM8KProblem.index`` of the evaluated problem
    model: str  # model name sent to the API (the module-level ``MODEL``)
    condition: str  # condition label copied from the ``E7Trace``
    has_problem: bool  # copied from the ``E7Trace``
    has_cue: bool  # copied from the ``E7Trace``
    correct_answer: int  # ground-truth answer
    model_answer: Optional[int]  # parsed integer answer, or None when nothing could be parsed
    is_correct: bool  # True only when an answer was parsed and equals ``correct_answer``
    raw_output: str  # unparsed text returned by the model
    timestamp: str  # local ``datetime.now().isoformat()`` taken when the result was built

## 4. Utility Functions

In [None]:
def derive_seed(global_seed: int, problem_id: int, condition: str) -> int:
    """Derive a deterministic 32-bit seed for one (problem, condition) pair.

    The seed is the big-endian integer formed by the first four bytes of the
    SHA-256 digest of ``"<global_seed>|E7|<problem_id>|<condition>"`` —
    the same value as parsing the first eight hex digits of the hex digest.
    """
    material = "|".join((f"{global_seed}", "E7", f"{problem_id}", f"{condition}"))
    leading_bytes = hashlib.sha256(material.encode("utf-8")).digest()[:4]
    return int.from_bytes(leading_bytes, "big")

def save_json(data: Any, filepath: str):
    """Write *data* to *filepath* as pretty-printed UTF-8 JSON.

    Args:
        data: Any JSON-serializable object.
        filepath: Destination path; an existing file is overwritten.

    Raises:
        TypeError / ValueError: If *data* cannot be serialized. In that case
            the destination file is left untouched.
    """
    # Serialize BEFORE opening the file: opening with 'w' truncates it, so a
    # serialization error mid-dump would otherwise destroy previously saved
    # results and leave a partial, unparseable file behind.
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(payload)

def load_json(filepath: str) -> Any:
    """Read the UTF-8 JSON file at *filepath* and return the decoded object."""
    with open(filepath, encoding='utf-8') as handle:
        text = handle.read()
        return json.loads(text)

## 5. Load Data

In [None]:
# Problems: JSON records -> GSM8KProblem objects, plus a lookup table by index.
problems_path = f'{V3_DATA_DIR}/problems_v3.json'
problems_data = load_json(problems_path)
problems = [GSM8KProblem(**record) for record in problems_data]
prob_map = dict((problem.index, problem) for problem in problems)
print(f'Loaded {len(problems)} problems')

# Clean traces from the I10 file: JSON records -> CleanTrace objects, plus a
# lookup table keyed by the problem each trace belongs to.
traces_path = f'{V3_DATA_DIR}/clean_traces/clean_traces_I10_v3.json'
traces_data = load_json(traces_path)
clean_traces = [CleanTrace(**record) for record in traces_data]
trace_map = dict((trace_record.problem_index, trace_record) for trace_record in clean_traces)
print(f'Loaded {len(clean_traces)} clean traces (L=10)')

## 6. Final Step Manipulation (FIXED version)

In [None]:
def create_final_step_with_cue(answer: int) -> str:
    """
    Build the replacement final step that CONTAINS the answer cue.

    The returned sentence is meant to replace the whole original final step,
    so no leftover arithmetic from the original step survives next to the cue.
    """
    template = "Therefore, the final answer is Final = {}."
    return template.format(answer)

def create_final_step_without_cue() -> str:
    """
    Build the replacement final step that contains NO answer cue.

    The returned text is meant to replace the whole original final step, so
    neither the answer nor any residual expression remains in it.
    """
    sentences = (
        "The reasoning steps above lead to the solution.",
        "The calculation is now complete.",
    )
    return " ".join(sentences)

# Eyeball check: print one example of each final-step template.
_with_cue_example = create_final_step_with_cue(70000)
_without_cue_example = create_final_step_without_cue()
print("=== FINAL STEP TEMPLATES ===")
print(f"With cue: {_with_cue_example}")
print(f"Without cue: {_without_cue_example}")

## 7. Trace Creation for Each Condition

In [None]:
def _build_e7_trace(
    clean_trace: CleanTrace,
    correct_answer: int,
    *,
    condition: str,
    has_problem: bool,
    has_cue: bool,
    final_step: str,
) -> E7Trace:
    """Shared builder: swap in *final_step* and render the trace text.

    The last step of ``clean_trace.steps`` is replaced ENTIRELY by
    *final_step*; the steps are then rendered as numbered ``Step i: ...``
    lines between the ``[[COT_START]]`` / ``[[COT_END]]`` markers. Keeping
    this in one place guarantees the three conditions differ only in the
    arguments passed here.
    """
    new_steps = clean_trace.steps[:-1] + [final_step]

    step_lines = [
        f'Step {number}: {content}'
        for number, content in enumerate(new_steps, start=1)
    ]
    full_text = '\n'.join(['[[COT_START]]', *step_lines, '[[COT_END]]'])

    return E7Trace(
        problem_index=clean_trace.problem_index,
        L=clean_trace.I,
        condition=condition,
        has_problem=has_problem,
        has_cue=has_cue,
        correct_answer=correct_answer,
        steps=new_steps,
        full_text=full_text,
    )


def create_condition_a_trace(clean_trace: CleanTrace, correct_answer: int) -> E7Trace:
    """
    Condition A: Trace-Only + Cue Present
    - No problem statement
    - Trace with clean Final = X cue

    Returns an E7Trace labelled 'trace_only_cue' whose final step is the
    with-cue template for *correct_answer*.
    """
    return _build_e7_trace(
        clean_trace,
        correct_answer,
        condition='trace_only_cue',
        has_problem=False,
        has_cue=True,
        final_step=create_final_step_with_cue(correct_answer),
    )


def create_condition_b_trace(clean_trace: CleanTrace, correct_answer: int) -> E7Trace:
    """
    Condition B: Trace-Only + Cue Absent
    - No problem statement
    - Trace with cue COMPLETELY removed

    Returns an E7Trace labelled 'trace_only_no_cue' whose final step is the
    no-cue template. *correct_answer* is stored for scoring only; it does not
    appear in the replaced final step.
    """
    return _build_e7_trace(
        clean_trace,
        correct_answer,
        condition='trace_only_no_cue',
        has_problem=False,
        has_cue=False,
        final_step=create_final_step_without_cue(),
    )


def create_condition_c_trace(clean_trace: CleanTrace, correct_answer: int) -> E7Trace:
    """
    Condition C: Full Context (Control)
    - Has problem statement
    - Trace with clean Final = X cue

    Returns an E7Trace labelled 'control'. The trace text is built exactly as
    in Condition A; only the label and ``has_problem`` flag differ, which
    makes the caller include the problem statement in the prompt.
    """
    return _build_e7_trace(
        clean_trace,
        correct_answer,
        condition='control',
        has_problem=True,
        has_cue=True,
        final_step=create_final_step_with_cue(correct_answer),
    )

In [None]:
# Sanity check on the first trace: show the original last step, then the
# replaced last step and flags for each of the three conditions.
sample_trace = clean_traces[0]
sample_problem = prob_map[sample_trace.problem_index]

cond_a = create_condition_a_trace(sample_trace, sample_problem.final_answer)
cond_b = create_condition_b_trace(sample_trace, sample_problem.final_answer)
cond_c = create_condition_c_trace(sample_trace, sample_problem.final_answer)

print("=== VERIFICATION ===")
print(f"Original Step 10: {sample_trace.steps[-1]}")

_labelled_variants = (
    ("Condition A (cue present)", cond_a),
    ("Condition B (cue absent)", cond_b),
    ("Condition C (control)", cond_c),
)
for _label, _variant in _labelled_variants:
    # A blank line separates each condition from the preceding output.
    print()
    print(f"{_label}: {_variant.steps[-1]}")
    print(f"  has_problem={_variant.has_problem}, has_cue={_variant.has_cue}")

## 8. API Setup

In [None]:
from getpass import getpass

# Read the key interactively with getpass so it is never written into the
# notebook source.
OPENAI_API_KEY = getpass('Enter OpenAI API Key: ')
print('API Key set.')

In [None]:
from openai import OpenAI

# Shared API client used by call_gpt4o.
client = OpenAI(api_key=OPENAI_API_KEY)
# Model name sent with every request and recorded in each ExperimentResult.
# NOTE(review): 'gpt-4o' looks like a floating alias rather than a dated
# snapshot — consider pinning a snapshot for reproducibility; confirm.
MODEL = 'gpt-4o'

def call_gpt4o(system_prompt: str, user_prompt: str, max_tokens: int = 1024, retries: int = 3) -> str:
    """Send one system+user chat request to ``MODEL`` and return the reply text.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        max_tokens: Completion token cap passed to the API.
        retries: Total number of attempts (must be >= 1).

    Returns:
        The text of the first choice. An empty string is returned when the
        API reports no text content, so callers always receive a ``str``.

    Raises:
        ValueError: If ``retries`` is less than 1 (previously this silently
            returned ``None`` without calling the API).
        Exception: The last API error is re-raised once all attempts fail.
    """
    if retries < 1:
        raise ValueError(f'retries must be >= 1, got {retries}')

    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=MODEL,
                max_tokens=max_tokens,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0  # greedy decoding for repeatable answers
            )
            # Fixed pause after every successful call to stay under rate limits.
            time.sleep(API_RATE_LIMIT_DELAY)
            content = response.choices[0].message.content
            # message.content is nullable in the API; normalise to '' so the
            # declared return type holds and downstream regex parsing cannot
            # fail with a TypeError on None.
            return content if content is not None else ''
        except Exception as e:
            print(f'API error (attempt {attempt+1}): {e}')
            if attempt < retries - 1:
                # Linear backoff: 1x, 2x, ... the base retry delay.
                time.sleep(API_RETRY_DELAY * (attempt + 1))
            else:
                raise

# Smoke test: one small round-trip to confirm the key and client work before
# the full run starts.
test_response = call_gpt4o(
    system_prompt="You output ONLY JSON.",
    user_prompt='Respond with exactly: {"test": "ok"}',
    max_tokens=50,
)
print(f'API test: {test_response}')

## 9. Experiment Prompts

In [None]:
# System message sent with every experiment request. It constrains the model
# to reply with a single JSON object of the form {"final": <integer>}, which
# is the format parse_model_answer looks for first.
SYSTEM_PROMPT = """You are a calculator that outputs ONLY JSON.

CRITICAL RULES:
1. Your output MUST start with '{'
2. Your output MUST be exactly: {"final": <number>}
3. Replace <number> with an integer
4. Do NOT write ANY explanation
5. ONLY output the JSON object
"""

def create_prompt_with_problem(problem: GSM8KProblem, trace_text: str) -> str:
    """Build the Condition C user prompt: problem statement plus trace.

    Only ``problem.question`` is read from *problem*.
    """
    sections = (
        f"Problem: {problem.question}",
        "",
        "Reasoning trace:",
        f"{trace_text}",
        "",
        "Based on the trace, what is the final numerical answer?",
        'OUTPUT ONLY: {"final": <number>}',
    )
    return "\n".join(sections)

def create_prompt_trace_only(trace_text: str) -> str:
    """Build the Conditions A & B user prompt: the trace alone, no problem."""
    sections = (
        "The following is a mathematical reasoning trace:",
        "",
        f"{trace_text}",
        "",
        "Extract the final numerical answer from this trace.",
        'OUTPUT ONLY: {"final": <number>}',
    )
    return "\n".join(sections)

def parse_model_answer(response: str) -> Optional[int]:
    """Extract the integer answer from a model reply.

    Patterns are tried in order of strictness:
      1. ``{"final": N}`` with double quotes,
      2. ``{'final': N}`` / ``{"final": N}`` with either quote style,
      3. a bare ``"final": N`` key-value pair,
      4. fallback: the last number that appears anywhere in the text.

    Args:
        response: Raw model output. ``None`` or an empty string is tolerated.

    Returns:
        The parsed integer, or ``None`` when the reply contains no number.
        Decimal values are rounded with Python's ``round`` (ties go to the
        nearest even integer).
    """
    # A reply with no text carries no answer; without this guard re.search
    # raises TypeError on None.
    if not response:
        return None

    def to_int(token: str) -> int:
        # Integer tokens are converted directly so large values keep full
        # precision; only decimal tokens go through float rounding.
        if '.' in token:
            return int(round(float(token)))
        return int(token)

    number = r'-?\d+(?:\.\d+)?'
    structured_patterns = (
        r'\{\s*"final"\s*:\s*(' + number + r')\s*\}',
        r"\{\s*[\"']final[\"']\s*:\s*(" + number + r")\s*\}",
        r'"final"\s*:\s*(' + number + r')',
    )
    for pattern in structured_patterns:
        match = re.search(pattern, response)
        if match:
            return to_int(match.group(1))

    # Fallback: last number in the text. The decimal part is matched as part
    # of the same token, so "12.0" yields 12 rather than the trailing "0".
    candidates = re.findall(number, response)
    if candidates:
        return to_int(candidates[-1])
    return None

## 10. Run Experiment

In [None]:
def run_single_experiment(problem: GSM8KProblem, trace: E7Trace) -> ExperimentResult:
    """Query the model once for *trace* and score the reply.

    The prompt includes the problem statement only when ``trace.has_problem``
    is set; otherwise the trace-only prompt is used. A reply counts as
    correct only if an integer could be parsed from it and that integer
    equals ``trace.correct_answer``.
    """
    user_prompt = (
        create_prompt_with_problem(problem, trace.full_text)
        if trace.has_problem
        else create_prompt_trace_only(trace.full_text)
    )

    raw_reply = call_gpt4o(SYSTEM_PROMPT, user_prompt, max_tokens=API_MAX_TOKENS_ANSWER)
    parsed_answer = parse_model_answer(raw_reply)
    # An unparseable reply (None) is always scored as incorrect.
    answered_correctly = parsed_answer is not None and parsed_answer == trace.correct_answer

    return ExperimentResult(
        problem_index=problem.index,
        model=MODEL,
        condition=trace.condition,
        has_problem=trace.has_problem,
        has_cue=trace.has_cue,
        correct_answer=trace.correct_answer,
        model_answer=parsed_answer,
        is_correct=answered_correctly,
        raw_output=raw_reply,
        timestamp=datetime.now().isoformat(),
    )

In [None]:
_run_banner = '=' * 70
print(_run_banner)
print('GPT-4o E7: TRACE-ONLY EXTRACTION TEST')
print(_run_banner)

all_results = []
all_traces = []

for trace in tqdm(clean_traces, desc='Running GPT-4o E7'):
    problem = prob_map.get(trace.problem_index)
    # Traces whose problem record is missing are skipped without a message.
    if problem is not None:
        correct_answer = problem.final_answer

        # Build all three condition traces before the first API call.
        trace_a = create_condition_a_trace(trace, correct_answer)
        trace_b = create_condition_b_trace(trace, correct_answer)
        trace_c = create_condition_c_trace(trace, correct_answer)

        # Query in A, B, C order. The three results are recorded together,
        # so a failure part-way through records none of them for this problem.
        all_results.extend(
            [run_single_experiment(problem, variant) for variant in (trace_a, trace_b, trace_c)]
        )
        all_traces.append(
            dict(
                problem_index=trace.problem_index,
                condition_a=asdict(trace_a),
                condition_b=asdict(trace_b),
                condition_c=asdict(trace_c),
            )
        )

print(f'\nTotal experiments: {len(all_results)}')

## 11. Save Results

In [None]:
# Persist the per-call results and the exact traces that were sent.
_serialized_results = list(map(asdict, all_results))
save_json(_serialized_results, f'{SAVE_DIR}/results/A2_GPT4o_E7_results.json')
save_json(all_traces, f'{SAVE_DIR}/results/A2_GPT4o_E7_traces.json')
print(f'Results saved to {SAVE_DIR}/results/')

## 12. Analysis

In [None]:
# One row per model call; columns are the ExperimentResult fields.
df = pd.DataFrame(list(map(asdict, all_results)))

_analysis_banner = '=' * 70
print(_analysis_banner)
print('GPT-4o E7 RESULTS BY CONDITION')
print(_analysis_banner)

for condition in ('trace_only_cue', 'trace_only_no_cue', 'control'):
    cond_df = df.loc[df['condition'] == condition]
    n = len(cond_df)
    acc = cond_df['is_correct'].mean()
    # The flags are constant within a condition, so the first row is enough.
    has_problem = cond_df['has_problem'].iloc[0]
    has_cue = cond_df['has_cue'].iloc[0]

    _report_lines = (
        f'\n{condition.upper()}:',
        f'  N = {n}',
        f'  Has Problem: {has_problem}, Has Cue: {has_cue}',
        f'  Accuracy: {acc*100:.2f}%',
    )
    for _line in _report_lines:
        print(_line)

In [None]:
# Paired 2x2 table: per problem, was the answer correct with the cue
# (Condition A) and without it (Condition B)?
print('\n' + '='*70)
print('CONTINGENCY: Condition A (cue) vs B (no cue)')
print('='*70)

a_results = dict(
    (res.problem_index, res.is_correct) for res in all_results if res.condition == 'trace_only_cue'
)
b_results = dict(
    (res.problem_index, res.is_correct) for res in all_results if res.condition == 'trace_only_no_cue'
)

# Tally (A correct?, B correct?) pairs. A problem with no Condition B result
# falls into none of the four cells.
_pair_counts = {(True, True): 0, (True, False): 0, (False, True): 0, (False, False): 0}
for _idx, _a_ok in a_results.items():
    if _idx in b_results:
        _pair_counts[(bool(_a_ok), bool(b_results[_idx]))] += 1

both_correct = _pair_counts[(True, True)]
only_a_correct = _pair_counts[(True, False)]
only_b_correct = _pair_counts[(False, True)]
both_wrong = _pair_counts[(False, False)]

print(f'Both correct: {both_correct}')
print(f'Only A (cue) correct: {only_a_correct}')
print(f'Only B (no cue) correct: {only_b_correct}')
print(f'Both wrong: {both_wrong}')
print(f'\nAsymmetry: {only_a_correct}:{only_b_correct}')

# Accuracy per condition, then the two differences of interest.
acc_a = df.loc[df['condition'] == 'trace_only_cue', 'is_correct'].mean()
acc_b = df.loc[df['condition'] == 'trace_only_no_cue', 'is_correct'].mean()
acc_c = df.loc[df['condition'] == 'control', 'is_correct'].mean()

print(f'\n*** KEY EFFECTS ***')
print(f'Cue Effect (A - B): {(acc_a - acc_b)*100:+.2f}pp')
print(f'Problem Effect (C - A): {(acc_c - acc_a)*100:+.2f}pp')

In [None]:
# Side-by-side table: hard-coded Claude E7 accuracies vs. this run's GPT-4o.
print('\n' + '='*70)
print('COMPARISON: GPT-4o vs Claude (E7)')
print('='*70)

# Claude results (from E7 v2 FIXED)
claude_e7 = {
    'A': 0.9849,  # Trace-only + cue
    'B': 0.8744,  # Trace-only + no cue
    'C': 0.9749   # Full context
}

gpt_e7 = dict(A=acc_a, B=acc_b, C=acc_c)

_rule = '-'*65
print(f'\n{"Condition":<30} {"Claude":>10} {"GPT-4o":>10} {"Diff":>10}')
print(_rule)

# One row per condition: Claude, GPT-4o, and GPT-4o minus Claude.
_condition_labels = (
    ('A', 'A: Trace-only + Cue'),
    ('B', 'B: Trace-only + No Cue'),
    ('C', 'C: Full Context'),
)
for _key, _label in _condition_labels:
    print(f'{_label:<30} {claude_e7[_key]*100:>9.1f}% {gpt_e7[_key]*100:>9.1f}% {(gpt_e7[_key]-claude_e7[_key])*100:>+9.1f}pp')
print(_rule)

# One row per effect: accuracy difference between two conditions, per model.
_effect_rows = (
    ('Cue Effect (A-B)', 'A', 'B'),
    ('Problem Effect (C-A)', 'C', 'A'),
)
for _label, _minuend, _subtrahend in _effect_rows:
    print(f'{_label:<30} {(claude_e7[_minuend]-claude_e7[_subtrahend])*100:>+9.1f}pp {(gpt_e7[_minuend]-gpt_e7[_subtrahend])*100:>+9.1f}pp')

## 13. Summary

In [None]:
# (summary key, description, condition label in df, has_problem, has_cue, accuracy)
_condition_rows = (
    ('condition_a', 'Trace-Only + Cue Present', 'trace_only_cue', False, True, acc_a),
    ('condition_b', 'Trace-Only + Cue Absent', 'trace_only_no_cue', False, False, acc_b),
    ('condition_c', 'Full Context (Control)', 'control', True, True, acc_c),
)

_cue_effect = acc_a - acc_b
_problem_effect = acc_c - acc_a

# numpy scalars are cast to plain Python types so json can serialize them.
summary = {
    'experiment': 'A2_GPT4o_E7_trace_only',
    'model': MODEL,
    'date': EXPERIMENT_DATE,
    'n_problems': len(clean_traces),
    'total_inferences': len(all_results),
    'results': {
        _key: {
            'description': _description,
            'has_problem': _has_problem,
            'has_cue': _has_cue,
            'accuracy': float(_accuracy),
            'n_correct': int(df.loc[df['condition'] == _label, 'is_correct'].sum()),
        }
        for _key, _description, _label, _has_problem, _has_cue, _accuracy in _condition_rows
    },
    'contingency_a_vs_b': {
        'both_correct': int(both_correct),
        'only_a_correct': int(only_a_correct),
        'only_b_correct': int(only_b_correct),
        'both_wrong': int(both_wrong),
        'asymmetry': f'{only_a_correct}:{only_b_correct}',
    },
    'effects': {
        'cue_effect': float(_cue_effect),
        'problem_effect': float(_problem_effect),
    },
    # Fixed decision thresholds: >85% accuracy in A, >10pp gap between A and B.
    'interpretation': {
        'extraction_confirmed': bool(acc_a > 0.85),
        'cue_critical': bool(_cue_effect > 0.10),
    },
}

save_json(summary, f'{SAVE_DIR}/results/A2_GPT4o_E7_summary.json')

_summary_banner = '=' * 70
print(_summary_banner)
print('GPT-4o E7 EXPERIMENT COMPLETE')
print(_summary_banner)
print(f'Model: {MODEL}')
print(f'Date: {EXPERIMENT_DATE}')
print(f'Total experiments: {len(all_results)}')
print(f'\nCondition A (Trace + Cue): {acc_a*100:.1f}%')
print(f'Condition B (Trace only): {acc_b*100:.1f}%')
print(f'Condition C (Full): {acc_c*100:.1f}%')
print(f'\nCue Effect: {_cue_effect*100:+.1f}pp')
print(f'Problem Effect: {_problem_effect*100:+.1f}pp')
print(f'\nFiles saved to: {SAVE_DIR}')
print(_summary_banner)