# A2-GPT4o: E1b Shuffle Experiment at c=0.4

**Paper**: A2 (Cue-Dominant Extraction)

**Purpose**: Replicate E1b (shuffle experiment at c=0.4) using GPT-4o to test model generality.

**Design**:
- Model: GPT-4o
- c = 0.4 (same as Claude E1b)
- L = 10
- Conditions: Original vs Shuffled

**Claude E1b results (for comparison)**:
- Original (c=0.4): 83.9%
- Shuffled (c=0.4): 91.5% (+7.5 pp, P=0.001)

**Expected inference count**: 398 (199 × 2 conditions)

**Date**: 2026-01-03

**GLOBAL_SEED**: 20251224

## 0. Google Drive Connection

In [None]:
# Mount Google Drive so the notebook can read the shared inputs and persist outputs.
from google.colab import drive
drive.mount('/content/drive')

import os
from datetime import datetime

# Run identity: the output folder is named after the experiment plus today's date (YYYYMMDD).
EXPERIMENT_NAME = 'A2_GPT4o_E1b_shuffle'
EXPERIMENT_DATE = datetime.now().strftime('%Y%m%d')

# Input location: the problem set and clean traces are loaded from V3_DATA_DIR.
BASE_DIR = '/content/drive/MyDrive/CoT_Experiment'
V3_DATA_DIR = f'{BASE_DIR}/full_experiment_v3_20251224'

# Output location; existing directories are reused rather than raising (exist_ok=True).
SAVE_DIR = f'{BASE_DIR}/{EXPERIMENT_NAME}_{EXPERIMENT_DATE}'
os.makedirs(SAVE_DIR, exist_ok=True)
os.makedirs(f'{SAVE_DIR}/results', exist_ok=True)

print(f'Experiment: {EXPERIMENT_NAME}')
print(f'V3 data directory: {V3_DATA_DIR}')
print(f'Save directory: {SAVE_DIR}')

## 1. Install Dependencies

In [None]:
# Notebook shell magic (not plain Python): install third-party packages; -q keeps pip quiet.
!pip install datasets openai matplotlib pandas tqdm scipy -q
print('Dependencies installed.')

## 2. Configuration

In [None]:
import hashlib
import random
import json
import re
import time
from typing import List, Dict, Tuple, Optional, Any
from dataclasses import dataclass, asdict, field
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy import stats

# =============================================================================
# Global Configuration
# =============================================================================
# Base seeds passed to derive_seed(): GLOBAL_SEED drives the corruption choices,
# SHUFFLE_SEED drives the step permutation of the shuffled condition.
GLOBAL_SEED = 20251224
SHUFFLE_SEED = 20250102

# Experiment parameters
L = 10  # trace length (number of steps)
C_TARGET = 0.4  # corruption fraction c

# Corruption type ratio
# NOTE(review): this dict is not read anywhere in the visible notebook;
# assign_corruption_types() hard-codes the same 1:2:2 split with floor division.
CORRUPTION_RATIO = {'IRR': 1, 'LOC': 2, 'WRONG': 2}

# API settings
API_MAX_TOKENS_ANSWER = 256  # max_tokens sent with each answer request
API_RETRY_DELAY = 1.0  # base wait in seconds before a retry (scaled by the attempt number)
API_RATE_LIMIT_DELAY = 0.5  # seconds slept after every successful API call

print('='*70)
print('A2-GPT4o: E1b SHUFFLE EXPERIMENT AT c=0.4')
print('='*70)
print(f'  Model: GPT-4o')
print(f'  GLOBAL_SEED: {GLOBAL_SEED}')
print(f'  SHUFFLE_SEED: {SHUFFLE_SEED}')
print(f'  L (trace length): {L}')
print(f'  c (corruption fraction): {C_TARGET}')
print(f'  K_corrupt = round(c * L) = {round(C_TARGET * L)}')
print(f'  K_clean = L - K_corrupt = {L - round(C_TARGET * L)}')
print('='*70)
# NOTE(review): with the rounded figures below, 91.5 - 83.9 is 7.6, not 7.5;
# presumably the +7.5 pp comes from unrounded accuracies -- confirm against the E1b output.
print('\nClaude E1b results (for comparison):')
print('  Original (c=0.4): 83.9%')
print('  Shuffled (c=0.4): 91.5%')
print('  Shuffle effect: +7.5 pp')

## 3. Data Structures

In [None]:
@dataclass
class GSM8KProblem:
    """One problem record, constructed from an entry of problems_v3.json."""
    index: int  # problem identifier; used as the key when looking up its clean trace
    question: str  # problem statement inserted into the user prompt
    answer_text: str  # presumably the reference solution text -- not read in this notebook
    final_answer: int  # gold answer that the parsed model answer is compared against

@dataclass
class CleanTrace:
    """An uncorrupted reasoning trace, constructed from an entry of clean_traces_I10_v3.json."""
    problem_index: int  # index of the problem this trace belongs to (key of trace_map)
    I: int  # number of steps; create_corrupted_trace uses it as the trace length L
    steps: List[str]  # step texts in original order; step numbers are their 1-based positions
    full_text: str  # presumably the pre-rendered trace text -- not read in this notebook

@dataclass
class CorruptedTrace:
    """A trace after corruption (and optional shuffling), as produced by create_corrupted_trace."""
    problem_index: int  # index of the source problem
    L: int  # trace length, taken from the clean trace's I
    c: float  # corruption fraction used to pick the number of corrupted steps
    original_order: List[int]  # [1, 2, ..., L]
    shuffled_order: List[int]  # original step number shown at each presented position
    corrupted_steps: List[int]  # sorted 1-based original step numbers that were corrupted
    corruption_types: Dict[int, str]  # original step number -> "IRR" | "LOC" | "WRONG"
    steps: List[str]  # step texts in presented order (after corruption and any shuffle)
    full_text: str  # rendered trace: [[COT_START]], "Step k: ..." lines, [[COT_END]]
    is_shuffled: bool  # flag set by create_corrupted_trace to mark the shuffled condition
    seed: int  # corruption seed the trace was generated from

@dataclass
class ExperimentResult:
    """Outcome of a single model inference, as built by run_experiment."""
    problem_index: int  # index of the problem that was asked
    condition: str  # condition label, e.g. 'original_c04' or 'shuffled_c04'
    model: str  # model name used for the request
    L: int  # trace length
    c: float  # corruption fraction
    K_clean: int  # L minus the rounded number of corrupted steps
    model_answer: Optional[int]  # parsed answer, or None when nothing could be parsed
    correct_answer: int  # gold answer copied from the problem
    is_correct: bool  # True only when a parsed answer equals the gold answer
    raw_output: str  # unparsed model response text
    timestamp: str  # ISO-format local time at which the result object was created

## 4. Utility Functions

In [None]:
def derive_seed(global_seed: int, problem_id: int, I: int, lam: float, replicate_id: int = 0) -> int:
    """Derive a deterministic seed from the experiment coordinates.

    The arguments are rendered into a ``|``-separated key, hashed with
    SHA-256, and the first eight hex digits of the digest are returned as
    an integer, i.e. a value in ``0 .. 2**32 - 1``.
    """
    parts = (
        f"{global_seed}",
        f"{problem_id}",
        f"I={I}",
        f"lam={lam}",
        f"rep={replicate_id}",
    )
    digest = hashlib.sha256("|".join(parts).encode("utf-8")).hexdigest()
    return int(digest[:8], 16)

def save_json(data: Any, filepath: str):
    """Write *data* to *filepath* as UTF-8 JSON, 2-space indented, non-ASCII kept verbatim."""
    with open(filepath, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=2, ensure_ascii=False)

def load_json(filepath: str) -> Any:
    """Read the UTF-8 JSON file at *filepath* and return the decoded object."""
    with open(filepath, mode='r', encoding='utf-8') as source:
        payload = json.loads(source.read())
    return payload

## 5. Load Existing Data

In [None]:
# Load the problem set; each JSON entry must carry exactly the GSM8KProblem fields.
problems_path = f'{V3_DATA_DIR}/problems_v3.json'
problems_data = load_json(problems_path)
problems = [GSM8KProblem(**p) for p in problems_data]
print(f'Loaded {len(problems)} problems')

# Load the clean traces and index them by problem for lookup in the main loop.
traces_path = f'{V3_DATA_DIR}/clean_traces/clean_traces_I10_v3.json'
traces_data = load_json(traces_path)
clean_traces = [CleanTrace(**t) for t in traces_data]
trace_map = {t.problem_index: t for t in clean_traces}
print(f'Loaded {len(clean_traces)} clean traces (L=10)')

# NOTE(review): prob_map is not referenced elsewhere in the visible notebook.
prob_map = {p.index: p for p in problems}

## 6. Corruption Logic (Same as E1)

In [None]:
def pick_corrupted_steps(L: int, c: float, seed: int) -> List[int]:
    """Choose which 1-based step numbers of an L-step trace get corrupted.

    ``round(c * L)`` steps are taken from the front of a seeded shuffle of
    ``1..L`` and returned in ascending order. Returns an empty list when
    the rounded count is zero.
    """
    n_corrupt = int(round(c * L))
    if n_corrupt == 0:
        return []
    candidates = list(range(1, L + 1))
    # Same seed -> same permutation -> same selection.
    random.Random(seed).shuffle(candidates)
    chosen = candidates[:n_corrupt]
    chosen.sort()
    return chosen

def assign_corruption_types(corrupted_steps: List[int], seed: int) -> Dict[int, str]:
    """Assign a corruption type to every corrupted step, deterministically.

    With ``K = len(corrupted_steps)``, the counts follow an
    IRR:LOC:WRONG = 1:2:2 split using floor division: ``K // 5`` steps
    become "IRR", ``(2 * K) // 5`` become "LOC", and all remaining steps
    become "WRONG". For example, K=4 yields 0 IRR, 1 LOC and 3 WRONG.

    Args:
        corrupted_steps: step numbers to label; the list is not modified.
        seed: corruption seed; the permutation uses ``seed + 1``.

    Returns:
        Mapping of step number to "IRR", "LOC" or "WRONG". Keys are in the
        order of the seeded permutation. Empty when there are no steps.
    """
    K = len(corrupted_steps)
    if K == 0:
        return {}

    n_irr = K // 5
    n_loc = (2 * K) // 5
    # No "force at least one WRONG" fix-up is needed: for K > 0,
    # K // 5 + (2 * K) // 5 <= 3 * K / 5 < K, so the remainder is always >= 1.
    n_wrong = K - n_irr - n_loc

    rng = random.Random(seed + 1)
    perm = list(corrupted_steps)
    rng.shuffle(perm)

    # Label the permuted steps in blocks: first IRR, then LOC, then WRONG.
    labels = ["IRR"] * n_irr + ["LOC"] * n_loc + ["WRONG"] * n_wrong
    return dict(zip(perm, labels))

## 7. Corruption Templates (Same as E1)

In [None]:
# Templates for "IRR" corruptions: a computation that is stated to be unused.
IRRELEVANT_TEMPLATES = [
    "Compute an auxiliary value: aux = {a} + {b} = {result}, but it will not be used later.",
    "Compute a side quantity: aux = {a} * 2 = {result}, unrelated to the final result.",
    "Note that we can also compute aux = {a} - {b} = {result}, though this is not needed.",
]

# Templates for "WRONG" corruptions: an arbitrary value imposed as a constraint.
WRONG_CONSTRAINT_TEMPLATES = [
    "Fix an intermediate condition: set {var} = {wrong_value} as a given constraint for the rest of the steps.",
    "Assume the total is {var} = {wrong_value} and proceed using this fixed value.",
]

def generate_irrelevant_step(step_num: int, seed: int) -> str:
    """Render one of IRRELEVANT_TEMPLATES with seeded random operands.

    The stated result is arithmetically consistent with the chosen
    template (sum, doubling, or difference). ``step_num`` is accepted but
    does not influence the output.
    """
    rng = random.Random(seed)
    # Draw order (a, b, template) fixes the output for a given seed.
    a, b = rng.randint(2, 20), rng.randint(2, 20)
    template = rng.choice(IRRELEVANT_TEMPLATES)
    # The operator appearing in the template decides how the result is computed.
    for symbol, value in (('+', a + b), ('*', a * 2)):
        if symbol in template:
            return template.format(a=a, b=b, result=value)
    return template.format(a=a, b=b, result=a - b)

def generate_local_error_step(original_step: str, seed: int) -> str:
    """Return *original_step* with its trailing number nudged by a small seeded offset.

    The last run of digits in the step is shifted by one of
    -3, -2, -1, 1, 2, 3 (clamped at 0). The shifted value replaces a
    trailing ``= N.`` if present, otherwise a trailing ``N.``. A step
    with no digits at all is replaced by a canned "10 * 3" computation
    with a seeded result between 28 and 32.

    NOTE(review): the step comes back unchanged, although it is labeled
    as corrupted, when it does not end in ``<digits>.`` or when the
    clamped value equals the original (original 0 with a negative
    offset). Presumably kept for parity with E1 -- confirm before fixing.
    """
    rng = random.Random(seed)
    digit_runs = re.findall(r'\d+', original_step)
    if not digit_runs:
        return f"Compute t = 10 * 3 = {rng.randint(28, 32)} (using the previous values)."

    offset = rng.choice([-3, -2, -1, 1, 2, 3])
    wrong_result = max(0, int(digit_runs[-1]) + offset)

    # Try the stricter "= N." rewrite first, then any trailing "N.".
    attempts = (
        (r'= (\d+)\.$', f'= {wrong_result}.'),
        (r'(\d+)\.$', f'{wrong_result}.'),
    )
    candidate = original_step
    for pattern, replacement in attempts:
        candidate = re.sub(pattern, replacement, original_step)
        if candidate != original_step:
            break
    return candidate

def generate_wrong_constraint_step(step_num: int, seed: int) -> str:
    """Render one of WRONG_CONSTRAINT_TEMPLATES with a seeded variable name and value.

    The variable is one of 'x', 'total', 'result', 'n' and the value is an
    integer from 10 to 100. ``step_num`` is accepted but does not
    influence the output.
    """
    rng = random.Random(seed)
    # Draw order (variable, value, template) fixes the output for a given seed;
    # dict entries are evaluated top to bottom.
    fill = {
        'var': rng.choice(['x', 'total', 'result', 'n']),
        'wrong_value': rng.randint(10, 100),
    }
    chosen_template = rng.choice(WRONG_CONSTRAINT_TEMPLATES)
    return chosen_template.format(**fill)

## 8. Trace Generation (Original & Shuffled)

In [None]:
def create_corrupted_trace(
    clean_trace: CleanTrace,
    c: float,
    corruption_seed: int,
    shuffle: bool = False,
    shuffle_seed: Optional[int] = None
) -> CorruptedTrace:
    """Create a corrupted trace, optionally presenting the steps in shuffled order.

    Corruption is decided and applied on the original step order first, so
    the original and shuffled conditions built from the same
    ``corruption_seed`` contain exactly the same step texts. Shuffling
    only permutes them, and the steps are renumbered ``Step 1..L`` in
    presentation order.

    Args:
        clean_trace: source trace; its ``I`` is used as the trace length L.
        c: corruption fraction; ``round(c * L)`` steps are corrupted.
        corruption_seed: seed for selecting, typing and generating corruptions.
        shuffle: request the shuffled presentation.
        shuffle_seed: seed for the permutation. When it is None no shuffle
            is applied, even if ``shuffle`` is True.

    Returns:
        A CorruptedTrace. ``corrupted_steps`` and ``corruption_types`` use
        original step numbers; ``shuffled_order`` gives the original step
        number shown at each presented position. ``is_shuffled`` is True
        only when a permutation was actually applied.
    """
    L = clean_trace.I

    # Step 1: Determine which steps to corrupt
    corrupted_steps = pick_corrupted_steps(L, c, corruption_seed)
    corruption_types = assign_corruption_types(corrupted_steps, corruption_seed)

    # Step 2: Apply corruption to get step contents
    step_contents = []
    for step_num, step_content in enumerate(clean_trace.steps, start=1):
        ctype = corruption_types.get(step_num)
        if ctype is None:
            step_contents.append(step_content)
            continue
        # Per-step seed so every corrupted step gets its own deterministic draw.
        step_seed = corruption_seed + step_num * 1000
        if ctype == 'IRR':
            new_content = generate_irrelevant_step(step_num, step_seed)
        elif ctype == 'LOC':
            new_content = generate_local_error_step(step_content, step_seed)
        else:
            new_content = generate_wrong_constraint_step(step_num, step_seed)
        step_contents.append(new_content)

    # Step 3: Shuffle if requested
    original_order = list(range(1, L + 1))
    # Without a seed the shuffle is skipped, so record what actually happened
    # instead of echoing the request; otherwise the saved metadata would claim
    # a shuffled trace whose order is in fact the original one.
    apply_shuffle = bool(shuffle) and shuffle_seed is not None
    if apply_shuffle:
        shuffled_indices = list(range(L))
        random.Random(shuffle_seed).shuffle(shuffled_indices)
        shuffled_order = [original_order[i] for i in shuffled_indices]
        final_contents = [step_contents[i] for i in shuffled_indices]
    else:
        shuffled_order = original_order[:]
        final_contents = step_contents

    # Step 4: Build full text
    lines = ['[[COT_START]]']
    lines.extend(
        f'Step {position}: {content}'
        for position, content in enumerate(final_contents, start=1)
    )
    lines.append('[[COT_END]]')
    full_text = '\n'.join(lines)

    return CorruptedTrace(
        problem_index=clean_trace.problem_index,
        L=L,
        c=c,
        original_order=original_order,
        shuffled_order=shuffled_order,
        corrupted_steps=corrupted_steps,
        corruption_types=corruption_types,
        steps=final_contents,
        full_text=full_text,
        is_shuffled=apply_shuffle,
        seed=corruption_seed
    )

In [None]:
# Test trace generation
# Smoke test on the first clean trace: build both conditions from the same
# corruption seed and print the corruption layout and the two step orders.
test_trace = clean_traces[0]
test_corruption_seed = derive_seed(GLOBAL_SEED, test_trace.problem_index, L, C_TARGET)
test_shuffle_seed = derive_seed(SHUFFLE_SEED, test_trace.problem_index, L, C_TARGET)

original = create_corrupted_trace(test_trace, C_TARGET, test_corruption_seed, shuffle=False)
shuffled = create_corrupted_trace(test_trace, C_TARGET, test_corruption_seed, shuffle=True, shuffle_seed=test_shuffle_seed)

print('Test Trace Generation (c=0.4):')
print(f'  Corrupted steps: {original.corrupted_steps}')
print(f'  Corruption types: {original.corruption_types}')
print(f'  K_corrupt: {len(original.corrupted_steps)}')
print(f'  K_clean: {L - len(original.corrupted_steps)}')
# For the unshuffled trace, shuffled_order is just a copy of 1..L.
print(f'\n  Original order: {original.shuffled_order}')
print(f'  Shuffled order: {shuffled.shuffled_order}')

## 9. API Setup

In [None]:
from getpass import getpass

# Ask for the key interactively so it is not hard-coded in the notebook.
OPENAI_API_KEY = getpass('Enter OpenAI API Key: ')
print('API Key set.')

In [None]:
from openai import OpenAI

# Module-level client and model name; call_gpt4o and run_experiment read these globals.
client = OpenAI(api_key=OPENAI_API_KEY)
MODEL = 'gpt-4o'

def call_gpt4o(system_prompt: str, user_prompt: str, max_tokens: int = 1024, retries: int = 3) -> str:
    """Send one chat-completion request at temperature 0 and return the reply text.

    Args:
        system_prompt: content of the system message.
        user_prompt: content of the user message.
        max_tokens: completion token limit passed to the API.
        retries: total number of attempts. With ``retries < 1`` no request
            is made and the function falls through, returning None.

    Returns:
        The text of the first choice, or '' when the API returns no text
        content for it.

    Raises:
        Exception: whatever the client raised on the final failed attempt.
            Earlier failures are printed and retried after a wait of
            ``API_RETRY_DELAY * attempt_number`` seconds.
    """
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=MODEL,
                max_tokens=max_tokens,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0
            )
            # Fixed pause after each successful call to stay under the rate limit.
            time.sleep(API_RATE_LIMIT_DELAY)
            # The message content is nullable in the API response. Normalise
            # None to '' so the declared str return type holds and regex-based
            # parsing downstream does not fail with a TypeError.
            return response.choices[0].message.content or ''
        except Exception as e:
            print(f'API error (attempt {attempt+1}): {e}')
            if attempt < retries - 1:
                time.sleep(API_RETRY_DELAY * (attempt + 1))
            else:
                raise

# One real API request to confirm the key and model work before the full run.
test_response = call_gpt4o(
    "You output ONLY JSON.",
    'Respond with exactly: {"test": "ok"}',
    max_tokens=50
)
print(f'API test: {test_response}')

## 10. Experiment Prompts

In [None]:
# System prompt sent with every inference: constrains the reply to a bare
# {"final": <number>} JSON object, which parse_model_answer then extracts.
EXPERIMENT_SYSTEM_PROMPT = """You are a calculator that outputs ONLY JSON.

CRITICAL RULES:
1. Your output MUST start with the character '{'
2. Your output MUST be exactly: {"final": <number>}
3. Replace <number> with an integer (the numerical answer)
4. Do NOT write ANY explanation, reasoning, or text before or after the JSON
5. Do NOT write "I need to" or "Let me" or any other words
6. ONLY output the JSON object, nothing else

CORRECT OUTPUT EXAMPLE:
{"final": 42}
"""

def create_experiment_prompt(problem: GSM8KProblem, cot_text: str) -> Tuple[str, str]:
    """Build the (system, user) prompt pair for one inference.

    The user message contains the problem question, the reasoning trace
    ``cot_text`` (which the model is told to treat as given facts), and a
    repeat of the JSON-only output instruction. The system message is
    always EXPERIMENT_SYSTEM_PROMPT.
    """
    user = f"""Problem: {problem.question}

Reasoning trace (use these steps as given facts):
{cot_text}

Based on the trace above, compute the final numerical answer.
OUTPUT ONLY: {{"final": <number>}}
START YOUR RESPONSE WITH '{{'"""
    return EXPERIMENT_SYSTEM_PROMPT, user

def parse_model_answer(response: Optional[str]) -> Optional[int]:
    """Extract the integer answer from a model response.

    Patterns are tried from strictest to loosest and the first hit wins:

    1. a braced object with a double-quoted key, ``{"final": N}``;
    2. the same with either quote style, ``{'final': N}``;
    3. an unbraced ``"final": N`` pair;
    4. as a last resort, the last free-standing number in the text.

    Decimal values are rounded to the nearest integer with ``round``.

    Args:
        response: raw model output; None or '' is treated as "no answer".

    Returns:
        The parsed integer, or None when no number can be found.
    """
    # A missing response carries no answer; re.search would raise TypeError on None.
    if not response:
        return None

    keyed_patterns = (
        r'\{\s*"final"\s*:\s*(-?\d+(?:\.\d+)?)\s*\}',
        r"\{\s*[\"']final[\"']\s*:\s*(-?\d+(?:\.\d+)?)\s*\}",
        r'"final"\s*:\s*(-?\d+(?:\.\d+)?)',
    )
    for pattern in keyed_patterns:
        match = re.search(pattern, response)
        if match:
            return int(round(float(match.group(1))))

    # Fallback: numbers delimited by whitespace or line edges, optionally
    # followed by '.' or ','; the last one is taken as the answer.
    loose_numbers = re.findall(r'(?:^|\s)(-?\d+(?:\.\d+)?)(?:\s|$|\.|,)', response)
    if loose_numbers:
        return int(round(float(loose_numbers[-1])))
    return None

## 11. Run Experiment

In [None]:
def run_experiment(
    problem: GSM8KProblem,
    cot_text: str,
    condition: str,
    L: int,
    c: float
) -> ExperimentResult:
    """Run one inference for *problem* with the given trace and score it.

    Builds the prompts, calls the model with the answer token limit,
    parses the reply, and packages everything into an ExperimentResult.
    An unparseable reply is scored as incorrect with ``model_answer=None``.
    """
    system_msg, user_msg = create_experiment_prompt(problem, cot_text)
    raw_reply = call_gpt4o(system_msg, user_msg, max_tokens=API_MAX_TOKENS_ANSWER)

    parsed = parse_model_answer(raw_reply)
    if parsed is None:
        answered_correctly = False
    else:
        answered_correctly = (parsed == problem.final_answer)

    # Same rounding as the corruption step count, so K_clean matches the trace.
    n_corrupt = int(round(c * L))

    return ExperimentResult(
        problem_index=problem.index,
        condition=condition,
        model=MODEL,
        L=L,
        c=c,
        K_clean=L - n_corrupt,
        model_answer=parsed,
        correct_answer=problem.final_answer,
        is_correct=answered_correctly,
        raw_output=raw_reply,
        timestamp=datetime.now().isoformat()
    )

In [None]:
print('='*70)
print('A2-GPT4o: E1b SHUFFLE EXPERIMENT (c=0.4)')
print('='*70)
print(f'Model: {MODEL}')
print(f'Conditions: Original, Shuffled')
# Upper bound: problems without a clean trace are skipped in the loop below.
print(f'Expected inferences: {len(problems) * 2}')
print('='*70)

results_original = []
results_shuffled = []
traces_log = []

for prob in tqdm(problems, desc='E1b GPT-4o (Original + Shuffled)'):
    # Skip problems that have no clean trace.
    if prob.index not in trace_map:
        continue

    clean_trace = trace_map[prob.index]

    # Generate seeds
    # Both conditions share the corruption seed, so they differ only in step order.
    corruption_seed = derive_seed(GLOBAL_SEED, prob.index, L, C_TARGET)
    shuffle_seed = derive_seed(SHUFFLE_SEED, prob.index, L, C_TARGET)

    # Condition 1: Original order
    original_trace = create_corrupted_trace(clean_trace, C_TARGET, corruption_seed, shuffle=False)
    result_original = run_experiment(prob, original_trace.full_text, 'original_c04', L, C_TARGET)

    # Condition 2: Shuffled order
    shuffled_trace = create_corrupted_trace(clean_trace, C_TARGET, corruption_seed, shuffle=True, shuffle_seed=shuffle_seed)
    result_shuffled = run_experiment(prob, shuffled_trace.full_text, 'shuffled_c04', L, C_TARGET)

    # Record the pair only after both calls succeeded. The analysis pairs the
    # two lists by position, so a failure in the second call must not leave
    # results_original one entry longer than results_shuffled.
    results_original.append(result_original)
    results_shuffled.append(result_shuffled)

    # Log traces
    traces_log.append({
        'problem_index': prob.index,
        'original': asdict(original_trace),
        'shuffled': asdict(shuffled_trace)
    })

print(f'\nCompleted: {len(results_original) + len(results_shuffled)} experiments')

## 12. Save Results

In [None]:
# Persist all per-inference results (original condition first, then shuffled).
all_results = results_original + results_shuffled
save_json([asdict(r) for r in all_results], f'{SAVE_DIR}/results/A2_GPT4o_E1b_results.json')
print(f'Results saved: {SAVE_DIR}/results/A2_GPT4o_E1b_results.json')

# Persist the exact traces that were shown to the model, one entry per problem.
save_json(traces_log, f'{SAVE_DIR}/results/A2_GPT4o_E1b_traces.json')
print(f'Traces saved: {SAVE_DIR}/results/A2_GPT4o_E1b_traces.json')

## 13. Analysis

In [None]:
# One DataFrame per condition; accuracy is the mean of the boolean is_correct column.
df_original = pd.DataFrame([asdict(r) for r in results_original])
df_shuffled = pd.DataFrame([asdict(r) for r in results_shuffled])

original_acc = df_original['is_correct'].mean()
shuffled_acc = df_shuffled['is_correct'].mean()

print('='*70)
print('A2-GPT4o E1b RESULTS (c=0.4)')
print('='*70)
print(f'Original (c=0.4, L=10):  {original_acc:.1%} ({df_original["is_correct"].sum()}/{len(df_original)})')
print(f'Shuffled (c=0.4, L=10):  {shuffled_acc:.1%} ({df_shuffled["is_correct"].sum()}/{len(df_shuffled)})')
# Shuffle effect in percentage points: shuffled minus original.
print(f'Shuffle effect:          {(shuffled_acc - original_acc)*100:+.1f} pp')
print('='*70)

In [None]:
# McNemar's test
# Paired comparison of the two conditions on the same problems, using the
# continuity-corrected chi-square statistic on the discordant pairs.
if len(results_original) != len(results_shuffled):
    raise ValueError(
        'Paired analysis needs one original and one shuffled result per problem: '
        f'{len(results_original)} vs {len(results_shuffled)}'
    )

N = len(results_original)
# Entry i of each list belongs to the same problem (both are appended together).
paired_outcomes = [
    (orig.is_correct, shuf.is_correct)
    for orig, shuf in zip(results_original, results_shuffled)
]
both_correct = sum(1 for orig_ok, shuf_ok in paired_outcomes if orig_ok and shuf_ok)
orig_only = sum(1 for orig_ok, shuf_ok in paired_outcomes if orig_ok and not shuf_ok)
shuf_only = sum(1 for orig_ok, shuf_ok in paired_outcomes if not orig_ok and shuf_ok)
both_wrong = sum(1 for orig_ok, shuf_ok in paired_outcomes if not orig_ok and not shuf_ok)

print('\nContingency Table:')
print('                    Shuffled')
print('                 Correct  Wrong')
print(f'Original Correct   {both_correct:3d}     {orig_only:3d}')
print(f'         Wrong     {shuf_only:3d}     {both_wrong:3d}')

if orig_only + shuf_only > 0:
    chi2 = (abs(orig_only - shuf_only) - 1)**2 / (orig_only + shuf_only)
    # Survival function rather than 1 - cdf: same quantity, but it does not
    # lose precision to cancellation when the p-value is very small.
    p_value = stats.chi2.sf(chi2, 1)
    print(f'\nMcNemar χ² = {chi2:.2f}, P = {p_value:.4f}')
else:
    # The statistic is undefined without discordant pairs.
    chi2 = None
    p_value = None
    print('\nNo discordant pairs for McNemar test')

In [None]:
# Comparison with Claude
# Reference accuracies (percent) from the earlier Claude E1b run, hard-coded here.
# NOTE(review): the effect printed below is computed from these rounded values
# (91.5 - 83.9 = 7.6), while the notebook header and config banner quote +7.5 pp
# -- presumably from unrounded accuracies; confirm which figure to report.
claude_original = 83.9
claude_shuffled = 91.5

print('\n' + '='*70)
print('COMPARISON: Claude vs GPT-4o (E1b, c=0.4)')
print('='*70)
print(f'{"Condition":<20} {"Claude":>12} {"GPT-4o":>12}')
print('-'*50)
print(f'{"Original":<20} {claude_original:>11.1f}% {original_acc*100:>11.1f}%')
print(f'{"Shuffled":<20} {claude_shuffled:>11.1f}% {shuffled_acc*100:>11.1f}%')
print(f'{"Shuffle effect":<20} {claude_shuffled-claude_original:>+11.1f}pp {(shuffled_acc-original_acc)*100:>+11.1f}pp')
print('='*70)

# NOTE(review): the branch below looks only at the sign of the accuracy
# difference; it does not consult the McNemar p-value computed earlier.
print('\nINTERPRETATION:')
if shuffled_acc >= original_acc:
    print('✓ GPT-4o: Shuffling does NOT reduce accuracy')
    print('✓ Order invariance confirmed for GPT-4o')
    print('→ Cue-dominant extraction generalizes across models')
else:
    print('△ GPT-4o: Shuffling reduces accuracy')
    print('△ Model-specific processing strategies may exist')

## 14. Summary

In [None]:
# Machine-readable summary of the run, assembled from values computed in the
# analysis cells above (N, accuracies, McNemar statistic, contingency counts).
summary = {
    'experiment': 'A2_GPT4o_E1b_shuffle',
    'model': MODEL,
    'date': EXPERIMENT_DATE,
    'n_problems': N,
    'c': C_TARGET,
    'L': L,
    'results': {
        'original_accuracy': original_acc,
        'shuffled_accuracy': shuffled_acc,
        'shuffle_effect_pp': (shuffled_acc - original_acc) * 100
    },
    # Both entries are None when there were no discordant pairs.
    'mcnemar': {
        'chi2': chi2,
        'p_value': p_value
    },
    'contingency': {
        'both_correct': both_correct,
        'original_only': orig_only,
        'shuffled_only': shuf_only,
        'both_wrong': both_wrong
    },
    # Claude reference values are in percent; 'effect' is their difference.
    'comparison_claude': {
        'original': claude_original,
        'shuffled': claude_shuffled,
        'effect': claude_shuffled - claude_original
    }
}

save_json(summary, f'{SAVE_DIR}/results/A2_GPT4o_E1b_summary.json')

print('='*70)
print('A2-GPT4o E1b EXPERIMENT COMPLETE')
print('='*70)
print(f'Date: {EXPERIMENT_DATE}')
print(f'Model: {MODEL}')
print(f'Total experiments: {len(all_results)}')
print(f'\nResults:')
print(f'  Original: {original_acc:.1%}')
print(f'  Shuffled: {shuffled_acc:.1%}')
print(f'  Effect:   {(shuffled_acc-original_acc)*100:+.1f} pp')
print(f'\nFiles saved to: {SAVE_DIR}')
print('='*70)