# CoT Phase Transition Experiment - ChatGPT Replication

**Version**: 1.0 (2025-12-25)

**Purpose**: Validate generalizability of A* threshold across models

**Design**:
- Same 200 problems as Claude experiment (GLOBAL_SEED=20251224)
- Same clean traces (I=10)
- Same corruption protocol
- λ ∈ {0.0, 0.2, 0.4, 0.6, 0.8, 1.0} + Direct condition

**Goal**: Estimate A* for GPT-4o (the configured `API_MODEL`) and compare with Claude's A* ≈ 0.55

## 0. Google Drive Connection

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

import os
from datetime import datetime

# Run identifiers; the date stamp is taken when this cell executes.
EXPERIMENT_VERSION = 'chatgpt_v1'
EXPERIMENT_DATE = datetime.now().strftime('%Y%m%d')

# SAVE_DIR is the shared experiment root; SAVE_DIR_GPT is this run's dated folder.
SAVE_DIR = '/content/drive/MyDrive/CoT_Experiment'
SAVE_DIR_GPT = f'{SAVE_DIR}/chatgpt_experiment_{EXPERIMENT_DATE}'

# Create the run folder, then its results/ and checkpoints/ sub-folders
# (existing folders are left untouched).
for _suffix in ('', '/results', '/checkpoints'):
    os.makedirs(f'{SAVE_DIR_GPT}{_suffix}', exist_ok=True)

print(f'ChatGPT experiment save directory: {SAVE_DIR_GPT}')

## 1. Install Dependencies

In [None]:
# Install third-party packages used by the cells below (-q = quiet output).
# NOTE(review): numpy and scipy are imported later but not installed here —
# presumably they are preinstalled in the runtime; confirm.
!pip install datasets openai pandas tqdm matplotlib -q
print('Dependencies installed.')

## 2. Configuration

In [None]:
import json
import re
import random
import time
import hashlib
from typing import List, Dict, Optional, Any
from dataclasses import dataclass, asdict
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np

# =============================================================================
# Configuration - MUST MATCH CLAUDE EXPERIMENT
# =============================================================================
# Seed that drives problem selection and every per-problem corruption seed.
GLOBAL_SEED = 20251224  # Same as Claude experiment
N_PROBLEMS = 200

# Experimental conditions: fixed trace depth and the corruption-rate grid.
I_FIXED = 10  # Same trace depth as Claude
LAMBDA_VALUES = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

# API settings
API_MODEL = 'gpt-4o'  # or 'gpt-4-turbo'
API_MAX_TOKENS = 256
API_RATE_LIMIT_DELAY = 0.3  # seconds slept after each successful call
CHECKPOINT_EVERY = 50  # save a CoT checkpoint every this many results

# Echo the configuration; one inference per problem per λ plus the direct run.
_RULE = '=' * 60
_CONFIG_ROWS = (
    ('GLOBAL_SEED', GLOBAL_SEED),
    ('N_PROBLEMS', N_PROBLEMS),
    ('I (fixed)', I_FIXED),
    ('λ values', LAMBDA_VALUES),
    ('Model', API_MODEL),
    ('Total inferences', N_PROBLEMS * (len(LAMBDA_VALUES) + 1)),
)

print(_RULE)
print('CHATGPT REPLICATION EXPERIMENT')
print(_RULE)
for _label, _value in _CONFIG_ROWS:
    print(f'  {_label}: {_value}')
print(_RULE)

## 3. Data Structures & Utilities

In [None]:
@dataclass
class GSM8KProblem:
    """One GSM8K problem selected for the experiment."""
    index: int  # position of the problem in the loaded dataset split
    question: str  # the dataset item's 'question' field
    answer_text: str  # the dataset item's full 'answer' field
    final_answer: int  # integer parsed from the '####' marker in answer_text

@dataclass
class ExperimentResult:
    """Outcome of a single model call, serialised to JSON via asdict()."""
    problem_index: int  # GSM8KProblem.index of the problem asked
    condition: str  # 'direct' or 'cot'
    I: Optional[int]  # trace depth (I_FIXED) for 'cot'; None for 'direct'
    lam: Optional[float]  # corruption rate λ for 'cot'; None for 'direct'
    A_target: Optional[float]  # 1.0 - lam for 'cot'; None for 'direct'
    model_answer: Optional[int]  # parsed answer, or None if nothing could be parsed
    correct_answer: int  # the problem's reference final answer
    is_correct: bool  # True when model_answer equals correct_answer
    raw_output: str  # unparsed text returned by the model
    timestamp: str  # ISO-format local time when the result was recorded
    model: str  # API model name used for the call

def extract_final_answer(answer_text: str) -> int:
    """Return the integer that follows the '####' marker in a GSM8K answer.

    Thousands separators (commas) are stripped before conversion. Only an
    unsigned run of digits/commas directly after the marker is recognised.

    Raises:
        ValueError: if no '#### <digits>' marker is present. The message
            includes the tail of ``answer_text`` so the offending record can
            be identified.
    """
    match = re.search(r'####\s*([\d,]+)', answer_text)
    if match:
        return int(match.group(1).replace(',', ''))
    # Show only the tail: the marker is expected at the end of the answer.
    raise ValueError(f'Could not extract final answer from: {answer_text[-80:]!r}')

def save_json(data: Any, filepath: str):
    """Write ``data`` to ``filepath`` as UTF-8 JSON (2-space indent) and log the path."""
    with open(filepath, 'w', encoding='utf-8') as handle:
        # ensure_ascii=False keeps non-ASCII characters (e.g. 'λ') readable.
        handle.write(json.dumps(data, ensure_ascii=False, indent=2))
    print(f'Saved: {filepath}')

def load_json(filepath: str) -> Any:
    """Read ``filepath`` as UTF-8 text and return the decoded JSON value."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return json.loads(text)

def derive_seed(global_seed: int, problem_id: int, I: int, lam: float) -> int:
    """Derive a deterministic 32-bit seed for one (problem, I, λ) cell.

    The four inputs are joined into a text key, hashed with SHA-256, and the
    first four digest bytes are read as a big-endian unsigned integer.
    """
    key = f"{global_seed}|{problem_id}|I={I}|lam={lam}"
    digest = hashlib.sha256(key.encode("utf-8")).digest()
    # First 4 bytes, big-endian == first 8 hex characters parsed base-16.
    return int.from_bytes(digest[:4], "big")

## 4. Load GSM8K and Problems

In [None]:
from datasets import load_dataset

# Load the 'test' split of the GSM8K 'main' configuration; problems are later
# addressed by their integer position in this split.
dataset = load_dataset('gsm8k', 'main', split='test')
print(f'GSM8K test set loaded: {len(dataset)} problems')

def select_problems(dataset, n_problems: int, seed: int) -> List[int]:
    """Pick ``n_problems`` dataset positions reproducibly.

    All positions ``0..len(dataset)-1`` are shuffled with a private RNG seeded
    by ``seed``; the first ``n_problems`` of the shuffle are returned in
    ascending order.
    """
    shuffled = [*range(len(dataset))]
    random.Random(seed).shuffle(shuffled)
    chosen = shuffled[:n_problems]
    chosen.sort()
    return chosen

# Same seed → Same problems as Claude experiment
selected_indices = select_problems(dataset, N_PROBLEMS, GLOBAL_SEED)
print(f'Selected {len(selected_indices)} problems (identical to Claude experiment)')

# Build GSM8KProblem records. A problem whose reference answer cannot be
# parsed is still skipped (best effort), but it is now recorded and reported
# so that a shrinking problem set does not go unnoticed.
problems = []
skipped_indices = []
for idx in selected_indices:
    item = dataset[idx]
    try:
        final_ans = extract_final_answer(item['answer'])
    except ValueError:
        skipped_indices.append(idx)
        continue
    problems.append(
        GSM8KProblem(
            index=idx,
            question=item['question'],
            answer_text=item['answer'],
            final_answer=final_ans,
        )
    )

if skipped_indices:
    print(f'WARNING: skipped {len(skipped_indices)} problems with no parseable '
          f'final answer: {skipped_indices}')
print(f'Prepared {len(problems)} problems')

# Lookup from dataset index to problem record.
prob_map = {p.index: p for p in problems}

## 5. Load Clean Traces from Claude Experiment

In [None]:
# Candidate locations of the Claude run's clean traces, in priority order.
possible_paths = [
    f'{SAVE_DIR}/full_experiment_v3_20251224/clean_traces/clean_traces_I10_v3.json',
    f'{SAVE_DIR}/clean_traces_I10_v3.json',
    f'{SAVE_DIR}/pilot_v2/clean_traces_I10.json',
]

# Take the first candidate that exists on disk, if any.
clean_traces_path = next(
    (candidate for candidate in possible_paths if os.path.exists(candidate)),
    None,
)

# Otherwise walk SAVE_DIR and take the first JSON file whose name contains
# 'clean_traces_I10' (in os.walk order).
if clean_traces_path is None:
    clean_traces_path = next(
        (
            os.path.join(folder, fname)
            for folder, _subdirs, fnames in os.walk(SAVE_DIR)
            for fname in fnames
            if 'clean_traces_I10' in fname and fname.endswith('.json')
        ),
        None,
    )

if clean_traces_path is None:
    raise FileNotFoundError('Could not find clean traces from Claude experiment!')

print(f'Loading clean traces from: {clean_traces_path}')
clean_traces_data = load_json(clean_traces_path)
print(f'Loaded {len(clean_traces_data)} clean traces')

# Index the traces by problem; a later entry with the same index wins.
traces_dict = {}
for trace in clean_traces_data:
    traces_dict[trace['problem_index']] = trace

## 6. Corruption Logic (Same as Claude)

In [None]:
def pick_corrupted_steps(I: int, lam: float, seed: int) -> List[int]:
    """Choose which 1-based step numbers of an ``I``-step trace to corrupt.

    ``round(lam * I)`` steps are drawn by shuffling ``1..I`` with a private RNG
    seeded by ``seed`` and keeping the head of the shuffle. The result is
    sorted ascending; it is empty when the rounded count is zero.
    """
    n_corrupt = int(round(lam * I))
    if not n_corrupt:
        return []
    order = [*range(1, I + 1)]
    random.Random(seed).shuffle(order)
    return sorted(order[:n_corrupt])

def assign_corruption_types(corrupted_steps: List[int], seed: int) -> Dict[int, str]:
    """Assign a corruption type to every step in ``corrupted_steps``.

    With ``K = len(corrupted_steps)``, ``K // 5`` steps become "IRR",
    ``2 * K // 5`` become "LOC", and the remainder become "WRONG". Which step
    gets which type is decided by shuffling a copy of the list with a private
    RNG seeded by ``seed + 1``; the input list is not modified.

    The former "force at least one WRONG" adjustment has been removed: the
    remainder ``K - K // 5 - 2 * K // 5`` is at least ``2K/5`` and so is
    already positive for every ``K >= 1``, which made that branch unreachable.
    Outputs are unchanged.

    Returns:
        Mapping from step number to "IRR", "LOC" or "WRONG"; empty for an
        empty input.
    """
    K = len(corrupted_steps)
    if K == 0:
        return {}

    n_irr = K // 5
    n_loc = (K * 2) // 5

    order = list(corrupted_steps)
    random.Random(seed + 1).shuffle(order)

    type_map: Dict[int, str] = {}
    for step in order[:n_irr]:
        type_map[step] = "IRR"
    for step in order[n_irr:n_irr + n_loc]:
        type_map[step] = "LOC"
    for step in order[n_irr + n_loc:]:
        type_map[step] = "WRONG"
    return type_map

# Text templates for the "IRR" and "WRONG" corruption types. Each list holds
# a single template, and the generators always use element 0.
IRRELEVANT_TEMPLATES = [
    "Compute an auxiliary value: aux = {a} + {b} = {result}, but it will not be used later.",
]
WRONG_CONSTRAINT_TEMPLATES = [
    "Fix an intermediate condition: set {var} = {wrong_value} as a given constraint for the rest of the steps.",
]


def generate_irrelevant_step(step_num: int, seed: int) -> str:
    """Return an "IRR" step: a correct but unused sum of two random integers.

    Both operands are drawn from 2..20 with a private RNG seeded by ``seed``.
    ``step_num`` is accepted but does not influence the text.
    """
    rng = random.Random(seed)
    first = rng.randint(2, 20)
    second = rng.randint(2, 20)
    template = IRRELEVANT_TEMPLATES[0]
    return template.format(a=first, b=second, result=first + second)

def generate_local_error_step(original_step: str, seed: int) -> str:
    """Return a "LOC" step: ``original_step`` with its trailing result altered.

    The last number in the step is shifted by a random non-zero offset in
    -3..3 (clamped at 0) and substituted for a trailing ``= <n>.``, or failing
    that a trailing ``<n>.``. A step containing no digits is replaced by a
    fixed wrong-arithmetic sentence.

    NOTE(review): if the step contains digits but does not end in ``<n>.``,
    the text is returned unchanged, i.e. the step is not actually corrupted.
    The same happens when the last number is 0 and the offset is negative.
    Behaviour is kept as-is to stay identical to the Claude run.
    """
    rng = random.Random(seed)
    found = re.findall(r'\d+', original_step)
    if not found:
        return f"Compute t = 10 * 3 = {rng.randint(28, 32)} (using the previous values)."

    shift = rng.choice([-3, -2, -1, 1, 2, 3])
    wrong_result = max(0, int(found[-1]) + shift)

    # Try the stricter "= n." form first, then a bare trailing "n.".
    rewrites = (
        (r'= (\d+)\.$', f'= {wrong_result}.'),
        (r'(\d+)\.$', f'{wrong_result}.'),
    )
    for pattern, replacement in rewrites:
        candidate = re.sub(pattern, replacement, original_step)
        if candidate != original_step:
            return candidate
    return original_step

def generate_wrong_constraint_step(step_num: int, seed: int) -> str:
    """Return a "WRONG" step that asserts an arbitrary value as a constraint.

    A variable name and then an integer in 10..100 are drawn with a private
    RNG seeded by ``seed``. ``step_num`` is accepted but does not influence
    the text.
    """
    rng = random.Random(seed)
    # Draw order matters for reproducibility: name first, value second.
    chosen_var = rng.choice(['x', 'total', 'result', 'n'])
    chosen_value = rng.randint(10, 100)
    template = WRONG_CONSTRAINT_TEMPLATES[0]
    return template.format(var=chosen_var, wrong_value=chosen_value)

def apply_corruption(trace_data: dict, lam: float, seed: int) -> str:
    """Render a trace as prompt text with a fraction ``lam`` of steps corrupted.

    The steps come from ``trace_data['steps']``. Which steps are corrupted,
    and how, is decided by ``pick_corrupted_steps`` and
    ``assign_corruption_types``. Each corrupted step gets its own seed,
    ``seed + step_number * 1000``. The output is one ``Step k: ...`` line per
    step, wrapped in ``[[COT_START]]`` / ``[[COT_END]]`` marker lines.
    """
    original_steps = list(trace_data['steps'])
    targets = pick_corrupted_steps(len(original_steps), lam, seed)
    kinds = assign_corruption_types(targets, seed)

    rendered = ['[[COT_START]]']
    for position, text in enumerate(original_steps, start=1):
        kind = kinds.get(position)
        if kind is None:
            body = text  # untouched step
        elif kind == 'IRR':
            body = generate_irrelevant_step(position, seed + position * 1000)
        elif kind == 'LOC':
            body = generate_local_error_step(text, seed + position * 1000)
        else:
            body = generate_wrong_constraint_step(position, seed + position * 1000)
        rendered.append(f'Step {position}: {body}')
    rendered.append('[[COT_END]]')
    return '\n'.join(rendered)

print('Corruption logic defined (identical to Claude experiment).')

## 7. OpenAI API Setup

In [None]:
from getpass import getpass

# Ask for the key interactively so that it is not written into the notebook.
OPENAI_API_KEY = getpass('Enter OpenAI API Key: ')
print('API Key set.')

In [None]:
from openai import OpenAI

client = OpenAI(api_key=OPENAI_API_KEY)

def call_gpt(system_prompt: str, user_prompt: str, max_tokens: int = 256, retries: int = 3) -> str:
    """Send one system+user exchange to ``API_MODEL`` and return the reply text.

    The request uses temperature 0. After a successful call the function
    sleeps ``API_RATE_LIMIT_DELAY`` seconds. On any exception it logs the
    error and retries after ``2.0 * attempt_number`` seconds, up to
    ``retries`` attempts in total.

    Returns:
        The reply text. A reply whose content is ``None`` is returned as an
        empty string so that downstream regex parsing always receives a str.

    Raises:
        ValueError: if ``retries`` is less than 1 (previously this fell
            through and returned ``None``).
        Exception: the error from the final attempt, re-raised unchanged.
    """
    if retries < 1:
        raise ValueError(f'retries must be >= 1, got {retries}')

    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=API_MODEL,
                max_tokens=max_tokens,
                temperature=0,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ]
            )
            time.sleep(API_RATE_LIMIT_DELAY)
            # message.content is optional in the API response; normalise None.
            return response.choices[0].message.content or ''
        except Exception as e:
            print(f'API error (attempt {attempt+1}): {e}')
            if attempt < retries - 1:
                # Linear back-off before the next attempt.
                time.sleep(2.0 * (attempt + 1))
            else:
                raise

# Test API connection
# One small request (50-token cap) made before the long runs start; if it
# fails on every retry, call_gpt re-raises and this cell stops here.
test_response = call_gpt(
    "You output ONLY JSON.",
    'Respond with exactly: {"test": "ok"}',
    max_tokens=50
)
print(f'API test: {test_response}')
print(f'Model: {API_MODEL}')

## 8. Experiment Prompts (Same as Claude)

In [None]:
# Same prompts as Claude experiment for fair comparison
# System prompt for the CoT conditions: forces a bare {"final": <int>} reply.
EXPERIMENT_SYSTEM_PROMPT = """You are a calculator that outputs ONLY JSON.

CRITICAL RULES:
1. Your output MUST start with the character '{'
2. Your output MUST be exactly: {"final": <number>}
3. Replace <number> with an integer (the numerical answer)
4. Do NOT write ANY explanation, reasoning, or text before or after the JSON
5. ONLY output the JSON object, nothing else

CORRECT OUTPUT EXAMPLE:
{"final": 42}
"""

# System prompt for the direct (no-CoT) condition. Its text is currently
# identical to EXPERIMENT_SYSTEM_PROMPT; the two conditions differ only in
# the user prompt.
DIRECT_SYSTEM_PROMPT = """You are a calculator that outputs ONLY JSON.

CRITICAL RULES:
1. Your output MUST start with the character '{'
2. Your output MUST be exactly: {"final": <number>}
3. Replace <number> with an integer (the numerical answer)
4. Do NOT write ANY explanation, reasoning, or text before or after the JSON
5. ONLY output the JSON object, nothing else

CORRECT OUTPUT EXAMPLE:
{"final": 42}
"""

def create_cot_prompt(problem: GSM8KProblem, cot_text: str) -> tuple:
    """Build the (system, user) prompt pair for a CoT condition.

    The user prompt contains the question, the (possibly corrupted) trace
    presented as given facts, and the JSON-only output instruction.
    """
    user_lines = [
        f'Problem: {problem.question}',
        '',
        'Reasoning trace (use these steps as given facts):',
        cot_text,
        '',
        'Based on the trace above, compute the final numerical answer.',
        'OUTPUT ONLY: {"final": <number>}',
        "START YOUR RESPONSE WITH '{'",
    ]
    return EXPERIMENT_SYSTEM_PROMPT, '\n'.join(user_lines)

def create_direct_prompt(problem: GSM8KProblem) -> tuple:
    """Build the (system, user) prompt pair for the direct (no-CoT) condition.

    The user prompt contains only the question and the JSON-only output
    instruction.
    """
    user_lines = [
        f'Problem: {problem.question}',
        '',
        'Solve this problem and give the final numerical answer.',
        'OUTPUT ONLY: {"final": <number>}',
        "START YOUR RESPONSE WITH '{'",
    ]
    return DIRECT_SYSTEM_PROMPT, '\n'.join(user_lines)

# Structured-answer patterns, tried in this order (strictest first).
_FINAL_ANSWER_PATTERNS = (
    r'\{\s*"final"\s*:\s*(-?\d+(?:\.\d+)?)\s*\}',
    r"\{\s*[\"']final[\"']\s*:\s*(-?\d+(?:\.\d+)?)\s*\}",
    r'"final"\s*:\s*(-?\d+(?:\.\d+)?)',
)
# Last resort: any free-standing number in the text.
_LOOSE_NUMBER_PATTERN = r'(?:^|\s)(-?\d+(?:\.\d+)?)(?:\s|$|\.|,)'


def parse_model_answer(response: Optional[str]) -> Optional[int]:
    """Extract the model's integer answer from its raw reply.

    Each structured ``"final": <number>`` pattern is tried in turn and the
    first match is used. If none matches, the last free-standing number in
    the text is used. A decimal value is rounded to the nearest integer with
    the built-in ``round``.

    Returns:
        The parsed integer, or ``None`` when the reply is empty, ``None``, or
        contains no recognisable number. (A ``None`` reply used to raise
        ``TypeError`` inside ``re.search``.)
    """
    if not response:
        return None

    for pattern in _FINAL_ANSWER_PATTERNS:
        match = re.search(pattern, response)
        if match:
            return int(round(float(match.group(1))))

    loose = re.findall(_LOOSE_NUMBER_PATTERN, response)
    if loose:
        return int(round(float(loose[-1])))
    return None

print('Prompts defined (identical to Claude experiment).')

## 9. Run Direct Condition

In [None]:
_direct_rule = '=' * 60
print(_direct_rule)
print('RUNNING DIRECT CONDITION (No CoT)')
print(_direct_rule)

# One call per problem with no reasoning trace; this is the baseline that the
# CoT accuracies are later compared against.
direct_results = []

for prob in tqdm(problems, desc='Direct'):
    sys_prompt, usr_prompt = create_direct_prompt(prob)
    response = call_gpt(sys_prompt, usr_prompt, max_tokens=API_MAX_TOKENS)

    model_answer = parse_model_answer(response)
    # An unparseable reply (None) counts as incorrect.
    is_correct = model_answer is not None and model_answer == prob.final_answer

    direct_results.append(
        ExperimentResult(
            problem_index=prob.index,
            condition='direct',
            I=None,
            lam=None,
            A_target=None,
            model_answer=model_answer,
            correct_answer=prob.final_answer,
            is_correct=is_correct,
            raw_output=response,
            timestamp=datetime.now().isoformat(),
            model=API_MODEL,
        )
    )

# Save direct results
_direct_rows = [asdict(entry) for entry in direct_results]
save_json(_direct_rows, f'{SAVE_DIR_GPT}/results/direct_results_gpt.json')

# Report
_direct_hits = sum(entry.is_correct for entry in direct_results)
direct_acc = _direct_hits / len(direct_results)
print(f'\nDirect condition accuracy: {direct_acc:.1%}')

## 10. Run CoT Conditions (λ grid)

In [None]:
print('='*60)
print('RUNNING COT CONDITIONS (I=10, λ grid)')
print('='*60)

# Results for every (λ, problem) pair, in run order.
cot_results = []

for lam in LAMBDA_VALUES:
    # round() removes float noise such as 1 - 0.8 = 0.19999999999999996.
    print(f'\nλ = {lam} (A = {round(1 - lam, 10)})')

    lam_results = []
    missing_trace = []  # problems that have no clean trace and are skipped
    for prob in tqdm(problems, desc=f'λ={lam}'):
        if prob.index not in traces_dict:
            missing_trace.append(prob.index)
            continue

        trace_data = traces_dict[prob.index]

        # Apply corruption (same seed as Claude)
        seed = derive_seed(GLOBAL_SEED, prob.index, I=I_FIXED, lam=lam)
        corrupted_cot = apply_corruption(trace_data, lam, seed)

        # Create prompt and call API
        sys_prompt, usr_prompt = create_cot_prompt(prob, corrupted_cot)
        response = call_gpt(sys_prompt, usr_prompt, max_tokens=API_MAX_TOKENS)

        model_answer = parse_model_answer(response)
        # An unparseable reply (None) counts as incorrect.
        is_correct = (model_answer == prob.final_answer) if model_answer is not None else False

        result = ExperimentResult(
            problem_index=prob.index,
            condition='cot',
            I=I_FIXED,
            lam=lam,
            A_target=1.0 - lam,
            model_answer=model_answer,
            correct_answer=prob.final_answer,
            is_correct=is_correct,
            raw_output=response,
            timestamp=datetime.now().isoformat(),
            model=API_MODEL
        )
        lam_results.append(result)
        cot_results.append(result)

        # Checkpoint: overwrite one file with everything collected so far.
        if len(cot_results) % CHECKPOINT_EVERY == 0:
            save_json([asdict(r) for r in cot_results],
                     f'{SAVE_DIR_GPT}/checkpoints/cot_checkpoint_gpt.json')

    # Make skipped problems visible: they shrink this condition's sample
    # relative to the direct baseline, which is run on every problem.
    if missing_trace:
        print(f'  WARNING: {len(missing_trace)} problems skipped (no clean trace): {missing_trace}')

    # Report accuracy for this λ
    acc = sum(r.is_correct for r in lam_results) / len(lam_results) if lam_results else 0
    print(f'  Accuracy: {acc:.1%} ({sum(r.is_correct for r in lam_results)}/{len(lam_results)})')

# Save all CoT results
save_json([asdict(r) for r in cot_results],
         f'{SAVE_DIR_GPT}/results/cot_results_gpt.json')

print(f'\nTotal CoT results: {len(cot_results)}')

## 11. Analysis & Comparison with Claude

In [None]:
import matplotlib.pyplot as plt

# Combine results
all_results = direct_results + cot_results

# Create DataFrames (one row per model call)
direct_df = pd.DataFrame([asdict(r) for r in direct_results])
cot_df = pd.DataFrame([asdict(r) for r in cot_results])

gpt_direct_acc = direct_df['is_correct'].mean()

print('='*60)
print(f'CHATGPT ({API_MODEL}) RESULTS')
print('='*60)
print(f'\nDirect (no CoT): {gpt_direct_acc:.1%}')
print(f'\nCoT accuracy by λ:')
print(f'{"λ":>6} {"A":>6} {"Accuracy":>10} {"vs Direct":>12}')
print('-' * 40)

# Accuracy per corruption rate, keyed by λ; later cells read this dict.
gpt_acc_by_lam = {}
for lam in LAMBDA_VALUES:
    lam_data = cot_df[cot_df['lam'] == lam]
    acc = lam_data['is_correct'].mean()
    gpt_acc_by_lam[lam] = acc
    diff = acc - gpt_direct_acc
    # The '+' format flag puts the sign inside the 12-wide field so the column
    # lines up with its header (the sign used to be printed before the padding).
    print(f'{lam:>6.1f} {1-lam:>6.1f} {acc:>10.1%} {diff:>+12.1%}')

print('-' * 40)
print(f'{"Direct":>6} {"-":>6} {gpt_direct_acc:>10.1%} {"baseline":>12}')

## 12. Estimate A* for ChatGPT

In [None]:
from scipy.interpolate import interp1d
from scipy.optimize import brentq

# Point estimate of λ*
# λ grid and the matching CoT accuracies as parallel arrays, in the insertion
# order of gpt_acc_by_lam.
lam_points = np.array(list(gpt_acc_by_lam.keys()))
acc_points = np.array(list(gpt_acc_by_lam.values()))

def estimate_lambda_crit(lam_arr, acc_arr, baseline):
    """Return the first λ at which CoT accuracy meets ``baseline``, or None.

    Accuracy is treated as piecewise linear between the grid points, which
    are scanned in the order given (expected ascending in λ):

    * a grid point whose accuracy equals ``baseline`` exactly is returned
      as-is (this case used to be missed because the sign-change test saw a
      product of zero);
    * otherwise the first interval whose end points lie on opposite sides of
      ``baseline`` is located and the crossing is found by linear
      interpolation.

    NaN accuracies never compare equal or produce a sign change, so intervals
    touching them are skipped.

    Args:
        lam_arr: λ values (array-like).
        acc_arr: accuracies, same length as ``lam_arr``.
        baseline: accuracy level to intersect.

    Returns:
        The crossing λ as a float, or ``None`` if the curve never meets
        ``baseline``.
    """
    lam_vals = np.asarray(lam_arr, dtype=float)
    gaps = np.asarray(acc_arr, dtype=float) - float(baseline)
    n_points = len(lam_vals)

    for i in range(n_points):
        if gaps[i] == 0:
            return float(lam_vals[i])
        if i + 1 < n_points and gaps[i] * gaps[i + 1] < 0:
            # Root of the straight line through (lam_i, gap_i), (lam_i+1, gap_i+1).
            width = lam_vals[i + 1] - lam_vals[i]
            return float(lam_vals[i] - gaps[i] * width / (gaps[i + 1] - gaps[i]))
    return None

gpt_lam_crit = estimate_lambda_crit(lam_points, acc_points, gpt_direct_acc)

print('='*60)
print('A* ESTIMATION FOR CHATGPT')
print('='*60)

if gpt_lam_crit is not None:
    gpt_a_crit = 1 - gpt_lam_crit
    print(f'\nPoint estimate:')
    print(f'  λ* = {gpt_lam_crit:.3f}')
    print(f'  A* = {gpt_a_crit:.3f}')

    # Bootstrap CI: resample problems with replacement and re-estimate λ*.
    print('\nComputing bootstrap CI...')
    np.random.seed(GLOBAL_SEED)
    n_bootstrap = 1000
    lam_crit_samples = []

    problem_ids = cot_df['problem_index'].unique()
    n_problems = len(problem_ids)

    # Correctness matrix: one row per problem, one column per λ. Selecting
    # rows by label repeats a row once per draw, so a problem drawn k times
    # carries weight k. The previous `isin` filter collapsed repeated draws
    # into one, which is not a with-replacement bootstrap.
    correct_matrix = (
        cot_df.groupby(['problem_index', 'lam'])['is_correct'].mean().unstack('lam')
    )

    # NOTE(review): the direct baseline (gpt_direct_acc) is held fixed across
    # resamples; confirm whether it should be resampled on the same problems.
    for b in range(n_bootstrap):
        boot_problems = np.random.choice(problem_ids, size=n_problems, replace=True)
        boot_acc = correct_matrix.loc[boot_problems].mean(axis=0)

        boot_lam_crit = estimate_lambda_crit(
            np.array(boot_acc.index),
            np.array(boot_acc.values),
            gpt_direct_acc
        )
        # Resamples without a crossing are dropped; the count kept is reported.
        if boot_lam_crit is not None:
            lam_crit_samples.append(boot_lam_crit)

    if lam_crit_samples:
        ci_lower = np.percentile(lam_crit_samples, 2.5)
        ci_upper = np.percentile(lam_crit_samples, 97.5)
        print(f'\nBootstrap results ({len(lam_crit_samples)}/{n_bootstrap} samples):')
        print(f'  λ* = {np.mean(lam_crit_samples):.3f} [95% CI: {ci_lower:.3f}, {ci_upper:.3f}]')
        # A = 1 - λ, so the interval bounds swap.
        print(f'  A* = {1-np.mean(lam_crit_samples):.3f} [95% CI: {1-ci_upper:.3f}, {1-ci_lower:.3f}]')
else:
    print('\nCould not estimate λ* (no crossing detected)')

## 13. Comparison Plot: Claude vs ChatGPT

In [None]:
# Claude results (hardcoded from previous experiment)
claude_acc_by_lam = {
    0.0: 0.970,
    0.2: 0.960,
    0.4: 0.839,
    0.6: 0.739,
    0.8: 0.563,
    1.0: 0.347
}
claude_direct_acc = 0.759
claude_lam_crit = 0.449

fig, ax = plt.subplots(figsize=(10, 6))

# Claude curve (solid) and its direct baseline (dashed), in percent
claude_lams = list(claude_acc_by_lam.keys())
claude_accs = [claude_acc_by_lam[lam_value] * 100 for lam_value in claude_lams]
ax.plot(claude_lams, claude_accs, 'o-', color='#2166ac', linewidth=2.5,
        markersize=10, label='Claude Sonnet')
ax.axhline(y=claude_direct_acc * 100, color='#2166ac', linestyle='--',
          linewidth=1.5, alpha=0.7)

# ChatGPT curve (solid) and its direct baseline (dashed), in percent
gpt_lams = list(gpt_acc_by_lam.keys())
gpt_accs = [gpt_acc_by_lam[lam_value] * 100 for lam_value in gpt_lams]
ax.plot(gpt_lams, gpt_accs, 's-', color='#d62728', linewidth=2.5,
        markersize=10, label=f'ChatGPT ({API_MODEL})')
ax.axhline(y=gpt_direct_acc * 100, color='#d62728', linestyle='--',
          linewidth=1.5, alpha=0.7)

# Mark λ* points. Claude's value is a known constant, so it is always drawn;
# it used to be omitted whenever no GPT crossing had been found.
ax.axvline(x=claude_lam_crit, color='#2166ac', linestyle=':', linewidth=2, alpha=0.7)
ax.annotate(f'Claude λ*={claude_lam_crit:.2f}',
           xy=(claude_lam_crit, 50), fontsize=10, color='#2166ac',
           rotation=90, va='bottom', ha='right')

if gpt_lam_crit is not None:
    ax.axvline(x=gpt_lam_crit, color='#d62728', linestyle=':', linewidth=2, alpha=0.7)
    ax.annotate(f'GPT λ*={gpt_lam_crit:.2f}',
               xy=(gpt_lam_crit, 50), fontsize=10, color='#d62728',
               rotation=90, va='bottom', ha='left')

ax.set_xlabel('Corruption Rate (λ)', fontsize=13, fontweight='bold')
ax.set_ylabel('Accuracy (%)', fontsize=13, fontweight='bold')
ax.set_title('CoT Collapse: Claude vs ChatGPT', fontsize=14, fontweight='bold')
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(25, 105)
ax.legend(loc='lower left', fontsize=11)
ax.grid(True, alpha=0.3)

# Add A axis: a second x-axis on top, labelled with A = 1 - λ
ax2 = ax.twiny()
ax2.set_xlim(ax.get_xlim())
ax2.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
ax2.set_xticklabels(['1.0', '0.8', '0.6', '0.4', '0.2', '0.0'])
ax2.set_xlabel('Alignment (A = 1 - λ)', fontsize=13, fontweight='bold')

plt.tight_layout()
plt.savefig(f'{SAVE_DIR_GPT}/comparison_claude_gpt.png', dpi=300, bbox_inches='tight')
plt.show()
print(f'Saved: {SAVE_DIR_GPT}/comparison_claude_gpt.png')

## 14. Final Summary

In [None]:
print('='*70)
print('EXPERIMENT SUMMARY: CLAUDE vs CHATGPT')
print('='*70)

# Pre-format the GPT critical values. Testing `is not None` (not truthiness)
# keeps a legitimate λ* of 0.0 from being shown as 'N/A', and the fixed
# 3-decimal format keeps the value inside its 10-character column.
if gpt_lam_crit is not None:
    gpt_lam_text = f'{gpt_lam_crit:.3f}'
    gpt_a_text = f'{1 - gpt_lam_crit:.3f}'
else:
    gpt_lam_text = 'N/A'
    gpt_a_text = 'N/A'

print(f"""
┌────────────────────────────────────────────────────────────────────┐
│                     MODEL COMPARISON                               │
├────────────────────────────────────────────────────────────────────┤
│ Metric              │ Claude Sonnet    │ ChatGPT ({API_MODEL:10})  │
├────────────────────────────────────────────────────────────────────┤
│ Direct accuracy     │ {claude_direct_acc:>10.1%}       │ {gpt_direct_acc:>10.1%}            │
│ Clean CoT (λ=0)     │ {claude_acc_by_lam[0.0]:>10.1%}       │ {gpt_acc_by_lam[0.0]:>10.1%}            │
│ λ=0.4               │ {claude_acc_by_lam[0.4]:>10.1%}       │ {gpt_acc_by_lam[0.4]:>10.1%}            │
│ λ=0.8               │ {claude_acc_by_lam[0.8]:>10.1%}       │ {gpt_acc_by_lam[0.8]:>10.1%}            │
│ Full corrupt (λ=1)  │ {claude_acc_by_lam[1.0]:>10.1%}       │ {gpt_acc_by_lam[1.0]:>10.1%}            │
├────────────────────────────────────────────────────────────────────┤
│ λ* (critical)       │ {claude_lam_crit:>10.3f}       │ {gpt_lam_text:>10}            │
│ A* (critical)       │ {1-claude_lam_crit:>10.3f}       │ {gpt_a_text:>10}            │
└────────────────────────────────────────────────────────────────────┘
""")

if gpt_lam_crit is not None:
    # |Δλ*| equals |ΔA*| because A = 1 - λ.
    delta = abs(gpt_lam_crit - claude_lam_crit)
    print(f'\nΔλ* = {delta:.3f} (difference between models)')
    print(f'ΔA* = {delta:.3f}')

    if delta < 0.1:
        print('\n→ A* is CONSISTENT across models (Δ < 0.1)')
    else:
        print('\n→ A* shows MODEL-SPECIFIC variation')

print(f'\nResults saved to: {SAVE_DIR_GPT}')

In [None]:
# Save final combined results
# `is not None` is used instead of truthiness so that a legitimate λ* of 0.0
# is stored with its derived values rather than as null.
_gpt_crit_found = gpt_lam_crit is not None

summary = {
    'experiment_date': EXPERIMENT_DATE,
    'model': API_MODEL,
    'n_problems': len(problems),
    'direct_accuracy': gpt_direct_acc,
    'accuracy_by_lambda': gpt_acc_by_lam,
    'lambda_crit': gpt_lam_crit,
    'a_crit': 1 - gpt_lam_crit if _gpt_crit_found else None,
    'comparison_with_claude': {
        'claude_lambda_crit': claude_lam_crit,
        'claude_a_crit': 1 - claude_lam_crit,
        'delta_lambda_crit': abs(gpt_lam_crit - claude_lam_crit) if _gpt_crit_found else None
    }
}

save_json(summary, f'{SAVE_DIR_GPT}/results/summary_gpt.json')
print('\nExperiment complete!')