# CoT Backfire Analysis

This notebook reproduces the main analyses from:

> HIDEKI. "When Reasoning Traces Backfire: Identifying the Backfire Boundary of Provided Chain-of-Thought Reasoning." (2025)

## Setup

In [None]:
import json
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Configuration
# Seed NumPy's legacy global RNG once, up front, so that any later
# np.random-based sampling is repeatable across Restart & Run All.
GLOBAL_SEED = 20251224
np.random.seed(GLOBAL_SEED)

## Load Data

In [None]:
def _load_records(path):
    """Read one JSON results file and return the decoded object.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load Claude results
claude_full = _load_records('data/claude/results_full_v3.json')
claude_direct = _load_records('data/claude/direct_results_v3.json')

# Load GPT-4o results
gpt_cot = _load_records('data/gpt4o/cot_results_gpt.json')
gpt_direct = _load_records('data/gpt4o/direct_results_gpt.json')

# Convert to DataFrames
claude_df = pd.DataFrame(claude_full)
claude_direct_df = pd.DataFrame(claude_direct)
gpt_df = pd.DataFrame(gpt_cot)
gpt_direct_df = pd.DataFrame(gpt_direct)

# Record counts, as a quick sanity check that every file loaded.
print(f'Claude CoT: {len(claude_df)} records')
print(f'Claude Direct: {len(claude_direct_df)} records')
print(f'GPT-4o CoT: {len(gpt_df)} records')
print(f'GPT-4o Direct: {len(gpt_direct_df)} records')

## Main Analysis: Accuracy by Corruption Rate

In [None]:
# Claude: no-CoT baseline accuracy, then mean CoT accuracy per corruption
# rate λ (one entry per distinct value of the 'lam' column).
claude_direct_acc = claude_direct_df['is_correct'].mean()
claude_acc = claude_df.groupby('lam')['is_correct'].mean()

print(
    'Claude Sonnet Results:',
    f'Direct (no CoT): {claude_direct_acc:.1%}',
    '\nCoT by λ:',
    sep='\n',
)
for lam, acc in zip(claude_acc.index, claude_acc):
    print(f'  λ={lam}: {acc:.1%}')

In [None]:
# GPT-4o: no-CoT baseline accuracy, then mean CoT accuracy per corruption
# rate λ (one entry per distinct value of the 'lam' column).
gpt_direct_acc = gpt_direct_df['is_correct'].mean()
gpt_acc = gpt_df.groupby('lam')['is_correct'].mean()

print(
    'GPT-4o Results:',
    f'Direct (no CoT): {gpt_direct_acc:.1%}',
    '\nCoT by λ:',
    sep='\n',
)
for lam, acc in zip(gpt_acc.index, gpt_acc):
    print(f'  λ={lam}: {acc:.1%}')

## Backfire Boundary Estimation

In [None]:
from scipy.interpolate import interp1d
from scipy.optimize import brentq

def estimate_lambda_crit(lam_arr, acc_arr, baseline):
    """Estimate λ*, the corruption rate at which accuracy meets the baseline.

    Accuracy is treated as piecewise linear between the measured λ values.
    Segments are scanned in order of increasing λ and the first root of
    ``acc - baseline`` lying in a half-open segment ``(λ_i, λ_{i+1}]`` is
    returned. The root is computed in closed form, which is exact for a
    linear segment, so no iterative solver is needed.

    Parameters
    ----------
    lam_arr : array-like of float
        Corruption rates at which accuracy was measured. Need not be sorted.
    acc_arr : array-like of float
        Accuracy observed at each entry of ``lam_arr`` (same length).
    baseline : float
        Reference accuracy to compare against.

    Returns
    -------
    float or None
        The interpolated λ*, or ``None`` when no segment reaches or crosses
        the baseline. A root sitting exactly on the very first grid point is
        not reported, because there is no segment to its left.

    Raises
    ------
    ValueError
        If ``lam_arr`` and ``acc_arr`` are not 1-D arrays of equal length.
    """
    lam = np.asarray(lam_arr, dtype=float)
    diff = np.asarray(acc_arr, dtype=float) - baseline
    if lam.ndim != 1 or lam.shape != diff.shape:
        raise ValueError('lam_arr and acc_arr must be 1-D arrays of equal length')

    # Scan left to right regardless of the order the caller supplied.
    order = np.argsort(lam, kind='stable')
    lam, diff = lam[order], diff[order]

    for i in range(len(lam) - 1):
        d0, d1 = diff[i], diff[i + 1]
        if d0 * d1 < 0:
            # Strict sign change: root of the straight line through both points.
            return float(lam[i] + (lam[i + 1] - lam[i]) * d0 / (d0 - d1))
        if d1 == 0 and d0 != 0:
            # Accuracy lands exactly on the baseline at the right-hand grid
            # point; a strict sign test alone would skip this case.
            return float(lam[i + 1])
    return None

def _boundary_line(label, lambda_crit):
    """Format one report line; states it plainly when no boundary was found."""
    if lambda_crit is None:
        # estimate_lambda_crit returns None when accuracy never crosses the
        # baseline; formatting None with ':.3f' would raise TypeError.
        return f'  {label}: no crossing of the direct baseline on the measured λ grid'
    return f'  {label}: λ* = {lambda_crit:.3f}, A* = {1-lambda_crit:.3f}'


# Claude boundary
claude_lams = claude_acc.index.to_numpy()
claude_accs = claude_acc.to_numpy()
claude_lambda_crit = estimate_lambda_crit(claude_lams, claude_accs, claude_direct_acc)

# GPT-4o boundary
gpt_lams = gpt_acc.index.to_numpy()
gpt_accs = gpt_acc.to_numpy()
gpt_lambda_crit = estimate_lambda_crit(gpt_lams, gpt_accs, gpt_direct_acc)

# A* is reported as 1 - λ*.
print('Backfire Boundary Estimates:')
print(_boundary_line('Claude', claude_lambda_crit))
print(_boundary_line('GPT-4o', gpt_lambda_crit))

## Compliance Analysis

In [None]:
def compliance_analysis(cot_df, direct_df, model_name, clean_lam=0.0, corrupted_lam=0.8):
    """Compare per-problem correctness under clean vs. corrupted CoT and print a report.

    For every ``problem_index`` that has a row at both ``clean_lam`` and
    ``corrupted_lam`` the function counts

    * Clean→Wrong: correct at ``clean_lam`` and incorrect at ``corrupted_lam``;
    * the reverse flip: incorrect at ``clean_lam`` and correct at ``corrupted_lam``;

    and runs McNemar's chi-square test (continuity-corrected, 1 degree of
    freedom) on those two discordant counts.

    Parameters
    ----------
    cot_df : pandas.DataFrame
        CoT results with columns ``'lam'``, ``'problem_index'`` and
        ``'is_correct'``.
        NOTE(review): assumes at most one row per (lam, problem_index);
        duplicated problems would make the paired comparison ambiguous.
    direct_df : pandas.DataFrame
        Direct (no-CoT) results. Accepted so existing call sites keep working;
        it is not used by the computation.
    model_name : str
        Label printed as the report header.
    clean_lam, corrupted_lam : float, optional
        Corruption rates to compare. The defaults (0.0 and 0.8) are the values
        that were previously hard-coded.

    Returns
    -------
    None
        The report is written to stdout.
    """
    # Match λ with a tolerance so a grid value that came out of floating-point
    # arithmetic (e.g. 0.7999999999999999) still selects the intended rows.
    lam_values = cot_df['lam'].astype(float)
    clean = cot_df[np.isclose(lam_values, clean_lam)].set_index('problem_index')['is_correct']
    corrupted = cot_df[np.isclose(lam_values, corrupted_lam)].set_index('problem_index')['is_correct']

    common = clean.index.intersection(corrupted.index)

    print(f'{model_name}:')
    if len(common) == 0:
        # Without paired problems the flip rate would be 0/0.
        print(f'  No problems have results at both λ={clean_lam} and λ={corrupted_lam}; nothing to compare.')
        return

    clean_common = clean.loc[common]
    corrupted_common = corrupted.loc[common]

    # Explicit comparisons against True/False (rather than negation) so a
    # missing outcome is counted in neither discordant cell.
    clean_to_wrong = int((clean_common.eq(True) & corrupted_common.eq(False)).sum())
    wrong_to_clean = int((clean_common.eq(False) & corrupted_common.eq(True)).sum())

    # McNemar's test
    n = clean_to_wrong + wrong_to_clean
    mcnemar_stat = (abs(clean_to_wrong - wrong_to_clean) - 1)**2 / n if n > 0 else 0
    p_value = stats.chi2.sf(mcnemar_stat, 1)

    flip_rate = clean_to_wrong / len(common)

    print(f'  Flip rate (Clean→Wrong): {flip_rate:.1%}')
    print(f'  McNemar χ² = {mcnemar_stat:.1f}, p = {p_value:.2e}')
    print(f'  Net loss: {clean_to_wrong - wrong_to_clean}')
    
# Print the clean-vs-corrupted compliance report for each model,
# separated by a blank line.
compliance_analysis(claude_df, claude_direct_df, 'Claude Sonnet')
print()
compliance_analysis(gpt_df, gpt_direct_df, 'GPT-4o')

## Visualization

In [None]:
# Accuracy-vs-corruption curves for both models. Each model's direct (no-CoT)
# accuracy is drawn as a dashed horizontal line and its estimated boundary λ*
# as a dotted vertical line, all in that model's colour.
fig, ax = plt.subplots(figsize=(10, 6))

# Claude
ax.plot(claude_lams, claude_accs * 100, 'o-', color='#2166ac',
        linewidth=2.5, markersize=10, label='Claude Sonnet')
ax.axhline(y=claude_direct_acc * 100, color='#2166ac',
           linestyle='--', linewidth=1.5, alpha=0.7,
           label='Claude Sonnet direct (no CoT)')

# GPT-4o
ax.plot(gpt_lams, gpt_accs * 100, 's-', color='#d62728',
        linewidth=2.5, markersize=10, label='GPT-4o')
ax.axhline(y=gpt_direct_acc * 100, color='#d62728',
           linestyle='--', linewidth=1.5, alpha=0.7,
           label='GPT-4o direct (no CoT)')

# Boundaries
# Compare against None explicitly: the estimator returns None when there is
# no crossing, and a plain truthiness test would also drop a valid λ* of 0.0.
if claude_lambda_crit is not None:
    ax.axvline(x=claude_lambda_crit, color='#2166ac', linestyle=':', alpha=0.7)
if gpt_lambda_crit is not None:
    ax.axvline(x=gpt_lambda_crit, color='#d62728', linestyle=':', alpha=0.7)

ax.set_xlabel('Corruption Rate (λ)', fontsize=13)
ax.set_ylabel('Accuracy (%)', fontsize=13)
ax.set_title('Backfire Boundary: Claude vs GPT-4o', fontsize=14)
ax.legend(loc='lower left')
ax.grid(True, alpha=0.3)
ax.set_xlim(-0.05, 1.05)
# Fixed y-range: any accuracy below 25% would fall outside the visible axes.
ax.set_ylim(25, 105)

plt.tight_layout()
plt.show()

## Summary Table

In [None]:
def _boundary_cells(lambda_crit):
    """Return the [λ*, A*] table cells; both are 'n/a' when λ* is None."""
    if lambda_crit is None:
        # No boundary was estimated; formatting None with ':.3f' would raise.
        return ['n/a', 'n/a']
    return [f'{lambda_crit:.3f}', f'{1-lambda_crit:.3f}']


# One column per model; rows follow the order of the 'Metric' column.
# NOTE(review): the 'Clean CoT (λ=0)' row uses the first entry of the accuracy
# array, i.e. the smallest measured λ — confirm the grid actually includes 0.
summary = pd.DataFrame({
    'Metric': ['Direct Accuracy', 'Clean CoT (λ=0)', 'λ*', 'A*'],
    'Claude Sonnet': [
        f'{claude_direct_acc:.1%}',
        f'{claude_accs[0]:.1%}',
        *_boundary_cells(claude_lambda_crit),
    ],
    'GPT-4o': [
        f'{gpt_direct_acc:.1%}',
        f'{gpt_accs[0]:.1%}',
        *_boundary_cells(gpt_lambda_crit),
    ]
})

print(summary.to_string(index=False))