In [None]:
import os
import json
import re
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from matplotlib.patches import Patch
import warnings
warnings.filterwarnings('ignore')

# Global matplotlib styling applied through rcParams: serif fonts, fixed
# font sizes for text/titles/labels/legends, and a 300 dpi figure resolution.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': ['Times', 'DejaVu Serif'],  # serif font candidates
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'legend.fontsize': 11,
    'figure.dpi': 300
})

# Thought types that get aggregated, plotted and summarized, in display order.
THOUGHT_TYPES = ['Transfer', 'Ensemble']
# Base bar color (hex '#rrggbb') used for each thought type in the plots.
COLORS = {'Transfer': '#A8DADC', 'Ensemble': '#B8A9C9'}

def parse_jsonl(filepath):
    """Load a JSON Lines file into a list of decoded records.

    Blank (whitespace-only) lines are skipped. Loading is best-effort: a file
    that cannot be opened, is not valid UTF-8, or contains a line that is not
    valid JSON yields an empty list instead of raising.

    Args:
        filepath: Path of the JSON Lines file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, or ``[]`` when
        the file cannot be read or parsed.
    """
    try:
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(filepath, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError is the base class of
        # both json.JSONDecodeError and UnicodeDecodeError. Anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        return []

def extract_file_info(filename):
    """Decode the experiment metadata embedded in a result filename.

    Two naming schemes are recognised:

    * Transfer: ``<source>_thoughts_to_<target>_zero_shot...``
    * Ensemble: ``<target>_zero_shot_ensembled..._gen_<g1>_<g2>..._eval_<e>.json...``
      where each generator/evaluator token is a short alias that is expanded
      to a full model name (unknown tokens are kept verbatim).

    The answer type is ``'without_answer'`` when that substring occurs in the
    filename and ``'full_text'`` otherwise.

    Args:
        filename: Base name of the result file.

    Returns:
        A 4-tuple ``(thought_type, source_models, target_model, answer_type)``.
        ``source_models`` is a list; for ensembles it is the sorted,
        de-duplicated union of generator and evaluator models. All four
        entries are ``None`` when the filename fits neither scheme.
    """
    unrecognised = (None, None, None, None)

    if 'without_answer' in filename:
        answer_type = 'without_answer'
    else:
        answer_type = 'full_text'

    # Transfer CoT: a filename carrying the transfer marker is never
    # re-examined as an ensemble, even if the pattern below fails.
    if '_thoughts_to_' in filename:
        transfer = re.match(r'(.+)_thoughts_to_(.+)_zero_shot', filename)
        if transfer is None:
            return unrecognised
        source, target = transfer.groups()
        return 'Transfer', [source], target, answer_type

    # Ensemble CoT: both markers must be present.
    if '_gen_' not in filename or '_eval_' not in filename:
        return unrecognised

    generators = re.search(r'_gen_(.+)_eval', filename)
    evaluator = re.search(r'_eval_(.+?)\.json', filename)
    target = re.match(r'^(.+?)_zero_shot_ensembled', filename)
    if generators is None or evaluator is None or target is None:
        return unrecognised

    # Short aliases used in ensemble filenames -> full model names.
    aliases = {
        'qwq': 'Qwen_QwQ-32B',
        'dapo': 'BytedTsinghua-SIA_DAPO-Qwen-32B',
        'oss': 'openai_gpt-oss-20b',
        'opent': 'open-thoughts_OpenThinker-7B',
        'nrr': 'nvidia_Nemotron-Research-Reasoning-Qwen-1.5B'
    }

    # Generators and the evaluator all count as sources of the ensembled CoT.
    contributors = {aliases.get(token, token) for token in generators.group(1).split('_')}
    evaluator_alias = evaluator.group(1)
    contributors.add(aliases.get(evaluator_alias, evaluator_alias))

    return 'Ensemble', sorted(contributors), target.group(1), answer_type

def check_consistency(dp1, dp2, is_thoughts_to):
    """Decide whether two records for the same question agree with each other.

    Args:
        dp1: First record (mapping of field name to value).
        dp2: Second record, compared against ``dp1``.
        is_thoughts_to: When true, the ``"Target Answer"`` / ``"Target Result"``
            fields are compared; otherwise ``"LLM Answer"`` / ``"Result"``.

    Returns:
        ``1`` when the answers are equal or both results are ``"Correct"``;
        ``0`` when they disagree or a required field is missing from either
        record; ``None`` when either answer contains an invalid-answer marker,
        meaning the pair should be excluded from the tally.
    """
    if is_thoughts_to:
        answer_key, result_key = "Target Answer", "Target Result"
    else:
        answer_key, result_key = "LLM Answer", "Result"

    # A record lacking either field counts as an inconsistency, not a skip.
    for record in (dp1, dp2):
        if answer_key not in record or result_key not in record:
            return 0

    first_answer, second_answer = dp1[answer_key], dp2[answer_key]
    first_result, second_result = dp1[result_key], dp2[result_key]

    # Substrings that flag an answer as unusable for comparison.
    for marker in ("not defined", "N/A", "does not match", "are not permitted"):
        if marker in first_answer or marker in second_answer:
            return None

    if first_answer == second_answer:
        return 1
    if first_result == "Correct" and second_result == "Correct":
        return 1
    return 0

def calculate_stats(rates):
    """Summarise a collection of consistency rates.

    Args:
        rates: Sequence of numeric rates; may be empty.

    Returns:
        A dict with keys ``'mean'``, ``'stderr'`` and ``'count'``. The
        standard error is ``np.std(rates)`` (NumPy's default, i.e. population
        standard deviation) divided by the square root of the sample size.
        An empty input yields zeros for all three entries.
    """
    if rates:
        sample_size = len(rates)
        spread = np.std(rates)
        return {
            'mean': np.mean(rates),
            'stderr': spread / np.sqrt(sample_size),
            'count': sample_size
        }
    return {'mean': 0, 'stderr': 0, 'count': 0}

def analyze_involvement_consistency(folder_path):
    """Compute pairwise answer consistency, split by CoT-creator involvement.

    Every ``.jsonl`` file in ``folder_path`` whose name contains
    ``"thoughts_to"`` or ``"ensemble"`` and that ``extract_file_info`` can
    decode is considered. Two files are compared when they share the same
    thought type, answer type and source-model list. For each such pair the
    records are matched by position and scored with ``check_consistency``;
    the pair's consistency rate is the fraction of scored record pairs that
    agree. A pair is labelled "involved" when either file's target model is
    one of the shared source models.

    Args:
        folder_path: Directory holding the result files.

    Returns:
        ``{thought_type: {answer_type: {'involved': stats,
        'not_involved': stats}}}`` where each ``stats`` is the dict produced
        by ``calculate_stats``. Returns ``{}`` when ``folder_path`` is not an
        existing directory or no comparable pair of files is found.
    """
    # isdir rather than exists: a path to a regular file would otherwise make
    # os.listdir raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        return {}

    # Metadata for each usable result file, keyed by filename. Sorting makes
    # the pair order (and thus float accumulation order) reproducible,
    # because os.listdir returns entries in arbitrary order.
    file_info = {}
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.jsonl') and ("thoughts_to" in filename or "ensemble" in filename):
            info = extract_file_info(filename)
            if info[0]:  # valid thought type
                file_info[filename] = info

    # Each file is parsed at most once, and only if it takes part in at least
    # one eligible pair. This trades memory (parsed files stay resident) for
    # not re-reading a file from disk for every pair it belongs to.
    parsed_cache = {}

    def load(name):
        if name not in parsed_cache:
            parsed_cache[name] = parse_jsonl(os.path.join(folder_path, name))
        return parsed_cache[name]

    # thought_type -> answer_type -> [(targets_involved, consistency_rate), ...]
    involvement_data = defaultdict(lambda: defaultdict(list))

    filenames = list(file_info)
    for position, file1 in enumerate(filenames):
        thought_type1, sources1, target1, answer_type1 = file_info[file1]

        for file2 in filenames[position + 1:]:
            thought_type2, sources2, target2, answer_type2 = file_info[file2]

            # Only compare same thought type and answer type
            if thought_type1 != thought_type2 or answer_type1 != answer_type2:
                continue

            # Only compare files with the same source models
            if sources1 != sources2:
                continue

            data1, data2 = load(file1), load(file2)
            if not data1 or not data2:
                continue

            is_thoughts_to = '_thoughts_to_' in file1

            # Records are paired by position; zip stops at the shorter file.
            # A None outcome marks an invalid answer and is left out.
            outcomes = [check_consistency(dp1, dp2, is_thoughts_to)
                        for dp1, dp2 in zip(data1, data2)]
            scored = [outcome for outcome in outcomes if outcome is not None]
            if not scored:
                continue

            consistency_rate = sum(scored) / len(scored)

            # sources1 == sources2 here, so checking against sources1 covers
            # both files: did either target model help create the CoT?
            targets_involved = (target1 in sources1 or target2 in sources1)

            involvement_data[thought_type1][answer_type1].append(
                (targets_involved, consistency_rate))

    # Aggregate results
    results = {}
    for thought_type in THOUGHT_TYPES:
        if thought_type not in involvement_data:
            continue

        results[thought_type] = {}
        for answer_type in ['full_text', 'without_answer']:
            if answer_type not in involvement_data[thought_type]:
                continue

            data = involvement_data[thought_type][answer_type]
            involved_rates = [rate for involved, rate in data if involved]
            not_involved_rates = [rate for involved, rate in data if not involved]

            if involved_rates or not_involved_rates:
                results[thought_type][answer_type] = {
                    'involved': calculate_stats(involved_rates),
                    'not_involved': calculate_stats(not_involved_rates)
                }

    return results

def plot_results(results):
    """Plot mean pairwise consistency per thought type and save it as a PDF.

    Two side-by-side panels are drawn, one for the ``'full_text'`` condition
    and one for ``'without_answer'``. In each panel every thought type gets a
    pair of bars with standard-error whiskers: solid for comparisons where
    the CoT creator was involved, hatched and lighter for those where it was
    not. The figure is written to ``cot_creator_involvement_consistency.pdf``
    in the current working directory and then shown.

    Args:
        results: Mapping read as
            ``results[thought_type][answer_type][group][stat]`` with ``group``
            in ``{'involved', 'not_involved'}`` and ``stat`` in
            ``{'mean', 'stderr'}``. A falsy value prints a notice and returns
            without plotting.

    Returns:
        None.
    """
    if not results:
        print("No results to plot")
        return

    def lighten(hex_color):
        """Return a paler '#rrggbb' colour: each channel is halved, then offset by 127."""
        channels = (int(hex_color[start:start + 2], 16) // 2 + 127 for start in (1, 3, 5))
        return '#' + ''.join(f'{channel:02x}' for channel in channels)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    answer_types = ['full_text', 'without_answer']
    titles = ['Full Text Conditions', 'Without Answer Conditions']

    for idx, (ax, answer_type, title) in enumerate(zip([ax1, ax2], answer_types, titles)):
        thought_types = [t for t in THOUGHT_TYPES if t in results and answer_type in results[t]]

        if not thought_types:
            ax.text(0.5, 0.5, 'No data available', ha='center', va='center',
                    transform=ax.transAxes, fontsize=12)
            ax.set_title(title, fontweight='bold', fontsize=14)
            continue

        x = np.arange(len(thought_types))
        width = 0.35

        # Extract data
        involved_means = [results[t][answer_type]['involved']['mean'] for t in thought_types]
        involved_stderrs = [results[t][answer_type]['involved']['stderr'] for t in thought_types]
        not_involved_means = [results[t][answer_type]['not_involved']['mean'] for t in thought_types]
        not_involved_stderrs = [results[t][answer_type]['not_involved']['stderr'] for t in thought_types]

        colors = [COLORS[t] for t in thought_types]
        light_colors = [lighten(c) for c in colors]

        # No per-bar labels: the figure legend below is built from explicit
        # handles, so artist labels would never be displayed.
        involved_bars = ax.bar(x - width/2, involved_means, width, yerr=involved_stderrs,
                               color=colors, capsize=4, edgecolor='black', linewidth=0.8)

        not_involved_bars = ax.bar(x + width/2, not_involved_means, width,
                                   yerr=not_involved_stderrs, color=light_colors, capsize=4,
                                   edgecolor='black', linewidth=0.8, hatch='///')

        # Styling
        ax.set_xlabel('Thought Type', fontweight='bold', fontsize=12)
        if idx == 0:
            # The y-axis label is only drawn on the left-hand panel.
            ax.set_ylabel('Average Pairwise Target Consistency Rate', fontweight='bold', fontsize=12)
        ax.set_title(title, fontweight='bold', fontsize=14)
        ax.set_xticks(x)
        ax.set_xticklabels(thought_types, fontsize=11)
        ax.grid(axis='y', linestyle='--', color='lightgray', linewidth=0.5)
        ax.set_ylim(0, 1.0)

        for spine in ['top', 'right']:
            ax.spines[spine].set_visible(False)
        for spine in ['left', 'bottom']:
            ax.spines[spine].set_linewidth(1.2)

        # Add value labels. Each bar group is paired with its own means via
        # the container returned by ax.bar, rather than relying on the
        # positional order of ax.patches. Zero means (no data) get no label.
        for bars, means in ((involved_bars, involved_means),
                            (not_involved_bars, not_involved_means)):
            for bar, mean in zip(bars, means):
                if mean > 0:
                    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                            f'{mean:.2f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

    # Legend. facecolor (not color) is used for the first handle: Patch's
    # `color` sets both face and edge and would override edgecolor='black'.
    handles = [
        Patch(facecolor='gray', edgecolor='black', linewidth=0.8, label='CoT Creator Involved'),
        Patch(facecolor='lightgray', edgecolor='black', linewidth=0.8, hatch='///',
              label='CoT Creator Not Involved')
    ]

    fig.legend(handles=handles, loc='lower center', ncol=2, frameon=True,
               fancybox=True, shadow=True, fontsize=12, bbox_to_anchor=(0.5, -0.05))

    plt.suptitle('CoT Creator Involvement in Pairwise Target Consistency',
                 fontweight='bold', fontsize=16, y=0.95)

    # tight_layout first, then reserve room for the suptitle and the legend.
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.15, top=0.85)
    plt.savefig('cot_creator_involvement_consistency.pdf', bbox_inches='tight',
                facecolor='white', dpi=300)
    plt.show()

def print_summary(results):
    """Print a plain-text report of the involvement analysis to stdout.

    For every thought type in ``THOUGHT_TYPES`` that is present in
    ``results``, and for each of the ``'full_text'`` / ``'without_answer'``
    conditions present under it, the mean, standard error and sample count
    are printed for the "involved" and "not involved" groups. The difference
    of the two means is added only when both groups have at least one sample.

    Args:
        results: Mapping read as
            ``results[thought_type][answer_type][group][stat]`` with ``group``
            in ``{'involved', 'not_involved'}`` and ``stat`` in
            ``{'mean', 'stderr', 'count'}``.
    """
    print("\nCoT CREATOR INVOLVEMENT ANALYSIS")
    print("=" * 50)

    reported_types = [name for name in THOUGHT_TYPES if name in results]
    for thought_type in reported_types:
        print(f"\n{thought_type.upper()}:")

        per_condition = results[thought_type]
        for answer_type in ('full_text', 'without_answer'):
            if answer_type in per_condition:
                print(f"\n  {answer_type.replace('_', ' ').title()}:")

                group_stats = per_condition[answer_type]
                with_creator = group_stats['involved']
                without_creator = group_stats['not_involved']

                print(f"    Creator Involved:     {with_creator['mean']:.3f} ± {with_creator['stderr']:.3f} (n={with_creator['count']})")
                print(f"    Creator Not Involved: {without_creator['mean']:.3f} ± {without_creator['stderr']:.3f} (n={without_creator['count']})")

                # A difference is only meaningful when both groups have data.
                both_populated = with_creator['count'] > 0 and without_creator['count'] > 0
                if both_populated:
                    gap = with_creator['mean'] - without_creator['mean']
                    print(f"    Difference:           {gap:+.3f}")

def main(folder_path):
    """Run the full involvement analysis for one results folder.

    Prints a banner, computes the statistics with
    ``analyze_involvement_consistency``, and, when any were produced, plots
    them, prints the text summary and a completion message. When nothing
    could be compared, a notice is printed instead.

    Args:
        folder_path: Directory holding the result files to analyse.
    """
    print("CoT Creator Involvement in Target Consistency Analysis")
    print(f"Folder: {folder_path}")
    print("-" * 60)

    analysis = analyze_involvement_consistency(folder_path)

    if analysis:
        plot_results(analysis)
        print_summary(analysis)
        print("Analysis complete.")
    else:
        print("No valid comparisons found.")

# Script entry point: analyse the "../outputs" folder. The path is relative,
# so it is resolved against the current working directory at run time.
if __name__ == "__main__":
    main("../outputs")

CoT Creator Involvement in Target Consistency Analysis
Folder: ../outputs
------------------------------------------------------------
