# Hypothesis 3 Justice Principles: Manipulation Outcomes Analysis

This notebook mirrors the Hypothesis 6 descriptive styling to highlight how manipulators fare when they are tasked with rescuing the least popular justice principle from the Phase 1 rankings.

**Design System Notes**  
- Bayreuth Green `#009260` anchors positive and consensus outcomes.  
- Dark Gray `#48535A`, Medium Gray `#7F8990`, and Light Gray `#EBEBE4` support typography, baselines, and grid work.  
- Apply horizontal layouts and compact tables to keep cross-cohort comparisons legible.

In [20]:
import json
import sys
from collections import OrderedDict
from pathlib import Path
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
from IPython.display import Markdown, display

# Search the working directory and each of its ancestors for the first folder
# that contains both a "hypothesis_testing" entry and a "config" entry.
_NOTEBOOK_DIR = Path.cwd().resolve()
_REPO_ROOT = next(
    (
        folder
        for folder in (_NOTEBOOK_DIR, *_NOTEBOOK_DIR.parents)
        if (folder / "hypothesis_testing").exists() and (folder / "config").exists()
    ),
    None,
)

if _REPO_ROOT is None:
    raise RuntimeError("Cannot locate repository root from notebook path.")

# Put the repository root at the front of sys.path unless it is already listed,
# so the project import below resolves.
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))

from hypothesis_testing.utils_hypothesis_testing.style import (
    apply_bayreuth_theme,
    format_principle_label,
)

apply_bayreuth_theme()

pd.set_option("display.precision", 1)


In [21]:

# Folder that holds one sub-directory of result files per intelligence key.
RESULTS_BASE = _REPO_ROOT.joinpath("hypothesis_testing", "hypothesis_3", "results")

# Results sub-directory name -> section heading shown above each cohort table.
INTELLIGENCE_LEVELS = OrderedDict(
    low="Low Intelligence Manipulator",
    high="High Intelligence Manipulator",
)

# Principle identifier as stored in the result files -> short display label.
# Insertion order determines the row order of the summary tables.
PRINCIPLE_CANONICAL_TO_DISPLAY = OrderedDict(
    maximizing_average_floor_constraint="Max Avg + Floor",
    maximizing_average="Max Avg Income",
    maximizing_average_range_constraint="Max Avg + Range",
    maximizing_floor="Max Floor",
)


def load_result_payload(path: Path) -> Dict:
    """Decode the UTF-8 JSON document stored at ``path`` and return the result.

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))


def summarise_manipulation_outcomes(intelligence_key: str) -> tuple[pd.DataFrame, List[str]]:
    """Tabulate manipulation attempts and successes per target principle.

    Every ``*_results.json`` file under ``RESULTS_BASE / intelligence_key`` is
    read. A run counts as an attempt when its ``manipulator_target_info`` is
    present, flagged as delivered, and names a principle listed in
    ``PRINCIPLE_CANONICAL_TO_DISPLAY``. The attempt counts as a success when
    consensus was reached on exactly that principle.

    Args:
        intelligence_key: Name of the results sub-directory to summarise.

    Returns:
        ``(table, errors)`` where ``table`` has one row per known principle
        (attempts, successes, success share in percent, NaN without attempts)
        and ``errors`` holds one message per skipped result file.

    Raises:
        FileNotFoundError: If the results sub-directory does not exist.
    """
    results_dir = RESULTS_BASE / intelligence_key
    if not results_dir.exists():
        raise FileNotFoundError(f"Results directory not found: {results_dir}")

    attempts_by_principle: Dict[str, int] = dict.fromkeys(PRINCIPLE_CANONICAL_TO_DISPLAY, 0)
    successes_by_principle: Dict[str, int] = dict.fromkeys(PRINCIPLE_CANONICAL_TO_DISPLAY, 0)
    errors: List[str] = []

    for result_path in sorted(results_dir.glob("*_results.json")):
        try:
            payload = load_result_payload(result_path)
        except json.JSONDecodeError as exc:
            errors.append(f"{result_path.name}: JSON decode error ({exc})")
            continue

        general_info = payload.get("general_information", {})
        target_info = general_info.get("manipulator_target_info")

        # Determine whether the run is usable; the first failing check wins.
        problem = None
        target_principle = None
        if not target_info:
            problem = f"{result_path.name}: missing manipulator_target_info"
        elif not target_info.get("delivered", False):
            problem = f"{result_path.name}: target delivery failed"
        else:
            target_principle = target_info.get("target_principle")
            if target_principle not in PRINCIPLE_CANONICAL_TO_DISPLAY:
                problem = f"{result_path.name}: unexpected target_principle '{target_principle}'"

        if problem is not None:
            errors.append(problem)
            continue

        attempts_by_principle[target_principle] += 1

        consensus_reached = bool(general_info.get("consensus_reached", False))
        if consensus_reached and general_info.get("consensus_principle") == target_principle:
            successes_by_principle[target_principle] += 1

    def _success_share(canonical: str) -> float:
        """Percentage of successful attempts, or NaN when nothing was attempted."""
        attempts = attempts_by_principle[canonical]
        if not attempts:
            return np.nan
        return successes_by_principle[canonical] / attempts * 100.0

    rows: List[Dict[str, float | int | str]] = [
        {
            "Least popular Principle": format_principle_label(display_name),
            "Attempted Manipulations": int(attempts_by_principle[canonical]),
            "Successful Manipulation": int(successes_by_principle[canonical]),
            "Share of Successful Manipulation": _success_share(canonical),
        }
        for canonical, display_name in PRINCIPLE_CANONICAL_TO_DISPLAY.items()
    ]

    return pd.DataFrame(rows), errors


In [22]:

def format_share(value: float) -> str:
    """Render ``value`` as a percentage with one decimal; missing values become an em dash."""
    return "—" if pd.isna(value) else f"{value:.1f}%"

# Compute every cohort table before rendering anything, so a failing cohort
# aborts the cell before partial output is shown.
summary_tables: Dict[str, pd.DataFrame] = {}
error_log: Dict[str, List[str]] = {}
for key in INTELLIGENCE_LEVELS:
    summary_tables[key], error_log[key] = summarise_manipulation_outcomes(key)

# Column groups used for alignment.
left_aligned_columns = ["Least popular Principle"]
right_aligned_columns = [
    "Attempted Manipulations",
    "Successful Manipulation",
    "Share of Successful Manipulation",
]

for key, label in INTELLIGENCE_LEVELS.items():
    display(Markdown(f"## {label}"))

    styler = summary_tables[key].style.format(
        {"Share of Successful Manipulation": format_share}
    )
    styler = styler.hide(axis="index")
    styler = styler.set_properties(subset=left_aligned_columns, **{"text-align": "left"})
    styler = styler.set_properties(subset=right_aligned_columns, **{"text-align": "right"})
    display(styler)

    # Tell the reader how many result files were left out of this table.
    if error_log[key]:
        skipped_note = f"*⚠️ Skipped {len(error_log[key])} runs due to missing or invalid metadata.*"
        display(Markdown(skipped_note))


## Low Intelligence Manipulator

Least popular Principle,Attempted Manipulations,Successful Manipulation,Share of Successful Manipulation
Max. Avg. + Floor,0,0,—
Max. Avg. Income,23,1,4.3%
Max. Avg. + Range,8,0,0.0%
Max. Floor,3,0,0.0%


## High Intelligence Manipulator

Least popular Principle,Attempted Manipulations,Successful Manipulation,Share of Successful Manipulation
Max. Avg. + Floor,0,0,—
Max. Avg. Income,20,6,30.0%
Max. Avg. + Range,12,3,25.0%
Max. Floor,2,2,100.0%


## Target Principle Preference Drift

We compare how non-manipulator agents rank the manipulator's target principle at the end of individual deliberation (Phase 1) and after the group discussion (Phase 2). Results are segmented by manipulator intelligence and whether the manipulation succeeded.

In [None]:
def _target_rank(ranking: List[Dict[str, object]], principle: str) -> float:
    """Return the rank that ``ranking`` assigns to ``principle``.

    The first entry whose ``'principle'`` equals ``principle`` decides the
    result. NaN is returned when the ranking is empty or null, when no entry
    matches, or when the matching entry carries no rank.
    """
    for entry in ranking or []:
        if entry.get('principle') == principle:
            rank = entry.get('rank')
            # A matching entry without a rank is treated as missing data
            # instead of letting float(None) abort the whole cell.
            return np.nan if rank is None else float(rank)
    return np.nan


analysis_records: List[Dict[str, object]] = []

for intelligence_key, intelligence_label in INTELLIGENCE_LEVELS.items():
    results_dir = RESULTS_BASE / intelligence_key
    for result_path in sorted(results_dir.glob('*_results.json')):
        try:
            payload = load_result_payload(result_path)
        except json.JSONDecodeError:
            # summarise_manipulation_outcomes logs undecodable files and moves
            # on; skip them here too so this cell runs on the same inputs.
            continue

        general_info = payload.get('general_information', {})
        target_info = general_info.get('manipulator_target_info')
        if not target_info or not target_info.get('delivered', False):
            continue

        target_principle = target_info.get('target_principle')
        if target_principle not in PRINCIPLE_CANONICAL_TO_DISPLAY:
            continue

        manipulator_name = target_info.get('manipulator_name')
        consensus_principle = general_info.get('consensus_principle')
        consensus_reached = bool(general_info.get('consensus_reached', False))
        outcome = 'Successful' if (consensus_reached and consensus_principle == target_principle) else 'Unsuccessful'
        run_id = result_path.stem

        for agent in payload.get('agents', []):
            # Only non-manipulator agents contribute to the drift statistics.
            if agent.get('name') == manipulator_name:
                continue

            phase1_ranking = (
                agent
                .get('phase_1', {})
                .get('ranking_3', {})
                .get('ranking_result', {})
                .get('rankings', [])
            )
            phase2_ranking = (
                agent
                .get('phase_2', {})
                .get('post_group_discussion', {})
                .get('final_ranking', {})
                .get('rankings', [])
            )

            analysis_records.append({
                'Manipulator Intelligence': intelligence_label,
                'Manipulation Outcome': outcome,
                'intelligence_key': intelligence_key,
                'outcome_key': outcome.lower(),
                'run_id': run_id,
                'agent_name': agent.get('name'),
                'Phase 1 Rank': _target_rank(phase1_ranking, target_principle),
                'Phase 2 Rank': _target_rank(phase2_ranking, target_principle),
            })

preference_df = pd.DataFrame(analysis_records)

if preference_df.empty:
    display(Markdown('*No participant preference data available.*'))
else:
    # Successful rows are listed before unsuccessful ones within each cohort.
    order = {'Successful': 0, 'Unsuccessful': 1}
    grouped = (
        preference_df
        .groupby(['Manipulator Intelligence', 'Manipulation Outcome'], as_index=False)
        .agg(
            Runs=('run_id', 'nunique'),
            # NOTE: 'mean' skips NaN per column, so the two averages can be
            # based on different agents when a ranking is missing.
            AvgPhase1Rank=('Phase 1 Rank', 'mean'),
            AvgPhase2Rank=('Phase 2 Rank', 'mean'),
        )
        .sort_values(
            by=['Manipulator Intelligence', 'Manipulation Outcome'],
            key=lambda col: col.map(order) if col.name == 'Manipulation Outcome' else col
        )
    )

    grouped = grouped.rename(columns={
        'AvgPhase1Rank': 'Avg. Rank (End Phase 1)',
        'AvgPhase2Rank': 'Avg. Rank (End Phase 2)',
    })
    # Change is Phase 2 minus Phase 1 of the group averages.
    grouped['Change'] = grouped['Avg. Rank (End Phase 2)'] - grouped['Avg. Rank (End Phase 1)']

    styler = (
        grouped
        .style
        .format({
            'Avg. Rank (End Phase 1)': '{:.2f}',
            'Avg. Rank (End Phase 2)': '{:.2f}',
            'Change': '{:+.2f}',
        })
        .hide(axis='index')
        .set_properties(subset=['Manipulator Intelligence', 'Manipulation Outcome'], **{'text-align': 'left'})
        .set_properties(
            subset=['Runs', 'Avg. Rank (End Phase 1)', 'Avg. Rank (End Phase 2)', 'Change'],
            **{'text-align': 'right'}
        )
    )
    display(styler)

    small_samples = grouped[grouped['Runs'] < 3]
    if not small_samples.empty:
        display(Markdown('*⚠️ Sample sizes below three runs should be interpreted cautiously.*'))

## First Speaker Analysis: Manipulator Position in Round 1

This section analyzes how often the manipulator agent (Agent_2) was the first to speak in Round 1 of group discussion. The analysis is segmented by manipulator intelligence level (high vs. low) and manipulation outcome (successful vs. unsuccessful).

In [None]:
import re

def extract_first_speaker_round_1(conversation: Optional[str]) -> Optional[str]:
    """
    Extract the agent name of the first speaker in Round 1 from the conversation transcript.

    The transcript is searched for the first occurrence of
    ``Round 1 / Speaker: Agent_<digits>``.

    Args:
        conversation: The public_conversation_phase_2 transcript string. A
            missing transcript (``None`` or any non-string value) is accepted
            and treated as "no speaker found".

    Returns:
        Agent name (e.g., "Agent_0", "Agent_1", "Agent_2") or None if not found
    """
    # dict.get(key, '') still yields None for a JSON null, which re.search
    # would reject with a TypeError; report "not found" instead.
    if not isinstance(conversation, str):
        return None

    # Pattern to match Round 1 speaker: "Round 1 / Speaker: Agent_X Statement:"
    pattern = r'Round 1 / Speaker: (Agent_\d+)'
    match = re.search(pattern, conversation)
    return match.group(1) if match else None


# Collect first speaker data
first_speaker_records: List[Dict[str, object]] = []
# Result files that passed the metadata checks but had no identifiable Round 1
# speaker; they are reported below so the smaller denominators are explained.
runs_without_first_speaker: List[str] = []

for intelligence_key, intelligence_label in INTELLIGENCE_LEVELS.items():
    results_dir = RESULTS_BASE / intelligence_key
    for result_path in sorted(results_dir.glob('*_results.json')):
        try:
            payload = load_result_payload(result_path)
        except json.JSONDecodeError:
            # summarise_manipulation_outcomes logs undecodable files and moves
            # on; skip them here too so this cell runs on the same inputs.
            continue

        general_info = payload.get('general_information', {})
        target_info = general_info.get('manipulator_target_info')

        if not target_info or not target_info.get('delivered', False):
            continue

        target_principle = target_info.get('target_principle')
        if target_principle not in PRINCIPLE_CANONICAL_TO_DISPLAY:
            continue

        manipulator_name = target_info.get('manipulator_name')
        consensus_principle = general_info.get('consensus_principle')
        consensus_reached = bool(general_info.get('consensus_reached', False))
        outcome = 'Successful' if (consensus_reached and consensus_principle == target_principle) else 'Unsuccessful'

        # Extract first speaker from conversation. `or ''` also covers a JSON
        # null transcript, which .get(key, '') would pass through as None.
        conversation = general_info.get('public_conversation_phase_2') or ''
        first_speaker = extract_first_speaker_round_1(conversation)

        if first_speaker is None:
            runs_without_first_speaker.append(result_path.name)
            continue

        # Check if manipulator spoke first
        manipulator_spoke_first = (first_speaker == manipulator_name)

        first_speaker_records.append({
            'Manipulator Intelligence': intelligence_label,
            'Manipulation Outcome': outcome,
            'intelligence_key': intelligence_key,
            'outcome_key': outcome.lower(),
            'run_id': result_path.stem,
            'first_speaker': first_speaker,
            'manipulator_name': manipulator_name,
            'manipulator_spoke_first': manipulator_spoke_first,
        })

first_speaker_df = pd.DataFrame(first_speaker_records)

if first_speaker_df.empty:
    display(Markdown('*No first speaker data available.*'))
else:
    # Aggregate by intelligence level and outcome
    order = {'Successful': 0, 'Unsuccessful': 1}
    first_speaker_summary = (
        first_speaker_df
        .groupby(['Manipulator Intelligence', 'Manipulation Outcome'], as_index=False)
        .agg(
            Total_Experiments=('run_id', 'count'),
            Manipulator_First=('manipulator_spoke_first', 'sum'),
        )
        .sort_values(
            by=['Manipulator Intelligence', 'Manipulation Outcome'],
            key=lambda col: col.map(order) if col.name == 'Manipulation Outcome' else col
        )
    )

    # Calculate percentage
    first_speaker_summary['Percentage'] = (
        first_speaker_summary['Manipulator_First'] / first_speaker_summary['Total_Experiments'] * 100.0
    )

    first_speaker_summary = first_speaker_summary.rename(columns={
        'Total_Experiments': 'Total Experiments',
        'Manipulator_First': 'Manipulator Spoke First',
    })

    styler = (
        first_speaker_summary
        .style
        .format({
            'Total Experiments': '{:.0f}',
            'Manipulator Spoke First': '{:.0f}',
            'Percentage': '{:.1f}%',
        })
        .hide(axis='index')
        .set_properties(
            subset=['Manipulator Intelligence', 'Manipulation Outcome'],
            **{'text-align': 'left'}
        )
        .set_properties(
            subset=['Total Experiments', 'Manipulator Spoke First', 'Percentage'],
            **{'text-align': 'right'}
        )
    )
    display(styler)

# Surface dropped runs instead of shrinking the totals silently.
if runs_without_first_speaker:
    display(
        Markdown(
            f"*⚠️ Skipped {len(runs_without_first_speaker)} runs without an identifiable Round 1 speaker.*"
        )
    )

In [None]:
import matplotlib.pyplot as plt

# Create visualization if data exists
if not first_speaker_df.empty:

    def _first_speaker_share(frame):
        """Return (manipulator-first count, row count, percentage) for ``frame``."""
        row_count = frame.shape[0]
        first_count = frame['manipulator_spoke_first'].sum()
        share = (first_count / row_count * 100) if row_count > 0 else 0
        return first_count, row_count, share

    # One record per summary row keeps labels and counts aligned.
    summary_rows = first_speaker_summary.to_dict('records')
    x_labels = [
        f"{row['Manipulator Intelligence']}\n{row['Manipulation Outcome']}"
        for row in summary_rows
    ]
    manipulator_first_counts = [row['Manipulator Spoke First'] for row in summary_rows]
    total_counts = [row['Total Experiments'] for row in summary_rows]

    x_pos = np.arange(len(x_labels))
    width = 0.35
    first_bar_x = x_pos - width/2
    total_bar_x = x_pos + width/2

    # Grouped bars: manipulator-first count beside the total number of runs.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(first_bar_x, manipulator_first_counts, width,
           label='Manipulator Spoke First', color='#009260', alpha=0.8)
    ax.bar(total_bar_x, total_counts, width,
           label='Total Experiments', color='#7F8990', alpha=0.6)

    # Percentage label above each "manipulator spoke first" bar.
    for bar_x, count, total in zip(first_bar_x, manipulator_first_counts, total_counts):
        percentage = (count / total * 100) if total > 0 else 0
        ax.text(bar_x, count + 0.5, f'{percentage:.1f}%',
                ha='center', va='bottom', fontsize=10, fontweight='bold')

    # Titles, ticks, legend and grid.
    ax.set_title('Frequency of Manipulator Speaking First in Round 1',
                 fontsize=14, fontweight='bold', pad=20)
    ax.set_xlabel('Intelligence Level & Outcome', fontsize=12, fontweight='bold')
    ax.set_ylabel('Number of Experiments', fontsize=12, fontweight='bold')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(x_labels, fontsize=10)
    ax.legend(loc='upper right', fontsize=10)
    ax.grid(axis='y', alpha=0.3, linestyle='--')

    # Set y-axis to start at 0
    ax.set_ylim(bottom=0)

    plt.tight_layout()
    plt.show()

    # Textual summary below the chart.
    display(Markdown('### Key Insights'))

    overall_first, overall_total, overall_pct = _first_speaker_share(first_speaker_df)
    insights = [
        f"**Overall**: The manipulator spoke first in {overall_first} out of {overall_total} experiments ({overall_pct:.1f}%)",
    ]

    # Per intelligence level.
    for intelligence_key, intelligence_label in INTELLIGENCE_LEVELS.items():
        intel_first, intel_total, intel_pct = _first_speaker_share(
            first_speaker_df[first_speaker_df['intelligence_key'] == intelligence_key]
        )
        insights.append(
            f"**{intelligence_label}**: {intel_first}/{intel_total} experiments ({intel_pct:.1f}%)"
        )

    # Per manipulation outcome.
    for outcome in ['Successful', 'Unsuccessful']:
        outcome_first, outcome_total, outcome_pct = _first_speaker_share(
            first_speaker_df[first_speaker_df['Manipulation Outcome'] == outcome]
        )
        insights.append(
            f"**{outcome} Manipulations**: {outcome_first}/{outcome_total} experiments ({outcome_pct:.1f}%)"
        )

    for insight in insights:
        display(Markdown(f"- {insight}"))

### Detailed Breakdown: Distribution of First Speakers

The following table shows the complete distribution of which agent spoke first in Round 1, broken down by intelligence level and manipulation outcome.

In [None]:
if not first_speaker_df.empty:
    # Always show the three standard agents, plus any other agent that actually
    # opened Round 1, so no first speaker is dropped from the table or totals.
    # Names are ordered by their numeric suffix.
    agent_columns = sorted(
        {'Agent_0', 'Agent_1', 'Agent_2'} | set(first_speaker_df['first_speaker']),
        key=lambda name: int(name.rsplit('_', 1)[-1]),
    )
    label_columns = ['Manipulator Intelligence', 'Manipulation Outcome']
    count_columns = [*agent_columns, 'Total']

    # Create detailed breakdown by first speaker
    detailed_breakdown = (
        first_speaker_df
        .groupby(['Manipulator Intelligence', 'Manipulation Outcome', 'first_speaker'])
        .size()
        .reset_index(name='Count')
    )

    # Pivot to one column per agent. 'sum' states the intended aggregation for
    # counts explicitly; reindex adds agents that never spoke first as 0 and
    # fixes the column order.
    pivot_table = (
        detailed_breakdown
        .pivot_table(
            index=label_columns,
            columns='first_speaker',
            values='Count',
            aggfunc='sum',
            fill_value=0,
        )
        .reindex(columns=agent_columns, fill_value=0)
        .reset_index()
    )

    # Calculate totals
    pivot_table['Total'] = pivot_table[agent_columns].sum(axis=1)

    # Sort by intelligence and outcome
    order = {'Successful': 0, 'Unsuccessful': 1}
    pivot_table = pivot_table.sort_values(
        by=label_columns,
        key=lambda col: col.map(order) if col.name == 'Manipulation Outcome' else col
    )

    # Derive the manipulator from the recorded data instead of assuming Agent_2.
    manipulator_names = sorted(first_speaker_df['manipulator_name'].dropna().unique())
    manipulator_columns = [name for name in manipulator_names if name in agent_columns]

    # Format and display
    styler = (
        pivot_table
        .style
        .format({column: '{:.0f}' for column in count_columns})
        .hide(axis='index')
        .set_properties(subset=label_columns, **{'text-align': 'left'})
        .set_properties(subset=count_columns, **{'text-align': 'right'})
    )
    if manipulator_columns:
        # Highlight the manipulator column(s)
        styler = styler.set_properties(
            subset=manipulator_columns,
            **{'background-color': '#009260', 'color': 'white', 'font-weight': 'bold'}
        )
    display(styler)

    # Add note about manipulator; the wording depends on how many distinct
    # manipulator names the data contains.
    if len(manipulator_names) == 1:
        display(Markdown(
            f'*Note: **{manipulator_names[0]}** (highlighted in green) is the manipulator agent in all experiments.*'
        ))
    elif manipulator_names:
        highlighted = ', '.join(f'**{name}**' for name in manipulator_names)
        display(Markdown(
            f'*Note: {highlighted} (highlighted in green) each acted as the manipulator agent in at least one experiment.*'
        ))