# Hypothesis 4 — Main Notebook

This notebook investigates Hypothesis 4: The number of AI agents affects fairness judgments by the MAAI in statistically significant ways, even when the relative composition of AI agents remains constant. It generates experiment configurations with varying numbers of agents (5, 10, 20), runs them, and analyzes the results.

In [None]:
# Imports
import sys, os
from pathlib import Path

# Ensure repo root on sys.path (for local package imports)
def _add_repo_root_to_sys_path():
    """Locate the repository root and make it importable.

    Walks upward from the current working directory (inclusive) and picks
    the first directory that contains both a ``main.py`` file and a
    ``hypothesis_testing`` directory. That directory is put at the front of
    ``sys.path`` unless it is already listed there.

    Returns:
        The detected root directory, or the resolved working directory when
        no ancestor matches (in which case ``sys.path`` is left untouched).
    """
    start = Path.cwd().resolve()
    for candidate in (start, *start.parents):
        looks_like_root = (
            (candidate / 'main.py').exists()
            and (candidate / 'hypothesis_testing').is_dir()
        )
        if not looks_like_root:
            continue
        root_str = str(candidate)
        if root_str not in sys.path:
            sys.path.insert(0, root_str)
        return candidate
    # Nothing matched: fall back to the working directory itself.
    return start


_REPO_ROOT = _add_repo_root_to_sys_path()

import json
import random
import shutil
import yaml
from collections import Counter
import numpy as np
from scipy.stats import chi2_contingency
from hypothesis_testing.utils_hypothesis_testing.runner import (
    list_config_files,
    select_configs,
    run_configs_in_parallel,
)


In [None]:
# Configuration paths and constants
# All Hypothesis 4 artifacts live under one directory below the repo root.
_H4_DIR = _REPO_ROOT / 'hypothesis_testing' / 'hypothesis_4'
CONFIG_DIR = _H4_DIR / 'configs'
TERMINAL_OUTPUTS_DIR = _H4_DIR / 'terminal_outputs'
RESULTS_DIR = _H4_DIR / 'results'
TRANSCRIPTS_DIR = _H4_DIR / 'transcripts'

# Pool of model identifiers that agents are drawn from.
MODEL_LIST = [
    "gemini-2.5-pro",
    "gemini-2.5-flash",
    "gemini-2.5-flash-lite",
]

# Probability of each income class; the five values sum to 1.0.
INCOME_CLASS_PROBS = {
    'high': 0.05,
    'medium_high': 0.10,
    'medium': 0.50,
    'medium_low': 0.25,
    'low': 0.10,
}

# Ensure directories exist
for _dir in (CONFIG_DIR, TERMINAL_OUTPUTS_DIR, RESULTS_DIR, TRANSCRIPTS_DIR):
    _dir.mkdir(parents=True, exist_ok=True)


## 1. Generate Configurations for Hypothesis 4

In [None]:
def make_agents(num_agents: int, temp: float, rng: random.Random) -> list[dict]:
    """Build the agent entries for one experiment configuration.

    Args:
        num_agents: How many agent dicts to create.
        temp: Temperature shared by every agent (stored as ``float``).
        rng: Random source used to pick each agent's model from
            ``MODEL_LIST``; one draw per agent, in agent order.

    Returns:
        A list of ``num_agents`` dicts named ``Agent_1`` .. ``Agent_N``.
    """
    return [
        {
            'name': f'Agent_{position}',
            'personality': 'You are an American college student',
            'model': rng.choice(MODEL_LIST),
            'temperature': float(temp),
            'memory_character_limit': 25000,
            'reasoning_enabled': True,
        }
        for position in range(1, num_agents + 1)
    ]

def build_config(num_agents: int, temp: float, seed_val: int, rng: random.Random) -> dict:
    """Assemble one experiment configuration as a plain dict.

    Args:
        num_agents: Number of agents to include.
        temp: Temperature passed through to every agent.
        seed_val: Value stored under ``'seed'`` (coerced to ``int``).
        rng: Random source forwarded to ``make_agents``.

    Returns:
        The configuration dict; key order is the order written below.
    """
    config = {'language': 'English'}
    config['seed'] = int(seed_val)
    config['agents'] = make_agents(num_agents, temp, rng)
    # Fixed settings shared by every generated configuration.
    config.update({
        'utility_agent_model': 'gemini-2.0-flash-lite-001',
        'utility_agent_temperature': 0.0,
        'phase2_rounds': 10,
        'distribution_range_phase2': [2, 6],
        'income_class_probabilities': INCOME_CLASS_PROBS,
        'original_values_mode': {'enabled': True},
    })
    return config

# Experiment grid: 3 agent counts x 11 configs each = 33 configs total.
GLOBAL_SEED = 42000
AGENT_COUNTS = [5, 10, 20]  # As per Hypothesis 4
NUM_CONFIGS_PER_COUNT = 11

# One master RNG drives every per-config seed and temperature, so the whole
# batch is reproducible from GLOBAL_SEED.
master_rng = random.Random(GLOBAL_SEED)

generated_files = []
config_idx = 1  # running 1-based index across all agent counts
for count in AGENT_COUNTS:
    for _ in range(NUM_CONFIGS_PER_COUNT):
        # Draw order on master_rng matters: seed first, then temperature.
        seed_val = master_rng.randint(0, 2**31 - 1)
        temp = master_rng.uniform(0.0, 1.5)
        # Each config gets its own RNG, seeded with the value it stores.
        cfg_rng = random.Random(seed_val)
        cfg = build_config(num_agents=count, temp=temp, seed_val=seed_val, rng=cfg_rng)

        fname = CONFIG_DIR / f'hypothesis_4_agents_{count}_config_{config_idx}.yaml'
        fname.write_text(yaml.safe_dump(cfg, sort_keys=False))
        generated_files.append(fname)
        config_idx = len(generated_files) + 1

print(f'Generated {len(generated_files)} config files.')


## 2. Run Configs (Parallel + Selective)

In [None]:
# Selection and execution controls (edit these before running the cell)
SELECT_INDICES = None   # None: no index filter is passed to select_configs
SELECT_NAMES = None     # None: no name filter is passed to select_configs
CONCURRENCY = 5         # number of parallel workers
TIMEOUT_SECONDS = None  # e.g., 900 for 15 minutes per run

# Discover every config file in the config directory
configs = list_config_files(CONFIG_DIR)
print(f'Found {len(configs)} configs')

# Narrow the list down according to the selection controls
selected = select_configs(
    configs,
    include_indices=SELECT_INDICES,
    include_names=SELECT_NAMES,
)
print(f'Selected {len(selected)} configs to run')

# Launch the selected configs; logs and results go to the Hypothesis 4 folders
run_results = run_configs_in_parallel(
    selected,
    concurrency=CONCURRENCY,
    logs_dir=TERMINAL_OUTPUTS_DIR,
    results_dir=RESULTS_DIR,
    timeout_sec=TIMEOUT_SECONDS,
)

# Quick summary: count the runs whose result reports a truthy 'ok'
ok = len([r for r in run_results if r.get('ok')])
print(f'Completed: {ok}/{len(run_results)} OK')


## 3. Analysis — Principles vs. Disagreements

In [None]:
# Outcome labels used throughout the analysis: the four principles a run can
# reach consensus on, plus 'disagreement' for everything else.
CATEGORIES = [
    'maximizing_floor',
    'maximizing_average',
    'maximizing_average_floor_constraint',
    'maximizing_average_range_constraint',
    'disagreement',
]


def categorize_result(result_path: Path) -> str:
    """Map one results JSON file to an entry of ``CATEGORIES``.

    The file is expected to hold an object with a ``general_information``
    mapping. The run's principle is returned when ``consensus_reached`` is
    truthy and ``consensus_principle`` is one of ``CATEGORIES``; every other
    case yields ``'disagreement'``.

    Files that cannot be read or parsed, or whose structure is not the
    expected mapping, are still counted as ``'disagreement'`` but now emit a
    ``RuntimeWarning`` so that broken result files do not silently skew the
    tallies.

    Args:
        result_path: Path to a ``*_results.json`` file.

    Returns:
        One of the strings in ``CATEGORIES``.
    """
    import warnings  # local import: only this helper needs it

    try:
        with open(result_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        warnings.warn(
            f'Could not read result file {result_path}: {exc}; '
            'counting it as disagreement.',
            RuntimeWarning,
            stacklevel=2,
        )
        return 'disagreement'

    gi = data.get('general_information', {}) if isinstance(data, dict) else None
    if not isinstance(gi, dict):
        warnings.warn(
            f'Unexpected structure in result file {result_path}; '
            'counting it as disagreement.',
            RuntimeWarning,
            stacklevel=2,
        )
        return 'disagreement'

    principle = gi.get('consensus_principle')
    if gi.get('consensus_reached', False) and principle in CATEGORIES:
        return principle
    return 'disagreement'

# One Counter of outcome categories per agent count.
counts_by_agents = {count: Counter() for count in AGENT_COUNTS}
result_files = sorted(RESULTS_DIR.glob('*_results.json'))

# Attribute each result file to the first agent count whose
# '_agents_<count>_' tag appears in its file name; files without a
# matching tag are ignored.
for rp in result_files:
    matched = next(
        (count for count in AGENT_COUNTS if f'_agents_{count}_' in rp.name),
        None,
    )
    if matched is not None:
        counts_by_agents[matched][categorize_result(rp)] += 1

# Print one table per agent count, listing every category (zeros included).
for count in AGENT_COUNTS:
    tally = counts_by_agents[count]
    print(f"--- Agent Count: {count} ---")
    for cat in CATEGORIES:
        print(f'{cat:38} | {tally[cat]}')


## 4. Statistical Tests — Fisher–Freeman–Halton and Cramér's V

In [None]:
# Build contingency table (rows=categories, cols=agent_counts)
contingency_table = [
    [counts_by_agents[count].get(cat, 0) for count in AGENT_COUNTS]
    for cat in CATEGORIES
]

contingency = np.array(contingency_table)
print("Contingency Table:", contingency, sep="\n")

def fisher_freeman_halton_pvalue_r(contingency: np.ndarray) -> float | None:
    if shutil.which('Rscript') is None:
        return None
    r_matrix = ','.join(str(int(x)) for x in contingency.flatten(order='C'))
    nrow = contingency.shape[0]
    r_code = f"""
m <- matrix(c({r_matrix}), nrow={nrow}, byrow=TRUE);
    f <- tryCatch(fisher.test(m), error=function(e) NA);
    if (is.list(f)) {{ cat(f$p.value) }} else {{ cat('NA') }}
    """
    import subprocess
    try:
        out = subprocess.check_output(['Rscript', '-e', r_code], stderr=subprocess.STDOUT, text=True)
        out = out.strip()
        return float(out) if out and out != 'NA' else None
    except Exception:
        return None

# Run the exact test and report the outcome. A missing p-value has two
# distinct causes, so report them separately instead of always blaming a
# missing Rscript.
p_ffh = fisher_freeman_halton_pvalue_r(contingency)
if p_ffh is not None:
    print(f'Fisher–Freeman–Halton exact test p-value: {p_ffh:.6f}')
elif shutil.which('Rscript') is None:
    print('Rscript not found, skipping Fisher-Freeman-Halton test.')
else:
    print('Rscript is available, but the Fisher-Freeman-Halton test did not return a p-value.')
