# Hypothesis 1 — Main Notebook

This notebook generates 33 experiment configurations, runs them in parallel with configurable concurrency and selection, logs outputs, and analyzes how often groups reach each of the four principles vs. disagreement.

In [2]:
# Imports
import sys, os
from pathlib import Path

# Ensure repo root on sys.path (for local package imports)
def _add_repo_root_to_sys_path():
    """Locate the repository root and make it importable.

    Starting at the resolved current working directory and moving up through
    its parents, the first directory that contains both a ``main.py`` entry
    and a ``hypothesis_testing`` directory is taken as the repository root.
    It is prepended to ``sys.path`` (unless it is already listed) and
    returned. If no directory matches, the resolved working directory is
    returned and ``sys.path`` is left untouched.
    """
    start = Path.cwd().resolve()
    for candidate in (start, *start.parents):
        has_entry_point = (candidate / 'main.py').exists()
        if not (has_entry_point and (candidate / 'hypothesis_testing').is_dir()):
            continue
        root_str = str(candidate)
        if root_str not in sys.path:
            sys.path.insert(0, root_str)
        return candidate
    return start


_REPO_ROOT = _add_repo_root_to_sys_path()

import json
import random
import shutil
import yaml
from collections import Counter
import numpy as np
from scipy.stats import chi2_contingency
from hypothesis_testing.utils_hypothesis_testing.runner import (
    list_config_files,
    select_configs,
    run_configs_in_parallel,
)


In [4]:
# Configuration paths and constants
# Every Hypothesis 1 artefact lives under the same sub-directory of the repo.
_HYPOTHESIS_1_DIR = _REPO_ROOT / 'hypothesis_testing' / 'hypothesis_1'
CONFIG_DIR = _HYPOTHESIS_1_DIR / 'configs'
TERMINAL_OUTPUTS_DIR = _HYPOTHESIS_1_DIR / 'terminal_outputs'
RESULTS_DIR = _HYPOTHESIS_1_DIR / 'results'
TRANSCRIPTS_DIR = _HYPOTHESIS_1_DIR / 'transcripts'

# Placeholder model list for participant agents
MODEL_LIST = ["gemini-2.5-pro", "gemini-2.5-flash", "gemini-2.5-flash-lite"]

# Income class probabilities (must sum to 1.0)
# Same as in Frohlich & Oppenheimer (1992) for the initial distribution
INCOME_CLASS_PROBS = dict(
    high=0.05,
    medium_high=0.10,
    medium=0.50,
    medium_low=0.25,
    low=0.10,
)

# Ensure directories exist
for _directory in (CONFIG_DIR, TERMINAL_OUTPUTS_DIR, RESULTS_DIR, TRANSCRIPTS_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

CONFIG_DIR, TERMINAL_OUTPUTS_DIR, RESULTS_DIR, TRANSCRIPTS_DIR


(PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_1/configs'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_1/terminal_outputs'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_1/results'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_1/transcripts'))

## 1. Generate 33 Configurations

In [3]:
def make_agents(temp: float, rng: random.Random) -> list[dict]:
    agents = []
    for i in range(0, 5):  # 5 participant agents
        agents.append({
            'name': f'Agent_{i+1}',
            'personality': 'You are an American college student',
            'model': rng.choice(MODEL_LIST),
            'temperature': float(temp),
            'memory_character_limit': 25000,
            'reasoning_enabled': True,
        })
    return agents

def build_config(temp: float, seed_val: int, rng: random.Random) -> dict:
    """Assemble the configuration dict for a single experiment run.

    ``temp`` and ``rng`` are forwarded to ``make_agents``; ``seed_val`` is
    stored (as ``int``) under ``'seed'``. All other entries are fixed values.
    Keys are inserted in the same order in which they are meant to appear
    when the dict is dumped without key sorting.
    """
    seed = int(seed_val)
    config = {
        'language': 'English',
        'seed': seed,
        'agents': make_agents(temp, rng),
    }
    # Settings that are identical for every condition
    config.update(
        utility_agent_model='gemini-2.0-flash-lite-001',
        utility_agent_temperature=0.0,
        phase2_rounds=10,
        distribution_range_phase2=[2, 6],
        income_class_probabilities=INCOME_CLASS_PROBS,
        original_values_mode={'enabled': True},
    )
    return config

# Temperatures: 11 with 0, 11 with U(0,1), 11 with U(0,1.5)
# A single seeded master RNG drives every draw below (22 temperatures first,
# then one seed per config), so the statement order is part of the result:
# reordering the draws would change the generated configurations.
GLOBAL_SEED = 21000
master_rng = random.Random(GLOBAL_SEED)  # deterministic config generation

temps_fixed = [0.0] * 11
temps_u01 = [master_rng.uniform(0.0, 1.0) for _ in range(11)]
temps_u015 = [master_rng.uniform(0.0, 1.5) for _ in range(11)]
all_temps = temps_fixed + temps_u01 + temps_u015

generated_files = []
# Conditions are numbered from 1 in the order: fixed 0, U(0,1), U(0,1.5).
for idx, temp in enumerate(all_temps, start=1):
    seed_val = master_rng.randint(0, 2**31 - 1)
    cfg_rng = random.Random(seed_val)  # tie agent sampling to the seed
    cfg = build_config(temp=temp, seed_val=seed_val, rng=cfg_rng)
    fname = CONFIG_DIR / f'hypothesis_1_condition_{idx}_config.yaml'
    # Existing files with the same name are overwritten ('w' mode).
    with open(fname, 'w') as f:
        # sort_keys=False keeps the keys in the order build_config inserted them
        yaml.safe_dump(cfg, f, sort_keys=False)
    generated_files.append(fname)

# Cell output: number of files written and the first path (None if nothing was written)
len(generated_files), generated_files[0] if generated_files else None


(33,
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_1/configs/hypothesis_1_condition_1_config.yaml'))

## 2. Run Configs (Parallel + Selective)

In [4]:
# Discover all config files
configs = list_config_files(CONFIG_DIR)
print(f'Found {len(configs)} configs')

# Selection controls
All_SELECT_INDICES = list(range(1, 34))  # every condition, 1..33
SELECT_INDICES = [15, 32]   # e.g., [1,2,3] for the first three
SELECT_NAMES = None         # e.g., ['condition_1', 'condition_33']
CONCURRENCY = 2             # adjust parallel workers
TIMEOUT_SECONDS = None      # e.g., 900 for 15 minutes per run

selected = select_configs(
    configs,
    include_indices=SELECT_INDICES,
    include_names=SELECT_NAMES,
)
print(f'Selected {len(selected)} configs to run')

# Each run writes its terminal log and its results file into the folders below
run_results = run_configs_in_parallel(
    selected,
    concurrency=CONCURRENCY,
    logs_dir=TERMINAL_OUTPUTS_DIR,
    results_dir=RESULTS_DIR,
    timeout_sec=TIMEOUT_SECONDS,
)

# Quick summary
ok = sum(bool(r.get('ok')) for r in run_results)
print(f'Completed: {ok}/{len(run_results)} OK')
run_results[:3]  # show a few

Found 33 configs
Selected 2 configs to run
Completed: 2/2 OK


[{'config': '/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_1/configs/hypothesis_1_condition_15_config.yaml',
  'log': '/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_1/terminal_outputs/hypothesis_1_condition_15_log',
  'result': '/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_1/results/hypothesis_1_condition_15_config_results.json',
  'returncode': 0,
  'ok': True},
 {'config': '/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_1/configs/hypothesis_1_condition_32_config.yaml',
  'log': '/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_1/terminal_outputs/hypothesis_1_condition_32_log',
  'result': '/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_1/results/hypothesis_1_condition_32_config_results.json',
  'returncode': 0,
  'ok': True}]

## 3. Analysis — Principles vs. Disagreements

Counts how often runs ended in consensus for each principle vs. disagreement (no consensus).

In [5]:
# Outcome categories in reporting order: the four principles, then "no consensus".
CATEGORIES = [
    'maximizing_floor',
    'maximizing_average',
    'maximizing_average_floor_constraint',
    'maximizing_average_range_constraint',
    'disagreement',
]

def categorize_result(result_path: Path) -> str:
    """Map one results JSON file to an entry of ``CATEGORIES``.

    The file's ``general_information`` section is read; when
    ``consensus_reached`` is truthy and ``consensus_principle`` is one of
    ``CATEGORIES``, that principle is returned. Every other case yields
    ``'disagreement'``.

    Files that cannot be read or parsed are still returned as
    ``'disagreement'`` (so callers keep working), but a ``RuntimeWarning`` is
    emitted so that a broken file is not silently counted as a genuine
    disagreement.

    Args:
        result_path: Path to a ``*_results.json`` file.

    Returns:
        One of the strings in ``CATEGORIES``.
    """
    import warnings  # local import keeps this cell self-contained

    try:
        with open(result_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        gi = data.get('general_information', {})
        consensus = gi.get('consensus_reached', False)
        principle = gi.get('consensus_principle')
    except Exception as exc:  # best-effort: keep the analysis running, but say so
        warnings.warn(
            f'Could not categorize {result_path} ({exc!r}); counting it as disagreement',
            RuntimeWarning,
            stacklevel=2,
        )
        return 'disagreement'

    if consensus and principle in CATEGORIES:
        return principle
    return 'disagreement'

# Tally one category per results file, visiting files in sorted path order
result_files = sorted(RESULTS_DIR.glob('*_results.json'))
counts = Counter(categorize_result(rp) for rp in result_files)

# Ensure all categories are present
for cat in CATEGORIES:
    counts.setdefault(cat, 0)

# Display as a simple table
table_lines = ['Category | Count', '---------|------']
table_lines.extend(f'{cat:38} | {counts[cat]}' for cat in CATEGORIES)
print('\n'.join(table_lines))

counts

Category | Count
---------|------
maximizing_floor                       | 0
maximizing_average                     | 1
maximizing_average_floor_constraint    | 29
maximizing_average_range_constraint    | 0
disagreement                           | 3


Counter({'maximizing_average_floor_constraint': 29,
         'disagreement': 3,
         'maximizing_average': 1,
         'maximizing_floor': 0,
         'maximizing_average_range_constraint': 0})

## 4. Statistical Tests — Fisher–Freeman–Halton and Cramér's V

Compare aggregated Hypothesis 1 outcomes (AI) against human outcomes as a 5×2 contingency table.

- Rows (categories): the four principles + disagreement.
- Columns (groups): Human vs AI.
- Fisher–Freeman–Halton exact test via R's `fisher.test()` when `Rscript` is available; falls back to Pearson's chi-square test otherwise.
- Cramér's V with bias correction and bootstrap CI.


In [12]:

# 1) Aggregate AI outcomes across all runs into 5 categories
ai_counts = [counts[cat] for cat in CATEGORIES]  # Counter yields 0 for absent keys
print('AI counts by category:', {cat: n for cat, n in zip(CATEGORIES, ai_counts)})

# 2) Specify Human counts (edit to match hypothesis_testing/hypothesis_1/image copy.png)
# Defaults below use Frohlich & Oppenheimer published values as a placeholder.
HUMAN_COUNTS = dict(
    maximizing_floor=5,
    maximizing_average=1,
    maximizing_average_floor_constraint=23,
    maximizing_average_range_constraint=2,
    disagreement=7,
)
human_counts = [HUMAN_COUNTS.get(cat, 0) for cat in CATEGORIES]
print('Human counts by category:', {cat: n for cat, n in zip(CATEGORIES, human_counts)})

# 3) Build 5×2 contingency table (rows=categories, cols=[Human, AI])
contingency = np.column_stack((human_counts, ai_counts))  # shape (5, 2)
contingency, CATEGORIES, ['Human','AI']


AI counts by category: {'maximizing_floor': 0, 'maximizing_average': 1, 'maximizing_average_floor_constraint': 29, 'maximizing_average_range_constraint': 0, 'disagreement': 3}
Human counts by category: {'maximizing_floor': 5, 'maximizing_average': 1, 'maximizing_average_floor_constraint': 23, 'maximizing_average_range_constraint': 2, 'disagreement': 7}


(array([[ 5,  0],
        [ 1,  1],
        [23, 29],
        [ 2,  0],
        [ 7,  3]]),
 ['maximizing_floor',
  'maximizing_average',
  'maximizing_average_floor_constraint',
  'maximizing_average_range_constraint',
  'disagreement'],
 ['Human', 'AI'])

In [14]:
def fisher_freeman_halton_pvalue_r(contingency: np.ndarray) -> float | None:
    """Run Fisher-Freeman-Halton test via R's fisher.test if available.
    Returns p-value or None if Rscript not found or fails.
    """
    if shutil.which('Rscript') is None:
        return None
    r_matrix = ','.join(str(int(x)) for x in contingency.flatten(order='C'))
    nrow = contingency.shape[0]
    r_code = f"""
m <- matrix(c({r_matrix}), nrow={nrow}, byrow=TRUE);
f <- tryCatch(fisher.test(m), error=function(e) NA);
if (is.list(f)) {{ cat(f$p.value) }} else {{ cat('NA') }}
"""
    import subprocess
    try:
        out = subprocess.check_output(['Rscript', '-e', r_code], stderr=subprocess.STDOUT, text=True)
        out = out.strip()
        return float(out) if out and out != 'NA' else None
    except Exception:
        return None

p_ffh = fisher_freeman_halton_pvalue_r(contingency)

if p_ffh is not None:
    print(f'Fisher–Freeman–Halton exact test p-value: {p_ffh:.6f}')
else:
    # The exact test returned None (Rscript missing or the R call failed).
    # Formatting None with ':.6f' would raise TypeError, so fall back to
    # Pearson's chi-square test, as announced in the section introduction.
    # All-zero rows/columns are dropped first: they give zero expected
    # frequencies, which chi2_contingency rejects.
    _nonempty = contingency[contingency.sum(axis=1) > 0][:, contingency.sum(axis=0) > 0]
    chi2_stat, p_chi2, chi2_dof, _ = chi2_contingency(_nonempty, correction=False)
    print(
        'Fisher–Freeman–Halton exact test unavailable (Rscript missing or failed); '
        f'chi-square fallback: chi2={chi2_stat:.4f}, dof={chi2_dof}, p-value={p_chi2:.6f}'
    )


Fisher–Freeman–Halton exact test p-value: 0.033087


In [19]:
from __future__ import annotations

import numpy as np
from scipy.stats import chi2_contingency


def _prune_empty_rows_cols(contingency: np.ndarray) -> np.ndarray:
    """
    Remove rows and columns whose sum is not positive.

    The input is first converted to a float array. If every row and every
    column has a positive sum, that float array is returned as is; otherwise
    a copy restricted to the rows and columns with positive sums is returned.

    Raises:
        ValueError: if the input is not two-dimensional.
    """
    table = np.asarray(contingency, dtype=float)
    if table.ndim != 2:
        raise ValueError("contingency must be a 2D array")

    keep_rows = table.sum(axis=1) > 0
    keep_cols = table.sum(axis=0) > 0
    needs_pruning = not (keep_rows.all() and keep_cols.all())
    return table[keep_rows][:, keep_cols] if needs_pruning else table


def cramers_v(contingency: np.ndarray) -> float:
    """
    Uncorrected Cramér's V: sqrt((chi2 / n) / min(r - 1, c - 1)).

    chi2 is Pearson's statistic computed without Yates' continuity
    correction, on the table with empty rows/columns removed. Returns 0.0
    when the table has no observations or fewer than two non-empty rows or
    columns.
    """
    table = _prune_empty_rows_cols(contingency)
    total = table.sum()
    if total <= 0:
        return 0.0

    n_rows, n_cols = table.shape
    smaller_df = min(n_rows - 1, n_cols - 1)
    if smaller_df <= 0:
        return 0.0

    statistic, *_ = chi2_contingency(table, correction=False)
    phi2 = statistic / total
    return float(np.sqrt(phi2 / smaller_df))


def bias_corrected_cramers_v(contingency: np.ndarray) -> float:
    """
    Bias-corrected Cramér's V following Bergsma (2013).

    With chi2 the Pearson statistic (no Yates correction) of the table after
    removing empty rows/columns, n its total and r x c its shape:

      phi2_corr = max(0, chi2/n - (r-1)(c-1)/(n-1))
      r_corr    = r - (r-1)^2/(n-1)
      c_corr    = c - (c-1)^2/(n-1)
      V_tilde   = sqrt(phi2_corr / min(r_corr - 1, c_corr - 1))

    Returns 0.0 when n <= 1, when fewer than two non-empty rows or columns
    remain, or when the corrected denominator is not positive.
    """
    table = _prune_empty_rows_cols(contingency)
    total = table.sum()
    if total <= 1:
        return 0.0

    n_rows, n_cols = table.shape
    row_df, col_df = n_rows - 1, n_cols - 1
    if min(row_df, col_df) <= 0:
        return 0.0

    statistic, *_ = chi2_contingency(table, correction=False)
    shrink = total - 1

    # Subtract the value phi^2 is expected to take under independence
    phi2_adj = max(0.0, statistic / total - (row_df * col_df) / shrink)

    # Shrink the table dimensions by the matching amount
    rows_adj = n_rows - row_df ** 2 / shrink
    cols_adj = n_cols - col_df ** 2 / shrink
    scale = min(rows_adj - 1.0, cols_adj - 1.0)
    if scale <= 0.0:
        return 0.0

    return float(np.sqrt(phi2_adj / scale))


def bootstrap_cramers_v(
    contingency: np.ndarray,
    n_bootstrap: int = 5000,
    confidence_level: float = 0.95,
    bias_corrected: bool = True,
    seed: int | None = 123,
) -> tuple[np.ndarray, float, float]:
    """
    Nonparametric bootstrap for Cramér's V by resampling counts from the empirical
    cell distribution via Multinomial(n, p). Uses percentile CI.

    Returns:
        (vs, lo, hi)
        - vs: array of bootstrap V values
        - lo, hi: lower/upper bounds of the (1 - alpha) percentile CI
    """
    tab = np.asarray(contingency, dtype=float)
    n = int(tab.sum())
    if n <= 0:
        return np.array([]), 0.0, 0.0
    if not (0.0 < confidence_level < 1.0):
        raise ValueError("confidence_level must be in (0, 1)")

    p = (tab / n).ravel()
    idxs = np.arange(p.size)

    rng = np.random.default_rng(seed)
    vs = np.empty(n_bootstrap, dtype=float)

    for b in range(n_bootstrap):
        # Draw a bootstrap table ~ Multinomial(n, p)
        counts = rng.multinomial(n, p).reshape(tab.shape)
        v = bias_corrected_cramers_v(counts) if bias_corrected else cramers_v(counts)
        vs[b] = v

    alpha = 1.0 - confidence_level
    lo, hi = np.quantile(vs, [alpha / 2.0, 1.0 - alpha / 2.0])
    return vs, float(lo), float(hi)




In [20]:
# Point estimate of the bias-corrected effect size for the contingency table
cramers_v_value = bias_corrected_cramers_v(contingency)

print(f"Bias-corrected Cramér's V: {cramers_v_value:.6f}")


Bias-corrected Cramér's V: 0.265347
