# Hypothesis 6 — Main Notebook

This notebook prepares three sets of 34 experiment configurations (English, Spanish, Mandarin),
runs them selectively in parallel, and compares outcome distributions across languages.

Requirements implemented:
- 34 configs per language (total 102) with agent temperatures drawn from U(0, 1.5) per config.
- Separate subfolders per language under `configs/`, `terminal_outputs/`, and `results/`.
- Statistical analysis comparing the outcomes across the three language sets (5×3 table).

In [9]:
# Imports
import sys, os
from pathlib import Path

# Ensure repo root on sys.path (for local package imports)
def _add_repo_root_to_sys_path():
    """Find the repository root and put it at the front of ``sys.path``.

    The search starts at the resolved current working directory and walks up
    through its parents. The first directory containing both a ``main.py``
    entry and a ``hypothesis_testing`` directory is treated as the root.

    Returns:
        The detected root directory, or the resolved working directory when
        no ancestor matches (in which case ``sys.path`` is left untouched).
    """
    start_dir = Path.cwd().resolve()
    for candidate in (start_dir, *start_dir.parents):
        looks_like_root = (candidate / 'main.py').exists() and (candidate / 'hypothesis_testing').is_dir()
        if not looks_like_root:
            continue
        root_str = str(candidate)
        # Avoid stacking duplicate entries when the cell is re-run.
        if root_str not in sys.path:
            sys.path.insert(0, root_str)
        return candidate
    return start_dir
_REPO_ROOT = _add_repo_root_to_sys_path()

import json
import random
import shutil
import warnings
from collections import Counter

import numpy as np
import yaml

from hypothesis_testing.utils_hypothesis_testing.runner import (
    list_config_files,
    select_configs,
    run_configs_in_parallel,
)


In [10]:
# Base paths and per-language subfolders
# Base paths and per-language subfolders.
# Everything for this hypothesis lives under <repo root>/hypothesis_testing/hypothesis_6.
BASE_DIR = _REPO_ROOT / 'hypothesis_testing' / 'hypothesis_6'
CONFIGS_BASE = BASE_DIR / 'configs'
LOGS_BASE = BASE_DIR / 'terminal_outputs'
RESULTS_BASE = BASE_DIR / 'results'
TRANSCRIPTS_BASE = BASE_DIR / 'transcripts'

# Maps the subfolder / file-name key of each language set to the language
# name written into the generated configs' 'language' field.
LANG_SETS = {
    'english': 'English',
    'spanish': 'Spanish',
    'mandarin': 'Mandarin',
}

# Ensure subfolders exist (does not create config files).
# exist_ok=True keeps the cell idempotent on re-run.
for key in LANG_SETS.keys():
    (CONFIGS_BASE / key).mkdir(parents=True, exist_ok=True)
    (LOGS_BASE / key).mkdir(parents=True, exist_ok=True)
    (RESULTS_BASE / key).mkdir(parents=True, exist_ok=True)
    (TRANSCRIPTS_BASE / key).mkdir(parents=True, exist_ok=True)

# Echo the resolved locations as the cell output for a quick sanity check.
CONFIGS_BASE, LOGS_BASE, RESULTS_BASE, TRANSCRIPTS_BASE, LANG_SETS


(PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_6/configs'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_6/terminal_outputs'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_6/results'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_6/transcripts'),
 {'english': 'English', 'spanish': 'Spanish', 'mandarin': 'Mandarin'})

## 1) Config Generation 

Generates 34 YAML configurations for each language set.
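- All 5 agents share a per-config temperature drawn from U(0, 1.5).
- Condition *i* uses the same temperature, seed, and agent models in all three languages, so the language is the only field that differs between aligned configs.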


In [None]:
# Income class probabilities (must sum to 1.0).
# Written unchanged into every generated config under 'income_class_probabilities'.
INCOME_CLASS_PROBS = {
    'high': 0.05,
    'medium_high': 0.10,
    'medium': 0.50,
    'medium_low': 0.25,
    'low': 0.10,
}

# Placeholder model list for participant agents — adjust as needed.
# generate_aligned_configs samples each agent's model from this list
# (with replacement), so one config may repeat a model.
MODEL_LIST = [
    "gemini-2.5-pro",
    "gemini-2.5-flash",
    "gemini-2.5-flash-lite",
]

def make_agents_with_models(temp: float, models: list[str]) -> list[dict]:
    """Build the five participant-agent entries of one experiment config.

    Args:
        temp: Sampling temperature shared by all five agents; coerced to float.
        models: Model name per agent; position ``i`` is assigned to ``Agent_i``.
            Must hold at least five entries, otherwise ``IndexError`` is raised.

    Returns:
        A list of five agent dicts named ``Agent_0`` .. ``Agent_4``.
    """
    return [
        {
            'name': f'Agent_{slot}',
            'personality': 'You are an American college student',
            'model': models[slot],
            'temperature': float(temp),
            'memory_character_limit': 25000,
            'reasoning_enabled': True,
        }
        for slot in range(5)  # 5 participant agents
    ]

def build_config(lang: str, temp: float, seed_val: int, models: list[str]) -> dict:
    """Assemble the full config mapping for one language/condition pair.

    Args:
        lang: Language name stored under ``'language'``.
        temp: Temperature shared by all participant agents.
        seed_val: Seed stored under ``'seed'``; coerced to int.
        models: Per-agent model names, forwarded to ``make_agents_with_models``.

    Returns:
        A dict whose key order is the order the fields are written to YAML
        (the generator dumps with ``sort_keys=False``).
    """
    config: dict = {'language': lang, 'seed': int(seed_val)}
    config['agents'] = make_agents_with_models(temp, models)
    # Fixed experiment settings shared by every generated config.
    config['utility_agent_model'] = "gemini-2.5-flash-lite"
    config['utility_agent_temperature'] = 0.0
    config['phase2_rounds'] = 10
    config['distribution_range_phase2'] = [2, 6]
    config['income_class_probabilities'] = INCOME_CLASS_PROBS
    config['original_values_mode'] = {'enabled': True}
    return config

# Optional: set a global seed for reproducible generation (adjust or comment out).
# generate_aligned_configs draws only from the stdlib `random` module, so
# random.seed() is what pins the generated temperatures, seeds, and models;
# NumPy's legacy global RNG is seeded as well.
# Each call to generate_aligned_configs consumes RNG state, so re-run this cell
# immediately before generating if the same configs must be reproduced.
GLOBAL_SEED = 10000
random.seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)

def generate_aligned_configs(n: int = 34) -> dict[str, list[Path]]:
    """Write ``n`` aligned YAML configs for every language in ``LANG_SETS``.

    For each condition the temperature, seed, and five agent models are drawn
    once from the global ``random`` state and reused for all languages, so the
    aligned configs differ only in their ``language`` field.

    Args:
        n: Number of conditions; files are numbered 1..n.

    Returns:
        Mapping from language key to the config paths written, in condition order.
    """
    written: dict[str, list[Path]] = {lang_key: [] for lang_key in LANG_SETS}
    for condition in range(1, n + 1):
        # Draw order (temperature, seed, models) determines what a fixed global
        # seed reproduces — keep it stable.
        shared_temp = random.uniform(0.0, 1.5)
        shared_seed = random.randint(0, 2**31 - 1)
        shared_models = [random.choice(MODEL_LIST) for _ in range(5)]
        for lang_key, lang_name in LANG_SETS.items():
            payload = build_config(
                lang=lang_name,
                temp=shared_temp,
                seed_val=shared_seed,
                models=shared_models,
            )
            target_dir = CONFIGS_BASE / lang_key
            target_dir.mkdir(parents=True, exist_ok=True)
            target = target_dir / f'hypothesis_6_{lang_key}_condition_{condition}_config.yaml'
            with open(target, 'w') as handle:
                # sort_keys=False keeps the field order chosen in build_config.
                yaml.safe_dump(payload, handle, sort_keys=False)
            written[lang_key].append(target)
    return written

# To generate aligned configs for all languages at once (writes 102 files total):
#files_by_lang = generate_aligned_configs(n=34)
#{k: len(v) for k, v in files_by_lang.items()}


{'english': 34, 'spanish': 34, 'mandarin': 34}

## 2) Run Configs (Parallel per Language)

Select subsets and run with per-language logs/results directories.

In [3]:
def run_language_set(lang_key: str, include_indices=None, include_names=None, concurrency: int = 4, timeout_sec: int | None = None):
    """Run a selection of one language's configs and report how many succeeded.

    Args:
        lang_key: Language subfolder key (e.g. ``'english'``) under the
            configs, logs, and results base directories.
        include_indices: Forwarded to ``select_configs``.
        include_names: Forwarded to ``select_configs``.
        concurrency: Forwarded to ``run_configs_in_parallel``.
        timeout_sec: Forwarded to ``run_configs_in_parallel``.

    Returns:
        Whatever ``run_configs_in_parallel`` returns; each entry is read with
        ``.get('ok')`` to count successes.
    """
    available = list_config_files(CONFIGS_BASE / lang_key)
    chosen = select_configs(
        available,
        include_indices=include_indices,
        include_names=include_names,
    )
    print(f'[{lang_key}] Found {len(available)} configs; selected {len(chosen)}')

    outcomes = run_configs_in_parallel(
        chosen,
        concurrency=concurrency,
        logs_dir=LOGS_BASE / lang_key,
        results_dir=RESULTS_BASE / lang_key,
        timeout_sec=timeout_sec,
    )
    ok = len([outcome for outcome in outcomes if outcome.get('ok')])
    print(f'[{lang_key}] Completed: {ok}/{len(outcomes)} OK')
    return outcomes




In [None]:
# Run all conditions from each language sequentially by language
# Starting with English, then Spanish, then Mandarin
# Each language runs all its 34 conditions with concurrency=2
# NOTE: this cell is long-running (the recorded English and Spanish passes
# took roughly 8 and 11.4 hours respectively).
all_run_results = []
import time

for lang_key in ['english', 'spanish', 'mandarin']:
    print(f'\n=== Running ALL {lang_key.upper()} conditions (34 configs) ===')
    start_time = time.time()
    run_results = run_language_set(
        lang_key,
        include_indices=list(range(1, 35)),  # All 34 conditions (1-34)
        concurrency=2,        # Run 2 configs in parallel per language
        timeout_sec=None
    )
    duration = time.time() - start_time
    print(f'Completed all {lang_key.upper()} conditions in {duration:.1f}s')
    all_run_results.extend(run_results)

# Overall summary across the three languages
ok = sum(1 for r in all_run_results if r.get('ok'))
total_configs = len(all_run_results)
print(f'\n=== OVERALL SUMMARY ===')
print(f'Completed: {ok}/{total_configs} OK from {total_configs} configs (34 per language × 3 languages)')
print(f'Expected total: {34 * 3} configs')
#all_run_results


=== Running ALL ENGLISH conditions (34 configs) ===
[english] Found 34 configs; selected 34
[english] Completed: 31/34 OK
Completed all ENGLISH conditions in 28887.2s

=== Running ALL SPANISH conditions (34 configs) ===
[spanish] Found 34 configs; selected 34
[spanish] Completed: 34/34 OK
Completed all SPANISH conditions in 41132.2s

=== Running ALL MANDARIN conditions (34 configs) ===
[mandarin] Found 34 configs; selected 34


In [None]:
# Partial re-run: only Mandarin conditions 14-34, with 4 configs in parallel.
# NOTE(review): presumably this resumes an interrupted full run — confirm that
# conditions 1-13 already have results before relying on the analysis below.
run_results = run_language_set(
    'mandarin',
    include_indices=list(range(14,35)),  # Conditions 14-34 (range end is exclusive)
    concurrency=4,
    timeout_sec=None
)


# Display results: one success/failure line per config
for result in run_results:
    if result.get('ok'):
        print(f"\n✓ Success: {result['config']}")
        print(f"  Result: {result.get('result_file')}")
    else:
        print(f"\n✗ Failed: {result['config']}")
        print(f"  Error: {result.get('error', 'Unknown error')}")

run_results

## 3) Analysis — Compare Outcomes Across Languages

Build a 5×3 contingency table (rows=principle/disagreement categories; columns=English/Spanish/Mandarin)
and run Fisher–Freeman–Halton exact test via R when available.
Also compute Cramér's V with optional bootstrap CI.

In [11]:
# Outcome categories, in the row order of the contingency table.
CATEGORIES = [
    'maximizing_floor',
    'maximizing_average',
    'maximizing_average_floor_constraint',
    'maximizing_average_range_constraint',
    'disagreement',
]

def categorize_result(result_path: Path) -> str:
    """Map one results JSON file to an outcome category.

    A run counts as its ``consensus_principle`` only when
    ``general_information.consensus_reached`` is truthy and the principle is
    one of ``CATEGORIES``; every other case is ``'disagreement'``.

    Args:
        result_path: Path to a ``*_results.json`` file.

    Returns:
        One of the strings in ``CATEGORIES``.

    Warns:
        RuntimeWarning: when the file cannot be read or parsed. Such a file is
        still returned as ``'disagreement'`` (the historical behaviour), but
        the warning makes it visible that the count includes a run with no
        usable result instead of a genuine disagreement.
    """
    try:
        # Result files may contain Spanish/Mandarin text, so decode as UTF-8
        # rather than the platform's locale encoding.
        with open(result_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        warnings.warn(
            f'Could not read result file {result_path}: {exc}; counting it as disagreement',
            RuntimeWarning,
            stacklevel=2,
        )
        return 'disagreement'

    # Tolerate unexpected payload shapes (non-dict root, null section).
    gi = data.get('general_information') if isinstance(data, dict) else None
    if not isinstance(gi, dict):
        return 'disagreement'

    principle = gi.get('consensus_principle')
    if gi.get('consensus_reached', False) and principle in CATEGORIES:
        return principle
    return 'disagreement'

def count_by_language() -> dict[str, Counter]:
    """Tally outcome categories per language from the saved result files.

    For every key in ``LANG_SETS`` the ``*_results.json`` files in that
    language's results folder are categorized in sorted filename order.

    Returns:
        Mapping from language key to a ``Counter`` that holds an entry for
        every category in ``CATEGORIES`` (zero when the category never occurred).
    """
    tallies: dict[str, Counter] = {}
    for lang_key in LANG_SETS:
        result_paths = sorted((RESULTS_BASE / lang_key).glob('*_results.json'))
        tally = Counter(categorize_result(path) for path in result_paths)
        # Add explicit zero entries so printed counts list every category.
        for category in CATEGORIES:
            if category not in tally:
                tally[category] = 0
        tallies[lang_key] = tally
    return tallies

# Tally the outcomes currently on disk and print them per language.
lang_counts = count_by_language()
for k, counts in lang_counts.items():
    print(f'{k.capitalize()} counts:', dict(counts))

# Build contingency table: rows=categories, cols=[English, Spanish, Mandarin]
# Resulting shape is (len(CATEGORIES), 3); rows follow CATEGORIES order.
col_order = ['english', 'spanish', 'mandarin']
contingency = np.vstack([[lang_counts[col][cat] for col in col_order] for cat in CATEGORIES])
# Echo the table together with its row and column labels.
contingency, CATEGORIES, col_order


English counts: {'disagreement': 4, 'maximizing_average_floor_constraint': 30, 'maximizing_floor': 0, 'maximizing_average': 0, 'maximizing_average_range_constraint': 0}
Spanish counts: {'maximizing_average_range_constraint': 2, 'disagreement': 15, 'maximizing_average_floor_constraint': 17, 'maximizing_floor': 0, 'maximizing_average': 0}
Mandarin counts: {'maximizing_average_floor_constraint': 27, 'disagreement': 6, 'maximizing_floor': 1, 'maximizing_average': 0, 'maximizing_average_range_constraint': 0}


(array([[ 0,  0,  1],
        [ 0,  0,  0],
        [30, 17, 27],
        [ 0,  2,  0],
        [ 4, 15,  6]]),
 ['maximizing_floor',
  'maximizing_average',
  'maximizing_average_floor_constraint',
  'maximizing_average_range_constraint',
  'disagreement'],
 ['english', 'spanish', 'mandarin'])

In [12]:
def fisher_freeman_halton_pvalue_r(contingency: np.ndarray) -> float | None:
    """Run Fisher–Freeman–Halton test via R's fisher.test if available.
    Returns p-value or None if Rscript not found or fails.
    """
    if shutil.which('Rscript') is None:
        return None
    r_matrix = ','.join(str(int(x)) for x in contingency.flatten(order='C'))
    nrow, ncol = contingency.shape
    r_code = f"""m <- matrix(c({r_matrix}), nrow={nrow}, ncol={ncol}, byrow=TRUE);
f <- tryCatch(fisher.test(m), error=function(e) NA);
if (is.list(f)) {{ cat(f$p.value) }} else {{ cat('NA') }}
"""
    import subprocess
    try:
        out = subprocess.check_output(['Rscript', '-e', r_code], stderr=subprocess.STDOUT, text=True, timeout=30)
        out = out.strip()
        return float(out) if out and out != 'NA' else None
    except subprocess.TimeoutExpired:
        return None
    except Exception:
        return None

# Omnibus test on the full 5×3 table.
# None is returned not only when Rscript is missing but also when the R call
# fails, times out, or yields NA — so the message below can understate the cause.
p_ffh = fisher_freeman_halton_pvalue_r(contingency)
if p_ffh is None:
    print('R not available; skipping Fisher–Freeman–Halton exact test')
else:
    print(f'Fisher–Freeman–Halton exact test p-value: {p_ffh:.6f}')


Fisher–Freeman–Halton exact test p-value: 0.002187


In [13]:
from hypothesis_testing.utils_hypothesis_testing import (
    bias_corrected_cramers_v,
    bootstrap_cramers_v,
    cramers_v,
)

# Effect size summary for the full 5×3 table.
# Skipped entirely when the table holds no observations.
if contingency.sum() > 0:
    v_std = cramers_v(contingency, correction=False)
    v_corr = bias_corrected_cramers_v(contingency, correction=False)
    print(f"Cramér's V (standard): {v_std:.4f}")
    print(f"Cramér's V (bias-corrected): {v_corr:.4f}")
    # Bootstrap interval for the bias-corrected statistic; the fixed seed keeps
    # the reported interval reproducible across runs.
    boot_vs, ci_lo, ci_hi = bootstrap_cramers_v(
        contingency,
        n_bootstrap=2000,
        confidence_level=0.95,
        bias_corrected=True,
        correction=False,
        seed=123,
    )
    print(f"95% CI for bias-corrected V: [{ci_lo:.4f}, {ci_hi:.4f}]")
else:
    print('Insufficient data for effect size computation')


Cramér's V (standard): 0.2970
Cramér's V (bias-corrected): 0.2443
95% CI for bias-corrected V: [0.1176, 0.4076]


## Pairwise Comparisons


### English vs Spanish


In [6]:
# Pairwise comparison: restrict the contingency table to two language columns.
# NOTE: this cell rebinds p_ffh and v_corr, overwriting the omnibus values.
pair = ['english', 'spanish']
sub_contingency = np.vstack([[lang_counts[col][cat] for col in pair] for cat in CATEGORIES])
print(f'Sub-contingency for {pair[0]} vs {pair[1]}:')
print(sub_contingency)
# The p-value is reported as-is; no multiple-comparison adjustment is applied
# across the three pairwise tests.
p_ffh = fisher_freeman_halton_pvalue_r(sub_contingency)
if p_ffh is None:
    print('R not available; skipping Fisher–Freeman–Halton exact test')
else:
    print(f'Fisher–Freeman–Halton exact test p-value: {p_ffh:.6f}')
# NOTE(review): the omnibus cell passes correction=False explicitly, while this
# call relies on the helper's default — confirm the two settings agree.
v_corr = bias_corrected_cramers_v(sub_contingency)
print(f"Cramér's V (bias-corrected): {v_corr:.4f}")


Sub-contingency for english vs spanish:
[[ 0  0]
 [ 0  0]
 [30 17]
 [ 0  2]
 [ 4 15]]
Fisher–Freeman–Halton exact test p-value: 0.001617
Cramér's V (bias-corrected): 0.3851


### English vs Mandarin


In [7]:
# Pairwise comparison: restrict the contingency table to two language columns.
# NOTE: this cell rebinds p_ffh and v_corr, overwriting earlier values.
pair = ['english', 'mandarin']
sub_contingency = np.vstack([[lang_counts[col][cat] for col in pair] for cat in CATEGORIES])
print(f'Sub-contingency for {pair[0]} vs {pair[1]}:')
print(sub_contingency)
# The p-value is reported as-is; no multiple-comparison adjustment is applied
# across the three pairwise tests.
p_ffh = fisher_freeman_halton_pvalue_r(sub_contingency)
if p_ffh is None:
    print('R not available; skipping Fisher–Freeman–Halton exact test')
else:
    print(f'Fisher–Freeman–Halton exact test p-value: {p_ffh:.6f}')
# NOTE(review): the omnibus cell passes correction=False explicitly, while this
# call relies on the helper's default — confirm the two settings agree.
v_corr = bias_corrected_cramers_v(sub_contingency)
print(f"Cramér's V (bias-corrected): {v_corr:.4f}")


Sub-contingency for english vs mandarin:
[[ 0  1]
 [ 0  0]
 [30 27]
 [ 0  0]
 [ 4  6]]
Fisher–Freeman–Halton exact test p-value: 0.511789
Cramér's V (bias-corrected): 0.0000


### Spanish vs Mandarin


In [8]:
# Pairwise comparison: restrict the contingency table to two language columns.
# NOTE: this cell rebinds p_ffh and v_corr, overwriting earlier values.
pair = ['spanish', 'mandarin']
sub_contingency = np.vstack([[lang_counts[col][cat] for col in pair] for cat in CATEGORIES])
print(f'Sub-contingency for {pair[0]} vs {pair[1]}:')
print(sub_contingency)
# The p-value is reported as-is; no multiple-comparison adjustment is applied
# across the three pairwise tests.
p_ffh = fisher_freeman_halton_pvalue_r(sub_contingency)
if p_ffh is None:
    print('R not available; skipping Fisher–Freeman–Halton exact test')
else:
    print(f'Fisher–Freeman–Halton exact test p-value: {p_ffh:.6f}')
# NOTE(review): the omnibus cell passes correction=False explicitly, while this
# call relies on the helper's default — confirm the two settings agree.
v_corr = bias_corrected_cramers_v(sub_contingency)
print(f"Cramér's V (bias-corrected): {v_corr:.4f}")


Sub-contingency for spanish vs mandarin:
[[ 0  1]
 [ 0  0]
 [17 27]
 [ 2  0]
 [15  6]]
Fisher–Freeman–Halton exact test p-value: 0.012048
Cramér's V (bias-corrected): 0.3014


## Floor Constraint Amount Analysis

In [15]:
import pandas as pd
from IPython.display import display

def extract_floor_constraints() -> pd.DataFrame:
    rows: list[dict[str, object]] = []
    for lang_key, lang_label in LANG_SETS.items():
        result_files = sorted((RESULTS_BASE / lang_key).glob('*_results.json'))
        for result_path in result_files:
            with open(result_path, 'r') as fp:
                data = json.load(fp)
            vote_rounds = data.get('voting_history', {}).get('vote_rounds', [])
            agreed_constraint = None
            for vote_round in reversed(vote_rounds):
                if vote_round.get('agreed_constraint') is not None:
                    agreed_constraint = vote_round['agreed_constraint']
                    break
            if agreed_constraint is not None:
                rows.append({
                    'language_key': lang_key,
                    'language': lang_label,
                    'result_file': result_path.name,
                    'floor_constraint': agreed_constraint,
                })
    return pd.DataFrame(rows)

# Collect one row per run that recorded an agreed constraint, then show the table.
floor_constraints_df = extract_floor_constraints()
display(floor_constraints_df)

if not floor_constraints_df.empty:
    # Per-language summary; pandas' 'std' is the sample standard deviation
    # (ddof=1), so a language with a single row yields NaN.
    per_language_stats = floor_constraints_df.groupby('language')['floor_constraint'].agg(
        average='mean',
        minimum='min',
        maximum='max',
        std_dev='std',
    )
    # Same statistics pooled over all languages, renamed to match the
    # per-language column labels.
    overall_stats = floor_constraints_df['floor_constraint'].agg(['mean', 'min', 'max', 'std']).rename({
        'mean': 'average',
        'min': 'minimum',
        'max': 'maximum',
        'std': 'std_dev',
    })
    # Wrap the pooled Series in a one-row frame so it renders like the table above.
    overall_stats_df = pd.DataFrame([overall_stats], index=['all_languages'])
    print('\nPer-language floor constraint statistics:')
    display(per_language_stats)
    print('\nOverall floor constraint statistics:')
    display(overall_stats_df)
else:
    print('No floor constraint values found across runs.')


Unnamed: 0,language_key,language,result_file,floor_constraint
0,english,English,hypothesis_6_english_condition_11_config_resul...,15000
1,english,English,hypothesis_6_english_condition_12_config_resul...,13000
2,english,English,hypothesis_6_english_condition_13_config_resul...,11000
3,english,English,hypothesis_6_english_condition_14_config_resul...,20000
4,english,English,hypothesis_6_english_condition_16_config_resul...,20000
...,...,...,...,...
71,mandarin,Mandarin,hypothesis_6_mandarin_condition_5_config_resul...,12000
72,mandarin,Mandarin,hypothesis_6_mandarin_condition_6_config_resul...,12000
73,mandarin,Mandarin,hypothesis_6_mandarin_condition_7_config_resul...,12000
74,mandarin,Mandarin,hypothesis_6_mandarin_condition_8_config_resul...,13000



Per-language floor constraint statistics:


Unnamed: 0_level_0,average,minimum,maximum,std_dev
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
English,14016.666667,1000,25000,4652.554607
Mandarin,13074.074074,10000,40000,5599.972019
Spanish,16894.736842,9000,48000,10027.15611



Overall floor constraint statistics:


Unnamed: 0,average,minimum,maximum,std_dev
all_languages,14401.315789,1000.0,48000.0,6755.74804
