# Hypothesis 2 — Main Notebook

Hypothesis: The underlying LLM of an MAAI statistically significantly affects its fairness judgments; specifically, American and Chinese LLMs differ in their judgments.

This notebook prepares two groups of aligned experiment configurations (American, Chinese),
runs them selectively in parallel, and compares the outcome distributions across groups (5 categories).

- 33 configs per group (default), total 66.
- Per-config: a temperature shared by both groups (0.0 for the first 11 configs, then drawn from U(0, 1.0)) and agent models sampled per group. Note that the generation code does not fix a random seed by default.
- Language is English for all runs.
- Utility agent is `gemini-2.5-flash`.
- Voting detection mode is set to "complex" for all runs.
- Exact test: Fisher–Freeman–Halton via R (if available). No Chi-square fallback.
- Effect size: Cramér's V (bias-corrected, as in Hypothesis 1).


# 1. Imports

In [1]:
import pandas as pd
import numpy as np
import sys, os
from pathlib import Path

# Ensure repo root on sys.path (for local package imports)
def _add_repo_root_to_sys_path():
    here = Path.cwd().resolve()
    for p in [here] + list(here.parents):
        if (p / 'main.py').exists() and (p / 'hypothesis_testing').is_dir():
            if str(p) not in sys.path:
                sys.path.insert(0, str(p))
            return p
    return here
_REPO_ROOT = _add_repo_root_to_sys_path()

import json
import random
import shutil
import yaml
from collections import Counter
from hypothesis_testing.utils_hypothesis_testing.runner import (
    list_config_files,
    select_configs,
    run_configs_in_parallel,
)


# 2. Model Selection Process

## Data Import & Cleaning

In [2]:
# Read the CSV file into a pandas dataframe
# Data scraped from https://artificialanalysis.ai/leaderboards/models on 2025-10-08
# NOTE: the path is relative, so it is resolved against the kernel's current working directory.
df = pd.read_csv("model_overview_artificial_analysis.csv")

In [3]:
# Remove the '€' symbol and surrounding spaces, replace the decimal comma with a dot, and convert to float.
# Casting to str first keeps the cell re-runnable: once the column is already float, the `.str`
# accessor would otherwise raise AttributeError on a second execution.
df['Price (Blended EUR/1M Tokens)'] = (
    df['Price (Blended EUR/1M Tokens)']
      .astype(str)
      .str.replace('€', '', regex=False)
      .str.strip()
      .str.replace(',', '.', regex=False)
      .astype(float)
)


In [4]:

# Convert 'Context Window' from text (a number with an optional k/m suffix) to an absolute count.
s = df['Context Window'].astype(str).str.strip().str.lower()
# Capture group 0: the number (decimal comma or dot allowed); capture group 1: optional 'k' or 'm' suffix.
m = s.str.extract(r'^\s*(\d+(?:[.,]\d+)?)\s*([km]?)\s*$')
# Normalise the decimal comma; anything that cannot be parsed becomes NaN (errors='coerce').
num = pd.to_numeric(m[0].str.replace(',', '.', regex=False), errors='coerce')
# Suffix -> multiplier; suffixes missing from the mapping (including the empty one) fall back to 1.
mult = m[1].map({'k': 1_000, 'm': 1_000_000}).fillna(1)
df['Context Window'] = num * mult

In [5]:
# Convert Latency First Answer Chunk (s) to float (replace the decimal comma with a dot).
# The column is cast to str first, so the cell also works when the values are already numeric.
df['Latency First Answer Chunk (s)'] = (
    df['Latency First Answer Chunk (s)']
      .astype(str)
      .str.replace(',', '.', regex=False)
      .astype(float)
)

In [6]:
# Filter for models with a blended price strictly below 0.5 EUR per 1M tokens to reduce cost
df_cheap = df[df['Price (Blended EUR/1M Tokens)'] < 0.5]

In [7]:
# Sort by Intelligence Index (Artificial Analysis Index), highest first.
# The positional slice used later for the Chinese selection depends on this ordering.
df_cheap= df_cheap.sort_values(by=["Intelligence Index"], ascending=False)

## Creating Final Model Selection

In [8]:
# Selecting the Chinese models for the analysis

# Filtering by Chinese models
df_chinese = df_cheap[df_cheap["Country"] == "China"]

# Removing models which are not properly accessible through the API (kept only if median speed > 0)
df_chinese = df_chinese[df_chinese["Speed (Median Tokens/s)"]>0]

# Selecting the 3 best models by Artificial Analysis Intelligence Index
# (positional slice; relies on df_cheap already being sorted in descending order)
df_chinese_select = df_chinese[:3]



# List of Chinese models (leaderboard display names)
chinese_models = df_chinese_select["Model"].tolist()

df_chinese_select.head(10)

Unnamed: 0,Model,Context Window,Creator,Country,Intelligence Index,Price (Blended EUR/1M Tokens),Speed (Median Tokens/s),Latency First Answer Chunk (s)
15,DeepSeek V3.2 Exp,128000,DeepSeek,China,57,0.28,25,1.35
28,GLM-4.5-Air,128000,Z AI,China,49,0.36,210,0.66
53,Qwen3 Omni 30B A3B,66000,Alibaba,China,40,0.37,88,0.97


In [10]:
# Selecting the American Models
# Llama Nemotron Super 49B v1.5 and Gemini 2.5 Flash-Lite (Sep) (reasoning) are excluded
# since they are not accessible through the Open Router API.
UNAVAILABLE_AMERICAN_MODELS = [
    "Llama Nemotron Super 49B v1.5",
    "Gemini 2.5 Flash-Lite (Sep) (reasoning)",
]

df_american = df_cheap[df_cheap["Country"] == "USA"]

# Filter with a mask on df_american itself instead of dropping index labels looked up in `df`:
# dropping labels that are not present in df_american (e.g. already removed by the price filter)
# raises KeyError, whereas the mask simply has nothing to remove.
df_american = df_american[~df_american["Model"].isin(UNAVAILABLE_AMERICAN_MODELS)]

# Keep only models with a positive median speed
df_american = df_american[df_american["Speed (Median Tokens/s)"] > 0]

# Exclude GPT 5 models since they do not support temperature
df_american = df_american[~df_american["Model"].str.contains(r"gpt[-\s]*5", case=False, na=False)]


In [13]:
# Select top 3 Chinese LLMs by Intelligence Index
top_3_chinese = df_chinese_select.head(3)
print("Top 3 Chinese LLMs:")
print(top_3_chinese[['Model', 'Intelligence Index']])

# Get their scores
top_scores = top_3_chinese['Intelligence Index'].values

# For each score, find the American model with the closest Intelligence Index,
# so that the delta in Intelligence Index between the two groups is minimal.
# NOTE(review): candidates are never removed from df_american inside the loop, so the same
# American model could be matched to more than one Chinese model — confirm this is acceptable.
matched_americans = []
for score in top_scores:
    # Work on a copy so the helper 'diff' column never ends up in df_american itself
    df_american_copy = df_american.copy()
    df_american_copy['diff'] = abs(df_american_copy['Intelligence Index'] - score)
    closest_idx = df_american_copy['diff'].idxmin()
    closest = df_american_copy.loc[closest_idx]
    matched_americans.append(closest)

# Create DataFrame (one row per matched American model, including the helper 'diff' column)
matched_df = pd.DataFrame(matched_americans)
print("\nMatched American LLMs:")
print(matched_df[['Model', 'Intelligence Index']])

Top 3 Chinese LLMs:
                 Model  Intelligence Index
15   DeepSeek V3.2 Exp                  57
28         GLM-4.5-Air                  49
53  Qwen3 Omni 30B A3B                  40

Matched American LLMs:
                  Model  Intelligence Index
12  gpt-oss-120B (high)                  58
29     Grok Code Fast 1                  49
54          Grok 4 Fast                  39


In [14]:
# Leaderboard display names of the matched American models
us_models = matched_df["Model"].tolist()

In [15]:
print(us_models)

['gpt-oss-120B (high)', 'Grok Code Fast 1', 'Grok 4 Fast']


In [13]:
print(chinese_models)

['DeepSeek V3.2 Exp', 'GLM-4.5-Air', 'Qwen3 Omni 30B A3B']


## Retrieval of OpenRouter IDs

In [16]:
# Maps leaderboard display names to the OpenRouter model IDs used in the configs.
# NOTE(review): "Qwen3 Omni 30B A3B" is mapped to "qwen/qwen3-next-80b-a3b-thinking", whose name
# points to a different model (80B "next" vs. 30B "Omni") — confirm this substitution is intended.
model_lookup = {
"DeepSeek V3.2 Exp": "deepseek/deepseek-v3.2-exp",
"GLM-4.5-Air": "z-ai/glm-4.5-air",
"Qwen3 Omni 30B A3B": "qwen/qwen3-next-80b-a3b-thinking",
"gpt-oss-120B (high)": "openai/gpt-oss-120b",
"Grok Code Fast 1": "x-ai/grok-code-fast-1",
"Grok 4 Fast": "x-ai/grok-4-fast"
}

In [15]:
# Resolve the Chinese display names to OpenRouter IDs; names without a lookup entry are
# reported and left out of CHINESE_MODELS.
CHINESE_MODELS = []
for model in chinese_models:
    if model not in model_lookup:
        print(f"{model}: Not found in lookup")
        continue
    print(f"{model}: {model_lookup[model]}")
    CHINESE_MODELS.append(model_lookup[model])

DeepSeek V3.2 Exp: deepseek/deepseek-v3.2-exp
GLM-4.5-Air: z-ai/glm-4.5-air
Qwen3 Omni 30B A3B: qwen/qwen3-next-80b-a3b-thinking


In [17]:
# Resolve the American display names to OpenRouter IDs; names without a lookup entry are
# reported and left out of AMERICAN_MODELS.
AMERICAN_MODELS = []
for model in us_models:
    if model not in model_lookup:
        print(f"{model}: Not found in lookup")
        continue
    print(f"{model}: {model_lookup[model]}")
    AMERICAN_MODELS.append(model_lookup[model])

gpt-oss-120B (high): openai/gpt-oss-120b
Grok Code Fast 1: x-ai/grok-code-fast-1
Grok 4 Fast: x-ai/grok-4-fast


# 3) Config Generation

Generates aligned YAML configurations for each group.
- Language is English for all runs.
- Utility agent: `gemini-2.5-flash`.
- Voting detection mode: `complex`.


In [20]:
# Base paths and groups: everything for this hypothesis lives below BASE_DIR
BASE_DIR = _REPO_ROOT / 'hypothesis_testing' / 'hypothesis_2'
CONFIGS_BASE, LOGS_BASE, RESULTS_BASE, TRANSCRIPTS_BASE = (
    BASE_DIR / sub for sub in ('configs', 'terminal_outputs', 'results', 'transcripts')
)

# Group key (used as sub-folder name) -> human-readable label
GROUPS = dict(american='American LLMs', chinese='Chinese LLMs')

# Ensure one sub-folder per group exists under every base directory
for key in GROUPS:
    for _base_dir in (CONFIGS_BASE, LOGS_BASE, RESULTS_BASE, TRANSCRIPTS_BASE):
        (_base_dir / key).mkdir(parents=True, exist_ok=True)
CONFIGS_BASE, LOGS_BASE, RESULTS_BASE, TRANSCRIPTS_BASE, GROUPS


(PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_2/configs'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_2/terminal_outputs'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_2/results'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/hypothesis_2/transcripts'),
 {'american': 'American LLMs', 'chinese': 'Chinese LLMs'})

In [None]:
# Income class probabilities (must sum to 1.0)
INCOME_CLASS_PROBS = {
    'high': 0.05,
    'medium_high': 0.10,
    'medium': 0.50,
    'medium_low': 0.25,
    'low': 0.10,
}
# Fail fast if the table is edited inconsistently; a tolerance absorbs float rounding.
assert abs(sum(INCOME_CLASS_PROBS.values()) - 1.0) < 1e-9, 'INCOME_CLASS_PROBS must sum to 1.0'

# American and Chinese model pools (OpenRouter IDs)

def make_agents_with_models(temp: float, models: list[str]) -> list[dict]:
    """Build the three participant-agent entries of a config.

    Args:
        temp: Temperature applied to every agent (stored as ``float``).
        models: Model IDs; positions 0..2 are assigned to Agent_1..Agent_3.
            Fewer than three entries raises ``IndexError``; extra entries
            are ignored.

    Returns:
        A list of three agent dictionaries, in agent order.
    """
    return [
        {
            'name': f'Agent_{slot + 1}',
            'personality': 'You are a college student',
            'model': models[slot],
            'temperature': float(temp),
            'memory_character_limit': 25000,
            'reasoning_enabled': True,
        }
        for slot in range(3)  # 3 participant agents
    ]

def build_config(temp: float, models: list[str]) -> dict:
    """Assemble one experiment configuration as a plain dictionary.

    Args:
        temp: Temperature shared by all participant agents.
        models: Model IDs handed to ``make_agents_with_models``.

    Returns:
        The configuration dict; key order is the order written to YAML.

    NOTE(review): the notebook's markdown names `gemini-2.5-flash` as the
    utility agent and a "complex" voting detection mode, but this dict uses
    'gemini-2.0-flash-lite' and sets no voting-detection key — confirm which
    is intended.
    """
    return dict(
        language='English',
        agents=make_agents_with_models(temp, models),
        utility_agent_model='gemini-2.0-flash-lite',
        utility_agent_temperature=0.0,
        phase2_rounds=10,
        distribution_range_phase2=[2, 6],
        income_class_probabilities=INCOME_CLASS_PROBS,
        original_values_mode=dict(enabled=True),
        transcript_logging=dict(enabled=True),
        logging=dict(verbosity_level='debug'),  # or minimal, detailed, debug
    )

def _pick_american_models() -> list[str]:
    """Draw four model IDs from AMERICAN_MODELS, independently and with repeats allowed.

    Note: ``make_agents_with_models`` reads only the first three entries.
    """
    picks: list[str] = []
    for _ in range(4):
        picks.append(random.choice(AMERICAN_MODELS))
    return picks

def _pick_chinese_models() -> list[str]:
    """Draw four model IDs from CHINESE_MODELS, independently and with repeats allowed.

    Note: ``make_agents_with_models`` reads only the first three entries.
    """
    picks: list[str] = []
    for _ in range(4):
        picks.append(random.choice(CHINESE_MODELS))
    return picks

def generate_aligned_configs(
    n: int = 33,
    *,
    zero_temp_count: int = 11,
    max_temp: float = 1.0,
    seed: int | None = None,
) -> dict[str, list[Path]]:
    """Write ``n`` aligned YAML configs per group and return their paths.

    For every condition index the same temperature is used for the American
    and the Chinese config; only the agent models differ between groups.

    Args:
        n: Number of conditions (configs per group).
        zero_temp_count: The first ``zero_temp_count`` conditions use
            temperature 0.0; the remaining ones draw a random temperature.
        max_temp: Upper bound of the uniform temperature draw. The default of
            1.0 is lower than in Hypothesis 1 because the models used here are
            less capable and therefore more likely to fail at high temperature.
        seed: If given, seeds the module-level ``random`` generator before any
            draw so that temperatures and model picks are reproducible. With
            ``None`` (default) the RNG state is left untouched.

    Returns:
        Mapping from group key to the list of written config paths, ordered by
        condition index.
    """
    if seed is not None:
        # The model pickers use the module-level RNG as well, so seed that one.
        random.seed(seed)

    paths: dict[str, list[Path]] = {k: [] for k in GROUPS}
    for idx in range(1, n + 1):
        if idx <= zero_temp_count:
            temp = 0.0
        else:
            temp = random.uniform(0.0, max_temp)

        # Build per-group models
        models_american = _pick_american_models()
        models_chinese = _pick_chinese_models()

        # Write configs
        for group_key, models in [('american', models_american), ('chinese', models_chinese)]:
            cfg = build_config(temp=temp, models=models)
            out_dir = CONFIGS_BASE / group_key
            out_dir.mkdir(parents=True, exist_ok=True)
            fname = out_dir / f'hypothesis_2_{group_key}_condition_{idx}_config.yaml'
            with open(fname, 'w', encoding='utf-8') as f:
                yaml.safe_dump(cfg, f, sort_keys=False)
            paths[group_key].append(fname)
    return paths

# Example (commented):
#files_by_group = generate_aligned_configs(n=33)
#{k: len(v) for k, v in files_by_group.items()}

{'american': 33, 'chinese': 33}

# 4) Run Configs 

Select subsets and run with per-group logs/results directories.


In [18]:
def run_group(group_key: str, include_indices=None, include_names=None, concurrency: int = 4, timeout_sec: int | None = None):
    """Run a selection of one group's configs in parallel.

    Args:
        group_key: Sub-folder name of the group below the configs, logs and
            results base directories.
        include_indices: Passed through to ``select_configs``.
        include_names: Passed through to ``select_configs``.
        concurrency: Passed through to ``run_configs_in_parallel``.
        timeout_sec: Passed through to ``run_configs_in_parallel``.

    Returns:
        Whatever ``run_configs_in_parallel`` returns; each item is queried
        with ``.get('ok')`` to print the success count.
    """
    all_configs = list_config_files(CONFIGS_BASE / group_key)
    chosen = select_configs(all_configs, include_indices=include_indices, include_names=include_names)
    print(f'[{group_key}] Found {len(all_configs)} configs; selected {len(chosen)}')

    outcomes = run_configs_in_parallel(
        chosen,
        concurrency=concurrency,
        logs_dir=LOGS_BASE / group_key,
        results_dir=RESULTS_BASE / group_key,
        timeout_sec=timeout_sec,
    )

    n_ok = len([outcome for outcome in outcomes if outcome.get('ok')])
    print(f'[{group_key}] Completed: {n_ok}/{len(outcomes)} OK')
    return outcomes




In [None]:
#rr_us = run_group('american', include_indices=list(range(1, 34)) , concurrency=8)

[american] Found 34 configs; selected 34
[american] Completed: 34/34 OK


In [None]:
#rr_cn = run_group('chinese', include_indices=list(range(1,34)) , concurrency=8)

[chinese] Found 34 configs; selected 22
[chinese] Completed: 13/22 OK


# 5) Analysis — Compare Outcomes Across Groups

Build a 5×2 contingency table (rows=principle/disagreement categories; columns=American/Chinese)
and run Fisher–Freeman–Halton exact test via R when available.
Compute Cramér's V with bias correction (as in Hypothesis 1). No Chi-square fallback is included.


In [31]:
# Outcome categories; the order defines the row order of the contingency table.
CATEGORIES = [
    'maximizing_floor',
    'maximizing_average',
    'maximizing_average_floor_constraint',
    'maximizing_average_range_constraint',
    'disagreement',
]

def categorize_result(result_path: Path) -> str:
    """Map one results JSON file to an outcome category.

    The file is expected to hold a JSON object with a ``general_information``
    object. If ``consensus_reached`` is truthy and ``consensus_principle`` is
    one of ``CATEGORIES``, that principle is returned; otherwise the run counts
    as ``'disagreement'``.

    Files that cannot be read, are not valid JSON, or lack a
    ``general_information`` object are still returned as ``'disagreement'``,
    but a ``RuntimeWarning`` is emitted so that broken runs are not silently
    mixed into the statistics.

    Args:
        result_path: Path of the results JSON file.

    Returns:
        One of the strings in ``CATEGORIES``.
    """
    import warnings  # local import: the notebook's import cell does not provide it

    try:
        with open(result_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        warnings.warn(
            f"Could not read result file {result_path}: {exc!r}; counting it as 'disagreement'",
            RuntimeWarning,
            stacklevel=2,
        )
        return 'disagreement'

    gi = data.get('general_information') if isinstance(data, dict) else None
    if not isinstance(gi, dict):
        warnings.warn(
            f"Result file {result_path} has no 'general_information' object; counting it as 'disagreement'",
            RuntimeWarning,
            stacklevel=2,
        )
        return 'disagreement'

    consensus = gi.get('consensus_reached', False)
    principle = gi.get('consensus_principle')
    if consensus and principle in CATEGORIES:
        return principle
    return 'disagreement'

def count_by_group() -> dict[str, Counter]:
    """Tally outcome categories per group from the stored result files.

    For every key in ``GROUPS`` all ``*_results.json`` files in that group's
    results folder are categorized (in sorted path order). Categories that
    never occur are added with a count of 0, so every Counter has an entry
    for each element of ``CATEGORIES``.

    Returns:
        Mapping from group key to its category Counter.
    """
    per_group: dict[str, Counter] = {}
    for group_key in GROUPS:
        result_paths = sorted((RESULTS_BASE / group_key).glob('*_results.json'))
        tally = Counter(categorize_result(path) for path in result_paths)
        for category in CATEGORIES:
            tally.setdefault(category, 0)
        per_group[group_key] = tally
    return per_group

# Tally the stored results per group and show the raw counts
group_counts = count_by_group()
for k, counts in group_counts.items():
    print(f'{k.capitalize()} counts:', dict(counts))

# Build contingency table: rows=categories (in CATEGORIES order), cols=[American, Chinese]
col_order = ['american', 'chinese']
contingency = np.vstack([[group_counts[col][cat] for col in col_order] for cat in CATEGORIES])
# Display the table together with its row and column labels
contingency, CATEGORIES, col_order


American counts: {'maximizing_floor': 14, 'maximizing_average_floor_constraint': 15, 'disagreement': 4, 'maximizing_average': 0, 'maximizing_average_range_constraint': 0}
Chinese counts: {'maximizing_average_floor_constraint': 21, 'maximizing_floor': 4, 'disagreement': 6, 'maximizing_average': 2, 'maximizing_average_range_constraint': 0}


(array([[14,  4],
        [ 0,  2],
        [15, 21],
        [ 0,  0],
        [ 4,  6]]),
 ['maximizing_floor',
  'maximizing_average',
  'maximizing_average_floor_constraint',
  'maximizing_average_range_constraint',
  'disagreement'],
 ['american', 'chinese'])

In [30]:
def fisher_freeman_halton_pvalue_r(contingency: np.ndarray) -> float | None:
    """Run Fisher–Freeman–Halton test via R's fisher.test if available.
    Returns p-value or None if Rscript not found or fails.
    """
    if shutil.which('Rscript') is None:
        return None
    r_matrix = ','.join(str(int(x)) for x in contingency.flatten(order='C'))
    nrow, ncol = contingency.shape
    r_code = f"""m <- matrix(c({r_matrix}), nrow={nrow}, ncol={ncol}, byrow=TRUE);
f <- tryCatch(fisher.test(m), error=function(e) NA);
if (is.list(f)) cat(f$p.value) else cat('NA')
"""
    import subprocess
    try:
        out = subprocess.check_output(['Rscript', '-e', r_code], stderr=subprocess.STDOUT, text=True)
        out = out.strip()
        return float(out) if out and out != 'NA' else None
    except Exception:
        return None

p_ffh = fisher_freeman_halton_pvalue_r(contingency)
if p_ffh is None:
    # None is returned both when Rscript is missing and when the R call fails or yields no
    # parsable p-value, so the message must not claim that R is unavailable.
    print('Fisher–Freeman–Halton exact test unavailable (Rscript not found or R call failed); skipping')
else:
    print(f'Fisher–Freeman–Halton exact test p-value: {p_ffh:.6f}')


Fisher–Freeman–Halton exact test p-value: 0.024749


In [32]:
from __future__ import annotations

import numpy as np
from scipy.stats import chi2_contingency


def _prune_empty_rows_cols(contingency: np.ndarray) -> np.ndarray:
    """
    Drop all-zero rows and columns to avoid degenerate shapes and expected=0 cells.
    Returns the original array if nothing needs pruning.
    """
    contingency = np.asarray(contingency, dtype=float)
    if contingency.ndim != 2:
        raise ValueError("contingency must be a 2D array")

    row_mask = contingency.sum(axis=1) > 0
    col_mask = contingency.sum(axis=0) > 0

    if row_mask.all() and col_mask.all():
        return contingency
    pruned = contingency[row_mask][:, col_mask]
    return pruned


def cramers_v(contingency: np.ndarray) -> float:
    """Uncorrected Cramér's V from Pearson's chi-square (no Yates correction).

    Empty rows/columns are pruned first. Returns 0.0 when the table has no
    observations or when fewer than two rows or two columns remain.
    """
    table = _prune_empty_rows_cols(contingency)
    total = table.sum()
    if total <= 0:
        return 0.0

    n_rows, n_cols = table.shape
    smaller_dof = min(n_rows - 1, n_cols - 1)
    if smaller_dof <= 0:
        return 0.0

    chi2_stat, *_ = chi2_contingency(table, correction=False)
    phi2 = chi2_stat / total
    return float(np.sqrt(phi2 / smaller_dof))


def bias_corrected_cramers_v(contingency: np.ndarray) -> float:
    """Bias-corrected Cramér's V after Bergsma (2013).

    With r rows, c columns and n observations (after pruning empty
    rows/columns):

        phi2_corr = max(0, chi2/n - (r-1)(c-1)/(n-1))
        r_corr    = r - (r-1)^2/(n-1)
        c_corr    = c - (c-1)^2/(n-1)
        V_tilde   = sqrt(phi2_corr / min(r_corr-1, c_corr-1))

    Returns 0.0 when n <= 1, when fewer than two rows or columns remain, or
    when the corrected denominator is not positive.
    """
    table = _prune_empty_rows_cols(contingency)
    total = table.sum()
    if total <= 1:
        return 0.0

    n_rows, n_cols = table.shape
    dof_rows, dof_cols = n_rows - 1, n_cols - 1
    if min(dof_rows, dof_cols) <= 0:
        return 0.0

    chi2_stat, *_ = chi2_contingency(table, correction=False)
    n_minus_1 = total - 1

    # Subtract the expected value of phi^2 under independence
    phi2_adj = max(0.0, chi2_stat / total - (dof_rows * dof_cols) / n_minus_1)

    # Shrink the table dimensions accordingly
    rows_adj = n_rows - (dof_rows * dof_rows) / n_minus_1
    cols_adj = n_cols - (dof_cols * dof_cols) / n_minus_1
    scale = min(rows_adj - 1.0, cols_adj - 1.0)
    if scale <= 0.0:
        return 0.0

    return float(np.sqrt(phi2_adj / scale))


def bootstrap_cramers_v(
    contingency: np.ndarray,
    n_bootstrap: int = 5000,
    confidence_level: float = 0.95,
    bias_corrected: bool = True,
    seed: int | None = 123,
) -> tuple[np.ndarray, float, float]:
    """Percentile bootstrap for Cramér's V.

    Each replicate redraws the whole table from Multinomial(n, p), where p is
    the observed cell distribution, and recomputes V (bias-corrected or
    plain, depending on ``bias_corrected``).

    Args:
        contingency: Observed table of counts.
        n_bootstrap: Number of bootstrap replicates.
        confidence_level: Coverage of the percentile interval, in (0, 1).
        bias_corrected: Use ``bias_corrected_cramers_v`` instead of ``cramers_v``.
        seed: Seed for ``np.random.default_rng``.

    Returns:
        ``(vs, lo, hi)``: the replicate values and the lower/upper percentile
        bounds. For a table without observations: ``(empty array, 0.0, 0.0)``.

    Raises:
        ValueError: If ``confidence_level`` is outside (0, 1).
    """
    observed = np.asarray(contingency, dtype=float)
    total = int(observed.sum())
    if total <= 0:
        return np.array([]), 0.0, 0.0
    if not (0.0 < confidence_level < 1.0):
        raise ValueError("confidence_level must be in (0, 1)")

    cell_probs = (observed / total).ravel()
    statistic = bias_corrected_cramers_v if bias_corrected else cramers_v

    rng = np.random.default_rng(seed)
    replicates = np.empty(n_bootstrap, dtype=float)
    for rep in range(n_bootstrap):
        # One resampled table per replicate
        resampled = rng.multinomial(total, cell_probs).reshape(observed.shape)
        replicates[rep] = statistic(resampled)

    tail = (1.0 - confidence_level) / 2.0
    lower, upper = np.quantile(replicates, [tail, 1.0 - tail])
    return replicates, float(lower), float(upper)


# Effect size for the observed table: plain and bias-corrected Cramér's V
cv = cramers_v(contingency)
cv_corr = bias_corrected_cramers_v(contingency)
print(f"Cramér's V (uncorrected): {cv:.4f}")
print(f"Cramér's V (bias-corrected): {cv_corr:.4f}")

Cramér's V (uncorrected): 0.3684
Cramér's V (bias-corrected): 0.3016
