# Reproducibility Proof - Main
This notebook demonstrates that the experiment is reproducible: with the seed mechanism and temperature set to 0, LLM runs (local LLMs or, as configured below, `gemini-2.5-flash-lite`) reproduce entire experiments word for word.

In [1]:
# Imports
import sys, os
from pathlib import Path

# Ensure repo root on sys.path (for local package imports)
def _add_repo_root_to_sys_path():
    """Locate the repository root and make it importable.

    Starting at the resolved current working directory and moving up through
    its ancestors, the first directory that holds both a ``main.py`` file and
    a ``hypothesis_testing`` directory is treated as the repo root. It is
    placed at the front of ``sys.path`` (unless already listed) and returned.

    If no directory qualifies, the resolved working directory is returned and
    ``sys.path`` is left untouched.
    """
    start = Path.cwd().resolve()
    for candidate in (start, *start.parents):
        looks_like_root = (
            (candidate / 'main.py').exists()
            and (candidate / 'hypothesis_testing').is_dir()
        )
        if not looks_like_root:
            continue
        root_entry = str(candidate)
        if root_entry not in sys.path:
            # Front of the list so local packages win over installed ones.
            sys.path.insert(0, root_entry)
        return candidate
    return start


_REPO_ROOT = _add_repo_root_to_sys_path()

import json
import hashlib
import yaml
from hypothesis_testing.utils_hypothesis_testing.runner import (
    list_config_files,
    select_configs,
    run_configs_in_parallel,
)



In [6]:
# Configuration paths and constants
# Every artifact of this proof lives under one directory inside the repo, so
# the base path is spelled out once and the sub-directories derive from it.
PROOF_DIR = _REPO_ROOT / 'hypothesis_testing' / 'reproducibility_proof'
CONFIG_DIR = PROOF_DIR / 'configs'
TERMINAL_OUTPUTS_DIR = PROOF_DIR / 'terminal_outputs'
RESULTS_DIR = PROOF_DIR / 'results'
TRANSCRIPTS_DIR = PROOF_DIR / 'transcripts'


# Placeholder model list for participant agents

MODEL_LIST = [
    "gemini-2.5-flash-lite",
]

# Income class probabilities (must sum to 1.0)
# Same as in Frohlich & Oppenheimer (1992) for the initial distribution
INCOME_CLASS_PROBS = {
    'high': 0.05,
    'medium_high': 0.10,
    'medium': 0.50,
    'medium_low': 0.25,
    'low': 0.10,
}

# Enforce the "must sum to 1.0" rule instead of only stating it; the small
# tolerance absorbs floating-point rounding in the sum.
_prob_total = sum(INCOME_CLASS_PROBS.values())
assert abs(_prob_total - 1.0) < 1e-9, (
    f'INCOME_CLASS_PROBS must sum to 1.0, got {_prob_total}'
)

# Ensure directories exist
for _dir in (CONFIG_DIR, TERMINAL_OUTPUTS_DIR, RESULTS_DIR, TRANSCRIPTS_DIR):
    _dir.mkdir(parents=True, exist_ok=True)


CONFIG_DIR, TERMINAL_OUTPUTS_DIR, RESULTS_DIR, TRANSCRIPTS_DIR


(PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/reproducibility_proof/configs'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/reproducibility_proof/terminal_outputs'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/reproducibility_proof/results'),
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/reproducibility_proof/transcripts'))

## 1. Generate 2 identical configurations with transcript logging


In [4]:
def make_agents(n_agents: int = 2, model: "str | None" = None) -> list[dict]:
    """Build the participant-agent entries for an experiment config.

    Args:
        n_agents: Number of participant agents to create. Defaults to 2, the
            value this notebook uses.
        model: Model name assigned to every agent. When omitted, the first
            entry of ``MODEL_LIST`` is used, so all agents share one model
            (no random choice that could break reproducibility).

    Returns:
        A list of ``n_agents`` independent dicts named ``Agent_1`` ..
        ``Agent_<n_agents>``; an empty list when ``n_agents`` is 0 or less.
    """
    if model is None:
        model = MODEL_LIST[0]
    return [
        {
            'name': f'Agent_{number}',
            'personality': 'You are an American college student',
            'model': model,  # Same model for all agents
            'temperature': 0,  # Reproducible results
            # Kept low; the original note cites hardware constraints of
            # running locally. NOTE(review): confirm this still applies to
            # the hosted model configured in MODEL_LIST.
            'memory_character_limit': 20000,
            'reasoning_enabled': True,
        }
        for number in range(1, n_agents + 1)
    ]

def build_config(seed_val: int, condition_idx: int) -> dict:
    """Assemble the experiment configuration for one reproducibility run.

    Args:
        seed_val: Value stored under ``'seed'`` in the config.
        condition_idx: Run number; it only affects the transcript file name
            written under ``TRANSCRIPTS_DIR``.

    Returns:
        A dict ready to be dumped to YAML. Key order is significant because
        the caller dumps it without sorting keys.
    """
    transcript_target = TRANSCRIPTS_DIR / f'reproducibility_condition_{condition_idx}_transcript.json'

    # Transcript logging is what the later analysis compares between runs.
    transcript_settings = {
        'enabled': True,
        'include_memory_updates': True,
        'include_instructions': False,
        'include_input_prompts': True,
        'include_agent_responses': True,
        'output_path': str(transcript_target),
    }

    config = {
        'language': 'English',
        'seed': seed_val,  # seed for reproducibility
        'agents': make_agents(),
        'utility_agent_model': 'gemini-2.5-flash-lite',
        'utility_agent_temperature': 0.0,
        'phase2_rounds': 5,
        'distribution_range_phase2': [2, 6],
        'income_class_probabilities': INCOME_CLASS_PROBS,
        'original_values_mode': {'enabled': True},
        'transcript_logging': transcript_settings,
    }
    return config

# Generate 2 identical configs for reproducibility test
# All use the same seed (42) and temperature (0) to ensure identical results
generated_files = []
# Clear old reproducibility configs to ensure we only run the fresh pair
for stale_cfg in CONFIG_DIR.glob('reproducibility_condition_*_config.yaml'):
    stale_cfg.unlink()
for idx in range(1, 3):  # Generate 2 identical configs
    cfg = build_config(seed_val=42, condition_idx=idx)  # Same seed for all
    fname = CONFIG_DIR / f'reproducibility_condition_{idx}_config.yaml'
    # Remove any transcript left by a previous run so the analysis below can
    # only see fresh output. The path is read back from the config itself so
    # it cannot drift from the file name build_config() chose.
    transcript_path = Path(cfg['transcript_logging']['output_path'])
    transcript_path.unlink(missing_ok=True)
    # Explicit encoding keeps the written file independent of the platform's
    # locale default.
    with open(fname, 'w', encoding='utf-8') as f:
        yaml.safe_dump(cfg, f, sort_keys=False)
    generated_files.append(fname)

len(generated_files), generated_files[0] if generated_files else None



(2,
 PosixPath('/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/reproducability_proof/configs/reproducibility_condition_1_config.yaml'))

## 2. Run Configs (Parallel + Selective)

In [None]:
# Discover all config files
configs = list_config_files(CONFIG_DIR)
print(f'Found {len(configs)} configs')

# Selection controls
SELECT_INDICES = [1, 2]  # Run both identical configs
SELECT_NAMES = None    # e.g., ['condition_1', 'condition_3']
CONCURRENCY = 2        # Passed as `concurrency`; presumably 2 runs both configs at once, 1 one after the other
TIMEOUT_SECONDS = None # e.g., 900 for 15 minutes per run
# Set to False to skip the runs. Note that the config-generation cell deletes
# old transcripts, so the analysis section needs the runs to have executed
# after the configs were last generated.
RUN_EXPERIMENTS = True

selected = select_configs(configs, include_indices=SELECT_INDICES, include_names=SELECT_NAMES)
print(f'Selected {len(selected)} configs to run')

# The call used to be disabled by commenting out only its first line, which
# left the argument lines behind as a syntax error. The flag above is the
# supported way to skip it.
if RUN_EXPERIMENTS:
    run_results = run_configs_in_parallel(
        selected,
        concurrency=CONCURRENCY,
        logs_dir=TERMINAL_OUTPUTS_DIR,
        results_dir=RESULTS_DIR,
        timeout_sec=TIMEOUT_SECONDS,
    )
else:
    run_results = None
    print('Skipping runs (RUN_EXPERIMENTS is False)')



Found 2 configs
Selected 2 configs to run


## 3. Analysis — Compare transcript payloads for reproducibility

Since both runs use identical configurations (same seed, temperature 0, Gemini model),
their transcripts should be identical. This section validates reproducibility using the saved transcript files.


In [7]:
def load_transcript(path: Path) -> dict:
    """Read the UTF-8 encoded JSON file at ``path`` and return its parsed content."""
    with open(path, mode='r', encoding='utf-8') as transcript_file:
        raw_text = transcript_file.read()
    return json.loads(raw_text)

def strip_non_deterministic_fields(data):
    """Return a copy of ``data`` without the keys that vary between runs.

    Dicts and lists are walked recursively. Any dict entry whose key is one
    of the run-specific names below (timestamps, ids, config path) is left
    out at every nesting level. Values of any other type are returned as-is.
    The input itself is not modified.
    """
    volatile_keys = ('timestamp', 'experiment_id', 'created_at', 'last_updated', 'saved_at', 'config_file')

    if isinstance(data, list):
        return [strip_non_deterministic_fields(entry) for entry in data]
    if not isinstance(data, dict):
        return data
    return {
        name: strip_non_deterministic_fields(child)
        for name, child in data.items()
        if name not in volatile_keys
    }

def summarize_transcript(path: Path) -> dict:
    """Summarise one transcript file for the reproducibility comparison.

    The returned dict holds the file name, three values read from the
    transcript's ``experiment_metadata`` section (``None`` when absent), and
    a SHA-256 digest of the transcript with run-specific fields removed.
    """
    payload = load_transcript(path)

    # Hash a canonical form: run-specific fields removed, keys sorted, so
    # that two runs with the same content produce the same digest.
    canonical_json = json.dumps(strip_non_deterministic_fields(payload), sort_keys=True)
    digest = hashlib.sha256(canonical_json.encode('utf-8')).hexdigest()

    metadata = payload.get('experiment_metadata', {})
    summary = {'filename': path.name}
    for field in ('experiment_id', 'config_file', 'saved_at'):
        summary[field] = metadata.get(field)
    summary['digest'] = digest
    return summary

# Collect the transcripts written by the runs, in file-name order.
transcript_files = sorted(TRANSCRIPTS_DIR.glob('reproducibility_condition_*_transcript.json'))
print(f"Found {len(transcript_files)} transcript files")

# Summarise each transcript and report it as soon as it has been processed.
summaries = []
for idx, transcript_path in enumerate(transcript_files, start=1):
    summaries.append(summarize_transcript(transcript_path))
    summary = summaries[-1]
    print(f"\nRun {idx} ({transcript_path.name}):")
    print(f"  Experiment ID: {summary['experiment_id']}")
    print(f"  Digest: {summary['digest']}")
    print(f"  Saved at: {summary['saved_at']}")

# The comparison is only meaningful for exactly one pair of runs.
if len(summaries) != 2:
    print(f"\n⚠️  Expected 2 transcript files, found {len(summaries)}")
else:
    # Identical cleaned payloads collapse to a single distinct digest.
    digests = {item['digest'] for item in summaries}
    if len(digests) != 1:
        print("\n❌ FAILURE: Transcript payloads differ — investigate run outputs")
        print(f"Digests: {digests}")
    else:
        print("\n✅ SUCCESS: Transcript payloads match exactly — reproducibility confirmed!")

summaries

Found 2 transcript files

Run 1 (reproducibility_condition_1_transcript.json):
  Experiment ID: 9290fc37-1706-457b-a5cc-ccfad94387f8
  Digest: 803a16aa0b9669137495f2fca450748c4fbcabf696b390af507403e21ff70465
  Saved at: 2025-10-16T10:40:18.505044+00:00

Run 2 (reproducibility_condition_2_transcript.json):
  Experiment ID: 7476f52e-50a1-41c5-acc9-d8b4d8acd843
  Digest: 803a16aa0b9669137495f2fca450748c4fbcabf696b390af507403e21ff70465
  Saved at: 2025-10-16T10:40:17.878334+00:00

✅ SUCCESS: Transcript payloads match exactly — reproducibility confirmed!


[{'filename': 'reproducibility_condition_1_transcript.json',
  'experiment_id': '9290fc37-1706-457b-a5cc-ccfad94387f8',
  'config_file': '/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/reproducibility_proof_gemini/configs/reproducibility_condition_1_config.yaml',
  'saved_at': '2025-10-16T10:40:18.505044+00:00',
  'digest': '803a16aa0b9669137495f2fca450748c4fbcabf696b390af507403e21ff70465'},
 {'filename': 'reproducibility_condition_2_transcript.json',
  'experiment_id': '7476f52e-50a1-41c5-acc9-d8b4d8acd843',
  'config_file': '/Users/lucasmuller/Desktop/Githubg/Rawls_v3/hypothesis_testing/reproducibility_proof_gemini/configs/reproducibility_condition_2_config.yaml',
  'saved_at': '2025-10-16T10:40:17.878334+00:00',
  'digest': '803a16aa0b9669137495f2fca450748c4fbcabf696b390af507403e21ff70465'}]