# Repeatable Statistical Inference Demo - Repeatable Statistical Query

In [9]:
# Add parent directory to path for imports
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

# Import our implementations
from distributions import (
    Distribution01, BernoulliRare, BetaSkewed, 
    BimodalMixture, UniformSpike, TruncatedNormal
)

from stat_query import (
    RepeatableStatQuery, RepeatableTestConfig
)

# Plot appearance: seaborn-flavoured matplotlib theme, "husl" colour palette,
# then a single batch of rcParams overrides (figure size and enlarged fonts).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 14,            # base font size
    'axes.labelsize': 16,       # axis labels
    'axes.titlesize': 18,       # per-axes title
    'xtick.labelsize': 14,      # x-axis tick labels
    'ytick.labelsize': 14,      # y-axis tick labels
    'legend.fontsize': 14,      # legend entries
    'figure.titlesize': 20,     # figure-level title
})

In [10]:
def _run_basic_sq_trials(test_engine, dist, importance_dist, config, method, n_trials=8):
    """Run Algorithm 1 repeatedly, printing one table row per trial.

    Args:
        test_engine: Object exposing ``_algorithm1_basic_sq(dist,
            importance_dist, config)``; the call is unpacked as
            ``(estimate, n_samples, emp_variance)``.
        dist: Target distribution; ``dist.true_mean()`` is the reference the
            estimate is compared against.
        importance_dist: Proposal distribution handed to the engine, or
            ``None`` for plain Monte Carlo.
        config: Configuration object passed through to the engine unchanged.
        method: Label stored under the ``'method'`` key of every result
            (``'MC'`` or ``'IS'`` in this notebook).
        n_trials: Number of independent runs.

    Returns:
        A list with one dict per trial, keyed by ``'trial'`` (1-based),
        ``'estimate'``, ``'n_samples'``, ``'emp_variance'``, ``'difference'``
        and ``'method'``.
    """
    print("Trial | Estimate | Samples | Emp. Variance | Diff from True")
    print("-" * 65)

    results = []
    for trial in range(1, n_trials + 1):
        estimate, n_samples, emp_var = test_engine._algorithm1_basic_sq(
            dist, importance_dist, config
        )
        difference = abs(estimate - dist.true_mean())
        results.append({
            'trial': trial,
            'estimate': estimate,
            'n_samples': n_samples,
            'emp_variance': emp_var,
            'difference': difference,
            'method': method,
        })
        print(f"{trial:5d} | {estimate:8.4f} | {n_samples:7d} | {emp_var:12.6f} | {difference:8.4f}")

    return results


def demo_non_repeatable_basic_sq():
    """
    Demonstrate the non-repeatability of Algorithm 1 (Basic Statistical Query).

    For every test distribution and every configuration, Algorithm 1 is run
    8 times with plain Monte Carlo and, when the distribution suggests an
    importance distribution with a different name, 8 more times with
    importance sampling. Each run is printed as a table row, followed by a
    short variability summary (estimate range / std dev, sample-count range /
    mean, and MC-vs-IS std dev when both were run).

    Returns:
        A flat list of per-trial result dicts (keys: 'trial', 'estimate',
        'n_samples', 'emp_variance', 'difference', 'method') across all
        distributions and configurations, in execution order.
    """
    print("=" * 80)
    print("NON-REPEATABLE BASIC STATISTICAL QUERY DEMONSTRATION")
    print("=" * 80)

    # Initialize testing framework
    test_engine = RepeatableStatQuery()

    # Test all distributions
    test_distributions = [
        BernoulliRare(p=0.05),                     # 1. Rare events
        BetaSkewed(alpha=0.5, beta=2.0),           # 2. Right-skewed
        TruncatedNormal(mu=0.3, sigma=0.15),       # 3. Truncated normal
        BimodalMixture(weight=0.3),                # 4. Bimodal mixture
        UniformSpike(spike_location=0.9, spike_prob=0.1),  # 5. Uniform with spike
    ]

    # Different configurations to show variability across parameter settings.
    # random_seed=None is passed so the runs are not pinned to a fixed seed.
    configs = [
        RepeatableTestConfig(gamma=0.02, c=0.05, beta=0.1, max_samples=3000, random_seed=None),
        RepeatableTestConfig(gamma=0.01, c=0.02, beta=0.05, max_samples=5000, random_seed=None),
    ]

    all_results = []

    for dist_idx, dist in enumerate(test_distributions, start=1):
        print(f"\n{'=' * 60}")
        print(f"DISTRIBUTION {dist_idx}: {dist.name}")
        print(f"True Mean: {dist.true_mean():.4f}")
        print('=' * 60)

        for config_idx, config in enumerate(configs, start=1):
            print(f"\nConfiguration {config_idx}: γ={config.gamma}, c={config.c}")
            print(f"Target Accuracy: {config.gamma}")
            print(f"Confidence Level: {1-config.c:.2%}")

            # Monte Carlo: no importance distribution is passed to the engine.
            print("\n--- Monte Carlo Sampling ---")
            mc_results = _run_basic_sq_trials(test_engine, dist, None, config, 'MC')

            # Importance sampling only when the suggested proposal differs
            # (by name) from the target distribution itself.
            importance_dist = dist.suggest_importance_distribution()
            if importance_dist.name != dist.name:
                print(f"\n--- Importance Sampling with {importance_dist.name} ---")
                is_results = _run_basic_sq_trials(
                    test_engine, dist, importance_dist, config, 'IS'
                )
            else:
                is_results = []
                print("\n--- No Importance Sampling Available ---")
                print(f"(Same distribution suggested: {importance_dist.name})")

            # Analysis of variability for this distribution and config
            combined_results = mc_results + is_results

            if combined_results:
                estimates = [r['estimate'] for r in combined_results]
                sample_counts = [r['n_samples'] for r in combined_results]

                print("\n📊 VARIABILITY ANALYSIS:")
                print(f"   Estimate Range: [{min(estimates):.4f}, {max(estimates):.4f}]")
                print(f"   Estimate Std Dev: {np.std(estimates):.4f}")
                print(f"   Sample Count Range: [{min(sample_counts)}, {max(sample_counts)}]")
                print(f"   Avg Sample Count: {np.mean(sample_counts):.1f}")

                # Compare MC vs IS if both available
                if is_results:
                    mc_estimates = [r['estimate'] for r in mc_results]
                    is_estimates = [r['estimate'] for r in is_results]
                    print(f"   MC Estimate Std: {np.std(mc_estimates):.4f}")
                    print(f"   IS Estimate Std: {np.std(is_estimates):.4f}")

            all_results.extend(combined_results)

    return all_results

In [11]:
# Demonstrate the non-repeatability problem first: run the basic (Algorithm 1)
# statistical-query demo, which prints its per-trial tables, and keep the
# returned flat list of per-trial result dicts in `basic_sq_results`.
basic_sq_results = demo_non_repeatable_basic_sq()

NON-REPEATABLE BASIC STATISTICAL QUERY DEMONSTRATION

DISTRIBUTION 1: Bernoulli(0.050)
True Mean: 0.0500

Configuration 1: γ=0.02, c=0.05
Target Accuracy: 0.02
Confidence Level: 95.00%

--- Monte Carlo Sampling ---
Trial | Estimate | Samples | Emp. Variance | Diff from True
-----------------------------------------------------------------
    1 |   0.0437 |    1510 |     0.041798 |   0.0063
    2 |   0.0483 |    1594 |     0.045973 |   0.0017
    3 |   0.0495 |    1615 |     0.047082 |   0.0005
    4 |   0.0462 |    1557 |     0.044104 |   0.0038
    5 |   0.0479 |    1586 |     0.045623 |   0.0021
    6 |   0.0419 |    1478 |     0.040189 |   0.0081
    7 |   0.0458 |    1549 |     0.043735 |   0.0042
    8 |   0.0530 |    1678 |     0.050226 |   0.0030

--- Importance Sampling with Bernoulli(0.150) ---
Trial | Estimate | Samples | Emp. Variance | Diff from True
-----------------------------------------------------------------
    1 |   0.0517 |    1075 |     0.014508 |   0.0017
    

In [12]:
def demo_repeatable_testing():
    """
    Demonstrate the repeatable testing framework from the paper.

    Prints three sections:
    0. A "parameter validation and suggestions" header (no checks are run
       in this function; only the header is printed).
    1. Repeatability with α-quantization: for every test distribution and
       configuration, ``demonstrate_exact_repeatability`` is called with 10
       trials using Monte Carlo and, when the distribution suggests an
       importance distribution with a different name, again using
       importance sampling.
    2. Empirical Bernstein stopping criterion: ``algorithm2_repeatable_test``
       is run on ``BernoulliRare(p=0.05)`` for γ in (0.05, 0.02) and the
       sample count, empirical variance and convergence flag are printed.

    Returns:
        None. All reporting is done through ``print``.
    """
    print("=" * 80)
    print("REPEATABLE STATISTICAL TESTING DEMONSTRATION")
    print("=" * 80)

    # Demonstrate parameter validation
    print("\n0. PARAMETER VALIDATION AND SUGGESTIONS")
    print("-" * 50)

    # Initialize testing framework
    test_engine = RepeatableStatQuery()

    # Test distributions (reduced for faster demo)
    test_distributions = [
        BernoulliRare(p=0.05),                     # 1. Rare events
        BetaSkewed(alpha=0.5, beta=2.0),           # 2. Right-skewed
        TruncatedNormal(mu=0.3, sigma=0.15),       # 3. Truncated normal
        BimodalMixture(weight=0.3),                # 4. Bimodal mixture
    ]

    # Test configurations (ensure (1-c)² - (1-β) ≥ 0)
    configs = [
        RepeatableTestConfig(gamma=0.05, c=0.05, beta=0.1, max_samples=5000, random_seed=None),
        RepeatableTestConfig(gamma=0.05, c=0.02, beta=0.05, max_samples=10000, random_seed=None),
    ]

    print("\n1. REPEATABILITY DEMONSTRATION")
    print("-" * 50)
    print("Note: Repeatability is guaranteed with probability 1-β.")

    # NOTE(review): repeatability_results and bernstein_results are collected
    # but never returned or read. The return value is left as None because the
    # notebook calls this function as a bare expression, so a returned value
    # would be echoed as cell output. Confirm whether callers want the results.
    repeatability_results = []

    for dist in test_distributions:
        for config in configs:
            print(f"\n--- {dist.name} with γ={config.gamma} ---")

            # Monte Carlo repeatability (more trials to observe probabilistic failures)
            mc_result = test_engine.demonstrate_exact_repeatability(
                dist, None, config, n_trials=10
            )

            # Importance Sampling repeatability, only when the suggested
            # proposal differs (by name) from the target distribution.
            importance_dist = dist.suggest_importance_distribution()
            if importance_dist.name != dist.name:
                print(f"\nImportance Sampling with {importance_dist.name}:")
                is_result = test_engine.demonstrate_exact_repeatability(
                    dist, importance_dist, config, n_trials=10
                )
            else:
                is_result = None

            repeatability_results.append({
                'distribution': dist.name,
                'monte_carlo': mc_result,
                'importance_sampling': is_result,
            })

    print("\n2. EMPIRICAL BERNSTEIN STOPPING CRITERION")
    print("-" * 50)

    # Compare with different accuracy parameters
    bernstein_results = []
    test_dist = BernoulliRare(p=0.05)

    for gamma in (0.05, 0.02):
        config = RepeatableTestConfig(gamma=gamma, c=0.05, beta=0.1, max_samples=3000, random_seed=None)
        result = test_engine.algorithm2_repeatable_test(test_dist, None, config)

        print(f"γ={gamma:.3f}: n={result.n_samples}, "
              f"empirical_var={result.empirical_variance:.6f}, "
              f"converged={result.is_converged}")

        bernstein_results.append(result)

In [13]:
# Run the repeatable-testing demo. It reports through print() and has no
# return statement, so this cell shows only the printed output.
demo_repeatable_testing()

REPEATABLE STATISTICAL TESTING DEMONSTRATION

0. PARAMETER VALIDATION AND SUGGESTIONS
--------------------------------------------------

1. REPEATABILITY DEMONSTRATION
--------------------------------------------------
Note: Repeatability is guaranteed with probability 1-β.

--- Bernoulli(0.050) with γ=0.05 ---
Demonstrating exact repeatability for Bernoulli(0.050)
Parameters: γ=0.05, c=0.05, β=0.1
Computed α=0.094737
  Trial 1: original=0.045226, quantized=0.045455, n=398
  Trial 2: original=0.058824, quantized=0.045455, n=442
  Trial 3: original=0.045226, quantized=0.045455, n=398
  Trial 4: original=0.039578, quantized=0.045455, n=379
  Trial 5: original=0.043367, quantized=0.045455, n=392
  Trial 6: original=0.052257, quantized=0.045455, n=421
  Trial 7: original=0.055556, quantized=0.045455, n=432
  Trial 8: original=0.055556, quantized=0.045455, n=432
  Trial 9: original=0.050481, quantized=0.045455, n=416
  Trial 10: original=0.045226, quantized=0.045455, n=398
  ✓ All 1