# 12 - Statistical Validation (Phase 3)

**Author:** Tan Ming Kai (24PMR12003)  
**Date:** 2025-11-24  
**Purpose:** Statistical validation of Phase 2 results for thesis Chapter 5

---

## Objectives

1. Calculate **95% confidence intervals** for all models (bootstrap method)
2. Perform **hypothesis testing**: H₁ (CrossViT vs baselines)
3. Apply **Bonferroni correction** for multiple comparisons (α' = 0.01)
4. Calculate **effect sizes** (Cohen's d)
5. Generate **APA-formatted tables** for thesis

---

## Hypotheses

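**H₁ (Primary):** CrossViT-Tiny achieves significantly higher accuracy than the CNN baselines (ResNet-50, DenseNet-121, EfficientNet-B0) at a family-wise significance level of α = 0.05 (each individual comparison is judged at the Bonferroni-corrected α' = 0.01)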

**Statistical Tests:**
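- Paired t-test (all models use the same train/val/test splits, so their results are paired)
- Bonferroni correction: α' = 0.05 / 5 comparisons = 0.01 (CrossViT-Tiny vs each of the 5 baselines: ResNet-50, DenseNet-121, EfficientNet-B0, ViT-Tiny, Swin-Tiny)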
- Effect size: Cohen's d

---

In [None]:
# Standard imports
import os, sys, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_rel, ttest_ind

# Silence only deprecation-style noise from the plotting/data libraries.
# RuntimeWarnings stay visible on purpose: in a statistical notebook they flag
# numerical problems (division by zero, precision loss in a test statistic).
for _noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6.
# Try the new name first, then the legacy one; if neither is installed the
# default style is kept instead of crashing the notebook.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

print("[OK] Imports complete")

In [None]:
# Configuration
# Inputs are the per-model metric CSVs under RESULTS_DIR; everything this
# notebook produces is written under OUTPUT_DIR.
RESULTS_DIR = Path("../experiments/phase2_systematic/results/metrics")
OUTPUT_DIR = Path("../experiments/phase3_analysis/statistical_validation")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # safe to re-run: no error if it exists

# Display name -> results CSV file name (resolved relative to RESULTS_DIR)
models = {
    'CrossViT-Tiny': 'crossvit_results.csv',
    'ResNet-50': 'resnet50_results.csv',
    'DenseNet-121': 'densenet121_results.csv',
    'EfficientNet-B0': 'efficientnet_results.csv',
    'ViT-Tiny': 'vit_results.csv',
    'Swin-Tiny': 'swin_results.csv'
}

ALPHA = 0.05  # family-wise significance level
# CrossViT is compared against every other model, so the number of comparisons
# is derived from `models` rather than hard-coded; the Bonferroni divisor then
# cannot drift out of sync when a model is added or removed.
N_COMPARISONS = len(models) - 1  # CrossViT vs 5 baselines
BONFERRONI_ALPHA = ALPHA / N_COMPARISONS

print(f"Significance level: α = {ALPHA}")
print(f"Bonferroni-corrected α' = {BONFERRONI_ALPHA:.4f}")

## 1. Load All Results

In [None]:
# Read each model's results CSV and keep only its 'test_acc' column,
# keyed by the model's display name.
results = {}
for model_name in models:
    accuracy_values = pd.read_csv(RESULTS_DIR / models[model_name])['test_acc'].values
    results[model_name] = accuracy_values
    print(f"{model_name:20s}: {accuracy_values}")

print("\n[OK] All results loaded")

## 2. Calculate 95% Confidence Intervals (Bootstrap)

In [None]:
def bootstrap_ci(data, n_bootstrap=10000, confidence=0.95, seed=None):
    """
    Calculate a percentile-bootstrap confidence interval for the mean.

    The data are resampled with replacement ``n_bootstrap`` times; the
    interval bounds are the matching percentiles of the resampled means.

    Args:
        data: 1-D array-like of accuracy values (flattened if nested)
        n_bootstrap: Number of bootstrap samples (must be >= 1)
        confidence: Confidence level, strictly between 0 and 1 (default 0.95)
        seed: Optional seed (or ``numpy.random.Generator``) for the
            resampling. Pass a fixed value to make the reported interval
            reproducible; ``None`` draws fresh entropy on every call.

    Returns:
        mean, lower_bound, upper_bound

    Raises:
        ValueError: If ``data`` is empty, ``n_bootstrap`` < 1, or
            ``confidence`` is not strictly between 0 and 1.
    """
    values = np.asarray(data, dtype=float).ravel()
    n = values.size
    if n == 0:
        raise ValueError("bootstrap_ci requires at least one observation")
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    rng = np.random.default_rng(seed)

    # Draw every resample in one call: row i holds the indices of resample i.
    # Memory use is n_bootstrap * n integers, which is small for per-seed results.
    resample_idx = rng.integers(0, n, size=(n_bootstrap, n))
    means = values[resample_idx].mean(axis=1)

    alpha = 1 - confidence
    lower, upper = np.percentile(means, [alpha / 2 * 100, (1 - alpha / 2) * 100])

    return values.mean(), lower, upper

# Bootstrap a 95% CI around every model's mean accuracy and print one table row each
print("95% CONFIDENCE INTERVALS (Bootstrap, n=10000)")
print("="*70)
print(f"{'Model':<20s} {'Mean':<12s} {'95% CI':<25s}")
print("-"*70)

ci_results = {}
for model_name in results:
    mean, lower, upper = bootstrap_ci(results[model_name])
    # Stored per model so the plot and the report cells can reuse the bounds
    ci_results[model_name] = dict(mean=mean, lower=lower, upper=upper)
    print(f"{model_name:<20s} {mean:>6.2f}%     [{lower:>6.2f}%, {upper:>6.2f}%]")

print("\n[OK] Confidence intervals calculated")

## 3. Hypothesis Testing: CrossViT vs Baselines

In [None]:
def cohens_d(group1, group2):
    """
    Return Cohen's d for two groups, standardised by the pooled sample SD.

    The sign follows ``mean(group1) - mean(group2)``.

    Conventional reading of |d|:
    - Small: d = 0.2
    - Medium: d = 0.5
    - Large: d = 0.8
    """
    size_a, size_b = len(group1), len(group2)
    sd_a = np.std(group1, ddof=1)
    sd_b = np.std(group2, ddof=1)

    # Pooled SD: each sample variance is weighted by its degrees of freedom
    weighted_ss = (size_a - 1) * sd_a**2 + (size_b - 1) * sd_b**2
    pooled_sd = np.sqrt(weighted_ss / (size_a + size_b - 2))

    mean_gap = np.mean(group1) - np.mean(group2)
    return mean_gap / pooled_sd

# Perform paired t-tests of CrossViT-Tiny against each baseline, judge them at
# the Bonferroni-corrected threshold, and record Cohen's d as the effect size.
crossvit_acc = results['CrossViT-Tiny']

print("HYPOTHESIS TESTING: CrossViT-Tiny vs Baselines")
print("="*90)
# "Cohen's d" is written as literal text rather than inside a replacement field:
# an escaped quote inside an f-string expression is a syntax error before Python 3.12.
print(f"{'Comparison':<30s} {'Mean Diff':<12s} {'t-stat':<10s} {'p-value':<12s} {'Significant':<15s} Cohen's d")
print("-"*90)

hypothesis_results = []

baseline_models = ['ResNet-50', 'DenseNet-121', 'EfficientNet-B0', 'ViT-Tiny', 'Swin-Tiny']

for baseline in baseline_models:
    baseline_acc = results[baseline]

    # Paired t-test (same splits used). ttest_rel pairs the values by position,
    # so this assumes both CSVs list their runs in the same order -- TODO confirm.
    # No `alternative` is passed, so scipy's default (two-sided) test is used.
    t_stat, p_value = ttest_rel(crossvit_acc, baseline_acc)

    # Effect size
    effect_size = cohens_d(crossvit_acc, baseline_acc)

    # Significance (Bonferroni-corrected)
    significant = "Yes*" if p_value < BONFERRONI_ALPHA else "No"

    mean_diff = np.mean(crossvit_acc) - np.mean(baseline_acc)

    print(f"CrossViT vs {baseline:<15s} {mean_diff:>+6.2f}%     {t_stat:>+6.3f}    {p_value:>8.4f}    {significant:<15s} {effect_size:>+6.3f}")

    # Keys containing an apostrophe use double quotes so no escaping is needed;
    # they become the column names of the exported CSV.
    hypothesis_results.append({
        'Comparison': f'CrossViT vs {baseline}',
        'Mean Difference (%)': mean_diff,
        't-statistic': t_stat,
        'p-value': p_value,
        "Significant (α' = 0.01)": significant,
        "Cohen's d": effect_size
    })

print("-"*90)
print("* Significant at Bonferroni-corrected α' = 0.01")
print("\n[OK] Hypothesis testing complete")

# Save results
hypothesis_df = pd.DataFrame(hypothesis_results)
hypothesis_df.to_csv(OUTPUT_DIR / 'hypothesis_testing_results.csv', index=False)
print(f"[OK] Results saved to: {OUTPUT_DIR / 'hypothesis_testing_results.csv'}")

## 4. Visualization: CI Plot

In [None]:
# Horizontal bar chart of mean accuracy with the bootstrap CI drawn as error bars
fig, ax = plt.subplots(figsize=(12, 8))

# Rank the models from highest to lowest mean accuracy
unsorted_names = list(ci_results.keys())
sorted_indices = np.argsort([ci_results[name]['mean'] for name in unsorted_names])[::-1]
model_names = [unsorted_names[i] for i in sorted_indices]
means = [ci_results[name]['mean'] for name in model_names]
lowers = [ci_results[name]['lower'] for name in model_names]
uppers = [ci_results[name]['upper'] for name in model_names]

y_pos = np.arange(len(model_names))

# Asymmetric error bars: distance from the mean down to / up to each CI bound
err_below = [m - l for m, l in zip(means, lowers)]
err_above = [u - m for m, u in zip(means, uppers)]

# CrossViT is highlighted in red, every other model in teal
colors = ['#FF6B6B' if 'CrossViT' in name else '#4ECDC4' for name in model_names]
ax.barh(y_pos, means, xerr=[err_below, err_above],
        color=colors, alpha=0.7, capsize=5)

ax.set_yticks(y_pos)
ax.set_yticklabels(model_names)
ax.set_xlabel('Test Accuracy (%)', fontsize=12)
ax.set_title('Model Performance with 95% Confidence Intervals (Bootstrap, n=10000)', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'confidence_intervals_plot.png', dpi=300, bbox_inches='tight')
plt.show()

print("[OK] CI plot saved")

## 5. Generate APA-Formatted Results Table

In [None]:
# APA-formatted table: one row of descriptive statistics per model, in the
# order already established by `model_names`.
print("\nAPA-FORMATTED TABLE (FOR THESIS CHAPTER 5)")
print("="*90)
print()
print("Table 1")
print("Descriptive Statistics and 95% Confidence Intervals for Model Performance")
print()
print(f"{'Model':<20s} {'M':<8s} {'SD':<8s} {'95% CI':<25s} {'N'}")
print("-"*70)

for model_name in model_names:
    mean = ci_results[model_name]['mean']
    sd = np.std(results[model_name], ddof=1)  # sample SD
    lower = ci_results[model_name]['lower']
    upper = ci_results[model_name]['upper']
    n = len(results[model_name])

    # Each cell uses the same field width as its header (20/8/8/25) so the
    # SD, CI and N values line up under their column titles.
    ci_text = f"[{lower:.2f}, {upper:.2f}]"
    print(f"{model_name:<20s} {mean:<8.2f} {sd:<8.2f} {ci_text:<25s} {n}")

print()
print("Note. M = mean accuracy (%), SD = standard deviation, CI = confidence interval, N = number of random seeds.")
print("Confidence intervals calculated using bootstrap method with 10,000 iterations.")

## 6. Summary for Thesis

### Key Findings

**Research Question:** Does CrossViT-Tiny achieve significantly higher accuracy than CNN baselines for COVID-19 classification?

**Answer:** [To be filled based on results]

**Statistical Evidence:**
- CrossViT-Tiny: Mean = X.XX% (95% CI [X.XX, X.XX])
- Best baseline: Model Y: Mean = X.XX% (95% CI [X.XX, X.XX])
- Mean difference: X.XX%
- Paired t-test: t(4) = X.XX, p = X.XXX
- Cohen's d = X.XX (small/medium/large effect)

**Conclusion:**
- H₁ supported/not supported at Bonferroni-corrected α' = 0.01
- [Interpret practical significance vs statistical significance]

---

## 7. Export Summary Report

In [None]:
# Create comprehensive summary report (plain text) from the descriptive
# statistics and the hypothesis-test records computed in the earlier cells.

# Keys of the hypothesis-test records that contain an apostrophe. Binding them
# to names keeps quotes and backslashes out of the f-string expressions below,
# which is a syntax error before Python 3.12.
cohens_d_key = "Cohen's d"
significant_key = "Significant (α' = 0.01)"

# encoding is explicit because the report contains non-ASCII characters
# (Δ, α, ₁) that the platform default encoding may be unable to write.
with open(OUTPUT_DIR / 'statistical_validation_summary.txt', 'w', encoding='utf-8') as f:
    f.write("STATISTICAL VALIDATION SUMMARY\n")
    f.write("="*80 + "\n\n")

    f.write("1. DESCRIPTIVE STATISTICS\n")
    f.write("-"*80 + "\n")
    for model_name in model_names:
        mean = ci_results[model_name]['mean']
        sd = np.std(results[model_name], ddof=1)
        lower = ci_results[model_name]['lower']
        upper = ci_results[model_name]['upper']
        f.write(f"{model_name}: M = {mean:.2f}%, SD = {sd:.2f}, 95% CI [{lower:.2f}, {upper:.2f}]\n")

    f.write("\n2. HYPOTHESIS TESTING (Bonferroni-corrected α' = 0.01)\n")
    f.write("-"*80 + "\n")
    for result in hypothesis_results:
        f.write(f"{result['Comparison']}: ")
        f.write(f"Δ = {result['Mean Difference (%)']:+.2f}%, ")
        f.write(f"t = {result['t-statistic']:+.3f}, ")
        f.write(f"p = {result['p-value']:.4f}, ")
        f.write(f"d = {result[cohens_d_key]:+.3f}, ")
        f.write(f"{result[significant_key]}\n")

    f.write("\n3. CONCLUSION\n")
    f.write("-"*80 + "\n")
    f.write("H₁: CrossViT achieves significantly higher accuracy than CNN baselines\n")
    f.write("Result: [Review p-values and effect sizes above]\n")

print("\n[OK] Summary report saved to: statistical_validation_summary.txt")
print("\n" + "="*80)
print("STATISTICAL VALIDATION COMPLETE")
print("="*80)
print("\nNext steps:")
print("1. Review hypothesis_testing_results.csv for detailed statistics")
print("2. Use confidence_intervals_plot.png in thesis Chapter 5")
print("3. Interpret results in context of practical significance")
print("4. Proceed to 13_error_analysis.ipynb")