# In [None]:
# 5_中心极限定理.ipynb
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, skew, kurtosis
from utils import BatteryDataLoader

# 1. Load the measured battery-lifetime data. It is treated as the
#    "population" that every later resampling step draws from.
#    NOTE(review): `results_part1` is not created in this cell; presumably an
#    earlier part of the notebook defines it. Confirm before running alone.
lifetimes = results_part1['lifetimes']
true_mean, true_std = np.mean(lifetimes), np.std(lifetimes)

# Population summary: mean, standard deviation, skewness and kurtosis,
# assembled first and emitted in a single print call.
population_report = [
    "=== 电池寿命总体参数 ===",
    f"总体均值 μ = {true_mean:.1f}",
    f"总体标准差 σ = {true_std:.1f}",
    f"总体偏度 = {skew(lifetimes):.3f}",
    f"总体峰度 = {kurtosis(lifetimes):.3f}",
]
print("\n".join(population_report))

# 2. Law of large numbers demo
# Seed NumPy's global generator so the resampling below is reproducible.
np.random.seed(42)

# Sample sizes 10, 20, ... up to (at most) the number of observations.
sample_sizes = np.arange(10, len(lifetimes) + 1, 10)

# For each size, draw one resample with replacement and record its mean.
# The draws happen in increasing order of size, one RNG call per size.
sample_means = [
    np.mean(np.random.choice(lifetimes, size, replace=True))
    for size in sample_sizes
]

# 3. Central limit theorem demo
n_samples = 50  # observations per resample
n_repetitions = 1000  # number of resamples

# Draw every resample up front (one RNG call per repetition, in order),
# then reduce each one to its mean and its standard deviation.
resamples = [
    np.random.choice(lifetimes, n_samples, replace=True)
    for _ in range(n_repetitions)
]

sample_means_clt = np.array([np.mean(draw) for draw in resamples])
sample_stds = np.array([np.std(draw) for draw in resamples])

# 4. Visualisation: a single figure holding a 2x3 grid of panels
plt.figure(figsize=(15, 10))

# 4.1 Law of large numbers: resample mean plotted against sample size,
#     with the population mean as a dashed horizontal reference.
ax_lln = plt.subplot(2, 3, 1)
ax_lln.plot(sample_sizes, sample_means, 'b-', linewidth=1.5, alpha=0.7)
ax_lln.axhline(true_mean, color='red', linestyle='--', linewidth=2, label=f'真实均值 μ={true_mean:.1f}')
ax_lln.set_xlabel('样本量 n')
ax_lln.set_ylabel('样本均值')
ax_lln.set_title('大数定律：样本均值收敛于总体均值')
ax_lln.legend()
ax_lln.grid(True, alpha=0.3)

# 4.2 Density histogram of the raw lifetime data (the "population")
ax_population = plt.subplot(2, 3, 2)
ax_population.hist(lifetimes, bins=20, density=True, alpha=0.6, edgecolor='black')
ax_population.set_xlabel('电池寿命（循环次数）')
ax_population.set_ylabel('概率密度')
ax_population.set_title('电池寿命总体分布')
ax_population.grid(True, alpha=0.3)

# 4.3 Distribution of the resample means (CLT)
ax_clt = plt.subplot(2, 3, 3)
hist_vals, bins, _ = ax_clt.hist(
    sample_means_clt,
    bins=30,
    density=True,
    alpha=0.6,
    edgecolor='black',
    label='样本均值分布',
)

# Normal curve predicted by the CLT: centred on the population mean with
# standard error sigma / sqrt(n), evaluated over the observed range of means.
se = true_std / np.sqrt(n_samples)
x_norm = np.linspace(np.min(sample_means_clt), np.max(sample_means_clt), 1000)
pdf_norm = norm.pdf(x_norm, true_mean, se)
ax_clt.plot(x_norm, pdf_norm, 'r-', linewidth=2, label='理论正态分布')

ax_clt.set_xlabel('样本均值')
ax_clt.set_ylabel('概率密度')
ax_clt.set_title(f'中心极限定理（n={n_samples}）')
ax_clt.legend()
ax_clt.grid(True, alpha=0.3)

# 4.4 Q-Q plot: ordered resample means against the quantiles of a normal
#     distribution fitted to them (same mean and standard deviation).
plt.subplot(2, 3, 4)
sorted_means = np.sort(sample_means_clt)
n_means = len(sorted_means)

# Midpoint plotting positions (i - 0.5) / n stay strictly inside (0, 1).
# The plain ECDF i / n equals 1.0 for the largest observation, where
# norm.ppf returns +inf: that point then has no finite x position and the
# reference line below would get an infinite endpoint.
ecdf = (np.arange(1, n_means + 1) - 0.5) / n_means
theoretical_quantiles = norm.ppf(ecdf, loc=np.mean(sorted_means), scale=np.std(sorted_means))

plt.scatter(theoretical_quantiles, sorted_means, alpha=0.6)

# 45-degree reference line across the (finite) theoretical range; the
# quantiles are increasing, so the first and last entries are the extremes.
q_low, q_high = theoretical_quantiles[0], theoretical_quantiles[-1]
plt.plot([q_low, q_high], [q_low, q_high], 'r--', alpha=0.8, label='45度线')
plt.xlabel('理论正态分位数')
plt.ylabel('样本均值分位数')
plt.title('样本均值分布的Q-Q图')
plt.legend()
plt.grid(True, alpha=0.3)

# 4.5 Confidence-interval construction: build n_ci normal-approximation 95%
#     intervals from independent resamples and record how many contain the
#     population mean.
plt.subplot(2, 3, 5)
n_ci = 100  # number of intervals to draw
ci_lower = []
ci_upper = []
captures = []

z_alpha = norm.ppf(0.975)  # two-sided 95% critical value of the standard normal

for _ in range(n_ci):
    sample = np.random.choice(lifetimes, n_samples, replace=True)
    sample_mean = np.mean(sample)
    # ddof=1 gives the usual sample standard deviation. The default ddof=0
    # is biased low, which makes the intervals slightly too narrow.
    sample_std = np.std(sample, ddof=1)

    margin = z_alpha * (sample_std / np.sqrt(n_samples))
    lower = sample_mean - margin
    upper = sample_mean + margin

    ci_lower.append(lower)
    ci_upper.append(upper)
    captures.append(lower <= true_mean <= upper)

# Draw each interval as a point (its centre) with a symmetric error bar
# (its half-width).
lower_bounds = np.asarray(ci_lower)
upper_bounds = np.asarray(ci_upper)
plt.errorbar(np.arange(n_ci), (lower_bounds + upper_bounds) / 2,
             yerr=(upper_bounds - lower_bounds) / 2,
             fmt='o', alpha=0.5, markersize=3, capsize=3)

plt.axhline(true_mean, color='red', linestyle='-', linewidth=2, label=f'真实均值 μ={true_mean:.1f}')
plt.xlabel('模拟实验序号')
plt.ylabel('电池寿命均值')
plt.title(f'{n_ci}个95%置信区间（{np.mean(captures)*100:.1f}%包含真值）')
plt.legend()
plt.grid(True, alpha=0.3)

# 4.6 Effect of the sample size on the distribution of the resample mean
from scipy.stats import gaussian_kde  # imported once, not on every loop pass

plt.subplot(2, 3, 6)
sample_sizes_clt = [10, 30, 50, 100]
colors = ['blue', 'green', 'orange', 'red']

for n, color in zip(sample_sizes_clt, colors):
    # 500 resample means for this sample size
    means_temp = [
        np.mean(np.random.choice(lifetimes, n, replace=True))
        for _ in range(500)
    ]

    # Kernel density estimate of those means, drawn over their observed range
    kde = gaussian_kde(means_temp)
    x_plot = np.linspace(min(means_temp), max(means_temp), 100)
    plt.plot(x_plot, kde(x_plot), color=color, linewidth=2, label=f'n={n}')

# Theoretical normal reference curve. It is drawn for one sample size only,
# so the legend entry names that size instead of implying it applies to all
# curves. The x-range spans +/- 3 standard errors of the smallest sample
# size, i.e. the widest of the plotted distributions.
n_theory = 50
widest_se = true_std / np.sqrt(min(sample_sizes_clt))
x_theory = np.linspace(true_mean - 3 * widest_se, true_mean + 3 * widest_se, 100)
plt.plot(x_theory, norm.pdf(x_theory, true_mean, true_std / np.sqrt(n_theory)),
         'k--', linewidth=2, label=f'理论正态（n={n_theory}）')

plt.xlabel('样本均值')
plt.ylabel('概率密度')
plt.title('不同样本量下的样本均值分布')
plt.legend()
plt.grid(True, alpha=0.3)

# Tidy the layout of the current figure, write it to disk as a PNG, then
# display it.
fig = plt.gcf()
fig.tight_layout()
fig.savefig('中心极限定理.png', dpi=300, bbox_inches='tight')
plt.show()

# 5. Numerical check of the CLT on the simulated resample means: their mean,
#    their standard deviation next to the theoretical standard error, and
#    their skewness and kurtosis.
clt_report = [
    f"\n=== 中心极限定理验证 ===",
    f"样本均值分布的均值: {np.mean(sample_means_clt):.2f}",
    f"样本均值分布的标准差（理论SE={true_std/np.sqrt(n_samples):.2f}）: {np.std(sample_means_clt):.2f}",
    f"样本均值分布的偏度: {skew(sample_means_clt):.3f}",
    f"样本均值分布的峰度: {kurtosis(sample_means_clt):.3f}",
]
print("\n".join(clt_report))

# Shapiro-Wilk normality test on the resample means
from scipy.stats import shapiro, anderson

shapiro_stat, shapiro_p = shapiro(sample_means_clt)
normality_report = [
    f"\nShapiro-Wilk正态性检验:",
    f"统计量: {shapiro_stat:.4f}, P值: {shapiro_p:.4f}",
]
print("\n".join(normality_report))

# Worked confidence-interval example. The simulated resample means are used
# as the data: interval = mean of means +/- z * s / sqrt(number of means),
# with s computed using ddof=1.
n_means = len(sample_means_clt)
sample_mean_example = sample_means_clt.mean()
sample_std_example = sample_means_clt.std(ddof=1)
margin_example = z_alpha * (sample_std_example / np.sqrt(n_means))
ci_example = (
    sample_mean_example - margin_example,
    sample_mean_example + margin_example,
)

# Report the interval and whether it contains the population mean.
ci_report = [
    f"\n基于CLT的置信区间:",
    f"样本均值: {sample_mean_example:.2f}",
    f"95%置信区间: ({ci_example[0]:.2f}, {ci_example[1]:.2f})",
    f"真实均值 μ={true_mean:.1f} {'在区间内' if ci_example[0] <= true_mean <= ci_example[1] else '不在区间内'}",
]
print("\n".join(ci_report))

# Collect this part's outputs: the raw resample means plus the summary
# numbers used to check the CLT (empirical mean and standard deviation of the
# means, the theoretical standard error, and the Shapiro-Wilk p-value).
clt_verification = dict(
    mean_of_means=np.mean(sample_means_clt),
    std_of_means=np.std(sample_means_clt),
    theoretical_se=true_std / np.sqrt(n_samples),
    normality_test_p=shapiro_p,
)
results_part5 = dict(
    sample_means_clt=sample_means_clt,
    clt_verification=clt_verification,
)