<a href="https://colab.research.google.com/github/Masuzyo/Thrivesummer/blob/main/Central_Limit_Theorem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.datasets import load_diabetes,load_iris

# Central Limit Theorem

### Theory
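The Central Limit Theorem states that, for independent samples drawn from a population with finite mean $\mu$ and finite variance $\sigma^2$, the sampling distribution of the sample mean approaches a normal distribution with mean $\mu$ and standard deviation $\sigma/\sqrt{n}$ as the sample size $n$ increases, regardless of the shape of the population's distribution.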

## $\chi^2$ (Chi-square) Distribution

In [None]:
# Simulation settings for the chi-square demonstration.
sample_sizes = [5, 10, 50, 1000]
n_samples = 5000
k = 1  # degrees of freedom

# Plot the chi-square density with k degrees of freedom.
# Note: `stats.chi2` is the chi-square distribution. `stats.chi` is the chi
# distribution (the square root of a chi-square variable), which is a
# different curve and not the population sampled in this section.
# The grid starts slightly above 0 because the chi-square(1) density is
# unbounded at x = 0, which would otherwise put an infinite value on the plot.
x = np.linspace(0.01, 4, 1000)
y = stats.chi2(df=k).pdf(x)

plt.plot(x, y)
plt.title(f'Chi-square density (df = {k})')
plt.xlabel('x')
plt.ylabel('Density')
plt.grid(True)
plt.show()

In [None]:
# CLT example: distribution of sample means drawn from a chi-square population.
# For each sample size n we draw n_samples independent samples, average each
# one, and compare the histogram of those averages with the normal curve the
# Central Limit Theorem predicts.

sample_sizes = [5, 10, 50, 1000]
n_samples = 5000
k = 1  # degrees of freedom of the chi-square population

# Seeded generator so that Restart & Run All reproduces the same figures.
rng = np.random.default_rng(42)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for ax, n in zip(axes, sample_sizes):
    # Draw every sample in one call: each row is one sample of size n,
    # and the row-wise mean gives the n_samples sample means.
    sample_means = rng.chisquare(df=k, size=(n_samples, n)).mean(axis=1)

    # Shapiro-Wilk normality test on the sample means; a small p-value
    # indicates a detectable departure from normality.
    res = stats.shapiro(sample_means)

    ax.hist(sample_means, bins=30, density=True, alpha=0.7)
    ax.set_title(f'Sample Size = {n},p-value={res.pvalue:.3f}')
    ax.set_xlabel('Sample Mean')
    ax.set_ylabel('Density')
    ax.grid(True)

    # Overlay the theoretical normal distribution: a chi-square(k) population
    # has mean k and variance 2k, so the sample mean has standard deviation
    # sqrt(2k) / sqrt(n).
    mu_theory = k
    sigma_theory = np.sqrt(2 * k) / np.sqrt(n)

    x = np.linspace(sample_means.min(), sample_means.max(), 100)
    ax.plot(x, stats.norm.pdf(x, mu_theory, sigma_theory), 'r-', linewidth=2)

plt.tight_layout()
plt.show()



In [None]:
# CLT example on real data: the target column of the diabetes dataset is
# treated as the full population, and samples are drawn from it with replacement.
diabetes = load_diabetes()
population = diabetes.target

# Demonstrate CLT with different sample sizes
sample_sizes = [5, 10, 30, 1000]
n_samples = 5000

# Seeded generator so that Restart & Run All reproduces the same figures.
rng = np.random.default_rng(42)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

fig.suptitle('Central Limit Theorem Demonstration', fontsize=16)

for ax, n in zip(axes, sample_sizes):
    # Draw every sample in one call: each row is one sample of size n taken
    # with replacement, and the row-wise mean gives the n_samples sample means.
    # Only the means are kept; the raw samples are not needed afterwards.
    sample_means = rng.choice(population, size=(n_samples, n), replace=True).mean(axis=1)

    # Shapiro-Wilk normality test on the sample means; a small p-value
    # indicates a detectable departure from normality.
    res = stats.shapiro(sample_means)

    ax.hist(sample_means, bins=30, density=True, alpha=0.7)
    ax.set_title(f'Sample Size = {n},p-value={res.pvalue:.3f}')
    ax.set_xlabel('Sample Mean')
    ax.set_ylabel('Density')
    ax.grid(True)

    # Overlay the theoretical normal distribution. np.std uses ddof=0 by
    # default, i.e. the population standard deviation, which is the right
    # quantity because the dataset itself plays the role of the population.
    mu_theory = np.mean(population)
    sigma_theory = np.std(population) / np.sqrt(n)
    x = np.linspace(sample_means.min(), sample_means.max(), 100)
    ax.plot(x, stats.norm.pdf(x, mu_theory, sigma_theory), 'r-', linewidth=2)

plt.tight_layout()
plt.show()

### Exercise 2: Central Limit Theorem Verification
**Task**: Using the iris dataset, demonstrate the Central Limit Theorem using petal width measurements.

1. Extract petal width data from the iris dataset
2. Create sampling distributions for sample sizes: 3, 10, 25, 50
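3. Plot histograms showing how the sampling distribution becomes more normal as the sample size increases
4. Calculate the theoretical (mean $\mu$, standard deviation $\sigma/\sqrt{n}$) and empirical means and standard deviations of each sampling distribution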
5. Comment on how the sampling distribution changes with sample size

**Your Code Here:**

In [None]:
# Load the iris dataset with load_iris (imported from sklearn.datasets at the
# top of the notebook); the exercise uses its petal width measurements as the
# population to sample from.
iris = load_iris()
# Your solution here