# Week 5: Sampling Methods

**Course:** Statistics for Data Science II (BSMA1004)  
**Week:** 5 of 12

## Learning Objectives
- Understand simple random sampling
- Apply stratified and cluster sampling
- Master sampling distributions
- Verify Central Limit Theorem

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Seed NumPy's global random generator so the random draws in this notebook are repeatable.
np.random.seed(42)
# Apply the 'seaborn-v0_8-whitegrid' style sheet to subsequent matplotlib figures.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic (not plain Python): render figures inline in the notebook output.
%matplotlib inline

# Confirmation message shown once the setup statements above have run.
print('✓ Libraries loaded')

## 1. Simple Random Sampling (SRS)

In simple random sampling, every possible sample of size $n$ has the same probability of being selected.

**Properties:**
- $E[\bar{X}] = \mu$ (unbiased)
- $\text{Var}(\bar{X}) = \frac{\sigma^2}{n}$ (with replacement)
- $\text{SE}(\bar{X}) = \frac{\sigma}{\sqrt{n}}$

In [None]:
# Simulated population: 10,000 draws from a normal distribution (mean 100, sd 15).
# mu and sigma are the population's own mean and standard deviation.
np.random.seed(42)
population = np.random.normal(100, 15, 10000)
mu, sigma = population.mean(), population.std()

# Sample sizes to compare, and the number of repeated samples drawn per size
sample_sizes = [5, 10, 30, 50, 100]
n_samples = 1000

# 2x3 grid of panels; one panel per sample size (the sixth is left unused)
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for ax, n in zip(axes, sample_sizes):
    # Empirical sampling distribution: the means of n_samples samples of
    # size n, each drawn with replacement from the population.
    sample_means = []
    for _ in range(n_samples):
        sample_means.append(np.random.choice(population, n).mean())
    mean_of_means = np.mean(sample_means)

    # Theoretical standard error of the mean for this sample size
    se = sigma / np.sqrt(n)

    ax.hist(sample_means, bins=50, alpha=0.7, edgecolor='black', density=True)
    ax.axvline(mu, color='red', linestyle='--', label=f'μ={mu:.1f}')
    ax.axvline(mean_of_means, color='green', linestyle=':',
               label=f'Mean={mean_of_means:.1f}')

    # Theory curve: Normal(mu, se) evaluated over mu ± 4 standard errors
    x = np.linspace(mu - 4 * se, mu + 4 * se, 100)
    ax.plot(x, stats.norm.pdf(x, mu, se), 'b-', linewidth=2, label='Theory')

    ax.set_title(f'n={n}, SE={se:.2f}', fontweight='bold')
    ax.set_xlabel('Sample Mean')
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide the unused sixth panel
axes[-1].axis('off')
plt.tight_layout()
plt.show()

print(f"Population: μ={mu:.2f}, σ={sigma:.2f}")

## 2. Central Limit Theorem

### Theorem
For independent, identically distributed observations from any distribution with finite mean μ and finite variance σ² > 0:

$$\frac{\bar{X} - \mu}{\sigma/\sqrt{n}} \xrightarrow{d} N(0,1)$$

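As n → ∞, the distribution of the standardized sample mean approaches the standard normal distribution; equivalently, for large n, $\bar{X}$ is approximately $N(\mu, \sigma^2/n)$.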

In [None]:
# CLT demonstration with a non-normal distribution.
# The exponential distribution is highly skewed, so any bell shape that
# appears in the histograms of sample means comes from averaging, not from
# the underlying data.
np.random.seed(42)  # seed here too, so this cell is reproducible when run on its own
lam = 0.5
n_samples = 10000
sample_sizes = [1, 5, 10, 30]

# An exponential distribution with rate lam has mean 1/lam and standard
# deviation 1/lam. These and the plotting grid do not depend on n, so they
# are computed once instead of on every loop iteration.
mu = 1/lam
sigma = 1/lam
x = np.linspace(0, 8, 100)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, n in enumerate(sample_sizes):
    # Draw all n_samples samples of size n in one call (one row per sample)
    # and average each row. For n == 1 the row mean is just the raw draw, so
    # no special case is needed.
    samples = np.random.exponential(1/lam, size=(n_samples, n)).mean(axis=1)

    axes[idx].hist(samples, bins=50, alpha=0.7, edgecolor='black', density=True)

    # Overlay the normal density the CLT predicts: mean mu, sd sigma/sqrt(n)
    axes[idx].plot(x, stats.norm.pdf(x, mu, sigma/np.sqrt(n)),
                   'r-', linewidth=2, label='Normal approximation')

    axes[idx].set_title(f'n={n}', fontweight='bold', fontsize=14)
    axes[idx].set_xlabel('Sample Mean')
    axes[idx].set_ylabel('Density')
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)

plt.suptitle('Central Limit Theorem: Exponential → Normal',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

print("Notice: As n increases, distribution becomes more normal!")

## 3. Stratified Sampling

Divide the population into non-overlapping groups (strata), then draw a separate random sample from each stratum.

**Advantages:**
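- Reduces the variance of the estimate when units within each stratum are similar to one another
- Ensures every stratum is represented in the sample
- More efficient than SRS (smaller standard error for the same total sample size) when the strata means differ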

In [None]:
# Create stratified population (income by education level).
# Each stratum has its own size and its own normal income distribution.
np.random.seed(42)
strata = {
    'High School': {'size': 5000, 'mean': 40000, 'std': 10000},
    'Bachelor': {'size': 3000, 'mean': 60000, 'std': 15000},
    'Graduate': {'size': 2000, 'mean': 85000, 'std': 20000}
}

# Create population: build one frame per stratum and concatenate once,
# rather than repeatedly re-copying a growing DataFrame inside a loop.
population_df = pd.concat(
    [
        pd.DataFrame({
            'education': name,
            'income': np.random.normal(params['mean'], params['std'], params['size'])
        })
        for name, params in strata.items()
    ],
    ignore_index=True
)

true_mean = population_df['income'].mean()
print(f"True population mean income: ${true_mean:,.2f}")

# Compare SRS vs Stratified
n_samples = 1000
sample_size = 100

# Simple random sampling: sample_size rows drawn without replacement from the
# whole population, repeated n_samples times.
srs_means = [population_df.sample(sample_size)['income'].mean() for _ in range(n_samples)]

# Stratified sampling (proportional allocation).
# The incomes belonging to each stratum and the number of units to draw from
# each stratum are the same for every replicate, so both are computed once
# here instead of re-filtering the full population inside the loop.
stratum_income = {
    name: population_df.loc[population_df['education'] == name, 'income']
    for name in strata
}
stratum_n = {
    name: int(sample_size * params['size'] / len(population_df))
    for name, params in strata.items()
}

stratified_means = []
for _ in range(n_samples):
    # Draw from every stratum (in the order of `strata`) and pool the draws;
    # under proportional allocation the pooled mean estimates the population mean.
    strat_sample = np.concatenate([
        stratum_income[name].sample(stratum_n[name]).to_numpy()
        for name in strata
    ])
    stratified_means.append(strat_sample.mean())

# Empirical standard errors: the standard deviation of the simulated means
srs_se = np.std(srs_means)
stratified_se = np.std(stratified_means)

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

axes[0].hist(srs_means, bins=50, alpha=0.7, edgecolor='black', label='SRS')
axes[0].axvline(true_mean, color='red', linestyle='--', linewidth=2, label='True μ')
axes[0].set_xlabel('Sample Mean Income')
axes[0].set_ylabel('Frequency')
axes[0].set_title(f'Simple Random Sampling\nSE={srs_se:,.0f}', fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

axes[1].hist(stratified_means, bins=50, alpha=0.7, edgecolor='black',
             color='green', label='Stratified')
axes[1].axvline(true_mean, color='red', linestyle='--', linewidth=2, label='True μ')
axes[1].set_xlabel('Sample Mean Income')
axes[1].set_ylabel('Frequency')
axes[1].set_title(f'Stratified Sampling\nSE={stratified_se:,.0f}', fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nSRS SE: ${srs_se:,.2f}")
print(f"Stratified SE: ${stratified_se:,.2f}")
# Gain is reported as the relative reduction in standard error versus SRS
print(f"Efficiency gain: {(1 - stratified_se/srs_se)*100:.1f}%")

## 4. Summary & Key Takeaways

### 📚 Core Concepts
1. **SRS**: Equal probability, unbiased, $SE = \sigma/\sqrt{n}$
2. **CLT**: Sample mean → Normal as n increases
3. **Stratified**: Reduces variance, ensures representation

### 🚀 Next Week
**Week 6: Estimation Theory**

---
**🎓 End of Week 5**