# Statistics Basics - Foundation of Data Science

Learn the statistical concepts that are essential for machine learning (ML).

## Table of Contents
1. [Descriptive Statistics](#desc)
2. [Probability Distributions](#prob)
3. [Hypothesis Testing](#hyp)
4. [Correlation](#corr)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Switch on seaborn's default plot theme (called with no arguments, so all
# theme settings are seaborn's defaults).
sns.set_theme()

## Descriptive Statistics <a id='desc'></a>

In [None]:
# Generate sample data: 1000 draws from Normal(mean=100, sd=15). The seed is
# fixed so the printed numbers and the plots are reproducible.
np.random.seed(42)
data = np.random.normal(100, 15, 1000)

# Continuous draws are (almost surely) all distinct, so an exact-value mode
# such as stats.mode(data) degenerates to the smallest observation with a
# count of 1. Estimate the mode as the centre of the most populated histogram
# bin instead, using the same 30 bins as the histogram plotted below.
bin_counts, bin_edges = np.histogram(data, bins=30)
peak_bin = np.argmax(bin_counts)
mode_estimate = (bin_edges[peak_bin] + bin_edges[peak_bin + 1]) / 2

print("Descriptive Statistics:")
print(f"Mean: {np.mean(data):.2f}")
print(f"Median: {np.median(data):.2f}")
print(f"Mode (histogram peak, 30 bins): {mode_estimate:.2f}")
# np.std / np.var are called with their defaults (ddof=0), i.e. they divide
# by N rather than N - 1.
print(f"Std Dev: {np.std(data):.2f}")
print(f"Variance: {np.var(data):.2f}")
print(f"Min: {np.min(data):.2f}")
print(f"Max: {np.max(data):.2f}")
print(f"25th percentile: {np.percentile(data, 25):.2f}")
print(f"75th percentile: {np.percentile(data, 75):.2f}")

# Visualize: histogram with mean/median markers (left) and box plot (right).
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.hist(data, bins=30, edgecolor='black')
plt.axvline(np.mean(data), color='r', linestyle='--', label='Mean')
plt.axvline(np.median(data), color='g', linestyle='--', label='Median')
plt.legend()
plt.title('Distribution')

plt.subplot(1, 2, 2)
plt.boxplot(data)
plt.title('Box Plot')
plt.tight_layout()
plt.show()

## Probability Distributions <a id='prob'></a>

In [None]:
# Normal(0, 1): evaluate the density on 100 evenly spaced points in [-4, 4].
x = np.linspace(-4, 4, num=100)
y = stats.norm.pdf(x, loc=0, scale=1)

# Draw the density curve and shade the area underneath it.
density_fig, density_ax = plt.subplots(figsize=(10, 6))
density_ax.plot(x, y, label='Normal(0,1)')
density_ax.fill_between(x, y, alpha=0.3)
density_ax.set_xlabel('Value')
density_ax.set_ylabel('Probability Density')
density_ax.set_title('Normal Distribution')
density_ax.legend()
density_ax.grid(True)
plt.show()

# Draw 1000 random samples and overlay their density-normalised histogram
# with the theoretical curve computed above.
samples = np.random.normal(loc=0, scale=1, size=1000)
samples_fig, samples_ax = plt.subplots(figsize=(10, 6))
samples_ax.hist(samples, bins=30, density=True, alpha=0.7, label='Samples')
samples_ax.plot(x, y, 'r', label='Theoretical')
samples_ax.legend()
samples_ax.set_title('Sampled vs Theoretical Normal Distribution')
plt.show()

## Hypothesis Testing <a id='hyp'></a>

In [None]:
def _report_t_test(statistic, pvalue):
    """Print a t-test's statistic, its p-value and whether p < 0.05."""
    print(f"t-statistic: {statistic:.4f}")
    print(f"p-value: {pvalue:.4f}")
    print(f"Significant at 0.05 level: {pvalue < 0.05}")


# One-sample t-test: 30 draws from Normal(105, 15), tested against a
# hypothesised population mean of 100.
sample = np.random.normal(loc=105, scale=15, size=30)
t_stat, p_value = stats.ttest_1samp(sample, popmean=100)

print("One-sample t-test:")
print(f"Sample mean: {sample.mean():.2f}")
_report_t_test(t_stat, p_value)

# Two-sample t-test: two groups of 50 draws whose generating means are 100
# and 105, compared with stats.ttest_ind at its default settings.
# Note that t_stat / p_value are rebound to this second test's results.
group1 = np.random.normal(loc=100, scale=15, size=50)
group2 = np.random.normal(loc=105, scale=15, size=50)
t_stat, p_value = stats.ttest_ind(group1, group2)

print("\nTwo-sample t-test:")
print(f"Group 1 mean: {group1.mean():.2f}")
print(f"Group 2 mean: {group2.mean():.2f}")
_report_t_test(t_stat, p_value)

## Correlation <a id='corr'></a>

In [None]:
# Build a linearly related pair: y is 2 * x plus Gaussian noise scaled by 0.5.
# The seed is reset so this cell's numbers do not depend on earlier cells.
np.random.seed(42)
x = np.random.randn(100)
noise = np.random.randn(100) * 0.5
y = 2 * x + noise

# Correlation coefficients; the accompanying p-values are not used.
pearson_corr, _pearson_p = stats.pearsonr(x, y)
spearman_corr, _spearman_p = stats.spearmanr(x, y)

print(f"Pearson correlation: {pearson_corr:.4f}")
print(f"Spearman correlation: {spearman_corr:.4f}")

# Scatter plot of the pair, titled with the Pearson coefficient.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(x, y, alpha=0.5)
scatter_ax.set_xlabel('X')
scatter_ax.set_ylabel('Y')
scatter_ax.set_title(f'Correlation: {pearson_corr:.2f}')
scatter_ax.grid(True)
plt.show()

## Summary

Covered:
- Descriptive statistics (mean, median, mode, standard deviation, variance, percentiles)
- Probability distributions
- Hypothesis testing (t-tests)
- Correlation analysis

Next: **Feature Engineering** for ML preparation!