# PCA: Dimensionality Reduction and Data Compression

> **"PCA finds the most important directions in your data."**

## Learning Objectives
- Understand the mathematical foundation of Principal Component Analysis
- Implement PCA from scratch using eigenvalue decomposition
- Learn about variance explained and component selection
- Master visualization techniques for high-dimensional data
- Apply PCA to real-world dimensionality reduction problems


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_iris, load_wine
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

# Global notebook setup: plot appearance and NumPy's random seed.
# Use matplotlib's 'seaborn-v0_8' style sheet for every figure drawn below.
plt.style.use('seaborn-v0_8')
# Make seaborn's "husl" palette the default colour cycle.
sns.set_palette("husl")
# Seed NumPy's global random number generator. The scikit-learn calls in the
# cells shown here that accept random_state pass it explicitly as well.
np.random.seed(42)

# Printed once the imports and setup above have run without raising.
print("Libraries imported successfully!")


## 1. Principal Component Analysis (PCA) Fundamentals

### What is PCA?
PCA is a dimensionality reduction technique that transforms data into a new coordinate system where the first coordinate (first principal component) has the largest possible variance, the second coordinate has the second largest variance, and so on.

### Mathematical Foundation

#### 1. Covariance Matrix
$C = \frac{1}{n-1}X^TX$

Where X is the centered data matrix.

#### 2. Eigenvalue Decomposition
$C = V\Lambda V^T$

Where:
- V is the matrix whose columns are the eigenvectors of C (the principal component directions)
- Λ is the diagonal matrix of eigenvalues (the variance captured along each principal component)

#### 3. Transformation
$Y = XV$

Where Y is the transformed data in the new coordinate system.


In [None]:
# Synthetic classification problem: 1000 samples with 10 features
# (3 informative, 7 redundant).
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=3,
    n_redundant=7,
    random_state=42,
)

# Standardize the features before running PCA on them.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Dataset Overview:")
print("=" * 50)
print(f"Original shape: {X.shape}")
print(f"Scaled shape: {X_scaled.shape}")

# Fit PCA without limiting the number of components so the variance of every
# component can be inspected.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Per-component share of the variance and its running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

print("\nExplained Variance Ratio:")
print("=" * 50)
for pc_number, (ratio, running_total) in enumerate(
    zip(explained_variance_ratio, cumulative_variance_ratio), start=1
):
    print(f"PC{pc_number}: {ratio:.3f} (Cumulative: {running_total:.3f})")

# Two side-by-side panels: per-component variance and cumulative variance.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
variance_ax, cumulative_ax = axes
component_numbers = range(1, len(explained_variance_ratio) + 1)

# Left panel: bar chart of the variance share of each component.
variance_ax.bar(component_numbers, explained_variance_ratio, alpha=0.7)
variance_ax.set(
    xlabel='Principal Component',
    ylabel='Explained Variance Ratio',
    title='Explained Variance by Component',
)
variance_ax.grid(True, alpha=0.3)

# Right panel: cumulative curve with the 95% and 99% reference lines.
cumulative_ax.plot(component_numbers, cumulative_variance_ratio, 'o-', linewidth=2)
for level, line_color, line_label in (
    (0.95, 'r', '95% variance'),
    (0.99, 'g', '99% variance'),
):
    cumulative_ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.7,
                          label=line_label)
cumulative_ax.set(
    xlabel='Number of Components',
    ylabel='Cumulative Explained Variance Ratio',
    title='Cumulative Explained Variance',
)
cumulative_ax.legend()
cumulative_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# np.argmax on a boolean array gives the index of the first True, i.e. the
# first component at which the running total reaches the threshold; +1 turns
# that zero-based index into a component count.
n_components_95, n_components_99 = (
    np.argmax(cumulative_variance_ratio >= threshold) + 1
    for threshold in (0.95, 0.99)
)

print(f"\nNumber of components for 95% variance: {n_components_95}")
print(f"Number of components for 99% variance: {n_components_99}")


In [None]:
# Compare classifiers trained on PCA projections of increasing dimensionality.
#
# The scaler and every PCA are fitted on the training rows only and then
# applied to all rows, so the held-out rows never influence the preprocessing.
# Fitting them on the full dataset would leak test-set statistics into the
# features the classifier is evaluated on.
n_components_list = [2, 3, 5, 7, 10]
results = {}

# Split the row indices once so every component count is evaluated on exactly
# the same train/test partition.
train_idx, test_idx = train_test_split(
    np.arange(X.shape[0]), test_size=0.2, random_state=42
)
y_train, y_test = y[train_idx], y[test_idx]

# Standardize using statistics of the training rows only, then apply that
# scaling to every row.
split_scaler = StandardScaler().fit(X[train_idx])
X_split_scaled = split_scaler.transform(X)

for n_comp in n_components_list:
    # Learn the projection from the training rows, then project all rows.
    pca_reduced = PCA(n_components=n_comp).fit(X_split_scaled[train_idx])
    X_pca_reduced = pca_reduced.transform(X_split_scaled)
    X_train, X_test = X_pca_reduced[train_idx], X_pca_reduced[test_idx]

    # Train classifier on the reduced training data
    clf = LogisticRegression(random_state=42)
    clf.fit(X_train, y_train)

    # Accuracy on the held-out rows
    accuracy = accuracy_score(y_test, clf.predict(X_test))

    # Share of the training-set variance retained by the kept components
    explained_variance = np.sum(pca_reduced.explained_variance_ratio_)

    results[n_comp] = {
        'accuracy': accuracy,
        'explained_variance': explained_variance,
        'data_shape': X_pca_reduced.shape
    }

# Visualize results
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Component counts in evaluation order, with the matching metrics.
n_components = list(results.keys())
accuracies = [results[n]['accuracy'] for n in n_components]
explained_variances = [results[n]['explained_variance'] for n in n_components]

# Accuracy vs number of components
axes[0].plot(n_components, accuracies, 'o-', linewidth=2, markersize=8)
axes[0].set_xlabel('Number of Components')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Classification Accuracy vs Number of Components')
axes[0].grid(True, alpha=0.3)

# Add value labels slightly above each marker
for n_comp, acc in zip(n_components, accuracies):
    axes[0].text(n_comp, acc + 0.01, f'{acc:.3f}', ha='center', va='bottom')

# Explained variance vs number of components
axes[1].plot(n_components, explained_variances, 's-', linewidth=2, markersize=8, color='green')
axes[1].set_xlabel('Number of Components')
axes[1].set_ylabel('Explained Variance Ratio')
axes[1].set_title('Explained Variance vs Number of Components')
axes[1].grid(True, alpha=0.3)

# Add value labels slightly above each marker
for n_comp, ratio in zip(n_components, explained_variances):
    axes[1].text(n_comp, ratio + 0.01, f'{ratio:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Print results table
print("\nResults Summary:")
print("=" * 80)
print(f"{'Components':<12} {'Accuracy':<10} {'Explained Var':<15} {'Data Shape':<15}")
print("-" * 80)
for n_comp in n_components:
    result = results[n_comp]
    print(f"{n_comp:<12} {result['accuracy']:<10.3f} {result['explained_variance']:<15.3f} {str(result['data_shape']):<15}")

# Visualize 2D projection. This plot is purely descriptive (no model is
# evaluated on it), so the PCA is fitted on the full standardized dataset.
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)

plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=y, cmap='viridis', alpha=0.7)
plt.xlabel(f'First Principal Component ({pca_2d.explained_variance_ratio_[0]:.3f})')
plt.ylabel(f'Second Principal Component ({pca_2d.explained_variance_ratio_[1]:.3f})')
plt.title('2D PCA Projection')
plt.colorbar(scatter)
plt.grid(True, alpha=0.3)
plt.show()
