# Week 4, Day 3: Principal Component Analysis (PCA)

## Learning Objectives
- Understand dimensionality reduction concepts
- Learn the PCA algorithm and its underlying mathematics
- Master variance explained analysis
- Practice implementing PCA

## Topics Covered
1. Dimensionality Reduction
2. PCA Algorithm
3. Explained Variance
4. Applications

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits, load_breast_cancer

## 1. Basic PCA Example

In [None]:
def basic_pca_example():
    """Demonstrate PCA on two strongly correlated synthetic features.

    Draws 300 samples in which the second feature equals the first plus
    Gaussian noise, standardizes both features, and fits a PCA that keeps
    every component. Shows the standardized data next to its projection
    onto the principal components, then prints the explained variance
    ratio of each component.
    """
    # Seed the global NumPy RNG so the figure and printed ratios are
    # reproducible between runs.
    np.random.seed(42)
    sample_count = 300

    # The second feature is the first one plus a little noise, which is
    # what makes the two columns correlated.
    base = np.random.normal(0, 1, sample_count)
    noisy_copy = base + np.random.normal(0, 0.3, sample_count)
    features = np.column_stack([base, noisy_copy])

    # Standardize, then project onto all principal components.
    standardized = StandardScaler().fit_transform(features)
    pca = PCA()
    projected = pca.fit_transform(standardized)

    fig, (data_ax, pc_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: the standardized input.
    data_ax.scatter(standardized[:, 0], standardized[:, 1], alpha=0.5)
    data_ax.set_title('Original Data')
    data_ax.set_xlabel('Feature 1')
    data_ax.set_ylabel('Feature 2')

    # Right panel: the same points expressed in principal-component axes.
    pc_ax.scatter(projected[:, 0], projected[:, 1], alpha=0.5)
    pc_ax.set_title('Data in PC Space')
    pc_ax.set_xlabel('First Principal Component')
    pc_ax.set_ylabel('Second Principal Component')

    fig.tight_layout()
    plt.show()

    # Report the share of total variance carried by each component.
    print("Explained Variance Ratios:")
    for number, share in enumerate(pca.explained_variance_ratio_, start=1):
        print(f"PC{number}: {share:.4f}")

basic_pca_example()

## 2. Dimensionality Reduction

In [None]:
def dimensionality_reduction_example(variance_threshold=0.95):
    """Show how many principal components the digits dataset needs.

    Loads the scikit-learn digits dataset, standardizes it, fits a PCA that
    keeps every component, and then:

    1. plots the cumulative explained variance ratio with a horizontal
       reference line at ``variance_threshold``;
    2. plots the samples on the first two principal components, colored
       by digit label;
    3. prints the smallest number of components whose cumulative explained
       variance ratio reaches ``variance_threshold``.

    Args:
        variance_threshold: Target fraction of total variance, in the
            interval (0, 1]. Defaults to 0.95.

    Raises:
        ValueError: If ``variance_threshold`` is outside (0, 1].
    """
    if not 0.0 < variance_threshold <= 1.0:
        raise ValueError("variance_threshold must be in the interval (0, 1]")

    # Load digits dataset
    digits = load_digits()
    X = digits.data
    y = digits.target

    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Apply PCA (all components are kept so the full variance curve is available)
    pca = PCA()
    X_pca = pca.fit_transform(X_scaled)

    # Plot cumulative explained variance ratio
    plt.figure(figsize=(10, 6))
    cumsum = np.cumsum(pca.explained_variance_ratio_)
    plt.plot(range(1, len(cumsum) + 1), cumsum, 'bo-')
    plt.axhline(y=variance_threshold, color='r', linestyle='--')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.title('Explained Variance vs. Number of Components')
    plt.show()

    # Visualize first two principal components
    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')
    plt.colorbar(scatter)
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.title('Digits Dataset in PC Space')
    plt.show()

    # Find the number of components that reaches the threshold.
    # np.argmax on a boolean mask returns 0 when no entry is True, which
    # would wrongly report a single component; this can happen when the
    # threshold is 1.0 and rounding leaves the final cumulative sum just
    # below it. In that case every component is needed.
    reached = cumsum >= variance_threshold
    if reached.any():
        n_components_needed = int(np.argmax(reached)) + 1
    else:
        n_components_needed = len(cumsum)
    print(
        f"Number of components needed for {variance_threshold * 100:g}% "
        f"variance: {n_components_needed}"
    )

dimensionality_reduction_example()

## 3. Feature Analysis

In [None]:
def feature_analysis_example():
    """Inspect how the breast cancer features contribute to each component.

    Standardizes the scikit-learn breast cancer dataset, fits a PCA that
    keeps every component, and then shows a heatmap of the component
    weights for the first five components, a scree plot of the explained
    variance ratios, and the five features with the largest absolute
    weight on each of the first three components.
    """
    # Load the dataset and keep the feature names for labelling.
    dataset = load_breast_cancer()
    names = dataset.feature_names

    # Standardize, then fit PCA with all components retained.
    standardized = StandardScaler().fit_transform(dataset.data)
    pca = PCA()
    pca.fit(standardized)

    # One row per feature, one column per component (hence the transpose).
    pc_labels = [f'PC{k}' for k in range(1, pca.n_components_ + 1)]
    loadings = pd.DataFrame(pca.components_.T, columns=pc_labels, index=names)

    # Heatmap restricted to the first five components to stay readable.
    plt.figure(figsize=(12, 8))
    first_five = loadings.iloc[:, :5]
    sns.heatmap(first_five, annot=True, cmap='coolwarm', center=0)
    plt.title('Feature Loadings (First 5 PCs)')
    plt.show()

    # Scree plot: explained variance ratio per component.
    ratios = pca.explained_variance_ratio_
    plt.figure(figsize=(10, 6))
    plt.bar(range(1, len(ratios) + 1), ratios)
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.title('Scree Plot')
    plt.show()

    # Rank features by absolute weight so sign does not hide strong ones.
    print("\nTop features for first 3 principal components:")
    for k in range(1, 4):
        ranked = loadings[f'PC{k}'].abs().sort_values(ascending=False)
        print(f"\nPC{k} top features:")
        print(ranked.head())

feature_analysis_example()

## Practical Exercises

In [None]:
# Exercise 1: Image Compression

def image_compression_exercise():
    """Exercise scaffold: compress a sample image with PCA.

    Loads the first scikit-learn sample image, draws it in the left half
    of a two-panel figure, and prints its array shape. The compression
    itself is left for the student to implement.
    """
    from sklearn.datasets import load_sample_images

    # Take the first of the bundled sample images.
    picture = load_sample_images().images[0]

    # Left panel holds the original; the right panel is left free for the
    # reconstructed image.
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(picture)
    plt.title('Original Image')
    plt.axis('off')

    print("Image shape:", picture.shape)

    # Task: Compress image using PCA
    # 1. Prepare image data
    # 2. Apply PCA
    # 3. Reconstruct image
    # 4. Compare original and compressed images

    # Your code here

image_compression_exercise()

In [None]:
# Exercise 2: Stock Market Analysis

def stock_analysis_exercise():
    """Exercise scaffold: find common factors in synthetic stock returns.

    Builds 1000 days of returns for 10 stocks. Each stock's return is a
    random multiple (between 0.5 and 1.5) of one shared market factor plus
    its own Gaussian noise. Prints the first rows of the resulting
    DataFrame; the PCA analysis is left for the student to implement.
    """
    n_days, n_stocks = 1000, 10

    # Seed the global NumPy RNG so every run produces the same returns.
    np.random.seed(42)

    # A single market-wide factor drives the correlation between stocks.
    market_factor = np.random.normal(0, 1, n_days)

    # For each stock draw its market sensitivity first and its own noise
    # second, so the sequence of random draws stays fixed.
    returns_by_stock = {}
    for number in range(1, n_stocks + 1):
        beta = np.random.uniform(0.5, 1.5)
        own_noise = np.random.normal(0, 0.5, n_days)
        returns_by_stock[f'Stock_{number}'] = beta * market_factor + own_noise

    stocks_df = pd.DataFrame(returns_by_stock)

    print("Sample of stock returns:")
    print(stocks_df.head())

    # Task: Analyze stock market factors using PCA
    # 1. Prepare the data
    # 2. Apply PCA
    # 3. Analyze components
    # 4. Visualize results

    # Your code here

stock_analysis_exercise()

## MCQ Quiz

1. What is the main purpose of PCA?
   - a) Classification
   - b) Dimensionality reduction
   - c) Clustering
   - d) Regression

2. What does explained variance ratio represent?
   - a) Error rate
   - b) Proportion of variance captured
   - c) Number of components
   - d) Feature importance

3. Which preprocessing step is crucial for PCA?
   - a) Feature scaling
   - b) Feature selection
   - c) Outlier removal
   - d) Missing value imputation

4. What are principal components?
   - a) Original features
   - b) Linear combinations of features
   - c) Random projections
   - d) Cluster centers

5. How are principal components ordered?
   - a) Alphabetically
   - b) By variance explained
   - c) Randomly
   - d) By correlation

6. What is a scree plot used for?
   - a) Feature selection
   - b) Component selection
   - c) Outlier detection
   - d) Clustering

7. What is the maximum number of principal components (assuming there are more samples than features)?
   - a) Unlimited
   - b) Number of features
   - c) Number of samples
   - d) Fixed number

8. Which property is NOT true for principal components?
   - a) Orthogonal
   - b) Ordered by variance
   - c) Correlated
   - d) Linear combinations

9. What is the time complexity of the eigendecomposition step of PCA, where n is the number of features?
   - a) O(n)
   - b) O(n log n)
   - c) O(n²)
   - d) O(n³)

10. When should you NOT use PCA?
    - a) High dimensional data
    - b) Correlated features
    - c) Categorical data
    - d) Noisy data

Answers: 1-b, 2-b, 3-a, 4-b, 5-b, 6-b, 7-b, 8-c, 9-d, 10-c