# Week 4, Day 5: Anomaly Detection

## Learning Objectives
- Understand anomaly detection concepts
- Learn different detection methods
- Master outlier analysis techniques
- Practice implementing anomaly detection

## Topics Covered
1. Statistical Methods
2. Isolation Forest
3. One-Class SVM
4. Local Outlier Factor (LOF)

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor

## 1. Statistical Methods

In [None]:
def statistical_methods():
    """Flag outliers in synthetic 2-D data with the Z-score and IQR rules.

    Builds 300 standard-normal points plus 20 points drawn uniformly from
    [-4, 4), applies both rules feature by feature, and shows three scatter
    plots side by side: the raw data, the Z-score result and the IQR result.
    Returns nothing; the only output is the figure.
    """
    # Imported locally because the notebook's import cell never brings
    # scipy.stats into scope; without this, stats.zscore raises NameError.
    from scipy import stats

    # Fixed seed so the figure is reproducible between runs
    np.random.seed(42)
    n_samples = 300

    # Normal data
    X_normal = np.random.normal(0, 1, (n_samples, 2))

    # Add outliers
    X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
    X = np.vstack([X_normal, X_outliers])

    # Z-score method: a row is an outlier when either feature lies more
    # than 3 standard deviations from that feature's mean
    z_scores = np.abs(stats.zscore(X))
    outliers_z = np.any(z_scores > 3, axis=1)

    # IQR method: quartiles are computed per feature (axis=0); a row is an
    # outlier when either feature falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    Q1 = np.percentile(X, 25, axis=0)
    Q3 = np.percentile(X, 75, axis=0)
    IQR = Q3 - Q1
    outliers_iqr = np.any((X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR)), axis=1)

    # Visualize results
    plt.figure(figsize=(15, 5))

    # Original data
    plt.subplot(131)
    plt.scatter(X[:, 0], X[:, 1], alpha=0.5)
    plt.title('Original Data')

    # Z-score method
    plt.subplot(132)
    plt.scatter(X[~outliers_z, 0], X[~outliers_z, 1], label='Normal')
    plt.scatter(X[outliers_z, 0], X[outliers_z, 1], color='red', label='Outlier')
    plt.title('Z-score Method')
    plt.legend()

    # IQR method
    plt.subplot(133)
    plt.scatter(X[~outliers_iqr, 0], X[~outliers_iqr, 1], label='Normal')
    plt.scatter(X[outliers_iqr, 0], X[outliers_iqr, 1], color='red', label='Outlier')
    plt.title('IQR Method')
    plt.legend()

    plt.tight_layout()
    plt.show()

statistical_methods()

## 2. Isolation Forest

In [None]:
def isolation_forest_example():
    """Run Isolation Forest on synthetic 2-D data and plot its predictions.

    The data set is 300 standard-normal points stacked on top of 20 points
    drawn uniformly from [-4, 4). Points predicted as 1 are plotted as
    'Normal', points predicted as -1 are plotted in red as 'Outlier', and
    the size of each group is printed afterwards.
    """
    np.random.seed(42)
    n_samples = 300

    # Draw the Gaussian cloud first and the uniform noise second so the
    # seeded random stream is consumed in a fixed order.
    gaussian_cloud = np.random.normal(loc=0, scale=1, size=(n_samples, 2))
    uniform_noise = np.random.uniform(low=-4, high=4, size=(20, 2))
    X = np.concatenate([gaussian_cloud, uniform_noise], axis=0)

    # contamination is the expected proportion of outliers in the data
    detector = IsolationForest(contamination=0.1, random_state=42)
    y_pred = detector.fit_predict(X)

    is_inlier = y_pred == 1
    is_outlier = y_pred == -1

    # One scatter call per group; only the outlier group overrides the colour
    plt.figure(figsize=(10, 6))
    groups = (
        (is_inlier, 'Normal', {}),
        (is_outlier, 'Outlier', {'color': 'red'}),
    )
    for mask, label, style in groups:
        plt.scatter(X[mask, 0], X[mask, 1], label=label, **style)
    plt.title('Isolation Forest Results')
    plt.legend()
    plt.show()

    # Summary counts
    print("Number of outliers:", is_outlier.sum())
    print("Number of normal points:", is_inlier.sum())

isolation_forest_example()

## 3. One-Class SVM

In [None]:
def oneclass_svm_example():
    """Run a One-Class SVM on synthetic 2-D data and plot its predictions.

    The data set is 300 standard-normal points stacked on top of 20 points
    drawn uniformly from [-4, 4). The model is fitted on standardized
    features, but the scatter plot uses the original coordinates. Points
    predicted as 1 are 'Normal', points predicted as -1 are 'Outlier', and
    the size of each group is printed afterwards.
    """
    np.random.seed(42)
    n_samples = 300

    # Draw the Gaussian cloud first and the uniform noise second so the
    # seeded random stream is consumed in a fixed order.
    gaussian_cloud = np.random.normal(loc=0, scale=1, size=(n_samples, 2))
    uniform_noise = np.random.uniform(low=-4, high=4, size=(20, 2))
    X = np.concatenate([gaussian_cloud, uniform_noise], axis=0)

    # Standardize before fitting; the scaled copy is used only by the model
    X_scaled = StandardScaler().fit_transform(X)

    detector = OneClassSVM(nu=0.1, kernel='rbf')
    y_pred = detector.fit_predict(X_scaled)

    is_inlier = y_pred == 1
    is_outlier = y_pred == -1

    # Plot the unscaled points, coloured by the model's verdict
    plt.figure(figsize=(10, 6))
    groups = (
        (is_inlier, 'Normal', {}),
        (is_outlier, 'Outlier', {'color': 'red'}),
    )
    for mask, label, style in groups:
        plt.scatter(X[mask, 0], X[mask, 1], label=label, **style)
    plt.title('One-Class SVM Results')
    plt.legend()
    plt.show()

    # Summary counts
    print("Number of outliers:", is_outlier.sum())
    print("Number of normal points:", is_inlier.sum())

oneclass_svm_example()

## 4. Local Outlier Factor (LOF)

In [None]:
def lof_example():
    """Run Local Outlier Factor on synthetic 2-D data and plot the results.

    The data set is 300 standard-normal points stacked on top of 20 points
    drawn uniformly from [-4, 4). The figure has two panels: the left one
    colours points by predicted label (1 = 'Normal', -1 = 'Outlier'), the
    right one colours every point by its LOF score with a colour bar. The
    size of each predicted group is printed afterwards.
    """
    np.random.seed(42)
    n_samples = 300

    # Draw the Gaussian cloud first and the uniform noise second so the
    # seeded random stream is consumed in a fixed order.
    gaussian_cloud = np.random.normal(loc=0, scale=1, size=(n_samples, 2))
    uniform_noise = np.random.uniform(low=-4, high=4, size=(20, 2))
    X = np.concatenate([gaussian_cloud, uniform_noise], axis=0)

    detector = LocalOutlierFactor(contamination=0.1, n_neighbors=20)
    y_pred = detector.fit_predict(X)

    # The fitted attribute stores the negated factor; flip the sign back
    lof_scores = -detector.negative_outlier_factor_

    is_inlier = y_pred == 1
    is_outlier = y_pred == -1

    plt.figure(figsize=(15, 5))

    # Left panel: predicted labels
    plt.subplot(1, 2, 1)
    groups = (
        (is_inlier, 'Normal', {}),
        (is_outlier, 'Outlier', {'color': 'red'}),
    )
    for mask, label, style in groups:
        plt.scatter(X[mask, 0], X[mask, 1], label=label, **style)
    plt.title('LOF Classifications')
    plt.legend()

    # Right panel: continuous LOF score per point
    plt.subplot(1, 2, 2)
    score_points = plt.scatter(X[:, 0], X[:, 1], c=lof_scores, cmap='viridis')
    plt.colorbar(score_points)
    plt.title('LOF Scores')

    plt.tight_layout()
    plt.show()

    # Summary counts
    print("Number of outliers:", is_outlier.sum())
    print("Number of normal points:", is_inlier.sum())

lof_example()

## Practical Exercises

In [None]:
# Exercise 1: Credit Card Fraud Detection

def credit_card_fraud():
    """Build the synthetic transaction data set for Exercise 1.

    Generates 1000 normal and 50 fraudulent transactions, each described by
    an amount (log-normal) and a time (uniform on [0, 24)), stacks them into
    a two-column array with the fraudulent rows last, and prints its shape.
    The detection steps are left to the reader.
    """
    np.random.seed(42)
    n_samples = 1000
    n_fraud = 50

    # The four draws stay in this order so the seeded stream yields the
    # same values as before: normal amount, normal time, fraud amount,
    # fraud time.
    normal_amounts = np.random.lognormal(mean=4, sigma=0.5, size=n_samples)
    normal_times = np.random.uniform(low=0, high=24, size=n_samples)
    fraud_amounts = np.random.lognormal(mean=6, sigma=1, size=n_fraud)
    fraud_times = np.random.uniform(low=0, high=24, size=n_fraud)

    # Column 0 = amount, column 1 = time; normal rows first, fraud rows last
    amounts = np.hstack([normal_amounts, fraud_amounts])
    times = np.hstack([normal_times, fraud_times])
    X = np.stack([amounts, times], axis=1)

    print("Dataset shape:", X.shape)

    # Task: Detect fraudulent transactions
    # 1. Scale the features
    # 2. Apply multiple detection methods
    # 3. Compare results
    # 4. Analyze detection performance

    # Your code here

credit_card_fraud()

In [None]:
# Exercise 2: Network Intrusion Detection

def network_intrusion():
    """Build the synthetic network-traffic data set for Exercise 2.

    Generates 1000 normal and 50 anomalous (DDoS-like) observations with
    three features (traffic, latency and packet rate), stacks them into a
    three-column array with the anomalous rows last, and prints its shape.
    The detection steps are left to the reader.
    """
    np.random.seed(42)
    n_samples = 1000
    n_anomalies = 50

    # The six draws stay in this order so the seeded stream yields the
    # same values as before.
    # Normal network traffic
    normal_traffic = np.random.normal(loc=100, scale=20, size=n_samples)   # bytes/s
    normal_latency = np.random.normal(loc=50, scale=10, size=n_samples)    # ms
    normal_packets = np.random.poisson(lam=100, size=n_samples)            # packets/s

    # Anomalous traffic (DDoS attack): every feature is shifted far upwards
    attack_traffic = np.random.normal(loc=500, scale=50, size=n_anomalies)
    attack_latency = np.random.normal(loc=200, scale=30, size=n_anomalies)
    attack_packets = np.random.poisson(lam=1000, size=n_anomalies)

    # Columns: traffic, latency, packets; normal rows first, anomalies last
    feature_columns = [
        np.hstack([normal_traffic, attack_traffic]),
        np.hstack([normal_latency, attack_latency]),
        np.hstack([normal_packets, attack_packets]),
    ]
    X = np.stack(feature_columns, axis=1)

    print("Dataset shape:", X.shape)

    # Task: Detect network anomalies
    # 1. Preprocess the data
    # 2. Implement detection methods
    # 3. Visualize results
    # 4. Compare detection methods

    # Your code here

network_intrusion()

## MCQ Quiz

1. Which method is best for high-dimensional data?
   - a) Z-score
   - b) IQR
   - c) Isolation Forest
   - d) LOF

2. What is the main advantage of Isolation Forest?
   - a) Linear complexity
   - b) No parameters
   - c) Perfect accuracy
   - d) Works with categorical data

3. What does the contamination parameter control?
   - a) Training speed
   - b) Expected proportion of outliers
   - c) Number of features
   - d) Model complexity

4. Which method is density-based?
   - a) Isolation Forest
   - b) One-Class SVM
   - c) LOF
   - d) Z-score

5. What is the time complexity of LOF?
   - a) O(n)
   - b) O(n log n)
   - c) O(n²)
   - d) O(n³)

6. Which method requires feature scaling?
   - a) Isolation Forest
   - b) One-Class SVM
   - c) IQR
   - d) Z-score

7. What is the main limitation of statistical methods?
   - a) Slow computation
   - b) High memory usage
   - c) Assumes normal distribution
   - d) Complex implementation

8. Which method is most suitable for streaming data?
   - a) LOF
   - b) One-Class SVM
   - c) Statistical methods
   - d) Isolation Forest

9. What does One-Class SVM learn?
   - a) Cluster centers
   - b) Decision boundary
   - c) Feature importance
   - d) Distance metrics

10. Which is NOT an application of anomaly detection?
    - a) Fraud detection
    - b) Network security
    - c) Image classification
    - d) System monitoring

Answers: 1-c, 2-a, 3-b, 4-c, 5-c, 6-b, 7-c, 8-c, 9-b, 10-c