# 🔬 Week 02 Algorithm Comparison

Comparing six classification algorithms on the same dataset to understand their strengths and weaknesses.

---


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.datasets import make_classification, load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Import all sklearn versions for fair comparison
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

import warnings
# Fix NumPy's global RNG so any draw from np.random repeats across runs.
np.random.seed(42)

# Suppress every warning category to keep the notebook output uncluttered.
warnings.simplefilter('ignore')

print('✅ All algorithms loaded!')


## Dataset: Iris Classification

Using the classic Iris dataset for comparison:
- **Samples**: 150
- **Features**: 4 (continuous)
- **Classes**: 3 (balanced)
- **Task**: Multi-class classification


In [None]:
# Fetch the Iris data and carve out a stratified 80/20 train/test split.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize features for the scale-sensitive models. The scaler learns its
# mean/variance from the training split only and is then applied to both
# splits, so no test-set statistics reach the training data.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')


## Performance Comparison


In [None]:
# Benchmark every classifier on the same Iris split: fit/predict wall time,
# hold-out accuracy and weighted F1, plus 5-fold cross-validation on the
# training split. The outcome is collected in `df_results`.
import pandas as pd
from sklearn.pipeline import make_pipeline

# Define all algorithms
algorithms = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN (k=5)': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'SVM (RBF)': SVC(kernel='rbf', random_state=42),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# Models that are trained and evaluated on the standardized features;
# the others receive the raw features.
needs_scaling = {'Logistic Regression', 'KNN (k=5)', 'SVM (RBF)'}

# Store results
results = []

for name, model in algorithms.items():
    # Choose scaled or unscaled data
    scaled = name in needs_scaling
    if scaled:
        X_tr, X_te = X_train_scaled, X_test_scaled
    else:
        X_tr, X_te = X_train, X_test

    # Training time. perf_counter() is a monotonic, high-resolution clock;
    # time.time() can be too coarse for fits that take about a millisecond.
    start = time.perf_counter()
    model.fit(X_tr, y_train)
    train_time = time.perf_counter() - start

    # Prediction time
    start = time.perf_counter()
    y_pred = model.predict(X_te)
    pred_time = time.perf_counter() - start

    # Metrics on the held-out test split
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Cross-validation on the raw training split. For scale-sensitive models
    # the scaler is placed inside a pipeline so it is re-fit on each training
    # fold; cross-validating the pre-scaled matrix would leak validation-fold
    # statistics into the preprocessing. cross_val_score clones the estimator,
    # so the fitted `model` above is left untouched.
    cv_estimator = make_pipeline(StandardScaler(), model) if scaled else model
    cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=5)

    results.append({
        'Algorithm': name,
        'Accuracy': acc,
        'F1 Score': f1,
        'CV Mean': cv_scores.mean(),
        'CV Std': cv_scores.std(),
        'Train Time (ms)': train_time * 1000,
        'Predict Time (ms)': pred_time * 1000,
    })

df_results = pd.DataFrame(results)
# With only 30 test samples accuracy ties are common; CV Mean breaks them so
# the ranking does not depend on the sort algorithm's tie handling.
df_results = df_results.sort_values(['Accuracy', 'CV Mean'], ascending=False)
print(df_results.to_string(index=False))


## Key Insights

### Top Performers
1. **Random Forest** - Most robust, little tuning needed
2. **SVM** - Excellent for this dataset size
3. **Logistic Regression** - Simple baseline, surprisingly good

### Speed Champions
1. **Naive Bayes** - Fastest training
2. **Decision Trees** - Fast both training and prediction
3. **Logistic Regression** - Good balance

### Trade-offs
- **Accuracy vs Speed**: Random Forest wins accuracy, Naive Bayes wins speed
- **Interpretability**: Decision Trees most interpretable
- **Scalability**: Naive Bayes scales best

---

## When to Use Which Algorithm

| Scenario | Recommended | Why |
|----------|-------------|-----|
| **Need high accuracy, have time** | Random Forest | Best performance, robust |
| **Real-time predictions** | Naive Bayes | Fastest |
| **Need to explain model** | Decision Trees | Interpretable |
| **High-dimensional data** | SVM | Effective in high-dimensional spaces; kernels handle non-linear boundaries |
| **Simple baseline** | Logistic Regression | Fast, interpretable |
| **Small dataset** | KNN | No explicit training phase; prediction cost grows with dataset size |

