# 🎯 Logistic Regression: Complete Implementation

## What You'll Learn
1. Derive cost function from maximum likelihood
2. Implement binary & multiclass from scratch
3. Add L1/L2 regularization
4. Visualize decision boundaries & ROC curves

---


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)
# Apply matplotlib's 'seaborn-v0_8' style sheet to all subsequent figures.
plt.style.use('seaborn-v0_8')
print('✅ Setup complete!')


---
# Chapter 1: Mathematical Foundation

## Sigmoid Function

$$\sigma(z) = \frac{1}{1 + e^{-z}}$$

**Properties:**
- Range: (0, 1)
- Derivative: $\sigma'(z) = \sigma(z)(1-\sigma(z))$
- Decision boundary at $z=0$, where $\sigma(z) = 0.5$


In [None]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise.

    The input is clamped to [-500, 500] before exponentiation, which keeps
    the argument of ``np.exp`` inside the range a float64 can represent.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def sigmoid_derivative(z):
    """Return the derivative of the sigmoid at ``z``: sigma(z) * (1 - sigma(z))."""
    activation = sigmoid(z)
    complement = 1 - activation
    return activation * complement

# Visualize the sigmoid (left panel) and its derivative (right panel)
# over 200 evenly spaced points in [-10, 10].
z = np.linspace(-10, 10, 200)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# Left: sigmoid curve, with a dashed red reference line at 0.5.
ax1.plot(z, sigmoid(z), 'b-', lw=2)
ax1.axhline(0.5, color='r', linestyle='--', alpha=0.5)
ax1.set_title('Sigmoid Function')
ax1.grid(True, alpha=0.3)
# Right: derivative of the sigmoid evaluated on the same grid.
ax2.plot(z, sigmoid_derivative(z), 'g-', lw=2)
ax2.set_title('Sigmoid Derivative')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


## Binary Cross-Entropy Loss

$$J(\mathbf{w}) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log(h(\mathbf{x}^{(i)})) + (1-y^{(i)}) \log(1-h(\mathbf{x}^{(i)})) \right]$$

**Gradient** (where $h(\mathbf{x}) = \sigma(\mathbf{w}^\top\mathbf{x} + b)$ is the predicted probability):

$$\frac{\partial J}{\partial \mathbf{w}} = \frac{1}{m} \sum_{i=1}^{m} (h(\mathbf{x}^{(i)}) - y^{(i)}) \mathbf{x}^{(i)}$$


In [None]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Minimises the mean binary cross-entropy, optionally adding an L1 or L2
    penalty on the weights. The bias term is never penalised.

    Args:
        learning_rate: Step size applied to every gradient-descent update.
        n_iterations: Number of gradient-descent steps performed by ``fit``.
        regularization: ``None`` for no penalty, ``'l1'`` or ``'l2'``.
        lambda_: Regularization strength. The penalty gradient is scaled by
            ``lambda_ / n_samples``.

    Raises:
        ValueError: If ``regularization`` is not ``None``, ``'l1'`` or ``'l2'``.
    """

    # Added inside log() so that a prediction of exactly 0 or 1 never yields -inf.
    _EPS = 1e-10
    _SUPPORTED_REGULARIZATION = (None, 'l1', 'l2')

    def __init__(self, learning_rate=0.01, n_iterations=1000, regularization=None, lambda_=0.01):
        # Reject unknown values up front: otherwise a typo such as 'L2' would
        # silently train an unregularized model.
        if regularization not in self._SUPPORTED_REGULARIZATION:
            raise ValueError(
                f"regularization must be None, 'l1' or 'l2', got {regularization!r}"
            )
        self.lr = learning_rate
        self.n_iters = n_iterations
        self.reg = regularization
        self.lambda_ = lambda_
        self.weights = None
        self.bias = None
        self.cost_history = []

    def fit(self, X, y):
        """Fit weights and bias to ``X`` / ``y`` by gradient descent.

        Any previous fit is discarded: weights, bias and ``cost_history``
        are re-initialised on every call.

        Args:
            X: 2-D feature matrix of shape (n_samples, n_features).
            y: Targets with one value per sample. A column vector of shape
                (n_samples, 1) is accepted and flattened.

        Returns:
            The fitted estimator (``self``), to allow call chaining.

        Raises:
            ValueError: If ``y`` does not contain exactly ``n_samples`` values.
        """
        n_samples, n_features = X.shape

        # Flatten the targets: a (n_samples, 1) column would otherwise broadcast
        # against the 1-D predictions into an (n_samples, n_samples) matrix.
        y = np.asarray(y, dtype=float).ravel()
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} values"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0
        # Start a fresh history so repeated fits do not accumulate old costs.
        self.cost_history = []
        eps = self._EPS

        for _ in range(self.n_iters):
            # Forward pass
            y_pred = sigmoid(X @ self.weights + self.bias)

            # Mean binary cross-entropy of the current predictions
            cost = -np.mean(y * np.log(y_pred + eps) + (1 - y) * np.log(1 - y_pred + eps))

            # Regularization penalty (weights only)
            if self.reg == 'l2':
                cost += (self.lambda_ / (2 * n_samples)) * np.sum(self.weights ** 2)
            elif self.reg == 'l1':
                cost += (self.lambda_ / n_samples) * np.sum(np.abs(self.weights))

            self.cost_history.append(cost)

            # Gradients of the unregularized loss
            residual = y_pred - y
            dw = (1 / n_samples) * (X.T @ residual)
            db = (1 / n_samples) * np.sum(residual)

            # Gradient (L2) / subgradient (L1) of the penalty
            if self.reg == 'l2':
                dw += (self.lambda_ / n_samples) * self.weights
            elif self.reg == 'l1':
                dw += (self.lambda_ / n_samples) * np.sign(self.weights)

            # Gradient-descent step
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return self

    def predict_proba(self, X):
        """Return the predicted probability of the positive class for each row of ``X``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.weights is None:
            raise RuntimeError(
                "LogisticRegression is not fitted yet; call fit() before predicting"
            )
        z = X @ self.weights + self.bias
        return sigmoid(z)

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the predicted probability is >= ``threshold``."""
        return (self.predict_proba(X) >= threshold).astype(int)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` measured against ``y``."""
        return accuracy_score(y, self.predict(X))


---
# Chapter 2: Application to Real Data


In [None]:
# Load the breast cancer dataset and hold out 20% of the samples for testing.
# A fixed random_state makes the split reproducible.
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize: the scaler is fitted on the training split only and the same
# transform is then applied to the test split, so no test statistics leak
# into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train models: three variants sharing the learning rate and iteration count,
# differing only in the regularization setting.
models = {
    'No Reg': LogisticRegression(learning_rate=0.1, n_iterations=1000),
    'L2 Reg': LogisticRegression(learning_rate=0.1, n_iterations=1000, regularization='l2', lambda_=0.1),
    'L1 Reg': LogisticRegression(learning_rate=0.1, n_iterations=1000, regularization='l1', lambda_=0.1)
}

# results maps each model name to its {'train': ..., 'test': ...} accuracies.
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    train_acc = model.score(X_train_scaled, y_train)
    test_acc = model.score(X_test_scaled, y_test)
    results[name] = {'train': train_acc, 'test': test_acc}
    print(f'{name:10} | Train: {train_acc:.4f} | Test: {test_acc:.4f}')


In [None]:
# ROC Curves: one curve per trained model, computed on the scaled test split.
fig, ax = plt.subplots(figsize=(10, 7))
for name, model in models.items():
    # roc_curve is given the predicted probabilities, not thresholded labels;
    # the returned thresholds are not needed here and are discarded.
    y_pred_proba = model.predict_proba(X_test_scaled)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.3f})')

# Dashed diagonal drawn as the 'Random' reference line.
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves: Logistic Regression Variants')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()


---
# Summary

| Concept | Formula | Purpose |
|---------|---------|----------|
| Sigmoid | $\sigma(z) = 1/(1+e^{-z})$ | Squeeze to (0, 1) |
| BCE Loss | $-y\log(h) - (1-y)\log(1-h)$ | Measure error |
| Gradient | $(h-y)\mathbf{x}$ | Update direction |
| L2 Reg | $\frac{\lambda}{2m} \lVert\mathbf{w}\rVert^2$ | Prevent overfitting |
