In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_regression, make_classification, load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, log_loss
import warnings
warnings.filterwarnings('ignore')

# Set style and random seed
# NOTE(review): 'seaborn-v0_8' is presumably the Matplotlib-bundled seaborn
# style sheet; confirm the name exists in the installed Matplotlib version.
plt.style.use('seaborn-v0_8')
# Seed NumPy's global RNG so the random draws in this notebook are repeatable.
np.random.seed(42)

print("Libraries imported successfully!")


In [None]:
class LossFunctions:
    """Collection of loss functions implemented from scratch.

    Every method takes two array-likes of matching (or broadcastable) shape
    and returns the loss averaged over all elements. Inputs are converted to
    float NumPy arrays first, so plain Python lists are accepted as well.
    """

    @staticmethod
    def _as_arrays(y_true, y_pred):
        """Convert both inputs to float NumPy arrays."""
        return np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)

    @staticmethod
    def mse_loss(y_true, y_pred):
        """Mean Squared Error: mean((y_true - y_pred)^2)."""
        y_true, y_pred = LossFunctions._as_arrays(y_true, y_pred)
        return np.mean((y_true - y_pred) ** 2)

    @staticmethod
    def mae_loss(y_true, y_pred):
        """Mean Absolute Error: mean(|y_true - y_pred|)."""
        y_true, y_pred = LossFunctions._as_arrays(y_true, y_pred)
        return np.mean(np.abs(y_true - y_pred))

    @staticmethod
    def huber_loss(y_true, y_pred, delta=1.0):
        """Huber Loss - combines MSE and MAE.

        Quadratic (0.5 * r^2) for absolute residuals up to ``delta`` and
        linear (delta * (|r| - 0.5 * delta)) beyond it.
        """
        y_true, y_pred = LossFunctions._as_arrays(y_true, y_pred)
        residual = np.abs(y_true - y_pred)
        return np.mean(np.where(
            residual <= delta,
            0.5 * residual ** 2,
            delta * (residual - 0.5 * delta)
        ))

    @staticmethod
    def cross_entropy_loss(y_true, y_pred):
        """Binary Cross-Entropy Loss.

        ``y_pred`` holds probabilities; they are clipped to
        [1e-15, 1 - 1e-15] so that log(0) is never evaluated.
        """
        y_true, y_pred = LossFunctions._as_arrays(y_true, y_pred)
        # Clip predictions to prevent log(0)
        y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    @staticmethod
    def hinge_loss(y_true, y_pred):
        """Hinge Loss for SVMs.

        ``y_true`` is expected to hold labels in {0, 1}; they are mapped to
        {-1, 1} before the margin is computed. ``y_pred`` is the raw score.
        """
        y_true, y_pred = LossFunctions._as_arrays(y_true, y_pred)
        # Convert labels to {-1, 1}
        y_true_signed = 2 * y_true - 1
        return np.mean(np.maximum(0, 1 - y_true_signed * y_pred))

    @staticmethod
    def log_cosh_loss(y_true, y_pred):
        """Logarithm of hyperbolic cosine of the residual.

        Evaluated as |r| + log1p(exp(-2|r|)) - log(2), which is algebraically
        equal to log(cosh(r)) but does not overflow for large |r| (a direct
        np.cosh call returns inf once |r| exceeds roughly 710).
        """
        y_true, y_pred = LossFunctions._as_arrays(y_true, y_pred)
        abs_residual = np.abs(y_pred - y_true)
        return np.mean(
            abs_residual + np.log1p(np.exp(-2.0 * abs_residual)) - np.log(2.0)
        )

# Exercise the loss functions on synthetic data
def test_loss_functions():
    """Evaluate several regression losses on noisy data and plot the results.

    Builds a noisy linear data set with five perturbed points, scores a fixed
    linear predictor with each loss, and draws a 2x3 figure: the data, a bar
    chart of the loss values, and four loss-versus-residual curves.

    Returns:
        dict mapping each loss label to its value on the synthetic data.
    """
    # --- synthetic data: y = 2x + 1 plus Gaussian noise -------------------
    np.random.seed(42)
    n_samples = 100
    X = np.linspace(-3, 3, n_samples)
    noise = np.random.normal(0, 0.5, n_samples)
    y_true = 2 * X + 1 + noise

    # Perturb five randomly chosen points to act as outliers
    outlier_indices = np.random.choice(n_samples, 5, replace=False)
    y_true[outlier_indices] += np.random.normal(0, 5, 5)

    # Deliberately slightly-off predictor
    y_pred = 2.1 * X + 0.9

    # --- score the predictor with every loss ------------------------------
    scorers = [
        ('MSE', LossFunctions.mse_loss, {}),
        ('MAE', LossFunctions.mae_loss, {}),
        ('Huber (δ=1)', LossFunctions.huber_loss, {'delta': 1.0}),
        ('Huber (δ=2)', LossFunctions.huber_loss, {'delta': 2.0}),
        ('Log-Cosh', LossFunctions.log_cosh_loss, {}),
    ]
    losses = {
        label: loss_fn(y_true, y_pred, **extra)
        for label, loss_fn, extra in scorers
    }

    plt.figure(figsize=(15, 10))

    # Panel 1: the data, the fitted line, and the highlighted outliers
    plt.subplot(2, 3, 1)
    plt.scatter(X, y_true, alpha=0.6, label='True values')
    plt.plot(X, y_pred, 'r-', label='Predictions')
    plt.scatter(
        X[outlier_indices],
        y_true[outlier_indices],
        color='red',
        s=100,
        alpha=0.8,
        label='Outliers',
    )
    plt.xlabel('X')
    plt.ylabel('y')
    plt.title('Data with Outliers')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panel 2: bar chart of the loss values, annotated with their numbers
    plt.subplot(2, 3, 2)
    loss_names = list(losses)
    loss_values = [losses[label] for label in loss_names]
    bars = plt.bar(loss_names, loss_values, color='skyblue', alpha=0.7)
    plt.title('Loss Function Comparison')
    plt.ylabel('Loss Value')
    plt.xticks(rotation=45)
    for rect, height in zip(bars, loss_values):
        x_center = rect.get_x() + rect.get_width() / 2
        plt.text(x_center, rect.get_height() + 0.05,
                 f'{height:.3f}', ha='center', va='bottom')

    # Panels 3-6: each loss as a function of the residual
    residuals = np.linspace(-5, 5, 100)
    abs_residuals = np.abs(residuals)
    loss_landscapes = {
        'MSE': residuals ** 2,
        'MAE': np.abs(residuals),
        'Huber (δ=1)': np.where(abs_residuals <= 1,
                                0.5 * residuals ** 2,
                                abs_residuals - 0.5),
        'Log-Cosh': np.log(np.cosh(residuals)),
    }
    for panel, (label, curve) in enumerate(loss_landscapes.items(), start=3):
        plt.subplot(2, 3, panel)
        plt.plot(residuals, curve, linewidth=2)
        plt.xlabel('Residual (y_true - y_pred)')
        plt.ylabel('Loss')
        plt.title(f'{label} Loss Landscape')
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return losses

# Run the loss-function demo and report the numeric results.
print("=== Testing Loss Functions ===")
loss_results = test_loss_functions()

# "\n" (a real newline) separates the sections; the doubled backslash used
# previously printed the two characters '\' and 'n' verbatim.
print("\nLoss Values:")
for name, value in loss_results.items():
    print(f"{name}: {value:.4f}")

print("\nKey Observations:")
print("- MSE is sensitive to outliers (squared error)")
print("- MAE is more robust to outliers")
print("- Huber loss combines benefits of both")
print("- Log-Cosh is smooth and robust")


In [None]:
class RegularizedLinearRegression:
    """Linear regression with different regularization techniques.

    The model minimises ``mean((X @ w + b - y)^2) + penalty(w)`` by
    full-batch gradient descent. ``regularization`` selects the penalty:

    * ``'l1'``          -> ``alpha * sum(|w|)``
    * ``'l2'``          -> ``alpha * sum(w^2)``
    * ``'elastic_net'`` -> ``alpha * (l1_ratio * sum(|w|)
      + (1 - l1_ratio) * sum(w^2))``
    * anything else     -> no penalty

    The bias is never penalised.

    Attributes set by ``fit``:
        weights: 1-D array of learned coefficients (``None`` before fitting).
        bias: learned intercept (``None`` before fitting).
        cost_history: penalised loss recorded at the start of each iteration.
    """

    def __init__(self, regularization='none', alpha=0.01, l1_ratio=0.5):
        self.regularization = regularization
        self.alpha = alpha  # Regularization strength
        self.l1_ratio = l1_ratio  # For elastic net
        self.weights = None
        self.bias = None
        self.cost_history = []

    def _add_regularization(self, loss, weights):
        """Return ``loss`` plus the penalty term for the configured mode."""
        if self.regularization == 'l1':
            return loss + self.alpha * np.sum(np.abs(weights))
        elif self.regularization == 'l2':
            return loss + self.alpha * np.sum(weights ** 2)
        elif self.regularization == 'elastic_net':
            l1_term = self.alpha * self.l1_ratio * np.sum(np.abs(weights))
            l2_term = self.alpha * (1 - self.l1_ratio) * np.sum(weights ** 2)
            return loss + l1_term + l2_term
        else:
            return loss

    @staticmethod
    def _data_gradients(X, y, predictions):
        """Return (dw, db) of the unpenalised mean squared error."""
        m = X.shape[0]
        error = predictions - y
        dw = (2 / m) * np.dot(X.T, error)
        db = (2 / m) * np.sum(error)
        return dw, db

    def _compute_gradients(self, X, y, predictions):
        """Compute (sub)gradients of the penalised loss.

        For the L1 terms the subgradient ``sign(w)`` is used. Returns
        ``(dw, db)``; the bias gradient carries no penalty contribution.
        """
        dw, db = self._data_gradients(X, y, predictions)

        # Add regularization to weight gradients
        if self.regularization == 'l1':
            dw += self.alpha * np.sign(self.weights)
        elif self.regularization == 'l2':
            dw += 2 * self.alpha * self.weights
        elif self.regularization == 'elastic_net':
            dw += self.alpha * (self.l1_ratio * np.sign(self.weights) +
                                2 * (1 - self.l1_ratio) * self.weights)

        return dw, db

    def fit(self, X, y, learning_rate=0.01, n_iterations=1000):
        """Fit the model using gradient descent.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: array-like with one target per sample; a column vector is
                flattened so it cannot broadcast against the predictions.
            learning_rate: step size of each update.
            n_iterations: number of full-batch updates.

        Raises:
            ValueError: if ``y`` does not contain exactly one value per row
                of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)
        m, n = X.shape
        if y.shape[0] != m:
            raise ValueError(
                f"y has {y.shape[0]} values but X has {m} rows"
            )

        # Initialize parameters
        self.weights = np.random.normal(0, 0.01, n)
        self.bias = 0.0
        self.cost_history = []

        # Pure L1 is handled with a proximal step: a gradient step on the
        # smooth data term followed by soft-thresholding. Adding the L1
        # subgradient as well would apply the penalty twice per iteration.
        use_proximal_l1 = self.regularization == 'l1'

        for _ in range(n_iterations):
            # Forward pass
            predictions = X.dot(self.weights) + self.bias

            # Compute loss
            base_loss = np.mean((predictions - y) ** 2)
            total_loss = self._add_regularization(base_loss, self.weights)
            self.cost_history.append(total_loss)

            # Compute gradients
            if use_proximal_l1:
                dw, db = self._data_gradients(X, y, predictions)
            else:
                dw, db = self._compute_gradients(X, y, predictions)

            # Update parameters
            self.weights -= learning_rate * dw
            self.bias -= learning_rate * db

            # Apply soft thresholding for L1 regularization
            if use_proximal_l1:
                self.weights = np.sign(self.weights) * np.maximum(
                    np.abs(self.weights) - learning_rate * self.alpha, 0
                )

    def predict(self, X):
        """Return ``X @ weights + bias``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.weights is None:
            raise RuntimeError("Model is not fitted yet; call fit() first.")
        return np.asarray(X).dot(self.weights) + self.bias

# Generate dataset for regularization comparison
def create_regularization_dataset():
    """Create dataset with multicollinearity for regularization demo.

    The design matrix has 10 columns: 3 informative base features, 2 noisy
    copies of the first two base features (to induce collinearity) and 5
    pure-noise features. Only the 3 base features drive the target.

    Returns:
        X: array of shape (100, 10).
        y: array of shape (100,), ``X @ true_weights`` plus Gaussian noise.
        true_weights: array of shape (10,), zero except the first three.
    """
    np.random.seed(42)

    # Create base features
    n_samples = 100
    X_base = np.random.randn(n_samples, 3)

    # Add highly correlated features to create multicollinearity
    X_corr1 = X_base[:, 0:1] + 0.1 * np.random.randn(n_samples, 1)
    X_corr2 = X_base[:, 1:2] + 0.1 * np.random.randn(n_samples, 1)
    X_noise = np.random.randn(n_samples, 5)  # Irrelevant features

    # Combine all features
    X = np.hstack([X_base, X_corr1, X_corr2, X_noise])

    # True relationship (only first 3 features are relevant). The vector is
    # sized from X so it always matches the number of columns; a hard-coded
    # 8-element vector does not match the 10 columns built above.
    true_weights = np.zeros(X.shape[1])
    true_weights[:3] = [2, -1.5, 1]
    y = X.dot(true_weights) + 0.5 * np.random.randn(n_samples)

    return X, y, true_weights

# Compare regularization techniques
def compare_regularization():
    """Compare different regularization techniques.

    Trains four ``RegularizedLinearRegression`` variants (none, L1, L2,
    elastic net) on the standardized multicollinear data set and draws a 2x3
    figure: one cost curve per model, a grouped bar chart of learned versus
    true weights, and a bar chart of the fraction of near-zero weights.

    Returns:
        dict mapping model label to a dict with keys ``'weights'``,
        ``'train_mse'``, ``'test_mse'`` and ``'cost_history'``.
    """

    # Create dataset
    X, y, true_weights = create_regularization_dataset()

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Standardize features (statistics come from the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define models
    models = {
        'No Regularization': RegularizedLinearRegression('none'),
        'L1 (α=0.1)': RegularizedLinearRegression('l1', alpha=0.1),
        'L2 (α=0.1)': RegularizedLinearRegression('l2', alpha=0.1),
        'Elastic Net': RegularizedLinearRegression('elastic_net', alpha=0.1, l1_ratio=0.5)
    }

    # Train models and collect results
    results = {}

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    for i, (name, model) in enumerate(models.items()):
        # Train model
        model.fit(X_train_scaled, y_train, learning_rate=0.1, n_iterations=1000)

        # Make predictions
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)

        # Calculate metrics
        train_mse = mean_squared_error(y_train, train_pred)
        test_mse = mean_squared_error(y_test, test_pred)

        results[name] = {
            'weights': model.weights,
            'train_mse': train_mse,
            'test_mse': test_mse,
            'cost_history': model.cost_history
        }

        # Plot cost history: the four models fill the left 2x2 of the grid,
        # leaving the right-hand column for the two summary charts.
        ax = axes[0, i] if i < 2 else axes[1, i-2]
        ax.plot(model.cost_history)
        # A real newline puts the metrics on a second title line.
        ax.set_title(f'{name}\nTrain MSE: {train_mse:.3f}, Test MSE: {test_mse:.3f}')
        ax.set_xlabel('Iteration')
        ax.set_ylabel('Cost')
        ax.grid(True, alpha=0.3)

    # Plot weight comparison
    ax = axes[0, 2]
    feature_names = [f'Feature {i}' for i in range(len(true_weights))]
    x_pos = np.arange(len(feature_names))

    width = 0.15
    for i, (name, result) in enumerate(results.items()):
        ax.bar(x_pos + i * width, result['weights'], width,
               label=name, alpha=0.7)

    ax.bar(x_pos + len(results) * width, true_weights, width,
           label='True Weights', alpha=0.7, color='black')

    ax.set_xlabel('Features')
    ax.set_ylabel('Weight Value')
    ax.set_title('Weight Comparison')
    ax.set_xticks(x_pos + width * len(results) / 2)
    ax.set_xticklabels(feature_names, rotation=45)
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Plot sparsity comparison
    ax = axes[1, 2]
    sparsity_data = []
    model_names = []

    for name, result in results.items():
        # Fraction of weights whose magnitude is below 0.01
        sparsity = np.sum(np.abs(result['weights']) < 0.01) / len(result['weights'])
        sparsity_data.append(sparsity)
        model_names.append(name)

    bars = ax.bar(model_names, sparsity_data, alpha=0.7, color='lightcoral')
    ax.set_ylabel('Sparsity (fraction of near-zero weights)')
    ax.set_title('Model Sparsity Comparison')
    # The categorical bar call already labels the ticks with the model names;
    # only rotate them instead of re-setting labels without fixed positions.
    ax.tick_params(axis='x', labelrotation=45)

    # Add value labels
    for bar, value in zip(bars, sparsity_data):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{value:.2f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    return results

# Run regularization comparison
print("=== Regularization Comparison ===")
reg_results = compare_regularization()

# Print summary ("\n" is a real newline; the doubled backslash printed the
# characters '\' and 'n' literally)
print("\n=== Results Summary ===")
for name, result in reg_results.items():
    weights = result['weights']
    print(f"\n{name}:")
    print(f"  Train MSE: {result['train_mse']:.4f}")
    print(f"  Test MSE: {result['test_mse']:.4f}")
    # Report against the actual number of weights rather than a fixed total.
    print(f"  Non-zero weights: {np.sum(np.abs(weights) > 0.01)}/{len(weights)}")
    print(f"  Weight magnitudes: {np.linalg.norm(weights):.4f}")


In [None]:
class SimpleNeuralNetworkWithDropout:
    """Simple neural network with dropout regularization.

    One sigmoid hidden layer followed by a sigmoid output layer, trained with
    full-batch gradient descent on binary cross-entropy. Inverted dropout is
    applied to the hidden activations during training only.

    Attributes:
        W1, b1: hidden-layer weights (input_size, hidden_size) and bias.
        W2, b2: output-layer weights (hidden_size, output_size) and bias.
        train_history: training loss per epoch, appended to by ``fit``.
        val_history: validation loss per epoch, appended to by ``fit``.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.5):
        # A rate of 1.0 would drop every unit and divide by zero when the
        # surviving activations are rescaled by 1 / (1 - rate).
        if not 0.0 <= dropout_rate < 1.0:
            raise ValueError(
                f"dropout_rate must be in [0, 1), got {dropout_rate!r}"
            )

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        # Initialize weights
        self.W1 = np.random.randn(input_size, hidden_size) * 0.1
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.1
        self.b2 = np.zeros((1, output_size))

        self.train_history = []
        self.val_history = []

    def _sigmoid(self, x):
        """Sigmoid activation function (input clipped to avoid exp overflow)."""
        return 1 / (1 + np.exp(-np.clip(x, -250, 250)))

    def _sigmoid_derivative(self, x):
        """Derivative of sigmoid evaluated at pre-activation ``x``."""
        s = self._sigmoid(x)
        return s * (1 - s)

    @staticmethod
    def _binary_cross_entropy(y, p):
        """Mean binary cross-entropy; 1e-15 keeps the logs away from log(0)."""
        return -np.mean(y * np.log(p + 1e-15) +
                        (1 - y) * np.log(1 - p + 1e-15))

    def _as_targets(self, y, n_rows):
        """Return ``y`` as a float array of shape (n_rows, output_size).

        A 1-D target vector would otherwise broadcast against the
        (n_rows, output_size) network output and yield an (n_rows, n_rows)
        error matrix.
        """
        y = np.asarray(y, dtype=float).reshape(n_rows, -1)
        if y.shape[1] != self.output_size:
            raise ValueError(
                f"targets have {y.shape[1]} columns, expected {self.output_size}"
            )
        return y

    def _apply_dropout(self, x, training=True):
        """Apply inverted dropout during training.

        Returns ``(x * mask, mask)``. Kept units are scaled by
        1 / (1 - dropout_rate) so no rescaling is needed at inference; outside
        training (or with rate 0) the mask is all ones.
        """
        if training and self.dropout_rate > 0:
            # Create dropout mask
            mask = np.random.binomial(1, 1 - self.dropout_rate, x.shape) / (1 - self.dropout_rate)
            return x * mask, mask
        else:
            return x, np.ones_like(x)

    def forward(self, X, training=True):
        """Forward propagation with optional dropout.

        Returns:
            Tuple ``(a2, a1, a1_dropout, z1, z2, dropout_mask)``: the output
            activations followed by the intermediates ``backward`` needs.
        """
        # Hidden layer
        z1 = np.dot(X, self.W1) + self.b1
        a1 = self._sigmoid(z1)

        # Apply dropout to hidden layer
        a1_dropout, dropout_mask = self._apply_dropout(a1, training)

        # Output layer
        z2 = np.dot(a1_dropout, self.W2) + self.b2
        a2 = self._sigmoid(z2)

        return a2, a1, a1_dropout, z1, z2, dropout_mask

    def backward(self, X, y, a2, a1, a1_dropout, z1, z2, dropout_mask):
        """Backward propagation.

        ``y`` must have the same shape as ``a2``. Returns the gradients
        ``(dW1, db1, dW2, db2)`` averaged over the batch.
        """
        m = X.shape[0]

        # Output layer gradients (sigmoid + cross-entropy gives a2 - y)
        dz2 = a2 - y
        dW2 = (1/m) * np.dot(a1_dropout.T, dz2)
        db2 = (1/m) * np.sum(dz2, axis=0, keepdims=True)

        # Hidden layer gradients
        da1 = np.dot(dz2, self.W2.T)
        # Apply dropout mask to gradients
        da1 = da1 * dropout_mask
        dz1 = da1 * self._sigmoid_derivative(z1)
        dW1 = (1/m) * np.dot(X.T, dz1)
        db1 = (1/m) * np.sum(dz1, axis=0, keepdims=True)

        return dW1, db1, dW2, db2

    def fit(self, X_train, y_train, X_val, y_val, epochs=1000, learning_rate=0.1):
        """Train the network.

        Targets may be 1-D or column vectors; they are reshaped to
        (n_samples, output_size). Losses are appended to ``train_history``
        and ``val_history`` each epoch and printed every 100 epochs.

        Raises:
            ValueError: if the targets cannot be arranged as
                (n_samples, output_size).
        """
        X_train = np.asarray(X_train, dtype=float)
        X_val = np.asarray(X_val, dtype=float)
        y_train = self._as_targets(y_train, X_train.shape[0])
        y_val = self._as_targets(y_val, X_val.shape[0])

        for epoch in range(epochs):
            # Forward pass with dropout
            a2, a1, a1_dropout, z1, z2, dropout_mask = self.forward(X_train, training=True)

            # Compute training loss
            train_loss = self._binary_cross_entropy(y_train, a2)

            # Backward pass
            dW1, db1, dW2, db2 = self.backward(X_train, y_train, a2, a1, a1_dropout, z1, z2, dropout_mask)

            # Update parameters
            self.W1 -= learning_rate * dW1
            self.b1 -= learning_rate * db1
            self.W2 -= learning_rate * dW2
            self.b2 -= learning_rate * db2

            # Validation loss (without dropout)
            val_pred, _, _, _, _, _ = self.forward(X_val, training=False)
            val_loss = self._binary_cross_entropy(y_val, val_pred)

            # Store history
            self.train_history.append(train_loss)
            self.val_history.append(val_loss)

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    def predict(self, X):
        """Make predictions (without dropout); returns output probabilities."""
        pred, _, _, _, _, _ = self.forward(X, training=False)
        return pred

# Compare networks with and without dropout
def compare_dropout_effects():
    """Compare neural networks with and without dropout.

    Trains two ``SimpleNeuralNetworkWithDropout`` models (dropout 0.0 and
    0.3) on the same synthetic classification data and plots their loss
    curves, the validation-minus-training gap, and the test accuracies.

    Returns:
        dict with keys ``'no_dropout'`` and ``'with_dropout'``, each mapping
        to a dict with ``'accuracy'``, ``'final_train_loss'`` and
        ``'final_val_loss'``.
    """

    # Generate classification dataset
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=10,
                              n_redundant=10, n_clusters_per_class=1, random_state=42)

    # Split data: 70/30 train/test, then 80/20 train/validation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Standardize (statistics come from the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Reshape targets to column vectors to match the network output shape
    y_train = y_train.reshape(-1, 1)
    y_val = y_val.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # Create networks
    network_no_dropout = SimpleNeuralNetworkWithDropout(20, 50, 1, dropout_rate=0.0)
    network_with_dropout = SimpleNeuralNetworkWithDropout(20, 50, 1, dropout_rate=0.3)

    print("Training network without dropout...")
    network_no_dropout.fit(X_train_scaled, y_train, X_val_scaled, y_val, epochs=500, learning_rate=0.1)

    # A real newline separates the two training logs.
    print("\nTraining network with dropout...")
    network_with_dropout.fit(X_train_scaled, y_train, X_val_scaled, y_val, epochs=500, learning_rate=0.1)

    # Make predictions
    pred_no_dropout = network_no_dropout.predict(X_test_scaled)
    pred_with_dropout = network_with_dropout.predict(X_test_scaled)

    # Calculate accuracies (threshold at 0.5; flatten the column vectors so
    # the metric receives plain 1-D label arrays)
    y_test_flat = y_test.ravel()
    acc_no_dropout = accuracy_score(y_test_flat, (pred_no_dropout > 0.5).astype(int).ravel())
    acc_with_dropout = accuracy_score(y_test_flat, (pred_with_dropout > 0.5).astype(int).ravel())

    # Plot training curves
    plt.figure(figsize=(15, 5))

    # Training curves
    plt.subplot(1, 3, 1)
    plt.plot(network_no_dropout.train_history, label='No Dropout - Train', alpha=0.7)
    plt.plot(network_no_dropout.val_history, label='No Dropout - Val', alpha=0.7)
    plt.plot(network_with_dropout.train_history, label='With Dropout - Train', alpha=0.7)
    plt.plot(network_with_dropout.val_history, label='With Dropout - Val', alpha=0.7)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Curves Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Overfitting analysis: per-epoch gap between validation and training loss
    plt.subplot(1, 3, 2)
    overfitting_no_dropout = np.array(network_no_dropout.val_history) - np.array(network_no_dropout.train_history)
    overfitting_with_dropout = np.array(network_with_dropout.val_history) - np.array(network_with_dropout.train_history)

    plt.plot(overfitting_no_dropout, label='No Dropout', alpha=0.7)
    plt.plot(overfitting_with_dropout, label='With Dropout', alpha=0.7)
    plt.xlabel('Epoch')
    plt.ylabel('Validation - Training Loss')
    plt.title('Overfitting Analysis')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Accuracy comparison
    plt.subplot(1, 3, 3)
    accuracies = [acc_no_dropout, acc_with_dropout]
    labels = ['No Dropout', 'With Dropout']
    bars = plt.bar(labels, accuracies, alpha=0.7, color=['coral', 'skyblue'])
    plt.ylabel('Test Accuracy')
    plt.title('Test Accuracy Comparison')
    plt.ylim(0, 1)

    # Add value labels
    for bar, acc in zip(bars, accuracies):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{acc:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    return {
        'no_dropout': {
            'accuracy': acc_no_dropout,
            'final_train_loss': network_no_dropout.train_history[-1],
            'final_val_loss': network_no_dropout.val_history[-1]
        },
        'with_dropout': {
            'accuracy': acc_with_dropout,
            'final_train_loss': network_with_dropout.train_history[-1],
            'final_val_loss': network_with_dropout.val_history[-1]
        }
    }

# Run dropout comparison
print("=== Dropout Regularization Analysis ===")
dropout_results = compare_dropout_effects()

# "\n" is a real newline; the doubled backslash printed '\' and 'n' literally.
print("\n=== Dropout Results Summary ===")
for name, result in dropout_results.items():
    print(f"\n{name.replace('_', ' ').title()}:")
    print(f"  Test Accuracy: {result['accuracy']:.4f}")
    print(f"  Final Train Loss: {result['final_train_loss']:.4f}")
    print(f"  Final Val Loss: {result['final_val_loss']:.4f}")
    # Gap between final validation and training loss
    print(f"  Overfitting: {result['final_val_loss'] - result['final_train_loss']:.4f}")
