In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error
import warnings
# Silence every Python warning for the rest of the session. This also hides
# NumPy RuntimeWarnings (overflow, mean of an empty slice, ...), so numerical
# problems during training will not be reported.
warnings.filterwarnings('ignore')

# Set style and random seed
# NOTE(review): the 'seaborn-v0_8' style name presumably requires
# Matplotlib >= 3.6 -- confirm against the target environment.
plt.style.use('seaborn-v0_8')
# Seed NumPy's global RNG once; the np.random.randn / np.random.permutation
# calls later in this notebook draw from this stream.
np.random.seed(42)

print("Libraries imported successfully!")


In [None]:
class BatchGradientDescent:
    """Two-layer neural network trained with configurable mini-batch gradient descent.

    Architecture: ``input -> ReLU hidden layer -> sigmoid output``, optimised
    against binary cross-entropy. ``batch_size`` selects the training regime:
    1 gives stochastic gradient descent, a value at least as large as the
    training set gives full-batch gradient descent, anything in between gives
    mini-batch gradient descent.

    Attributes:
        W1, b1: Hidden-layer weights ``(input_size, hidden_size)`` and bias
            ``(1, hidden_size)``.
        W2, b2: Output-layer weights ``(hidden_size, output_size)`` and bias
            ``(1, output_size)``.
        train_losses: Per-epoch training loss, appended to by ``fit``.
        val_losses: Per-epoch validation loss, appended to by ``fit``.
        learning_rates: Learning rate used in each epoch, appended to by ``fit``.
    """

    def __init__(self, input_size, hidden_size, output_size, batch_size=32):
        """Create the network and draw its initial weights from ``np.random``.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of sigmoid output units.
            batch_size: Samples per gradient step; must be a positive integer.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1. A negative value
                would otherwise yield zero batches and a NaN loss.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size

        # He initialization: std = sqrt(2 / fan_in), the usual choice for ReLU layers
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))

        # Training history
        self.train_losses = []
        self.val_losses = []
        self.learning_rates = []

    @staticmethod
    def _as_column(y):
        """Return ``y`` as an array, promoting 1-D targets to shape ``(m, 1)``.

        Without this, a ``(m,)`` target broadcasts against the ``(m, 1)``
        network output into an ``(m, m)`` matrix and silently corrupts both
        the loss and the gradients.
        """
        y = np.asarray(y)
        return y.reshape(-1, 1) if y.ndim == 1 else y

    @staticmethod
    def _binary_cross_entropy(a2, y):
        """Mean binary cross-entropy between predictions ``a2`` and targets ``y``."""
        # The 1e-15 offset keeps log() finite when a prediction saturates at 0 or 1.
        return -np.mean(y * np.log(a2 + 1e-15) + (1 - y) * np.log(1 - a2 + 1e-15))

    def _relu(self, x):
        """ReLU activation function"""
        return np.maximum(0, x)

    def _relu_derivative(self, x):
        """Derivative of ReLU"""
        return (x > 0).astype(float)

    def _sigmoid(self, x):
        """Sigmoid activation function"""
        # Clip the pre-activation so np.exp cannot overflow.
        return 1 / (1 + np.exp(-np.clip(x, -250, 250)))

    def forward(self, X):
        """Forward propagation.

        Args:
            X: Input array of shape ``(m, input_size)``.

        Returns:
            tuple: ``(a2, a1, z1, z2)`` -- output activations, hidden
            activations, hidden pre-activations and output pre-activations.
        """
        z1 = np.dot(X, self.W1) + self.b1
        a1 = self._relu(z1)
        z2 = np.dot(a1, self.W2) + self.b2
        a2 = self._sigmoid(z2)
        return a2, a1, z1, z2

    def backward(self, X, y, a2, a1, z1):
        """Backward propagation.

        Args:
            X: Inputs the forward pass was run on.
            y: Targets; 1-D arrays are treated as a single output column.
            a2, a1, z1: Values returned by ``forward`` for ``X``.

        Returns:
            tuple: ``(dW1, db1, dW2, db2)``, each averaged over the batch.
        """
        m = X.shape[0]
        y = self._as_column(y)

        # Output layer: for sigmoid + cross-entropy the error term is a2 - y
        dz2 = a2 - y
        dW2 = (1/m) * np.dot(a1.T, dz2)
        db2 = (1/m) * np.sum(dz2, axis=0, keepdims=True)

        # Hidden layer
        da1 = np.dot(dz2, self.W2.T)
        dz1 = da1 * self._relu_derivative(z1)
        dW1 = (1/m) * np.dot(X.T, dz1)
        db1 = (1/m) * np.sum(dz1, axis=0, keepdims=True)

        return dW1, db1, dW2, db2

    def create_batches(self, X, y):
        """Create mini-batches from data.

        Returns a list of ``(batch_X, batch_y)`` tuples. When ``batch_size``
        covers the whole data set the data is returned unshuffled as a single
        batch; otherwise it is shuffled with ``np.random.permutation`` and
        sliced, so the last batch may be smaller than ``batch_size``.
        """
        m = X.shape[0]
        if self.batch_size >= m:
            # Full batch
            return [(X, y)]

        # Shuffle data
        indices = np.random.permutation(m)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        # Slicing past the end of an array is safe, so no explicit end clamp is needed
        return [
            (X_shuffled[start:start + self.batch_size],
             y_shuffled[start:start + self.batch_size])
            for start in range(0, m, self.batch_size)
        ]

    def compute_loss(self, X, y):
        """Compute binary cross-entropy loss of the current parameters on ``(X, y)``."""
        a2, _, _, _ = self.forward(X)
        return self._binary_cross_entropy(a2, self._as_column(y))

    def fit(self, X_train, y_train, X_val, y_val, epochs=100, learning_rate=0.01,
            lr_scheduler=None, early_stopping_patience=None):
        """Train the network with various strategies.

        Args:
            X_train, y_train: Training inputs and binary targets.
            X_val, y_val: Validation data, evaluated once per epoch.
            epochs: Maximum number of passes over the training data.
            learning_rate: Base learning rate.
            lr_scheduler: Optional callable ``(epoch, learning_rate) -> lr``
                evaluated at the start of every epoch.
            early_stopping_patience: If truthy, stop once the validation loss
                has failed to improve for this many consecutive epochs.

        The recorded training loss of an epoch is the sample-weighted average
        of the batch losses measured *before* each parameter update.
        """
        best_val_loss = float('inf')
        patience_counter = 0

        for epoch in range(epochs):
            # Adjust learning rate if scheduler provided
            current_lr = lr_scheduler(epoch, learning_rate) if lr_scheduler else learning_rate
            self.learning_rates.append(current_lr)

            batch_losses = []
            batch_counts = []
            for batch_X, batch_y in self.create_batches(X_train, y_train):
                # Forward pass
                a2, a1, z1, _ = self.forward(batch_X)

                # Reuse the activations for the loss instead of running a second forward pass
                batch_losses.append(self._binary_cross_entropy(a2, self._as_column(batch_y)))
                batch_counts.append(batch_X.shape[0])

                # Backward pass
                dW1, db1, dW2, db2 = self.backward(batch_X, batch_y, a2, a1, z1)

                # Update parameters
                self.W1 -= current_lr * dW1
                self.b1 -= current_lr * db1
                self.W2 -= current_lr * dW2
                self.b2 -= current_lr * db2

            # Weight by batch size so a short final batch does not bias the epoch loss
            train_loss = np.average(batch_losses, weights=batch_counts)
            val_loss = self.compute_loss(X_val, y_val)

            self.train_losses.append(train_loss)
            self.val_losses.append(val_loss)

            # Early stopping
            if early_stopping_patience:
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    patience_counter = 0
                else:
                    patience_counter += 1
                    if patience_counter >= early_stopping_patience:
                        print(f"Early stopping at epoch {epoch}")
                        break

            if epoch % 20 == 0:
                print(f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, LR: {current_lr:.6f}")

    def predict(self, X):
        """Make predictions: return the sigmoid outputs (probabilities) for ``X``."""
        a2, _, _, _ = self.forward(X)
        return a2

# Compare different batch sizes
def compare_batch_sizes():
    """Train identical networks with different batch sizes and plot the results.

    A synthetic binary classification problem (1000 samples, 20 features) is
    split into train/validation/test sets and standardised with statistics
    fitted on the training split only. One ``BatchGradientDescent`` model is
    trained per batch size for 100 epochs at a learning rate of 0.01. Its loss
    curves are drawn in a 2x3 grid; the sixth panel summarises test accuracy
    and the final losses.

    Returns:
        dict: Maps each configuration name to a dict with the keys
        ``'model'``, ``'accuracy'``, ``'final_train_loss'`` and
        ``'final_val_loss'``.
    """

    # Generate dataset
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                              n_redundant=5, n_clusters_per_class=1, random_state=42)

    # Split and scale data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Fit the scaler on the training split only so no validation/test statistics leak in
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Reshape targets to columns so they line up with the (m, 1) network output
    y_train = y_train.reshape(-1, 1)
    y_val = y_val.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # Test different batch sizes
    batch_sizes = [1, 8, 32, 128, len(X_train)]  # SGD, small, medium, large, full batch
    batch_names = ['SGD (1)', 'Small (8)', 'Medium (32)', 'Large (128)', 'Full Batch']

    results = {}

    plt.figure(figsize=(20, 12))

    for i, (batch_size, name) in enumerate(zip(batch_sizes, batch_names)):
        # "\n" (not "\\n") so a real line break is printed
        print(f"\nTraining with {name}...")

        # Create and train model
        model = BatchGradientDescent(20, 50, 1, batch_size=batch_size)
        model.fit(X_train_scaled, y_train, X_val_scaled, y_val,
                 epochs=100, learning_rate=0.01)

        # Make predictions
        test_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, (test_pred > 0.5).astype(int))

        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'final_train_loss': model.train_losses[-1],
            'final_val_loss': model.val_losses[-1]
        }

        # Plot training curves
        plt.subplot(2, 3, i+1)
        plt.plot(model.train_losses, label='Train Loss', alpha=0.8)
        plt.plot(model.val_losses, label='Val Loss', alpha=0.8)
        plt.title(f'{name}\nAccuracy: {accuracy:.3f}')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True, alpha=0.3)

    # Summary comparison: three grouped bars per configuration
    plt.subplot(2, 3, 6)
    names = list(results.keys())
    accuracies = [results[name]['accuracy'] for name in names]
    train_losses = [results[name]['final_train_loss'] for name in names]
    val_losses = [results[name]['final_val_loss'] for name in names]

    x_pos = np.arange(len(names))
    width = 0.25

    plt.bar(x_pos - width, accuracies, width, label='Test Accuracy', alpha=0.7)
    plt.bar(x_pos, train_losses, width, label='Final Train Loss', alpha=0.7)
    plt.bar(x_pos + width, val_losses, width, label='Final Val Loss', alpha=0.7)

    plt.xlabel('Batch Size')
    plt.ylabel('Metric Value')
    plt.title('Batch Size Comparison')
    plt.xticks(x_pos, names, rotation=45)
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return results

# Run batch size comparison
print("=== Batch Size Analysis ===")
batch_results = compare_batch_sizes()

# "\n" (not "\\n") so the headings are preceded by a real blank line
print("\n=== Batch Size Results Summary ===")
for name, result in batch_results.items():
    print(f"\n{name}:")
    print(f"  Test Accuracy: {result['accuracy']:.4f}")
    print(f"  Final Train Loss: {result['final_train_loss']:.4f}")
    print(f"  Final Val Loss: {result['final_val_loss']:.4f}")
    # Validation-minus-training loss: a positive gap indicates overfitting
    print(f"  Overfitting: {result['final_val_loss'] - result['final_train_loss']:.4f}")


In [None]:
class LearningRateSchedulers:
    """Collection of learning rate scheduling strategies.

    Every scheduler is a static method taking the zero-based ``epoch`` index
    and the ``initial_lr`` as its first two arguments and returning the
    learning rate to use for that epoch.
    """

    @staticmethod
    def constant_lr(epoch, initial_lr):
        """Constant learning rate: always return ``initial_lr``."""
        return initial_lr

    @staticmethod
    def step_decay(epoch, initial_lr, drop_rate=0.5, epochs_drop=20):
        """Step decay - multiply the LR by ``drop_rate`` every ``epochs_drop`` epochs."""
        return initial_lr * (drop_rate ** (epoch // epochs_drop))

    @staticmethod
    def exponential_decay(epoch, initial_lr, decay_rate=0.95):
        """Exponential decay - multiply the LR by ``decay_rate`` once per epoch."""
        return initial_lr * (decay_rate ** epoch)

    @staticmethod
    def cosine_annealing(epoch, initial_lr, max_epochs=100):
        """Cosine annealing from ``initial_lr`` at epoch 0 down to 0 at ``max_epochs``.

        Epochs beyond ``max_epochs`` are held at the end of the schedule;
        without the clamp the cosine would swing the rate back up.
        """
        clamped_epoch = min(epoch, max_epochs)
        return initial_lr * (1 + np.cos(np.pi * clamped_epoch / max_epochs)) / 2

    @staticmethod
    def linear_warmup_cosine_decay(epoch, initial_lr, warmup_epochs=10, max_epochs=100):
        """Linear warmup followed by cosine decay.

        The rate rises linearly to ``initial_lr`` over the first
        ``warmup_epochs`` epochs, then follows a half cosine down to 0 at
        ``max_epochs``. Epochs past ``max_epochs`` stay at 0.
        """
        if epoch < warmup_epochs:
            # Linear warmup; (epoch + 1) so the very first epoch gets a non-zero rate
            return initial_lr * (epoch + 1) / warmup_epochs

        decay_span = max_epochs - warmup_epochs
        if decay_span <= 0:
            # No room for a decay phase (this used to divide by zero):
            # treat the schedule as already finished.
            progress = 1.0
        else:
            # Clamp so the rate does not climb again after max_epochs
            progress = min((epoch - warmup_epochs) / decay_span, 1.0)
        return initial_lr * (1 + np.cos(np.pi * progress)) / 2

# Visualize learning rate schedules
def visualize_lr_schedules():
    """Plot each learning-rate schedule over 100 epochs and return the curves.

    The first panel of a 2x3 grid overlays all schedules; the remaining five
    panels show one schedule each, coloured to match the overlay.

    Returns:
        dict: Maps the schedule name to the list of per-epoch learning rates,
        all computed from an initial rate of 0.1.
    """
    base_lr = 0.1
    epochs = np.arange(100)

    # One single-argument callable per schedule; evaluated over every epoch below
    rate_at = {
        'Constant': lambda e: LearningRateSchedulers.constant_lr(e, base_lr),
        'Step Decay': lambda e: LearningRateSchedulers.step_decay(e, base_lr),
        'Exponential': lambda e: LearningRateSchedulers.exponential_decay(e, base_lr, 0.98),
        'Cosine Annealing': lambda e: LearningRateSchedulers.cosine_annealing(e, base_lr),
        'Warmup + Cosine': lambda e: LearningRateSchedulers.linear_warmup_cosine_decay(e, base_lr),
    }
    schedules = {
        label: [compute(e) for e in epochs]
        for label, compute in rate_at.items()
    }

    plt.figure(figsize=(15, 10))

    # Panel 1: every schedule on shared axes
    plt.subplot(2, 3, 1)
    for label, curve in schedules.items():
        plt.plot(epochs, curve, label=label, linewidth=2)
    plt.xlabel('Epoch')
    plt.ylabel('Learning Rate')
    plt.title('Learning Rate Schedules Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panels 2-6: one schedule per panel
    for order, (label, curve) in enumerate(schedules.items()):
        panel = order + 2
        if panel > 6:
            # The grid only has six cells
            continue
        plt.subplot(2, 3, panel)
        plt.plot(epochs, curve, linewidth=2, color=f'C{order}')
        plt.xlabel('Epoch')
        plt.ylabel('Learning Rate')
        plt.title(f'{label} Schedule')
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return schedules

# Compare training with different LR schedules
def compare_lr_schedules():
    """Compare training with different learning rate schedules.

    Rebuilds the 1000-sample, 20-feature synthetic classification problem,
    trains one ``BatchGradientDescent`` model (batch size 32, 100 epochs,
    base learning rate 0.1) per schedule and draws a 3x5 grid:

    * row 1 -- train/validation loss per schedule,
    * row 2 -- the learning rate actually used per schedule,
    * row 3 -- combined loss curves, final test accuracy and the first epoch
      at which the validation loss came within 1% of its final value.

    Returns:
        dict: Maps each schedule name to a dict with the keys ``'model'``,
        ``'accuracy'``, ``'final_train_loss'`` and ``'final_val_loss'``.
    """

    # Use same dataset as before
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                              n_redundant=5, n_clusters_per_class=1, random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Fit the scaler on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Column targets line up with the (m, 1) network output
    y_train = y_train.reshape(-1, 1)
    y_val = y_val.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # Define schedulers; the lambdas bind the hyper-parameters so each entry
    # has the (epoch, lr) signature expected by BatchGradientDescent.fit
    schedulers = {
        'Constant': LearningRateSchedulers.constant_lr,
        'Step Decay': lambda epoch, lr: LearningRateSchedulers.step_decay(epoch, lr, 0.7, 25),
        'Exponential': lambda epoch, lr: LearningRateSchedulers.exponential_decay(epoch, lr, 0.98),
        'Cosine': lambda epoch, lr: LearningRateSchedulers.cosine_annealing(epoch, lr, 100),
        'Warmup+Cosine': lambda epoch, lr: LearningRateSchedulers.linear_warmup_cosine_decay(epoch, lr, 10, 100)
    }

    results = {}

    plt.figure(figsize=(20, 15))

    for i, (name, scheduler) in enumerate(schedulers.items()):
        # "\n" (not "\\n") so a real line break is printed
        print(f"\nTraining with {name} LR schedule...")

        # Create and train model
        model = BatchGradientDescent(20, 50, 1, batch_size=32)
        model.fit(X_train_scaled, y_train, X_val_scaled, y_val,
                 epochs=100, learning_rate=0.1, lr_scheduler=scheduler)

        # Make predictions
        test_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, (test_pred > 0.5).astype(int))

        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'final_train_loss': model.train_losses[-1],
            'final_val_loss': model.val_losses[-1]
        }

        # Plot training curves (row 1)
        plt.subplot(3, 5, i+1)
        plt.plot(model.train_losses, label='Train Loss', alpha=0.8)
        plt.plot(model.val_losses, label='Val Loss', alpha=0.8)
        plt.title(f'{name}\nAccuracy: {accuracy:.3f}')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Plot learning rate schedule (row 2, directly below the loss panel)
        plt.subplot(3, 5, i+6)
        plt.plot(model.learning_rates, linewidth=2)
        plt.title(f'{name} LR Schedule')
        plt.xlabel('Epoch')
        plt.ylabel('Learning Rate')
        plt.grid(True, alpha=0.3)

    # Combined loss curves
    plt.subplot(3, 5, 11)
    for name, result in results.items():
        plt.plot(result['model'].train_losses, label=f'{name} Train', alpha=0.7)
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('Training Loss Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.subplot(3, 5, 12)
    for name, result in results.items():
        plt.plot(result['model'].val_losses, label=f'{name} Val', alpha=0.7)
    plt.xlabel('Epoch')
    plt.ylabel('Validation Loss')
    plt.title('Validation Loss Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Summary metrics
    plt.subplot(3, 5, 13)
    names = list(results.keys())
    accuracies = [results[name]['accuracy'] for name in names]

    bars = plt.bar(names, accuracies, alpha=0.7, color='skyblue')
    plt.ylabel('Test Accuracy')
    plt.title('Final Accuracy Comparison')
    plt.xticks(rotation=45)

    for bar, acc in zip(bars, accuracies):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
                f'{acc:.3f}', ha='center', va='bottom')

    # Convergence speed analysis
    plt.subplot(3, 5, 14)
    convergence_epochs = []
    for name, result in results.items():
        # First epoch whose validation loss is within 1% of the final value
        final_val_loss = result['final_val_loss']
        threshold = final_val_loss * 1.01

        # Falls back to the number of recorded epochs if the threshold is never met
        converged_epoch = len(result['model'].val_losses)
        for epoch, val_loss in enumerate(result['model'].val_losses):
            if val_loss <= threshold:
                converged_epoch = epoch
                break

        convergence_epochs.append(converged_epoch)

    bars = plt.bar(names, convergence_epochs, alpha=0.7, color='lightgreen')
    plt.ylabel('Convergence Epoch')
    plt.title('Convergence Speed')
    plt.xticks(rotation=45)

    for bar, epoch in zip(bars, convergence_epochs):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                f'{epoch}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    return results

# Visualize LR schedules first
print("=== Learning Rate Schedules Visualization ===")
lr_schedules = visualize_lr_schedules()

# Compare LR schedules in training
# "\n" (not "\\n") so the headings are preceded by a real blank line
print("\n=== Learning Rate Schedule Comparison ===")
lr_results = compare_lr_schedules()

print("\n=== LR Schedule Results Summary ===")
for name, result in lr_results.items():
    print(f"\n{name}:")
    print(f"  Test Accuracy: {result['accuracy']:.4f}")
    print(f"  Final Train Loss: {result['final_train_loss']:.4f}")
    print(f"  Final Val Loss: {result['final_val_loss']:.4f}")


In [None]:
class OptimizedNeuralNetwork:
    """Two-layer neural network with a selectable optimization algorithm.

    Architecture: ``input -> ReLU hidden layer -> sigmoid output``, trained on
    binary cross-entropy with mini-batches. The ``optimizer`` argument picks
    the parameter update rule: plain SGD, SGD with momentum, RMSprop or Adam.

    Attributes:
        W1, b1, W2, b2: Network parameters.
        vW1, vb1, vW2, vb2: First-moment (momentum) buffers; present for
            ``'momentum'`` and ``'adam'``.
        sW1, sb1, sW2, sb2: Second-moment buffers; present for ``'rmsprop'``
            and ``'adam'``.
        t: Adam step counter used for bias correction; present for ``'adam'``.
        train_losses, val_losses: Per-epoch losses appended by ``fit``.
    """

    # Optimizer names accepted by the constructor / update_parameters
    SUPPORTED_OPTIMIZERS = ('sgd', 'momentum', 'rmsprop', 'adam')
    # Parameter attribute names, in the order gradients are passed around
    _PARAM_NAMES = ('W1', 'b1', 'W2', 'b2')

    def __init__(self, input_size, hidden_size, output_size, optimizer='sgd'):
        """Create the network, its weights and the optimizer state.

        Raises:
            ValueError: If ``optimizer`` is not one of ``SUPPORTED_OPTIMIZERS``.
                An unknown name used to be accepted and turned every
                parameter update into a silent no-op.
        """
        if optimizer not in self.SUPPORTED_OPTIMIZERS:
            raise ValueError(
                f"Unknown optimizer {optimizer!r}; expected one of {self.SUPPORTED_OPTIMIZERS}"
            )

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.optimizer = optimizer

        # He initialization: std = sqrt(2 / fan_in)
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))

        # Optimizer state variables
        self._init_optimizer_state()

        # Training history
        self.train_losses = []
        self.val_losses = []

    def _init_optimizer_state(self):
        """Initialize optimizer-specific state variables"""
        if self.optimizer in ['momentum', 'adam']:
            # Momentum terms: vW1, vb1, vW2, vb2
            for name in self._PARAM_NAMES:
                setattr(self, 'v' + name, np.zeros_like(getattr(self, name)))

        if self.optimizer in ['rmsprop', 'adam']:
            # Second moment terms: sW1, sb1, sW2, sb2
            for name in self._PARAM_NAMES:
                setattr(self, 's' + name, np.zeros_like(getattr(self, name)))

        if self.optimizer == 'adam':
            self.t = 0  # Time step for bias correction

    def _named_gradients(self, dW1, db1, dW2, db2):
        """Pair each gradient with the name of the parameter it belongs to."""
        return zip(self._PARAM_NAMES, (dW1, db1, dW2, db2))

    @staticmethod
    def _as_column(y):
        """Return ``y`` as an array, promoting 1-D targets to shape ``(m, 1)``.

        A ``(m,)`` target would otherwise broadcast against the ``(m, 1)``
        output into an ``(m, m)`` matrix and corrupt loss and gradients.
        """
        y = np.asarray(y)
        return y.reshape(-1, 1) if y.ndim == 1 else y

    def _relu(self, x):
        """ReLU activation."""
        return np.maximum(0, x)

    def _relu_derivative(self, x):
        """Derivative of ReLU (1 where ``x > 0``, else 0)."""
        return (x > 0).astype(float)

    def _sigmoid(self, x):
        """Sigmoid activation; the input is clipped so ``np.exp`` cannot overflow."""
        return 1 / (1 + np.exp(-np.clip(x, -250, 250)))

    def forward(self, X):
        """Run a forward pass and return ``(a2, a1, z1, z2)``."""
        z1 = np.dot(X, self.W1) + self.b1
        a1 = self._relu(z1)
        z2 = np.dot(a1, self.W2) + self.b2
        a2 = self._sigmoid(z2)
        return a2, a1, z1, z2

    def backward(self, X, y, a2, a1, z1):
        """Backpropagate and return batch-averaged ``(dW1, db1, dW2, db2)``.

        ``y`` may be 1-D; it is then treated as a single output column.
        """
        m = X.shape[0]
        y = self._as_column(y)

        # Sigmoid + cross-entropy: the output error term is a2 - y
        dz2 = a2 - y
        dW2 = (1/m) * np.dot(a1.T, dz2)
        db2 = (1/m) * np.sum(dz2, axis=0, keepdims=True)

        da1 = np.dot(dz2, self.W2.T)
        dz1 = da1 * self._relu_derivative(z1)
        dW1 = (1/m) * np.dot(X.T, dz1)
        db1 = (1/m) * np.sum(dz1, axis=0, keepdims=True)

        return dW1, db1, dW2, db2

    def _update_parameters_sgd(self, dW1, db1, dW2, db2, learning_rate):
        """Standard SGD parameter update"""
        for name, grad in self._named_gradients(dW1, db1, dW2, db2):
            param = getattr(self, name)
            param -= learning_rate * grad  # in-place on the stored array

    def _update_parameters_momentum(self, dW1, db1, dW2, db2, learning_rate, beta=0.9):
        """SGD with momentum parameter update.

        The velocity is an exponential moving average of the gradients
        (``v = beta * v + (1 - beta) * g``); parameters step along ``v``.
        """
        for name, grad in self._named_gradients(dW1, db1, dW2, db2):
            velocity = beta * getattr(self, 'v' + name) + (1 - beta) * grad
            setattr(self, 'v' + name, velocity)

            param = getattr(self, name)
            param -= learning_rate * velocity  # in-place on the stored array

    def _update_parameters_rmsprop(self, dW1, db1, dW2, db2, learning_rate, beta=0.999, epsilon=1e-8):
        """RMSprop parameter update.

        NOTE(review): there is no bias correction here, so with ``beta=0.999``
        the first steps are about ``1 / sqrt(1 - beta)`` (roughly 32x) the
        nominal learning rate. RMSprop is commonly run with a decay of
        0.9-0.99; the default is left unchanged to keep results comparable.
        """
        for name, grad in self._named_gradients(dW1, db1, dW2, db2):
            # Update second moment estimate
            sq_avg = beta * getattr(self, 's' + name) + (1 - beta) * grad**2
            setattr(self, 's' + name, sq_avg)

            param = getattr(self, name)
            param -= learning_rate * grad / (np.sqrt(sq_avg) + epsilon)

    def _update_parameters_adam(self, dW1, db1, dW2, db2, learning_rate,
                               beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Adam parameter update with bias-corrected moment estimates."""
        self.t += 1

        # Both moment buffers start at zero; these factors undo that bias
        first_correction = 1 - beta1**self.t
        second_correction = 1 - beta2**self.t

        for name, grad in self._named_gradients(dW1, db1, dW2, db2):
            # Update biased first and second moment estimates
            first = beta1 * getattr(self, 'v' + name) + (1 - beta1) * grad
            second = beta2 * getattr(self, 's' + name) + (1 - beta2) * grad**2
            setattr(self, 'v' + name, first)
            setattr(self, 's' + name, second)

            # Bias correction
            first_hat = first / first_correction
            second_hat = second / second_correction

            param = getattr(self, name)
            param -= learning_rate * first_hat / (np.sqrt(second_hat) + epsilon)

    def update_parameters(self, dW1, db1, dW2, db2, learning_rate):
        """Update parameters using selected optimizer.

        Raises:
            ValueError: If ``self.optimizer`` is not a supported name.
        """
        if self.optimizer == 'sgd':
            self._update_parameters_sgd(dW1, db1, dW2, db2, learning_rate)
        elif self.optimizer == 'momentum':
            self._update_parameters_momentum(dW1, db1, dW2, db2, learning_rate)
        elif self.optimizer == 'rmsprop':
            self._update_parameters_rmsprop(dW1, db1, dW2, db2, learning_rate)
        elif self.optimizer == 'adam':
            self._update_parameters_adam(dW1, db1, dW2, db2, learning_rate)
        else:
            # Reached only if the attribute was changed after construction
            raise ValueError(
                f"Unknown optimizer {self.optimizer!r}; expected one of {self.SUPPORTED_OPTIMIZERS}"
            )

    def compute_loss(self, X, y):
        """Binary cross-entropy of the current parameters on ``(X, y)``."""
        a2, _, _, _ = self.forward(X)
        y = self._as_column(y)
        # The 1e-15 offset keeps log() finite for saturated predictions
        loss = -np.mean(y * np.log(a2 + 1e-15) + (1 - y) * np.log(1 - a2 + 1e-15))
        return loss

    def fit(self, X_train, y_train, X_val, y_val, epochs=100, learning_rate=0.01, batch_size=32):
        """Train the network.

        Each epoch reshuffles the training data, performs one optimizer step
        per mini-batch, then records the loss on the full training and
        validation sets.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1. A negative value
                would otherwise skip every update without any error.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        m = X_train.shape[0]

        for epoch in range(epochs):
            # Shuffle data
            indices = np.random.permutation(m)
            X_shuffled = X_train[indices]
            y_shuffled = y_train[indices]

            # Mini-batch training (slicing past the end is safe for the last batch)
            for start in range(0, m, batch_size):
                batch_X = X_shuffled[start:start + batch_size]
                batch_y = y_shuffled[start:start + batch_size]

                # Forward pass
                a2, a1, z1, _ = self.forward(batch_X)

                # Backward pass
                dW1, db1, dW2, db2 = self.backward(batch_X, batch_y, a2, a1, z1)

                # Update parameters
                self.update_parameters(dW1, db1, dW2, db2, learning_rate)

            # Compute losses
            train_loss = self.compute_loss(X_train, y_train)
            val_loss = self.compute_loss(X_val, y_val)

            self.train_losses.append(train_loss)
            self.val_losses.append(val_loss)

            if epoch % 20 == 0:
                print(f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    def predict(self, X):
        """Return the sigmoid outputs (probabilities) for ``X``."""
        a2, _, _, _ = self.forward(X)
        return a2

# Compare optimization algorithms
def compare_optimizers():
    """Compare different optimization algorithms.

    Builds a harder synthetic classification problem (2000 samples, 50
    features, two clusters per class), trains one ``OptimizedNeuralNetwork``
    per optimizer (100 epochs, learning rate 0.01, batch size 32) and draws a
    2x4 grid: individual loss curves on the top row; combined training and
    validation curves, final test accuracy and average loss decrease per
    epoch on the bottom row.

    Returns:
        dict: Maps each optimizer display name to a dict with the keys
        ``'model'``, ``'accuracy'``, ``'final_train_loss'`` and
        ``'final_val_loss'``.
    """

    # Generate challenging dataset
    X, y = make_classification(n_samples=2000, n_features=50, n_informative=30,
                              n_redundant=20, n_clusters_per_class=2,
                              class_sep=0.8, random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Fit the scaler on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Column targets line up with the (m, 1) network output
    y_train = y_train.reshape(-1, 1)
    y_val = y_val.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # Test different optimizers
    optimizers = ['sgd', 'momentum', 'rmsprop', 'adam']
    optimizer_names = ['SGD', 'SGD + Momentum', 'RMSprop', 'Adam']

    results = {}

    plt.figure(figsize=(20, 12))

    for i, (optimizer, name) in enumerate(zip(optimizers, optimizer_names)):
        # "\n" (not "\\n") so a real line break is printed
        print(f"\nTraining with {name} optimizer...")

        # Create and train model
        model = OptimizedNeuralNetwork(50, 100, 1, optimizer=optimizer)
        model.fit(X_train_scaled, y_train, X_val_scaled, y_val,
                 epochs=100, learning_rate=0.01, batch_size=32)

        # Make predictions
        test_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, (test_pred > 0.5).astype(int))

        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'final_train_loss': model.train_losses[-1],
            'final_val_loss': model.val_losses[-1]
        }

        # Plot training curves
        plt.subplot(2, 4, i+1)
        plt.plot(model.train_losses, label='Train Loss', alpha=0.8)
        plt.plot(model.val_losses, label='Val Loss', alpha=0.8)
        plt.title(f'{name}\nAccuracy: {accuracy:.3f}')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True, alpha=0.3)

    # Combined comparison plots
    plt.subplot(2, 4, 5)
    for name, result in results.items():
        plt.plot(result['model'].train_losses, label=name, alpha=0.8, linewidth=2)
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('Training Loss Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.subplot(2, 4, 6)
    for name, result in results.items():
        plt.plot(result['model'].val_losses, label=name, alpha=0.8, linewidth=2)
    plt.xlabel('Epoch')
    plt.ylabel('Validation Loss')
    plt.title('Validation Loss Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Final metrics comparison
    plt.subplot(2, 4, 7)
    names = list(results.keys())
    accuracies = [results[name]['accuracy'] for name in names]

    x_pos = np.arange(len(names))
    plt.bar(x_pos, accuracies, alpha=0.7, color='skyblue')
    plt.xlabel('Optimizer')
    plt.ylabel('Test Accuracy')
    plt.title('Final Accuracy Comparison')
    plt.xticks(x_pos, names, rotation=45)

    for bar_index, acc in enumerate(accuracies):
        plt.text(bar_index, acc + 0.005, f'{acc:.3f}', ha='center', va='bottom')

    # Convergence analysis
    plt.subplot(2, 4, 8)
    convergence_rates = []
    for name, result in results.items():
        # Average decrease in training loss per recorded epoch
        loss_history = np.array(result['model'].train_losses)
        initial_loss = loss_history[0]
        final_loss = loss_history[-1]
        convergence_rate = (initial_loss - final_loss) / len(loss_history)
        convergence_rates.append(convergence_rate)

    bars = plt.bar(names, convergence_rates, alpha=0.7, color='lightgreen')
    plt.xlabel('Optimizer')
    plt.ylabel('Convergence Rate')
    plt.title('Convergence Speed')
    plt.xticks(rotation=45)

    for bar, rate in zip(bars, convergence_rates):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.0001,
                f'{rate:.4f}', ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.show()

    return results

# Run optimizer comparison
print("=== Optimization Algorithms Comparison ===")
optimizer_results = compare_optimizers()

# "\n" (not "\\n") so the headings are preceded by a real blank line
print("\n=== Optimizer Results Summary ===")
for name, result in optimizer_results.items():
    print(f"\n{name}:")
    print(f"  Test Accuracy: {result['accuracy']:.4f}")
    print(f"  Final Train Loss: {result['final_train_loss']:.4f}")
    print(f"  Final Val Loss: {result['final_val_loss']:.4f}")

    # Calculate convergence metrics: total and per-epoch drop in training loss
    train_losses = np.array(result['model'].train_losses)
    improvement = train_losses[0] - train_losses[-1]
    print(f"  Total Loss Improvement: {improvement:.4f}")
    print(f"  Convergence Rate: {improvement / len(train_losses):.6f} per epoch")
