# Calculus & Optimization Fundamentals for Machine Learning

This notebook covers the essential calculus and optimization concepts needed for ML/AI:
1. **Derivatives**: Partial derivatives, chain rule, gradients
2. **Optimization**: Gradient descent, local/global minima
3. **Multivariable Calculus**: Gradients, Hessians, Taylor series
4. **Constrained Optimization**: Lagrange multipliers

## Why Calculus Matters in ML
- **Optimization**: Finding best parameters for models
- **Backpropagation**: Training neural networks
- **Loss minimization**: Core of all learning algorithms
- **Convergence analysis**: Understanding when algorithms work
- **Feature engineering**: Understanding function behavior

In [None]:
# Import essential libraries
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from scipy.optimize import minimize
from scipy import optimize
import sympy as sp
from sympy import symbols, diff, Matrix, hessian
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: apply the seaborn-flavoured matplotlib theme first, then
# layer the "husl" colour palette on top of it (the order matters, because
# the palette replaces the colour cycle installed by the style sheet)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirm the imports worked and report the versions of the core libraries
print("📚 Libraries imported successfully!")
for library_label, library in (("NumPy", np), ("SymPy", sp)):
    print(f"{library_label} version: {library.__version__}")

# 1. Derivatives: The Foundation of Optimization

## What is a Derivative?
A derivative measures **how a function changes** when its input changes slightly.

**Mathematical Definition**:
$$f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}$$

**Geometric Interpretation**: Slope of the tangent line

**ML Interpretation**: 
- **Direction**: Which way to move parameters
- **Magnitude**: How much to change parameters
- **Optimization**: Follow negative gradient to minimize

In [None]:
# Visualizing derivatives as slopes
def f(x):
    """Evaluate the quadratic x² + 2x + 1 at x (a scalar or a NumPy array)."""
    squared_term = x**2
    linear_term = 2*x
    return squared_term + linear_term + 1

def f_derivative(x):
    """Evaluate 2x + 2, the derivative of x² + 2x + 1, at x."""
    doubled = 2*x
    return doubled + 2

# Dense grid over which the curve is drawn
x = np.linspace(-3, 2, 1000)
y = f(x)

# x-positions at which a tangent line is illustrated
points = [-2, -1, 0, 1]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# ---- Left panel: the function together with its tangent lines ----
ax1.plot(x, y, 'b-', linewidth=2, label='f(x) = x² + 2x + 1')

half_width = 0.5  # every tangent segment spans x0 ± half_width
for x0 in points:
    # Height of the curve and its slope at the tangency point
    y0, m = f(x0), f_derivative(x0)

    # Point-slope form of the tangent line: y = y0 + m (x - x0)
    seg_x = np.linspace(x0 - half_width, x0 + half_width, 100)
    seg_y = m * (seg_x - x0) + y0

    # Tangency point, tangent segment, and a label showing the slope
    ax1.plot(x0, y0, 'ro', markersize=8)
    ax1.plot(seg_x, seg_y, 'r--', alpha=0.7,
             label=f'Tangent at x={x0}, slope={m:.1f}')
    ax1.annotate(f'slope = {m:.1f}',
                 xy=(x0, y0), xytext=(x0 + 0.3, y0 + 0.5),
                 arrowprops={'arrowstyle': '->', 'color': 'red', 'alpha': 0.7},
                 fontsize=10, ha='center')

ax1.set(xlabel='x', ylabel='f(x)')
ax1.set_title('Function and Tangent Lines (Derivatives)', fontweight='bold')
ax1.grid(True, alpha=0.3)
# Legend is placed outside the axes so it does not cover the curve
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# ---- Right panel: the derivative as a function of x ----
ax2.plot(x, f_derivative(x), 'g-', linewidth=2, label="f'(x) = 2x + 2")
ax2.axhline(y=0, color='k', linestyle='--', alpha=0.5)
ax2.axvline(x=-1, color='r', linestyle='--', alpha=0.5,
            label="Critical point (f'(x)=0)")

# The same sample points, now shown as (x, slope) pairs on the derivative
for x0 in points:
    m = f_derivative(x0)
    ax2.plot(x0, m, 'ro', markersize=8)
    ax2.annotate(f'({x0}, {m})',
                 xy=(x0, m), xytext=(x0 + 0.2, m + 0.3),
                 fontsize=10, ha='center')

ax2.set(xlabel='x', ylabel="f'(x)")
ax2.set_title('Derivative Function', fontweight='bold')
ax2.grid(True, alpha=0.3)
ax2.legend()

plt.tight_layout()
plt.show()

# Text summary of what the two panels show
for summary_line in (
    "Key Insights:",
    "• Derivative = 0 at minimum (x = -1)",
    "• Negative derivative → function decreasing",
    "• Positive derivative → function increasing",
    "• Steeper slope → larger derivative magnitude",
):
    print(summary_line)

## Partial Derivatives

For functions of **multiple variables** f(x, y), partial derivatives measure how the function changes with respect to **one variable while keeping others constant**.

**Notation**:
- $\frac{\partial f}{\partial x}$: Partial derivative with respect to x
- $\frac{\partial f}{\partial y}$: Partial derivative with respect to y

**ML Context**: Most loss functions depend on many parameters!

In [None]:
# Partial derivatives example
# Define symbolic variables
x, y = symbols('x y')

# Define a function of two variables.
# The cross term is x*y (not 2*x*y) on purpose: with 2*x*y the quadratic part
# collapses to (x + y)**2, the partials become 2x + 2y + 3 and 2x + 2y + 4,
# and those can never be zero at the same time -- there would be no critical
# point to find or plot. With x*y the Hessian [[2, 1], [1, 2]] is positive
# definite, so f has exactly one critical point and it is a minimum.
f_xy = x**2 + x*y + y**2 + 3*x + 4*y + 5

print("Function: f(x,y) =", f_xy)
print()

# Calculate partial derivatives
df_dx = diff(f_xy, x)
df_dy = diff(f_xy, y)

print("Partial Derivatives:")
print(f"∂f/∂x = {df_dx}")
print(f"∂f/∂y = {df_dy}")
print()

# Find critical points (where both partial derivatives = 0).
# dict=True always yields a list of {symbol: value} dicts, so an empty list
# unambiguously means "no critical point" and can be reported clearly instead
# of failing later with an obscure indexing error.
solutions = sp.solve([df_dx, df_dy], [x, y], dict=True)
if not solutions:
    raise ValueError("f(x, y) has no critical point: ∂f/∂x = 0 and ∂f/∂y = 0 "
                     "have no common solution")
critical_points = solutions[0]
print(f"Critical point: {critical_points}")

# Convert to numerical functions for plotting
f_numeric = sp.lambdify([x, y], f_xy, 'numpy')
df_dx_numeric = sp.lambdify([x, y], df_dx, 'numpy')
df_dy_numeric = sp.lambdify([x, y], df_dy, 'numpy')

# Create mesh for 3D plotting
x_vals = np.linspace(-3, 1, 50)
y_vals = np.linspace(-3, 1, 50)
X, Y = np.meshgrid(x_vals, y_vals)
Z = f_numeric(X, Y)

# Create subplots
fig = plt.figure(figsize=(18, 5))

# 3D surface plot
ax1 = fig.add_subplot(131, projection='3d')
surface = ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)

# Mark critical point (exact SymPy rationals converted to floats for plotting)
cp_x, cp_y = float(critical_points[x]), float(critical_points[y])
cp_z = f_numeric(cp_x, cp_y)
ax1.scatter([cp_x], [cp_y], [cp_z], color='red', s=100, label='Critical Point')

ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x,y)')
ax1.set_title('Function f(x,y)', fontweight='bold')

# Contour plot with descent-direction field
ax2 = fig.add_subplot(132)
contour = ax2.contour(X, Y, Z, levels=20)
ax2.clabel(contour, inline=True, fontsize=8)

# Evaluate the gradient on a coarser grid (every 5th point) for visibility
step = 5
dx = df_dx_numeric(X[::step, ::step], Y[::step, ::step])
dy = df_dy_numeric(X[::step, ::step], Y[::step, ::step])

# The arrows are drawn as -∇f, i.e. the direction gradient descent would move,
# so they point toward the minimum; the title says so explicitly.
ax2.quiver(X[::step, ::step], Y[::step, ::step], -dx, -dy,
           alpha=0.6, scale=50, width=0.003, color='red')

ax2.plot(cp_x, cp_y, 'ro', markersize=10, label='Critical Point')
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('Contour Plot + Negative Gradient Field', fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Partial derivative cross-sections
ax3 = fig.add_subplot(133)

# Fix y at critical point and plot f vs x
x_line = np.linspace(-3, 1, 100)
f_x_fixed_y = f_numeric(x_line, cp_y)
ax3.plot(x_line, f_x_fixed_y, 'b-', label=f'f(x, {cp_y:.1f})', linewidth=2)

# Fix x at critical point and plot f vs y
y_line = np.linspace(-3, 1, 100)
f_y_fixed_x = f_numeric(cp_x, y_line)
ax3.plot(y_line, f_y_fixed_x, 'g-', label=f'f({cp_x:.1f}, y)', linewidth=2)

# The horizontal axis is x for the blue curve and y for the green one, so the
# minimum sits at cp_x on the blue curve and at cp_y on the green curve.
ax3.plot(cp_x, cp_z, 'ro', markersize=10, label='Minimum')
ax3.plot(cp_y, cp_z, 'ro', markersize=10)
ax3.set_xlabel('x or y')
ax3.set_ylabel('f')
ax3.set_title('Cross-sections through Critical Point', fontweight='bold')
ax3.legend()
ax3.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Numerical sanity check: both partials should vanish at the critical point
print(f"\nAt critical point ({cp_x:.1f}, {cp_y:.1f}):")
print(f"∂f/∂x = {df_dx_numeric(cp_x, cp_y):.10f}")
print(f"∂f/∂y = {df_dy_numeric(cp_x, cp_y):.10f}")
print(f"Function value = {cp_z:.2f}")

## Chain Rule: The Backbone of Backpropagation

The chain rule tells us how to find derivatives of **composite functions**.

**Single Variable**:
If $y = f(g(x))$, then $\frac{dy}{dx} = f'(g(x)) \cdot g'(x)$

**Multivariable**:
If $z = f(x, y)$ where $x = x(t)$ and $y = y(t)$, then:
$$\frac{dz}{dt} = \frac{\partial z}{\partial x}\frac{dx}{dt} + \frac{\partial z}{\partial y}\frac{dy}{dt}$$

**ML Context**: This is exactly how backpropagation works in neural networks!

In [None]:
# Chain rule demonstration
print("🔗 CHAIN RULE EXAMPLES")
print("=" * 50)

# Example 1: Simple composition
print("\n1. Single Variable Chain Rule")
print("-" * 30)

# Define symbolic functions
t = symbols('t')

# Let u = t^2 + 1, and y = sin(u)
u = t**2 + 1
y = sp.sin(u)

print(f"u(t) = {u}")
print(f"y(u) = sin(u)")
print(f"Composite: y(t) = {y}")
print()

# Calculate derivatives: once by multiplying the two factors of the chain
# rule by hand, once by letting SymPy differentiate the composite directly
du_dt = diff(u, t)
dy_du = sp.cos(u)  # derivative of sin(u), evaluated at u = t^2 + 1
dy_dt_chain = dy_du * du_dt
dy_dt_direct = diff(y, t)

print(f"du/dt = {du_dt}")
print(f"dy/du = {dy_du}")
print(f"dy/dt (chain rule) = (dy/du)(du/dt) = {dy_dt_chain}")
print(f"dy/dt (direct) = {dy_dt_direct}")
print(f"Same result? {sp.simplify(dy_dt_chain - dy_dt_direct) == 0}")

# Example 2: Neural Network Layer
print("\n\n2. Neural Network Example")
print("-" * 30)

# Simple neural network: z = σ(wx + b) where σ(x) = 1/(1 + e^(-x))
w, x_var, b = symbols('w x b')

# Linear combination
linear = w * x_var + b

# Sigmoid activation
sigmoid = 1 / (1 + sp.exp(-linear))

print(f"Linear: z = {linear}")
print(f"Output: a = σ(z) = {sigmoid}")
print()

# Calculate gradients (what backprop computes!)
da_dw = diff(sigmoid, w)
da_db = diff(sigmoid, b)
da_dx = diff(sigmoid, x_var)

print("Gradients (for backpropagation):")
print(f"∂a/∂w = {da_dw}")
print(f"∂a/∂b = {da_db}")
print(f"∂a/∂x = {da_dx}")

# Show how chain rule breaks down the computation.
# SymPy can only differentiate with respect to a symbol, not a compound
# expression such as w*x + b (that raises ValueError). So σ is written in
# terms of a placeholder symbol z, differentiated w.r.t. z, and only then
# is z = wx + b substituted back in.
z = symbols('z')
sigmoid_of_z = 1 / (1 + sp.exp(-z))
da_dz = diff(sigmoid_of_z, z).subs(z, linear)  # σ'(z) evaluated at z = wx + b
dz_dw = diff(linear, w)                        # derivative of linear w.r.t. w

print(f"\nChain rule breakdown:")
print(f"∂a/∂z = {da_dz}")
print(f"∂z/∂w = {dz_dw}")
print(f"∂a/∂w = (∂a/∂z)(∂z/∂w) = {sp.simplify(da_dz * dz_dw)}")
# Same cross-check as Example 1: the product must equal the direct gradient
print(f"Same result? {sp.simplify(da_dz * dz_dw - da_dw) == 0}")

In [None]:
# Visualizing chain rule in action
def sigmoid_func(x):
    """Logistic sigmoid 1 / (1 + e^(-x)).

    The input is first clamped to [-500, 500] so that np.exp is never asked
    to exponentiate a value large enough to overflow.
    """
    bounded = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-bounded))

def sigmoid_derivative(x):
    """Derivative of the sigmoid at x, computed as σ(x) · (1 − σ(x))."""
    activation = sigmoid_func(x)
    complement = 1 - activation
    return activation * complement

def _style_panel(ax, xlabel, ylabel, title):
    """Give one subplot its axis labels, bold title, faint grid and legend."""
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.legend()


# 2x2 figure: the three stages of the forward pass plus the gradients
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_linear, ax_sigmoid), (ax_composed, ax_grads) = axes

# Weight, bias and input range of the single-neuron "layer"
w_val = 2.0
b_val = -1.0
x_range = np.linspace(-2, 3, 1000)

# Forward pass: pre-activation z = wx + b, then activation a = σ(z)
z_vals = w_val * x_range + b_val
a_vals = sigmoid_func(z_vals)

# Panel 1: input x -> linear output z
ax_linear.plot(x_range, z_vals, 'b-', linewidth=2, label=f'z = {w_val}x + {b_val}')
_style_panel(ax_linear, 'Input x', 'Linear output z',
             'Step 1: Linear Transformation')

# Panel 2: the sigmoid itself, drawn over its own symmetric z-range
z_plot_range = np.linspace(-6, 6, 1000)
a_plot_vals = sigmoid_func(z_plot_range)
ax_sigmoid.plot(z_plot_range, a_plot_vals, 'g-', linewidth=2, label='a = σ(z)')
_style_panel(ax_sigmoid, 'Linear output z', 'Activation a',
             'Step 2: Sigmoid Activation')

# Panel 3: the composed map x -> a
ax_composed.plot(x_range, a_vals, 'r-', linewidth=2, label='a = σ(wx + b)')
_style_panel(ax_composed, 'Input x', 'Final output a',
             'Complete Transformation')

# Panel 4: the two chain-rule factors and their product
da_dz_vals = sigmoid_derivative(z_vals)       # ∂a/∂z
dz_dx_vals = np.full_like(x_range, w_val)     # ∂z/∂x = w, constant in x
da_dx_vals = da_dz_vals * dz_dx_vals          # ∂a/∂x = (∂a/∂z)(∂z/∂x)

ax_grads.plot(x_range, da_dz_vals, 'orange', linewidth=2,
              label='∂a/∂z (sigmoid derivative)')
ax_grads.axhline(y=w_val, color='blue', linestyle='--', linewidth=2,
                 label=f'∂z/∂x = {w_val}')
ax_grads.plot(x_range, da_dx_vals, 'red', linewidth=2,
              label='∂a/∂x (chain rule)')
_style_panel(ax_grads, 'Input x', 'Gradient values',
             'Gradients via Chain Rule')

plt.tight_layout()
plt.show()

# Text summary tying the plots back to backpropagation
for summary_line in (
    "🧠 Neural Network Insights:",
    "• Forward pass: x → z = wx + b → a = σ(z)",
    "• Backward pass: ∂a/∂x = (∂a/∂z)(∂z/∂x) = σ'(z) × w",
    "• Chain rule enables efficient gradient computation",
    "• This is exactly how backpropagation works!",
):
    print(summary_line)

## Gradients: The Direction of Steepest Ascent

The **gradient** is a vector containing all partial derivatives of a function.

**Definition**:
$$\nabla f(x, y) = \begin{bmatrix} \frac{\partial f}{\partial x} \\ \frac{\partial f}{\partial y} \end{bmatrix}$$

**Key Properties**:
- Points in direction of **steepest increase**
- **Magnitude** indicates how steep the increase is
- **Perpendicular** to level curves/contours
- **Zero gradient** = critical point (local min/max/saddle)

**ML Applications**:
- **Gradient descent**: Move opposite to gradient
- **Sensitivity analysis**: A large partial derivative means the output is highly sensitive to that input or parameter
- **Optimization**: Follow gradients to extrema

In [None]:
# Comprehensive gradient visualization
def create_test_function(func_type='bowl'):
    """Return a two-variable test function f(x, y) for gradient analysis.

    Args:
        func_type: Name of the surface to build. One of
            'bowl'       -- x² + y² + 0.5xy
            'saddle'     -- x² − y²
            'rosenbrock' -- (1 − x)² + 100(y − x²)²
            'peaks'      -- sum of three exponentially damped terms

    Returns:
        A callable taking (x, y) and returning the function value. The
        'peaks' surface calls np.exp; the others use plain arithmetic only.

    Raises:
        ValueError: If func_type is not one of the names above. Failing here
            avoids handing back None, which would only blow up later as
            "'NoneType' object is not callable" at the call site.
    """
    test_functions = {
        'bowl': lambda x, y: x**2 + y**2 + 0.5*x*y,
        'saddle': lambda x, y: x**2 - y**2,
        'rosenbrock': lambda x, y: (1 - x)**2 + 100*(y - x**2)**2,
        'peaks': lambda x, y: (
            3*(1-x)**2 * np.exp(-(x**2) - (y+1)**2)
            - 10*(x/5 - x**3 - y**5) * np.exp(-x**2-y**2)
            - (1/3)*np.exp(-(x+1)**2 - y**2)
        ),
    }
    if func_type not in test_functions:
        supported = ', '.join(repr(key) for key in test_functions)
        raise ValueError(
            f"Unknown func_type {func_type!r}; expected one of: {supported}"
        )
    return test_functions[func_type]

def numerical_gradient(f, x, y, h=1e-5):
    """Approximate the gradient of f at (x, y) with central differences.

    Each component is (f(p + h) − f(p − h)) / (2h), with the step h applied
    to one coordinate at a time.

    Returns:
        A tuple (∂f/∂x, ∂f/∂y) of the two approximated partial derivatives.
    """
    # Vary x only, holding y fixed
    x_forward, x_backward = f(x + h, y), f(x - h, y)
    # Vary y only, holding x fixed
    y_forward, y_backward = f(x, y + h), f(x, y - h)

    partial_x = (x_forward - x_backward) / (2 * h)
    partial_y = (y_forward - y_backward) / (2 * h)
    return partial_x, partial_y

# Test different function types: panel title -> create_test_function key
functions = {
    'Convex Bowl': 'bowl',
    'Saddle Point': 'saddle',
    'Rosenbrock': 'rosenbrock'
}

# Known critical point of each surface: (location, marker style, legend label)
critical_markers = {
    'bowl': ((0, 0), 'ro', 'Minimum'),               # x² + y² + 0.5xy
    'saddle': ((0, 0), 'bo', 'Saddle Point'),        # x² - y²
    'rosenbrock': ((1, 1), 'go', 'Global Minimum'),  # Rosenbrock minimum
}

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for i, (name, func_type) in enumerate(functions.items()):
    ax = axes[i]
    f = create_test_function(func_type)

    # Create coordinate grids (Rosenbrock gets a window around its valley)
    if func_type == 'rosenbrock':
        x_range = np.linspace(-2, 2, 50)
        y_range = np.linspace(-1, 3, 50)
    else:
        x_range = np.linspace(-3, 3, 50)
        y_range = np.linspace(-3, 3, 50)

    X, Y = np.meshgrid(x_range, y_range)
    Z = f(X, Y)

    # Contour lines on top of a faint filled contour
    contour = ax.contour(X, Y, Z, levels=15, alpha=0.6)
    ax.contourf(X, Y, Z, levels=15, alpha=0.3, cmap='viridis')

    # Sample every 5th grid point for the gradient field
    step = 5
    X_sample = X[::step, ::step]
    Y_sample = Y[::step, ::step]

    # The test functions are elementwise, so the central differences can be
    # evaluated on the whole sample grid in one vectorized call instead of a
    # Python loop over every (row, col) pair.
    grad_x, grad_y = numerical_gradient(f, X_sample, Y_sample)

    # Plot gradient vectors
    ax.quiver(X_sample, Y_sample, grad_x, grad_y,
              angles='xy', scale_units='xy', scale=None,
              color='red', alpha=0.7, width=0.003)

    # Mark the critical point (where gradient ≈ 0) when it is known
    if func_type in critical_markers:
        (crit_x, crit_y), marker_fmt, marker_label = critical_markers[func_type]
        ax.plot(crit_x, crit_y, marker_fmt, markersize=10, label=marker_label)

    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title(f'{name}\nGradient Field', fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_aspect('equal')

plt.tight_layout()
plt.show()

print("🎯 Gradient Insights:")
print("• Red arrows show gradient direction (steepest ascent)")
print("• To minimize: move OPPOSITE to gradient (gradient descent)")
print("• Arrow length ∝ gradient magnitude (steepness)")
print("• Zero gradient = critical point (min/max/saddle)")
print("• Gradients perpendicular to contour lines")