In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Title banner for the whole exercise: a 70-character rule above and below.
print("=" * 70, "PROBLEM 6: RIDGE REGRESSION", "=" * 70, sep="\n")

# ============================================
# HELPER FUNCTIONS
# ============================================

def predict(X, theta):
    """
    Evaluate the linear model for every row of ``X``.

    Parameters:
    -----------
    X : array (n_samples, n_features)
        Feature matrix; include a column of ones if ``theta`` has an intercept
    theta : array (n_features,)
        Model coefficients

    Returns:
    --------
    array (n_samples,)
        The matrix product ``X @ theta``
    """
    y_hat = X @ theta
    return y_hat


def compute_mse(y_true, y_pred):
    """
    Mean squared error between targets and predictions.

    Parameters:
    -----------
    y_true : array (n_samples,)
        Observed target values
    y_pred : array (n_samples,)
        Predicted target values

    Returns:
    --------
    float
        Average of the squared element-wise differences
    """
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))


def compute_r2(y_true, y_pred):
    """
    Compute the R-squared (coefficient of determination) score.

    R^2 = 1 - SS_res / SS_tot, where SS_res is the residual sum of squares
    and SS_tot is the total sum of squares of ``y_true`` around its mean.

    Parameters:
    -----------
    y_true : array-like (n_samples,)
        Observed target values
    y_pred : array-like (n_samples,)
        Predicted target values

    Returns:
    --------
    float
        R-squared score. When ``y_true`` is constant (SS_tot == 0) the ratio
        is undefined; in that case 1.0 is returned for a perfect fit and 0.0
        otherwise, instead of nan / -inf.
    """
    # Accept lists and other array-likes, not just ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)

    # Constant targets: avoid 0/0 (nan) and x/0 (-inf) with a finite convention.
    if ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0

    return 1 - (ss_res / ss_tot)


def gradient_descent_ridge(X, y, alpha, num_iterations, lambda_reg):
    """
    Fit ridge regression by batch gradient descent.

    Cost function actually minimized (the intercept theta[0] is NOT penalized):
        J(theta) = (1/n) * [ sum((X*theta - y)^2) + lambda * sum_{j>=1}(theta_j^2) ]

    Gradient:
        dJ/d(theta)    = (2/n) * X^T * (X*theta - y)
        dJ/d(theta_j) += (2*lambda/n) * theta_j        for j >= 1

    Both terms share the 1/n factor, so the minimizer is the same as for
    sum((X*theta - y)^2) + lambda * sum_{j>=1}(theta_j^2), i.e. it solves
    (X^T*X + lambda*D) * theta = X^T*y with D = diag(0, 1, ..., 1).

    Parameters:
    -----------
    X : array-like (n_samples, n_features)
        Feature matrix whose first column is the intercept column of ones
    y : array-like (n_samples,) or (n_samples, 1)
        Target values; flattened to 1-D before use
    alpha : float
        Learning rate
    num_iterations : int
        Number of gradient steps; 0 returns the all-zero starting point
    lambda_reg : float
        Regularization parameter (lambda)

    Returns:
    --------
    theta : array (n_features,)
        Optimized parameters

    Raises:
    -------
    ValueError
        If X is not 2-D, has no rows, or its row count differs from len(y)
    """
    X = np.asarray(X, dtype=float)
    # Flatten y: an (n, 1) column would otherwise broadcast against the (n,)
    # predictions into an (n, n) error matrix and silently corrupt theta.
    y = np.asarray(y, dtype=float).ravel()

    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
    n_samples, n_features = X.shape
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X has {n_samples} samples but y has {y.shape[0]} values"
        )

    theta = np.zeros(n_features)

    # Loop-invariant scale factors.
    mse_scale = 2 / n_samples
    penalty_scale = 2 * lambda_reg / n_samples

    for _ in range(num_iterations):
        # Residuals of the current fit
        errors = X @ theta - y

        # Gradient of the MSE term; parenthesized so the scalar multiplies the
        # small (n_features,) product rather than the whole X^T matrix.
        gradient = mse_scale * (X.T @ errors)

        # Ridge penalty for every coefficient EXCEPT the intercept theta[0]
        gradient[1:] += penalty_scale * theta[1:]

        # Gradient step
        theta = theta - alpha * gradient

    return theta


# ============================================
# PART 1: ANALYTICAL SOLUTION
# ============================================
# Section header for Part 1
part1_rule = "=" * 70
print("\n" + part1_rule)
print("PART 1: CLOSED-FORM SOLUTION DERIVATION")
print(part1_rule)

# Step-by-step derivation of the ridge normal equations, printed verbatim.
ridge_derivation_text = """
Ridge Regression Loss Function:
  J(theta) = sum[(y_pred - y)^2] + lambda * sum[theta_j^2]

Matrix form:
  J(theta) = (X*theta - y)^T * (X*theta - y) + lambda * theta^T * theta

Expand:
  J(theta) = theta^T*X^T*X*theta - 2*theta^T*X^T*y + y^T*y + lambda*theta^T*theta

Take gradient with respect to theta:
  dJ/d(theta) = 2*X^T*X*theta - 2*X^T*y + 2*lambda*theta

Set gradient to zero:
  2*X^T*X*theta - 2*X^T*y + 2*lambda*theta = 0
  X^T*X*theta + lambda*theta = X^T*y
  (X^T*X + lambda*I)*theta = X^T*y

CLOSED-FORM SOLUTION:
  theta = (X^T*X + lambda*I)^(-1) * X^T * y

Where I is the identity matrix of size (n_features x n_features).
"""
print(ridge_derivation_text)


# ============================================
# PART 2: IMPLEMENT RIDGE REGRESSION
# ============================================
# Section header for Part 2
print("\n" + "="*70)
print("PART 2: GRADIENT DESCENT IMPLEMENTATION")
print("="*70)
print("\nImplemented gradient_descent_ridge() function")
print("Key features:")
# The implementation scales the penalty by 1/n and skips the intercept, so the
# term added to the gradient is (2*lambda/n)*theta[1:], not 2*lambda*theta.
print("  - Adds regularization term to gradient: (2*lambda/n)*theta[1:]")
print("  - Does NOT regularize intercept (theta[0])")
print("  - Uses learning rate alpha and runs for num_iterations")


# ============================================
# PART 3: SIMULATE DATA
# ============================================
# Section header for Part 3
print("\n" + "="*70)
print("PART 3: SIMULATED DATA EXPERIMENT")
print("="*70)

# Set random seed for reproducibility
np.random.seed(42)

# Generate data: Y = 1 + 2*X + noise
N = 1000
X_sim = np.random.uniform(-2, 2, N)
# np.random.normal takes the STANDARD DEVIATION, so sqrt(2) gives variance 2.
noise = np.random.normal(0, np.sqrt(2), N)  # noise ~ N(0, 2)
Y_sim = 1 + 2 * X_sim + noise

print("\nSimulated Data:")
print(f"  Sample size: N = {N}")
print("  True relationship: Y = 1 + 2*X + noise")
print("  True intercept: 1")
print("  True slope: 2")
# State variance and std explicitly; "N(0, sqrt(2))" was ambiguous about which
# of the two sqrt(2) referred to.
print("  Noise: N(0, 2)  (variance = 2, std = sqrt(2))")
print("\nData statistics:")
print(f"  X range: [{X_sim.min():.2f}, {X_sim.max():.2f}]")
print(f"  Y range: [{Y_sim.min():.2f}, {Y_sim.max():.2f}]")
print(f"  Y mean: {Y_sim.mean():.2f}")

# Prepare data with intercept: first column of ones, second column the feature
X_sim_with_int = np.c_[np.ones(N), X_sim]

print(f"  Feature matrix shape: {X_sim_with_int.shape}")


# ============================================
# PART 4: TEST DIFFERENT LAMBDA VALUES
# ============================================
# Section header for Part 4
print("\n" + "=" * 70)
print("PART 4: RIDGE REGRESSION WITH DIFFERENT LAMBDA VALUES")
print("=" * 70)

# Regularization strengths to sweep over
lambdas = [0, 1, 10, 100, 1000, 10000]

# Gradient-descent settings shared by every fit
alpha, num_iterations = 0.01, 2000

# One summary record per lambda, collected for the table below
results = []

for lam in lambdas:
    dash_rule = '-' * 70
    print("\n" + dash_rule)
    print(f"Lambda = {lam}")
    print(dash_rule)

    # Fit the model for this regularization strength
    theta_ridge = gradient_descent_ridge(
        X_sim_with_int,
        Y_sim,
        alpha=alpha,
        num_iterations=num_iterations,
        lambda_reg=lam,
    )
    intercept, slope = theta_ridge[0], theta_ridge[1]

    # Training-set fit quality
    Y_pred = predict(X_sim_with_int, theta_ridge)
    mse = compute_mse(Y_sim, Y_pred)
    r2 = compute_r2(Y_sim, Y_pred)

    # Absolute deviation from the data-generating parameters (1 and 2)
    intercept_error = abs(intercept - 1.0)
    slope_error = abs(slope - 2.0)

    report_lines = (
        f"  Intercept: {intercept:.4f}  (true = 1.0, error = {intercept_error:.4f})",
        f"  Slope:     {slope:.4f}  (true = 2.0, error = {slope_error:.4f})",
        f"  MSE:       {mse:.4f}",
        f"  R^2:       {r2:.4f}",
    )
    for report_line in report_lines:
        print(report_line)

    results.append(dict(
        Lambda=lam,
        Intercept=intercept,
        Slope=slope,
        MSE=mse,
        R2=r2,
        Intercept_Error=intercept_error,
        Slope_Error=slope_error,
    ))


# ============================================
# SUMMARY TABLE
# ============================================
# Section header for the summary table
print("\n" + "=" * 70)
print("SUMMARY TABLE")
print("=" * 70)

# Tabulate the per-lambda records and show only the headline columns
results_df = pd.DataFrame(results)
summary_columns = ['Lambda', 'Intercept', 'Slope', 'MSE', 'R2']
display_df = results_df[summary_columns]
print()
print(display_df.to_string(index=False))

# Reminder of the data-generating parameters for comparison
print("\n" + "=" * 70)
print("TRUE VALUES:")
print("  Intercept = 1.0")
print("  Slope = 2.0")


# ============================================
# ANALYSIS
# ============================================
# Section header for the written analysis
print("\n" + "=" * 70)
print("ANALYSIS")
print("=" * 70)

# Indented rule printed under each numbered sub-heading
sub_rule = "   " + "-" * 66

print("\n1. EFFECT OF LAMBDA ON COEFFICIENTS:")
print(sub_rule)

# Report each fitted slope as a percentage of the true slope (2.0)
for lam, slope in zip(results_df['Lambda'], results_df['Slope']):
    lam = int(lam)
    slope_pct = (slope / 2.0) * 100
    line_head = f"   Lambda = {lam:5d}: Slope = {slope:.4f}"

    if lam == 0:
        print("\n" + line_head + " (no regularization)")
        continue

    line_tail = f" ({slope_pct:.1f}% of true value)"
    if lam >= 100:
        line_tail += " [heavy shrinkage]"
    print(line_head + line_tail)

print("\n2. EFFECT OF LAMBDA ON MODEL PERFORMANCE:")
print(sub_rule)

# Rows with the lowest training MSE and the highest training R^2
best_mse_idx = results_df['MSE'].idxmin()
best_r2_idx = results_df['R2'].idxmax()
best_mse_row = results_df.loc[best_mse_idx]
best_r2_row = results_df.loc[best_r2_idx]

print(f"\n   Best MSE: Lambda = {int(best_mse_row['Lambda'])} (MSE = {best_mse_row['MSE']:.4f})")
print(f"   Best R^2: Lambda = {int(best_r2_row['Lambda'])} (R^2 = {best_r2_row['R2']:.4f})")

print("\n   As lambda increases:")
for trend in (
    "   - Coefficients shrink toward zero",
    "   - MSE increases (worse fit to training data)",
    "   - R^2 decreases (less variance explained)",
):
    print(trend)

print("\n3. BIAS-VARIANCE TRADEOFF:")
print(sub_rule)

# Row 0 is the first lambda tested, which is the unregularized fit (lambda = 0)
unregularized_row = results_df.loc[0]
print("\n   Lambda = 0 (No regularization):")
print(f"     Most accurate coefficients: Slope = {unregularized_row['Slope']:.4f}")
print(f"     Best fit: MSE = {unregularized_row['MSE']:.4f}")
print("     High variance, low bias")

# Contrast with the largest lambda in the sweep
high_lambda_idx = results_df['Lambda'].idxmax()
heaviest_row = results_df.loc[high_lambda_idx]
print(f"\n   Lambda = {int(heaviest_row['Lambda'])} (Heavy regularization):")
print(f"     Highly shrunk coefficients: Slope = {heaviest_row['Slope']:.4f}")
print(f"     Worse fit: MSE = {heaviest_row['MSE']:.4f}")
print("     Low variance, high bias")

print("\n   Regularization trades training accuracy for model simplicity")

PROBLEM 6: RIDGE REGRESSION

PART 1: CLOSED-FORM SOLUTION DERIVATION

Ridge Regression Loss Function:
  J(theta) = sum[(y_pred - y)^2] + lambda * sum[theta_j^2]

Matrix form:
  J(theta) = (X*theta - y)^T * (X*theta - y) + lambda * theta^T * theta

Expand:
  J(theta) = theta^T*X^T*X*theta - 2*theta^T*X^T*y + y^T*y + lambda*theta^T*theta

Take gradient with respect to theta:
  dJ/d(theta) = 2*X^T*X*theta - 2*X^T*y + 2*lambda*theta

Set gradient to zero:
  2*X^T*X*theta - 2*X^T*y + 2*lambda*theta = 0
  X^T*X*theta + lambda*theta = X^T*y
  (X^T*X + lambda*I)*theta = X^T*y

CLOSED-FORM SOLUTION:
  theta = (X^T*X + lambda*I)^(-1) * X^T * y

Where I is the identity matrix of size (n_features x n_features).


PART 2: GRADIENT DESCENT IMPLEMENTATION

Implemented gradient_descent_ridge() function
Key features:
  - Adds regularization term to gradient: 2*lambda*theta
  - Does NOT regularize intercept (theta[0])
  - Uses learning rate alpha and runs for num_iterations

PART 3: SIMULATED DATA EXPER