Part IV: Elastic Net Regularization using Gradient Descent
=======================================================


In [10]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
from math import sqrt

In [11]:
# Load the preprocessed diamonds table and discard the 'Unnamed: 0' column in
# one chained expression (presumably a leftover index column from an earlier
# to_csv call -- confirm against the preprocessing notebook).
diamonds_df = (
    pd.read_csv("diamonds_preprocessed.csv")
    .drop(columns=['Unnamed: 0'])
)
print('Dropping unrelated column \"Unnamed: 0\" since it is not required')
# Print column names, non-null counts and dtypes of the remaining columns.
diamonds_df.info()

Dropping unrelated column "Unnamed: 0" since it is not required
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51334 entries, 0 to 51333
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   carat   51334 non-null  float64
 1   table   51334 non-null  float64
 2   price   51334 non-null  float64
 3   x       51334 non-null  float64
 4   y       51334 non-null  float64
 5   z       51334 non-null  float64
dtypes: float64(6)
memory usage: 2.4 MB


In [12]:
# Seed NumPy's global RNG so the shuffle below -- and every later call that
# draws from np.random in this notebook -- yields the same numbers on each
# Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Feature matrix (five numeric columns) and target as an (N, 1) column vector.
X = diamonds_df[['carat', 'table', 'x', 'y', 'z']].values
y = diamonds_df['price'].values.reshape(-1, 1)

# 80/20 split on a shuffled index so train and test rows are disjoint.
N = X.shape[0]
train_size = int(0.8 * N)
index_number = np.arange(N)
np.random.shuffle(index_number)
train_indices = index_number[:train_size]
test_indices = index_number[train_size:]
# Sanity check: the two index sets together cover every row exactly once.
assert len(train_indices) + len(test_indices) == N

X_train = X[train_indices]
y_train = y[train_indices]
X_test = X[test_indices]
y_test = y[test_indices]
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Variants of the feature matrices with a leading column of ones (intercept term).
X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

X_train shape: (41067, 5)
y_train shape: (41067, 1)
X_test shape: (10267, 5)
y_test shape: (10267, 1)


In [13]:
def elastic_net_loss(X, y, w, lambda1, lambda2):
    """Return the elastic-net objective for weights ``w``.

    The value is the sum of three terms:

    * half the mean squared residual, ``sum((X.dot(w) - y) ** 2) / (2 * len(y))``
    * an L2 penalty, ``(lambda1 / 2) * sum(w ** 2)``
    * an L1 penalty, ``lambda2 * sum(|w|)``

    Args:
        X: Design matrix; multiplied with ``w`` via ``X.dot(w)``.
        y: Targets, subtracted element-wise from ``X.dot(w)``.
        w: Weight array.
        lambda1: Coefficient of the L2 penalty.
        lambda2: Coefficient of the L1 penalty.

    Returns:
        The scalar objective value.
    """
    n_samples = len(y)
    residuals = X.dot(w) - y

    data_term = (1 / (2 * n_samples)) * np.sum(residuals ** 2)
    ridge_term = (lambda1 / 2) * np.sum(w ** 2)
    lasso_term = lambda2 * np.sum(np.abs(w))

    total = data_term + ridge_term
    total = total + lasso_term
    return total

def gradient_descent(X, y, w, lambda1, lambda2, alpha, epochs, threshold=None):
    """Minimise the elastic-net objective with full-batch (sub)gradient descent.

    Each epoch computes
    ``(1 / N) * X.T.dot(X.dot(w) - y) + lambda1 * w + lambda2 * sign(w)``
    and moves the weights by ``-alpha`` times that vector. The loss after the
    update is recorded via ``elastic_net_loss`` and printed every 100 epochs.

    Args:
        X: Design matrix with one row per sample.
        y: Targets, subtracted element-wise from ``X.dot(w)``.
        w: Initial weights. The array is copied on entry and never modified,
            so the same initial array can safely be reused for several runs.
        lambda1: Coefficient of the L2 penalty.
        lambda2: Coefficient of the L1 penalty.
        alpha: Step size.
        epochs: Maximum number of updates.
        threshold: If given, stop as soon as every component of the gradient
            used for the current update has absolute value below it.

    Returns:
        Tuple ``(w, loss_history)``: the trained weights (a new float array)
        and the list of per-epoch loss values.
    """
    # Work on a private float copy: the in-place update below would otherwise
    # overwrite the caller's array (corrupting any shared initialisation) and
    # would raise a casting error for integer-typed input.
    w = np.array(w, dtype=float)
    loss_history = []
    N = X.shape[0]
    for epoch in range(epochs):
        predictions = X.dot(w)
        error = predictions - y
        # np.sign(w) is a subgradient of |w| (it is 0 where w == 0).
        gradient = (1 / N) * X.T.dot(error) + lambda1 * w + lambda2 * np.sign(w)
        w -= alpha * gradient

        loss = elastic_net_loss(X, y, w, lambda1, lambda2)
        loss_history.append(loss)

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss}")

        # The test uses the gradient evaluated before this epoch's update.
        if threshold is not None and np.all(np.abs(gradient) < threshold):
            print(f"Stopping early at epoch {epoch} due to small gradient: {gradient}")
            break

    return w, loss_history

def random_initialization(input_dim, output_dim):
    """Draw an ``(input_dim, output_dim)`` weight array from the standard normal.

    Uses NumPy's global random state, so results depend on the current seed.
    """
    shape = (input_dim, output_dim)
    return np.random.standard_normal(size=shape)

def zero_initialization(input_dim, output_dim):
    """Return an all-zero weight array of shape ``(input_dim, output_dim)``."""
    shape = (input_dim, output_dim)
    weights = np.zeros(shape)
    return weights

def xavier_initialization(input_dim, output_dim):
    """Sample weights uniformly from ``[-b, b)`` with ``b = sqrt(6 / (input_dim + output_dim))``.

    Returns an array of shape ``(input_dim, output_dim)`` drawn from NumPy's
    global random state.
    """
    fan_total = input_dim + output_dim
    bound = sqrt(6 / fan_total)
    shape = (input_dim, output_dim)
    return np.random.uniform(low=-bound, high=bound, size=shape)


def three_initialization_methods(X_train, y_train, lambda1, lambda2, alpha, epochs):
    """Train the same model from random, zero and Xavier starting weights.

    For each scheme, in that order, the starting weights are created with
    shape ``(X_train.shape[1], 1)``, a banner line is printed, and
    ``gradient_descent`` is run with the given hyperparameters. The loss
    histories are discarded.

    Returns:
        Tuple ``(w_random, w_zero, w_xavier)`` of the trained weights.
    """
    n_features = X_train.shape[1]
    n_outputs = 1

    # (banner printed before training, function that builds the starting weights)
    schemes = (
        ("\nTraining with Random Initialization:", random_initialization),
        ("\nTraining with Zero Initialization:", zero_initialization),
        ("\nTraining with Xavier Initialization:", xavier_initialization),
    )

    trained = []
    for banner, make_weights in schemes:
        start = make_weights(n_features, n_outputs)
        print(banner)
        fitted, _ = gradient_descent(X_train, y_train, start, lambda1, lambda2, alpha, epochs)
        trained.append(fitted)

    w_random, w_zero, w_xavier = trained
    return w_random, w_zero, w_xavier

def stopping_criteria_experiment(X_train, y_train, lambda1, lambda2, alpha):
    """Compare three stopping rules starting from one shared random initialisation.

    Runs ``gradient_descent`` three times: for 10,000 epochs, for 100,000
    epochs, and for up to 100,000 epochs with early stopping once every
    gradient component is below 0.01 in absolute value.

    Each run receives its own copy of the initial weights, so all three start
    from the same point even if the optimiser updates its weight argument in
    place.

    Args:
        X_train: Training design matrix; ``X_train.shape[1]`` sets the number
            of weights.
        y_train: Training targets.
        lambda1: Coefficient of the L2 penalty.
        lambda2: Coefficient of the L1 penalty.
        alpha: Step size.

    Returns:
        Tuple ``(loss_history_10k, loss_history_100k, loss_history_threshold)``.
    """
    input_dim, output_dim = X_train.shape[1], 1

    w_init = random_initialization(input_dim, output_dim)

    # number of iterations 10,000
    print("\nTraining with Fixed Iterations (10,000):")
    _, loss_history_10k = gradient_descent(
        X_train, y_train, w_init.copy(), lambda1, lambda2, alpha, 10000
    )

    # number of iterations 100,000
    print("\nTraining with Fixed Iterations (100,000):")
    _, loss_history_100k = gradient_descent(
        X_train, y_train, w_init.copy(), lambda1, lambda2, alpha, 100000
    )

    # Gradient Threshold (-0.01 < gradient < 0.01)
    print("\nTraining with Gradient Threshold (|grad| < 0.01):")
    _, loss_history_threshold = gradient_descent(
        X_train, y_train, w_init.copy(), lambda1, lambda2, alpha, 100000, threshold=0.01
    )

    return loss_history_10k, loss_history_100k, loss_history_threshold

In [14]:
# Hyperparameters shared by the experiments below.
lambda1, lambda2 = 0.1, 0.1  # L2 and L1 regularization parameters, respectively
alpha = 0.01                 # step size passed to gradient descent
epochs = 1000                # number of updates per initialization scheme

# Train once per initialization scheme and keep the three weight arrays.
(w_random, w_zero, w_xavier) = three_initialization_methods(
    X_train, y_train, lambda1, lambda2, alpha, epochs
)


Training with Random Initialization:
Epoch 0, Loss: 0.9701519103153425
Epoch 100, Loss: 0.269122967280484
Epoch 200, Loss: 0.14060130635996956
Epoch 300, Loss: 0.08756038960834503
Epoch 400, Loss: 0.07214668147108969
Epoch 500, Loss: 0.06567341316199682
Epoch 600, Loss: 0.06172028456734572
Epoch 700, Loss: 0.059281541629197576
Epoch 800, Loss: 0.057636781718755956
Epoch 900, Loss: 0.056515826603974396

Training with Zero Initialization:
Epoch 0, Loss: 0.05544253069376722
Epoch 100, Loss: 0.05385368736381106
Epoch 200, Loss: 0.05344217085052766
Epoch 300, Loss: 0.053223575388244536
Epoch 400, Loss: 0.05308423566334204
Epoch 500, Loss: 0.052989674054975865
Epoch 600, Loss: 0.05291919829670091
Epoch 700, Loss: 0.052871832289486206
Epoch 800, Loss: 0.05283123628026598
Epoch 900, Loss: 0.05279659776830975

Training with Xavier Initialization:
Epoch 0, Loss: 0.3528053493325102
Epoch 100, Loss: 0.23780468222264853
Epoch 200, Loss: 0.15903355823980048
Epoch 300, Loss: 0.10109289517004982
Epoc

In [15]:
# Run the stopping-criteria comparison and keep the three loss histories
# (10,000 epochs, 100,000 epochs, gradient-threshold early stop).
(
    loss_history_10k,
    loss_history_100k,
    loss_history_threshold,
) = stopping_criteria_experiment(X_train, y_train, lambda1, lambda2, alpha)


Training with Fixed Iterations (10,000):
Epoch 0, Loss: 3.2290006488547762
Epoch 100, Loss: 0.6180338002261506
Epoch 200, Loss: 0.25276125561913143
Epoch 300, Loss: 0.13836336296590707
Epoch 400, Loss: 0.0944102421363402
Epoch 500, Loss: 0.069604287550557
Epoch 600, Loss: 0.06325259799169425
Epoch 700, Loss: 0.05958992709454601
Epoch 800, Loss: 0.05749481742411662
Epoch 900, Loss: 0.05625333068601048
Epoch 1000, Loss: 0.05548728250039152
Epoch 1100, Loss: 0.054943568211906144
Epoch 1200, Loss: 0.054502846039741336
Epoch 1300, Loss: 0.05416522024255539
Epoch 1400, Loss: 0.05389518171294433
Epoch 1500, Loss: 0.053668939375495564
Epoch 1600, Loss: 0.05348393126539243
Epoch 1700, Loss: 0.0533391028457996
Epoch 1800, Loss: 0.05321896762182907
Epoch 1900, Loss: 0.053118431253688474
Epoch 2000, Loss: 0.0530389958118355
Epoch 2100, Loss: 0.05297723414085728
Epoch 2200, Loss: 0.052926419344534174
Epoch 2300, Loss: 0.05287856476093719
Epoch 2400, Loss: 0.05286028702506324
Epoch 2500, Loss: 0.05