In [9]:
import numpy as np
import torch

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
display(device)

device(type='cuda')

In [11]:
np.random.seed(45)
num_samples = 40
    
# Generate data
x1 = np.random.uniform(-1, 1, num_samples)
f_x = 3*x1 + 4
eps = np.random.randn(num_samples)
y = f_x + eps

In [12]:
theta = torch.tensor([3., 4.], requires_grad=True)

In [13]:
x_tensor = torch.tensor(x1, dtype=torch.float32).reshape(-1, 1)
X = torch.cat((torch.ones(x_tensor.shape), x_tensor), 1)
y = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

y_pred = (X @ theta).reshape(-1, 1)
loss = torch.mean((y_pred - y) ** 2)
loss.backward()

true_grad = theta.grad.clone()
true_loss = loss.data.clone()

display(true_grad, true_loss, theta)

tensor([-2.1246,  0.9933])

tensor(2.2600)

tensor([3., 4.], requires_grad=True)

In [14]:
def compute_sgd(x_i, y_i):
    theta.grad.zero_()
    y_pred_i = x_i @ theta
    loss_i = torch.mean((y_pred_i - y_i) ** 2)
    loss_i.backward()
    
    return theta.grad.clone()

stochastic_grads = []

for i in range(y.shape[0]):
    x_i = X[i].clone().detach()
    y_i = y[i].clone().detach()
    stochastic_grads.append(compute_sgd(x_i, y_i))

mean_stochastic_grad = torch.mean(torch.stack(stochastic_grads), dim=0)

display(mean_stochastic_grad)

tensor([-2.1246,  0.9933])

In [15]:
residual_tensor = true_grad - mean_stochastic_grad
display(residual_tensor)

tensor([-2.3842e-07, -5.9605e-08])

As we can observe from the above cell, the values in the residual tensor are all of the order $10^{-7}$, which indicates that the error is minimal. We can conclude that the stochastic gradient is a good estimate of the true gradient.

In [16]:
def gradient_descent(grad_type: str='bgd', epochs=15, eps=1e-3, alpha=1e-5, minibatch_size=32):
    loss_vals = []
    theta_vals = []
    theta_copy = torch.tensor([0., 0.], requires_grad=True)
    theta_copy.requires_grad = True
    epoch = 1
    pretty_print = {'bgd': 'Batch Gradient Descent', 'mbgd': 'Mini-Batch Gradient Descent', 'sgd': 'Stochastic Gradient Descent'}
    while True:
        if grad_type == 'bgd':
            y_pred = (X @ theta_copy).reshape(-1, 1)
            loss = torch.mean((y_pred - y) ** 2)
            loss.backward()

            loss_vals.append(loss.item())
            theta_vals.append(theta_copy.clone())

            with torch.no_grad():
                theta_copy -= alpha * theta_copy.grad

            theta_copy.grad.zero_()
        
        elif grad_type == 'mbgd':
            random_permutation = torch.randperm(num_samples)
            X_shuffled = X[random_permutation]
            y_shuffled = y[random_permutation]

            for i in range(0, num_samples, minibatch_size):
                X_batch = X_shuffled[i:i + minibatch_size]
                y_batch = y_shuffled[i:i + minibatch_size]

                y_pred = (X_batch @ theta_copy).reshape(-1, 1)
                loss = torch.mean((y_pred - y_batch) ** 2)
                loss.backward()

                loss_vals.append(loss.item())
                theta_vals.append(theta_copy.clone())

                with torch.no_grad():
                    theta_copy -= alpha * theta_copy.grad

                theta_copy.grad.zero_()

        else:
            for i in range(num_samples):
                X_i = X[i].clone().detach()
                y_i = y[i].clone().detach()

                y_pred = X_i @ theta_copy
                loss = torch.mean((y_pred - y_i) ** 2)
                loss.backward()

                loss_vals.append(loss.item())
                theta_vals.append(theta_copy.clone())

                with torch.no_grad():
                    theta_copy -= alpha * theta_copy.grad

                theta_copy.grad.zero_()

        if torch.abs(loss.data - true_loss) < eps:
            print(f'{pretty_print[grad_type]} converged in {epoch} epochs')
            break

        epoch += 1

gradient_descent('bgd')
gradient_descent('mbgd')
gradient_descent('sgd')

Batch Gradient Descent converged in 79547 epochs
Mini-Batch Gradient Descent converged in 10618 epochs
Stochastic Gradient Descent converged in 358 epochs
