## Mathematics of Machine Learning

### 8th Exercise: Stochastic Gradient Descent

In [None]:
import torch
import torch.nn as nn

import random
import numpy as np
import matplotlib.pyplot as plt

#### (0) Preparation

In [None]:
# Read the feature matrix X and the label vector Y from their CSV files
X, Y = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ("data_MNIST_78_X.csv", "data_MNIST_78_Y.csv")
)

In [None]:
# Transform the labels into +1 (digit 7) and -1 (every other value, i.e. digit 8).
# The mapping is evaluated as one vectorized expression and written back in
# place, instead of looping over every label at Python level.
Y[...] = np.where(Y == 7, 1, -1)

In [None]:
# Number of samples in the dataset (one label per sample)
m = Y.shape[0]
print(m)

In [None]:
# Dimension of the feature space: the number of rows of X
d = len(X)
print(d)

#### (1) Gradient Descent for Log-Loss

In [None]:
# Helpers that turn y*(w*x+b) into a single matrix product:
#   X1  - X with a row of ones appended, so the bias is the last weight entry
#   X1Y - every column of X1 scaled by the label of that sample
X1 = np.vstack((X, np.ones((1, m))))
X1Y = X1 * Y

def exp_XY(w):
    """Return exp(-y*(w*x+b)) for every sample.

    Reads the module-level matrix X1Y (label-scaled features with the
    appended row of ones).

    Args:
        w: Stacked weights (w, b), either one vector of length d+1 or a
            matrix holding one such vector per column.

    Returns:
        The exponentials with the samples along axis 0.
    """
    margins = np.dot(w.T, X1Y)
    return np.exp(-margins).T


In [None]:
def RS(e):
    """Empirical log-risk as a function of e = exp(-y*(w*x+b)).

    Args:
        e: Array of exponentials with the samples along axis 0.

    Returns:
        The mean over axis 0 of log(1 + e).
    """
    # log1p keeps precision for tiny e, where 1 + e rounds to exactly 1 and
    # np.log(1 + e) would collapse to 0 (i.e. -inf on a log-scaled plot).
    return np.mean(np.log1p(e), axis=0)

In [None]:
def Grad_RS(e):
    """Gradient of the empirical log-risk as a function of e = exp(-y*(w*x+b)).

    Reads the module-level X1 (features with appended row of ones), Y (labels)
    and m (number of samples).

    Args:
        e: exp(-y*(w*x+b)) for every sample at a single weight vector.

    Returns:
        -(X1 @ (Y * e / (1 + e))) / m, the gradient with respect to the
        stacked weights (w, b).
    """
    # e / (1 + e) is evaluated as 1 / (1 + 1/e): an overflowed e = inf then
    # yields the correct weight 1 instead of inf/inf = nan, and e = 0 yields
    # 1/inf = 0. The division by zero for e = 0 is intended, so silence it.
    with np.errstate(divide='ignore'):
        weights = 1.0 / (1.0 + np.divide(1.0, e))
    return -np.dot(X1, Y * weights) / m

In [None]:
# Estimate the Lipschitz constant of the gradient according to lecture:
# a quarter of the mean squared norm of the feature vectors.
# The optimisation variable is the stacked vector (w, b), so the norm is taken
# over the augmented features X1 (the appended 1 contributes 1/4 to L);
# using X alone would under-estimate this bound.
L = 1/4 * np.mean(np.sum(X1 * X1, axis=0))
print(L)

In [None]:
# Largest step size allowed according to lecture: the reciprocal of L
eta = 1.0 / L
print(eta)

In [None]:
# Gradient Descent: setup

# Number of gradient steps (use m instead to match the SGD step count)
n_iter = 10

# Column k stores the iterate w_k; all entries start at zero
ws = np.zeros((d + 1, n_iter + 1))

# Start point w_0: zero weights, bias (last entry) equal to 1
ws[-1, 0] = 1.0

In [None]:
for i in range(n_iter):
    current = ws[:, i]
    # exp(-y*(w*x+b)) of every sample at the current iterate
    e = exp_XY(current)
    # Move against the gradient with the constant step size eta
    step = eta * Grad_RS(e)
    ws[:, i + 1] = current - step

In [None]:
# Empirical risk of every gradient-descent iterate, evaluated in one call
e_all = exp_XY(ws)
Fs = RS(e_all)
print(Fs)

In [None]:
# Empirical risk of the gradient-descent iterates over a log-scaled step axis
fig, ax = plt.subplots(figsize=(7, 5))

ax.semilogx(Fs)

ax.set_xlabel('Step k', fontsize=16)
ax.set_ylabel('$ F(w_{k}) = R_S(w_k) $', fontsize=16)

plt.show()

#### (2) Stochastic Gradient Descent

In [None]:
# Number of SGD steps: as many as there are samples in the dataset (m)
n_iter_SGD = m
def eta_k(k):
    """Return the decaying SGD step size 0.5 / (1 + k) for step index k."""
    return 0.5 / (k + 1)

# One column per SGD iterate, w_0 up to w_{n_iter_SGD}; all entries start at zero
ws_SGD = np.zeros((d + 1, n_iter_SGD + 1))

# Start point w_0: zero weights, bias (last entry) equal to 1
ws_SGD[-1, 0] = 1.0

In [None]:
for i in range(n_iter_SGD):
    ind = np.random.choice(m)  # draw one data point uniformly at random
    x = X1[:, ind]  # its feature vector, including the appended 1 for the bias
    y = Y[ind]  # its label
    margin = y * np.dot(ws_SGD[:, i], x)  # y*(w*x+b)
    # Per-sample gradient weight e/(1+e) with e = exp(-margin), evaluated as
    # exp(-log(1 + exp(margin))). Computing exp(-margin) directly can overflow
    # to inf, which turns e/(1+e) into inf/inf = nan and poisons every later
    # iterate.
    weight = np.exp(-np.logaddexp(0.0, margin))
    v = -(y * weight) * x  # gradient of the log-loss at the data point (x, y)

    # Gradient step with the decaying step size eta_k(i)
    ws_SGD[:, i+1] = ws_SGD[:, i] - eta_k(i) * v

In [None]:
# Empirical risk of every SGD iterate, evaluated in one call
e_all_SGD = exp_XY(ws_SGD)
Fs_SGD = RS(e_all_SGD)
print(Fs_SGD)

In [None]:
# Gradient descent (solid) against SGD (dashed) over a log-scaled step axis
fig, ax = plt.subplots(figsize=(7, 5))

ax.semilogx(Fs)
ax.semilogx(Fs_SGD, '--')

# Let the step axis begin at 1
ax.set_xlim(xmin=1.0)

ax.set_xlabel('Step k', fontsize=16)
ax.set_ylabel('$ F(w_{k}) = R_S(w_k) $', fontsize=16)

plt.show()

In [None]:
# Gradient descent (solid) against SGD (dashed) with both axes log-scaled
fig, ax = plt.subplots(figsize=(7, 5))

ax.loglog(Fs)
ax.loglog(Fs_SGD, '--')

# Let the step axis begin at 1
ax.set_xlim(xmin=1.0)

ax.set_xlabel('Step k', fontsize=16)
ax.set_ylabel('$ F(w_{k}) = R_S(w_k) $', fontsize=16)

plt.show()

In [None]:
# Empirical risk along the SGD updates, drawn on a log-scaled y-axis
ax = plt.gca()
ax.plot(Fs_SGD)
ax.set_xlabel("Number of updates")
ax.set_ylabel("empirical risk")
ax.set_yscale("log")
plt.show()

#### (3) Stochastic Gradient Descent in PyTorch

In [None]:
# Logistic-regression model: one linear layer followed by a sigmoid
class LogisticRegression(nn.Module):
    """Linear layer whose output is squashed into (0, 1) by a sigmoid."""

    def __init__(self, input_dim, output_dim):
        """Create the linear layer mapping input_dim features to output_dim outputs."""
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return sigmoid(linear(x)) for the input batch x."""
        return torch.sigmoid(self.linear(x))

In [None]:
# Convert the data from NumPy arrays to float32 tensors.
# X is transposed so that every row holds one sample.
X = torch.from_numpy(X.T.astype(np.float32))

# Re-encode the labels as targets for the binary cross-entropy loss:
# +1 stays 1, every other value (-1) becomes 0. Done as one vectorized
# comparison instead of a Python-level loop over every label.
Y = torch.from_numpy((Y == 1).astype(np.float32))
print(X.shape)
print(Y)
        


In [None]:
# Number of data pairs (rows of X) and number of features (columns of X)
m, d = X.size()
print(m, d)

In [None]:
# Hyper-parameters
learning_rate = 0.5  # base learning rate handed to the optimizer
input_dim, output_dim = d, 1  # d input features, single output
epochs = m  # number of passes through the entire training dataset

In [None]:
# Instantiate the logistic-regression model with the chosen dimensions
model = LogisticRegression(input_dim=input_dim, output_dim=output_dim)

In [None]:
# Loss function: binary cross entropy between predictions and targets
criterion = nn.BCELoss()

In [None]:
# Optimizer: torch's SGD acting on all model parameters, starting at learning_rate
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

In [None]:
# Learning-rate schedule: the base rate is multiplied by 1/(1+epoch)
def lambda1(epoch):
    """Return the factor applied to the base learning rate at the given epoch."""
    return 1 / (1 + epoch)


scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

In [None]:
# Training loop: one optimizer update per epoch, recording the loss before each update.
# (The former counter named `iter` was never read and shadowed the builtin, so it is gone.)
losses = []
for epoch in range(epochs):
    optimizer.zero_grad()  # reset the gradients stored by the previous step

    # NOTE(review): the forward pass uses all of X, so every update is a
    # full-batch gradient step rather than a single-sample SGD step.
    outputs = model(X)

    # Drop only the trailing output dimension so the batch axis survives even
    # when there is a single sample.
    loss = criterion(outputs.squeeze(-1), Y)
    loss.backward()  # gradient of the loss w.r.t. the weights and the bias

    optimizer.step()  # update weights and bias
    scheduler.step()  # advance the learning-rate schedule

    losses.append(loss.item())

In [None]:
# Gradient descent, NumPy SGD and the PyTorch run (dashed) over a log-scaled step axis
fig, ax = plt.subplots(figsize=(7, 5))

ax.semilogx(Fs)
ax.semilogx(Fs_SGD)
ax.semilogx(losses, '--')

# Let the step axis begin at 1
ax.set_xlim(xmin=1.0)

ax.set_xlabel('Step k', fontsize=16)
ax.set_ylabel('$ F(w_{k}) = R_S(w_k) $', fontsize=16)

plt.show()

In [None]:
# BCE loss recorded at every update of the PyTorch run, on linear axes
ax = plt.gca()
ax.plot(losses)
ax.set_xlabel("Number of updates")
ax.set_ylabel("BCE Loss")
plt.show()