In [None]:
%matplotlib inline

# Fun with optimization

Let's play with SGD, momentum, Adagrad, RMSprop, and Adam. Let's do this for a simple 2D problem that we can visualize! Yes, I know: I could improve the initialization of the algorithms and the efficiency of my code, and yes, my code might crash because I did not program it defensively, etc. The point of this Jupyter page is simply to give us a little optimization sandbox to play with.

First, import some libraries.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
from mpl_toolkits.mplot3d import Axes3D

Let's define our function to optimize and then calculate its partial derivatives.

I put three functions in there for you so you can pick one and explore.

In [None]:
# Which test function do you want to run? Supported values: 1, 2 or 3.
WhichFx = 3

# Each branch defines the objective func_z, its partial derivatives der_x and
# der_y, and the plotting window `bounds` (used for both the x and y axes).
if(WhichFx==1):

    def func_z(x, y):
        """Quadratic bowl z = x^2/10 + x*y/50 + y^2 (much steeper in y than in x)."""
        z = x**2/10. + x*y/50. + y**2.
        return z

    def der_x(x, y):
        """Partial derivative of func_z with respect to x."""
        return (2*x/10. + y/50.)

    def der_y(x, y):
        """Partial derivative of func_z with respect to y."""
        return (x/50. + 2*y/1.)

    bounds = np.asarray([-3,3])

elif(WhichFx==2):

    def func_z(x, y):
        """z = x^2 + 3*sin(y): quadratic in x, sinusoidal in y."""
        z = x**2 + 3 * np.sin(y)
        return z

    def der_x(x, y):
        """Partial derivative of func_z with respect to x."""
        return 2*x

    def der_y(x, y):
        """Partial derivative of func_z with respect to y."""
        return 3 * np.cos(y)

    bounds = np.asarray([-6,6])

elif(WhichFx==3):

    def func_z(x, y):
        """z = x^4 - 2*x^2 + y^2: double well with minima at (x, y) = (+/-1, 0)."""
        z = x**4 - 2*x**2 + y**2
        return z

    def der_x(x, y):
        """Partial derivative of func_z with respect to x."""
        return 4*x**3 - 4*x

    def der_y(x, y):
        """Partial derivative of func_z with respect to y."""
        return 2*y

    bounds = np.asarray([-3,3])

else:
    # Fail loudly: otherwise later cells would either hit a NameError or
    # silently reuse definitions left in the kernel by a previous run.
    raise ValueError("WhichFx must be 1, 2 or 3, got {!r}".format(WhichFx))

Let's make a gradient descent function with just a learning rate.

In [None]:
def gradient_descentGD(previous_x, previous_y, learning_rate, epoch):
    """Run plain gradient descent on func_z and record the path taken.

    Parameters
    ----------
    previous_x, previous_y : starting point.
    learning_rate : step size multiplying the gradient.
    epoch : number of update steps to take.

    Returns
    -------
    (x_path, y_path, z_path) : three lists of length epoch + 1 holding the
    starting point followed by every iterate, and func_z at each of them.
    """
    # the trajectory begins at the initial sample
    xs = [previous_x]
    ys = [previous_y]
    zs = [func_z(previous_x, previous_y)]

    px, py = previous_x, previous_y
    for _ in range(epoch):
        # gradient at the current position
        grad_x = der_x(px, py)
        grad_y = der_y(px, py)

        # move against the gradient, scaled by the learning rate
        px, py = px - learning_rate*grad_x, py - learning_rate*grad_y

        xs.append(px)
        ys.append(py)
        zs.append(func_z(px, py))

    return xs, ys, zs

Let's pick some initial values.

FYI, I selected these manually to show you the behavior of these methods.

In [None]:
# Hand-picked settings for each test function.
# Each entry reads: (x0, y0, learning_rate, epoch), i.e. the location to start
# at, the learning rate and the number of epochs.
# Nothing is assigned when WhichFx is not one of the three supported values.
if WhichFx in (1, 2, 3):
    x0, y0, learning_rate, epoch = {
        1: (3, 2, 0.9, 100),
        2: (4, 1, 0.2, 100),
        3: (-2, -2, 0.15, 50),
    }[WhichFx]

Run our algorithm

In [None]:
# Run plain gradient descent from (x0, y0). Each returned list holds the
# starting point followed by one entry per epoch (epoch + 1 values in total).
x_gd1, y_gd1, z_gd1 = gradient_descentGD(x0, y0, learning_rate, epoch)

Show the result

In [None]:
# Legend helpers; Line2D is also used by the legend code in later cells.
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

# Sample the objective on a regular grid covering the plotting window
lo, hi = bounds[0], bounds[1]
a = np.arange(lo, hi, 0.05)
b = np.arange(lo, hi, 0.05)
x, y = np.meshgrid(a, b)
z = func_z(x, y)

# Contour backdrop with 25 log-spaced levels between 1e-3 and 1e3
fig1, ax1 = plt.subplots()
contour_levels = np.logspace(-3, 3, 25)
ax1.contour(x, y, z, levels=contour_levels, cmap='jet')

# Gradient descent path: a marker on every iterate, an arrow for every step
ax1.plot(x_gd1, y_gd1, 'ro')
for step in range(1, epoch+1):
    ax1.annotate('',
                 xy=(x_gd1[step], y_gd1[step]),
                 xytext=(x_gd1[step-1], y_gd1[step-1]),
                 arrowprops=dict(arrowstyle='->', color='r', lw=1),
                 va='center', ha='center')

legend_elements = [Line2D([0], [0], color='r', lw=4, label='GD')]
ax1.legend(handles=legend_elements, loc='upper right')
plt.show()

What if we considered momentum?

In [None]:
def gradient_descentMOM(previous_x, previous_y, learning_rate, epoch, mom):
    """Run gradient descent with momentum on func_z and record the path.

    A velocity is kept per coordinate. At every step it is scaled by `mom`,
    pushed by the negative gradient times `learning_rate`, and then added to
    the position.

    Parameters
    ----------
    previous_x, previous_y : starting point.
    learning_rate : step size multiplying the gradient.
    epoch : number of update steps to take.
    mom : momentum coefficient applied to the previous velocity.

    Returns
    -------
    (x_path, y_path, z_path) : three lists of length epoch + 1 holding the
    starting point followed by every iterate, and func_z at each of them.
    """
    path_x = [previous_x]
    path_y = [previous_y]
    path_z = [func_z(previous_x, previous_y)]

    # both velocities start at rest
    vel_x = vel_y = 0
    pos_x, pos_y = previous_x, previous_y

    for _ in range(epoch):
        grad_x = der_x(pos_x, pos_y)
        grad_y = der_y(pos_x, pos_y)

        # velocity update, then position update
        vel_x = mom * vel_x - learning_rate*grad_x
        vel_y = mom * vel_y - learning_rate*grad_y
        pos_x = pos_x + vel_x
        pos_y = pos_y + vel_y

        path_x.append(pos_x)
        path_y.append(pos_y)
        path_z.append(func_z(pos_x, pos_y))

    return path_x, path_y, path_z

mom = 0.25 # what momentum to use?
x_gd2, y_gd2, z_gd2 = gradient_descentMOM(x0, y0, learning_rate, epoch, mom)

# Sample the objective on a regular grid covering the plotting window
lo, hi = bounds[0], bounds[1]
a = np.arange(lo, hi, 0.05)
b = np.arange(lo, hi, 0.05)
x, y = np.meshgrid(a, b)
z = func_z(x, y)

# Contour backdrop; the window bounds double as the log10 exponents of the levels
fig1, ax1 = plt.subplots()
ax1.contour(x, y, z, levels=np.logspace(lo, hi, 25), cmap='jet')

# Overlay the momentum path first, then plain GD on top of it:
# a marker on every iterate, an arrow for every step
for xs, ys, marker, color in ((x_gd2, y_gd2, 'co', 'c'),
                              (x_gd1, y_gd1, 'ro', 'r')):
    ax1.plot(xs, ys, marker)
    for step in range(1, epoch+1):
        ax1.annotate('',
                     xy=(xs[step], ys[step]),
                     xytext=(xs[step-1], ys[step-1]),
                     arrowprops=dict(arrowstyle='->', color=color, lw=1),
                     va='center', ha='center')

legend_elements = [Line2D([0], [0], color=color, lw=4, label=label)
                   for color, label in (('r', 'GD'), ('c', 'MOM'))]
ax1.legend(handles=legend_elements, loc='upper right')
plt.show()

Adagrad

In [None]:
def gradient_descentAdaGrad(previous_x, previous_y, learning_rate, epoch):
    """Run AdaGrad on func_z and record the path taken.

    Each coordinate accumulates the sum of its squared gradients. The step
    along that coordinate is the gradient divided by the square root of that
    running sum (plus a small epsilon), times `learning_rate`.

    Parameters
    ----------
    previous_x, previous_y : starting point.
    learning_rate : base step size.
    epoch : number of update steps to take.

    Returns
    -------
    (x_path, y_path, z_path) : three lists of length epoch + 1 holding the
    starting point followed by every iterate, and func_z at each of them.
    """
    eps = 1e-8  # keeps the denominator away from zero

    xs = [previous_x]
    ys = [previous_y]
    zs = [func_z(previous_x, previous_y)]

    # running sums of squared gradients, one per coordinate
    sum_sq_x = 0
    sum_sq_y = 0

    px, py = previous_x, previous_y
    for _ in range(epoch):
        grad_x = der_x(px, py)
        grad_y = der_y(px, py)

        sum_sq_x = sum_sq_x + grad_x**2
        sum_sq_y = sum_sq_y + grad_y**2

        # per-coordinate step, normalized by the accumulated gradient magnitude
        px, py = (px - learning_rate * grad_x / (np.sqrt(sum_sq_x) + eps),
                  py - learning_rate * grad_y / (np.sqrt(sum_sq_y) + eps))

        xs.append(px)
        ys.append(py)
        zs.append(func_z(px, py))

    return xs, ys, zs

# Run AdaGrad from the same start point and settings as the earlier runs
x_gd5, y_gd5, z_gd5 = gradient_descentAdaGrad(x0, y0, learning_rate, epoch)

# Sample the objective on a regular grid covering the plotting window
lo, hi = bounds[0], bounds[1]
a = np.arange(lo, hi, 0.05)
b = np.arange(lo, hi, 0.05)
x, y = np.meshgrid(a, b)
z = func_z(x, y)

# Contour backdrop; the window bounds double as the log10 exponents of the levels
fig1, ax1 = plt.subplots()
ax1.contour(x, y, z, levels=np.logspace(lo, hi, 25), cmap='jet')

# AdaGrad path: a marker on every iterate, an arrow for every step
ax1.plot(x_gd5, y_gd5, 'go')
for step in range(1, epoch+1):
    ax1.annotate('',
                 xy=(x_gd5[step], y_gd5[step]),
                 xytext=(x_gd5[step-1], y_gd5[step-1]),
                 arrowprops=dict(arrowstyle='->', color='g', lw=1),
                 va='center', ha='center')

plt.show()

Plot all on the same screen

In [None]:
# Sample the objective on a regular grid covering the plotting window
lo, hi = bounds[0], bounds[1]
a = np.arange(lo, hi, 0.05)
b = np.arange(lo, hi, 0.05)
x, y = np.meshgrid(a, b)
z = func_z(x, y)

# Contour backdrop; the window bounds double as the log10 exponents of the levels
fig1, ax1 = plt.subplots()
ax1.contour(x, y, z, levels=np.logspace(lo, hi, 25), cmap='jet')

# Overlay the paths in drawing order (AdaGrad, GD, momentum):
# a marker on every iterate, an arrow for every step
for xs, ys, marker, color in ((x_gd5, y_gd5, 'go', 'g'),
                              (x_gd1, y_gd1, 'ro', 'r'),
                              (x_gd2, y_gd2, 'co', 'c')):
    ax1.plot(xs, ys, marker)
    for step in range(1, epoch+1):
        ax1.annotate('',
                     xy=(xs[step], ys[step]),
                     xytext=(xs[step-1], ys[step-1]),
                     arrowprops=dict(arrowstyle='->', color=color, lw=1),
                     va='center', ha='center')

legend_elements = [Line2D([0], [0], color=color, lw=4, label=label)
                   for color, label in (('r', 'GD'), ('c', 'MOM'), ('g', 'AGrad'))]
ax1.legend(handles=legend_elements, loc='upper right')
plt.show()

RMSprop

In [None]:
def gradient_descentRMSProp(previous_x, previous_y, learning_rate, epoch, decay_rate):
    """Run RMSProp on func_z and record the path taken.

    Each coordinate keeps an exponential moving average of its squared
    gradient. The average is seeded with the first squared gradient and then
    blended with `decay_rate`. The step is the gradient divided by the square
    root of that average (plus a small epsilon), times `learning_rate`.

    Parameters
    ----------
    previous_x, previous_y : starting point.
    learning_rate : base step size.
    epoch : number of update steps to take.
    decay_rate : weight kept from the previous squared-gradient average.

    Returns
    -------
    (x_path, y_path, z_path) : three lists of length epoch + 1 holding the
    starting point followed by every iterate, and func_z at each of them.
    """
    eps = 1e-8  # keeps the denominator away from zero

    xs = [previous_x]
    ys = [previous_y]
    zs = [func_z(previous_x, previous_y)]

    avg_sq_x = 0
    avg_sq_y = 0

    px, py = previous_x, previous_y
    for step in range(epoch):
        grad_x = der_x(px, py)
        grad_y = der_y(px, py)

        if step == 0:
            # first step: seed the averages with the first squared gradients
            avg_sq_x = grad_x**2
            avg_sq_y = grad_y**2
        else:
            avg_sq_x = decay_rate * avg_sq_x + (1 - decay_rate)*grad_x**2
            avg_sq_y = decay_rate * avg_sq_y + (1 - decay_rate)*grad_y**2

        px, py = (px - learning_rate*grad_x / (np.sqrt(avg_sq_x) + eps),
                  py - learning_rate*grad_y / (np.sqrt(avg_sq_y) + eps))

        xs.append(px)
        ys.append(py)
        zs.append(func_z(px, py))

    return xs, ys, zs

# Run RMSProp from the same start point and settings as the earlier runs
x_gd6, y_gd6, z_gd6 = gradient_descentRMSProp(x0, y0, learning_rate, epoch, decay_rate=0.99)

# Sample the objective on a regular grid covering the plotting window
lo, hi = bounds[0], bounds[1]
a = np.arange(lo, hi, 0.05)
b = np.arange(lo, hi, 0.05)
x, y = np.meshgrid(a, b)
z = func_z(x, y)

# Contour backdrop; the window bounds double as the log10 exponents of the levels
fig1, ax1 = plt.subplots()
ax1.contour(x, y, z, levels=np.logspace(lo, hi, 25), cmap='jet')

# RMSProp path: a marker on every iterate, an arrow for every step
ax1.plot(x_gd6, y_gd6, 'mo')
for step in range(1, epoch+1):
    ax1.annotate('',
                 xy=(x_gd6[step], y_gd6[step]),
                 xytext=(x_gd6[step-1], y_gd6[step-1]),
                 arrowprops=dict(arrowstyle='->', color='m', lw=1),
                 va='center', ha='center')

plt.show()

Plot all on the same graph

In [None]:
# Sample the objective on a regular grid covering the plotting window
lo, hi = bounds[0], bounds[1]
a = np.arange(lo, hi, 0.05)
b = np.arange(lo, hi, 0.05)
x, y = np.meshgrid(a, b)
z = func_z(x, y)

# Contour backdrop; the window bounds double as the log10 exponents of the levels
fig1, ax1 = plt.subplots()
ax1.contour(x, y, z, levels=np.logspace(lo, hi, 25), cmap='jet')

# Overlay the paths in drawing order (RMSProp, GD, AdaGrad, momentum):
# a marker on every iterate, an arrow for every step
for xs, ys, marker, color in ((x_gd6, y_gd6, 'mo', 'm'),
                              (x_gd1, y_gd1, 'ro', 'r'),
                              (x_gd5, y_gd5, 'go', 'g'),
                              (x_gd2, y_gd2, 'co', 'c')):
    ax1.plot(xs, ys, marker)
    for step in range(1, epoch+1):
        ax1.annotate('',
                     xy=(xs[step], ys[step]),
                     xytext=(xs[step-1], ys[step-1]),
                     arrowprops=dict(arrowstyle='->', color=color, lw=1),
                     va='center', ha='center')

legend_elements = [Line2D([0], [0], color=color, lw=4, label=label)
                   for color, label in (('r', 'GD'), ('c', 'MOM'),
                                        ('g', 'AGrad'), ('m', 'RMSProp'))]
ax1.legend(handles=legend_elements, loc='upper right')
plt.show()

Adam

In [None]:
def gradient_descentAdam(previous_x, previous_y, learning_rate, epoch, beta1, beta2):
    """Run Adam on func_z and record the path taken.

    Each coordinate keeps exponential moving averages of its gradient (first
    moment) and squared gradient (second moment). Both start at zero and are
    bias-corrected by 1 - beta**t at every step t = 1, 2, ...

    The moments are deliberately NOT seeded with the first gradient. A seeded
    average is already unbiased, so dividing it by 1 - beta**t as well
    over-corrects it (for beta2 = 0.99 the second moment would be inflated
    about 50x on the second step, shrinking that step roughly 7x).

    Parameters
    ----------
    previous_x, previous_y : starting point.
    learning_rate : base step size.
    epoch : number of update steps to take.
    beta1 : decay rate of the first-moment (gradient) average.
    beta2 : decay rate of the second-moment (squared gradient) average.

    Returns
    -------
    (x_path, y_path, z_path) : three lists of length epoch + 1 holding the
    starting point followed by every iterate, and func_z at each of them.
    """
    eps = 1e-8  # keeps the denominator away from zero

    x_gd = [previous_x]
    y_gd = [previous_y]
    z_gd = [func_z(previous_x, previous_y)]

    # zero-initialized first (m) and second (v) moment estimates
    mx = my = vx = vy = 0

    for i in range(epoch):

        dx = der_x(previous_x, previous_y)
        dy = der_y(previous_x, previous_y)

        # bias-correction factors for step t = i + 1
        m_corr = 1 - beta1**(i+1)
        v_corr = 1 - beta2**(i+1)

        # x coordinate: update both moments, correct them, take the step
        mx = beta1*mx + (1-beta1)*dx
        vx = beta2*vx + (1-beta2)*(dx**2)
        mxt = mx / m_corr
        vxt = vx / v_corr
        current_x = previous_x - learning_rate * mxt / (np.sqrt(vxt) + eps)
        x_gd.append(current_x)

        # y coordinate: same update
        my = beta1*my + (1-beta1)*dy
        vy = beta2*vy + (1-beta2)*(dy**2)
        myt = my / m_corr
        vyt = vy / v_corr
        current_y = previous_y - learning_rate * myt / (np.sqrt(vyt) + eps)
        y_gd.append(current_y)

        z_gd.append(func_z(current_x, current_y))

        previous_x = current_x
        previous_y = current_y

    return x_gd, y_gd, z_gd

# Adam decay rates: beta1 for the gradient average, beta2 for the squared-gradient average
beta1 = 0.2
beta2 = 0.95
x_gd7, y_gd7, z_gd7 = gradient_descentAdam(x0, y0, learning_rate, epoch, beta1, beta2)

# Sample the objective on a regular grid covering the plotting window
lo, hi = bounds[0], bounds[1]
a = np.arange(lo, hi, 0.05)
b = np.arange(lo, hi, 0.05)
x, y = np.meshgrid(a, b)
z = func_z(x, y)

# Contour backdrop; the window bounds double as the log10 exponents of the levels
fig1, ax1 = plt.subplots()
ax1.contour(x, y, z, levels=np.logspace(lo, hi, 25), cmap='jet')

# Adam path: a marker on every iterate, an arrow for every step
ax1.plot(x_gd7, y_gd7, 'ko')
for step in range(1, epoch+1):
    ax1.annotate('',
                 xy=(x_gd7[step], y_gd7[step]),
                 xytext=(x_gd7[step-1], y_gd7[step-1]),
                 arrowprops=dict(arrowstyle='->', color='k', lw=1),
                 va='center', ha='center')

plt.show()

Plot all

In [None]:
# Sample the objective on a regular grid covering the plotting window
lo, hi = bounds[0], bounds[1]
a = np.arange(lo, hi, 0.05)
b = np.arange(lo, hi, 0.05)
x, y = np.meshgrid(a, b)
z = func_z(x, y)

# Contour backdrop; the window bounds double as the log10 exponents of the levels
fig1, ax1 = plt.subplots()
ax1.contour(x, y, z, levels=np.logspace(lo, hi, 25), cmap='jet')

# Overlay the paths in drawing order (Adam, GD, AdaGrad, momentum, RMSProp):
# a marker on every iterate, an arrow for every step
for xs, ys, marker, color in ((x_gd7, y_gd7, 'ko', 'k'),
                              (x_gd1, y_gd1, 'ro', 'r'),
                              (x_gd5, y_gd5, 'go', 'g'),
                              (x_gd2, y_gd2, 'co', 'c'),
                              (x_gd6, y_gd6, 'mo', 'm')):
    ax1.plot(xs, ys, marker)
    for step in range(1, epoch+1):
        ax1.annotate('',
                     xy=(xs[step], ys[step]),
                     xytext=(xs[step-1], ys[step-1]),
                     arrowprops=dict(arrowstyle='->', color=color, lw=1),
                     va='center', ha='center')

legend_elements = [Line2D([0], [0], color=color, lw=4, label=label)
                   for color, label in (('r', 'GD'), ('c', 'MOM'), ('g', 'AGrad'),
                                        ('m', 'RMSProp'), ('k', 'Adam'))]
ax1.legend(handles=legend_elements, loc='upper right')
plt.show()

All code together

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
from mpl_toolkits.mplot3d import Axes3D

# Which test function to optimize. Supported values: 1, 2 or 3.
WhichFx = 3

# Each branch defines the objective func_z, its partial derivatives der_x and
# der_y, and the plotting window `bounds` (used for both the x and y axes).
if(WhichFx==1):
    def func_z(x, y):
        """Quadratic bowl z = x^2/10 + x*y/50 + y^2 (much steeper in y than in x)."""
        z = x**2/10. + x*y/50. + y**2.
        return z
    def der_x(x, y):
        """Partial derivative of func_z with respect to x."""
        return (2*x/10. + y/50.)
    def der_y(x, y):
        """Partial derivative of func_z with respect to y."""
        return (x/50. + 2*y/1.)
    bounds = np.asarray([-3,3])
elif(WhichFx==2):
    def func_z(x, y):
        """z = x^2 + 3*sin(y): quadratic in x, sinusoidal in y."""
        z = x**2 + 3 * np.sin(y)
        return z
    def der_x(x, y):
        """Partial derivative of func_z with respect to x."""
        return 2*x
    def der_y(x, y):
        """Partial derivative of func_z with respect to y."""
        return 3 * np.cos(y)
    bounds = np.asarray([-6,6])
elif(WhichFx==3):
    def func_z(x, y):
        """z = x^4 - 2*x^2 + y^2: double well with minima at (x, y) = (+/-1, 0)."""
        z = x**4 - 2*x**2 + y**2
        return z
    def der_x(x, y):
        """Partial derivative of func_z with respect to x."""
        return 4*x**3 - 4*x
    def der_y(x, y):
        """Partial derivative of func_z with respect to y."""
        return 2*y
    bounds = np.asarray([-3,3])
else:
    # Fail loudly: otherwise the code below would silently reuse whatever
    # func_z / der_x / der_y / bounds an earlier cell left in the kernel.
    raise ValueError("WhichFx must be 1, 2 or 3, got {!r}".format(WhichFx))
    
# Settings for the combined run. Nothing is assigned when WhichFx is not one
# of the three supported values.
if WhichFx in (1, 2, 3):
    # location to start at: drawn uniformly at random inside the plotting
    # window (no seed is set, so every run starts somewhere different)
    x0 = np.random.uniform(0,1) * (bounds[1]-bounds[0]) + bounds[0]
    y0 = np.random.uniform(0,1) * (bounds[1]-bounds[0]) + bounds[0]
    # learning rate: function 3 uses a slightly larger step than the other two
    learning_rate = 0.15 if WhichFx == 3 else 0.1
    # number of epochs
    epoch = 100
    
# Run every optimizer from the same start point with the same learning rate
# and number of epochs; each call returns the x, y and z values along its path.
# The gradient_descent* functions are the ones defined in the earlier cells.
# Note: Adam uses beta1=0.3, beta2=0.99 here, unlike the 0.2 / 0.95 used in the
# standalone Adam cell above.
x_gd1, y_gd1, z_gd1 = gradient_descentGD(x0, y0, learning_rate, epoch)
x_gd2, y_gd2, z_gd2 = gradient_descentMOM(x0, y0, learning_rate, epoch, mom=0.25)
x_gd5, y_gd5, z_gd5 = gradient_descentAdaGrad(x0, y0, learning_rate, epoch)
x_gd6, y_gd6, z_gd6 = gradient_descentRMSProp(x0, y0, learning_rate, epoch, decay_rate=0.99)
x_gd7, y_gd7, z_gd7 = gradient_descentAdam(x0, y0, learning_rate, epoch, beta1=0.3, beta2=0.99)

# Sample the objective on a regular grid covering the plotting window
lo, hi = bounds[0], bounds[1]
a = np.arange(lo, hi, 0.05)
b = np.arange(lo, hi, 0.05)
x, y = np.meshgrid(a, b)
z = func_z(x, y)

# Contour backdrop with 50 log-spaced levels; the window bounds double as the
# log10 exponents of the levels
fig1, ax1 = plt.subplots()
ax1.contour(x, y, z, levels=np.logspace(lo, hi, 50), cmap='jet')

# Overlay the paths in drawing order (Adam, GD, AdaGrad, momentum, RMSProp):
# a marker on every iterate, an arrow for every step
for xs, ys, marker, color in ((x_gd7, y_gd7, 'ko', 'k'),
                              (x_gd1, y_gd1, 'ro', 'r'),
                              (x_gd5, y_gd5, 'go', 'g'),
                              (x_gd2, y_gd2, 'co', 'c'),
                              (x_gd6, y_gd6, 'mo', 'm')):
    ax1.plot(xs, ys, marker)
    for step in range(1, epoch+1):
        ax1.annotate('',
                     xy=(xs[step], ys[step]),
                     xytext=(xs[step-1], ys[step-1]),
                     arrowprops=dict(arrowstyle='->', color=color, lw=1),
                     va='center', ha='center')

# Line2D comes from the `from matplotlib.lines import Line2D` in an earlier cell
legend_elements = [Line2D([0], [0], color=color, lw=4, label=label)
                   for color, label in (('r', 'GD'), ('c', 'MOM'), ('g', 'AGrad'),
                                        ('m', 'RMSProp'), ('k', 'Adam'))]
ax1.legend(handles=legend_elements, loc='upper right')
plt.show()