# Gradient descent examples

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from matplotlib.animation import FuncAnimation

In [None]:
# IPython magic selecting matplotlib's "notebook" backend
# (presumably so the animations below render interactively inside the notebook).
%matplotlib notebook

## 0. Gradient descent procedure

In [None]:
def trace_grad_descent(f, df, x0, alpha=0.01, n_iter=100):
    """Run fixed-step gradient descent and record every visited point.

    Args:
        f: objective function; called on each iterate to record its value.
        df: gradient (derivative) of ``f``.
        x0: starting point.
        alpha: step size multiplying the gradient in each update.
        n_iter: number of update steps to perform.

    Returns:
        A pair ``(points, values)`` of lists, each of length ``n_iter + 1``:
        the iterates (starting with ``x0``) and ``f`` evaluated at each one.
    """
    current = x0
    visited = [current]
    values = [f(current)]

    for _ in range(n_iter):
        # Step against the gradient, scaled by the learning rate.
        current = current - alpha * df(current)
        value = f(current)
        visited.append(current)
        values.append(value)

    return visited, values

In [None]:
def animate_grad_descent(X, Y, gd_X, gd_Y, title=''):
    """Animate a 1-D gradient-descent trajectory on top of the function curve.

    Args:
        X: x-coordinates of the curve to plot.
        Y: y-coordinates of the curve to plot.
        gd_X: sequence of x positions visited by gradient descent.
        gd_Y: function values at the positions in ``gd_X``.
        title: text used as the figure title.

    Returns:
        The ``FuncAnimation`` object driving the animation. Callers should
        keep a reference to it so the animation is not garbage-collected.
    """
    fig = plt.figure(figsize=(6, 4))

    plt.grid(True, which='both')
    plt.axhline(y=0, color='k')
    plt.axvline(x=0, color='k')

    plt.plot(X, Y)

    # Empty line artist; each frame fills it with the trajectory prefix.
    graph, = plt.plot([], [], 'or-')

    def animate(i):
        # Frame i shows points 0..i inclusive, so the first frame already
        # contains the starting point and the last frame the final iterate.
        graph.set_data(gd_X[:i + 1], gd_Y[:i + 1])
        # FuncAnimation expects an iterable of the artists that were updated.
        return graph,

    ani = FuncAnimation(fig, animate, frames=len(gd_X), interval=200, repeat=True)

    plt.title(title)
    plt.show()

    return ani


def animate_grad_descent_2d(X, Y, Z, min_point, gd_X, title=''):
    """Animate a 2-D gradient-descent trajectory over a contour plot.

    Args:
        X: grid of x-coordinates passed to ``plt.contour``.
        Y: grid of y-coordinates passed to ``plt.contour``.
        Z: function values on the grid passed to ``plt.contour``.
        min_point: ``(x, y)`` location marked with a blue cross.
        gd_X: 2-D array of visited points; column 0 is plotted on the x-axis
            and column 1 on the y-axis.
        title: text used as the figure title.

    Returns:
        The ``FuncAnimation`` object driving the animation. Callers should
        keep a reference to it so the animation is not garbage-collected.
    """
    fig = plt.figure(figsize=(6, 4))

    plt.contour(X, Y, Z)

    plt.grid(True, which='both')

    plt.plot(min_point[0], min_point[1], 'bx', markersize=12)

    # Empty line artist; each frame fills it with the trajectory prefix.
    graph, = plt.plot([], [], 'or-')

    def animate(i):
        # Frame i shows points 0..i inclusive, so the first frame already
        # contains the starting point and the last frame the final iterate.
        graph.set_data(gd_X[:i + 1, 0], gd_X[:i + 1, 1])
        # FuncAnimation expects an iterable of the artists that were updated.
        return graph,

    ani = FuncAnimation(fig, animate, frames=len(gd_X), interval=200, repeat=True)

    plt.title(title)
    plt.show()

    return ani

## 1. $y=(1 - x)^2 + 2$

In [None]:
def x_squared(x):
    """Evaluate the parabola y = (1 - x)^2 + 2 at ``x``."""
    shifted = 1 - x
    return shifted ** 2 + 2


def dx_squared(x):
    """Evaluate 2x - 2, the derivative of (1 - x)^2 + 2, at ``x``."""
    doubled = 2 * x
    return doubled - 2

In [None]:
# Sample the parabola and its derivative at 50 evenly spaced points on [-2, 4].
X = np.linspace(-2, 4, num=50)
Y = x_squared(X)
dY = dx_squared(X)

### 1.1. Slow convergence

In [None]:
# With alpha = 0.005 each update shrinks the distance to the minimum at x = 1
# by only 1% (factor 1 - 2 * alpha = 0.99).
x0, alpha = -2, 0.005

gd_X, gd_Y = trace_grad_descent(x_squared, dx_squared, x0, alpha=alpha, n_iter=100)
ani = animate_grad_descent(X, Y, gd_X, gd_Y, title='alpha = {}'.format(alpha))

### 1.2. Good convergence

In [None]:
# With alpha = 0.1 each update shrinks the distance to the minimum at x = 1
# by 20% (factor 1 - 2 * alpha = 0.8).
x0, alpha = -2, 0.1

gd_X, gd_Y = trace_grad_descent(x_squared, dx_squared, x0, alpha=alpha, n_iter=100)
ani = animate_grad_descent(X, Y, gd_X, gd_Y, title='alpha = {}'.format(alpha))

### 1.3. Overjumping slow convergence

In [None]:
# With alpha = 0.995 the update factor is 1 - 2 * alpha = -0.99: every step
# jumps to the other side of x = 1 and ends up only 1% closer.
x0, alpha = -2, 0.995

gd_X, gd_Y = trace_grad_descent(x_squared, dx_squared, x0, alpha=alpha, n_iter=100)
ani = animate_grad_descent(X, Y, gd_X, gd_Y, title='alpha = {}'.format(alpha))

### 1.4. Divergence

In [None]:
# With alpha = 1.025 the update factor is 1 - 2 * alpha = -1.05: every step
# jumps to the other side of x = 1 and lands 5% farther away.
x0, alpha = 0, 1.025

gd_X, gd_Y = trace_grad_descent(x_squared, dx_squared, x0, alpha=alpha, n_iter=100)
ani = animate_grad_descent(X, Y, gd_X, gd_Y, title='alpha = {}'.format(alpha))

## 2. $y = x \cdot \sin x$

In [None]:
def x_sinx(x):
    """Evaluate y = x * sin(x) at ``x``."""
    sine = np.sin(x)
    return x * sine


def dx_sinx(x):
    """Evaluate sin(x) + x * cos(x), the derivative of x * sin(x), at ``x``."""
    sin_term = np.sin(x)
    cos_term = x * np.cos(x)
    return sin_term + cos_term

In [None]:
# Sample x * sin(x) and its derivative at 100 evenly spaced points on [-2, 8].
X = np.linspace(-2, 8, num=100)
Y = x_sinx(X)
dY = dx_sinx(X)

### 2.1. Slow convergence

In [None]:
# Start at x = 2.5 and take 100 steps with a small step size.
x0, alpha = 2.5, 0.01

gd_X, gd_Y = trace_grad_descent(x_sinx, dx_sinx, x0, alpha=alpha, n_iter=100)
animate_grad_descent(X, Y, gd_X, gd_Y, title='alpha = {}'.format(alpha))

### 2.2. Good convergence

In [None]:
# Start at x = 2.5 and take 100 steps with a moderate step size.
x0, alpha = 2.5, 0.1

gd_X, gd_Y = trace_grad_descent(x_sinx, dx_sinx, x0, alpha=alpha, n_iter=100)
animate_grad_descent(X, Y, gd_X, gd_Y, title='alpha = {}'.format(alpha))

### 2.3. Overjumping divergence

In [None]:
# Start at x = 2.5 and take 100 steps with a large step size.
x0, alpha = 2.5, 0.6

gd_X, gd_Y = trace_grad_descent(x_sinx, dx_sinx, x0, alpha=alpha, n_iter=100)
animate_grad_descent(X, Y, gd_X, gd_Y, title='alpha = {}'.format(alpha))

### 2.4. Convergence with jumping into another local minimum

In [None]:
# Start at x = 2.5 and take 100 steps with step size 1.
x0, alpha = 2.5, 1

gd_X, gd_Y = trace_grad_descent(x_sinx, dx_sinx, x0, alpha=alpha, n_iter=100)
animate_grad_descent(X, Y, gd_X, gd_Y, title='alpha = {}'.format(alpha))

## 3. $z=3x^2 + y^2$

In [None]:
def z(x):
    """Evaluate 3 * x[0]^2 + x[1]^2 for an indexable pair ``x``."""
    first = x[0]
    second = x[1]
    return 3 * first ** 2 + second ** 2


def dz(x):
    """Return the gradient [6 * x[0], 2 * x[1]] of 3 * x[0]^2 + x[1]^2 as an array."""
    return np.array([6 * x[0], 2 * x[1]])

In [None]:
# Build a 200 x 200 grid over [-10, 10] x [-10, 10] and evaluate z on it.
x = np.linspace(-10, 10, num=200)
y = np.linspace(-10, 10, num=200)
X, Y = np.meshgrid(x, y)
Z = z([X, Y])

### 3.1. Slow convergence

In [None]:
# Per-step factors are 1 - 6 * alpha = 0.97 along x[0] and 1 - 2 * alpha = 0.99
# along x[1], so both coordinates approach 0 slowly.
alpha = 0.005

# Take 100 descent steps starting from (-9, -9).
gd_X, gd_Z = trace_grad_descent(f=z, df=dz, x0=np.array([-9.0, -9.0]), alpha=alpha, n_iter=100)

# Stack the visited points into one array (a row per point) and the values into a column.
gd_X, gd_Z = np.array(gd_X), np.array(gd_Z).reshape(-1, 1)

animate_grad_descent_2d(
    X, Y, Z,
    min_point=(0, 0),
    gd_X=gd_X,
    title='$3x^2 + y^2$, alpha={}'.format(alpha),
)

### 3.2. Good convergence

In [None]:
# Per-step factors are 1 - 6 * alpha = 0.4 along x[0] and 1 - 2 * alpha = 0.8
# along x[1], so both coordinates shrink towards 0 without changing sign.
alpha = 0.1

# Take 100 descent steps starting from (-9, -9).
gd_X, gd_Z = trace_grad_descent(f=z, df=dz, x0=np.array([-9.0, -9.0]), alpha=alpha, n_iter=100)

# Stack the visited points into one array (a row per point) and the values into a column.
gd_X, gd_Z = np.array(gd_X), np.array(gd_Z).reshape(-1, 1)

animate_grad_descent_2d(
    X, Y, Z,
    min_point=(0, 0),
    gd_X=gd_X,
    title='$3x^2 + y^2$, alpha={}'.format(alpha),
)

### 3.3. Overjumping convergence

In [None]:
# Per-step factors are 1 - 6 * alpha = -0.5 along x[0] and 1 - 2 * alpha = 0.5
# along x[1]: x[0] flips sign on every step while still halving in magnitude.
alpha = 0.25

# Take 100 descent steps starting from (-9, -9).
gd_X, gd_Z = trace_grad_descent(f=z, df=dz, x0=np.array([-9.0, -9.0]), alpha=alpha, n_iter=100)

# Stack the visited points into one array (a row per point) and the values into a column.
gd_X, gd_Z = np.array(gd_X), np.array(gd_Z).reshape(-1, 1)

animate_grad_descent_2d(
    X, Y, Z,
    min_point=(0, 0),
    gd_X=gd_X,
    title='$3x^2 + y^2$, alpha={}'.format(alpha),
)