In [13]:
# some standard imports
import matplotlib.pyplot as plt
import autograd.numpy as np
from autograd import grad as compute_grad   
# flatten_func lives in autograd.misc.flatten on current autograd releases; fall back to
# its old location (autograd.util) so the notebook also runs on older installs
try:
    from autograd.misc.flatten import flatten_func
except ImportError:
    from autograd.util import flatten_func

# this is needed to compensate for matplotlib notebook's tendency to blow up images when plotted inline
%matplotlib notebook
from matplotlib import rcParams
rcParams['figure.autolayout'] = True

# Gradient descent

In [2]:
# gradient descent function
def gradient_descent(g,w,alpha,max_its,version):
    """Run a fixed number of gradient descent steps on the cost ``g``.

    Args:
        g: cost function to minimize, differentiated with respect to its weight input.
        w: initial weights; flattened internally by ``flatten_func``.
        alpha: steplength applied at every step.
        max_its: number of descent steps to take.
        version: when equal to ``'normalized'`` the gradient is divided by its norm
            before each step; any other value takes the raw gradient step.

    Returns:
        A list of ``max_its + 1`` weights (starting point first), each one passed
        through ``unflatten`` before being stored.
    """
    # differentiate a flattened version of the cost so weights / gradients are flat vectors
    flat_cost, unflatten, w = flatten_func(g, w)
    flat_gradient = compute_grad(flat_cost)
    use_normalized = version == 'normalized'

    # the history starts with the initial point
    weight_history = [unflatten(w)]

    for _ in range(max_its):
        # gradient at the current point, shaped like the flat weights
        direction = flat_gradient(w)
        direction.shape = np.shape(w)

        if use_normalized:
            magnitude = np.linalg.norm(direction)
            if magnitude == 0:
                # nudge a zero norm by +/- 1e-6 so the division below is well defined
                magnitude += 10**-6*np.sign(2*np.random.rand(1) - 1)
            direction /= magnitude

        # plain descent step (no momentum term is used)
        w = w - alpha*direction
        weight_history.append(unflatten(w))

    return weight_history

# load a face detection dataset

A face detection dataset of size $P = 10,000$ (about 5K face images, plus non-face images).

In [3]:
# read the full dataset from disk; the last column is split off as the output y,
# every other column is kept as the input x
datapath = '../../mlrefined_datasets/convnet_datasets/feat_face_data.csv'
data = np.loadtxt(datapath, delimiter=',')
x, y = data[:, :-1], data[:, -1:]

Let's take a random subset of 8,000 points for training.

In [4]:
# shuffle the data, then split it into disjoint training and testing sets:
# the last 2,000 shuffled points are held out for testing, all earlier ones are used
# for training (previously ind[:-1] kept all but one point, overlapping the test set)
ind = np.random.permutation(len(data))
data_train = data[ind[:-2000],:]
data_test = data[ind[-2000:],:]

In [5]:
# from here on x / y refer to the training portion only; num_pts is its size
x, y = data_train[:, :-1], data_train[:, -1:]
num_pts = len(y)

# Cost functions

Nice compact cost function.

In [6]:
# the softmax cost function
def softmax(w):
    """Softmax cost of the weights ``w`` over the module-level data ``x`` / ``y``.

    ``w[0]`` is used as the bias and ``w[1:]`` as the feature-touching weights.
    Returns the sum over all points of log(1 + exp(-y*(w[0] + x.w[1:]))).
    """
    # log(1 + e^z) == logaddexp(0, z); the latter does not overflow to inf for large z
    z = (-y)*(w[0] + np.dot(x,w[1:]))
    cost = np.sum(np.logaddexp(0, z))
    return cost

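To use stochastic / mini-batch gradient descent we need a cost function that takes in only a subset of the data. We select that subset with a second input - here named ``iter`` - holding the indices of the points in the current mini-batch. By default ``autograd`` differentiates only with respect to the first argument (the weights ``w``), so this index input is not differentiated. Note that ``iter`` is also the name of a built in ``Python`` function, which this argument shadows inside the cost function.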

In [7]:
# the softmax cost - with minibatch input
def minibatch_softmax(w,iter):
    """Softmax cost of ``w`` evaluated only on the points selected by ``iter``.

    Args:
        w: weights; ``w[0]`` is used as the bias, ``w[1:]`` as the feature-touching weights.
        iter: indices into the module-level data ``x`` / ``y`` selecting the current
            mini-batch (note: shadows the builtin ``iter`` inside this function).

    Returns:
        Sum over the selected points of log(1 + exp(-y_p*(w[0] + x_p.w[1:]))).
    """
    # get subset of points
    x_p = x[iter,:]
    y_p = y[iter]

    # compute cost over just this subset; log(1 + e^z) == logaddexp(0, z), and the
    # latter does not overflow to inf for large z
    z = (-y_p)*(w[0] + np.dot(x_p,w[1:]))
    cost = np.sum(np.logaddexp(0, z))
    return cost

# Minibatch gradient descent

Not much has changed here - we can just plug in various subsets of the data in our standard gradient descent loop.

In [8]:
# gradient descent function
def minibatch_gradient_descent(g,w,alpha,batch_size,max_its,version):
    """Run mini-batch gradient descent on the cost ``g``.

    Each of the ``max_its`` outer iterations sweeps once over consecutive index
    batches ``[0, batch_size)``, ``[batch_size, 2*batch_size)``, ... up to ``num_pts``
    (read from module scope), taking one descent step per batch.

    Args:
        g: cost function called as ``g(w, batch_inds)``; differentiated with respect to ``w``.
        w: initial weights; flattened internally by ``flatten_func``.
        alpha: steplength applied at every step.
        batch_size: number of indices per mini-batch (the final batch may be smaller).
        max_its: number of full sweeps over the batches.
        version: when equal to ``'normalized'`` each gradient is divided by its norm
            before stepping; any other value takes the raw gradient step.

    Returns:
        A list of ``max_its + 1`` weights (starting point first, then one entry per
        sweep), each one passed through ``unflatten`` before being stored.
    """
    # differentiate a flattened version of the cost so weights / gradients are flat vectors
    flat_cost, unflatten, w = flatten_func(g, w)
    flat_gradient = compute_grad(flat_cost)
    use_normalized = version == 'normalized'

    # the history starts with the initial point
    weight_history = [unflatten(w)]

    # number of mini-batches needed to cover all num_pts points once
    num_batches = int(np.ceil(np.divide(num_pts,batch_size)))

    for _ in range(max_its):
        for batch in range(num_batches):
            # indices of the current mini-batch, clipped at the end of the data
            first = batch*batch_size
            last = min((batch+1)*batch_size,num_pts)
            batch_inds = np.arange(first,last)

            # gradient of the cost restricted to this batch
            direction = flat_gradient(w,batch_inds)
            direction.shape = np.shape(w)

            if use_normalized:
                magnitude = np.linalg.norm(direction)
                if magnitude == 0:
                    # nudge a zero norm by +/- 1e-6 so the division below is well defined
                    magnitude += 10**-6*np.sign(2*np.random.rand(1) - 1)
                direction /= magnitude

            # plain descent step (no momentum term is used)
            w = w - alpha*direction

        # one history entry per full sweep over the batches
        weight_history.append(unflatten(w))

    return weight_history

Now we can test the minibatch extension out - comparing it to the standard gradient descent module.

In [11]:
# draw a small random starting point: one bias plus one weight per column of x
scale = 0.1
w_init = scale * np.random.randn(x.shape[1] + 1, 1)

In [14]:
# full-batch gradient descent on the softmax cost, starting from w_init
g, w = softmax, w_init
scale = 0.1
alpha, max_its = 10**(-3), 100
version = 'unnormalized'
weight_history_1 = gradient_descent(g, w, alpha=alpha, max_its=max_its, version=version)

In [15]:
# run minibatch gradient descent, starting from the same w_init as the full-batch run
# (w_init is deliberately NOT redrawn here, so both runs are compared from one starting point)
g = minibatch_softmax
w = w_init
alpha = 10**(-1)
max_its = 100
version = 'unnormalized'
batch_size = 100
weight_history_2 =  minibatch_gradient_descent(g,w,alpha,batch_size,max_its,version)

Below is a function that will plot the cost function histories corresponding to our two weight histories.

In [16]:
# import plotting library and other necessities
import matplotlib.pyplot as plt
from matplotlib import gridspec
import copy

# our plotting function
def plot_history(x,y,weight_histories,labels=None):
    '''
    Plot the cost and misclassification histories for one or more runs of gradient descent.

    Inputs:
      x, y             - data the histories are evaluated on (inputs and outputs)
      weight_histories - list of weight histories, one per run; each weight is used as
                         w[0] (bias) and w[1:] (feature-touching weights)
      labels           - optional list of legend labels, one per run.  Runs without a label
                         fall back to 'full grad', 'mini-batch', 'stochastic' for the first
                         three runs and to 'run N' for any further run.

    Draws a two-panel figure (left: cost, right: misclassifications) and shows it.
    '''

    # local copy of the softmax cost, for scoping issues; log(1 + e^z) is written as
    # logaddexp(0, z) so very large margins do not overflow to inf in the plot
    softmax = lambda w: np.sum(np.logaddexp(0, (-y)*(w[0] + np.dot(x,w[1:]))))
    # each term is 0.25*(sign(prediction) - y)^2, i.e. 1 per sign mismatch
    # (assumes the entries of y are +1 / -1 - TODO confirm against the dataset)
    count = lambda w: 0.25*np.sum((np.sign(w[0] + np.dot(x,w[1:])) - y)**2)

    default_labels = ['full grad','mini-batch','stochastic']

    def _label_for(index):
        # caller-supplied label first, then the historical defaults, then a generic name
        if labels is not None and index < len(labels):
            return labels[index]
        if index < len(default_labels):
            return default_labels[index]
        return 'run ' + str(index + 1)

    # initialize figure
    fig = plt.figure(figsize = (9,3))

    # create figure with 2 panels: cost history (left), misclassification history (right)
    gs = gridspec.GridSpec(1, 2)
    ax1 = plt.subplot(gs[0])
    ax2 = plt.subplot(gs[1])

    # loop over histories and plot all
    for index, weight_history in enumerate(weight_histories):
        # cost and misclassification count at every recorded weight of this run
        cost_history = [softmax(weight) for weight in weight_history]
        count_history = [count(weight) for weight in weight_history]

        # now plot each, one per panel
        ax1.plot(cost_history)
        ax2.plot(count_history,label = _label_for(index))

    # label each panel
    ax1.set_xlabel('iteration')
    ax1.set_ylabel('cost function val')
    ax1.set_title('cost function history')

    ax2.set_xlabel('iteration')
    ax2.set_ylabel('misclassifications')
    ax2.set_title('number of misclassifications')

    ax2.legend()

    plt.show()

In [17]:
# compare the full-batch and mini-batch runs on the training data
weight_histories = [weight_history_1, weight_history_2]
plot_history(x, y, weight_histories)

<IPython.core.display.Javascript object>

Yep - minibatch is pretty good!