#  Loading the dataset 

In [2]:
from sklearn.datasets import load_digits
# Load scikit-learn's bundled digits dataset (a dict-like container).
digits = load_digits()
# List the available fields. The call form of print is valid on both Python 2
# and Python 3; list() keeps the output a plain list on either version.
print(list(digits.keys()))
# Pull out the fields used below.
data = digits["data"]                  # one feature row per sample (64 columns, see X.shape below)
images = digits["images"]              # same samples as image arrays -- presumably 8x8, TODO confirm
target = digits["target"]              # digit label of every sample
target_names = digits["target_names"]
import numpy as np
from scipy.sparse.linalg import lsqr

['images', 'data', 'target_names', 'DESCR', 'target']


In [3]:
# Build the binary problem "3 vs. 8": keep only those two digits and relabel
# them as +1 (digit 3) and -1 (digit 8).
is_3_or_8 = (target == 3) | (target == 8)
y = target[is_3_or_8]  # boolean indexing yields a copy, so `target` stays untouched
y[y == 3] = 1
y[y == 8] = -1
images_ = images[is_3_or_8]
X = data[is_3_or_8]
X.shape

(357, 64)

# Basic functions

In [4]:
def sigmoid(Z):
    """Element-wise logistic function 1 / (1 + exp(-Z)).

    The value is computed as exp(-log(1 + exp(-Z))) through np.logaddexp, so
    large negative inputs no longer overflow inside exp (the naive formula
    produces an overflow RuntimeWarning there).

    Parameters
    ----------
    Z : scalar or array-like
        Input value(s).

    Returns
    -------
    Float scalar or ndarray with the same shape as Z, values in [0, 1].
    """
    # log(1 + exp(-Z)) == logaddexp(0, -Z), evaluated without overflow.
    return np.exp(-np.logaddexp(0., np.negative(Z)))

def gradient(beta,X,y):
    """Gradient of the mean logistic loss mean_i log(1 + exp(-y_i * x_i . beta)).

    Parameters
    ----------
    beta : 1-D ndarray, weight vector.
    X : ndarray, either a 2-D matrix with one sample per row or a single 1-D
        sample (broadcasting then treats it as one row).
    y : 1-D ndarray of labels, one per sample. For a single 1-D sample pass a
        length-1 array, because y is indexed with y[:, np.newaxis].

    Returns
    -------
    1-D ndarray: sum over samples of -y_i * x_i * (1 - sigmoid(y_i * x_i . beta)),
    divided by the number of labels.
    """
    labels = y[:, np.newaxis]
    margins = labels * np.dot(X, beta[:, np.newaxis])
    per_sample = (-labels * X) * (1 - sigmoid(margins))
    # Divide by the sample count itself rather than by the shape tuple.
    return np.sum(per_sample, 0) / y.shape[0]

def predict(beta, X):
    """Return the sign of the linear score X . beta for every sample.

    The result contains +1, -1, or 0 (for a score of exactly zero).
    """
    scores = np.dot(X, beta)
    return np.sign(scores)

def zero_one_loss(y_p, y_g):
    """Fraction of entries in which prediction y_p differs from ground truth y_g."""
    n_samples = float(len(y_p))
    n_wrong = np.sum(y_p != y_g)
    return n_wrong / n_samples

# Optimization Method

SGD and its variants sample without replacement: the index array is shuffled only once per epoch, which reduces the number of sampling calls and simplifies slicing in the mini-batch versions.

In [5]:
def gradient_descent(beta, X, y, tau, m=10):
    """Take m full-batch gradient steps with the constant step size tau.

    beta is rebound to a new array on every step (the input is not modified
    in place); the final iterate is returned.
    """
    for _ in range(m):
        direction = gradient(beta, X, y)
        beta = beta - tau * direction
    return beta

def stochastic_gradient_descent(beta, X, y, tau_0, gamma=1, m=150):
    """Stochastic gradient descent using one sample per step.

    Samples are visited without replacement: the index array is reshuffled
    at the start of every pass over the data.

    Parameters
    ----------
    beta : 1-D array-like, initial weights (left unmodified).
    X, y : sample matrix and 1-D label array.
    tau_0 : initial step size.
    gamma : decay rate of the step size.
    m : number of update steps.

    Returns
    -------
    1-D float ndarray with the weights after m steps.

    Notes
    -----
    Reseeds NumPy's global RNG with 1 on every call. tau is refreshed after
    the update, so step t >= 1 uses tau_0 / (1 + (t - 1) * gamma).
    """
    np.random.seed(1)
    # Work on a float copy: the in-place updates below must neither modify the
    # caller's array nor fail on an integer-typed start vector.
    beta = np.array(beta, dtype=float)
    tau = tau_0
    n = len(y)
    indexx = np.arange(n)
    for t in range(m):
        i = np.mod(t, n) # np modulus returns 0 if divisor = 0
        if i == 0:
            np.random.shuffle(indexx)
        j = indexx[i]
        # X[j] is a single 1-D sample; y[j:j+1] keeps y an array of length 1,
        # which gradient() requires for its y[:, np.newaxis] indexing.
        beta -= tau*gradient(beta, X[j], y[j:j+1])
        tau = tau_0 / (1. + t*gamma)
    return beta


def stochastic_gradient_mini(beta, X, y, tau_0, gamma=1, m=150, b=30):
    """Mini-batch stochastic gradient descent with batches of b samples.

    Each epoch walks once through a freshly shuffled index array in slices of
    length b. If len(y) is not a multiple of b, the last (short) batch of the
    epoch is filled up to b samples with indices drawn uniformly at random.

    Parameters
    ----------
    beta : 1-D array-like, initial weights (left unmodified).
    X, y : sample matrix (one row per sample) and 1-D label array.
    tau_0 : initial step size.
    gamma : decay rate of the step size.
    m : number of update steps (mini-batches processed).
    b : mini-batch size, must be >= 1.

    Returns
    -------
    1-D float ndarray with the weights after m steps.

    Raises
    ------
    ValueError
        If b < 1 or there are no samples.

    Notes
    -----
    Reseeds NumPy's global RNG with 1 on every call.
    """
    n = len(y)
    if b < 1:
        raise ValueError("mini-batch size b must be at least 1")
    if n == 0:
        raise ValueError("cannot run mini-batch SGD without samples")
    np.random.seed(1)
    beta = np.array(beta, dtype=float)  # float copy, caller's array stays intact
    tau = tau_0
    # `full` complete batches per epoch plus one short batch when rest > 0.
    full, rest = divmod(n, b)
    batches_per_epoch = full + (1 if rest else 0)
    indexx = np.arange(n)
    for t in range(m):
        k = t % batches_per_epoch  # position of this batch within the epoch
        if k == 0:
            np.random.shuffle(indexx)
        i = indexx[b*k:b*(k+1)]
        if len(i) < b:
            # every last minibatch per epoch gets filled with random samples
            i = np.append(i, np.random.randint(n, size=b-len(i)))
        beta -= tau*gradient(beta,X[i],y[i])
        tau = tau_0/(1. + t*gamma)
    return beta
                     
                          
def stochastic_gradient_mom(beta, X, y, tau_0, mu=0.3, m=150, gamma=1):
    """Single-sample SGD whose step follows a running average of gradients.

    Each step updates g = mu * g + (1 - mu) * gradient and moves beta along
    -tau * g.

    Parameters
    ----------
    beta : 1-D array-like, initial weights (left unmodified).
    X, y : sample matrix and 1-D label array.
    tau_0 : initial step size.
    mu : weight of the previous direction g in the running average.
    m : number of update steps.
    gamma : decay rate of the step size. Appended as a trailing keyword
        because the body used `gamma` without ever defining it.

    Returns
    -------
    1-D float ndarray with the weights after m steps.

    Notes
    -----
    Reseeds NumPy's global RNG with 1 on every call.
    """
    np.random.seed(1)
    beta = np.array(beta, dtype=float)  # float copy, caller's array stays intact
    tau = tau_0
    n = len(y)
    g = 0
    indexx = np.arange(n)
    for t in range(m):
        i = np.mod(t, n) # np modulus returns 0 if divisor = 0
        if i == 0:
            np.random.shuffle(indexx)
        j = indexx[i]
        g = mu * g + (1 - mu) * gradient(beta, X[j], y[j:j+1])
        beta -= tau*g
        tau = tau_0 / (1. + t*gamma)
    return beta
                          
def average_stochastic_gradient(beta, X, y, tau_0, mu, gamma=1, m=150):
    """Averaged stochastic gradient: beta is a running average of an iterate g.

    Each step moves g by -tau * gradient and then blends it into beta with
    beta = (1 - mu) * beta + mu * g.

    Parameters
    ----------
    beta : 1-D ndarray, initial weights (not modified in place).
    X, y : sample matrix and 1-D label array.
    tau_0 : initial step size.
    mu : blending weight of the new iterate g.
    gamma : decay rate of the step size (new trailing keyword; the body used
        `gamma` without defining it).
    m : number of update steps (new trailing keyword; the body used `m`
        without defining it).

    Returns
    -------
    1-D ndarray with the averaged weights after m steps.

    Notes
    -----
    Reseeds NumPy's global RNG with 1 on every call.
    NOTE(review): the gradient is evaluated at the averaged beta, not at the
    raw iterate g, and g starts at 0 rather than at beta -- confirm this is
    the intended formulation.
    """
    np.random.seed(1)
    tau = tau_0
    n = len(y)
    g = 0
    indexx = np.arange(n)
    for t in range(m):
        i = np.mod(t, n)
        if i == 0:
            np.random.shuffle(indexx)
        j = indexx[i]
        g = g - tau * gradient(beta, X[j], y[j:j+1])
        beta = (1 - mu) * beta + mu * g
        # 0.75 instead of 3/4: on Python 2 the latter is integer division (0),
        # which silently kept the step size constant.
        tau = tau_0 / np.power(1. + t*gamma, 0.75)
    return beta
    
def stochastic_average_gradient(beta, X, y, tau_0, m=150, gamma=1):  # Roux et al. [2012]
    """Stochastic average gradient: step along the mean of stored gradients.

    A table holds one gradient per sample (initially all zeros). Each step
    refreshes the entry of the current sample and moves beta along the mean
    of the whole table.

    Parameters
    ----------
    beta : 1-D array-like, initial weights (left unmodified).
    X, y : sample matrix and 1-D label array.
    tau_0 : initial step size.
    m : number of update steps.
    gamma : decay rate of the step size. Appended as a trailing keyword
        because the body used `gamma` without ever defining it.

    Returns
    -------
    1-D float ndarray with the weights after m steps.

    Notes
    -----
    Reseeds NumPy's global RNG with 1 on every call.
    """
    np.random.seed(1)
    beta = np.array(beta, dtype=float)  # float copy, caller's array stays intact
    tau = tau_0
    n = len(y)
    indexx = np.arange(n)
    gradients = np.zeros((n, beta.size))
    for t in range(m):
        # Position inside the current epoch: t mod n (operands were swapped).
        i = np.mod(t, n)
        if i == 0:
            np.random.shuffle(indexx)
        j = indexx[i]
        gradients[j] = gradient(beta, X[j], y[j:j+1])
        beta -= tau*gradients.mean(axis=0)
        tau = tau_0 / (1. + t*gamma)
    return beta
        
        
def dual_coordinate_ascent(beta,X,y,tau_0=0.1,gamma=0.01,m = 150):
    """Cyclic coordinate updates on per-sample coefficients a_i in [0, 1].

    The weights are kept equal to sum_i a_i * y_i * x_i. Each step changes one
    coefficient to clip(a_i - y_i * (x_i . beta) / (x_i . x_i), 0, 1) and
    applies the corresponding change to beta. Samples are visited in their
    natural order 0, 1, ..., n-1, 0, ...

    Parameters
    ----------
    beta : ignored; the start vector is rebuilt from random coefficients.
    X : 2-D ndarray, one sample per row.
    y : 1-D ndarray of labels.
    tau_0, gamma : unused; accepted so the function can be called with the
        same argument list as the gradient-based optimizers.
    m : number of coordinate updates.

    Returns
    -------
    1-D ndarray with the weights after m updates.

    Notes
    -----
    Reseeds NumPy's global RNG with 1 on every call; the initial coefficients
    are drawn with np.random.rand.
    """
    np.random.seed(1)
    n = len(y)
    a = np.random.rand(n)
    beta = np.sum(a[:,np.newaxis]*y[:,np.newaxis]*X,0)
    if n == 0:
        return beta
    # Only the diagonal of X X^T is needed, so compute the squared row norms
    # directly instead of building the full n-by-n Gram matrix.
    sq_norms = np.sum(X*X, axis=1)
    for t in range(m):
        i = t % n
        if sq_norms[i] == 0:
            # An all-zero sample contributes nothing to beta; skipping it
            # avoids a 0/0 division that would turn beta into NaN.
            continue
        # Scalar indexing (y[i] instead of y[i:i+1]) keeps every quantity a
        # scalar, so nothing relies on implicit 1-element-array conversion.
        a_new = np.clip(a[i] - y[i]*np.dot(X[i], beta)/sq_norms[i], 0, 1)
        beta = beta + (a_new - a[i])*y[i]*X[i]
        a[i] = a_new
    return beta

def weighted_least_squares(beta,X,y,m=10):
    """Fit the weights by m rounds of reweighted least squares.

    Each round computes the scores z = X . beta, the per-sample weights
    w_i = sqrt(sigmoid(z_i) * (1 - sigmoid(z_i)) / n) and the working
    response z_i + y_i / sigmoid(y_i * z_i), then solves
    argmin_beta || w * (response - X . beta) ||^2 with scipy's lsqr.

    Parameters
    ----------
    beta : 1-D ndarray, initial weights (not modified in place).
    X : 2-D ndarray, one sample per row.
    y : 1-D ndarray of labels.
    m : number of reweighting rounds.

    Returns
    -------
    ndarray with the shape of the input beta.
    """
    n = len(y)
    for t in range(0, m):
        z = np.dot(X, beta)
        s = sigmoid(z)
        # Diagonal weights kept as a vector: scaling rows by w gives the same
        # numbers as multiplying with an n-by-n diagonal matrix, without the
        # quadratic memory and the per-sample Python loop.
        w = np.sqrt(1. / n * s * (1 - s))
        y_tilde = y / sigmoid(y * z)
        z_tilde = w * (z + y_tilde)
        X_tilde = w[:, np.newaxis] * X

        # use scipy lsqr to solve argmin_{\beta}(z_tilde-X_tilde*\beta)^2
        ls_obj = lsqr(X_tilde, z_tilde)
        beta = np.reshape(ls_obj[0], beta.shape)

    return beta

# 2.3 Comparison

In [6]:
from sklearn import cross_validation
from itertools import product

# Hold out 30% of the samples as a test set. NOTE: X and y are rebound to the
# training portion, so running this cell twice splits the reduced data again.
X, X_test, y, y_test = cross_validation.train_test_split(
    X, y, test_size=0.3, random_state=0)

# Candidate values for the hyper-parameter grid search.
taus = [1e-3, 1e-2, 1e-1]
mus = [0.1, 0.2, 0.5]
gammas = [1e-4, 1e-3, 1e-2]

# Gradient-based optimizers; dual_coordinate_ascent and weighted_least_squares
# are deliberately left out of this list.
optimizers = [
    gradient_descent,
    stochastic_gradient_descent,
    stochastic_gradient_mini,
    stochastic_gradient_mom,
    stochastic_average_gradient,
]

In [7]:
# 10-fold cross-validation splitter built from the number of training samples.
kf = cross_validation.KFold(len(y), n_folds=10)

def opt_params(kf, optimizer, values):
    """Grid-search hyper-parameters of `optimizer` by cross-validation.

    Every combination from the Cartesian product of `values` is scored by the
    mean zero-one loss over the folds of `kf`; each combination and its error
    are printed.

    Parameters
    ----------
    kf : iterable of (train_index, validation_index) pairs, or a splitter
        object exposing split(X) that yields such pairs.
    optimizer : callable invoked as optimizer(beta, X_train, y_train, *combination)
        with an all-zero start vector; must return the fitted weights.
    values : list of lists with candidate values, one list per extra
        positional argument. An empty list evaluates the optimizer once
        with no extra arguments.

    Returns
    -------
    (best_error, best_params): the lowest mean validation error and the
    combination that reached it. (inf, None) if no combination yields an
    error comparable below infinity (e.g. every error is NaN).

    Notes
    -----
    Reads the module-level training data X and y.
    """
    # Materialise the folds once so every combination sees the same splits and
    # the fold count is available without calling len() on the splitter.
    folds = list(kf.split(X)) if hasattr(kf, "split") else list(kf)
    n_folds = len(folds)
    best_error = float("inf")
    best_params = None  # stays None only when no error beats infinity
    for combination in product(*values):
        error = 0
        for train_index, validation_index in folds:
            X_train, X_validation = X[train_index], X[validation_index]
            y_train, y_validation = y[train_index], y[validation_index]
            beta = np.zeros(X_train[0,:].shape)
            b = optimizer(beta, X_train, y_train, *combination)
            error += zero_one_loss(predict(b, X_validation), y_validation) / n_folds
        print(combination, error)
        if error < best_error:
            best_error = error
            best_params = combination
    return best_error, best_params

## Gradient Descent

In [8]:
# Grid-search the constant step size tau of full-batch gradient descent.
opt_params(kf, gradient_descent, [taus])

((0.001,), 0.060499999999999998)
((0.01,), 0.048333333333333332)
((0.1,), 0.1615)


(0.048333333333333332, (0.01,))

## SGD

In [9]:
# Grid-search the initial step size tau_0 and the decay rate gamma of SGD.
opt_params(kf, stochastic_gradient_descent, [taus, gammas])

((0.001, 0.0001), 0.048333333333333339)
((0.001, 0.001), 0.048333333333333339)
((0.001, 0.01), 0.036333333333333336)
((0.01, 0.0001), 0.048166666666666663)
((0.01, 0.001), 0.056166666666666656)
((0.01, 0.01), 0.036166666666666666)
((0.1, 0.0001), 0.060166666666666667)
((0.1, 0.001), 0.072166666666666657)
((0.1, 0.01), 0.052500000000000005)


(0.036166666666666666, (0.01, 0.01))

## Dual Coordinate Ascent

In [10]:
# NOTE: dual_coordinate_ascent never reads tau_0 or gamma, so every grid point
# runs the identical computation and reports the same error.
opt_params(kf, dual_coordinate_ascent, [taus, gammas])

((0.001, 0.0001), 0.05616666666666667)
((0.001, 0.001), 0.05616666666666667)
((0.001, 0.01), 0.05616666666666667)
((0.01, 0.0001), 0.05616666666666667)
((0.01, 0.001), 0.05616666666666667)
((0.01, 0.01), 0.05616666666666667)
((0.1, 0.0001), 0.05616666666666667)
((0.1, 0.001), 0.05616666666666667)
((0.1, 0.01), 0.05616666666666667)


(0.05616666666666667, (0.001, 0.0001))

## WLSQR

In [11]:
# No hyper-parameters to tune: the empty grid yields the single combination ().
opt_params(kf, weighted_least_squares, [])

((), 0.036333333333333336)


(0.036333333333333336, ())