## Loading the dataset 

In [1]:
from sklearn.datasets import load_digits
digits = load_digits ()
print(digits.keys())
data = digits["data"]
images = digits["images"]
target = digits["target"]
target_names = digits["target_names"]
import numpy as np

dict_keys(['DESCR', 'images', 'target_names', 'data', 'target'])


In [2]:
# Create feature matrix X and groundtruth y
y = target[(target == 3) | (target == 8)]
y[y == 3] = 1
y[y == 8] = -1
images_ = images[(target == 3) | (target == 8)]
X = data[(target == 3) | (target == 8)]
np.shape(X)

(357, 64)

## Basic functions

In [126]:
def sigmoid(Z):
    return 1/(1+np.exp(-Z))

def gradient(beta, X, y):
    gradients = (1-sigmoid(y[:, np.newaxis] * X @ beta)).reshape(-1,1) * (-y[:, np.newaxis] * X)
    return np.sum(gradients, 0) / len(y)

def gradient_i(beta, x, y):
    return (1-sigmoid(y * x @ beta)) * (-y * x)

def predict(beta, X):
    p = sigmoid(np.sign(X @ beta))
    p[p >= 0] = 1
    p[p != 1] = -1
    return p

def zero_one_loss(y_p, y_g):
        return np.sum(y_p != y_g)

In [127]:
gradient_i(b_, x_,y_)

array([ 0. ,  0. ,  2. ,  5.5,  7.5,  3.5,  0. ,  0. ,  0. ,  1. ,  7.5,
        7. ,  4.5,  7.5,  0.5,  0. ,  0. ,  4. ,  7.5,  0.5,  3. ,  8. ,
        2.5,  0. ,  0. ,  3. ,  7. ,  6.5,  7.5,  3. ,  0. ,  0. ,  0. ,
        0.5,  8. ,  8. ,  3. ,  0. ,  0. ,  0. ,  0. ,  2. ,  7.5,  5.5,
        7.5,  0.5,  0. ,  0. ,  0. ,  0.5,  6. ,  1.5,  3.5,  4.5,  0. ,
        0. ,  0. ,  0. ,  2. ,  7. ,  8. ,  3. ,  0. ,  0. ])

array([10, 11, 12, 13, 14])

## Optimization Method
SGD and its variants use sampling without replacement.
It reduces the number of sampling calls by shuffling an index array only once per epoch and
simplifies slicing in mini batch versions.

In [130]:
def gradient_descent(beta, X, y, tau, m=10):
    for t in range(m):
        beta = beta - tau * gradient(beta, X, y)
    return beta

def stochastic_gradient_descent(beta, X, y, tau_0, gamma=1, m=150):
    tau = tau_0
    n = len(y)
    indexx = np.arange(n)
    for t in range(m):
        i = np.mod(t, n) # np modulus returns 0 if divisor = 0
        if i == 0:
            np.random.shuffle(indexx)
        beta -= tau*gradient_i(beta, X[indexx[i]], y[indexx[i]])
        tau = tau_0 / (1 + t*gamma)
    return beta


def stochastic_gradient_mini(beta, X, y, tau_0, gamma=1, m=150, b=30):
    tau = tau_0
    n = len(y)
    indexx = np.arange(n)
    np.shuffle(indexx)
    full, rest = np.mod(n, b)
    for t in range(m):
        if np.mod(t, full+1) == 0:
            # every last minibatch per epoch gets filled with random samples
            i = np.append(indexx[b*t,:], np.random.randint(n, size=rest))
            np.shuffle(indexx)
        else:
            i = indexx[b*t,b*(t+1)]
        beta -= tau*gradient(beta,X[i],y[i])
        tau = tau_0/(1 + t*gamma)
    return beta
                     
                          
def stochastic_gradient_mom(beta, X, y, tau_0, mu=0.3, m=150):
    tau = tau_0
    n = len(y)
    g = 0
    indexx = np.arange(n)
    for t in range(m):
        i = np.mod(t, n) # np modulus returns 0 if divisor = 0
        if i == 0:
            np.random.shuffle(indexx)
        g = mu * g + (1 - mu) * gradient_i(beta, X[indexx[i]], y[indexx[i]])
        beta -= tau*g
        tau = tau_0 / (1 + t*gamma)
    return beta                          
                          
def average_stochastic_gradient(beta, X, y, tau_0, mu):
    tau = tau_0
    n = len(y)
    g = 0
    indexx = np.arange(n)
    for t in range(m):
        i = np.mod(t, n)
        if i == 0:
            np.random.shuffle(indexx)
        g -= tau * gradient_i(beta, X[indexx[i]], y[indexx[i]])
        beta = (1 - mu) * beta + mu * g
        tau = tau_0 / np.power(1 + t*gamma,3/4)
    return beta
    
def stochastic_average_gradient(beta, X, y, tau_0, m=150):  # Roux et al. [2012]
    tau = tau_0
    n = len(y)
    indexx = np.arange(n)
    gradients = np.zeros((n, beta.size))
    for t in range(m):
        i = np.mod(n, t)
        if i == 0:
            np.random.shuffle(indexx)
        gradients[indexx[i]] = gradient_i(
                        beta, x[indexx[i]], y[indexx[i]])
        beta -= tau*gradients.mean(axis=0)
        tau = tau_0 / (1 + t*gamma)
    return beta
        
        
def dual_coordinate_ascent(beta,X,y,tau,tau_0,gamma,mu):
    a = np.random.rand(len(y))
    beta =a*y*X.transpose()
    b=1
    for t in range(0,m):   
        i = np.random.randint(0,len(y)-1,size=b)
        a_[i] = np.clip(a(i), 0,1) # add formula 
        beta = beta+(a_(i)-a(i))*y[i_]*X[i,:].transpose()
        tau = tau/(1+gamma*t)
    return beta
    
def weighted_lest_squares(beta,X,y,tau,tau_0,gamma,mu):
    
    return beta
    

2.3 Comparison
===

In [87]:
from sklearn import cross_validation
from itertools import product

X, X_test, y, y_test = cross_validation.train_test_split(X, y,
                                            random_state=0, test_size=0.3)
taus = [0.001, 0.01, 0.1]
mus = [0.1, 0.2, 0.5]
gammas = [0.0001, 0.001, 0.01]

optimizers = [gradient_descent, stochastic_gradient_descent, stochastic_gradient_mini,
             stochastic_gradient_mom, stochastic_average_gradient,]
              #dual_coordinate_ascent, weighted_lest_squares]

In [94]:
kf = cross_validation.KFold(y.shape[0], n_folds=10)

def opt_params(kf, optimizer, values):
    best_error = 5000 # ok bc/ zero_one_loss limits to number of samples
    for combination in product(*values):
        error = 0        
        for train_index, validation_index in kf:
            X_train, X_validation = X[train_index], X[validation_index]
            y_train, y_validation = y[train_index], y[validation_index]
            beta = np.zeros(X_train[0,:].shape)
            b = optimizer(beta, X_train, y_train, *combination)
            error += zero_one_loss(predict(b, X_validation), y_validation)
        print(*combination, error)
        if error < best_error:
            best_error = error
            best_params = combination
    return best_error, best_params
        

Gradient Descent
---

In [95]:
opt_params(kf, gradient_descent, [taus])

0.001 21
0.01 21
0.1 21


(21, (0.001,))

SGD
---

In [141]:
opt_params(kf, stochastic_gradient_descent, [taus, gammas])

0.001 0.0001 21
0.001 0.001 21
0.001 0.01 21
0.01 0.0001 21
0.01 0.001 21
0.01 0.01 21
0.1 0.0001 21
0.1 0.001 21
0.1 0.01 21




(21, (0.001, 0.0001))

Mini Batch
---

(64,)

Momentum
---

ASG
---

SAG
---