In [None]:
import numpy as np

In [None]:
from sklearn.datasets import load_digits

In [None]:
# Load the digits data as a (features, labels) pair; labels are cast to
# plain ints because they are later used as array indices.
digits_x, digits_y = load_digits(return_X_y=True)
digits_y = digits_y.astype(int)
print(f"{digits_x.shape} {digits_y.shape}")

(1797, 64) (1797,)


In [None]:
def softmax(x):
    """Numerically stable softmax over the last axis of ``x``.

    For a 2-D array of logits with one row per sample, each row of the
    result sums to 1. A 1-D input is normalised over its only axis.

    Args:
        x: array-like of logits.

    Returns:
        Float ndarray with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    # Subtracting the per-row maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large logits.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)

def one_hot_encoding(y, k):
    """Build a one-hot matrix for integer labels.

    Args:
        y: sequence of integer class indices, one per sample.
        k: number of columns (classes) in the result.

    Returns:
        Float ndarray of shape ``(len(y), k)`` that is zero everywhere
        except for a 1 at ``[i, y[i]]`` in each row ``i``.
    """
    n_samples = len(y)
    encoded = np.zeros((n_samples, k))
    row_ids = np.arange(n_samples)
    encoded[row_ids, y] = 1
    return encoded
    
class SoftmaxRegression:
    """Softmax (multinomial logistic) regression trained by a pluggable optimizer.

    Attributes:
        total_cls: number of output classes (columns of the weight matrix).
        w: weight matrix of shape ``(D, total_cls)``; ``None`` until ``fit`` runs.
    """

    def __init__(self, total_cls):
        self.total_cls = total_cls
        self.w = None

    def fit(self, x_train, y_train, x_val, y_val, optimizer):
        """Fit the weights by delegating to ``optimizer.run``.

        Args:
            x_train: 2-D training features, shape ``(N, D)``.
            y_train: integer class labels for ``x_train``.
            x_val: validation features, forwarded to the optimizer.
            y_val: validation labels, forwarded to the optimizer.
            optimizer: object exposing
                ``run(gradient, x_train, y_train, x_val, y_val, w0, k)`` and
                returning ``(weights, history)``.

        Returns:
            The history object returned by the optimizer.
        """
        _, D = x_train.shape

        def gradient(x, y, w):
            # Average over the rows actually passed in (the mini-batch), not
            # over the whole training set; otherwise the step size shrinks by
            # a factor of batch_size / N. The max() guards an empty batch.
            n_batch = max(x.shape[0], 1)
            yh = softmax(np.matmul(x, w))
            residual = yh - one_hot_encoding(y, self.total_cls)
            return np.matmul(x.T, residual) / n_batch

        w0 = np.zeros((D, self.total_cls))
        self.w, history = optimizer.run(gradient, x_train, y_train, x_val, y_val, w0, self.total_cls)
        return history

    def predict(self, x):
        """Return the softmax of ``x @ w`` (class scores per row of ``x``)."""
        return softmax(np.matmul(x, self.w))

In [None]:
from collections import defaultdict
from sklearn.metrics import mean_squared_error

In [None]:
class MiniBatchGradientDescent:
    """Mini-batch gradient descent with momentum, L2 weight decay and early stopping.

    Training stops once the validation error has failed to improve for
    ``termination_step`` consecutive mini-batch updates; the weights with the
    lowest validation error seen so far are returned.
    """

    def __init__(self, batch_size=256, learning_rate=0.01, momentum=0.9, l2_regularization=0.1, termination_step=20):
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.l2_regularization = l2_regularization
        self.termination_step = termination_step

    def create_mini_batches(self, x):
        """Return shuffled index arrays, each holding at most ``batch_size`` indices.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if self.batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        n_samples = len(x)
        order = np.random.permutation(n_samples)
        # np.array_split takes the NUMBER of sections, so convert the desired
        # batch size into a batch count (ceiling division, at least one).
        n_batches = max(1, -(-n_samples // self.batch_size))
        return np.array_split(order, n_batches)

    def cost(self, x, y, w, k):
        """Mean cross-entropy of the softmax of ``x @ w`` against integer labels ``y``.

        ``k`` is accepted for interface compatibility; the number of classes
        is taken from the second dimension of ``w``.
        """
        z = np.matmul(x, w)
        # Shift each row by its own maximum so np.exp cannot overflow.
        z = z - np.max(z, axis=1, keepdims=True)
        log_norm = np.log(np.sum(np.exp(z), axis=1))
        true_logit = z[np.arange(len(y)), y]
        return np.mean(log_norm - true_logit)

    def accuracy(self, x, y, w):
        """Fraction of rows of ``x`` whose highest-scoring class equals ``y``."""
        # The softmax is monotonic within a row, so the argmax of the raw
        # logits is the predicted class.
        predicted = np.argmax(np.matmul(x, w), axis=1)
        return float(np.mean(predicted == np.asarray(y)))

    def run(self, gradient_function, x_train, y_train, x_validation, y_validation, w, k):
        """Optimise ``w`` and return ``(best_w, history)``.

        Args:
            gradient_function: callable ``(x_batch, y_batch, w) -> gradient``
                with the same shape as ``w``.
            x_train, y_train: training features and integer labels.
            x_validation, y_validation: data used for early stopping.
            w: initial weights; a copy is optimised, the argument is not modified.
            k: number of classes, forwarded to ``cost``.

        Returns:
            ``best_w``: weights with the lowest validation error observed.
            ``history``: ``defaultdict(list)`` with per-update ``"cost"``,
            ``"train_accuracy"`` and ``"val_accuracy"`` lists, plus
            ``"training_stop"`` holding the patience counter at termination.
        """
        w = np.array(w, dtype=float)  # private float copy of the start point
        best_w = w.copy()
        validation_best = np.inf
        delta_w = np.zeros_like(w)
        step = 0
        history = defaultdict(list)
        termination = False
        while not termination:
            for minibatch_idx in self.create_mini_batches(x_train):
                x_batch = x_train[minibatch_idx]
                y_batch = y_train[minibatch_idx]
                gradient = gradient_function(x_batch, y_batch, w)
                # Gradient of the penalty 0.5 * lambda * ||w||^2 is lambda * w.
                gradient = gradient + self.l2_regularization * w
                delta_w = self.momentum * delta_w + (1 - self.momentum) * gradient
                w -= self.learning_rate * delta_w

                history["cost"].append(self.cost(x_batch, y_batch, w, k))
                history["train_accuracy"].append(self.accuracy(x_batch, y_batch, w))
                val_accuracy = self.accuracy(x_validation, y_validation, w)
                history["val_accuracy"].append(val_accuracy)

                # Early stopping tracks the validation misclassification rate.
                validation_error = 1.0 - val_accuracy
                if validation_error < validation_best:
                    validation_best = validation_error
                    best_w = w.copy()
                    step = 0
                else:
                    step += 1
                    if step >= self.termination_step:
                        termination = True
                        history["training_stop"] = [step]
                        break
        return best_w, history

In [None]:
def k_fold_cross_val(x, k = 5):
    """Yield ``(train_indices, validation_indices)`` pairs for k-fold cross-validation.

    The indices ``0 .. len(x) - 1`` are shuffled with ``np.random.shuffle``
    and split into ``k`` folds; each fold serves once as the validation set
    while the remaining folds are concatenated into the training set.

    Args:
        x: any sized collection; only its length is used.
        k: number of folds.

    Yields:
        Tuples of two index arrays: training indices, then validation indices.
    """
    indices = list(range(len(x)))
    np.random.shuffle(indices)
    folds = np.array_split(indices, k)
    for held_out, val_idx in enumerate(folds):
        # Everything except the held-out fold forms the training set.
        remaining = [fold for pos, fold in enumerate(folds) if pos != held_out]
        train_idx = np.concatenate(remaining)
        yield train_idx.flatten(), val_idx

In [None]:
def run_k_fold_cross_val(x, y, model, optimizer):
    """Fit ``model`` once per cross-validation fold and print each training history.

    Args:
        x: feature array, indexable by an array of row indices.
        y: label array aligned with ``x``.
        model: object exposing ``fit(x_train, y_train, x_val, y_val, optimizer)``.
        optimizer: passed through unchanged to ``model.fit``.
    """
    for train_id, val_id in k_fold_cross_val(x):
        fold_history = model.fit(
            x[train_id], y[train_id],
            x[val_id], y[val_id],
            optimizer,
        )
        print(fold_history)

In [None]:
# Seed NumPy's global RNG so the fold shuffling and mini-batch order are
# reproducible across "Restart & Run All".
np.random.seed(0)

# Labels are used as column indices of the one-hot matrix, so the model needs
# max(label) + 1 output columns. Deriving it from the data avoids a
# hard-coded class count that does not match the dataset.
n_classes = int(digits_y.max()) + 1
model = SoftmaxRegression(n_classes)
optimizer = MiniBatchGradientDescent()
run_k_fold_cross_val(digits_x, digits_y, model, optimizer)

defaultdict(<class 'list'>, {'cost': [786.9567199602757, 786.3950408211851, 786.0092910355245, 784.6720078902863, 784.6700742016353, 783.8464075145491, 783.124824537056, 780.7191942689178, 778.6860093574289, 777.4140968995619, 776.2615708170576, 774.3893969356272, 774.9962426370764, 773.4867096397043, 773.1610929638281, 770.3127993641621, 769.6458235667561, 764.9153504748073, 767.7570296526337, 761.2392001343829, 762.0450990430984, 762.4511551315618, 761.3854732953077, 760.8830600677303, 753.415063672426, 756.4640068015951, 756.3089788731613, 752.2376261272672, 753.6409754881191, 752.7238401902923, 748.6318409649523, 746.31478888325, 746.57775152781, 743.930939026867, 741.9576018873933, 736.1802683822732, 742.803454652539, 738.6196844462552, 736.6008766965058, 740.8164930896148, 741.521827660627], 'train_accuracy': [0.0, 0.0, 11.166666666666666, 13.666666666666666, 2.8333333333333335, 1.5, 6.0, 17.666666666666668, 4.166666666666667, 3.6666666666666665, 0.16666666666666666, 1.8333333333