In [1]:
import numpy as np 
import os
import platform
from six.moves import cPickle as pickle
from builtins import range

def rel_error(x, y):
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

def load_pickle(f):
    version = platform.python_version_tuple()
    if version[0] == "2":
        return pickle.load(f)
    elif version[0] == "3":
        return pickle.load(f, encoding="latin1")
    raise ValueError("invalid python version: {}".format(version))

def load_CIFAR_batch(filename):
    with open(filename, "rb") as f:
        datadict = load_pickle(f)
        X = datadict["data"]
        Y = datadict["labels"]
        X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
        Y = np.array(Y)
        return X, Y

def load_CIFAR10(ROOT):
    xs = []
    ys = []
    for b in range(1, 6):
        f = os.path.join(ROOT, "data_batch_%d" % (b,))
        X, Y = load_CIFAR_batch(f)
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    del X, Y
    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, "test_batch"))
    return Xtr, Ytr, Xte, Yte

def eval_numerical_gradient_array(f, x, df, h=1e-5):
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"])
    while not it.finished:
        ix = it.multi_index

        oldval = x[ix]
        x[ix] = oldval + h
        pos = f(x).copy()
        x[ix] = oldval - h
        neg = f(x).copy()
        x[ix] = oldval

        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad

def eval_numerical_gradient(f, x, verbose=True, h=0.00001):
    fx = f(x)  # evaluate function value at original point
    grad = np.zeros_like(x)
    # iterate over all indexes in x
    it = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"])
    while not it.finished:

        # evaluate function at x+h
        ix = it.multi_index
        oldval = x[ix]
        x[ix] = oldval + h  # increment by h
        fxph = f(x)  # evalute f(x + h)
        x[ix] = oldval - h
        fxmh = f(x)  # evaluate f(x - h)
        x[ix] = oldval  # restore

        # compute the partial derivative with centered formula
        grad[ix] = (fxph - fxmh) / (2 * h)  # the slope
        if verbose:
            print(ix, grad[ix])
        it.iternext()  # step to next dimension

    return grad

def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000, subtract_mean=True):
    # Load the raw CIFAR-10 data
    cifar10_dir = os.path.join("/root/cifar-10-batches-py")
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
    
    
    # Subsample the data
    mask = list(range(num_training, num_training + num_validation))
    
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = list(range(num_training))
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = list(range(num_test))
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Normalize the data: subtract the mean image
    if subtract_mean:
        mean_image = np.mean(X_train, axis=0)
        X_train -= mean_image
        X_val -= mean_image
        X_test -= mean_image

    # Transpose so that channels come first
    X_train = X_train.transpose(0, 3, 1, 2).copy()
    X_val = X_val.transpose(0, 3, 1, 2).copy()
    X_test = X_test.transpose(0, 3, 1, 2).copy()

    # Package data into a dictionary
    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_val": X_val,
        "y_val": y_val,
        "X_test": X_test,
        "y_test": y_test,
    }

def sgd(w, dw, config=None):
    if config is None:
        config = {}
    config.setdefault("learning_rate", 1e-2)
    w -= config["learning_rate"]*dw
    return w, config

class Solver(object):
    def __init__(self, model, data, **kwargs):
        """
        Construct a new Solver instance.

        Required arguments:
        - model: A model object conforming to the API described above
        - data: A dictionary of training and validation data containing:
          'X_train': Array, shape (N_train, d_1, ..., d_n) of training images
          'X_val': Array, shape (N_val, d_1, ..., d_n) of validation images
          'y_train': Array, shape (N_train,) of labels for training images
          'y_val': Array, shape (N_val,) of labels for validation images

        Optional arguments:
        - optim_config: A dictionary containing hyperparameters that will be
          passed to the chosen update rule. Each update rule requires different
          hyperparameters  but all update rules require a
          'learning_rate' parameter so that should always be present.
        - lr_decay: A scalar for learning rate decay; after each epoch the
          learning rate is multiplied by this value.
        - batch_size: Size of minibatches used to compute loss and gradient
          during training.
        - num_epochs: The number of epochs to run for during training.
        - print_every: Integer; training losses will be printed every
          print_every iterations.
        - verbose: Boolean; if set to false then no output will be printed
          during training.
        - num_train_samples: Number of training samples used to check training
          accuracy; default is 1000; set to None to use entire training set.
        - num_val_samples: Number of validation samples to use to check val
          accuracy; default is None, which uses the entire validation set.
        - checkpoint_name: If not None, then save model checkpoints here every
          epoch.
        """
        self.model = model
        self.X_train = data["X_train"]
        self.y_train = data["y_train"]
        self.X_val = data["X_val"]
        self.y_val = data["y_val"]

        # Unpack keyword arguments
        self.optim_config = kwargs.pop("optim_config", {})
        self.lr_decay = kwargs.pop("lr_decay", 1.0)
        self.batch_size = kwargs.pop("batch_size", 100)
        self.num_epochs = kwargs.pop("num_epochs", 10)
        self.num_train_samples = kwargs.pop("num_train_samples", 1000)
        self.num_val_samples = kwargs.pop("num_val_samples", None)

        self.checkpoint_name = kwargs.pop("checkpoint_name", None)
        self.print_every = kwargs.pop("print_every", 10)
        self.verbose = kwargs.pop("verbose", True)

        # Throw an error if there are extra keyword arguments
        if len(kwargs) > 0:
            extra = ", ".join('"%s"' % k for k in list(kwargs.keys()))
            raise ValueError("Unrecognized arguments %s" % extra)

        self.update_rule = sgd

        self._reset()

    def _reset(self):
        # Set up some variables for book-keeping
        self.epoch = 0
        self.best_val_acc = 0
        self.best_params = {}
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []

        # Make a deep copy of the optim_config for each parameter
        self.optim_configs = {}
        for p in self.model.params:
            d = {k: v for k, v in self.optim_config.items()}
            self.optim_configs[p] = d

    def _step(self):
        # Make a minibatch of training data
        num_train = self.X_train.shape[0]
        batch_mask = np.random.choice(num_train, self.batch_size)
        X_batch = self.X_train[batch_mask]
        y_batch = self.y_train[batch_mask]

        # Compute loss and gradient
        loss, grads = self.model.loss(X_batch, y_batch)
        self.loss_history.append(loss)

        # Perform a parameter update
        for p, w in self.model.params.items():
            dw = grads[p]
            config = self.optim_configs[p]
            next_w, next_config = self.update_rule(w, dw, config)
            self.model.params[p] = next_w
            self.optim_configs[p] = next_config

    def _save_checkpoint(self):
        if self.checkpoint_name is None:
            return
        checkpoint = {
            "model": self.model,
            "update_rule": self.update_rule,
            "lr_decay": self.lr_decay,
            "optim_config": self.optim_config,
            "batch_size": self.batch_size,
            "num_train_samples": self.num_train_samples,
            "num_val_samples": self.num_val_samples,
            "epoch": self.epoch,
            "loss_history": self.loss_history,
            "train_acc_history": self.train_acc_history,
            "val_acc_history": self.val_acc_history,
        }
        filename = "%s_epoch_%d.pkl" % (self.checkpoint_name, self.epoch)
        if self.verbose:
            print('Saving checkpoint to "%s"' % filename)
        with open(filename, "wb") as f:
            pickle.dump(checkpoint, f)

    def check_accuracy(self, X, y, num_samples=None, batch_size=100):
        # Maybe subsample the data
        N = X.shape[0]
        if num_samples is not None and N > num_samples:
            mask = np.random.choice(N, num_samples)
            N = num_samples
            X = X[mask]
            y = y[mask]

        # Compute predictions in batches
        num_batches = N // batch_size
        if N % batch_size != 0:
            num_batches += 1
        y_pred = []
        for i in range(num_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            scores = self.model.loss(X[start:end])
            y_pred.append(np.argmax(scores, axis=1))
        y_pred = np.hstack(y_pred)
        acc = np.mean(y_pred == y)

        return acc

    def train(self):
        num_train = self.X_train.shape[0]
        iterations_per_epoch = max(num_train // self.batch_size, 1)
        num_iterations = self.num_epochs * iterations_per_epoch

        for t in range(num_iterations):
            self._step()

            # Maybe print training loss
            if self.verbose and t % self.print_every == 0:
                print(
                    "(Iteration %d / %d) loss: %f"
                    % (t + 1, num_iterations, self.loss_history[-1])
                )

            # At the end of every epoch, increment the epoch counter and decay
            # the learning rate.
            epoch_end = (t + 1) % iterations_per_epoch == 0
            if epoch_end:
                self.epoch += 1
                for k in self.optim_configs:
                    self.optim_configs[k]["learning_rate"] *= self.lr_decay

            # Check train and val accuracy on the first iteration, the last
            # iteration, and at the end of each epoch.
            first_it = t == 0
            last_it = t == num_iterations - 1
            if first_it or last_it or epoch_end:
                train_acc = self.check_accuracy(
                    self.X_train, self.y_train, num_samples=self.num_train_samples
                )
                val_acc = self.check_accuracy(
                    self.X_val, self.y_val, num_samples=self.num_val_samples
                )
                self.train_acc_history.append(train_acc)
                self.val_acc_history.append(val_acc)
                self._save_checkpoint()

                if self.verbose:
                    print(
                        "(Epoch %d / %d) train acc: %f; val_acc: %f"
                        % (self.epoch, self.num_epochs, train_acc, val_acc)
                    )

                # Keep track of the best model
                if val_acc > self.best_val_acc:
                    self.best_val_acc = val_acc
                    self.best_params = {}
                    for k, v in self.model.params.items():
                        self.best_params[k] = v.copy()

        # At the end of training swap the best params into the model
        self.model.params = self.best_params

In [2]:
def linear_forward(X, W, b):
    """
    Forward pass for linear layer 
    N is number of examples (batch size)
    D = d_1 * ... * d_n
    M is output dimension
    
    Inputs:
    - X: array containing input data of shape (N, d_1, ..., d_n)
    - W: array of weights of shape (D, M)
    - b: array of biases of shape (M,)

    Returns a tuple of:
    - out: output, of shape (N, M)
    - cache: (X, W, b)
    """
    out = None
    # Reshape into vector before applying linear transformation
    # YOUR CODE HERE
    #raise NotImplementedError()
    X_reshape = np.reshape(X, (X.shape[0], -1))
    out = X_reshape@W + b
    cache = (X, W, b)
    return out, cache

In [3]:
num_inputs = 2
input_shape = (4, 5, 6)
output_dim = 3
input_size = num_inputs * np.prod(input_shape)
weight_size = output_dim * np.prod(input_shape)
x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)
w = np.linspace(-0.2, 0.3, num=weight_size).reshape(np.prod(input_shape), output_dim)
b = np.linspace(-0.3, 0.1, num=output_dim)
out, _ = linear_forward(x, w, b)
correct_out = np.array([[ 1.49834967,  1.70660132,  1.91485297],
                        [ 3.25553199,  3.5141327,   3.77273342]])
error = rel_error(out, correct_out)
print(error)
assert error < 1e-9

9.7698500479884e-10


In [4]:
def linear_backward(dout, cache):
    """
    Backward pass for linear layer layer.

    Inputs:
    - dout: Upstream gradient of shape (N, M)
    - cache: Tuple of:
      - X: array of shape (N, d_1, ... d_n)
      - W: array of weights of shape (D, M)
      - b: array biases of shape (M,)

    Returns a tuple of:
    - dX: Gradient with respect to x of shape (N, d1, ..., d_k)
    - dW: Gradient with respect to w of shape (D, M)
    - db: Gradient with respect to b of shape (M,)
    """
    X, W, b = cache
    X_reshape = np.reshape(X, (X.shape[0], -1))
    dX, dW, db = None, None, None
    # YOUR CODE HERE
    #raise NotImplementedError()
    dW = X_reshape.T@dout
    db = np.sum(dout, axis=0)
    dX = dout@W.T
    dX = np.reshape(dX, X.shape)
    return dX, dW, db

In [5]:
np.random.seed(47)
x = np.random.randn(10, 2, 3)
w = np.random.randn(6, 5)
b = np.random.randn(5)
dout = np.random.randn(10, 5)
dx_num = eval_numerical_gradient_array(lambda x: linear_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: linear_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: linear_forward(x, w, b)[0], b, dout)
_, cache = linear_forward(x, w, b)
dx, dw, db = linear_backward(dout, cache)
error_dX = rel_error(dx_num, dx)
error_dW = rel_error(dw_num, dw)
error_db = rel_error(db_num, db)
print('dX error: ', error_dX)
assert error_dX < 1e-9
print('dW error: ', error_dW)
assert error_dW < 1e-9
print('db error: ', error_db)
assert error_db < 1e-9

dX error:  2.4408840841167563e-10
dW error:  6.193103439209376e-11
db error:  1.4539016235043804e-11


In [6]:
def relu_forward(X):
    """
    Forward pass for rectified linear units (ReLUs)layer

    Input:
    - X: Inputs 

    Returns a tuple of:
    - out: Output of the same shape as X
    - cache: X
    """
    out = None
    # YOUR CODE HERE
    #raise NotImplementedError()
    out = np.maximum(X, 0)
    cache = X
    return out, cache

In [7]:
x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)
out, _ = relu_forward(x)
correct_out = np.array([[ 0.,          0.,          0.,          0.,        ],
                        [ 0.,          0.,          0.04545455,  0.13636364,],
                        [ 0.22727273,  0.31818182,  0.40909091,  0.5,       ]])
error = rel_error(out, correct_out)
print(error)
assert error < 1e-7

4.999999798022158e-08


In [8]:
def relu_backward(dout, cache):
    """
    Backward pass for rectified linear units (ReLUs) layer

    Input:
    - dout: Upstream gradient
    - cache: Input X of same shape as dout

    Returns:
    - dX: Gradient with respect to X
    """
    dX, X = None, cache
    # YOUR CODE HERE
    #raise NotImplementedError()
    dX = np.sign(np.maximum(0, X))*dout
    return dX

In [9]:
np.random.seed(47)
x = np.random.randn(10, 10)
dout = np.random.randn(*x.shape)
dx_num = eval_numerical_gradient_array(lambda x: relu_forward(x)[0], x, dout)
_, cache = relu_forward(x)
dx = relu_backward(dout, cache)
error = rel_error(dx_num, dx)
print(error)
assert error < 1e-11

3.2756399622829256e-12


In [10]:
def linear_relu_forward(x, w, b):
    """
    Linear transform followed by a ReLU

    Inputs:
    - X: Input to the affine layer
    - W, b: Weights for the affine layer

    Returns a tuple of:
    - out: Output from the ReLU
    - cache: Object to give to the backward pass
    """
    out, linear_cache, relu_cache = None, None, None
    # YOUR CODE HERE
    #raise NotImplementedError()
    out_forward, linear_cache = linear_forward(x, w, b)
    out, relu_cache = relu_forward(out_forward)
    
    cache = (linear_cache, relu_cache)
    return out, cache

def linear_relu_backward(dout, cache):
    """
    Backward pass for the linear-relu layer
    """
    fc_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = linear_backward(da, fc_cache)
    return dx, dw, db

In [11]:
np.random.seed(47)
x = np.random.randn(2, 3, 4)
w = np.random.randn(12, 10)
b = np.random.randn(10)
dout = np.random.randn(2, 10)
out, cache = linear_relu_forward(x, w, b)
dx, dw, db = linear_relu_backward(dout, cache)
dx_num = eval_numerical_gradient_array(lambda x: linear_relu_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: linear_relu_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: linear_relu_forward(x, w, b)[0], b, dout)

error_dX = rel_error(dx_num, dx)
error_dW = rel_error(dw_num, dw)
error_db = rel_error(db_num, db)
print('dX error: ', error_dX)
assert error_dX < 1e-8
print('dW error: ', error_dW)
assert error_dW < 1e-8
print('db error: ', error_db)
assert error_db < 1e-8

dX error:  7.014478385971623e-09
dW error:  1.841788863087595e-09
db error:  1.8928925550934774e-11


In [14]:
def svm_loss(X, y):
    loss, dX = None, None
    # YOUR CODE HERE
    #raise NotImplementedError()
    N = x.shape[0]
    correct_class_scores = x[np.arange(N), y]
    margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)
    margins[np.arange(N), y] = 0
    loss = np.sum(margins) / N
    num_pos = np.sum(margins > 0, axis=1)
    dX = np.zeros_like(x)
    dX[margins > 0] = 1
    dX[np.arange(N), y] -= num_pos
    dX /= N
    return loss, dX
    

def softmax_loss(X, y):
    shifted_logits = x - np.max(x, axis=1, keepdims=True)
    Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True)
    log_probs = shifted_logits - np.log(Z)
    probs = np.exp(log_probs)
    N = x.shape[0]
    loss = -np.sum(log_probs[np.arange(N), y]) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return loss, dx

In [16]:
np.random.seed(231)
num_classes, num_inputs = 10, 50
x = 0.001 * np.random.randn(num_inputs, num_classes)
y = np.random.randint(num_classes, size=num_inputs)
dx_num = eval_numerical_gradient(lambda x: svm_loss(x, y)[0], x, verbose=False)
loss, dx = svm_loss(x, y)
svm_dX_error = rel_error(dx_num, dx)
print('SVM dX error: ', svm_dX_error)
assert svm_dX_error < 1e-8

SVM dX error:  1.4021566006651672e-09


In [17]:
np.random.seed(231)
num_classes, num_inputs = 10, 50
x = 0.001 * np.random.randn(num_inputs, num_classes)
y = np.random.randint(num_classes, size=num_inputs)
dx_num = eval_numerical_gradient(lambda x: svm_loss(x, y)[0], x, verbose=False)
loss, dx = svm_loss(x, y)
svm_dX_error = rel_error(dx_num, dx)
print('SVM dX error: ', svm_dX_error)
assert svm_dX_error < 1e-8

dx_num = eval_numerical_gradient(lambda x: softmax_loss(x, y)[0], x, verbose=False)
loss, dx = softmax_loss(x, y)
softmax_dX_error = rel_error(dx_num, dx)
print('dx error: ', softmax_dX_error)
assert softmax_dX_error < 1e-8

SVM dX error:  1.4021566006651672e-09
dx error:  9.384673161989355e-09


In [None]:
class NeuralNetwork(object):
    """
    Two-layer neural network with ReLU nonlinearity and softmax loss 
    D = number of features
    H = hidden dimenion
    C = number of classes

    Architecure is affine - relu - affine - softmax.

    Parameters are stored in the dictionary self.params mapping parameter names to numpy arrays.
    """

    def __init__(
        self,
        input_dim=3 * 32 * 32,
        hidden_dim=100,
        num_classes=10,
        weight_scale=1e-3,
        reg=0.0,
    ):
        """
        Initialize a new network.

        Inputs:
        - input_dim: An integer giving the size of the input
        - hidden_dim: An integer giving the size of the hidden layer
        - num_classes: An integer giving the number of classes to classify
        - weight_scale: Scalar giving the standard deviation for random
          initialization of the weights.
        - reg: Scalar giving L2 regularization strength.
        """
        self.params = {}
        self.reg = reg
        
        # Initialize weights using Gaussian with mean 0.0 and standard deviation weight_scale
        # Initialize bias to zero
        # Store weights and biases in the dictionary self.params using names 'W1' and 'b1' for first layer and 
        #'W2' and 'b2' for second layer
        # YOUR CODE HERE
        raise NotImplementedError()
        
    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.

        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels of shape (N,)

        Returns:
        when y is None,it is test-time prediction, just return scores
        A tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping parameter
          names to gradients of the loss with respect to those parameters.
        """
        scores = None
        # Forward pass
        # Put the class scores in scores variable
        # YOUR CODE HERE
        raise NotImplementedError()
        # juet return scores if y is None
        if y is None:
            return scores

        loss, grads = 0, {}
        
        # Compute loss, do not forget L2 regularization, use 0.5*self.reg for the loss
        # Store the gradients in grads using keys 'W1' and 'b1' for first layer and 'W2' and 'b2' for second layer
        # YOUR CODE HERE
        raise NotImplementedError()

        return loss, grads

In [None]:
np.random.seed(47)
N, D, H, C = 3, 5, 50, 7
X = np.random.randn(N, D)
y = np.random.randint(C, size=N)
std = 1e-3
model = NeuralNetwork(input_dim=D, hidden_dim=H, num_classes=C, weight_scale=std)

print('Initialization ... ')
W1_std = abs(model.params['W1'].std() - std)
b1 = model.params['b1']
W2_std = abs(model.params['W2'].std() - std)
b2 = model.params['b2']
assert W1_std < std / 10, 'Error First layer weights'
assert np.all(b1 == 0), 'Error First layer biases '
assert W2_std < std / 10, 'Error Second layer weights '
assert np.all(b2 == 0), 'Error Second layer biases'

print('Test-time forward pass ... ')
model.params['W1'] = np.linspace(-0.7, 0.3, num=D*H).reshape(D, H)
model.params['b1'] = np.linspace(-0.1, 0.9, num=H)
model.params['W2'] = np.linspace(-0.3, 0.4, num=H*C).reshape(H, C)
model.params['b2'] = np.linspace(-0.9, 0.1, num=C)
X = np.linspace(-5.5, 4.5, num=N*D).reshape(D, N).T
scores = model.loss(X)
correct_scores = np.asarray(
  [[11.53165108,  12.2917344,   13.05181771,  13.81190102,  14.57198434, 15.33206765,  16.09215096],
   [12.05769098,  12.74614105,  13.43459113,  14.1230412,   14.81149128, 15.49994135,  16.18839143],
   [12.58373087,  13.20054771,  13.81736455,  14.43418138,  15.05099822, 15.66781506,  16.2846319 ]])
scores_diff = np.abs(scores - correct_scores).sum()
assert scores_diff < 1e-6, 'Error test-time forward pass'

print('Training loss (no regularization)')
y = np.asarray([0, 5, 1])
loss, grads = model.loss(X, y)
correct_loss = 3.4702243556
assert abs(loss - correct_loss) < 1e-10, 'Error training-time loss'

model.reg = 1.0
loss, grads = model.loss(X, y)
correct_loss = 26.5948426952
assert abs(loss - correct_loss) < 1e-10, 'Error regularization loss'

for reg in [0.0, 0.7]:
    print('Gradient check with reg = ', reg)
    model.reg = reg
    loss, grads = model.loss(X, y)

    for name in sorted(grads):
        f = lambda _: model.loss(X, y)[0]
        grad_num = eval_numerical_gradient(f, model.params[name], verbose=False)
        error = rel_error(grad_num, grads[name])
        print('%s error: %.2e' % (name, error))
        assert error < 1e-6

## Hyperparameter tuning

In [None]:
data = get_CIFAR10_data()

In [None]:
input_size = 32 * 32 * 3
hidden_size = 50
num_classes = 10

# Tune the learning rate, num_epochs, regularization to get the best accuracy 
learning_rate = None
num_epochs = None
reg = None

# YOUR CODE HERE
raise NotImplementedError()

model = NeuralNetwork(input_size, hidden_size, num_classes, reg=reg)
solver = Solver(model, data, optim_config={'learning_rate': learning_rate}, num_epochs=num_epochs)
solver.train()

In [None]:
y_val_pred = np.argmax(model.loss(data['X_val']), axis=1)
val_acc = (y_val_pred == data['y_val']).mean()
print('Validation set accuracy: ', val_acc)
assert val_acc >= 0.5

In [None]:
y_test_pred = np.argmax(model.loss(data['X_test']), axis=1)
test_acc = (y_test_pred == data['y_test']).mean()
print('Test set accuracy: ', test_acc)
assert test_acc >= 0.49