In [None]:
import numpy as np

# basic layers, forward and backward pass
def affine_forward(x, w, b):
    """Compute the forward pass of an affine (fully connected) layer.

    Args:
        x: Input data, multiplied on the left of ``w`` via ``np.dot``.
        w: Weight matrix.
        b: Bias added to the product (broadcast by NumPy).

    Returns:
        A tuple ``(out, cache)`` where ``out`` is ``np.dot(x, w) + b`` and
        ``cache`` is ``(x, w, b)``, kept for the backward pass.
    """
    projected = np.dot(x, w)
    return projected + b, (x, w, b)
def affine_backward(dout, cache):
    """Compute the backward pass of an affine (fully connected) layer.

    Args:
        dout: Upstream gradient with respect to the layer output.
        cache: Tuple ``(x, w, b)`` as produced by ``affine_forward``.

    Returns:
        A tuple ``(dx, dw, db)``: gradients with respect to the input ``x``,
        the weights ``w`` and the bias ``b``.
    """
    # The forward pass stored its operands in the cache; they have to be
    # unpacked here, otherwise x and w are undefined names.
    x, w, _ = cache
    dx = np.dot(dout, w.T)
    dw = np.dot(x.T, dout)
    # The bias is added to every row, so its gradient sums over axis 0.
    db = dout.sum(0)
    return dx, dw, db
def relu_forward(x):
    """Compute the forward pass of a rectified linear unit (ReLU).

    Args:
        x: Input array.

    Returns:
        A tuple ``(out, cache)`` where ``out`` is the element-wise maximum of
        0 and ``x``, and ``cache`` is the input ``x`` itself.
    """
    activated = np.maximum(0, x)
    return activated, x
def relu_backward(dout, cache):
    """Compute the backward pass of a rectified linear unit (ReLU).

    Args:
        dout: Upstream gradient with respect to the ReLU output.
        cache: The input ``x`` that was given to ``relu_forward``.

    Returns:
        ``dx``, a new array equal to ``dout`` with every entry zeroed where
        the cached input is negative. Entries where the input is exactly 0
        keep the upstream gradient.
    """
    x = cache
    # Work on a copy: masking dout directly would overwrite the caller's
    # upstream gradient as a hidden side effect.
    dx = np.array(dout, copy=True)
    dx[x < 0] = 0
    return dx
def softmax_loss(x, y):
    """Compute the softmax cross-entropy loss and its gradient.

    Args:
        x: 2-D array of scores with one row per sample (``N = x.shape[0]``)
            and one column per class.
        y: Integer class indices, one per row of ``x``, used to index the
            columns of ``x``.

    Returns:
        A tuple ``(loss, dx)`` where ``loss`` is the mean negative
        log-probability of the labelled classes and ``dx`` is the gradient
        of that loss with respect to ``x``.
    """
    # Subtract the row-wise maximum so the exponentials cannot overflow.
    shifted = x - np.max(x, axis=1, keepdims=True)
    # Work in log space: log(softmax) = shifted - log(sum(exp(shifted))).
    # Each row's sum is at least exp(0) = 1, so the log is always finite,
    # whereas taking log() of an underflowed probability would give -inf.
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    N = x.shape[0]
    rows = np.arange(N)
    loss = -np.sum(log_probs[rows, y]) / N
    dx = np.exp(log_probs)
    dx[rows, y] -= 1
    dx /= N
    return loss, dx


# combination of affine and relu
def affine_relu_forward(x, w, b):
    """Run an affine layer followed by a ReLU.

    Args:
        x: Input passed to ``affine_forward``.
        w: Weights passed to ``affine_forward``.
        b: Bias passed to ``affine_forward``.

    Returns:
        A tuple ``(out, cache)`` where ``out`` is the ReLU output and
        ``cache`` is ``(fc_cache, relu_cache)``, the caches of the two
        sub-layers in the order they were applied.
    """
    pre_activation, affine_cache = affine_forward(x, w, b)
    activated, activation_cache = relu_forward(pre_activation)
    return activated, (affine_cache, activation_cache)

def affine_relu_backward(dout, cache):
    """Run the backward pass of the affine + ReLU combination.

    Args:
        dout: Upstream gradient with respect to the ReLU output.
        cache: Tuple ``(fc_cache, relu_cache)`` as returned by
            ``affine_relu_forward``.

    Returns:
        The tuple ``(dx, dw, db)`` produced by ``affine_backward`` after the
        gradient has been passed back through the ReLU.
    """
    affine_cache, activation_cache = cache
    d_pre_activation = relu_backward(dout, activation_cache)
    return affine_backward(d_pre_activation, affine_cache)

class TwoLayerNet(object):
    """Two-layer network: affine -> ReLU -> affine, trained with softmax loss.

    Learnable parameters live in the ``params`` dict under the keys
    ``'W1'``, ``'b1'``, ``'W2'`` and ``'b2'``; ``reg`` is the L2
    regularisation strength applied to the two weight matrices.
    """

    def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10,
                 weight_scale=1e-3, reg=0.0):
        """Initialise weights with scaled Gaussian noise and biases with zeros.

        Args:
            input_dim: Number of rows of ``W1``.
            hidden_dim: Number of columns of ``W1`` and rows of ``W2``.
            num_classes: Number of columns of ``W2``.
            weight_scale: Multiplier applied to the ``np.random.randn`` draws.
            reg: L2 regularisation strength stored on the instance.
        """
        self.reg = reg
        # W1 is drawn before W2 so a seeded RNG yields the same values.
        self.params = {
            'W1': weight_scale * np.random.randn(input_dim, hidden_dim),
            'b1': np.zeros(hidden_dim),
            'W2': weight_scale * np.random.randn(hidden_dim, num_classes),
            'b2': np.zeros(num_classes),
        }

    def loss(self, X, y=None):
        """Compute class scores, or the loss and gradients when labels are given.

        Args:
            X: Input data fed to the first affine layer.
            y: Labels passed to ``softmax_loss``; ``None`` selects test mode.

        Returns:
            If ``y`` is ``None``, the scores from the second affine layer.
            Otherwise a tuple ``(loss, grads)`` where ``loss`` is the softmax
            loss plus the L2 penalty on ``W1`` and ``W2``, and ``grads`` maps
            each parameter name to the gradient of the loss for it.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']

        hidden, hidden_cache = affine_relu_forward(X, W1, b1)
        scores, scores_cache = affine_forward(hidden, W2, b2)

        # Test mode: no labels, so only the scores are returned.
        if y is None:
            return scores

        data_loss, d_scores = softmax_loss(scores, y)
        penalty = 1/2.0 * self.reg * (np.sum(W1 ** 2) + np.sum(W2 ** 2))
        total_loss = data_loss + penalty

        d_hidden, dW2, db2 = affine_backward(d_scores, scores_cache)
        _, dW1, db1 = affine_relu_backward(d_hidden, hidden_cache)

        # Add the gradient of the L2 penalty to the weight gradients.
        dW1 += self.reg * W1
        dW2 += self.reg * W2

        grads = {'W2': dW2, 'b2': db2, 'W1': dW1, 'b1': db1}
        return total_loss, grads


