In [43]:
import numpy as np
import sklearn.datasets
import sklearn.linear_model

In [44]:
np.random.seed(0)
X, y = sklearn.datasets.make_moons(200, noise=0.2)

In [45]:
y

array([0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1], dtype=int64)

In [46]:
num_examples = len(X)       # size of training set
nn_input_dim = 2
nn_output_dim = 2

lr = 0.01
reg_lambda = 0.01

In [49]:
def calculate_loss(model):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    z1 = X.dot(W1) + b1
    a1 = np.tanh(z1)

    z2 = a1.dot(W2) + b2

    exp_scores = np.exp(z2)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    print(probs)
    log_probs = -np.log(probs[range(num_examples), y])
    print(log_probs)
    loss = np.sum(log_probs)

    return 1./num_examples * loss

In [50]:
def build_model(nn_hdim, num_passes=30000, print_loss=False):
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))

    model = {}

    # Gradient descent.
    for i in range(0, num_passes):
        # forward
        z1 = X.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        exp_scores = np.exp(z2)

        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)   # this is softmax

        # bp
        delta3 = probs
        delta3[range(num_examples), y] -= 1    # this is the derivative of softmax [no need to thoroughly understand yet]
                                      #                                   [we'll revisit in weeks later]
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2)) # tanh derivative
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)

        # optional
        W1 += -lr * dW1
        b1 += -lr * db1
        W2 += -lr * dW2
        b2 += -lr * db2

        model = {'W1': W1, 'b1':b1, 'W2':W2, 'b2': b2}

        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" % (i, calculate_loss(model)))
        break
    return model


# n-dimesional hidden layer
model = build_model(10, print_loss = True)

[[0.3552745  0.6447255 ]
 [0.05461035 0.94538965]
 [0.73602386 0.26397614]
 [0.79775572 0.20224428]
 [0.08251806 0.91748194]
 [0.09378509 0.90621491]
 [0.27673467 0.72326533]
 [0.10066617 0.89933383]
 [0.29960996 0.70039004]
 [0.06437229 0.93562771]
 [0.86600661 0.13399339]
 [0.38485434 0.61514566]
 [0.07510544 0.92489456]
 [0.10438617 0.89561383]
 [0.94786321 0.05213679]
 [0.20964247 0.79035753]
 [0.5209719  0.4790281 ]
 [0.3795646  0.6204354 ]
 [0.66358415 0.33641585]
 [0.96323413 0.03676587]
 [0.11057356 0.88942644]
 [0.10662802 0.89337198]
 [0.65036896 0.34963104]
 [0.11927758 0.88072242]
 [0.86953336 0.13046664]
 [0.5060445  0.4939555 ]
 [0.09535293 0.90464707]
 [0.11046452 0.88953548]
 [0.61469968 0.38530032]
 [0.73707972 0.26292028]
 [0.35967842 0.64032158]
 [0.85252846 0.14747154]
 [0.11528484 0.88471516]
 [0.67421996 0.32578004]
 [0.44344062 0.55655938]
 [0.17353945 0.82646055]
 [0.25025883 0.74974117]
 [0.18880479 0.81119521]
 [0.93110103 0.06889897]
 [0.8163652  0.1836348 ]
