In [1]:
from urllib import request
import gzip
import pickle
import os
import numpy as np
import math

def load_synth(num_train=60_000, num_val=10_000, seed=0):
    """
    Load some very basic synthetic data that should be easy to classify. Two features, so that we can plot the
    decision boundary (which is an ellipse in the feature space).

    :param num_train: Number of training instances
    :param num_val: Number of test/validation instances
    :param num_features: Number of features per instance

    :return: Two tuples and an integer: (xtrain, ytrain), (xval, yval), num_cls. The first contains a matrix of training
     data with 2 features as a numpy floating point array, and the corresponding classification labels as a numpy
     integer array. The second contains the test/validation data in the same format. The last integer contains the
     number of classes (this is always 2 for this function).
    """
    np.random.seed(seed)

    THRESHOLD = 0.6
    quad = np.asarray([[1, -0.05], [1, .4]])

    ntotal = num_train + num_val

    x = np.random.randn(ntotal, 2)

    # compute the quadratic form
    q = np.einsum('bf, fk, bk -> b', x, quad, x)
    y = (q > THRESHOLD).astype(int)

    return (x[:num_train, :], y[:num_train]), (x[num_train:, :], y[num_train:]), 2

def normalize(X):
    # Calculate the mean.
    feature1 = 0
    feature2 = 0
    for i in range(len(xtrain)):
        feature1 += xtrain[i][0]
        feature2 += xtrain[i][1]
    mean_f1 = feature1/len(xtrain)
    mean_f2 = feature2/len(xtrain)

    # Calculate the variance.
    deviations_f1 = 0
    deviations_f2 = 0

    for i in range(len(xtrain)):
        deviations_f1 += xtrain[i][0] - mean_f1
        deviations_f2 += xtrain[i][1] - mean_f2

    variance_f1 = deviations_f1/len(xtrain)
    variance_f2 = deviations_f2/len(xtrain)

    xnormalized = []

    for i in range(len(xtrain)):
        norm_f1 = (xtrain[i][0]-mean_f1)/variance_f1
        norm_f2 = (xtrain[i][0]-mean_f2)/variance_f2
        xnormalized.append([norm_f1, norm_f2])
    
    xnormalized = np.asarray(xnormalized)
    
    return xnormalized

def crossentropy(predictions, label):
    loss = 0
    for i in range(len(predictions)):
        loss -= label * math.log(
            predictions[i]+1.7976931348623157e+308) - (1-label) * math.log(
            1-predictions[i]+1.7976931348623157e+308)
    return loss

def softmax(X):
    exps = []
    probs = []

    for element in X:
        # To prevent OverflowError:
        if (element >= 709):
            element = 709
        # Softmax.
        exp_of_X = math.exp(element)
        exps.append(exp_of_X)
        expsum = sum(exps)
    
    for i in range(len(exps)):
        probs.append(exps[i] / (expsum+1.7976931348623157e+308))
        
    return probs

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

def train(X, y, weights, biases):
    # Create empty network.
    network = [[0.0, 0.0, 0.0], # First layer k: 3 nodes
               [0.0, 0.0, 0.0], # Layer h: same, 3 nodes
               [0.0, 0.0], # Layer o: 2 nodes
               [0.0, 0.0]] # Layer y: same, 2 nodes

    deriv_network = [[0.0, 0.0, 0.0],
                     [0.0, 0.0, 0.0],
                     [0.0, 0.0],
                     [0.0, 0.0]]
    
    loss_log = []
    
    for instance in range(len(xtrain)):
        # Compute the first linear layer.
        for j in range(len(network[0])):
            for i in range(len(xtrain[instance])):
                network[0][j] += weights[0][i][j] * xtrain[instance][i]
                network[0][j] += biases[0][j]

        # Apply sigmoid activation on the first layer.
        for i in range(len(network[1])):
            network[1][i] = sigmoid(network[1-1][i])

        # Compute the linear output.
        for j in range(len(network[2])):
            for i in range(len(network[2-1])):
                network[2][j] += weights[2][i][j] * network[2-1][i]
                network[2][j] += biases[2][j]

        # Apply softmax activation on the output.
        network[3] = softmax(network[2])

        loss = crossentropy(network[3], ytrain[instance])

        loss_log.append(loss)

        # Backpropagation
        # Derivative of loss+softmax.
        for i in range(len(network[2])):
            deriv_network[2][i] = network[2][i]-ytrain[instance]

        # Derivative of output layer to first layer, update weights and biases.
        for j in range(len(network[2])):
            for i in range(len(network[1])):
                weights[2][i][j] = deriv_network[2][j] * network[1][i]
                deriv_network[1][i] = deriv_network[2][j] * sum(weights[2][i])
            biases[2][j] = deriv_network[2][j]

        # Derivative of sigmoid layer.
        for i in range(len(network[1])):
            deriv_network[0][i] = deriv_network[1][i] * network[1][i] * (1-network[1][i])

        # Derivative of first layer to input layer, update weights and biases.
        for j in range(len(network[0])):
            for i in range(len(xtrain[instance])):
                weights[0][i][j] = deriv_network[0][j] * xtrain[instance][i]
            biases[0][j] = deriv_network[0][j]

    return weights, biases, loss_log

# Load data
(xtrain, ytrain), (xval, yval), num_cls = load_synth()

print("--- Dimensions ---")
print("xtrain:", np.shape(xtrain))
print("ytrain:", np.shape(ytrain))
print("xval:", np.shape(xval))
print("yval", np.shape(yval))
print("\n")
print("--- Value range ---")
print("xtrain max:", xtrain.max(), "min: ", xtrain.min(), "mean:",
      xtrain.mean(), "variance:", xtrain.var())
print("xval max:", xval.max(), "min: ", xval.min(), "mean:",
      xval.mean(), "variance:", xval.var())

xtrain = normalize(xtrain)
xval = normalize(xval)

# Initialize weights with random normally distributed values.
weights = [np.random.normal(loc = 0.0, scale = 1.0,
                            size = (xtrain.shape[1],3)), # from input to first layer
           [], # none for sigmoid-activated layer
          np.random.normal(loc = 0.0, scale = 1.0,
                           size = (3,num_cls)), # from sigmoid-activated layer to output layer
          []] # non for softmax-activated layer

biases = [3*[0], # for first layer
         [], # none for sigmoid-activated layer
         num_cls*[0], # for output layer
         []] # non for softmax-activated layer

loss_log = []
for epoch in range(5):
    weights, biases, losses = train(xtrain, ytrain, weights, biases)
    loss_log.append(losses)

--- Dimensions ---
xtrain: (60000, 2)
ytrain: (60000,)
xval: (10000, 2)
yval (10000,)


--- Value range ---
xtrain max: 4.285855641221728 min:  -4.852117653180117 mean: 0.0029853645282486114 variance: 0.9947430915341501
xval max: 3.825215846396713 min:  -3.5775068414685856 mean: 0.012517284874245806 variance: 1.0033800061321618


OverflowError: math range error