# Preamble

Import the libraries used throughout the notebook.

In [None]:
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.model_selection import train_test_split

from plot_utils import plot_probabilities


# Neural networks on non-linear data

Some familiar datasets to see how neural networks handle them.

In [None]:
from sklearn.datasets import make_moons, make_circles

# Four-colour map handed to plot_probabilities in the later cells.
colors = ListedColormap(['green', 'magenta', 'yellow', 'blue'])

# Two synthetic binary datasets that are not linearly separable.
# Both generators are seeded so the points (and every model trained on them)
# are the same from one run of the notebook to the next.
X_moons, y_moons = make_moons(n_samples=1000, noise=.2, random_state=504)
X_circles, y_circles = make_circles(n_samples=1000, factor=.65, noise=.05, random_state=504)

figure = plt.figure(figsize=(20, 10))

# Moons are drawn in the right-hand panel (subplot index 2) ...
ax = figure.add_subplot(1, 2, 2, title='moons')
ax.scatter(X_moons[:, 0], X_moons[:, 1], c=y_moons, cmap=ListedColormap(['green', 'magenta']))

# ... and circles in the left-hand panel (subplot index 1).
ax = figure.add_subplot(1, 2, 1, title='circles')
ax.scatter(X_circles[:, 0], X_circles[:, 1], c=y_circles, cmap=ListedColormap(['green', 'magenta']))


### Circles

Circles are the simpler of the two. Let's see if we can use a simple network.

The code below creates one hidden layer and one output layer. The input layer gets created automatically when we give the hidden layer the input_dim attribute.

The output layer is a single node with a sigmoid activation, since this is a binary classifier. Another popular choice for binary classifiers is to make two nodes with a softmax activation. There isn't much difference between the two approaches, but softmax is a real staple of modern neural networks.

We're using the classic sigmoid activation function because it's the easiest to interpret.

Note the use of the Adam optimizer: plain stochastic gradient descent does not converge on this problem.

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Seed TensorFlow's global RNG before any layer is created.
tf.random.set_seed(504)

# Build the network layer by layer:
#   2 inputs -> 4 sigmoid hidden units -> 1 sigmoid output unit.
ann1 = tf.keras.Sequential()
ann1.add(layers.Dense(4, input_dim=2, activation="sigmoid"))
ann1.add(layers.Dense(1, activation="sigmoid"))

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# alongside the loss.
ann1.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=.01),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
ann1.summary()

In [None]:
# Seeded train/validation split of the circles data.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_circles,
    y_circles,
    random_state=504,
)

# Fixed budget of 200 epochs; the held-out split is passed as validation data.
ann1.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=200,
)


If we train this enough, we can get it down to almost perfect.

**Note:** If you call fit() again, it will continue from the already trained weights. This is a way to experiment with the number of epochs run.

In [None]:
# Draw ann1's output over the full circles dataset using the plot_utils helper.
# np.round is passed as post_process -- presumably to snap the predicted
# probabilities to hard 0/1 labels; confirm against plot_utils.
plot_probabilities(ann1, X_circles, y_circles, colors, post_process=np.round)

With a network as simple as this, it's easy to get a feel for how the network works by inspecting the output of the hidden layer.

In [None]:
# Every layer of ann1 except the last one (the sigmoid output layer).
early_layers = ann1.layers[:-1]

This is a list containing all the layers but the top one. (That is, the hidden layer and the implicit input layer.)

In [None]:
# Wrap the already-trained layers in a model of their own so the hidden
# layer's output can be evaluated directly.
hidden = tf.keras.Sequential(early_layers)

We create a new neural network with the remaining layers. These layers retain their weights and biases, so we can use this to examine how the network works.

It could also be used as a _pre-trained_ network to be used as the basis for another ANN. This is of course more relevant with more complex networks.

In [None]:

# One panel per output of the 4-unit hidden layer.
fig, axes = plt.subplots(2, 2, figsize=(15, 15))

# First panel: the helper's default output dimension (no `dim` passed).
plot_probabilities(hidden, X_circles, y_circles, colors, ax=axes[0][0])

# Remaining panels: dimensions 1-3, filling the grid row by row.
for dim in (1, 2, 3):
    plot_probabilities(hidden, X_circles, y_circles, colors, dim=dim, ax=axes.flat[dim])

### Moons

The moons are more complex, so let's try adding another layer (we could also have tried more neurons in the hidden layer).

In [None]:
# Re-seed so this model's initial weights do not depend on earlier cells.
tf.random.set_seed(504)

# Same idea as ann1 with one extra hidden layer:
#   2 inputs -> 4 sigmoid -> 4 sigmoid -> 1 sigmoid output.
ann2 = tf.keras.Sequential()
ann2.add(layers.Dense(4, input_dim=2, activation="sigmoid"))
ann2.add(layers.Dense(4, activation="sigmoid"))
ann2.add(layers.Dense(1, activation="sigmoid"))

ann2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=.01),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

200 epochs is a lot for these simple problems. Either they hit a plateau or they converge before that. Instead of fine-tuning the number of epochs each time, let's add a callback.

The *early stopping* callback is called after each epoch and evaluates if the optimizer is making much progress (by default by looking at the validation loss). In the example, the fitting stops if no progress has been made for the last 20 epochs.

In [None]:
# Early-stopping callback, reused by every fit() call on ann2 below.
# patience=20: number of epochs without improvement tolerated before stopping.
# restore_best_weights=True: when training stops, roll the model back to the
# weights from the best epoch rather than keeping the last ones.
es = tf.keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)

It doesn't go well.

In [None]:
# Seeded train/validation split of the moons data (rebinds the split names
# that previously held the circles data).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_moons,
    y_moons,
    random_state=504,
)

# Up to 200 epochs, with the early-stopping callback attached; the returned
# History object is kept for the learning-curve plot.
history = ann2.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    callbacks=[es],
    epochs=200,
)


In [None]:
# Training vs. validation loss per epoch for the most recent fit.
plt.figure()
for key, label in (('loss', 'train'), ('val_loss', 'valid')):
    plt.plot(history.history[key], label=label)
plt.title("Learning curves")
plt.xlabel("Epoch")
plt.ylabel("Cross entropy loss")
plt.legend()
plt.show()

In [None]:
# Draw the all-sigmoid ann2's output over the full moons dataset.
# np.round is passed as post_process -- presumably to snap the predicted
# probabilities to hard 0/1 labels; confirm against plot_utils.
plot_probabilities(ann2, X_moons, y_moons, colors, post_process=np.round)

The sigmoid activation function has a very narrow operating range; outside it the function saturates and the gradients vanish. For this reason, it is not used much except when you specifically want a probability.

We'll replace it with ReLU (tanh works, too).

In [None]:
# Re-seed TensorFlow exactly as the earlier model cells do, so that re-running
# this cell on its own starts from the same initial weights every time.
tf.random.set_seed(504)

# Same 2 -> 4 -> 4 -> 1 layout as before, but with ReLU in the hidden layers.
# The output unit stays sigmoid because the loss is binary cross-entropy.
ann2 = tf.keras.Sequential([
  layers.Dense(4, input_dim=2, activation="relu"),
  layers.Dense(4, activation="relu"),
  layers.Dense(1, activation="sigmoid")])

ann2.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=.01), metrics=['accuracy'])

# verbose=0 silences the per-epoch log; `es` is the EarlyStopping callback
# created in an earlier cell. X_train/X_valid still hold the moons split.
history = ann2.fit(X_train, y_train, epochs=200, validation_data=(X_valid, y_valid), verbose = 0, callbacks=[es])


In [None]:
# Loss curves for the ReLU model: one line per entry of the fit history.
plt.figure()
curves = {'train': history.history['loss'], 'valid': history.history['val_loss']}
for label, values in curves.items():
    plt.plot(values, label=label)
plt.title("Learning curves")
plt.xlabel("Epoch")
plt.ylabel("Cross entropy loss")
plt.legend()
plt.show()

In [None]:
# Draw the ReLU ann2's output over the full moons dataset.
# np.round is passed as post_process -- presumably to snap the predicted
# probabilities to hard 0/1 labels; confirm against plot_utils.
plot_probabilities(ann2, X_moons, y_moons, colors, post_process=np.round)

Alternatively, we can _normalize_ the data between layers. The batch normalization layer scales the data to a (learned) mean and standard deviation. This helps keep the values in the active region. It is commonly used with sigmoid and tanh activation functions.

In [None]:
# Re-seed TensorFlow exactly as the earlier model cells do, so that re-running
# this cell on its own starts from the same initial weights every time.
tf.random.set_seed(504)

# Back to sigmoid hidden layers, now each followed by batch normalisation.
# Layer order matters for the slices taken in later cells:
#   [0] Dense(4)  [1] BatchNorm  [2] Dense(4)  [3] BatchNorm  [4] Dense(1)
ann2 = tf.keras.Sequential([
  layers.Dense(4, input_dim=2, activation="sigmoid"),
  layers.BatchNormalization(),
  layers.Dense(4, activation="sigmoid"),
  layers.BatchNormalization(),
  layers.Dense(1, activation="sigmoid")])

ann2.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=.01), metrics=['accuracy'])

# `es` is the EarlyStopping callback created in an earlier cell;
# X_train/X_valid still hold the moons split.
history=ann2.fit(X_train, y_train, epochs=200, validation_data=(X_valid, y_valid), callbacks=[es])


In [None]:
# Loss curves for the batch-normalised model.
plt.figure()
train_loss = history.history['loss']
valid_loss = history.history['val_loss']
plt.plot(train_loss, label='train')
plt.plot(valid_loss, label='valid')
plt.ylabel("Cross entropy loss")
plt.xlabel("Epoch")
plt.title("Learning curves")
plt.legend()
plt.show()

In [None]:
# Draw the batch-normalised ann2's output over the full moons dataset.
# np.round is passed as post_process -- presumably to snap the predicted
# probabilities to hard 0/1 labels; confirm against plot_utils.
plot_probabilities(ann2, X_moons, y_moons, colors, post_process=np.round)

In [None]:
# ann2 has five layers here, so dropping the last four leaves only the first
# Dense(4) layer: this sub-model exposes the first hidden layer's output.
hidden = tf.keras.Sequential(ann2.layers[:-4])

fig, axes = plt.subplots(2, 2, figsize=(15, 15))

# Top-left panel uses the helper's default output dimension.
plot_probabilities(hidden, X_moons, y_moons, colors, ax=axes[0][0])

# Dimensions 1-3 go to the remaining panels in row-major order.
for dim in range(1, 4):
    row, col = divmod(dim, 2)
    plot_probabilities(hidden, X_moons, y_moons, colors, dim=dim, ax=axes[row][col])

In [None]:
# 2x2 grid: one panel per unit of the second hidden layer.
fig, axes = plt.subplots(2, 2, figsize=(15, 15))

# Dropping the last two of ann2's five layers keeps Dense -> BatchNorm -> Dense,
# so this sub-model ends at the second hidden layer.
hidden = tf.keras.Sequential(ann2.layers[:-2])

def log_(x):
    """Return ``log(1 + x)``, element-wise when ``x`` is an array.

    ``np.log1p`` is used instead of ``np.log(x + 1)`` because the latter
    rounds ``x + 1`` to exactly 1 for very small ``x`` and then returns 0,
    whereas ``log1p`` keeps full precision near zero.

    NOTE(review): nothing in the visible cells calls this helper; it looks
    like it was meant as a ``post_process`` transform -- confirm before use.
    """
    return np.log1p(x)

# Top-left panel: the helper's default output dimension (no `dim` passed).
plot_probabilities(hidden, X_moons, y_moons, colors, ax=axes[0][0])

# Pair each remaining output dimension with its panel explicitly.
for dim, panel in zip((1, 2, 3), (axes[0][1], axes[1][0], axes[1][1])):
    plot_probabilities(hidden, X_moons, y_moons, colors, dim=dim, ax=panel)