In [6]:
import numpy as np
import matplotlib.pyplot as plt

# NN by hand and from tensorflow

## A naive by-hand implementation of a FF NN

Activation functions

In [7]:
def ReLU(x):
    zeros = np.zeros_like(x)
    return np.maximum(x, zeros)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

Construct a dense layer

In [45]:
def hidden(activ_func, weights, inpt):
    if weights.shape[0] != inpt.shape[1] + 1:
        raise IndexError(f"Input shape {inpt.shape} and weight shape {weights.shape} do not match for multiplication")
    N_inpt = inpt.shape[0]
    inpt_ext = np.concat((np.ones((N_inpt, 1)), inpt), axis=1)
    inpt_weighted = np.dot(inpt_ext, weights)
    outpt = activ_func(inpt_weighted)
    return outpt

def initialize_weights(N_inpt_neurons, N_output_neurons):
    init_weights = np.random.uniform(0, 1, (1 + N_inpt_neurons, N_output_neurons))
    return init_weights

binary_inpt = np.zeros((4,2))
binary_inpt[0,:] = np.array([0, 0])
binary_inpt[1,:] = np.array([0, 1])
binary_inpt[2,:] = np.array([1, 0])
binary_inpt[3,:] = np.array([1, 1])

binary_outpt_nand = np.zeros((4))
binary_outpt_nand[0] = 1
binary_outpt_nand[1] = 1
binary_outpt_nand[2] = 1
binary_outpt_nand[3] = 0

In [34]:
hidden(sigmoid, initialize_weights(4,1), hidden(ReLU, initialize_weights(8,4), hidden(ReLU, initialize_weights(2, 8), binary_inpt)))

array([[0.98215134],
       [0.99990875],
       [0.99910845],
       [0.99999552]])

With initialized weights chosen uniformly between 0 and 1, the above command creates a dense neural network with two hidden layers with 8 and 4 neurons as well as one output layer with one neuron and a sigmoid activation function to obtain an output between 0 and 1.\
\
To train this network, one now would have to compare the output to the actual labels and perform a back propagation via gradient descent. Repeating these steps, the NN would learn the corresponding logical gate.

## Using tensorflow

In [38]:
import tensorflow
import keras
from keras import layers, ops

In [156]:
NN_model = keras.Sequential(
    [
        keras.Input(shape=(2,)),
        layers.Dense(8, activation='relu'),
        layers.Dense(4, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ]
)

print(NN_model.summary())

None


In [157]:
NN_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="mean_squared_error",
)

NN_model.fit(
    x=binary_inpt, 
    y=binary_outpt_nand,
    epochs=1000,
)

Epoch 1/1000


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 228ms/step - loss: 0.2418
Epoch 2/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.2411
Epoch 3/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.2406
Epoch 4/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.2402
Epoch 5/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.2398
Epoch 6/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.2394
Epoch 7/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.2391
Epoch 8/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.2389
Epoch 9/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.2386
Epoch 10/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.2383
Epoch 11/1000
[1m1/

<keras.src.callbacks.history.History at 0x362cd2930>

In [158]:
def binary_pred(model, inpt):
    prediction = model.predict(inpt)
    return np.int64((prediction > 1/2).reshape((inpt.shape[0])))

binary_pred(NN_model, binary_inpt)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


array([1, 1, 1, 0])

## Creating an overfitting network and using regularization techniques

In [90]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [198]:
iris = load_iris()
X = iris['data']
y = iris['target']
X_train, X_test, y_train_pre, y_test_pre = train_test_split(X, y, train_size=20)

In [144]:
# OneHot encoding the categorical labels
onehot = OneHotEncoder()
y_train = onehot.fit_transform(y_train_pre.reshape(-1, 1)).toarray()
y_test = onehot.transform(y_test_pre.reshape(-1, 1)).toarray()

In [149]:
overfit_model = keras.Sequential(
    [
        keras.Input(shape=(4, )),
        layers.Dense(128, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(3, activation='softmax'),
    ]
)

overfit_model.compile(
    optimizer='adam',
    loss='CategoricalCrossentropy'
)

overfit_model.fit(x=X_train, y=y_train, epochs=1000)

Epoch 1/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step - loss: 1.6452
Epoch 2/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1.3748
Epoch 3/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1.1877
Epoch 4/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.0744
Epoch 5/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1.0243
Epoch 6/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.0083
Epoch 7/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1.0021
Epoch 8/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.9935
Epoch 9/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.9758
Epoch 10/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.9479
Epoch 1

<keras.src.callbacks.history.History at 0x3617d3860>

In [153]:
prob_pred = overfit_model.predict(X_test)
y_pred = np.argmax(prob_pred, axis=1)
#keras.losses.categorical_crossentropy(prob_pred, y_test[None,:,:])

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 618us/step


In [155]:
print(f"The percentage of missclassifications for the overfitting neural network on the test set is given by {100 * (1 - (np.sum(np.argmax(prob_pred, axis=1) == y_test_pre) / len(y_test)))}%.")

The percentage of missclassifications for the overfitting neural network on the test set is given by 3.076923076923077%.


Here, overfitting still doesn't occur due to the inherent clean nature of the iris data set. I will consider in the following a different data set, maybe for image recognition.

In [200]:
mnist = keras.datasets.mnist.load_data()
X = mnist[0][0]
y = mnist[0][1]
X_train_pre, X_test_pre, y_train_pre, y_test = train_test_split(X, y, train_size=200, test_size=10000)
X_train = X_train_pre.reshape((200, 784))
X_test = X_test_pre.reshape((10000, 784))

In [201]:
onehot = OneHotEncoder()
y_train = onehot.fit_transform(y_train_pre.reshape(-1, 1)).toarray()

In [221]:
overfitting_model = keras.Sequential(
    [
        layers.Input(shape=(784,)),
        layers.Dense(128, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(10, activation='softmax'),
    ]
)

overfitting_model.summary()

In [204]:
overfitting_model.compile(
    optimizer='adam',
    loss='CategoricalCrossentropy'
)

overfitting_model.fit(x=X_train, y=y_train, epochs=1000)

Epoch 1/1000


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 870us/step - loss: 42.3637
Epoch 2/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 872us/step - loss: 8.4328
Epoch 3/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 855us/step - loss: 3.0499
Epoch 4/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 766us/step - loss: 1.0294
Epoch 5/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 730us/step - loss: 0.3325
Epoch 6/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 850us/step - loss: 0.0939
Epoch 7/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 765us/step - loss: 0.0464
Epoch 8/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 774us/step - loss: 0.0255
Epoch 9/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 790us/step - loss: 0.0087
Epoch 10/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 764us/step - loss: 0.0070
E

<keras.src.callbacks.history.History at 0x3643d4500>

In [207]:
prob_pred = overfitting_model.predict(X_test)
y_pred = np.argmax(prob_pred, axis=1)
print(f"The percentage of missclassifications for the overfitting neural network on the test set is given by {100 * (1 - (np.sum(np.argmax(prob_pred, axis=1) == y_test) / len(y_test)))}%.")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 322us/step
The percentage of missclassifications for the overfitting neural network on the test set is given by 43.03%.


Finally, we observe here a rather large classification error due to the relatively small sample size of $N=200$ and a complex model with $17539$ parameters. Let us now explore different regularization methods.

### z-score feature scaling

In [209]:
from sklearn.preprocessing import StandardScaler

In [215]:
scaler = StandardScaler()
X_train_rescaled = scaler.fit_transform(X_train)
X_test_rescaled = scaler.transform(X_test)

In [216]:
overfitting_model.fit(x=X_train_rescaled, y=y_train, epochs=1000)

Epoch 1/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 956us/step - loss: 1.8990
Epoch 2/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 804us/step - loss: 1.0532
Epoch 3/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 910us/step - loss: 0.3668
Epoch 4/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 844us/step - loss: 0.1329
Epoch 5/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 813us/step - loss: 0.0351
Epoch 6/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 779us/step - loss: 0.0148
Epoch 7/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 861us/step - loss: 0.0062
Epoch 8/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 788us/step - loss: 0.0014
Epoch 9/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 770us/step - loss: 6.0863e-04
Epoch 10/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 848us/step 

<keras.src.callbacks.history.History at 0x3650960f0>

In [217]:
prob_pred = overfitting_model.predict(X_test_rescaled)
y_pred = np.argmax(prob_pred, axis=1)
print(f"The percentage of missclassifications for the overfitting neural network on the test set is given by {100 * (1 - (np.sum(np.argmax(prob_pred, axis=1) == y_test) / len(y_test)))}%.")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 320us/step
The percentage of missclassifications for the overfitting neural network on the test set is given by 30.389999999999993%.


Re-scaling the data already leads to a significant improvement of predictions on the test set!

### $\ell_2$-Penalty

Let's look at the effects of regularizing the parameters via an $\ell_2$ penalty. This leads to a down-scaling of the weights, in particular those with low variance. Since we train the model on centered data, we do not include a penalty on the bias (which would otherwise be added via `bias_regularizer=regularizers.L2()`).

In [218]:
from keras import regularizers

In [228]:
l2_model = keras.Sequential(
    [
        layers.Input(shape=(784,)),
        layers.Dense(units=128, kernel_regularizer=regularizers.L2(1e-3), activation='relu'),
        layers.Dense(128, kernel_regularizer=regularizers.L2(1e-3),activation='relu'),
        layers.Dense(64, kernel_regularizer=regularizers.L2(1e-3),activation='relu'),
        layers.Dense(32, kernel_regularizer=regularizers.L2(1e-3),activation='relu'),
        layers.Dense(10, activation='softmax'),
    ]
)

l2_model.summary()

In [229]:
l2_model.compile(
    optimizer='adam',
    loss='CategoricalCrossentropy'
)

l2_model.fit(x=X_train_rescaled, y=y_train, epochs=1000)

Epoch 1/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 940us/step - loss: 2.7782
Epoch 2/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 766us/step - loss: 2.1454
Epoch 3/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 808us/step - loss: 1.6313
Epoch 4/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 744us/step - loss: 1.1935
Epoch 5/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 816us/step - loss: 0.8445
Epoch 6/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 831us/step - loss: 0.6200
Epoch 7/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 729us/step - loss: 0.5147
Epoch 8/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 796us/step - loss: 0.4605
Epoch 9/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 749us/step - loss: 0.4305
Epoch 10/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 798us/step - lo

A difference we immediately notice is that the loss is much higher than in the previous cases without $\ell_2$-penalty. This reflects an increase of bias. Let's see if the variance is reduced!

In [230]:
prob_pred = l2_model.predict(X_test_rescaled)
y_pred = np.argmax(prob_pred, axis=1)
print(f"The percentage of missclassifications for the overfitting neural network on the test set is given by {100 * (1 - (np.sum(np.argmax(prob_pred, axis=1) == y_test) / len(y_test)))}%.")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 315us/step
The percentage of missclassifications for the overfitting neural network on the test set is given by 27.749999999999996%.


The model slightly improves by adding the $\ell_2$-penalty. Notice however that if the parameter in front of the $\ell_2$-term is chosen too large (for instance $\lambda = 0.05$) the models bias and variance actually increase. There is thus some tuning necessary.

### $\ell_1$-penalty

In [237]:
l1_model = keras.Sequential(
    [
        layers.Input(shape=(784,)),
        layers.Dense(units=128, kernel_regularizer=regularizers.L1(1e-5), activation='relu'),
        layers.Dense(128, kernel_regularizer=regularizers.L1(1e-5),activation='relu'),
        layers.Dense(64, kernel_regularizer=regularizers.L1(1e-5),activation='relu'),
        layers.Dense(32, kernel_regularizer=regularizers.L1(1e-5),activation='relu'),
        layers.Dense(10, activation='softmax'),
    ]
)

l1_model.summary()

In [238]:
l1_model.compile(
    optimizer='adam',
    loss='CategoricalCrossentropy'
)

l1_model.fit(x=X_train_rescaled, y=y_train, epochs=1000)

Epoch 1/1000


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2.3421  
Epoch 2/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.7685
Epoch 3/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 962us/step - loss: 1.2948
Epoch 4/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 944us/step - loss: 0.8654
Epoch 5/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5568
Epoch 6/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.3357
Epoch 7/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.2076
Epoch 8/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1350
Epoch 9/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1012
Epoch 10/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 978us/step - loss: 0.0801
Epoch 11/1000


<keras.src.callbacks.history.History at 0x37d8d1790>

In [239]:
prob_pred = l1_model.predict(X_test_rescaled)
y_pred = np.argmax(prob_pred, axis=1)
print(f"The percentage of missclassifications for the overfitting neural network on the test set is given by {100 * (1 - (np.sum(np.argmax(prob_pred, axis=1) == y_test) / len(y_test)))}%.")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 322us/step
The percentage of missclassifications for the overfitting neural network on the test set is given by 29.439999999999998%.


A slight increase in performance can be achieved for a small $\ell_1$-penalty. If the parameter chosen is $\lambda \geq 10^{-3}$ bias and variance increase. Let's see whether an elastic net regularization works better.

### Elastic net

In [246]:
EN_model = keras.Sequential(
    [
        layers.Input(shape=(784,)),
        layers.Dense(units=128, kernel_regularizer=regularizers.L1L2(1e-5, 1e-3), activation='relu'),
        layers.Dense(128, kernel_regularizer=regularizers.L1L2(1e-5, 1e-3),activation='relu'),
        layers.Dense(64, kernel_regularizer=regularizers.L1L2(1e-5, 1e-3),activation='relu'),
        layers.Dense(32, kernel_regularizer=regularizers.L1L2(1e-5, 1e-3),activation='relu'),
        layers.Dense(10, activation='softmax'),
    ]
)

EN_model.summary()

In [247]:
EN_model.compile(
    optimizer='adam',
    loss='CategoricalCrossentropy'
)

EN_model.fit(x=X_train_rescaled, y=y_train, epochs=1000)

Epoch 1/1000


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 975us/step - loss: 2.8849
Epoch 2/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 959us/step - loss: 2.3498
Epoch 3/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 887us/step - loss: 1.9578
Epoch 4/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 885us/step - loss: 1.6656
Epoch 5/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 879us/step - loss: 1.2360
Epoch 6/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 844us/step - loss: 0.9281
Epoch 7/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 799us/step - loss: 0.6886
Epoch 8/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 837us/step - loss: 0.5692
Epoch 9/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 886us/step - loss: 0.5011
Epoch 10/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 864us/step - loss: 0.4732
Ep

<keras.src.callbacks.history.History at 0x36fb196d0>

In [248]:
prob_pred = EN_model.predict(X_test_rescaled)
y_pred = np.argmax(prob_pred, axis=1)
print(f"The percentage of missclassifications for the overfitting neural network on the test set is given by {100 * (1 - (np.sum(np.argmax(prob_pred, axis=1) == y_test) / len(y_test)))}%.")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 321us/step
The percentage of missclassifications for the overfitting neural network on the test set is given by 27.83%.


Combining $\ell_1$ and $\ell_2$ regularization does lead to similar results as for the pure $\ell_2$ regularization case. We seem to hit a ceiling with these types of regularization methods. Let's try in the next step to add dropout.

### Dropout

In [249]:
dropout_model = keras.Sequential(
    [
        layers.Input(shape=(784,)),
        layers.Dense(units=128, kernel_regularizer=regularizers.L1L2(1e-5, 1e-3), activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(128, kernel_regularizer=regularizers.L1L2(1e-5, 1e-3),activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(64, kernel_regularizer=regularizers.L1L2(1e-5, 1e-3),activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(32, kernel_regularizer=regularizers.L1L2(1e-5, 1e-3),activation='relu'),
        layers.Dense(10, activation='softmax'),
    ]
)

dropout_model.summary()

In [250]:
dropout_model.compile(
    optimizer='adam',
    loss='CategoricalCrossentropy'
)

dropout_model.fit(x=X_train_rescaled, y=y_train, epochs=1000)

Epoch 1/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 3.3528  
Epoch 2/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2.9895
Epoch 3/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 970us/step - loss: 2.8491
Epoch 4/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 949us/step - loss: 2.7980
Epoch 5/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 964us/step - loss: 2.6782
Epoch 6/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 944us/step - loss: 2.6164
Epoch 7/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 995us/step - loss: 2.5893
Epoch 8/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 954us/step - loss: 2.5200
Epoch 9/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 916us/step - loss: 2.4477
Epoch 10/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 927us/step - loss

<keras.src.callbacks.history.History at 0x3804714c0>

In [251]:
prob_pred = dropout_model.predict(X_test_rescaled)
y_pred = np.argmax(prob_pred, axis=1)
print(f"The percentage of missclassifications for the overfitting neural network on the test set is given by {100 * (1 - (np.sum(np.argmax(prob_pred, axis=1) == y_test) / len(y_test)))}%.")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 316us/step
The percentage of missclassifications for the overfitting neural network on the test set is given by 24.439999999999994%.


Dropout gives a substantial improvement so that the model now has an accuracy of ~25%. Let's see if we can improve further by augmenting the few input samples we have.

### Data augmentation

In [264]:
mnist = keras.datasets.mnist.load_data()
X = mnist[0][0]
y = mnist[0][1]
X_train, X_test, y_train_pre, y_test = train_test_split(X, y, train_size=200, test_size=10000)
onehot = OneHotEncoder()
y_train = onehot.fit_transform(y_train_pre.reshape(-1, 1)).toarray()

X_train = X_train / 255.0 # normalization
X_test = X_test / 255.0 # normalization

In [293]:
augmentation_layer = keras.Sequential(
    [
     layers.RandomZoom(0.3)
    ]
)

In [294]:
augmented_model = keras.Sequential(
    [
        layers.Input(shape=(28, 28,)),
        augmentation_layer,
        layers.Flatten(),
        layers.Dense(units=128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(10, activation='softmax'),
    ]
)

In [295]:
augmented_model.compile(
    optimizer='adam',
    loss='CategoricalCrossentropy'
)

augmented_model.fit(x=X_train, y=y_train, epochs=1000)

Epoch 1/1000


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 2.4902  
Epoch 2/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2.3285
Epoch 3/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 999us/step - loss: 2.3980
Epoch 4/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 937us/step - loss: 2.3641
Epoch 5/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 911us/step - loss: 2.3424
Epoch 6/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 955us/step - loss: 2.3180
Epoch 7/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 974us/step - loss: 2.3612
Epoch 8/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 964us/step - loss: 2.3526
Epoch 9/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 884us/step - loss: 2.3220
Epoch 10/1000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2.2897 
Epoch

<keras.src.callbacks.history.History at 0x392cbd670>

In [296]:
prob_pred = augmented_model.predict(X_test)
y_pred = np.argmax(prob_pred, axis=1)
print(f"The percentage of missclassifications for the overfitting neural network on the test set is given by {100 * (1 - (np.sum(np.argmax(prob_pred, axis=1) == y_test) / len(y_test)))}%.")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 396us/step
The percentage of missclassifications for the overfitting neural network on the test set is given by 22.199999999999996%.


Using the `RandomZoom` layer for data augmentation drastically improved the model performance so that we almost reach now a precision of 80%. Rotations lead to a decrease of performance. By zooming, the border of the image is cut off, which could correspond to focussing on the predictive features.