# Homework - DNN

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist
from tensorflow.keras.callbacks import EarlyStopping

## Preprocessing

In [None]:
# Load MNIST; load_data() yields a (train, test) pair, each an (images, labels) tuple.
train_split, test_split = mnist.load_data()
X_train_pre, y_train_pre = train_split
X_test_pre, y_test_pre = test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
input_dim = 784  # 28*28 values per flattened image

# Flatten every image into one row of `input_dim` features, cast to float32
# and divide by 255 -- done as a single expression per split.
X_train = X_train_pre.reshape(len(X_train_pre), input_dim).astype('float32') / 255.
X_test = X_test_pre.reshape(len(X_test_pre), input_dim).astype('float32') / 255.

# Report the resulting shapes as a quick sanity check.
print('training data shape:', X_train.shape)
print('{} train samples, {} test samples'.format(X_train.shape[0], X_test.shape[0]))

training data shape: (60000, 784)
60000 train samples, 10000 test samples


In [None]:
num_classes = 10  # digits 0-9

# Convert both integer label vectors into categorical matrices with
# `num_classes` columns, as expected by the categorical cross-entropy loss.
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train_pre, y_test_pre)
)

In [None]:
# Hold out a random subset of the training rows as a validation set.
#
# NOTE: this cell overwrites X_train / y_train with the reduced training split,
# so re-running it without first re-running the preprocessing cells would split
# the already-reduced data a second time.
validation_split = 0.166667  # fraction of the training rows moved to validation
split_seed = 42  # fixed seed so the split (and the reported metrics) are reproducible

# Features and labels are indexed with the same permutation below, so they
# must have the same number of rows.
assert X_train.shape[0] == y_train.shape[0], "X_train and y_train row counts differ"

# Seeded generator instead of the global NumPy state: Restart & Run All now
# always produces the same train/validation partition.
rng = np.random.default_rng(split_seed)
index = rng.permutation(X_train.shape[0])

# Despite its name, `train_idx` is the number of validation rows: the first
# `train_idx` shuffled indices go to validation, the rest stay in training.
train_idx = int(X_train.shape[0]*validation_split)

val_rows, train_rows = index[:train_idx], index[train_idx:]
X_val, X_train = X_train[val_rows], X_train[train_rows]
y_val, y_train = y_train[val_rows], y_train[train_rows]


## The models

### Build the model **1**

In [None]:
# --- Model 1 hyperparameters ---
activation_function = 'relu'        # activation of the hidden layer(s)
batch_size = 128
epochs = 20                         # upper bound; early stopping may end training sooner
optimizer = keras.optimizers.Adam(learning_rate=0.001)
n_layers = 1                        # number of hidden layers
network_architecture = [16]         # units per hidden layer
patience = 2                        # epochs without val_loss improvement before stopping

# Stop once validation loss stops improving and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=patience,
    restore_best_weights=True,
)

# Assemble the network as one list: input -> hidden Dense layers -> softmax output.
model = keras.Sequential(
    [layers.Input(shape=(input_dim,))]
    + [layers.Dense(units, activation=activation_function) for units in network_architecture]
    + [layers.Dense(num_classes, activation='softmax')]  # Output layer
)

In [None]:
# Print the layer-by-layer summary of model 1.
model.summary()

In [None]:
# Configure model 1 for training: categorical cross-entropy loss, the optimizer
# defined with the hyperparameters, and accuracy as the reported metric.
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split, scoring the validation split after every epoch so
# the early-stopping callback can monitor val_loss.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.6563 - loss: 1.1231 - val_accuracy: 0.9014 - val_loss: 0.3439
Epoch 2/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9084 - loss: 0.3308 - val_accuracy: 0.9170 - val_loss: 0.2869
Epoch 3/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9213 - loss: 0.2775 - val_accuracy: 0.9255 - val_loss: 0.2594
Epoch 4/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9272 - loss: 0.2553 - val_accuracy: 0.9301 - val_loss: 0.2427
Epoch 5/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9323 - loss: 0.2412 - val_accuracy: 0.9335 - val_loss: 0.2298
Epoch 6/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9351 - loss: 0.2279 - val_accuracy: 0.9334 - val_loss: 0.2214
Epoch 7/20
[1m391/391[0m 

In [None]:
# Score model 1 on the test set; only the second returned value (accuracy) is used.
_, acc = model.evaluate(x=X_test, y=y_test, verbose=0)

accuracy_pct = acc * 100
print("Testing set accuracy: {:.2f}%".format(accuracy_pct))

Testing set accuracy: 94.83%


### Build the model **2**

In [None]:
# --- Model 2 hyperparameters ---
activation_function_2 = 'tanh'      # activation of the hidden layers
batch_size_2 = 128
epochs_2 = 30                       # upper bound; early stopping may end training sooner
optimizer_2 = keras.optimizers.SGD(learning_rate=0.01)
n_layers_2 = 2                      # number of hidden layers
network_architecture_2 = [32, 16]   # units per hidden layer
patience_2 = 3                      # epochs without val_loss improvement before stopping

# Stop once validation loss stops improving and roll back to the best weights.
early_stopping_2 = EarlyStopping(
    monitor='val_loss',
    patience=patience_2,
    restore_best_weights=True,
)

# Assemble the network as one list: input -> hidden Dense layers -> softmax output.
model_2 = keras.Sequential(
    [layers.Input(shape=(input_dim,))]
    + [layers.Dense(units, activation=activation_function_2) for units in network_architecture_2]
    + [layers.Dense(num_classes, activation='softmax')]  # Output layer
)

In [None]:
# Print the layer-by-layer summary of model 2.
model_2.summary()

In [None]:
# Configure model 2 for training: categorical cross-entropy loss, its own
# optimizer instance, and accuracy as the reported metric.
model_2.compile(
    optimizer=optimizer_2,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split, scoring the validation split after every epoch so
# the early-stopping callback can monitor val_loss.
history_2 = model_2.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=epochs_2,
    batch_size=batch_size_2,
    callbacks=[early_stopping_2],
    verbose=1,
)

Epoch 1/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.4662 - loss: 1.7719 - val_accuracy: 0.7250 - val_loss: 1.1073
Epoch 2/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7454 - loss: 1.0273 - val_accuracy: 0.8213 - val_loss: 0.8073
Epoch 3/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8279 - loss: 0.7621 - val_accuracy: 0.8609 - val_loss: 0.6353
Epoch 4/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8613 - loss: 0.6114 - val_accuracy: 0.8720 - val_loss: 0.5355
Epoch 5/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8768 - loss: 0.5248 - val_accuracy: 0.8833 - val_loss: 0.4724
Epoch 6/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8871 - loss: 0.4651 - val_accuracy: 0.8914 - val_loss: 0.4296
Epoch 7/30
[1m391/391[0m 

In [None]:
# Score model 2 on the test set; only the second returned value (accuracy) is used.
_, acc = model_2.evaluate(x=X_test, y=y_test, verbose=0)

accuracy_pct = acc * 100
print("Testing set accuracy: {:.2f}%".format(accuracy_pct))

Testing set accuracy: 93.74%


### Build the model **3**

In [None]:
# --- Model 3 hyperparameters ---
activation_function_3 = 'softplus'        # activation of the hidden layers
batch_size_3 = 64
epochs_3 = 35                             # upper bound; early stopping may end training sooner
optimizer_3 = keras.optimizers.RMSprop(learning_rate=0.001)
n_layers_3 = 3                            # number of hidden layers
network_architecture_3 = [128, 64, 32]    # units per hidden layer
patience_3 = 4                            # epochs without val_loss improvement before stopping

# Stop once validation loss stops improving and roll back to the best weights.
early_stopping_3 = EarlyStopping(
    monitor='val_loss',
    patience=patience_3,
    restore_best_weights=True,
)

# Assemble the network as one list: input -> hidden Dense layers -> softmax output.
model_3 = keras.Sequential(
    [layers.Input(shape=(input_dim,))]
    + [layers.Dense(units, activation=activation_function_3) for units in network_architecture_3]
    + [layers.Dense(num_classes, activation='softmax')]  # Output layer
)

In [None]:
# Print the layer-by-layer summary of model 3.
model_3.summary()

In [None]:
# Configure model 3 for training: categorical cross-entropy loss, its own
# optimizer instance, and accuracy as the reported metric.
model_3.compile(
    optimizer=optimizer_3,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split, scoring the validation split after every epoch so
# the early-stopping callback can monitor val_loss.
history_3 = model_3.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=epochs_3,
    batch_size=batch_size_3,
    callbacks=[early_stopping_3],
    verbose=1,
)

Epoch 1/35
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.7952 - loss: 0.6766 - val_accuracy: 0.9123 - val_loss: 0.2778
Epoch 2/35
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9375 - loss: 0.2093 - val_accuracy: 0.9354 - val_loss: 0.2064
Epoch 3/35
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9574 - loss: 0.1425 - val_accuracy: 0.9515 - val_loss: 0.1568
Epoch 4/35
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9660 - loss: 0.1105 - val_accuracy: 0.9634 - val_loss: 0.1233
Epoch 5/35
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.9710 - loss: 0.0917 - val_accuracy: 0.9651 - val_loss: 0.1111
Epoch 6/35
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9774 - loss: 0.0743 - val_accuracy: 0.9651 - val_loss: 0.1183
Epoch 7/35
[1m782/782[0m 

In [None]:
# Score model 3 on the test set; only the second returned value (accuracy) is used.
_, acc = model_3.evaluate(x=X_test, y=y_test, verbose=0)

accuracy_pct = acc * 100
print("Testing set accuracy: {:.2f}%".format(accuracy_pct))

Testing set accuracy: 97.33%


**Performance Comparison**

Model 3 outperformed the other two models with a test accuracy of **97.33%**, benefiting from a deeper and wider architecture ([128, 64, 32]) that gave it more capacity to learn patterns. The **Softplus** activation function contributed to training stability, while the **RMSprop** optimizer dynamically adjusted the effective learning rates for improved performance. A **smaller batch size (64)** produced twice as many weight updates per epoch (782 steps versus 391), though at the cost of increased training time, and a **larger patience value (4)** gave training more room to recover from temporary increases in validation loss before early stopping triggered. **Model 1 (94.83%)**, with its **single hidden layer of 16 neurons**, had a simple architecture that limited learning capacity. While **ReLU** is an effective activation function, the low number of neurons may have restricted its impact, and a wider or deeper network could have enhanced performance despite the **Adam optimizer's** strong capabilities. **Model 2 (93.74%)** had the lowest accuracy, possibly due to **Tanh activation**, which can cause saturation issues, and the **SGD optimizer with a fixed learning rate (0.01)**, which is not adaptive and may have been too small for fast convergence: its validation loss fell much more slowly than in the other two models and was still decreasing steadily after six epochs. Although it featured two hidden layers ([32, 16]), the network was still relatively small, limiting its effectiveness.

**Recommendations**

To further optimize Model 3, several hyperparameter tuning strategies can be applied. Comparing **batch sizes (32, 64, 128)** may help balance generalization and training efficiency. A **deeper network ([256, 128, 64, 32])** with **Batch Normalization** for stability and **Dropout (0.3–0.5)** for regularization may improve model performance. Additionally, increasing the **training epochs to 50** while using **early stopping (patience=5)** and **monitoring validation accuracy** lets training run long enough to converge while limiting overfitting. Together, these tuning techniques can improve accuracy, stability, and generalization while mitigating overfitting risks.