``Objectives``

* training feed-forward fully connected neural networks (FFNN);

* running a full set of experiments to explore different hyperparameters and hidden layer sizes on two datasets, and documenting the findings.

``Data``
* Digits MNIST
* Fashion MNIST

### Import libraries

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns  # for nicer plots
sns.set(style="darkgrid")  # default style

from keras.datasets import fashion_mnist
from keras.datasets import mnist

import tensorflow as tf
from tensorflow import keras
from keras import metrics
tf.get_logger().setLevel('INFO')



---
### Step 1: Data ingestion

In [None]:
# Fashion MNIST: load_data() yields a (train, test) pair, each an (images, labels) pair.
fashion_train, fashion_test = fashion_mnist.load_data()
X_train_fashion, Y_train_fashion = fashion_train
X_test_fashion, Y_test_fashion = fashion_test

# Digits MNIST: same (train, test) / (images, labels) layout.
digits_train, digits_test = mnist.load_data()
X_train_digits, Y_train_digits = digits_train
X_test_digits, Y_test_digits = digits_test

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 3us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m 5513216/11490434[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m17s[0m 3us/step

---
### Step 2: Data preprocessing

``Fashion MNIST``

In [None]:
# Scale the pixel values of both splits by 1/255.
X_train_fashion, X_test_fashion = X_train_fashion / 255., X_test_fashion / 255.

# Make sure the label arrays are flat vectors of label values.
Y_train_fashion, Y_test_fashion = Y_train_fashion.flatten(), Y_test_fashion.flatten()

# Human-readable class names; the list position is the label id.
label_names = [
    't-shirt',
    'trouser',
    'pullover',
    'dress',
    'coat',
    'sandal',
    'shirt',
    'sneaker',
    'bag',
    'ankle boot',
]

# Shuffle the training examples with a fixed seed; images and labels are
# reordered with the same permutation so they stay aligned.
np.random.seed(0)
indices = np.arange(len(X_train_fashion))
shuffled_indices = np.random.permutation(indices)
X_train_fashion, Y_train_fashion = (X_train_fashion[shuffled_indices],
                                    Y_train_fashion[shuffled_indices])

``Digits MNIST``

In [None]:
# Scale the pixel values of both splits by 1/255.
X_train_digits, X_test_digits = X_train_digits / 255., X_test_digits / 255.

# Make sure the label arrays are flat vectors of label values.
Y_train_digits, Y_test_digits = Y_train_digits.flatten(), Y_test_digits.flatten()

# Shuffle the training examples with a fixed seed; images and labels are
# reordered with the same permutation so they stay aligned.
np.random.seed(0)
indices = np.arange(len(X_train_digits))
shuffled_indices = np.random.permutation(indices)
X_train_digits, Y_train_digits = (X_train_digits[shuffled_indices],
                                  Y_train_digits[shuffled_indices])




---
### Step 3: Exploratory data analysis (EDA)


1. Show the first training example in X_train_fashion;
2. Show the first training example in X_train_digits;
3. Display the first 5 images in X_train_digits for each class in Y_train_digits, arranged in a 10x5 grid.

In [None]:
# Show the first training example of each dataset in its own 5x5 figure
# (Fashion first, then Digits).
first_example_plots = (
    (X_train_fashion, "First Training Example in Fashion Dataset"),
    (X_train_digits, "First Training Example in Digits Dataset"),
)
for images, figure_title in first_example_plots:
    plt.figure(figsize=(5, 5))
    plt.imshow(images[0])
    plt.title(figure_title)
    plt.axis("off")
    plt.show()

def display_first_5_images_per_class(X_train, Y_train, num_classes=10):
    """Plot the first five images of every class in a `num_classes` x 5 grid.

    Args:
    X_train: Indexable collection of images; `X_train[i]` is passed to `plt.imshow`.
    Y_train: Array of class ids compared element-wise against each class id.
    num_classes: Number of classes (grid rows); class ids run from 0 to num_classes - 1.

    Returns:
    None. The figure is rendered with `plt.show()`.
    """
    n_cols = 5  # images shown per class (one grid row per class)
    plt.figure(figsize=(15, 30))

    for label in range(num_classes):
        # Positions of the first `n_cols` examples carrying this label.
        first_matches = np.nonzero(Y_train == label)[0][:n_cols]

        for col, sample_idx in enumerate(first_matches, start=1):
            # Subplot slots are 1-based and filled row by row.
            plt.subplot(num_classes, n_cols, label * n_cols + col)
            plt.imshow(X_train[sample_idx])
            plt.title(label)
            plt.axis('off')

    plt.tight_layout()
    plt.show()

# Render the 10x5 grid for the digits training set.
display_first_5_images_per_class(X_train=X_train_digits,
                                 Y_train=Y_train_digits,
                                 num_classes=10)


### Step 4: Modeling

In [None]:
def build_model(n_classes,
                hidden_layer_sizes=None,
                activation='relu',
                optimizer='SGD',
                learning_rate=0.01,
                metric='accuracy'):
    """Build and compile a feed-forward, fully connected classifier using Keras.

    The network flattens each 28x28 input, applies one Dense layer per entry
    of `hidden_layer_sizes`, and ends with a softmax layer of `n_classes`
    units. With no hidden layers it reduces to multi-class logistic regression.

    Args:
    n_classes: Number of output classes in the dataset.
    hidden_layer_sizes: A list with the number of units in each hidden layer.
      None or an empty list means no hidden layers.
    activation: The activation function to use for the hidden layers
      (the output layer always uses softmax).
    optimizer: The optimizer name: 'SGD', 'Adam', 'RMSprop' or 'Adagrad'.
      Any other value falls back to SGD.
    learning_rate: The desired learning rate for the optimizer.
    metric: The metric handed to `model.compile` (default 'accuracy').

    Returns:
    model: A compiled tf.keras model.
    """
    if hidden_layer_sizes is None:
        hidden_layer_sizes = []

    # Clear Keras global state and fix the NumPy/TensorFlow seeds before any
    # layer is created.
    tf.keras.backend.clear_session()
    np.random.seed(0)
    tf.random.set_seed(0)

    model = tf.keras.Sequential([
        tf.keras.Input(shape=(28, 28)),  # input shape is declared explicitly
        tf.keras.layers.Flatten(),
    ])

    # Hidden layers are named Hidden_1, Hidden_2, ... in the order given.
    for i, hidden_layer_size in enumerate(hidden_layer_sizes):
        model.add(tf.keras.layers.Dense(units=hidden_layer_size,
                                        activation=activation,
                                        name=f'Hidden_{i+1}'))

    # Output layer
    model.add(tf.keras.layers.Dense(units=n_classes, activation='softmax', name='Output'))

    # Map names to optimizer classes so only the selected optimizer is
    # instantiated; unknown names keep the original fallback to SGD.
    optimizer_classes = {
        'SGD': tf.keras.optimizers.SGD,
        'Adam': tf.keras.optimizers.Adam,
        'RMSprop': tf.keras.optimizers.RMSprop,
        'Adagrad': tf.keras.optimizers.Adagrad,
    }
    optimizer_class = optimizer_classes.get(optimizer, tf.keras.optimizers.SGD)

    # Sparse categorical cross-entropy is used, so labels are passed as
    # integer class ids rather than one-hot vectors.
    model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=optimizer_class(learning_rate=learning_rate),
                  metrics=[metric])
    return model


---
### Step 5: Experiments

In [None]:
def train_and_evaluate(data='digits',
                       hidden_layer_sizes=None,
                       activation='tanh',
                       optimizer='Adam',
                       learning_rate=0.01,
                       metric='accuracy',
                       num_epochs=10,
                       batch_size=64,
                       validation_split=0.1):
    """Train a model on one of the two datasets and report its learning curves.

    Builds a model with `build_model`, fits it on the chosen training set,
    plots the per-epoch training and validation metric, and prints the values
    from the last epoch.

    Args:
    data: Which dataset to use: 'digits' or 'fashion'.
    hidden_layer_sizes: A list with the number of units in each hidden layer.
      None or an empty list means no hidden layers.
    activation: The activation function to use for the hidden layers.
    optimizer: The optimizer name forwarded to `build_model`.
    learning_rate: The desired learning rate for the optimizer.
    metric: The metric to track; also the key read from the training history.
    num_epochs: Number of training epochs.
    batch_size: Mini-batch size passed to `model.fit`.
    validation_split: Fraction of the training data passed to `model.fit`
      as `validation_split`.

    Returns:
    model: The trained tf.keras model.

    Raises:
    ValueError: If `data` is neither 'digits' nor 'fashion'.
    """
    # Select the dataset first so an invalid name fails before a model is built.
    # X_test / Y_test are selected as in the original but are not used here.
    if data == 'digits':
        X_train = X_train_digits
        X_test = X_test_digits
        Y_train = Y_train_digits
        Y_test = Y_test_digits
    elif data == 'fashion':
        X_train = X_train_fashion
        X_test = X_test_fashion
        Y_train = Y_train_fashion
        Y_test = Y_test_fashion
    else:
        # Python 3 cannot raise a plain string; raise a real exception instead.
        raise ValueError('Unsupported dataset: %s' % data)

    # Normalise here so this function works with any build_model that expects
    # a list of hidden layer sizes.
    hidden_layer_sizes = [] if hidden_layer_sizes is None else list(hidden_layer_sizes)

    # Build the model.
    model = build_model(n_classes=10,
                        hidden_layer_sizes=hidden_layer_sizes,
                        activation=activation,
                        optimizer=optimizer,
                        metric=metric,
                        learning_rate=learning_rate)

    # Train the model.
    print('Training the', data, 'model...')
    history = model.fit(
        x=X_train,
        y=Y_train,
        epochs=num_epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        verbose=0)

    # Retrieve the per-epoch training and validation metric. The history keys
    # are expected to be the metric name and 'val_' + name; for the default
    # 'accuracy' this is exactly what the original hard-coded.
    metric_name = metric if isinstance(metric, str) else getattr(metric, 'name', str(metric))
    train_scores = history.history[metric_name]
    val_scores = history.history['val_' + metric_name]

    plt.plot(train_scores, label='train_%s' % metric_name)
    plt.plot(val_scores, label='validation %s' % metric_name)
    plt.xticks(range(num_epochs))
    plt.xlabel('Train epochs')
    plt.ylabel(metric_name.capitalize())
    plt.legend()
    plt.show()

    # Report the values from the final epoch.
    print('Training %s: %1.4f' % (metric_name, train_scores[-1]))
    print('Validation %s: %1.4f' % (metric_name, val_scores[-1]))

    return model


# Baseline run with the function's default settings, spelled out explicitly:
# digits, no hidden layers, tanh, Adam, learning rate 0.01, 10 epochs.
model = train_and_evaluate(
    data='digits',
    hidden_layer_sizes=[],
    activation='tanh',
    optimizer='Adam',
    learning_rate=0.01,
    metric='accuracy',
    num_epochs=10,
)

1. Conduct experiments and record the training and validation set accuracy results in the table below. 

Data | Hidden sizes | Activation| Optimizer | Learning rate | #Parameters | Training accuracy| Validation accuracy
-|-|-|-|-|-|-|-
 digits | [] | tanh | Adam |0.01| 7850 | 0.9245 | 0.9098
 digits | [] | tanh | SGD |0.01| 7850 | 0.9002 | 0.9025
 digits | [] | relu | SGD |0.01| 7850 | 0.9011 | 0.8998
 digits | [] | relu | Adam |0.01| 7850 | 0.9247 | 0.9093
 digits | [128] | relu | Adam |0.01| 101770 | 0.9819 | 0.9622
 digits | [256, 128] | relu | Adam |0.01| 235146 | 0.9798 | 0.9640
-|-|-|-|-|-|-|-
 fashion | [] | tanh | SGD |0.01| 7850 | 0.8354 | 0.8208
 fashion | [] | relu | SGD |0.01| 7850 | 0.8356 | 0.8205
 fashion | [] | relu | Adam |0.01| 7850 | 0.8505| 0.8257
 fashion | [128] | relu | Adam |0.01| 101770 | 0.8834 | 0.8557
 fashion | [256, 128] | relu | Adam |0.01| 235146 | 0.8820 | 0.8630

In [None]:
# Digits, no hidden layers, SGD. With no hidden layers the `activation`
# argument is not used by build_model's hidden-layer loop.
model1 = train_and_evaluate(
    data='digits',
    hidden_layer_sizes=[],
    activation='tanh',
    optimizer='SGD',
    learning_rate=0.01,
    num_epochs=10,
    metric='accuracy',
)


In [None]:
# Digits, no hidden layers, SGD, with activation set to relu. With no hidden
# layers the `activation` argument is not used by build_model's hidden-layer loop.
model2 = train_and_evaluate(
    data='digits',
    hidden_layer_sizes=[],
    activation='relu',
    optimizer='SGD',
    learning_rate=0.01,
    num_epochs=10,
    metric='accuracy',
)

In [None]:
# Digits, no hidden layers, Adam instead of SGD.
model3 = train_and_evaluate(
    data='digits',
    hidden_layer_sizes=[],
    activation='relu',
    optimizer='Adam',
    learning_rate=0.01,
    num_epochs=10,
    metric='accuracy',
)

In [None]:
# Digits, one hidden layer of 128 relu units, Adam.
model4 = train_and_evaluate(
    data='digits',
    hidden_layer_sizes=[128],
    activation='relu',
    optimizer='Adam',
    learning_rate=0.01,
    num_epochs=10,
    metric='accuracy',
)

In [None]:
# Digits, two hidden layers (256 then 128 relu units), Adam.
# This model is the one evaluated on the digits test set in Step 6.
model5 = train_and_evaluate(
    data='digits',
    hidden_layer_sizes=[256, 128],
    activation='relu',
    optimizer='Adam',
    learning_rate=0.01,
    num_epochs=10,
    metric='accuracy',
)

In [None]:
# Fashion, no hidden layers, SGD. With no hidden layers the `activation`
# argument is not used by build_model's hidden-layer loop.
model6 = train_and_evaluate(
    data='fashion',
    hidden_layer_sizes=[],
    activation='tanh',
    optimizer='SGD',
    learning_rate=0.01,
    num_epochs=10,
    metric='accuracy',
)

In [None]:
# Fashion, no hidden layers, SGD, with activation set to relu. With no hidden
# layers the `activation` argument is not used by build_model's hidden-layer loop.
model7 = train_and_evaluate(
    data='fashion',
    hidden_layer_sizes=[],
    activation='relu',
    optimizer='SGD',
    learning_rate=0.01,
    num_epochs=10,
    metric='accuracy',
)

In [None]:
# Fashion, no hidden layers, Adam instead of SGD.
model8 = train_and_evaluate(
    data='fashion',
    hidden_layer_sizes=[],
    activation='relu',
    optimizer='Adam',
    learning_rate=0.01,
    num_epochs=10,
    metric='accuracy',
)

In [None]:
# Fashion, one hidden layer of 128 relu units, Adam.
model9 = train_and_evaluate(
    data='fashion',
    hidden_layer_sizes=[128],
    activation='relu',
    optimizer='Adam',
    learning_rate=0.01,
    num_epochs=10,
    metric='accuracy',
)

In [None]:
# Preferred architecture configuration
# Hidden Layers: [256, 128]
# Activation Function: relu
# Optimizer: Adam
# Learning Rate: 0.01
# Number of Epochs: 10 (matches num_epochs in the call below)
#
# This model is the one evaluated on the fashion test set in Step 6.
model10 = train_and_evaluate(
    data='fashion',
    hidden_layer_sizes=[256, 128],
    activation='relu',
    optimizer='Adam',
    learning_rate=0.01,
    metric='accuracy',
    num_epochs=10,
)

---
### Step 6: Evaluation and Generalization

In [None]:
# Digits: score model5 on the full training set and on the test set.
# evaluate() returns the loss followed by the compiled metric.
train_loss_digits, train_accuracy_digits = model5.evaluate(x=X_train_digits, y=Y_train_digits)
test_loss_digits, test_accuracy_digits = model5.evaluate(x=X_test_digits, y=Y_test_digits)

print("Aggregate training accuracy for digits datasets: {:.4f}".format(train_accuracy_digits))
print("Aggregate test accuracy for digits datasets: {:.4f}".format(test_accuracy_digits))


# Fashion: score model10 on the full training set and on the test set.
train_loss_fashion, train_accuracy_fashion = model10.evaluate(x=X_train_fashion, y=Y_train_fashion)
test_loss_fashion, test_accuracy_fashion = model10.evaluate(x=X_test_fashion, y=Y_test_fashion)

print("Aggregate training accuracy for fashion datasets: {:.4f}".format(train_accuracy_fashion))
print("Aggregate test accuracy for fashion datasets: {:.4f}".format(test_accuracy_fashion))

# The model shows strong generalization capabilities because
# train and test accuracies are close (gaps: 1% from digits, 2.5% from fashion).
