In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from keras.datasets import mnist
from keras.utils import to_categorical
from keras import layers, regularizers
from keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Sequential

**Load MNIST dataset**

In [44]:
# Fetch the MNIST train and test splits; each split is an (images, labels) pair.
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [45]:
# Report the dimensions of the freshly loaded training images and labels.
for caption, array in (("Image_shape: ", x_train), ("Label_shape: ", y_train)):
    print(caption, array.shape)

Image_shape:  (60000, 28, 28)
Label_shape:  (60000,)


The values in the MNIST dataset range from 0 (white) to 255 (black). To keep the network inputs in a small, consistent numerical range, these values are scaled to [0, 1] by dividing by 255.

The labels range from 0 to 9, each representing one of the ten digits. At this point they are still plain integer class indices, not one-hot vectors. They are converted to one-hot (categorical) vectors because that is the target format expected by the categorical cross-entropy loss used for training. For example:

0 → [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

1 → [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

2 → [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

In [46]:
# Scale pixel intensities from [0, 255] to [0, 1] and one-hot encode the labels.
#
# Every step is guarded so that re-running this cell is a no-op: dividing
# already-scaled pixels by 255 a second time, or one-hot encoding labels that
# are already one-hot, would silently corrupt the data.
NUM_CLASSES = 10        # digits 0-9
MAX_PIXEL_VALUE = 255   # largest raw intensity value

# Anything that is not float32 yet is treated as raw, unscaled data.
if x_train.dtype != np.float32:
    x_train = x_train.astype('float32') / MAX_PIXEL_VALUE
if x_test.dtype != np.float32:
    x_test = x_test.astype('float32') / MAX_PIXEL_VALUE

# 1-D label arrays hold integer class indices; 2-D ones are already one-hot.
if y_train.ndim == 1:
    y_train = keras.utils.to_categorical(y_train, NUM_CLASSES)
if y_test.ndim == 1:
    y_test = keras.utils.to_categorical(y_test, NUM_CLASSES)

First approach: Use a Multi-Layer Perceptron (MLP).

A dense layer expects one feature vector per sample, so each 28 × 28 image
(in both the training and test sets) must be flattened into a vector of 28 * 28 = 784 values.






In [29]:
# Flatten every image into a single row of 28 * 28 = 784 features for the MLP.
flat_dim = 28 * 28
x_train = x_train.reshape((-1, flat_dim))
x_test = x_test.reshape((-1, flat_dim))

In [30]:
# Confirm the training images are now flat vectors and the labels are one-hot.
for caption, array in (("Image_shape: ", x_train), ("Label_shape: ", y_train)):
    print(caption, array.shape)

Image_shape:  (60000, 784)
Label_shape:  (60000, 10)


In [None]:
# Inspect the first training sample without dumping every pixel value:
# show its shape and value range, plus the corresponding label.
first_image, first_label = x_train[0], y_train[0]
print("image shape:", first_image.shape,
      "min:", first_image.min(),
      "max:", first_image.max())
print("label:", first_label)

In [36]:
# Seed the Python, NumPy and backend random generators before any layer is
# created, so that weight initialization is repeatable across
# Restart Kernel -> Run All.
SEED = 42
keras.utils.set_random_seed(SEED)

# Single-hidden-layer MLP built with the functional API:
# 784 flattened pixels -> 512 ReLU units -> 10-way softmax (one unit per digit).
inputs = keras.Input(shape=(28 * 28,), name="mnist")
x = Dense(512, activation='relu')(inputs)
outputs = Dense(10, activation='softmax')(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Recall that the pixel values, originally in the range 0 (white) to 255 (black), have already been normalized to [0, 1] to improve numerical consistency.

The labels, which range from 0 to 9 (one per digit), have likewise already been converted to one-hot encoded vectors. This is the target format expected by the categorical cross-entropy loss used to compile the model below. For example:

0 → [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

1 → [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

2 → [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

In [37]:
# Training configuration for the MLP: Adam optimizer, categorical cross-entropy
# on the one-hot labels, and accuracy reported as the monitoring metric.
model.compile(
    optimizer='adam',
    metrics=['accuracy'],
    loss=keras.losses.categorical_crossentropy,
)

In [38]:
# Train the MLP for 10 epochs with mini-batches of 128 samples.
# The returned History object is kept (instead of being echoed as the cell's
# output) so the per-epoch loss/accuracy values can be inspected or plotted later.
# NOTE(review): the test split is also used as validation data, so the val_*
# metrics are not an unbiased estimate if they are used to tune the model.
mlp_history = model.fit(
    x_train, y_train,
    batch_size=128,
    epochs=10,
    verbose=1,
    validation_data=(x_test, y_test),
)

Epoch 1/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.8669 - loss: 0.4668 - val_accuracy: 0.9599 - val_loss: 0.1401
Epoch 2/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.9673 - loss: 0.1164 - val_accuracy: 0.9720 - val_loss: 0.0919
Epoch 3/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.9789 - loss: 0.0728 - val_accuracy: 0.9771 - val_loss: 0.0765
Epoch 4/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.9864 - loss: 0.0507 - val_accuracy: 0.9783 - val_loss: 0.0698
Epoch 5/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.9901 - loss: 0.0352 - val_accuracy: 0.9790 - val_loss: 0.0674
Epoch 6/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.9931 - loss: 0.0258 - val_accuracy: 0.9779 - val_loss: 0.0667
Epoch 7/10
[1m469/46

<keras.src.callbacks.history.History at 0x7ab6ff603cd0>

Second approach: Use a Convolutional Neural Network (CNN).

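`Conv2D` layers expect each sample to have the shape (height, width, channels), so the training and test images are reshaped to (28, 28, 1). MNIST images are grayscale, hence a single channel.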

In [48]:
# Give every image an explicit trailing channel axis: (samples, 28, 28, 1).
image_shape = (28, 28, 1)
x_train = x_train.reshape((-1,) + image_shape)
x_test = x_test.reshape((-1,) + image_shape)

In [49]:
# Confirm the training images now carry a channel axis; labels remain one-hot.
for caption, array in (("Image_shape: ", x_train), ("Label_shape: ", y_train)):
    print(caption, array.shape)

Image_shape:  (60000, 28, 28, 1)
Label_shape:  (60000, 10)


In [50]:
# Seed the Python, NumPy and backend random generators before any layer is
# created, so that weight initialization is repeatable across
# Restart Kernel -> Run All.
SEED = 42
keras.utils.set_random_seed(SEED)

input_shape = (28, 28, 1)  # height, width, channels

# Small CNN built with the functional API: two stacked 3x3 convolutions
# (32 then 64 filters), flattened into a 128-unit dense layer and a
# 10-way softmax output (one unit per digit).
inputs = keras.Input(shape=input_shape)
x = Conv2D(32, kernel_size=(3, 3), activation='relu')(inputs)
x = Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
outputs = Dense(10, activation='softmax')(x)

# NOTE: this rebinds `model`, replacing the MLP built earlier in the notebook.
model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

In [51]:
# Training configuration for the CNN: Adam optimizer, categorical cross-entropy
# on the one-hot labels, and accuracy reported as the monitoring metric.
model.compile(
    optimizer='adam',
    metrics=['accuracy'],
    loss=keras.losses.categorical_crossentropy,
)

In [52]:
# Train the CNN for 10 epochs with mini-batches of 128 samples.
# The returned History object is kept (instead of being echoed as the cell's
# output) so the per-epoch loss/accuracy values can be inspected or plotted later.
# NOTE(review): the test split is also used as validation data, so the val_*
# metrics are not an unbiased estimate if they are used to tune the model.
cnn_history = model.fit(
    x_train, y_train,
    batch_size=128,
    epochs=10,
    verbose=1,
    validation_data=(x_test, y_test),
)

Epoch 1/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 414ms/step - accuracy: 0.9081 - loss: 0.2969 - val_accuracy: 0.9847 - val_loss: 0.0469
Epoch 2/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 414ms/step - accuracy: 0.9872 - loss: 0.0421 - val_accuracy: 0.9871 - val_loss: 0.0380
Epoch 3/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 414ms/step - accuracy: 0.9934 - loss: 0.0212 - val_accuracy: 0.9856 - val_loss: 0.0466
Epoch 4/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 410ms/step - accuracy: 0.9956 - loss: 0.0139 - val_accuracy: 0.9891 - val_loss: 0.0364
Epoch 5/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 411ms/step - accuracy: 0.9976 - loss: 0.0083 - val_accuracy: 0.9874 - val_loss: 0.0473
Epoch 6/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 413ms/step - accuracy: 0.9980 - loss: 0.0061 - val_accuracy: 0.9881 - val_loss: 0.0441
Epoc

<keras.src.callbacks.history.History at 0x7ab6ff418820>

**Discussion and Further Analysis:**

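In this experiment the CNN clearly outperforms the vanilla MLP: in the epochs shown above it reaches roughly 98.8–98.9% validation accuracy, compared with roughly 97.8–97.9% for the MLP. The main reasons for the difference between the two architectures are: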

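*   MLPs operate on the flattened image, so every pixel is treated as an independent input and the spatial relationship between neighbouring pixels is lost.
*   CNNs learn kernels (filters) that slide over small local regions of the image, which lets them pick up local patterns such as edges and strokes wherever they appear in the image.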

Other architectures or approaches can also enhance the predictive capacity of a model, such as pre-training with metric learning before fine-tuning.