# Imports

In [1]:
import tensorflow as tf
print(tf.__version__)

import os
import time

import numpy as np # linear algebra
import matplotlib.pyplot as plt


# Dataset geometry shared by the cells below
img_size = 28    # images are reshaped to rows of img_size * img_size values
n_classes = 10   # number of label classes

# Length of one flattened MNIST image: 28 * 28 = 784
input_size = 28 * 28

# Size of the network output: one unit per digit label (0-9)
output_size = 10

2.17.0


In [2]:
#------------------------------------------------------------
# Load MNIST through tf.keras.datasets.
# (The commented-out tensorflow.examples.tutorials.mnist loader is not kept
#  here; only the tf.keras loader below is executed.)
#------------------------------------------------------------
print('\nLoading MNIST')
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Flatten every image to one row of img_size*img_size values, cast to float32
# and divide by 255.
x_train = np.reshape(x_train, [-1, img_size*img_size])
x_train = x_train.astype(np.float32)/255

x_test = np.reshape(x_test, [-1, img_size*img_size])
x_test = x_test.astype(np.float32)/255

# One-hot encode the labels. The class count is passed explicitly so that both
# label arrays get n_classes columns instead of a width inferred from the data.
to_categorical = tf.keras.utils.to_categorical
y_train = to_categorical(y_train, num_classes=n_classes)
y_test  = to_categorical(y_test, num_classes=n_classes)

print('\nSpliting data')

# Shuffle with a fixed seed so the train/validation split is identical on
# every "Restart & Run All".
split_seed = 42
ind = np.random.default_rng(split_seed).permutation(x_train.shape[0])
x_train, y_train = x_train[ind], y_train[ind]

# 10% for validation
# (the variable name keeps its original spelling because it is a module-level
#  name that other cells may read)
validatationPct = 0.1
n = int(x_train.shape[0] * (1-validatationPct))

# Rows [n:] become the validation set, rows [:n] stay in the training set.
x_valid = x_train[n:]
x_train = x_train[:n]
#
y_valid = y_train[n:]
y_train = y_train[:n]

train_num_examples = x_train.shape[0]
valid_num_examples = x_valid.shape[0]
test_num_examples  = x_test.shape[0]

print(train_num_examples, valid_num_examples, test_num_examples)


Loading MNIST

Spliting data
54000 6000 10000


# Parameters

In [3]:
# Hyper-parameters and architecture settings
#--------------------------------

# Optimisation
learning_rate = 0.05                   # learning rate
training_epochs, batch_size = 100, 50  # passes over the data, examples per minibatch
# alternative setting (disabled): training_epochs = 1000, batch_size = 30

# Progress is reported every display_step epochs
display_step = 10

# Network architecture
# -----------------------------------------
# Two hidden layers: neurons in layer 1, neurons in layer 2
#------------------------------------------
n_hidden_1, n_hidden_2 = 200, 300

# Flattened MNIST image length: 28 * 28 = 784
input_size = 28 * 28

# One output per digit label (0-9)
output_size = 10

# Define Layer Function

In [4]:
def layer(x, weight_shape, bias_shape):
    """
    Build one fully connected ReLU layer and apply it to ``x``.

    New weight and bias variables are created (and printed) on every call.

    Input:
        - x: input of the layer (left operand of the matrix product)
        - weight_shape: shape of the weight matrix; its first entry is used
          as the fan-in when scaling the initial weights
        - bias_shape: shape of the bias vector
    Output:
        - relu(x @ W + b)
    """
    # Scaling from He et al. for ReLU layers: stddev = sqrt(2 / fan_in)
    fan_in = weight_shape[0]
    w_std = (2.0 / fan_in)**0.5

    # Initializers: normally distributed weights, zero biases
    make_weights = tf.random_normal_initializer(stddev=w_std)
    make_bias = tf.zeros_initializer()

    # Trainable parameters of this layer
    W = tf.Variable(make_weights(shape=weight_shape), trainable=True, name="W")
    b = tf.Variable(make_bias(shape=bias_shape), trainable=True, name="b")

    print('Weight Matrix:', W)
    print('Bias Vector:', b)

    # Affine transform followed by the ReLU non-linearity
    pre_activation = tf.matmul(x, W) + b
    return tf.nn.relu(pre_activation)


# Define Inference Function

In [5]:
def inference(x, input_size, n_hidden_1, n_hidden_2, output_size):
    """
    Pass ``x`` through two ReLU hidden layers and a linear output layer.

    Three new ``tf.keras.layers.Dense`` objects are constructed on every call.
    ``input_size`` is accepted but not used by the body.

    Input:
        - x: a batch of input features
        - input_size: unused
        - n_hidden_1: number of units in the first hidden layer
        - n_hidden_2: number of units in the second hidden layer
        - output_size: number of units in the output layer
    Output:
        - the output layer's result; no activation is applied to it (logits)
    """
    # First hidden layer
    first_hidden = tf.keras.layers.Dense(n_hidden_1, activation='relu', name="hidden_layer_1")
    activations = first_hidden(x)

    # Second hidden layer
    second_hidden = tf.keras.layers.Dense(n_hidden_2, activation='relu', name="hidden_layer_2")
    activations = second_hidden(activations)

    # Linear output layer
    logits_layer = tf.keras.layers.Dense(output_size, name="output")
    return logits_layer(activations)

# Define Loss Function

## Define First Loss 

In [6]:
def loss_1(output, y):
    """
    Computes the average cross-entropy per data sample over a minibatch,
    written out by hand from class probabilities.

    ``output`` must already contain probabilities (for example softmax
    outputs), NOT raw logits: the function takes their logarithm directly.
    For raw logits use ``loss_2`` instead.

    Values are floored at a small positive constant before the logarithm.
    Without the floor, a probability of exactly 0 gives log(0) = -inf, and the
    product 0 * -inf for the non-target classes turns the whole loss into NaN.

    Input:
        - output: predicted class probabilities, one row per sample
        - y: true labels of the sample batch (one-hot encoded)
    Output:
        - loss: scalar tensor representing the loss for the batch
    """
    # Smallest value that is ever passed to log
    min_probability = 1e-7
    log_output = tf.math.log(tf.maximum(output, min_probability))

    # Element-wise product of the one-hot labels and the log-probabilities,
    # summed (and negated) across the class dimension (axis 1)
    xentropy = -tf.reduce_sum(y * log_output, axis=1)

    # Mean loss across the batch
    return tf.reduce_mean(xentropy)

## Define Second Loss

In [7]:
def loss_2(output, y):
    """
    Mean softmax cross-entropy between logits and labels.

    Input:
        - output: logits produced by the network (shape: batch_size * num_of_classes)
        - y: true labels for the sample batch (shape: batch_size * num_of_classes)
    Output:
        - the scalar loss value for the batch
    """
    # One cross-entropy value per sample; the softmax is applied inside the op
    per_example = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=output)

    # Average over the batch
    return tf.reduce_mean(per_example)

# Define the optimizer and training target

In [8]:
def training(cost, global_step, var_list=None, tape=None, optimizer=None):
    """
    Performs one optimisation step, logs the cost and advances the step counter.

    Gradients are computed with a ``tf.GradientTape`` and applied with
    ``optimizer.apply_gradients``. (The earlier body called
    ``tf.trainable_variables()``, which is not available under the TensorFlow 2
    ``tf`` namespace, and handed an already computed loss tensor to
    ``minimize``; a plain tensor cannot be differentiated in eager execution
    without the tape that recorded it.)

    Input:
        - cost: either a zero-argument callable that returns the scalar loss
          (it is then evaluated under a fresh tape), or the loss value that was
          computed while ``tape`` was recording
        - global_step: the number of batches seen so far; used as the summary
          step and incremented through ``assign_add(1)`` (e.g. a tf.Variable)
        - var_list: optional iterable of variables to update; defaults to the
          variables watched by the tape
        - tape: optional ``tf.GradientTape`` that recorded ``cost``; required
          when ``cost`` is not callable, ignored when it is
        - optimizer: optional optimizer used to apply the gradients; defaults
          to a new ``tf.optimizers.SGD(learning_rate)`` created for this call
    Output:
        - train_op: whatever ``optimizer.apply_gradients`` returns
    Raises:
        - ValueError: if ``cost`` is not callable and no ``tape`` is given
    """
    if callable(cost):
        # Evaluate the loss under our own tape so it can be differentiated
        with tf.GradientTape() as tape:
            cost_value = cost()
    elif tape is None:
        raise ValueError(
            "training() needs either a callable `cost` or the tf.GradientTape "
            "that recorded the computation of `cost` (pass it as `tape=`)."
        )
    else:
        cost_value = cost

    # Define a scalar summary for the cost (loss) value
    with tf.summary.create_file_writer('./logs/training').as_default():
        tf.summary.scalar("cost", cost_value, step=global_step)

    # Plain SGD with the notebook-level learning rate unless the caller
    # supplies an optimizer of its own
    if optimizer is None:
        optimizer = tf.optimizers.SGD(learning_rate)

    # Materialise the variable list once: it is iterated twice below
    variables = list(tape.watched_variables() if var_list is None else var_list)

    # Compute the gradients and apply the update
    gradients = tape.gradient(cost_value, variables)
    train_op = optimizer.apply_gradients(zip(gradients, variables))

    # Count this batch
    global_step.assign_add(1)

    return train_op

# Define evaluation method

In [9]:
def evaluate(output, y, step=0):
    """
    Computes classification accuracy and logs the corresponding error.

    Input:
        - output: prediction scores of the network, one row per sample
        - y: true labels, one row per sample (one-hot encoded)
        - step: step under which the error is written to the summary log.
          Defaults to 0, the value that used to be hard-coded; pass the epoch
          or batch counter so successive evaluations are logged at different
          steps instead of all at step 0.
    Output:
        - accuracy: fraction of samples whose arg-max prediction matches the
          arg-max of the label (scalar tensor between 0 and 1)
    """
    # Predicted and true class indices (arg-max along axis 1)
    predicted_class = tf.argmax(output, 1)
    true_class = tf.argmax(y, 1)

    # Accuracy is the mean of the 0/1 "prediction is correct" indicator
    correct_prediction = tf.equal(predicted_class, true_class)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Log the error (1 - accuracy) under ./logs/validation
    with tf.summary.create_file_writer('./logs/validation').as_default():
        tf.summary.scalar("validation_error", 1.0 - accuracy, step=step)

    return accuracy

# Main function

In [11]:
if __name__ == '__main__':

    start_time = time.time()

    # Make sure the log directory exists (no error if it is already there)
    os.makedirs('./logs/', exist_ok=True)

    # Run configuration. These assignments rebind the module-level names,
    # so they take precedence over values set in earlier cells.
    input_size = 784
    output_size = 10
    batch_size = 128
    training_epochs = 20
    display_step = 1
    n_hidden_1 = 200
    n_hidden_2 = 300

    class MyModel(tf.keras.Model):
        """Two ReLU hidden layers followed by a linear output layer (logits)."""

        def __init__(self, input_size, n_hidden_1, n_hidden_2, output_size):
            # input_size is accepted for symmetry with the other arguments
            # but is not used when creating the layers.
            super().__init__()
            self.hidden_1 = tf.keras.layers.Dense(n_hidden_1, activation='relu')
            self.hidden_2 = tf.keras.layers.Dense(n_hidden_2, activation='relu')
            self.output_layer = tf.keras.layers.Dense(output_size)

        def call(self, inputs):
            """Forward pass: inputs -> hidden_1 -> hidden_2 -> output_layer."""
            x = self.hidden_1(inputs)
            x = self.hidden_2(x)
            return self.output_layer(x)

    # Instantiate the model with the architecture parameters
    model = MyModel(input_size, n_hidden_1, n_hidden_2, output_size)

    # Adam with its default settings
    optimizer = tf.optimizers.Adam()

    # Checkpoints of model + optimizer state, keeping the five most recent
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, './logs/multi_layer', max_to_keep=5)

    # Training loop
    for epoch in range(training_epochs):

        avg_cost = 0.

        # Step through the training set in slices of batch_size; the last
        # slice may be shorter when the set size is not a multiple of it.
        for start in range(0, train_num_examples, batch_size):
            end = min(train_num_examples, start + batch_size)
            minibatch_x = x_train[start:end]
            minibatch_y = y_train[start:end]

            # Record the forward pass so the loss can be differentiated
            with tf.GradientTape() as tape:
                output = model(minibatch_x)
                cost = loss_2(output, minibatch_y)

            # Compute gradients and apply them
            gradients = tape.gradient(cost, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            # cost is the mean over this batch; weight it by the batch's share
            # of the training set so avg_cost is the per-example mean for the
            # epoch even when the final batch is smaller than the others.
            avg_cost += cost.numpy() * (end - start) / train_num_examples

        if epoch % display_step == 0:
            # Evaluate on validation data
            accuracy = evaluate(model(x_valid), y_valid)
            print(f"Epoch: {epoch:03d}, cost={avg_cost:.7f}, Validation Error={1-accuracy:.7f}")

            # Save checkpoint
            checkpoint_manager.save()

    # Final test accuracy, printed as a plain number rather than a tensor repr
    accuracy = evaluate(model(x_test), y_test)
    print("Test Accuracy:", float(accuracy))

    elapsed_time = time.time() - start_time
    print(f'Execution time (seconds) was {elapsed_time:.3f}')
            

Epoch: 000, cost=0.2860187, Validation Error=0.0448334
Epoch: 001, cost=0.1069164, Validation Error=0.0340000
Epoch: 002, cost=0.0695702, Validation Error=0.0333334
Epoch: 003, cost=0.0484154, Validation Error=0.0308333
Epoch: 004, cost=0.0346433, Validation Error=0.0265000
Epoch: 005, cost=0.0270449, Validation Error=0.0271667
Epoch: 006, cost=0.0248251, Validation Error=0.0263333
Epoch: 007, cost=0.0223597, Validation Error=0.0283333
Epoch: 008, cost=0.0197223, Validation Error=0.0246667
Epoch: 009, cost=0.0152146, Validation Error=0.0235000
Epoch: 010, cost=0.0099945, Validation Error=0.0246667
Epoch: 011, cost=0.0103586, Validation Error=0.0236667
Epoch: 012, cost=0.0111240, Validation Error=0.0263333
Epoch: 013, cost=0.0107814, Validation Error=0.0243334
Epoch: 014, cost=0.0102970, Validation Error=0.0253333
Epoch: 015, cost=0.0098749, Validation Error=0.0253333
Epoch: 016, cost=0.0078512, Validation Error=0.0251667
Epoch: 017, cost=0.0058092, Validation Error=0.0250000
Epoch: 018