In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import math

In [3]:
def load_data():
    """Load MNIST via Keras and prepare it for a fully connected network.

    Every image array is reshaped to (num_samples, -1), cast to float32 and
    divided by 255. Every label array is one-hot encoded over 10 classes.

    Returns:
        A pair ``((train_images, train_labels), (test_images, test_labels))``.
    """
    num_classes = 10

    def _flatten_and_scale(images):
        # Keep the sample axis and collapse all remaining axes into one.
        return images.reshape(images.shape[0], -1).astype('float32') / 255

    (raw_train_x, raw_train_y), (raw_test_x, raw_test_y) = tf.keras.datasets.mnist.load_data()

    train_split = (
        _flatten_and_scale(raw_train_x),
        tf.keras.utils.to_categorical(raw_train_y, num_classes),
    )
    test_split = (
        _flatten_and_scale(raw_test_x),
        tf.keras.utils.to_categorical(raw_test_y, num_classes),
    )
    return train_split, test_split

In [4]:
# Load the preprocessed splits once; the cells below read these names.
(
    (train_images, train_labels),
    (test_images, test_labels),
) = load_data()

In [5]:
def initialize_parameters(layer_dims):
    """Create the weight and bias arrays for a dense network.

    Args:
        layer_dims: Sequence of layer widths, input width first.

    Returns:
        Dict with keys ``'W1', 'b1', ..., 'Wn', 'bn'`` where ``Wk`` has shape
        ``(layer_dims[k], layer_dims[k-1])`` and ``bk`` has shape
        ``(layer_dims[k], 1)``. Weights are standard-normal draws from the
        global NumPy RNG scaled by ``sqrt(2 / fan_in)``; biases are zeros.
    """
    parameters = {}
    consecutive_widths = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (fan_in, fan_out) in enumerate(consecutive_widths, start=1):
        weights = np.random.randn(fan_out, fan_in)
        parameters[f'W{layer}'] = weights * np.sqrt(2. / fan_in)
        parameters[f'b{layer}'] = np.zeros((fan_out, 1))
    return parameters

In [6]:
def relu(Z):
    """Element-wise rectified linear unit: ``max(0, z)`` for every entry of Z."""
    rectified = np.maximum(0, Z)
    return rectified

def softmax(Z):
    """Column-wise softmax.

    The maximum of each column (axis 0) is subtracted before exponentiating
    so that large entries do not overflow; the result is unchanged by that
    shift. Each column of the returned array sums to 1.
    """
    exponentials = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    normalizer = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / normalizer

In [7]:
def forward_propagation(X, parameters):
    """Run the network forward: ReLU on every layer except the last, softmax on the last.

    Args:
        X: Input activations; multiplied on the left by ``W1``, so its first
            axis must match ``W1``'s second axis.
        parameters: Dict holding ``'W1', 'b1', ..., 'WL', 'bL'``.

    Returns:
        ``(AL, cache)`` where ``AL`` is the output of the final layer and
        ``cache`` maps ``'A<k>'`` / ``'Z<k>'`` to the activation and
        pre-activation of layer ``k`` for ``k = 1..L``. The input ``X`` itself
        is not stored in the cache.
    """
    num_layers = len(parameters) // 2  # one W and one b per layer
    cache = {}
    activation = X

    for layer in range(1, num_layers + 1):
        tag = str(layer)
        pre_activation = np.dot(parameters['W' + tag], activation) + parameters['b' + tag]
        activate = softmax if layer == num_layers else relu
        activation = activate(pre_activation)
        cache['A' + tag] = activation
        cache['Z' + tag] = pre_activation

    return activation, cache

In [8]:
def compute_cost(AL, Y, parameters, lambd):
    """Cross-entropy cost plus an L2 penalty on all weight matrices.

    Args:
        AL: Predicted probabilities, same shape as ``Y``.
        Y: Target array; ``Y.shape[1]`` is used as the number of examples.
        parameters: Dict holding ``'W1', 'b1', ...``; only the ``W`` entries
            contribute to the penalty.
        lambd: L2 regularisation strength.

    Returns:
        ``-sum(Y * log(AL + 1e-8)) / m + lambd / (2 * m) * sum_k ||W_k||^2``.
    """
    num_examples = Y.shape[1]
    num_layers = len(parameters) // 2

    # The 1e-8 offset keeps log() finite when a predicted probability is 0.
    log_likelihood = np.sum(Y * np.log(AL + 1e-8))
    data_cost = -log_likelihood / num_examples

    squared_weight_total = 0
    for layer in range(1, num_layers + 1):
        squared_weight_total += np.sum(np.square(parameters['W' + str(layer)]))
    penalty = (lambd / (2 * num_examples)) * squared_weight_total

    return data_cost + penalty

In [9]:
def backward_propagation(parameters, cache, X, Y, lambd):
    """Back-propagate through a ReLU network with a softmax output layer.

    Args:
        parameters: Dict holding ``'W1', 'b1', ..., 'WL', 'bL'``.
        cache: Dict holding ``'A<k>'`` and ``'Z<k>'`` for ``k = 1..L``, as
            produced by the forward pass. ``'A0'`` is not required.
        X: Network input; ``X.shape[1]`` is used as the number of examples.
        Y: Targets, same shape as the output activation ``cache['A<L>']``.
        lambd: L2 regularisation strength; adds ``lambd * W / m`` to each
            weight gradient.

    Returns:
        Dict with ``'dW<k>'``, ``'db<k>'`` and ``'dZ<k>'`` for every layer.
    """
    grads = {}
    L = len(parameters) // 2
    m = X.shape[1]

    # Softmax combined with cross-entropy: the gradient w.r.t. ZL is AL - Y.
    dZ = cache['A' + str(L)] - Y

    for l in range(L, 0, -1):
        if l < L:
            # Pull the gradient back through layer l+1, then through the ReLU
            # of layer l (derivative is 1 where Z > 0, else 0).
            dA = np.dot(parameters['W' + str(l + 1)].T, dZ)
            dZ = dA * (cache['Z' + str(l)] > 0)

        # The input to layer 1 is X itself; the forward cache never stores an
        # 'A0' entry, so looking it up would fail for single-layer networks.
        A_prev = X if l == 1 else cache['A' + str(l - 1)]
        W = parameters['W' + str(l)]

        grads['dW' + str(l)] = (np.dot(dZ, A_prev.T) + lambd * W) / m
        grads['db' + str(l)] = np.sum(dZ, axis=1, keepdims=True) / m
        grads['dZ' + str(l)] = dZ

    return grads

In [10]:
def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    The arrays stored in ``parameters`` are modified in place (``-=``), and
    the same dict object is returned.

    Args:
        parameters: Dict holding ``'W1', 'b1', ..., 'WL', 'bL'``.
        grads: Dict holding the matching ``'dW<k>'`` and ``'db<k>'`` entries.
        learning_rate: Step size multiplying each gradient.
    """
    num_layers = len(parameters) // 2
    for layer in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            name = prefix + str(layer)
            parameters[name] -= learning_rate * grads['d' + name]
    return parameters

In [11]:
def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    """Shuffle the examples and cut them into mini-batches.

    Examples are stored along axis 1 (columns) of both ``X`` and ``Y``; the
    same permutation is applied to both so pairs stay aligned.

    The shuffle uses a private ``RandomState(seed)`` rather than reseeding
    NumPy's global generator, so calling this function does not disturb other
    random draws in the session. For a given ``seed`` the permutation is the
    same one the global generator would produce right after
    ``np.random.seed(seed)``.

    Args:
        X: Array with one example per column.
        Y: Array with the matching targets, one per column.
        mini_batch_size: Number of columns per batch; must be positive.
        seed: Seed for the shuffle.

    Returns:
        List of ``(mini_batch_X, mini_batch_Y)`` tuples. Every batch has
        ``mini_batch_size`` columns except possibly the last, which holds the
        remainder. Empty when ``X`` has no columns.

    Raises:
        ValueError: If ``mini_batch_size`` is not positive.
    """
    if mini_batch_size <= 0:
        raise ValueError("mini_batch_size must be a positive integer")

    m = X.shape[1]
    rng = np.random.RandomState(seed)
    permutation = rng.permutation(m)
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]

    # A strided range yields every full batch and, when m is not a multiple of
    # the batch size, one final shorter batch (slicing clamps at the end).
    return [
        (
            shuffled_X[:, start:start + mini_batch_size],
            shuffled_Y[:, start:start + mini_batch_size],
        )
        for start in range(0, m, mini_batch_size)
    ]

In [12]:
def model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, lambd=0.7, mini_batch_size=64, seed=0):
    """Train the network with mini-batch gradient descent and L2 regularisation.

    Args:
        X: Inputs with one example per row; transposed before batching because
            the propagation helpers work on one example per column.
        Y: Targets with one example per row.
        layers_dims: Layer widths passed to ``initialize_parameters``.
        learning_rate: Step size for ``update_parameters``.
        num_iterations: Number of full passes over all mini-batches.
        lambd: L2 regularisation strength.
        mini_batch_size: Number of examples per mini-batch.
        seed: Base seed for shuffling. Pass ``i`` uses ``seed + i`` so that
            every pass sees a different but reproducible shuffle. ``None`` is
            forwarded unchanged.

    Returns:
        ``(parameters, costs)`` where ``costs`` holds, for every 100th pass,
        the cost of the last mini-batch processed in that pass.
    """
    parameters = initialize_parameters(layers_dims)
    costs = []

    # The transposes do not change between passes; take them once.
    X_cols, Y_cols = X.T, Y.T

    for i in range(num_iterations):
        # Reusing the same seed every pass would reproduce the exact same
        # shuffle and batch composition each time, defeating the reshuffle.
        epoch_seed = None if seed is None else seed + i
        minibatches = random_mini_batches(X_cols, Y_cols, mini_batch_size, epoch_seed)

        # Stays None only if there were no mini-batches to process.
        cost = None
        for minibatch_X, minibatch_Y in minibatches:
            AL, cache = forward_propagation(minibatch_X, parameters)
            cost = compute_cost(AL, minibatch_Y, parameters, lambd)
            grads = backward_propagation(parameters, cache, minibatch_X, minibatch_Y, lambd)
            parameters = update_parameters(parameters, grads, learning_rate)

        if i % 100 == 0:
            print(f"Cost after iteration {i}: {cost}")
            costs.append(cost)

    return parameters, costs

In [13]:
# Layer widths: input width from the flattened images, five hidden layers, 10 outputs.
layers_dims = [train_images.shape[1], 128, 128, 128, 64, 64, 10]

In [14]:
# Each iteration is a full pass over every mini-batch, so this cell is long-running.
parameters, costs = model(
    train_images,
    train_labels,
    layers_dims,
    learning_rate=0.0075,
    num_iterations=3000,
    lambd=0.7,
    mini_batch_size=64,
    seed=0,
)

Cost after iteration 0: 10.291111847111548


KeyboardInterrupt: 