In [156]:
# Imports
from keras.datasets import mnist
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [157]:
# Load the dataset and examine the splitting
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

print("X_train shape: ", X_train.shape) # --> 60000 28x28 images
print("X_test.shape: ", X_test.shape) # --> 10000 28x28x images

X_train shape:  (60000, 28, 28)
X_test.shape:  (10000, 28, 28)


In [158]:
# Activation functions
def sigmoid(X):
    return 1 / (1 + np.exp(-X))

def relu(X):
    return np.maximum(0, X)

def softmax(X):
    exps = np.exp(X - X.max())
    return exps / np.sum(exps, axis = 0)

In [159]:
def initialize_parameters(layer_dims):
    """
    Initializes the weights and biases for each layer
    
    Argument:
    layer_dims - list containing the dimensions of each layer
    
    Returns:
    parameters stored in a dictionary:
        Wl - weight matrix of layer l - shape (layer_dims[l], layer_dims[l - 1])
        bl - bias vector of layer l - shape (layer_dims[l], 1)    
    """
    
    parameters = {}
    for i in range(1, len(layer_dims)):
        parameters["W" + str(i)] = np.random.randn(layer_dims[i], layer_dims[i - 1])
        parameters["b" + str(i)] = np.zeros((layer_dims[i], 1))
        
    return parameters
    

In [160]:
def linear_pass(A, W, b):
    """
    The linear part of forward propagation
    
    Arguments:
    A - the activations from the previous layer
    W - weights matrix
    b - bias vector
    
    Returns:
    Z - pre-activation parameter
    cache - a tuple stored for more efficient backward pass
    """
    
    Z = np.dot(W, A) + b
    cache = (A, W, b)
    
    return Z, cache

In [161]:
def activation(A_prev, W, b, activation):
    """
    Applies the activation to the linear part during forward propagation
    
    Arguments:
    A_prev - activations from the previous layer
    W - weights matrix
    b - bias vector
    
    Returns:
    A - the activations
    cache - a tuple with linear_cache and activation_cache for more efficient backward pass
    """
    
    Z, linear_cache = linear_pass(A_prev, W, b)
    
    if activation == "sigmoid":
        A = sigmoid(Z)
    elif activation == "relu":
        A = relu(Z)
    elif activation == "softmax":
        A = softmax(Z)
    activation_cache = 6969
    
    cache = (linear_cache, activation_cache)
        
    return A, cache

In [162]:
def forward_propagation(X, parameters):
    """
    The forward pass in the form of [LINEAR -> RELU] * (L - 1) -> LINEAR -> SOFTMAX
    
    Arguments:
    X - the data input - column representation, shape (input_size, number_of_examples)
    parameters - the output of parameter initialization
    
    Returns:
    AL - the activations/probability vector
    caches - list of all caches
    """
    
    caches = []
    A = X
    L = len(parameters) // 2 # Number of layers
    
    # Build the [LINEAR -> RELU] blocks
    for i in range(1, L):
        A_prev = A
        A, cache = activation(A_prev, parameters["W" + str(i)], parameters["b" + str(i)], "relu")
        caches.append(cache)
        
    # Build the last LINEAR -> SOFTMAX block
    AL, cache = activation(A, parameters["W" + str(L)], parameters["b" + str(L)], "softmax")
    caches.append(cache)
    
    return AL, caches

In [184]:
def compute_cost(AL, Y):
    """
    Computes the cost function
    
    Arguments:
    AL - the activations corresponding to the predictions
    Y - the label vector
    
    Returns:
    cost - the cross-entropy cost
    """
    
    m = Y.shape[0]
    
    cost = (-1 / m) * np.sum(np.dot(Y, np.log(AL).T) + np.dot((1 - Y), np.log(1 - AL).T))
    cost = np.squeeze(cost)
    
    return cost

In [185]:
def backpropagation():
    print("ello mate u alright")

In [186]:
def model(X, Y, layer_dims, learning_rate, num_iterations):
    
    costs = []
    
    # 1 - initialize the weights and biases
    parameters = initialize_parameters(layer_dims)
    
    # 2 - do one forward pass
    AL, caches = forward_propagation(X, parameters)
    
    # 3 - compute cost
    cost = compute_cost(AL, Y_train)
    
    return AL, cost

In [187]:
# Main
X_train_flatten = X_train.reshape(X_train.shape[0], -1).T
print(X_train_flatten.shape)
layer_dims = [example.shape[0], 128, 64, 10]
prediction, cost = model(X_train_flatten, Y_train, layer_dims, 0.0075, 1000)
print(prediction)
print(cost)
    

(784, 60000)
Y:
60000
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
nan


  return exps / np.sum(exps, axis = 0)
  cost = (-1 / m) * np.sum(np.dot(Y, np.log(AL).T) + np.dot((1 - Y), np.log(1 - AL).T))
