In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from mnist import MNIST
%matplotlib inline

  return f(*args, **kwds)


In [22]:
mndata = MNIST('MNIST')
X_test, Y_test = mndata.load_testing()
X_train, Y_train = mndata.load_training()

X_test = np.array(list(map(lambda x: np.array(x), X_test))).T
Y_test = np.array(Y_test).T

X_train = np.array(list(map(lambda x: np.array(x), X_train))).T
Y_train = np.array(Y_train).T

## Normalize data

In [23]:
def normalize_data(X_train, X_test):
    total = np.concatenate((X_train, X_test), axis=1)
    avg = np.mean(total, axis=1, keepdims=True)
    norm = np.linalg.norm(total, axis=1, keepdims=True)
    norm = np.array([np.apply_along_axis(lambda x: x if x != 0 else 1, arr=norm, axis=1)]).T
    X_train_normalized = (X_train - avg)/norm
    X_test_normalized = (X_test - avg)/norm
    return X_train_normalized, X_test_normalized

X_train_normalized, X_test_normalized = normalize_data(X_train, X_test)

In [24]:
X_train = X_test_normalized
X_test = X_test_normalized

## One-hot labels encoding

In [43]:
def one_hot(labels):
    onehot = np.zeros((CLASSES_NUMBER, len(labels)))
    for i in range(len(labels)):
        onehot[int(labels[i]),i] = 1
    return onehot

Y_train_onehot = one_hot(Y_train)
Y_test_onehot = one_hot(Y_test)

print(Y_train_onehot.shape)
print(Y_test_onehot.shape)

(10, 60000)
(10, 10000)


In [44]:
Y_train = Y_test_onehot
Y_test = Y_test_onehot

## Plot Image

In [14]:
IMAGE_WIDTH = 28
INPUTS_NUMBER = IMAGE_WIDTH*IMAGE_WIDTH
CLASSES_NUMBER = 10

In [6]:
def plot_image(record):
    plt.imshow(record.reshape(IMAGE_WIDTH,-1))

# NN Implementation

## Activation Functions

In [132]:
def sigmoid(z):
    res = 1./(1.+np.exp(-z))
    return (res, z)

def relu(z):
    return (np.maximum(0, z), z)
    
    
def sigmoid_backward(dA, cache):
    Z = cache
    A = sigmoid(Z)[0]
    dx = dA*(A*(1-A))
    return dx
    

def relu_backward(dA, cache):
    z = cache
    dz = np.array(dA, copy=True)
    dz[z <= 0] = 0
    return dz

## Initialization

In [7]:
def initialize_parameters(layers_dims):
    np.random.seed(3)
    parameters = []
    for i in range(1,len(layers_dims)):
        layer_params = {}
        layer_params['W'] = tf.Variable(np.random.randn(layers_dims[i], layers_dims[i-1])*0.01, name='W'+str(i))
        layer_params['b'] = tf.Variable(tf.zeros((layers_dims[i], 1), dtype=tf.float64), name='b'+str(i))
        parameters.append(layer_params)
    return parameters

In [8]:
initialize_parameters([4,5,6])

[{'W': <tf.Variable 'W1:0' shape=(5, 4) dtype=float64_ref>,
  'b': <tf.Variable 'b1:0' shape=(5, 1) dtype=float64_ref>},
 {'W': <tf.Variable 'W2:0' shape=(6, 5) dtype=float64_ref>,
  'b': <tf.Variable 'b2:0' shape=(6, 1) dtype=float64_ref>}]

## Compute Softmax Regression cost

In [9]:
def compute_cost(AL, Y):
    Y_ = tf.placeholder(tf.float64, [None,None])
    cost = -tf.reduce_sum(Y_*tf.log(AL))  
    with tf.Session() as ses:
        ses.run(tf.global_variables_initializer())
        return ses.run(cost, feed_dict={Y_: Y})

In [10]:
AL = tf.Variable([[1., 2., 3.]], dtype=tf.float64)

print(compute_cost(AL, [[1, 2, 3]]))

-4.68213122712422


## Forward Propagation

In [46]:
def linear_forward(A, W, b):
    Z = tf.matmul(W, A) + b
    cache = (A, W, b)
    return Z


def linear_activation_forward(A_prev, W, b):
    Z = linear_forward(A_prev, W, b)
    A = tf.nn.relu(Z)        
    return A


def model(dim, X):
    
    parameters = initialize_parameters(dim)
    
    L = len(parameters)
    A = X
    
    for i in range(L-1):
        W = parameters[i]['W']
        b = parameters[i]['b']
        A = linear_activation_forward(A, W, b)
    W = parameters[L-1]['W']
    b = parameters[L-1]['b']
    A = linear_forward(A, W, b)
    
    return tf.nn.softmax(A)

In [47]:
X = tf.placeholder(tf.float64, [2, None], name='X')
mod = model([2,3,4], X)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(mod, feed_dict={X: [[1],[2]]}))

[[1.]
 [1.]
 [1.]
 [1.]]


## Training

In [49]:
Y_ = tf.placeholder(tf.float64, [CLASSES_NUMBER, None])
X_ = tf.placeholder(tf.float64, [INPUTS_NUMBER, None])
cost_function = -tf.reduce_sum(Y_*tf.log(model([INPUTS_NUMBER, 6, CLASSES_NUMBER], X_)))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cost_function)

In [5]:
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    dW = np.dot(dZ, A_prev.T)/m
    db = np.sum(dZ, axis=1, keepdims=True)/m
    dA_prev = np.dot(W.T, dZ)
    
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    
    return dA_prev, dW, db


def linear_activation_backward(dA, cache, activation):

    linear_cache, activation_cache = cache
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    
    return dA_prev, dW, db


def backward_prop(dAL, caches):
    L = len(caches)
    dA = dAL
    grads = []
    dA, dW, db = linear_activation_backward(dA, caches[L-1], 'sigmoid')
    layer_grads = {'dW': dW, 'db': db}
    grads.append(layer_grads)
    for i in reversed(range(0, L-1)):
        dA, dW, db = linear_activation_backward(dA, caches[i], 'relu')
        layer_grads = {'dW': dW, 'db': db}
        grads.append(layer_grads)
    grads.reverse()
    return grads


def update_parameters(parameters, grads, learning_rate):
    for i in range(len(parameters)):
        parameters[i]['W'] = parameters[i]['W'] - grads[i]['dW']*learning_rate
        parameters[i]['b'] = parameters[i]['b'] - grads[i]['db']*learning_rate
    return parameters


def predict(parameters, X):
    probs = forward_prop(parameters=parameters, X=X)[0]
    return (probs > 0.5).astype(int)

 
def L_layer_model(X_train, Y_train, X_test, Y_test, layers_dims, num_iterations=2000, batch_size=512, learning_rate=0.05):
    np.random.seed(1)
    layers_dims.insert(0, X_train.shape[0])
    parameters = initialize_parameters(layers_dims=layers_dims)
    costs = []
    iterations = []
    for i in range(num_iterations):
        AL, caches = forward_prop(parameters=parameters, X=X_train)
        # Cost function. Inputs: "A2, Y, parameters". Outputs: "cost".
        cost = compute_cost(AL=AL, Y=Y_train)
        # Backpropagation. Inputs: "parameters, cache, X, Y". Outputs: "grads".
        dAL = -(np.divide(Y_train, AL) - np.divide(1 - Y_train, 1 - AL))
        grads = backward_prop(dAL, caches)
        
        # Gradient descent parameter update. Inputs: "parameters, grads". Outputs: "parameters".
        parameters = update_parameters(grads=grads, learning_rate=learning_rate, parameters=parameters)
        # Print the cost every 100 iterations
        if i % 100 == 0:
            iterations.append(i)
            costs.append(cost)
            print("Cost after iteration %i: %f" %(i, cost))
            
    Y_prediction_test = predict(parameters,X=X_test)
    Y_prediction_train = predict(parameters,X=X_train)
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))
    plt.plot(iterations, costs)
    plt.show()
    
    return parameters