[View in Colaboratory](https://colab.research.google.com/github/Jpeiii/DeepLearning.ai/blob/master/Programming_Assignment_2_Regularization.ipynb)

**Problem Statement:**

You have just been hired as an AI expert by the French Football Corporation. They would like you to recommend positions where France's goalkeeper should kick the ball so that the French team's players can then hit it with their heads.

**Your goal:**

Use a deep learning model to find the positions on the field where the goalkeeper should kick the ball.



**Regularization:**



1.   Non-regularized model
2.   L2 Regularization
3.   Dropout











**1. Non-regularized model**

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.io

def load_2D_dataset(path='C:/Users/DELL/Desktop/A.I/Deep learning.data/Assignment 7'):
    """Load the 2-D dataset from a MATLAB file and scatter-plot the training set.

    Arguments:
    path -- location of the MATLAB file passed to scipy.io.loadmat; the default is
            the path that used to be hard-coded, so existing calls keep working

    Returns:
    train_X, train_Y, test_X, test_Y -- the 'X', 'y', 'Xval' and 'yval' entries of
    the file, each transposed
    """
    data = scipy.io.loadmat(path)
    train_X = data['X'].T
    train_Y = data['y'].T
    test_X = data['Xval'].T
    test_Y = data['yval'].T

    # Colour every training point by its label.
    plt.scatter(train_X[0, :], train_X[1, :], c=np.squeeze(train_Y), s=40, cmap=plt.cm.Spectral)

    return train_X, train_Y, test_X, test_Y

# Load the train/test arrays once at module level; the loader also draws the training scatter plot.
train_X, train_Y, test_X, test_Y = load_2D_dataset()


def sigmoid(Z):
    """Return the element-wise logistic function 1 / (1 + exp(-Z))."""
    denominator = 1 + np.exp(-Z)
    return 1 / denominator

def relu(Z):
    """Return the element-wise rectified linear unit max(0, Z)."""
    return np.maximum(0, Z)

def initialize_parameters_random(layer_dims):
    """Create the weights and biases of a fully connected network.

    Arguments:
    layer_dims -- sequence with the size of every layer, input layer first

    Returns:
    parameters -- dict with 'W1', 'b1', ..., 'WL', 'bL' where
                  Wl has shape (layer_dims[l], layer_dims[l-1]) and
                  bl has shape (layer_dims[l], 1)
    """
    np.random.seed(3)  # fixed seed so every run starts from the same weights
    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        # Standard-normal weights scaled by 1/sqrt(fan_in); biases start at zero.
        W = np.random.randn(layer_dims[l], layer_dims[l - 1]) / np.sqrt(layer_dims[l - 1])
        b = np.zeros((layer_dims[l], 1))

        # Compare against real tuples: `assert (x == a, b)` asserts a non-empty
        # tuple and can never fail.
        assert W.shape == (layer_dims[l], layer_dims[l - 1])
        assert b.shape == (layer_dims[l], 1)

        parameters['W' + str(l)] = W
        parameters['b' + str(l)] = b

    return parameters


def compute_cost(A3, Y):
    """Compute the mean binary cross-entropy between A3 and Y.

    Arguments:
    A3 -- output activations of the network
    Y -- true labels, same shape as A3; Y.shape[1] is used as the number of examples

    Returns:
    cost -- sum of the per-example losses divided by the number of examples;
            NaN entries are skipped by np.nansum
    """
    m = Y.shape[1]
    positive_term = -np.log(A3) * Y
    negative_term = -np.log(1 - A3) * (1 - Y)
    total = np.nansum(positive_term + negative_term)

    return (1 / m) * total

def forward_propagation(X, parameters):
    """Run the forward pass LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.

    Arguments:
    X -- input data, one example per column
    parameters -- dict holding 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'

    Returns:
    A3 -- output of the final sigmoid
    cache -- tuple (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) for the backward pass
    """
    W1, b1 = parameters['W1'], parameters['b1']
    W2, b2 = parameters['W2'], parameters['b2']
    W3, b3 = parameters['W3'], parameters['b3']

    # First hidden layer
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)

    # Second hidden layer
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)

    # Output layer
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    return A3, (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)

def backward_propagation(X, Y, cache):
    """Compute the gradients of the cross-entropy cost for the three-layer network.

    Arguments:
    X -- input data, one example per column
    Y -- true labels
    cache -- tuple (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) from forward_propagation

    Returns:
    gradients -- dict with the gradient of the cost with respect to each
                 pre-activation ('dz'), activation ('da'), weight ('dW') and bias ('db')
    """
    n_examples = X.shape[1]
    (_, A1, _, _, _, A2, W2, _, _, A3, W3, _) = cache

    def _bias_grad(dz):
        # Sum over the example axis, keeping a column vector.
        return np.sum(dz, axis=1, keepdims=True)

    # Output layer: the 1/m factor is folded into dz3 and therefore reaches every
    # gradient that is derived from it.
    dz3 = 1 / n_examples * (A3 - Y)
    dW3 = np.dot(dz3, A2.T)
    db3 = _bias_grad(dz3)

    # Second hidden layer: the ReLU derivative is 1 where the activation is positive.
    da2 = np.dot(W3.T, dz3)
    dz2 = da2 * (A2 > 0).astype(np.int64)
    dW2 = np.dot(dz2, A1.T)
    db2 = _bias_grad(dz2)

    # First hidden layer
    da1 = np.dot(W2.T, dz2)
    dz1 = da1 * (A1 > 0).astype(np.int64)
    dW1 = np.dot(dz1, X.T)
    db1 = _bias_grad(dz1)

    return {
        'dz3': dz3, 'dW3': dW3, 'db3': db3,
        'da2': da2, 'dz2': dz2, 'dW2': dW2, 'db2': db2,
        'da1': da1, 'dz1': dz1, 'dW1': dW1, 'db1': db1,
    }

def update_parameters(parameters, gradients, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    Arguments:
    parameters -- dict with 'W1', 'b1', ..., 'WL', 'bL'
    gradients -- dict with the matching 'dW1', 'db1', ..., 'dWL', 'dbL'
    learning_rate -- step size

    Returns:
    parameters -- the same dict, with each entry replaced by its updated value
    """
    n_layers = len(parameters) // 2  # one W and one b per layer

    for layer in range(1, n_layers + 1):
        for prefix in ('W', 'b'):
            key = prefix + str(layer)
            parameters[key] = parameters[key] - learning_rate * gradients['d' + key]

    return parameters

def predict(X,Y, parameters):
    """Predict 0/1 labels with the trained network and print the accuracy.

    Arguments:
    X -- input data, one example per column
    Y -- true labels; row 0 is compared with the predictions
    parameters -- trained parameters accepted by forward_propagation

    Returns:
    p -- integer array of shape (1, number of examples) with the predicted labels
    """
    A3, _ = forward_propagation(X, parameters)

    # Threshold the first output row at 0.5. The builtin `int` replaces `np.int`,
    # which was removed from NumPy in 1.24 and raised AttributeError here.
    p = (A3[:1, :] > 0.5).astype(int)

    print('Accuracy:' + str(np.mean((p[0,:] == Y[0,:]))))

    return p

def plot_decision_boundary(model, X, y):
    """Draw the decision regions of `model` together with the data points.

    Arguments:
    model -- callable taking an array with one grid point per row and returning one
             prediction per grid point
    X -- data whose first two rows are the plotted coordinates
    y -- labels used to colour the points
    """
    step = 0.01   # spacing of the evaluation grid
    margin = 1    # padding added around the data range

    x1_lo, x1_hi = X[0, :].min() - margin, X[0, :].max() + margin
    x2_lo, x2_hi = X[1, :].min() - margin, X[1, :].max() + margin

    # Dense grid covering the padded range
    grid_x1, grid_x2 = np.meshgrid(np.arange(x1_lo, x1_hi, step),
                                   np.arange(x2_lo, x2_hi, step))

    # Evaluate the model on every grid point, then fold back into the grid shape
    grid_points = np.c_[grid_x1.ravel(), grid_x2.ravel()]
    regions = model(grid_points).reshape(grid_x1.shape)

    # Filled contour for the regions, scatter for the examples
    plt.contourf(grid_x1, grid_x2, regions, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=np.squeeze(y), cmap=plt.cm.Spectral)
    plt.show()

def predict_decision(parameters, X):
    """Return a boolean array that is True where the network output exceeds 0.5.

    Used for plotting the decision boundary.
    """
    A3, _ = forward_propagation(X, parameters)
    return A3 > 0.5



def model(X, Y, learning_rate=0.3, num_iterations=30000, print_cost=True):
    """Train the three-layer network without regularization and plot the cost curve.

    Arguments:
    X -- input data, one example per column
    Y -- true labels
    learning_rate -- gradient-descent step size
    num_iterations -- number of gradient-descent iterations
    print_cost -- if True, print the cost every 10000 iterations and record it
                  every 1000 iterations for the plot

    Returns:
    parameters -- the trained parameters
    """
    costs = []
    layer_dims = [X.shape[0], 20, 3, 1]
    parameters = initialize_parameters_random(layer_dims)

    for i in range(0, num_iterations):

        A3, cache = forward_propagation(X, parameters)
        cost = compute_cost(A3, Y)
        gradients = backward_propagation(X, Y, cache)
        parameters = update_parameters(parameters, gradients, learning_rate)

        if print_cost and i % 10000 == 0:
            print("Cost after iteration {}: {}".format(i, cost))
        # Record the cost every 1000 iterations so the curve has enough points and
        # the x-axis label below describes it correctly (it used to be recorded only
        # every 10000 iterations while the label said "per hundreds").
        if print_cost and i % 1000 == 0:
            costs.append(cost)

    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (x1,000)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters


# Train the non-regularized network with model()'s default hyper-parameters.
parameters = model(train_X, train_Y)

# predict() prints the accuracy itself and returns the 0/1 predictions.
print ("On the training set:")
predictions_train = predict(train_X, train_Y, parameters)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)

# Set the title and axis limits on the current axes, then draw the decision boundary.
# The plotting helper passes one grid point per row, while predict_decision expects
# one example per column, hence the x.T.
plt.title("Model without regularization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
plot_decision_boundary(lambda x: predict_decision(parameters, x.T), train_X, train_Y)
plt.show()


**2. L2 Regularization**

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.io

def load_2D_dataset(path='C:/Users/DELL/Desktop/A.I/Deep learning.data/Assignment 7'):
    """Load the 2-D dataset from a MATLAB file (no plotting in this section).

    Arguments:
    path -- location of the MATLAB file passed to scipy.io.loadmat; the default is
            the path that used to be hard-coded, so existing calls keep working

    Returns:
    train_X, train_Y, test_X, test_Y -- the 'X', 'y', 'Xval' and 'yval' entries of
    the file, each transposed
    """
    data = scipy.io.loadmat(path)
    train_X = data['X'].T
    train_Y = data['y'].T
    test_X = data['Xval'].T
    test_Y = data['yval'].T

    return train_X, train_Y, test_X, test_Y

# Reload the dataset for this section; this version of the loader does not draw the scatter plot.
train_X, train_Y, test_X, test_Y = load_2D_dataset()

def sigmoid(x):
    """Apply the logistic function 1 / (1 + exp(-x)) element-wise."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def relu(x):
    """Apply the rectified linear unit max(0, x) element-wise."""
    return np.maximum(0, x)

def initialize_parameters(layer_dims):
    """Create the weights and biases of a fully connected network.

    Arguments:
    layer_dims -- sequence with the size of every layer, input layer first

    Returns:
    parameters -- dict with 'W1', 'b1', ..., 'WL', 'bL' where
                  Wl has shape (layer_dims[l], layer_dims[l-1]) and
                  bl has shape (layer_dims[l], 1)
    """
    np.random.seed(3)  # fixed seed so every run starts from the same weights
    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        # Draw from the standard normal (randn), as the other two sections do.
        # np.random.rand is uniform on [0, 1) and made every initial weight positive.
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

    return parameters

def compute_cost(A3, Y):
    """Compute the mean binary cross-entropy between A3 and Y.

    Arguments:
    A3 -- output activations of the network
    Y -- true labels, same shape as A3; Y.shape[1] is used as the number of examples

    Returns:
    cost -- sum of the per-example losses divided by the number of examples;
            NaN entries are skipped by np.nansum
    """
    m = Y.shape[1]
    loss_when_one = -np.log(A3) * Y
    loss_when_zero = -np.log(1 - A3) * (1 - Y)

    return (1 / m) * np.nansum(loss_when_one + loss_when_zero)

def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """Compute the cross-entropy cost plus the L2 penalty on the three weight matrices.

    Arguments:
    A3 -- output activations of the network
    Y -- true labels
    parameters -- dict holding 'W1', 'W2' and 'W3'
    lambd -- regularization strength

    Returns:
    cost -- cross-entropy cost + (lambd / (2 * m)) * sum of all squared weights
    """
    m = Y.shape[1]

    # Sum of squared entries over the weight matrices, layer by layer
    squared_weights = sum(
        np.sum(np.square(parameters[name])) for name in ('W1', 'W2', 'W3')
    )
    l2_penalty = (1 / m * lambd / 2) * squared_weights

    return compute_cost(A3, Y) + l2_penalty

def forward_propagation(X, parameters):
    """Run the forward pass LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.

    Arguments:
    X -- input data, one example per column
    parameters -- dict holding 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'

    Returns:
    A3 -- output of the final sigmoid
    cache -- tuple (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) for the backward pass
    """
    W1, b1, W2, b2, W3, b3 = (
        parameters[key] for key in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3')
    )

    # Two ReLU hidden layers followed by a sigmoid output layer
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    return A3, (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)

def backward_propagation_with_regularization(X, Y, cache, lambd):
    """Compute the gradients of the L2-regularized cost for the three-layer network.

    Arguments:
    X -- input data, one example per column
    Y -- true labels
    cache -- tuple (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) from forward_propagation
    lambd -- regularization strength

    Returns:
    gradients -- dict with keys 'dz3', 'dw3', 'db3', 'dz2', 'dw2', 'db2', 'dz1', 'db1', 'dw1';
                 every weight gradient includes the term (lambd / m) * W
    """
    m = X.shape[1]
    (_, A1, W1, _, _, A2, W2, _, _, A3, W3, _) = cache

    inv_m = 1 / m
    decay = lambd / m  # factor of the L2 term added to each weight gradient

    def _bias_grad(dz):
        # Average over the example axis, keeping a column vector.
        return inv_m * np.sum(dz, axis=1, keepdims=True)

    # Output layer
    dz3 = A3 - Y
    dw3 = inv_m * np.dot(dz3, A2.T) + decay * W3
    db3 = _bias_grad(dz3)

    # Second hidden layer: the ReLU derivative is 1 where the activation is positive.
    dz2 = np.dot(W3.T, dz3) * (A2 > 0).astype(np.int64)
    dw2 = inv_m * np.dot(dz2, A1.T) + decay * W2
    db2 = _bias_grad(dz2)

    # First hidden layer
    dz1 = np.dot(W2.T, dz2) * (A1 > 0).astype(np.int64)
    dw1 = inv_m * np.dot(dz1, X.T) + decay * W1
    db1 = _bias_grad(dz1)

    return {
        'dz3': dz3, 'dw3': dw3, 'db3': db3,
        'dz2': dz2, 'dw2': dw2, 'db2': db2,
        'dz1': dz1, 'db1': db1, 'dw1': dw1,
    }

def update_parameters(parameters, gradients, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    Arguments:
    parameters -- dict with 'W1', 'b1', ..., 'WL', 'bL'
    gradients -- dict with the matching lower-case 'dw1', 'db1', ..., 'dwL', 'dbL'
    learning_rate -- step size

    Returns:
    parameters -- the same dict, with each entry replaced by its updated value
    """
    n_layers = len(parameters) // 2  # one W and one b per layer

    for layer in map(str, range(1, n_layers + 1)):
        parameters['W' + layer] = parameters['W' + layer] - learning_rate * gradients['dw' + layer]
        parameters['b' + layer] = parameters['b' + layer] - learning_rate * gradients['db' + layer]

    return parameters

def predict(X, Y, parameters):
    """Predict 0/1 labels with the trained network and print the accuracy.

    Arguments:
    X -- input data, one example per column
    Y -- true labels; row 0 is compared with the predictions
    parameters -- trained parameters accepted by forward_propagation

    Returns:
    p -- integer array of shape (1, number of examples) with the predicted labels
    """
    A3, _ = forward_propagation(X, parameters)

    # Threshold the first output row at 0.5. The builtin `int` replaces `np.int`,
    # which was removed from NumPy in 1.24 and raised AttributeError here.
    p = (A3[:1, :] > 0.5).astype(int)

    print('Accuracy:' + str(np.mean((p[0,:] == Y[0,:]))))

    return p

def model(X, Y, learning_rate=0.3, num_iterations=30000, print_cost=True, lambd=0.7):
    """Train the three-layer network with L2 regularization and plot the cost curve.

    Arguments:
    X -- input data, one example per column
    Y -- true labels
    learning_rate -- gradient-descent step size
    num_iterations -- number of gradient-descent iterations
    print_cost -- if True, print the cost every 10000 iterations and record it
                  every 1000 iterations for the plot
    lambd -- L2 regularization strength

    Returns:
    parameters -- the trained parameters
    """
    cost_history = []
    parameters = initialize_parameters([X.shape[0], 20, 3, 1])

    for step in range(num_iterations):
        # Forward pass, regularized cost, backward pass, parameter update
        A3, cache = forward_propagation(X, parameters)
        cost = compute_cost_with_regularization(A3, Y, parameters, lambd)
        gradients = backward_propagation_with_regularization(X, Y, cache, lambd)
        parameters = update_parameters(parameters, gradients, learning_rate)

        if not print_cost:
            continue
        if step % 10000 == 0:
            print('Cost after iteration{}:{}'.format(step, cost))
        if step % 1000 == 0:
            cost_history.append(cost)

    plt.plot(cost_history)
    plt.ylabel('cost')
    plt.xlabel('iterations (x1,000)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

# Train with L2 regularization (lambd=0.7); the keyword arguments repeat model()'s defaults explicitly.
parameters = model(train_X, train_Y, learning_rate=0.3, num_iterations=30000, print_cost=True, lambd=0.7)

def plot_decision_boundary(model, X, y):
    """Draw the decision regions of `model` together with the data points.

    Arguments:
    model -- callable taking an array with one grid point per row and returning one
             prediction per grid point
    X -- data whose first two rows are the plotted coordinates
    y -- labels used to colour the points
    """
    spacing = 0.01  # distance between neighbouring grid points
    pad = 1         # padding added around the data range

    first, second = X[0, :], X[1, :]
    axis_1 = np.arange(first.min() - pad, first.max() + pad, spacing)
    axis_2 = np.arange(second.min() - pad, second.max() + pad, spacing)
    mesh_1, mesh_2 = np.meshgrid(axis_1, axis_2)

    # Predict the value for the whole grid and restore the grid shape
    Z = model(np.c_[mesh_1.ravel(), mesh_2.ravel()])
    Z = Z.reshape(mesh_1.shape)

    # Contour of the predictions with the examples on top
    plt.contourf(mesh_1, mesh_2, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=np.squeeze(y), cmap=plt.cm.Spectral)
    plt.show()

def predict_decision(parameters, X):
    """Return a boolean array that is True where the network output exceeds 0.5.

    Used for plotting the decision boundary.

    Arguments:
    parameters -- trained parameters accepted by forward_propagation
    X -- input data, one example per column
    """
    # The call used to read `forward_propagation()X, parameters)`, a SyntaxError
    # that kept the whole cell from running.
    A3, cache = forward_propagation(X, parameters)
    predictions = (A3 > 0.5)

    return predictions

# predict() prints the accuracy itself and returns the 0/1 predictions.
print ("On the training set:")
predictions_train = predict(train_X, train_Y, parameters)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)

# This section trains with L2 regularization, so the title says so (it used to be
# the "without regularization" title copied from the first section).
plt.title("Model with L2-regularization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
# The plotting helper passes one grid point per row, while predict_decision expects
# one example per column, hence the x.T.
plot_decision_boundary(lambda x: predict_decision(parameters, x.T), train_X, train_Y)
plt.show()


**3. Dropout**

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.io

def load_2D_dataset(path='C:/Users/DELL/Desktop/A.I/Deep learning.data/Assignment 7'):
    """Load the 2-D dataset from a MATLAB file (no plotting in this section).

    Arguments:
    path -- location of the MATLAB file passed to scipy.io.loadmat; the default is
            the path that used to be hard-coded, so existing calls keep working

    Returns:
    train_X, train_Y, test_X, test_Y -- the 'X', 'y', 'Xval' and 'yval' entries of
    the file, each transposed
    """
    data = scipy.io.loadmat(path)
    train_X = data['X'].T
    train_Y = data['y'].T
    test_X = data['Xval'].T
    test_Y = data['yval'].T

    return train_X, train_Y, test_X, test_Y

# Reload the dataset for this section; this version of the loader does not draw the scatter plot.
train_X, train_Y, test_X, test_Y = load_2D_dataset()

def sigmoid(x):
    """Logistic activation: 1 / (1 + exp(-x)), computed element-wise."""
    return 1 / (1 + np.exp(-x))

def relu(x):
    """ReLU activation: max(0, x), computed element-wise."""
    return np.maximum(0, x)

def initialize_parameters(layer_dims):
    """Create the weights and biases of a fully connected network.

    Arguments:
    layer_dims -- sequence with the size of every layer, input layer first

    Returns:
    parameters -- dict with 'W1', 'b1', ..., 'WL', 'bL'; each Wl is drawn from the
                  standard normal and divided by sqrt(layer_dims[l-1]), each bl is
                  a zero column vector
    """
    np.random.seed(3)  # fixed seed so every run starts from the same weights
    parameters = {}

    # Walk over consecutive (input size, output size) pairs, numbering layers from 1.
    layer_pairs = zip(layer_dims, layer_dims[1:])
    for index, (fan_in, fan_out) in enumerate(layer_pairs, start=1):
        suffix = str(index)
        parameters['W' + suffix] = np.random.randn(fan_out, fan_in) / np.sqrt(fan_in)
        parameters['b' + suffix] = np.zeros((fan_out, 1))

    return parameters

def compute_cost(A3, Y):
    """Compute the mean binary cross-entropy between A3 and Y.

    Arguments:
    A3 -- output activations of the network
    Y -- true labels, same shape as A3; Y.shape[1] is used as the number of examples

    Returns:
    cost -- sum of the per-example losses divided by the number of examples;
            NaN entries are skipped by np.nansum
    """
    m = Y.shape[1]
    # Both terms carry a minus sign. The (1 - Y) term used to be added without it,
    # which subtracted the loss of the negative examples instead of adding it.
    logprobs = np.multiply(-np.log(A3), Y) + np.multiply(-np.log(1 - A3), (1 - Y))
    cost = 1/m * np.nansum(logprobs)

    return cost

def forward_propagation_with_dropout(X, parameters, keep_prob = 0.5):
    """Run the forward pass with inverted dropout on both hidden layers.

    LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID

    Arguments:
    X -- input data, one example per column
    parameters -- dict holding 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'
    keep_prob -- probability of keeping a hidden unit active; with 1 no unit is dropped

    Returns:
    A3 -- output of the final sigmoid
    cache -- tuple (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) for the backward pass
    """
    # NOTE: re-seeding the global RNG on every call draws the same masks each time
    # for a given input shape; kept from the original for reproducible results.
    np.random.seed(1)

    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    # The mask must come from the uniform distribution on [0, 1) so that
    # P(D1 < keep_prob) == keep_prob. np.random.randn is standard normal and kept
    # a unit with a different probability (about 0.69 for keep_prob = 0.5).
    D1 = np.random.rand(A1.shape[0], A1.shape[1])
    D1 = D1 < keep_prob
    A1 = A1 * D1
    A1 = A1 / keep_prob  # inverted dropout: rescale so the expected activation is unchanged

    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    D2 = np.random.rand(A2.shape[0], A2.shape[1])
    D2 = D2 < keep_prob
    A2 = A2 * D2
    A2 = A2 / keep_prob

    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)

    return A3, cache

def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    """Compute the gradients for the three-layer network trained with dropout.

    Arguments:
    X -- input data, one example per column
    Y -- true labels
    cache -- tuple (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)
             from forward_propagation_with_dropout
    keep_prob -- keep probability that was used in the forward pass

    Returns:
    gradients -- dict with the gradient of the cost with respect to each
                 pre-activation ('dZ'), activation ('dA'), weight ('dW') and bias ('db')
    """
    m = X.shape[1]
    (_, D1, A1, _, _, _, D2, A2, W2, _, _, A3, W3, _) = cache

    scale = 1./m

    def _bias_grad(dZ):
        # Average over the example axis, keeping a column vector.
        return scale * np.sum(dZ, axis=1, keepdims = True)

    # Output layer
    dZ3 = A3 - Y
    dW3 = scale * np.dot(dZ3, A2.T)
    db3 = _bias_grad(dZ3)

    # Second hidden layer: re-apply the forward mask D2 and the same rescaling
    dA2 = np.dot(W3.T, dZ3) * D2 / keep_prob
    dZ2 = dA2 * (A2 > 0).astype(np.int64)
    dW2 = scale * np.dot(dZ2, A1.T)
    db2 = _bias_grad(dZ2)

    # First hidden layer: re-apply the forward mask D1 and the same rescaling
    dA1 = np.dot(W2.T, dZ2) * D1 / keep_prob
    dZ1 = dA1 * (A1 > 0).astype(np.int64)
    dW1 = scale * np.dot(dZ1, X.T)
    db1 = _bias_grad(dZ1)

    return {
        "dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
        "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
        "dZ1": dZ1, "dW1": dW1, "db1": db1,
    }


def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    Arguments:
    parameters -- dict with "W1", "b1", ..., "WL", "bL"
    grads -- dict with the matching "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- step size

    Returns:
    parameters -- the same dict, with each entry replaced by its updated value
    """
    layer_count = len(parameters) // 2  # one W and one b per layer

    for number in range(1, layer_count + 1):
        w_key = "W" + str(number)
        b_key = "b" + str(number)
        parameters[w_key] = parameters[w_key] - learning_rate * grads["d" + w_key]
        parameters[b_key] = parameters[b_key] - learning_rate * grads["d" + b_key]

    return parameters

def predict(X, Y, parameters):
    """Predict 0/1 labels with the trained network and print the accuracy.

    Arguments:
    X -- input data, one example per column
    Y -- true labels; row 0 is compared with the predictions
    parameters -- trained parameters accepted by forward_propagation_with_dropout

    Returns:
    p -- integer array of shape (1, number of examples) with the predicted labels
    """
    # Dropout is a training-time regularizer. Ask for keep_prob=1 at inference
    # instead of falling back to the forward pass's default of 0.5, which silenced
    # hidden units while evaluating.
    A3, _ = forward_propagation_with_dropout(X, parameters, keep_prob=1)

    # Threshold the first output row at 0.5. The builtin `int` replaces `np.int`,
    # which was removed from NumPy in 1.24 and raised AttributeError here.
    p = (A3[:1, :] > 0.5).astype(int)

    print('Accuracy:' + str(np.mean((p[0,:] == Y[0,:]))))

    return p

def plot_decision_boundary(model, X, y):
    """Draw the decision regions of `model` together with the data points.

    Arguments:
    model -- callable taking an array with one grid point per row and returning one
             prediction per grid point
    X -- data whose first two rows are the plotted coordinates
    y -- labels used to colour the points
    """
    resolution = 0.01  # distance between neighbouring grid points

    # Data range with one unit of padding on every side
    lows = (X[0, :].min() - 1, X[1, :].min() - 1)
    highs = (X[0, :].max() + 1, X[1, :].max() + 1)

    ticks = [np.arange(lo, hi, resolution) for lo, hi in zip(lows, highs)]
    u, v = np.meshgrid(ticks[0], ticks[1])

    # Predict the value for the whole grid and restore the grid shape
    flat_grid = np.c_[u.ravel(), v.ravel()]
    surface = model(flat_grid)
    surface = surface.reshape(u.shape)

    # Contour of the predictions with the examples on top
    plt.contourf(u, v, surface, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=np.squeeze(y), cmap=plt.cm.Spectral)
    plt.show()

def predict_decision(parameters, X):
    """Return a boolean array that is True where the network output exceeds 0.5.

    Used for plotting the decision boundary.

    Arguments:
    parameters -- trained parameters accepted by forward_propagation_with_dropout
    X -- input data, one example per column
    """
    # Ask for keep_prob=1 so the plotted boundary is that of the full network; the
    # forward pass's default of 0.5 silenced hidden units while plotting.
    A3, cache = forward_propagation_with_dropout(X, parameters, keep_prob=1)
    predictions = (A3 > 0.5)

    return predictions


def model(X, Y, learning_rate = 0.3, num_iterations = 30000, print_cost = True, keep_prob = 1):
    """Train the three-layer network with dropout and plot the cost curve.

    Arguments:
    X -- input data, one example per column
    Y -- true labels
    learning_rate -- gradient-descent step size
    num_iterations -- number of gradient-descent iterations
    print_cost -- if True, print the cost every 10000 iterations and record it
                  every 1000 iterations for the plot
    keep_prob -- probability of keeping a hidden unit active during training

    Returns:
    parameters -- the trained parameters
    """
    recorded_costs = []
    parameters = initialize_parameters([X.shape[0], 20, 3, 1])

    for iteration in range(num_iterations):
        # Forward pass with dropout, cost, backward pass with the same masks, update
        A3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)
        cost = compute_cost(A3, Y)
        grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)
        parameters = update_parameters(parameters, grads, learning_rate)

        if not print_cost:
            continue
        if iteration % 10000 == 0:
            print("Cost after iteration {}: {}".format(iteration, cost))
        if iteration % 1000 == 0:
            recorded_costs.append(cost)

    plt.plot(recorded_costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (x1,000)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters


# Train with dropout, keeping each hidden unit with probability 0.86.
parameters = model(train_X, train_Y, keep_prob = 0.86, learning_rate = 0.3)

# predict() prints the accuracy itself and returns the 0/1 predictions.
print ("On the train set:")
predictions_train = predict(train_X, train_Y, parameters)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)

# Set the title and axis limits on the current axes, then draw the decision boundary.
# The plotting helper passes one grid point per row, while predict_decision expects
# one example per column, hence the x.T. plot_decision_boundary calls plt.show() itself.
plt.title("Model with dropout")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
plot_decision_boundary(lambda x:  predict_decision(parameters, x.T), train_X, train_Y)
