# Assignment 4: Neural Network Learning

In [1]:
# %pip installs into the running kernel's environment; the pin matches the
# version this notebook was last executed with.
%pip install mat4py==0.5.0

Collecting mat4py
  Downloading mat4py-0.5.0-py2.py3-none-any.whl (13 kB)
Installing collected packages: mat4py
Successfully installed mat4py-0.5.0


In [23]:
# %pip installs into the running kernel's environment; the pin matches the
# version this notebook was last executed with.
%pip install opencv-python==4.5.3.56

Collecting opencv-python
  Downloading opencv_python-4.5.3.56-cp38-cp38-win_amd64.whl (34.9 MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.5.3.56


## Loading and Visualizing Data 

In [1]:
#imports
from mat4py import loadmat
import numpy as np
import math
import cv2
import pdb

In [2]:
# Network architecture used by the cost, training and prediction cells.
input_layer_size = 20 * 20   # one input unit per pixel of a 20x20 image
hidden_layer_size = 25       # units in the single hidden layer
num_labels = 10              # output classes; labels run from 1 to num_labels

In [3]:
# Load the training set: X holds one flattened image per row, y the labels.
data = loadmat('ex4data1.mat')
X, y = (np.array(data[key]) for key in ('X', 'y'))

print('shape of features data', X.shape)
print('shape of labels data', y.shape)

# Pick 100 distinct rows at random to visualise.
rand_indices = np.random.choice(len(X), 100, replace=False)
rand_examples = X[rand_indices]

def displayData(X, show=True):
    """Tile flattened 20x20 images into one grid image and optionally show it.

    Parameters
    ----------
    X : array-like of shape (num_examples, 400)
        One flattened image per row, stored column-major (each row is
        reshaped with ``order='F'``). A single 1-D example is also accepted.
    show : bool, default True
        When True, the grid is enlarged to 200% and displayed in an OpenCV
        window that stays open until a key is pressed. When False, nothing is
        displayed and OpenCV is not touched.

    Returns
    -------
    numpy.ndarray
        The unscaled grid of shape (rows * 20, cols * 20), where
        ``rows = floor(sqrt(num_examples))`` and
        ``cols = ceil(num_examples / rows)``. Grid cells beyond the last
        example are filled with zeros.

    Raises
    ------
    ValueError
        If X contains no examples.
    """
    target_img_size = (20, 20)

    X = np.atleast_2d(X)
    if X.size == 0:
        raise ValueError('displayData needs at least one example')

    num_examples = X.shape[0]
    rows = math.floor(math.sqrt(num_examples))
    cols = math.ceil(num_examples / rows)

    # Filler for grid cells that have no example (non-square example counts).
    blank = np.zeros(target_img_size, dtype=X.dtype)

    grid_rows = []
    for i in range(rows):
        tiles = []
        for j in range(cols):
            # Each example index is visited exactly once, so no tile repeats.
            idx = cols * i + j
            if idx < num_examples:
                tiles.append(np.reshape(X[idx], target_img_size, order='F'))
            else:
                tiles.append(blank)
        grid_rows.append(np.concatenate(tiles, axis=1))
    grid = np.concatenate(grid_rows, axis=0)

    if show:
        # Enlarge the grid so the small digits are easier to see.
        scale = 200
        width = int(grid.shape[1] * scale / 100)
        height = int(grid.shape[0] * scale / 100)
        frame = cv2.resize(grid, (width, height), interpolation=cv2.INTER_AREA)

        # waitKey(0) keeps the window open until a key is pressed.
        cv2.imshow('frame', frame)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return grid
    
# Show the randomly selected examples; the OpenCV window stays open until a key is pressed.
displayData(rand_examples)

shape of features data (5000, 400)
shape of labels data (5000, 1)


## Loading parameters

In [4]:
# Read the already-trained weight matrices: Theta1 is 25 x 401, Theta2 is 10 x 26.
params = loadmat('ex4weights.mat')
theta1, theta2 = (np.array(params[name]) for name in ('Theta1', 'Theta2'))

# Stack both matrices, flattened row by row, into a single column vector.
unroll_params = np.vstack((theta1.reshape(-1, 1), theta2.reshape(-1, 1)))

print('shape of theta1', theta1.shape)
print('shape of theta2', theta2.shape)

shape of theta1 (25, 401)
shape of theta2 (10, 26)


## Compute Cost

In [5]:
#sigmoid function
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Evaluated as exp(-log(1 + exp(-z))) through np.logaddexp so that large
    negative inputs do not overflow inside exp().

    Parameters
    ----------
    z : scalar or array-like

    Returns
    -------
    numpy float or numpy.ndarray with the same shape as z, values in [0, 1].
    """
    return np.exp(-np.logaddexp(0, np.negative(z)))

#sigmoid gradient
def sigmoid_grad(z):
    """Return the derivative of the sigmoid at z: sigmoid(z) * (1 - sigmoid(z))."""
    s = sigmoid(z)  # evaluate once and reuse
    return s * (1 - s)

In [6]:
def costFunction(unroll_params, input_layer_size, hidden_layer_size, num_labels, X, y, _lambda):
    """Regularised cross-entropy cost and gradient of a 3-layer sigmoid network.

    Parameters
    ----------
    unroll_params : array of shape (N, 1) or (N,)
        theta1 (hidden_layer_size x (input_layer_size + 1)) followed by
        theta2 (num_labels x (hidden_layer_size + 1)), each flattened row by row.
    input_layer_size, hidden_layer_size, num_labels : int
        Number of units in the input, hidden and output layers.
    X : array of shape (m, input_layer_size)
        One example per row, without the bias column.
    y : array of shape (m, 1) or (m,)
        Class labels in 1..num_labels.
    _lambda : float
        Regularisation strength; the bias columns are not regularised.

    Returns
    -------
    J : float
        Regularised cost.
    unroll_theta_grad : numpy.ndarray of shape (N, 1)
        Gradient of J, laid out like unroll_params.
    """
    # Rebuild the two weight matrices from the flat parameter vector.
    flat_params = np.ravel(unroll_params)
    split = hidden_layer_size * (input_layer_size + 1)
    theta1 = flat_params[:split].reshape(hidden_layer_size, input_layer_size + 1)
    theta2 = flat_params[split:].reshape(num_labels, hidden_layer_size + 1)

    X = np.asarray(X)
    m = X.shape[0]  # number of examples

    # One-hot encode the labels. Forcing y into a column makes the comparison
    # broadcast to (m, num_labels) for both (m, 1) and (m,) inputs.
    y = np.reshape(y, (-1, 1))
    y_vec = (y == np.arange(1, num_labels + 1)).astype(int)

    # Forward pass; a column of ones is prepended for the bias unit.
    a1 = np.insert(X, 0, 1, axis=1)
    z2 = a1 @ theta1.T
    a2 = np.insert(sigmoid(z2), 0, 1, axis=1)
    z3 = a2 @ theta2.T
    a3 = sigmoid(z3)

    # Cross-entropy written with softplus terms:
    #   -log(sigmoid(z)) = logaddexp(0, -z),  -log(1 - sigmoid(z)) = logaddexp(0, z)
    # This stays finite when an output saturates to exactly 0 or 1, where
    # log(a3) / log(1 - a3) would produce inf and turn the cost into nan.
    J = 1/m * np.sum(y_vec * np.logaddexp(0, -z3) + (1 - y_vec) * np.logaddexp(0, z3))
    J += (_lambda/(2*m)) * (np.sum(np.square(theta1[:,1:])) + np.sum(np.square(theta2[:,1:])))

    # Backpropagation. The bias unit has no incoming weights, so its column of
    # theta2 is left out when pushing the output error back to the hidden layer.
    delta3 = a3 - y_vec                                      # (m, num_labels)
    delta2 = (delta3 @ theta2[:, 1:]) * sigmoid_grad(z2)     # (m, hidden_layer_size)

    d1 = (delta2.T @ a1) / m
    d2 = (delta3.T @ a2) / m

    # Regularise every weight except the bias column.
    d1[:, 1:] += (_lambda / m) * theta1[:, 1:]
    d2[:, 1:] += (_lambda / m) * theta2[:, 1:]

    unroll_theta_grad = np.concatenate((d1.reshape(-1,1), d2.reshape(-1,1)), axis=0)

    return J, unroll_theta_grad
    
    
# Evaluate the cost and gradient at the loaded weights with _lambda = 3.
cost, grad = costFunction(
    unroll_params,
    input_layer_size,
    hidden_layer_size,
    num_labels,
    X,
    y,
    _lambda=3,
)
print(cost)
grad

0.5760512469501331


array([[ 6.18712766e-05],
       [-6.33744979e-12],
       [ 1.31648811e-12],
       ...,
       [-5.20670005e-05],
       [ 1.03178624e-05],
       [-2.31823790e-05]])

## Gradient Check

In [7]:
def initializeWeights(out, _in):
    """Return a deterministic (out, _in + 1) weight matrix for debugging.

    The entries are sin(1), sin(2), ... filled row by row and divided by 10,
    so repeated calls with the same arguments give the same values.
    """
    shape = (out, _in + 1)
    count = shape[0] * shape[1]
    return np.sin(np.arange(1, count + 1)).reshape(shape) / 10

def checkGradient(_lambda=0):
    """Compare costFunction's gradient with a central-difference estimate.

    A small network (3 inputs, 5 hidden units, 3 labels) and 5 examples are
    built from deterministic sin-based values, so the check is repeatable.
    Prints 'right' when the relative difference is at most 1e-7 and 'wrong'
    otherwise.

    Returns
    -------
    (diff, grad_approx, grad) : relative difference, numerical gradient and
    the gradient returned by costFunction.
    """
    n_in, n_hidden, n_out, n_examples = 3, 5, 3, 5
    step = 1e-7

    # Deterministic weights, inputs and labels (labels cycle through 1..n_out).
    w1 = initializeWeights(n_hidden, n_in)
    w2 = initializeWeights(n_out, n_hidden)
    features = initializeWeights(n_examples, n_in - 1)
    labels = (np.mod(np.arange(1, n_examples + 1), n_out) + 1).reshape(n_examples, 1)

    params = np.concatenate((w1.reshape(-1, 1), w2.reshape(-1, 1)), axis=0)

    def evaluate(p):
        # Cost and gradient of the test network at parameter vector p.
        return costFunction(p, n_in, n_hidden, n_out, features, labels, _lambda)

    # Nudge one parameter at a time in both directions and difference the costs.
    numeric = np.zeros_like(params)
    for k in range(params.shape[0]):
        raised = params.copy()
        raised[k] += step
        lowered = params.copy()
        lowered[k] -= step
        cost_up = evaluate(raised)[0]
        cost_down = evaluate(lowered)[0]
        numeric[k] = (cost_up - cost_down) / (2 * step)

    analytic = evaluate(params)[1]

    # Relative difference between the two gradients.
    gap = np.linalg.norm(numeric - analytic)
    scale = np.linalg.norm(numeric) + np.linalg.norm(analytic)
    rel_diff = gap / scale

    print('wrong' if rel_diff > 1e-7 else 'right')

    return rel_diff, numeric, analytic

# Check the gradient first without regularisation, then with _lambda = 3.
diff, grad_approx, grad = checkGradient(_lambda=0)
print(diff)
diff, grad_approx, grad = checkGradient(_lambda=3)
print(diff)

right
1.098566784805451e-08
right
1.1200886235755243e-08


## Training neural network

In [27]:
#initialize parameters
def randomInitializeWeights(l_in, l_out):
    """Draw an (l_out, l_in + 1) weight matrix uniformly from [-e, e).

    e = sqrt(6) / sqrt(l_in + l_out). The extra column holds the bias
    weights. Values come from numpy's global random state.
    """
    bound = np.sqrt(6) / np.sqrt(l_in + l_out)
    unit_draws = np.random.rand(l_out, l_in + 1)
    # Stretch [0, 1) to [0, 2 * bound), then centre it on zero.
    return unit_draws * 2 * bound - bound
    
# Draw fresh starting weights for both layers.
initial_theta1 = randomInitializeWeights(input_layer_size, hidden_layer_size)
initial_theta2 = randomInitializeWeights(hidden_layer_size, num_labels)
print(initial_theta1.shape, initial_theta2.shape)

# Batch gradient-descent settings.
iterations, alpha, _lambda = 4000, 0.01, 1
unroll_params = np.vstack((initial_theta1.reshape(-1, 1), initial_theta2.reshape(-1, 1)))

# Each pass takes one step of size alpha against the full-data gradient;
# the cost value itself is not used here.
for i in range(iterations):
    _, grad = costFunction(
        unroll_params, input_layer_size, hidden_layer_size, num_labels, X, y, _lambda
    )
    unroll_params -= alpha * grad

# Split the trained vector back into the two weight matrices.
theta1 = unroll_params[:hidden_layer_size * (input_layer_size + 1)].reshape(
    hidden_layer_size, input_layer_size + 1
)
theta2 = unroll_params[hidden_layer_size * (input_layer_size + 1):].reshape(
    num_labels, hidden_layer_size + 1
)
print(theta1.shape, theta2.shape)

(25, 401) (10, 26)
(25, 401) (10, 26)


## Prediction

In [28]:
def predict(theta1, theta2, X):
    """Return the predicted label (1-based) for every row of X.

    Runs a forward pass through both layers and picks, per example, the
    output unit with the largest activation. The result has shape (m, 1).
    """
    def with_bias(activations):
        # Prepend a column of ones for the bias unit.
        return np.insert(activations, 0, 1, axis=1)

    hidden = sigmoid(with_bias(X) @ theta1.T)
    output = sigmoid(with_bias(hidden) @ theta2.T)

    # argmax is 0-based while the labels start at 1.
    best_unit = np.argmax(output, axis=1).reshape(X.shape[0], 1)
    return best_unit + 1
    
# Predict on the same examples the network was trained on and report the
# percentage of labels it gets right.
pred = predict(theta1, theta2, X)
print(pred)
print('model accuracy', np.mean(pred == y) * 100)

[[10]
 [10]
 [10]
 ...
 [ 4]
 [ 4]
 [10]]
model accuracy 81.28
