In [1]:
# Imports
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
np.set_printoptions(threshold=np.nan)
import sys

In [2]:
# One hot Encoding the categorical output values to binary by adding 1's for that index and 0's otherwise
def oneHotEncode(y):
#     print('Before Encoding ')
#     print(y)
#     print(type(y))
    enc = pd.get_dummies(y['y'])
#     print('After Encoding ')
#     print(enc)
    return np.matrix(enc)

# Applying Sigmoid Activation function to the hidden layer outputs used while forward propagation
# works with scalar, arrays and matrix as well
# Purpose of this method is to do squishing on the linear function
def apply_sigmoid(z):
    return 1/(1+np.exp(-z))

# Applying Sigmoid Activation function to the hidden layer outputs used while backward propagation to get gradients
# works with scalar, arrays and matrix as well
# Purpose of this method is to do undo the squishing on the linear function
def apply_sigmoid_prime(z):
    return np.dot(z, (1-z))


# Forward propagation to calculate yHat by applying activation function twice
def forward_propagate(X, W1, W2, b1, b2):
    Z1 = np.dot(W1.T, X) + b1
#     print('----Z1----')
#     print(Z1)
    A1 = apply_sigmoid(Z1)
#     print('----A1----')
#     print(A1)
    Z2 = np.dot(W2.T, A1) + b2
#     print('----Z2----')
#     print(Z2)
    A2 = apply_sigmoid(Z2) # Output of the last layer(output layer)
#     print('----A2----')
#     print(A2)
    return A1, A2, Z1, Z2

# Backward Propagation function to calculate the gradients
def back_propagate(Z1, X, Y, A1, A2, W2):
    m = X.shape[1]
#     print(m)
#     print('----Y----')
#     print(Y)
#     print('----A2----')
#     print(A2)
    dZ2 = (A2 - Y);
#     print('----dz2----')
#     print(dZ2)
#     print(type(dZ2))
#     print('----A1T----')
#     print(A1.T)
    dW2 = (1./m) * np.dot(dZ2, A1.T)
#     print('----dw2----')
#     print(dW2)
    db2 = (1./m) * np.sum(dZ2, axis=1)
    print('----db2----')
    print(db2.shape)
    dZ1 = (np.dot(dZ2, W2.T)* apply_sigmoid_prime(Z1).T) # element wise product of same dimension matrices
#     print('----dz1----')
#     print(dZ1)
    dW1 = (1./m) * np.dot(dZ1, X.T)
#     print('----dw1----')
#     print(dW1)
    db1 = (1./m) * np.sum(dZ1, axis =1)
#     print('----db1----')
#     print(db1)
    return dW1, db1, dW2, db2
    
# Get the loss of for the training example
def get_cost(Y, Yhat):
    m= Y.shape[1]
#     print(m)
    loss = np.multiply(Y, np.log(Yhat)) + np.multiply((1-Y), np.log(1 - Yhat))
#     print(loss)
    cost = (-1/m) * np.sum(loss)
    return cost

def gradientDescent(X, Y, alpha, iters):  
    # Call Forward propagation to calculate yHat
    W1 = 0.01* np.random.randn(inputLayerSize,hiddenLayerSize);
    W2 = 0.01* np.random.randn(hiddenLayerSize,outputLayerSize);
    b1 = np.zeros((hiddenLayerSize,1));
    b2 = np.zeros((outputLayerSize,1));
    old_cost = sys.maxsize
    new_cost = sys.maxsize
    dW1 = None
    db1 =None
    dW2 = None
    db2 = None
    cost_history = []
    for i in range(iters):
        A1, A2, Z1, Z2 = forward_propagate(X, W1, W2, b1, b2)
        dW1, db1, dW2, db2 = back_propagate(Z1, X, Y, A1, A2, W2)
#         print('Before')
#         print(W1)
        W1 = W1 - (alpha * dW1).T
#         print('After')
#         print((alpha * db1).T)
        b1 = b1 - (alpha * db1)
        W2 = W2 - (alpha * dW2).T
        b2 = b2 - (alpha * db2)
        old_cost = new_cost
        new_cost = get_cost(Y, A2)
#         if(abs(old_cost - new_cost) < 0.00000000000001):
#             print("breaking" + str(old_cost) + str(new_cost))
#             break;
        print ("cost : " + str(new_cost) + " Iteration: " + str(i))
        cost_history.append(new_cost)
    return dW1 , db1, dW2, db2, cost_history, new_cost

# Softmax activation function to get the probablity of the classes
def softmax(z):
    softMax = (np.exp(z).T / np.sum(np.exp(z),axis=1)).T
    print('softmax is ')
    print(softMax)
    return softMax

def plotCostHistory(cost_history):
     plt.plot(cost_history)
     plt.ylabel('Cost');
     plt.xlabel('Iterations');
     plt.title('Cost Progression with Iterations for different learning rates')
     plt.show()

In [3]:
# Reading the training data
data_train = pd.read_csv('ex3_train.csv', sep=",", encoding='utf-8', header='infer')
df_train = data_train
m = df_train.shape[0]

y_train = pd.DataFrame(df_train['y'])
X_train = df_train.drop(['y'], axis=1)

X_train_mat = np.matrix(X_train).T
y_train_mat = oneHotEncode(y_train).T

In [4]:
# Reading the test data
data_test = pd.read_csv('ex3_test.csv', sep=",", encoding='utf-8', header='infer')
df_test = data_test

y_test = df_test['y']
X_test = df_test.drop(['y'], axis=1)

X_test_mat = np.matrix(X_test).T
y_test_mat = np.matrix(y_test)

print(X_test_mat.shape)
print(y_test_mat.shape)

#Plot the selected pixel
# num = 7
# pixels = np.array(X_test[num:num+1], dtype='uint8')
# print(y_test[num:num+1])
# pixels = pixels.reshape((20, 20))
# plt.imshow(pixels, cmap='gray')
# plt.show()


# Adding one's column for bias
# X_test.insert(0,-1,1) 



(400, 1500)
(1, 1500)


In [5]:
# Defining Hyperparameters
inputLayerSize = X_train_mat.shape[0]
hiddenLayerSize = 25 # As specified in assignment requirements
outputLayerSize = 10
print(inputLayerSize)
print(outputLayerSize)
print(hiddenLayerSize)

400
10
25


In [6]:
# Optimization function to check cost propagartion for different learning rates

def optimize():
    alpha = [0.1, 0.001]
    for a in alpha:
        dW1 , db1, dW2, db2, cost_history, new_cost = gradientDescent(X_train_mat, y_train_mat, a, 10)
        plotCostHistory(cost_history)
        return dW1 , db1, dW2, db2

In [7]:
optimize()

ValueError: shapes (10,3500) and (10,25) not aligned: 3500 (dim 1) != 10 (dim 0)

In [None]:
W1 = 0.01* np.random.randn(inputLayerSize,hiddenLayerSize);
W2 = 0.01* np.random.randn(hiddenLayerSize,outputLayerSize);
b1 = np.zeros((hiddenLayerSize,1));
b2 = np.zeros((outputLayerSize,1));
print(W1.shape)
print(W2.shape)
print(b1.shape)
print(b2.shape)
print(y_train_mat)
A1, A2, Z1, Z2 = forward_propagate(X_train_mat, W1, W2, b1, b2)
dW1, db1, dW2, db2 = back_propagate(Z1, X_train_mat, y_train_mat, A1, A2, W2)