In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from keras.datasets import mnist

# Load the MNIST train/test split and report the shape of every array
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
print(f'X_train: {X_train.shape}')
print(f'Y_train: {Y_train.shape}')
print(f'X_test:  {X_test.shape}')
print(f'Y_test:  {Y_test.shape}')

X_train: (60000, 28, 28)
Y_train: (60000,)
X_test:  (10000, 28, 28)
Y_test:  (10000,)


In [None]:
# Keep only the samples whose label is 0 or 1 (binary classification subset)
train_filter = np.where(np.logical_or(Y_train == 0, Y_train == 1))
test_filter = np.where(np.logical_or(Y_test == 0, Y_test == 1))

# Apply the same index selection to the images and to their labels
X_train = X_train[train_filter]
X_test = X_test[test_filter]
Y_train = Y_train[train_filter]
Y_test = Y_test[test_filter]

In [None]:
# Quick look at the training images; NumPy abbreviates the large array with "..."
print(X_train)

[[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 ...

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]]


In [None]:
# Quick look at the training labels; NumPy abbreviates the long vector with "..."
print(Y_train)

[0 1 1 ... 1 0 1]


In [None]:
# Standardization of data
# The per-pixel mean and standard deviation are estimated on X_train ONLY and
# the same statistics are then applied to both X_train and X_test.
# Why not use X_test's own statistics? The model is fitted on features scaled
# with the training statistics, so the test data must go through the exact same
# transformation to be on the same scale; computing statistics on the test split
# would also leak information about the held-out data into preprocessing.
# NOTE: this cell overwrites X_train / X_test in place, so run it only once per
# kernel session (re-running it would standardize already-standardized data).

# Added to the std so that constant pixels (std == 0) do not cause a division by zero.
# NOTE(review): 18e-1 is 1.8, far larger than a typical stabiliser such as 1e-8 --
# confirm this value is intended before changing it (it affects every feature's scale).
std_offset = 18e-1

train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)

X_train = (X_train - train_mean) / (train_std + std_offset)
X_test = (X_test - train_mean) / (train_std + std_offset)

In [None]:
# Flatten each image into one feature row so the data is 2D instead of 3D
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)
print(f'X_train: {X_train.shape}')
print(f'Y_train: {Y_train.shape}')
print(f'X_test:  {X_test.shape}')
print(f'Y_test:  {Y_test.shape}')
print(X_train.shape[0])

X_train: (12665, 784)
Y_train: (12665,)
X_test:  (2115, 784)
Y_test:  (2115,)
12665


In [None]:
# Function to calculate the accuracy
def accuracy(predict, real, normalize=True):
    """Return the percentage (0-100) of entries of `predict` that equal `real`.

    `predict` and `real` are compared element-wise; the number of matches is
    divided by ``len(real)`` and scaled to a percentage.
    `normalize` is accepted but is not used by the computation.
    """
    matches = np.sum(predict == real)
    fraction_correct = matches / len(real)
    return fraction_correct * 100

In [None]:
# Define Parameters needed for optimizers
# Running squared-gradient averages (read and updated by rms_prop): weights, bias
pre_v_dw = np.zeros(X_train.shape[1])
pre_v_db = 0
# Running gradient averages (read by momentum): weights, bias
pre_m_dw = np.zeros(X_train.shape[1])
pre_m_db = 0

In [None]:
# Implementing the momentum optimizer
def momentum(w, b, dw, db, t, beta1):
    """Update the running gradient averages and return them bias-corrected.

    Reads and updates the module-level state ``pre_m_dw`` / ``pre_m_db``.

    Parameters
    ----------
    w, b : not used by the computation; kept so the signature matches ``rms_prop``.
    dw, db : gradient of the cost with respect to the weights / the bias.
    t : step counter used in the bias-correction divisor ``1 - beta1**t``
        (must be >= 1, otherwise the divisor is zero).
    beta1 : decay rate of the exponential moving average.

    Returns
    -------
    (m_dw, m_db) : bias-corrected moving averages of ``dw`` and ``db``.
    """
    global pre_m_dw
    global pre_m_db

    m_dw = beta1 * pre_m_dw + (1 - beta1) * dw  # Weights
    m_db = beta1 * pre_m_db + (1 - beta1) * db  # Bias

    # Persist the UNCORRECTED averages for the next call.
    # Fix: these used to be written to local names pre_v_dw / pre_v_db, so the
    # momentum state stayed at its initial value and never accumulated.
    pre_m_dw = m_dw
    pre_m_db = m_db

    # bias correction (creates new objects, so the stored state is not aliased)
    correction = 1 - beta1 ** t
    return m_dw / correction, m_db / correction

In [None]:
# Implementing the RMS prop optimizer
def rms_prop(w, b, dw, db, t, beta2):
    """Update the running squared-gradient averages and return them bias-corrected.

    Reads and updates the module-level state ``pre_v_dw`` / ``pre_v_db``.
    ``w`` and ``b`` are accepted but not used by the computation.
    ``t`` is the step counter used in the divisor ``1 - beta2**t``.
    Returns the tuple ``(v_dw, v_db)`` for the weights and the bias.
    """
    global pre_v_dw, pre_v_db

    squared_dw = dw ** 2
    squared_db = db ** 2

    avg_dw = beta2 * pre_v_dw + (1 - beta2) * squared_dw  # Weights
    avg_db = beta2 * pre_v_db + (1 - beta2) * squared_db  # Bias

    # Store copies of the uncorrected averages for the next call
    pre_v_dw, pre_v_db = avg_dw.copy(), avg_db.copy()

    # bias correction
    correction = 1 - beta2 ** t
    return avg_dw / correction, avg_db / correction

In [None]:
# Implementing our activation Function (Sigmoid Function)
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)); NumPy applies it element-wise to arrays."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [None]:
# Cross-entropy cost with L1 penalty, its gradient, and one optimizer update step
def costFunction(x, y, alpha, w, b, lamda, optimize_val,
                 t=1, beta1=0.9, beta2=0.999, epsilon=10e-8):
    """Evaluate the regularised cross-entropy cost and take one optimizer step.

    Parameters
    ----------
    x : feature matrix; rows are samples (``x.shape[0]`` is used as the sample count).
    y : target labels, one per row of ``x``.
    alpha : learning rate.
    w, b : current weights and bias.
    lamda : strength of the L1 penalty ``lamda * sum(|w|) / (2 * m)``.
    optimize_val : ``1`` selects the RMS prop update; any other value selects
        the AdaM update (momentum numerator, RMS prop denominator).
    t : step counter forwarded to ``momentum`` / ``rms_prop`` for bias
        correction. Defaults to 1, which reproduces the previous hard-coded
        behaviour; pass the real 1-based step number for the textbook correction.
    beta1, beta2 : decay rates forwarded to ``momentum`` and ``rms_prop``.
    epsilon : small constant inside the logs and square roots
        (note: the default ``10e-8`` equals ``1e-7``).

    Returns
    -------
    (cost, w, b) : the cost evaluated at the INCOMING ``w``/``b`` (before the
    update) and the updated weights and bias.
    """
    m = x.shape[0]

    z = np.dot(w, x.T) + b  # Net-Input
    phi = sigmoid(z)  # Activation Function

    # Cost function: mean binary cross-entropy + L1 penalty
    cross_entropy = np.mean(-y * np.log(phi + epsilon) - (1 - y) * np.log(1 - phi + epsilon))
    cost = cross_entropy + (lamda * np.sum(abs(w)) / (2 * m))

    # Compute Derivative (np.sign(w) is the sub-gradient of |w|)
    dj_dw = ((np.dot((phi - y).T, x)) / m) + ((lamda * np.sign(w)) / (m * 2))
    dj_db = np.mean(phi - y)

    if optimize_val == 1:
        # RMS prop: the raw gradient is scaled by the running RMS
        step_dw, step_db = dj_dw, dj_db
    else:
        # AdaM: the momentum-averaged gradient is scaled by the running RMS
        step_dw, step_db = momentum(w, b, dj_dw, dj_db, t, beta1)
    v_dw, v_db = rms_prop(w, b, dj_dw, dj_db, t, beta2)

    # Update Weights & Bias
    w = w - (alpha * (step_dw / (np.sqrt(v_dw + epsilon))))
    b = b - (alpha * (step_db / (np.sqrt(v_db + epsilon))))

    return cost, w, b


In [None]:
# Implementing the Logistic Regression
def gradientDescent(X, Y, alpha, num_iterations,
                    lambda_val=None, optimize_val=None, batch_size=None):
    """Train logistic-regression weights with mini-batch updates.

    Parameters
    ----------
    X : feature matrix, one sample per row.
    Y : labels, one per row of ``X``.
    alpha : learning rate passed to ``costFunction``.
    num_iterations : number of passes (epochs) over the shuffled data.
    lambda_val : regularisation strength; asked interactively when ``None``.
    optimize_val : ``1`` = RMS prop, ``2`` = AdaM; asked interactively when ``None``.
    batch_size : samples per mini-batch; asked interactively when ``None``.

    Returns
    -------
    (cost, w, b) : cost reported for the last processed batch (``0.0`` if no
    step was taken) and the final weights and bias.

    Raises
    ------
    ValueError : if the batch size is not a positive integer.
    """
    global pre_v_dw, pre_v_db, pre_m_dw, pre_m_db

    #Define Parameters
    cost = 0.0
    w = np.random.randn(X.shape[1])
    b = np.random.randn()

    # Every training run starts from fresh optimizer state; otherwise the
    # running averages left behind by a previous run leak into this one and
    # make runs with different settings incomparable.
    pre_v_dw, pre_v_db = np.zeros(X.shape[1]), 0
    pre_m_dw, pre_m_db = np.zeros(X.shape[1]), 0

    # Getting the lambda value from the user
    if lambda_val is None:
        lambda_val = float(input("\nEnter the lambda value: "))

    # Getting the optimization method from the user
    if optimize_val is None:
        optimize_val = int(input("\n1.RMS prop Optimizer \n2.AdaM Optimizer \nSelect one of the above optimizers: "))

    # Getting the batch size from the user
    n = X.shape[0]
    if batch_size is None:
        batch_size = int(input("\nEnter the batch size: "))
    if batch_size <= 0:
        raise ValueError("batch size must be a positive integer")

    # A batch size larger than the data set degrades to one full batch
    # instead of silently skipping training.
    num_batch = max(1, n // batch_size)

    # Shuffle the data
    indices = np.arange(n)
    np.random.shuffle(indices)

    # Applying Mini-Batch
    for _ in range(num_iterations):
        for batch in range(num_batch):
            start = batch * batch_size
            # The last batch also takes the n % batch_size leftover samples
            end = n if batch == num_batch - 1 else start + batch_size

            # Fix: train on the mini-batch itself. The previous code trained on
            # every sample EXCEPT the batch and never used the batch at all.
            batch_indices = indices[start:end]
            cost, w, b = costFunction(X[batch_indices], Y[batch_indices],
                                      alpha, w, b, lambda_val, optimize_val)

    return cost, w, b

In [None]:
# Function that calculates the prediction
def predict(x, w, b):
    """Label each row of `x` as 1 when sigmoid(w . x + b) >= 0.5, otherwise 0.

    Returns the labels as a flattened (1-D) array.
    """
    probabilities = sigmoid(np.dot(w, x.T) + b)  # activation of the net input
    labels = np.where(probabilities >= 0.5, 1, 0)
    return labels.flatten()

In [None]:
def result_fn(X, Y, alpha=0.01, num_iterations=20):
    """Train on (X, Y) and print the accuracy on that SAME data.

    Parameters
    ----------
    X, Y : features and labels used for both training and evaluation, so the
        printed figure is a training accuracy, not a held-out score.
    alpha : learning rate passed to ``gradientDescent`` and echoed in the report.
    num_iterations : number of epochs passed to ``gradientDescent``.

    Returns nothing; the result is only printed.
    """
    _cost, w, b = gradientDescent(X, Y, alpha, num_iterations)
    pred = predict(X, w, b)
    accuracy_score = accuracy(pred, Y)
    print("Learning Rate: " + str(alpha) + " , Accuracy: " + str(accuracy_score))


In [None]:
# Lambda value of 0.01 (lambda, optimizer and batch size are typed in at the prompts)
result_fn(X_train, Y_train)


Enter the lambda value: 0.01

1.RMS prop Optimizer 
2.AdaM Optimizer 
Select one of the above optimizers: 1

Enter the batch size: 64
Learning Rate: 0.01 , Accuracy: 91.94630872483222


In [None]:
# Lambda value of 0.1 (lambda, optimizer and batch size are typed in at the prompts)
result_fn(X_train, Y_train)


Enter the lambda value: 0.1

1.RMS prop Optimizer 
2.AdaM Optimizer 
Select one of the above optimizers: 1

Enter the batch size: 64
Learning Rate: 0.01 , Accuracy: 93.24121594946703


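As shown above, when we increased the lambda value from 0.01 to 0.1, the accuracy also increased (91.95% to 93.24%). Note, however, that the weights are initialised randomly and the data is shuffled on every run: repeating the run with lambda = 0.01 and batch size 64 below gives 99.41% and 99.59%, so this difference may be run-to-run variation rather than an effect of lambda.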

In [None]:
# Batch Size of 64 (lambda, optimizer and batch size are typed in at the prompts)
result_fn(X_train, Y_train)


Enter the lambda value: 0.01

1.RMS prop Optimizer 
2.AdaM Optimizer 
Select one of the above optimizers: 1

Enter the batch size: 64
Learning Rate: 0.01 , Accuracy: 99.40781681800237


In [None]:
# Batch Size of 256 (lambda, optimizer and batch size are typed in at the prompts)
result_fn(X_train, Y_train)


Enter the lambda value: 0.01

1.RMS prop Optimizer 
2.AdaM Optimizer 
Select one of the above optimizers: 1

Enter the batch size: 256
Learning Rate: 0.01 , Accuracy: 97.82866166600868


In [None]:
# Using RMS prop optimizer with lambda value of 0.01 and batch size of 64
# (all three settings are typed in at the prompts)
result_fn(X_train, Y_train)


Enter the lambda value: 0.01

1.RMS prop Optimizer 
2.AdaM Optimizer 
Select one of the above optimizers: 1

Enter the batch size: 64
Learning Rate: 0.01 , Accuracy: 99.58941966048164


In [None]:
# Using Adam optimizer with lambda value of 0.01 and batch size of 64
# (all three settings are typed in at the prompts)
result_fn(X_train, Y_train)


Enter the lambda value: 0.01

1.RMS prop Optimizer 
2.AdaM Optimizer 
Select one of the above optimizers: 2

Enter the batch size: 64
Learning Rate: 0.01 , Accuracy: 99.71575207264114
