#**Importing tools**

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import signal

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the dataset is read from
# a path under this mount point further down.
drive.mount('/content/drive')

Mounted at /content/drive


#**Layer class**

In [3]:
class Layer:
  """Common interface for every layer of the network.

  Subclasses override `forward_prop` and `backward_prop`; the base versions
  do nothing and return None.
  """

  def __init__(self):
    # Slots for the most recent input / output of the layer.
    self.input = self.output = None

  def forward_prop(self, input):
    """Compute the layer output for `input` (no-op in the base class)."""
    return None

  def backward_prop(self, output_gradient, learning_rate):
    """Propagate `output_gradient` backwards (no-op in the base class)."""
    return None

#**Convolution class for convolutional layers**

In [4]:
class Convolution(Layer):
  """Unpadded ("valid") 2-D cross-correlation layer.

  Every output element has its own bias, so `biases` has the same shape as
  the layer output.
  """

  def __init__(self, input_shape, kernel_size, depth):
    """
    input_shape: (channels, height, width) of one input sample.
    kernel_size: side length of the square kernels.
    depth:       number of kernels, i.e. the number of output channels.
    """
    in_channels, in_rows, in_cols = input_shape
    out_rows = in_rows - kernel_size + 1
    out_cols = in_cols - kernel_size + 1

    self.input_shape = input_shape
    self.depth = depth
    self.input_depth = in_channels
    self.output_shape = (depth, out_rows, out_cols)
    self.kernels_shape = (depth, in_channels, kernel_size, kernel_size)
    # Uniform values in [-0.5, 0.5); kernels are drawn before biases.
    self.kernels = np.random.rand(*self.kernels_shape) - 0.5
    self.biases = np.random.rand(*self.output_shape) - 0.5

  def forward_prop(self, input):
    """Return biases plus the sum over input channels of input (x) kernel."""
    self.input = input
    self.output = np.copy(self.biases)
    # np.ndindex walks (output channel, input channel) pairs in row-major order.
    for out_ch, in_ch in np.ndindex(self.depth, self.input_depth):
      response = signal.correlate2d(self.input[in_ch], self.kernels[out_ch, in_ch], "valid")
      self.output[out_ch] += response
    return self.output

  def backward_prop(self, output_gradient, learning_rate):
    """Apply one gradient-descent step and return the gradient w.r.t. the input."""
    kernels_gradient = np.zeros(self.kernels_shape)
    input_gradient = np.zeros(self.input_shape)

    for out_ch, in_ch in np.ndindex(self.depth, self.input_depth):
      grad_map = output_gradient[out_ch]
      kernels_gradient[out_ch, in_ch] = signal.correlate2d(self.input[in_ch], grad_map, "valid")
      # Gradients are computed with the kernels as they were before this update.
      input_gradient[in_ch] += signal.convolve2d(grad_map, self.kernels[out_ch, in_ch], "full")

    self.kernels -= learning_rate * kernels_gradient
    self.biases -= learning_rate * output_gradient
    return input_gradient

#**Max pooling class**

In [5]:
class MaxPool2D(Layer):
  """Max pooling over the two trailing axes of a (channels, height, width) array.

  Windows start at multiples of `stride`. A window that runs past the edge of
  the input is truncated to the part that lies inside it, so e.g. an 11x11
  map pooled with size 2 / stride 2 yields a 6x6 map.
  """

  def __init__(self, size = 2, stride = 2):
    """
    size:   side length of the square pooling window.
    stride: step between the starts of consecutive windows.
    """
    super().__init__()
    self.size = size
    self.stride = stride
    self.original = None  # not used by this class; kept so existing attribute reads still work

  @staticmethod
  def _output_length(length, size, stride):
    """Number of pooling windows along an axis with `length` elements."""
    # ceil((length - size) / stride) + 1, in exact integer arithmetic.
    # (The previous round()-based formula used round-half-to-even and
    # disagreed with the pooling loop for many odd lengths.)
    count = max(-((size - length) // stride) + 1, 1)
    # Drop a trailing window that would start outside the input.
    if (count - 1) * stride >= length:
      count -= 1
    return count

  def forward_prop(self, input):
    """Return the pooled array of shape (channels, out_height, out_width)."""
    C, H, W = input.shape
    size, S = self.size, self.stride
    self.input = input

    HH = self._output_length(H, size, S)
    WW = self._output_length(W, size, S)
    self.output = np.zeros((C, HH, WW))

    # Iterate over output positions so the indices can never leave the output;
    # all channels of one window are reduced in a single call.
    for r in range(HH):
      for c in range(WW):
        window = input[:, r*S:r*S+size, c*S:c*S+size]
        self.output[:, r, c] = np.max(window, axis=(1, 2))

    return self.output

  def backward_prop(self, output_gradient, learning_rate):
    """Route each output gradient to the maximal element(s) of its window.

    `learning_rate` is unused (the layer has no parameters). If several
    elements of a window tie for the maximum, each receives the full gradient.
    """
    size, S = self.size, self.stride
    _, HH, WW = output_gradient.shape

    input_gradient = np.zeros(self.input.shape)

    for r in range(HH):
      for c in range(WW):
        rows = slice(r*S, r*S+size)
        cols = slice(c*S, c*S+size)
        x_pool = self.input[:, rows, cols]
        mask = (x_pool == np.max(x_pool, axis=(1, 2), keepdims=True))
        # Accumulate: with stride < size the windows overlap and an element
        # can receive gradient from more than one output position.
        input_gradient[:, rows, cols] += mask * output_gradient[:, r, c][:, None, None]

    return input_gradient



#**Reshape class**

In [6]:
class Reshape(Layer):
  """Parameter-free layer that changes the array shape between two layers."""

  def __init__(self, input_shape, output_shape):
    """
    input_shape:  shape restored on the backward pass.
    output_shape: shape produced on the forward pass.
    """
    self.input_shape, self.output_shape = input_shape, output_shape

  def forward_prop(self, input):
    """Return `input` viewed with `output_shape`."""
    reshaped = np.reshape(input, self.output_shape)
    return reshaped

  def backward_prop(self, output_gradient, learning_rate):
    """Return the gradient viewed with `input_shape`; `learning_rate` is unused."""
    restored = np.reshape(output_gradient, self.input_shape)
    return restored

#**Dense class for normal hidden layers**

In [7]:
class Dense(Layer):
  """Fully connected layer computing weights . input + bias on column vectors."""

  def __init__(self, input_size, output_size):
    """Draw weights (output_size x input_size) and bias (output_size x 1) uniformly from [-0.5, 0.5)."""
    weight_shape = (output_size, input_size)
    # Weights are drawn before the bias.
    self.weights = np.random.rand(*weight_shape) - 0.5
    self.bias = np.random.rand(output_size, 1) - 0.5

  def forward_prop(self, input):
    """Store `input` for the backward pass and return the affine output."""
    self.input = input
    projected = np.dot(self.weights, self.input)
    return projected + self.bias

  def backward_prop(self, output_gradient, learning_rate):
    """Update weights and bias in place; return the gradient w.r.t. the input."""
    # Both gradients are taken with the weights as they were before the update.
    input_gradient = np.dot(self.weights.T, output_gradient)
    weights_gradient = np.dot(output_gradient, self.input.T)

    self.weights -= learning_rate * weights_gradient
    self.bias -= learning_rate * output_gradient
    return input_gradient

#**Activation class**

In [8]:
class Activation(Layer):
  """Layer that applies a function to its input, given the function and its derivative."""

  def __init__(self, activation, activation_prime):
    """
    activation:       callable applied on the forward pass.
    activation_prime: callable giving the derivative, evaluated at the stored input.
    """
    self.activation, self.activation_prime = activation, activation_prime

  def forward_prop(self, input):
    """Store `input` and return activation(input)."""
    self.input = input
    return self.activation(input)

  def backward_prop(self, output_gradient, learning_rate):
    """Return output_gradient * activation'(input); `learning_rate` is unused."""
    local_slope = self.activation_prime(self.input)
    return np.multiply(output_gradient, local_slope)

#**ReLU and Softmax classes**

In [9]:
class ReLU(Activation):
  """Rectified linear unit: max(x, 0) forward, the boolean mask (x > 0) as derivative."""

  def __init__(self):
    super().__init__(
      lambda x: np.maximum(x, 0),
      lambda x: x > 0,
    )
class Softmax(Layer):
  """Softmax output layer, normalising along axis 0."""

  def forward_prop(self, input):
    """Return exp(input) normalised to sum to 1 along axis 0.

    The maximum is subtracted first so that np.exp cannot overflow.
    """
    # NumPy reductions replace the Python builtins max()/sum(), which loop
    # over rows in Python and only work for single-column input.
    shifted = input - np.max(input, axis=0, keepdims=True)
    exps = np.exp(shifted)
    self.output = exps / np.sum(exps, axis=0, keepdims=True)
    return self.output

  def backward_prop(self, output_gradient, learning_rate):
    """Return J . output_gradient, where J[i, j] = y_i * (delta_ij - y_j).

    Uses the output of the last forward pass, which must hold a single
    sample; `learning_rate` is unused.
    """
    # An explicit column view keeps the Jacobian orientation right even if
    # the stored output is 1-D.
    probs = np.reshape(self.output, (-1, 1))
    n = probs.shape[0]
    jacobian = probs * (np.identity(n) - probs.T)
    return np.dot(jacobian, output_gradient)

#**Loss, Train, Predict functions**

In [10]:
def get_prediction(A):
  """Return the index of the largest entry of `A` along axis 0."""
  winner = np.argmax(A, axis=0)
  return winner

def get_accuracy(Y_pred, Y_real):
  """Return the fraction of positions where `Y_pred` equals `Y_real`.

  Both arguments may be arrays or (nested) sequences; they are flattened
  before comparison so that e.g. shapes (n,) and (n, 1) are compared element
  by element instead of being broadcast into an (n, n) matrix.

  Raises:
    ValueError: if the two inputs do not hold the same number of labels.
  """
  predicted = np.asarray(Y_pred).ravel()
  actual = np.asarray(Y_real).ravel()
  if predicted.size != actual.size:
    raise ValueError(
      f"Y_pred has {predicted.size} labels but Y_real has {actual.size}"
    )
  return np.sum(predicted == actual) / actual.size

def binary_cross_entropy(y_true, y_pred, eps = 1e-12):
    """Mean binary cross-entropy between targets and predicted probabilities.

    y_true: array of 0/1 targets.
    y_pred: array of predicted probabilities, same shape as `y_true`.
    eps:    predictions are clipped to [eps, 1 - eps] so that a prediction of
            exactly 0 or 1 gives a large finite loss instead of inf/nan.
    """
    y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    return np.mean(-y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred))

def binary_cross_entropy_prime(y_true, y_pred, eps = 1e-12):
    """Gradient of the mean binary cross-entropy with respect to `y_pred`.

    y_true: array of 0/1 targets.
    y_pred: array of predicted probabilities, same shape as `y_true`.
    eps:    predictions are clipped to [eps, 1 - eps] so that a prediction of
            exactly 0 or 1 cannot cause a division by zero.
    """
    y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    return ((1 - y_true) / (1 - y_pred) - y_true / y_pred) / np.size(y_true)

def predict(network, input):
    """Feed `input` through every layer of `network` in order and return the final output."""
    activation = input
    for stage in network:
      activation = stage.forward_prop(activation)
    return activation

def train(network, loss, loss_prime, x_train, y_train, epochs = 1000, learning_rate = 0.01, verbose = True):
  """Train `network` in place, updating the layers after every sample.

  network:       sequence of layers exposing forward_prop / backward_prop.
  loss:          callable (y_true, y_pred) -> scalar loss.
  loss_prime:    callable (y_true, y_pred) -> gradient of the loss w.r.t. y_pred.
  x_train:       iterable of input samples.
  y_train:       iterable of targets, aligned with `x_train`.
  epochs:        number of passes over the training data.
  learning_rate: step size handed to every layer's backward_prop.
  verbose:       when true, print mean loss and accuracy after each epoch.
  """
  for e in range(epochs):
    loss_sum = 0
    hits = 0
    for sample, target in zip(x_train, y_train):
      # Forward pass.
      output = predict(network, sample)
      loss_sum += loss(target, output)
      if get_prediction(output) == get_prediction(target):
        hits += 1

      # Backward pass: push the loss gradient through the layers, last to first.
      grad = loss_prime(target, output)
      for layer in reversed(network):
        grad = layer.backward_prop(grad, learning_rate)

    error = loss_sum / len(x_train)
    accuracy = hits / len(x_train)
    if verbose:
      print(f"{e + 1}/{epochs}, error={error}, accuracy={accuracy}")

#**Training the CNN**

In [12]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from keras.utils import to_categorical
# Load the MNIST training CSV from the mounted Drive.
data = pd.read_csv("/content/drive/MyDrive/kaggle/MNIST_Dataset/train.csv")
# Convert the DataFrame to a NumPy array (the name `data` is rebound to the array).
data = np.array(data)
# Shuffle the rows in place. No seed is set in the visible cells, so the
# train/test split taken from this array differs from run to run.
np.random.shuffle(data)

def preprocess_data(data_array, n_test = 100, n_train = 900, n_classes = 10):
  """Split a label+pixel table into scaled train/test arrays.

  data_array: 2-D integer array; column 0 is the class label and the
              remaining 784 columns are the pixels of a 28x28 image.
  n_test:     the first `n_test` rows form the test set.
  n_train:    the following `n_train` rows form the training set.
  n_classes:  width of the one-hot training targets. It is fixed rather than
              inferred from the data, so a training subset that happens to
              miss the highest class still gets `n_classes` columns.

  Returns (Y_train, X_train, Y_test, X_test):
    Y_train: (n_train, n_classes, 1) one-hot targets (float64).
    X_train: (n_train, 1, 28, 28) pixels scaled to [0, 1].
    Y_test:  (n_test, 1) integer labels (not one-hot).
    X_test:  (n_test, 1, 28, 28) pixels scaled to [0, 1].
  """
  image_shape = (1, 28, 28)
  max_pixel = 255

  def split_columns(rows):
    # Separate the label column from the pixel columns and scale the pixels.
    labels = rows[:, 0]
    images = rows[:, 1:].reshape(len(rows), *image_shape) / max_pixel
    return labels, images

  labels_train, X_train = split_columns(data_array[n_test:n_test + n_train, :])
  labels_test, X_test = split_columns(data_array[:n_test, :])

  # One-hot encode by picking rows of the identity matrix.
  one_hot = np.eye(n_classes)[labels_train.astype(int)]
  Y_train = one_hot.reshape(len(labels_train), n_classes, 1)
  Y_test = labels_test.reshape(len(labels_test), 1)

  return Y_train, X_train, Y_test, X_test

# Build the train/test arrays used by the training and evaluation cells.
Y_train, X_train, Y_test, X_test = preprocess_data(data)

In [17]:
# Layer stack; the shape comments follow one (1, 28, 28) sample through it.
network = [
    # (1, 28, 28) -> (5, 26, 26): five 3x3 kernels, no padding.
    Convolution((1, 28, 28), 3, 5),
    # (5, 26, 26) -> (5, 13, 13): default 2x2 window, stride 2.
    MaxPool2D(),
    ReLU(),
    # (5, 13, 13) -> (10, 11, 11): ten 3x3 kernels, no padding.
    Convolution((5, 13, 13), 3, 10),
    # (10, 11, 11) -> (10, 6, 6): the last window in each direction covers
    # only the final row/column of the 11x11 map.
    MaxPool2D(),
    ReLU(),
    # Flatten to a (360, 1) column vector for the dense layers.
    Reshape((10, 6, 6), (10 * 6 * 6, 1)),
    Dense(10 * 6 * 6, 128),
    ReLU(),
    # One output per digit class, turned into probabilities by Softmax.
    Dense(128, 10),
    Softmax()
]

# Train with per-sample updates; binary cross-entropy is applied element-wise
# to the 10 softmax outputs against the one-hot target.
train(
    network,
    binary_cross_entropy,
    binary_cross_entropy_prime,
    X_train,
    Y_train,
    epochs=10,
    learning_rate=0.05
)

1/10, error=0.2548784780950899, accuracy=0.47333333333333333
2/10, error=0.1195104015309659, accuracy=0.7533333333333333
3/10, error=0.07396714696287807, accuracy=0.8555555555555555
4/10, error=0.04917908004492084, accuracy=0.9088888888888889
5/10, error=0.0317989665477027, accuracy=0.9488888888888889
6/10, error=0.020421381995795136, accuracy=0.9766666666666667
7/10, error=0.012158855686555623, accuracy=0.9888888888888889
8/10, error=0.007021267323670256, accuracy=0.9966666666666667
9/10, error=0.004562241648762289, accuracy=0.9977777777777778
10/10, error=0.0031179985857958414, accuracy=1.0


In [18]:
# Run the trained network on every held-out sample and collect the predicted class.
Y_predictions = []
# `y` is not used inside the loop; the labels are compared in get_accuracy below.
for x, y in zip(X_test, Y_test):
    output = predict(network, x)
    pred_number = get_prediction(output)
    Y_predictions.append(pred_number)

# Fraction of held-out samples whose predicted class equals the label.
get_accuracy(Y_predictions, Y_test)

0.9

The gap between the final training accuracy (1.0) and the test accuracy (0.9) indicates overfitting, possibly caused by the lack of convolution padding and by the number of kernels.