# Key Elements
1. Convolution Operation
2. Convolution Layer
3. Reshape Layer
4. Binary Cross Entropy Loss
5. Sigmoid Activation
6. Solve MNIST 

### Cross-Correlation
<img src="Cross_correlation.png" width="600" height="300">

### Convolution
<img src="Convolution.png" width="600" height="300">

$conv(I, K) = I \ast K = I \star rot180(K)$


# Convolutional Layer
<img src="convolution_layer.png" width="700" height="300">
<div>
<img src="convolution_layer2.png" width="900" height="300">
</div>
<div>
<img src="convolution_forward.png" width="700" height="400">
</div>

# Connection to Dense Layer
<img src="convolution_dense_layer.png" width="600" height="300"> \
If $X_j$ and $K_{ij}$ are matrix of size $1 \times 1$, then this equation is same as the forward equation of the dense layer\
Where $K_{ij}$ are weights and $X_j$ are input

In [2]:
import numpy as np

# Base Layer
class Layer:
    def __init__(self):
        self.input = None
        self.output = None
    
    def forward(self, input):
        # TODO: return output
        pass

    def backward(self, output_gradient, learning_rate):
        # TODO: update parameters and return input gradient
        pass

class Dense(Layer):
    def __init__(self, input_size, output_size):
        self.weights = np.random.randn(output_size, input_size)
        self.bias = np.random.randn(output_size, 1)
    
    def forward(self, input):
        self.input = input
        return np.dot(self.weights, self.input) + self.bias
    
    def backward(self, output_gradient, learning_rate):
        # TODO: update parameters and return input gradient
        weight_gradient = np.dot(output_gradient, self.input.T)
        self.weights -= learning_rate * weight_gradient
        self.bias -= learning_rate * output_gradient

        return np.dot(self.weights.T, output_gradient)

class Activation(Layer):
    def __init__(self, activation, activation_prime):
        self.activation = activation
        self.activation_prime = activation
    
    def forward(self, input):
        self.input = input
        return self.activation(self.input)
    
    def backward(self, output_gradient, learning_rate):
        return np.multiply(output_gradient, self.activation_prime(self.input)) 

class Tanh(Activation):
    def __init__(self):
        tanh = lambda x:np.tanh(x)
        tanh_prime = lambda x : 1 - np.tanh(x) ** 2
        super().__init__(tanh, tanh_prime)

def mse(y_true, y_pred):
    return np.mean(np.power(y_true - y_pred, 2))

def mse_prime(y_true, y_pred):
    return 2 * (y_pred - y_true) / np.size(y_true)

In [1]:
import numpy as np
from scipy import signal

class Convolutional(Layer):
    # depth : number of kernels
    def __init__(self, input_shape, kernel_size, depth):
        input_depth, input_height, input_width = input_shape
        self.depth = depth
        self.input_shape = input_shape
        self.input_depth = input_depth
        self.output_shape = (depth, input_height - kernel_size + 1, input_width - kernel_size + 1)
        self.kernels_shape = (depth, input_depth, kernel_size, kernel_size)

        self.kernels = np.random.randn(*self.kernels_shape)
        self.biases = np.random.randn(*self.output_shape)
    
    def forward(self, input):
        self.input = input
        self.output = np.copy(self.biases) # adding bias happens here

        # depth is number of kernel
        for i in range(self.depth):
            for j in range(self.input.depth):
                self.output[i] += signal.correlate2d(self.input[j], self.kernels[i, j], "valid")

        return self.output

# Forward Propagation

### $ Y_i = B_i + \sum_{j=1}^{n} X_j \star K_{ij}$
### $ Y_i = B_i + X_1 \star K_{i1} + \dots + X_n \star K_{in}$

# Backward of convolution layer

## $\frac{\partial E}{\partial K_{ij}}$
Looking at simple cropped version.. \
<img src="conv_backward1.png" width="600" height="300">\
<img src="conv_backward2.png" width="600" height="300">\
<img src="conv_backward3.png" width="600" height="300">
<img src="conv_backward4.png" width="300" height="150">

This is equal to **$\frac{\partial X}{\partial K} = X \star \frac{\partial E}{\partial Y}$**

**$Y = B + X \star K \implies \frac{\partial E}{\partial K} = X \star \frac{\partial E}{\partial Y} $**


Take an example of $\frac{\partial E}{\partial K_{21}}$
<img src="conv_back5.png" width="330" height="150">\

$\frac{\partial E}{\partial K_{21}} = \frac{\partial E}{\partial Y_2} \cdot \frac{\partial Y_2}{\partial K_{21}}$ which we can apply above formula

$\frac{\partial E}{\partial K_{21}} = X_1 \star \frac{\partial E}{\partial Y_2} \implies \frac{\partial E}{\partial K_{ij}} = X_j \star \frac{\partial E}{\partial Y_i}$

## $\frac{\partial E}{\partial B_i}$
<div>
<img src="conv_back6.png" width="600" height="300">
<img src="conv_back7.png" width="300" height="150">
</div>
<img src="conv_back8.png" width="600" height="300">

## $\frac{\partial E}{\partial X_i}$
<div>
<img src="conv_back10.png" width="300" height="150">
<img src="conv_back9.png" width="700" height="350">
</div>
<img src="conv_back11.png" width="400" height="100">

which is the very definition of the 'Full Convolution,

**$ \frac{\partial E}{\partial X} = \frac{\partial E}{\partial Y} \ast_{F} K$**

<img src="conv_back12.png" width="600" height="300">





### 3 Equations for Backward Convolution Layer
- $\frac{\partial E}{\partial K_{ij}} = X_j \star \frac{\partial E}{\partial Y_i}$

- $\frac{\partial E}{\partial B_i} = \frac{\partial E}{\partial Y_i}$

- $\frac{\partial E}{\partial X_j} = \sum_{i=1}^{n} \frac{\partial E}{\partial Y_i} \ast_{F} K_{ij}$

In [3]:
import numpy as np
from scipy import signal

class Convolutional(Layer):
    # depth : number of kernels
    def __init__(self, input_shape, kernel_size, depth):
        input_depth, input_height, input_width = input_shape
        self.depth = depth
        self.input_shape = input_shape
        self.input_depth = input_depth
        self.output_shape = (depth, input_height - kernel_size + 1, input_width - kernel_size + 1)
        self.kernels_shape = (depth, input_depth, kernel_size, kernel_size)

        self.kernels = np.random.randn(*self.kernels_shape)
        self.biases = np.random.randn(*self.output_shape)
    
    def forward(self, input):
        self.input = input
        self.output = np.copy(self.biases) # adding bias happens here

        # depth is number of kernel
        for i in range(self.depth):
            for j in range(self.input_depth):
                self.output[i] += signal.correlate2d(self.input[j], self.kernels[i, j], "valid")

        return self.output
    
    def backward(self, output_gradient, learning_rate):
        # dE /dK
        kernels_gradient = np.zeros(self.kernels_shape)
        input_gradient = np.zeros(self.input_shape)

        for i in range(self.depth):
            for j in range(self.input_depth):
                kernels_gradient[i, j] = signal.correlate2d(self.input[j], output_gradient[i], "valid")
                input_gradient[j] += signal.convolve2d(output_gradient[i], self.kernels[i, j], "full")
        self.kernels -= learning_rate * kernels_gradient
        self.biases -= learning_rate * output_gradient

        return input_gradient

## Reshape Layer

In [4]:
# Reshape Layer
class Reshape(Layer):
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape

    def forward(self, input):
        return np.reshape(input, self.output_shape)
    
    def backward(self, output_gradeint, learning_rate):
        return np.reshape(output_gradeint, self.input_shape)


## Binary Cross Entropy
$Y^* = \begin{bmatrix} y_1^* \\ y_2^* \\ \vdots \\ y_i^*\end{bmatrix} \qquad Y = \begin{bmatrix} y_1 \\ y_2 \\ \vdots \\y_i \end{bmatrix}$

$E = -\frac{1}{n} \sum_{i=1}^n y_i^*\log(y_i) + (1 - y_i^*)\log (1-y_i)$

$\frac{\partial E}{\partial y_i} = \frac{1}{n}(\frac{1-y_i^*}{1 - y_i} - \frac{y^*}{y_i})$

<img src="binary_entropy.png" width="600" height="350">


In [5]:
def binary_cross_entropy(y_true, y_pred):
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true)* np.log(1 - y_pred))

def binary_cross_entropy_prime(y_true, y_pred):
    return ((1 - y_true) / (1 - y_pred) - y_true / y_pred) / np.size(y_true)

## Sigmoid Activation
<img src="sigmoid.png" width="600" height="300">

$\alpha(x) = \frac{1}{1 + e^{-x}}$

$\alpha'(x) = \alpha(x) * (1-\alpha(x))$

In [6]:
class Sigmoid(Activation):
    def __init__(self):
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))
        
        def sigmoid_prime(x):
            s = sigmoid(x)
            return s * (1 - s)
        
        super().__init__(sigmoid, sigmoid_prime)


In [37]:
import numpy as np

# Base Layer
class Layer:
    def __init__(self):
        self.input = None
        self.output = None
    
    def forward(self, input):
        # TODO: return output
        pass

    def backward(self, output_gradient, learning_rate):
        # TODO: update parameters and return input gradient
        pass

class Dense(Layer):
    def __init__(self, input_size, output_size):
        self.weights = np.random.randn(output_size, input_size)
        self.bias = np.random.randn(output_size, 1)
    
    def forward(self, input):
        self.input = input
        return np.dot(self.weights, self.input) + self.bias
    
    def backward(self, output_gradient, learning_rate):
        # TODO: update parameters and return input gradient
        weight_gradient = np.dot(output_gradient, self.input.T)
        self.weights -= learning_rate * weight_gradient
        self.bias -= learning_rate * output_gradient

        return np.dot(self.weights.T, output_gradient)

class Activation(Layer):
    def __init__(self, activation, activation_prime):
        self.activation = activation
        self.activation_prime = activation
    
    def forward(self, input):
        self.input = input
        return self.activation(self.input)
    
    def backward(self, output_gradient, learning_rate):
        return np.multiply(output_gradient, self.activation_prime(self.input)) 

class Tanh(Activation):
    def __init__(self):
        tanh = lambda x:np.tanh(x)
        tanh_prime = lambda x : 1 - np.tanh(x) ** 2
        super().__init__(tanh, tanh_prime)

def mse(y_true, y_pred):
    return np.mean(np.power(y_true - y_pred, 2))

def mse_prime(y_true, y_pred):
    return 2 * (y_pred - y_true) / np.size(y_true)


from scipy import signal

class Convolutional(Layer):
    # depth : number of kernels
    def __init__(self, input_shape, kernel_size, depth):
        input_depth, input_height, input_width = input_shape
        self.depth = depth
        self.input_shape = input_shape
        self.input_depth = input_depth
        self.output_shape = (depth, input_height - kernel_size + 1, input_width - kernel_size + 1)
        self.kernels_shape = (depth, input_depth, kernel_size, kernel_size)

        self.kernels = np.random.randn(*self.kernels_shape)
        self.biases = np.random.randn(*self.output_shape)
    
    def forward(self, input):
        self.input = input
        self.output = np.copy(self.biases) # adding bias happens here

        # depth is number of kernel
        for i in range(self.depth):
            for j in range(self.input_depth):
                self.output[i] += signal.correlate2d(self.input[j], self.kernels[i, j], "valid")

        return self.output
    
    def backward(self, output_gradient, learning_rate):
        # dE /dK
        kernels_gradient = np.zeros(self.kernels_shape)
        input_gradient = np.zeros(self.input_shape)

        for i in range(self.depth):
            for j in range(self.input_depth):
                kernels_gradient[i, j] = signal.correlate2d(self.input[j], output_gradient[i], "valid")
                input_gradient[j] += signal.convolve2d(output_gradient[i], self.kernels[i, j], "full")
        self.kernels -= learning_rate * kernels_gradient
        self.biases -= learning_rate * output_gradient

        return input_gradient

def binary_cross_entropy(y_true, y_pred):
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true)* np.log(1 - y_pred))

def binary_cross_entropy_prime(y_true, y_pred):
    return ((1 - y_true) / (1 - y_pred) - y_true / y_pred) / np.size(y_true)

class Reshape(Layer):
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape

    def forward(self, input):
        return np.reshape(input, self.output_shape)
    
    def backward(self, output_gradeint, learning_rate):
        return np.reshape(output_gradeint, self.input_shape)

class Sigmoid(Activation):
    def __init__(self):
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        def sigmoid_prime(x):
            s = sigmoid(x)
            return s * (1 - s)

        super().__init__(sigmoid, sigmoid_prime)

In [38]:
# Solve MNISt
import numpy as np
from keras.datasets import mnist
from keras.utils import np_utils

def preprocess_data(x, y, limit):
    zero_index = np.where(y == 0)[0][:limit]
    one_index = np.where(y == 1)[0][:limit]

    all_indices = np.hstack((zero_index, one_index))
    all_indices = np.random.permutation(all_indices)
    x, y = x[all_indices], y[all_indices]

    x = x.reshape(len(x), 1, 28, 28)
    x == x.astype("float32")/255

    y = np_utils.to_categorical(y)
    y = y.reshape(len(y), 2, 1)

    # x dim = (size of sample, 1, 28, 28)
    # y dim = (size of sample, 2, 1) ex) [ [[0], [1]], [[1], [0]], ... ]
    return x, y

# load MNIST from server, limit to 100 images per class since we're not training on GPU
(x_train,y_train), (x_test, y_test) = mnist.load_data()

x_train, y_train = preprocess_data(x_train, y_train, 100)
x_test, y_test = preprocess_data(x_test, y_test, 100)

# neural network
input_w = 28
input_h = 28
input_c = 1
kernel_s = 3
kernel_num = 5 # output depth(channel)

out_w = input_w - kernel_s + 1
out_h = input_h - kernel_s + 1

network = [
    Convolutional((input_c, input_w, input_h), kernel_s, kernel_num),
    Sigmoid(),
    Reshape((kernel_num, out_w, out_h), (kernel_num * out_w * out_h, 1)),

    Dense(kernel_num * out_w * out_h, 100),
    Sigmoid(),

    Dense(100, 2),
    Sigmoid()
]

epochs = 20
learning_rate = 0.1

# train
for e in range(epochs):
    error = 0
    for x, y in zip(x_train, y_train):
        # forward
        output = x
        for layer in network:
            output = layer.forward(output)
        
        # error
        error += binary_cross_entropy(y, output)

        # backward
        grad = binary_cross_entropy_prime(y, output)
        for layer in reversed(network):
            grad = layer.backward(grad, learning_rate)

    error /= len(x_train)
    print(f"{e + 1}/{epochs}, error={error}")

# test
for x, y in zip(x_test, y_test):
    output = x
    for layer in network:
        output = layer.forward(output)
    print(f"pred: {np.argmax(output)}, true: {np.argmax(y)}")
    




  return 1 / (1 + np.exp(-x))
  return -np.mean(y_true * np.log(y_pred) + (1 - y_true)* np.log(1 - y_pred))
  return -np.mean(y_true * np.log(y_pred) + (1 - y_true)* np.log(1 - y_pred))
  return ((1 - y_true) / (1 - y_pred) - y_true / y_pred) / np.size(y_true)


1/20, error=nan
2/20, error=nan
3/20, error=nan
4/20, error=nan
5/20, error=nan
6/20, error=nan
7/20, error=nan
8/20, error=nan
9/20, error=nan
10/20, error=nan
11/20, error=nan
12/20, error=nan
13/20, error=nan
14/20, error=nan
15/20, error=nan
16/20, error=nan
17/20, error=nan
18/20, error=nan
19/20, error=nan
20/20, error=nan
pred: 0, true: 0
pred: 0, true: 1
pred: 0, true: 0
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 0
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 0
pred: 0, true: 0
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 0
pred: 0, true: 0
pred: 0, true: 0
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 0
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 0
pred: 0, true: 0
pred: 0, true: 0
pred: 0, true: 1
pred: 0, true: 0
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 0
pred: 0, true: 1
pred: 0, true: 1
pred: 0, true: 0
pred: 0, true: 0
pred: 0, true: 1
pred: 0, true: 0
pred: 