In [None]:
!pip install mnist

Collecting mnist
  Downloading mnist-0.2.2-py2.py3-none-any.whl (3.5 kB)
Installing collected packages: mnist
Successfully installed mnist-0.2.2


In [None]:
from ssl import ALERT_DESCRIPTION_CERTIFICATE_REVOKED
import mnist
import numpy as np

class Firstlayer:
    """Convolution layer: slides a bank of 3x3 filters over a 2D image.

    No padding is applied, so an (h, w) input produces an
    (h - 2, w - 2, num_filters) output.
    """

    def __init__(self, num_filters):
        """Create `num_filters` randomly initialised 3x3 filters.

        The standard-normal draws are divided by 9 (the number of weights in
        one 3x3 filter) so that the initial filter values start out small.
        """
        self.num_filters = num_filters
        self.filters = np.random.randn(num_filters, 3, 3) / 9

    def iterate_regions(self, image):
        """Yield `(region, row, col)` for every 3x3 window of the 2D `image`.

        `row` and `col` are the coordinates of the window's top-left corner.
        """
        rows, cols = image.shape
        for top in range(rows - 2):
            for left in range(cols - 2):
                yield image[top:top + 3, left:left + 3], top, left

    def forward(self, input):
        """Convolve `input` (2D array) with every filter.

        The input is cached on `self.last_input` for use by `backprop`.
        Returns an array of shape (h - 2, w - 2, num_filters).
        """
        self.last_input = input

        rows, cols = input.shape
        feature_maps = np.zeros((rows - 2, cols - 2, self.num_filters))

        for patch, top, left in self.iterate_regions(input):
            # Broadcasting multiplies the 3x3 patch with each filter; the sum
            # over the two spatial axes leaves one value per filter.
            feature_maps[top, left] = (patch * self.filters).sum(axis=(1, 2))

        return feature_maps

    def backprop(self, d_L_d_out, learn_rate):
        """Update the filters by one gradient-descent step.

        - d_L_d_out: loss gradient with respect to this layer's output,
          indexed as [row, col, filter].
        - learn_rate: step size applied to the filter gradients.

        Returns None: this is the first layer of the network, so there is no
        earlier layer that needs the gradient with respect to the input.
        """
        filter_grads = np.zeros(self.filters.shape)
        filter_ids = range(self.num_filters)

        for patch, top, left in self.iterate_regions(self.last_input):
            upstream = d_L_d_out[top, left]
            for k in filter_ids:
                # Scalar upstream gradient times the 3x3 patch that produced
                # output[top, left, k].
                filter_grads[k] += upstream[k] * patch

        self.filters -= learn_rate * filter_grads

        return None
        
class Secondlayer:
    """Max-pooling layer with a 2x2 window and stride 2.

    Applied to the 3D output of the convolution layer; each spatial dimension
    is halved (a trailing odd row/column is ignored).
    """

    def iterate_regions(self, image):
        '''
        Yield `(region, i, j)` for every non-overlapping 2x2 window of `image`.

        - image is a 3d numpy array (h, w, num_filters), the first layer's output.
        - region has shape (2, 2, num_filters); (i, j) is the window's position
          in the pooled output, so its top-left input pixel is (2 * i, 2 * j).
        '''
        h, w, _ = image.shape
        new_h = h // 2
        new_w = w // 2

        for i in range(new_h):
            for j in range(new_w):
                im_region = image[(i * 2):(i * 2 + 2), (j * 2):(j * 2 + 2)]
                yield im_region, i, j

    def forward(self, input):
        '''
        Performs a forward pass of the max-pooling layer using the given input.
        Returns a 3d numpy array with dimensions (h // 2, w // 2, num_filters).
        - input is a 3d numpy array with dimensions (h, w, num_filters)

        The input is cached on `self.last_input` for use by `backprop`.
        '''
        self.last_input = input

        h, w, num_filters = input.shape
        output = np.zeros((h // 2, w // 2, num_filters))

        for im_region, i, j in self.iterate_regions(input):
            # Maximum over the two spatial axes, independently per filter.
            output[i, j] = np.amax(im_region, axis=(0, 1))

        return output

    def backprop(self, d_L_d_out):
        '''
        Performs a backward pass of the max-pooling layer.
        Returns the loss gradient with respect to this layer's input, with the
        same shape as the cached input.
        - d_L_d_out is the loss gradient with respect to this layer's output,
          with dimensions (h // 2, w // 2, num_filters).

        Only the input pixels that were the maximum of their 2x2 window receive
        gradient (ties all receive it); every other entry stays zero.
        '''
        d_L_d_input = np.zeros(self.last_input.shape)

        for im_region, i, j in self.iterate_regions(self.last_input):
            h, w, f = im_region.shape
            amax = np.amax(im_region, axis=(0, 1))

            for i2 in range(h):
                for j2 in range(w):
                    for f2 in range(f):
                        # Route the gradient to the pixel that was the max.
                        if im_region[i2, j2, f2] == amax[f2]:
                            # (i, j) index the pooled output; the window starts
                            # at (2 * i, 2 * j) in the input, so the stride must
                            # be applied when mapping back to input coordinates.
                            d_L_d_input[i * 2 + i2, j * 2 + j2, f2] = d_L_d_out[i, j, f2]

        return d_L_d_input
        
class Softmax:
    """Fully-connected layer followed by a softmax activation.

    Softmax turns arbitrary real values into probabilities that sum to 1.
    """

    def __init__(self, input_len, nodes):
        """Initialise weights and biases.

        - input_len: number of input values (length of the flattened input)
        - nodes: number of output nodes (classes)

        The random weights are divided by `input_len` so the initial totals
        have a small spread.
        """
        self.weights = np.random.randn(input_len, nodes) / input_len
        self.biases = np.zeros(nodes)

    def forward(self, input):
        '''
        Performs a forward pass of the softmax layer using the given input.
        Returns a 1d numpy array containing the respective probability values.
        - input can be any array with any dimensions; it is flattened first.

        The input shape, the flattened input and the pre-softmax totals are
        cached on the instance for use by `backprop`.
        '''
        # Remember the original shape so backprop can restore it.
        self.last_input_shape = input.shape

        # Flatten to a 1d vector for the dense product.
        input = input.flatten()
        self.last_input = input

        totals = np.dot(input, self.weights) + self.biases

        # Pre-activation values, needed for the softmax derivative.
        self.last_totals = totals

        # Subtracting the maximum leaves the softmax value unchanged but keeps
        # np.exp from overflowing to inf (which would yield nan probabilities).
        exp = np.exp(totals - np.max(totals))
        return exp / np.sum(exp, axis=0)

    def backprop(self, d_L_d_out, learn_rate):
        '''
        Performs a backward pass of the softmax layer and updates the weights
        and biases by one gradient-descent step.
        Returns the loss gradient with respect to this layer's input, reshaped
        to the shape of the input given to `forward`.
        - d_L_d_out is the loss gradient with respect to this layer's output;
          only its first nonzero entry is used.
        - learn_rate is the step size.

        If every entry of d_L_d_out is zero, nothing is updated and an
        all-zero gradient is returned.
        '''
        for i, gradient in enumerate(d_L_d_out):
            # Skip entries that carry no gradient; i ends up being the index
            # of the first nonzero entry.
            if gradient == 0:
                continue

            # e^totals, shifted by the maximum for numerical stability; the
            # shift cancels out in every ratio below.
            t_exp = np.exp(self.last_totals - np.max(self.last_totals))

            # Sum of all e^totals
            S = np.sum(t_exp)

            # Gradients of out[i] against totals: off-diagonal case first...
            d_out_d_t = -t_exp[i] * t_exp / (S ** 2)
            # ...then overwrite the diagonal entry.
            d_out_d_t[i] = t_exp[i] * (S - t_exp[i]) / (S ** 2)

            # Gradients of totals against weights / biases / input
            d_t_d_w = self.last_input
            d_t_d_b = 1
            d_t_d_inputs = self.weights

            # Gradients of loss against totals
            d_L_d_t = gradient * d_out_d_t

            # Gradients of loss against weights / biases / input
            d_L_d_w = d_t_d_w[np.newaxis].T @ d_L_d_t[np.newaxis]
            d_L_d_b = d_L_d_t * d_t_d_b
            # Computed before the update so it uses the weights of the forward pass.
            d_L_d_inputs = d_t_d_inputs @ d_L_d_t

            # Update weights / biases
            self.weights -= learn_rate * d_L_d_w
            self.biases -= learn_rate * d_L_d_b

            # Reshape for the preceding (pooling) layer.
            return d_L_d_inputs.reshape(self.last_input_shape)

        # No nonzero upstream gradient: pass back zeros instead of None.
        return np.zeros(self.last_input_shape)
            
# Evaluate on the first 10,000 test images (the author's notes describe this as
# the whole test set). Observation from the author: with only 1,000 test images
# the measured accuracy was below 80%, while on all of them it did not drop but
# rose to about 90%.
test_images = mnist.test_images()[:10000]
test_labels = mnist.test_labels()[:10000]

# Train on the first 10,000 images of the training split.
train_images = mnist.train_images()[:10000]
train_labels = mnist.train_labels()[:10000]

# Shapes below assume 28x28 input images.
FIR = Firstlayer(8)                    # 28x28 -> 26x26x8 (3x3 filters, no padding)
SEC = Secondlayer()                    # 26x26x8 -> 13x13x8 (2x2 max pooling)
softmax = Softmax(13 * 13 * 8, 10)     # 13x13x8 -> 10 class probabilities

def forward(image, label):
    '''
    Run one image through the whole network (FIR -> SEC -> softmax) and score
    the prediction against `label`.
    - image is a 2d numpy array of pixel values
    - label is a number (the index of the correct class)

    Returns (out, loss, acc):
    - out is the vector of class probabilities
    - loss is a number, the cross-entropy loss -log(out[label])
    - acc is 1 if the most probable class equals label, otherwise 0
    '''
    # Rescale pixel values from [0, 255] to [-0.5, 0.5].
    scaled = (image / 255) - 0.5

    out = softmax.forward(SEC.forward(FIR.forward(scaled)))

    # Cross-entropy loss: quantifies how far the predicted distribution is
    # from the true class.
    loss = -np.log(out[label])

    if np.argmax(out) == label:
        acc = 1
    else:
        acc = 0

    return out, loss, acc

def train(im, label, lr=.005):
    '''
    Run one full training step (forward pass, backprop, parameter update) on a
    single image.
    - im is a 2d numpy array of pixel values
    - label is a number (the index of the correct class)
    - lr is the learning rate

    Returns (loss, acc): the cross-entropy loss and the 1/0 accuracy that
    `forward` computed before the update.
    '''
    # Forward pass
    out, loss, acc = forward(im, label)

    # Initial gradient of the cross-entropy loss against the softmax output:
    # zero everywhere except at the correct class. Sized from the output
    # vector so it follows the number of classes of the softmax layer.
    gradient = np.zeros(len(out))
    gradient[label] = -1 / out[label]

    # Backprop pass, from the last layer to the first
    gradient = softmax.backprop(gradient, lr)
    gradient = SEC.backprop(gradient)
    # The first layer has nothing upstream and returns None.
    FIR.backprop(gradient, lr)

    return loss, acc
    
print('MNIST dataset initialized!')


# Train the system for 3 epochs
for epoch in range(3):
    print('--- Epoch %d ---' % (epoch + 1))

    # Shuffle the training data (images and labels with the same permutation)
    permutation = np.random.permutation(len(train_images))
    train_images = train_images[permutation]
    train_labels = train_labels[permutation]

    # Running totals over the current window of 100 steps
    loss = 0
    num_correct = 0
    # i means index
    # im means image
    # label means label
    for i, (im, label) in enumerate(zip(train_images, train_labels)):
        l, acc = train(im, label)
        # Accumulate the actual loss of this step (not a constant), so the
        # reported average is meaningful.
        loss += l
        num_correct += acc

        # Report after every 100 completed steps, so each window really
        # contains 100 samples; num_correct out of 100 is already a percentage.
        if i % 100 == 99:
            print(
                '[Step %d] Past 100 steps: Average Loss %.3f | Accuracy: %d%%' %
                (i + 1, loss / 100, num_correct)
            )
            loss = 0
            num_correct = 0

print('\n--- The whole Testing output ---')
loss = 0
num_correct = 0
for im, label in zip(test_images, test_labels):
    # Forward pass only: no parameters are updated during evaluation.
    _, l, acc = forward(im, label)
    loss += l
    num_correct += acc

num_tests = len(test_images)

# NOTE: despite the "train_" prefix these hold the mean loss and accuracy on
# the TEST set; the names are kept unchanged in case later cells use them.
loss_ar = []
acc_ar = []
train_loss = loss / num_tests
train_acc = num_correct / num_tests
loss_ar.append(train_loss)
acc_ar.append(train_acc)

print('Test Loss:', loss / num_tests)
print('Test Accuracy:', num_correct / num_tests)

MNIST dataset initialized!
--- Epoch 1 ---
[Step 100] Past 100 steps: Average Loss 0.990 | Accuracy: 13%
[Step 200] Past 100 steps: Average Loss 1.000 | Accuracy: 38%
[Step 300] Past 100 steps: Average Loss 1.000 | Accuracy: 41%
[Step 400] Past 100 steps: Average Loss 1.000 | Accuracy: 58%
[Step 500] Past 100 steps: Average Loss 1.000 | Accuracy: 62%
[Step 600] Past 100 steps: Average Loss 1.000 | Accuracy: 69%
[Step 700] Past 100 steps: Average Loss 1.000 | Accuracy: 68%
[Step 800] Past 100 steps: Average Loss 1.000 | Accuracy: 69%
[Step 900] Past 100 steps: Average Loss 1.000 | Accuracy: 78%
[Step 1000] Past 100 steps: Average Loss 1.000 | Accuracy: 77%
[Step 1100] Past 100 steps: Average Loss 1.000 | Accuracy: 76%
[Step 1200] Past 100 steps: Average Loss 1.000 | Accuracy: 79%
[Step 1300] Past 100 steps: Average Loss 1.000 | Accuracy: 82%
[Step 1400] Past 100 steps: Average Loss 1.000 | Accuracy: 80%
[Step 1500] Past 100 steps: Average Loss 1.000 | Accuracy: 76%
[Step 1600] Past 100 