# MNIST Digit Classification with our own Framework

Lab Assignment from [AI for Beginners Curriculum](https://github.com/microsoft/ai-for-beginners).

### Reading the Dataset

This code downloads the dataset from the repository on the internet. You can also manually copy the dataset from the `/data` directory of the AI Curriculum repo.

In [1]:
# !rm *.pkl
# !wget https://raw.githubusercontent.com/microsoft/AI-For-Beginners/main/data/mnist.pkl.gz
# !gzip -d mnist.pkl.gz
import matplotlib.pyplot as plt
import numpy as np
import pickle
import os
import gzip
import pylab


# Seed NumPy's global RNG so that NumPy-based randomness (e.g. the
# np.random.normal weight initialisation used by the layers) is repeatable.
np.random.seed(1)
# NOTE(review): the stdlib `random` module imported here is never seeded, so
# the sampling done through `random` inside train() differs between runs.
import random

In [2]:
# Alternative loader for an already-decompressed pickle (kept for reference):
# import pickle
# with open('mnist.pkl','rb') as f:
#     MNIST = pickle.load(f)


# Read the gzip-compressed pickle directly from the repository's data directory.
# NOTE(review): pickle.load can execute arbitrary code, so only load this file
# from a trusted source.
# encoding='latin1' is presumably required because the file was pickled under
# Python 2 -- TODO confirm.
with gzip.open('../../../data/mnist.pkl.gz', 'rb') as mnist_pickle:
    MNIST = pickle.load(mnist_pickle, encoding='latin1')

In [3]:
# MNIST[0] is the training split; inside it index 0 holds the features and
# index 1 holds the labels.
labels = MNIST[0][1]
data = MNIST[0][0]

Let's look at the shape of the data we have:

In [4]:
data.shape

(50000, 784)

### Splitting the Data

We will use scikit-learn to split the data into training and test datasets:

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples (test_size=0.2) for evaluation; the remaining
# 80% are used for training.
features_train, features_test, labels_train, labels_test = train_test_split(data,labels,test_size=0.2)

# Sanity check: report how many samples ended up in each partition.
print(f"Train samples: {len(features_train)}, test samples: {len(features_test)}")

Train samples: 40000, test samples: 10000


### Instructions

1. Take the framework code from the lesson and paste it into this notebook, or (even better) into a separate Python module
1. Define and train a one-layered perceptron, observing training and validation accuracy during training
1. Try to understand if overfitting took place, and adjust layer parameters to improve accuracy
1. Repeat previous steps for 2- and 3-layered perceptrons. Try to experiment with different activation functions between layers.
1. Try to answer the following questions:
    - Does the inter-layer activation function affect network performance?
    - Do we need 2- or 3-layered network for this task?
    - Did you experience any problems training the network? Especially as the number of layers increased.
    - How do weights of the network behave during training? You may plot max abs value of weights vs. epoch to understand the relation.

# Kaleb Code
Let's start with the framework of the implementation (provided by the lesson)

In [6]:
# Framework
class Linear:
    """Fully connected layer computing ``dot(x, W.T) + b``.

    ``W`` has shape ``(nout, nin)`` and ``b`` has shape ``(1, nout)``. The
    gradients produced by the most recent ``backward`` call are stored in
    ``dW`` and ``db`` and consumed by ``update``.
    """

    def __init__(self, nin, nout):
        # Weights are drawn from a zero-mean normal distribution with standard
        # deviation 1/sqrt(nin); the bias and both gradient buffers start at 0.
        init_std = 1.0 / np.sqrt(nin)
        self.W = np.random.normal(0, init_std, (nout, nin))
        self.b = np.zeros((1, nout))
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

    def forward(self, x):
        """Remember the input for ``backward`` and return the affine output."""
        self.x = x
        projected = np.dot(x, self.W.T)
        return projected + self.b

    def backward(self, dz):
        """Store the parameter gradients and return the gradient w.r.t. the input.

        ``dz`` is the gradient of the loss with respect to this layer's output.
        """
        self.dW = np.dot(dz.T, self.x)
        # Summing over axis 0 accumulates the bias gradient across the batch.
        self.db = dz.sum(axis=0)
        return np.dot(dz, self.W)

    def update(self, lr):
        """Apply one in-place gradient-descent step with learning rate ``lr``."""
        self.W -= lr * self.dW
        self.b -= lr * self.db
    
class Softmax:
    """Row-wise softmax layer.

    ``forward`` turns each row of scores into a probability distribution and
    caches the result; ``backward`` reuses that cached result instead of
    recomputing the exponentials.
    """

    def forward(self, z):
        """Return the softmax of ``z`` taken along axis 1."""
        self.z = z
        # Subtracting the per-row maximum keeps np.exp from overflowing and
        # does not change the resulting probabilities.
        zmax = z.max(axis=1, keepdims=True)
        expz = np.exp(z - zmax)
        self.p = expz / expz.sum(axis=1, keepdims=True)
        return self.p

    def backward(self, dp):
        """Return the gradient w.r.t. the scores given ``dp`` (gradient w.r.t. the probabilities).

        Must be called after ``forward``; it uses the probabilities cached there.
        """
        p = self.p
        pdp = p * dp
        return pdp - p * pdp.sum(axis=1, keepdims=True)
    
class CrossEntropyLoss:
    """Mean negative log-likelihood of the true classes.

    ``p`` is a 2-D array of per-class probabilities (one row per sample) and
    ``y`` holds the integer class index of each sample.
    """

    def forward(self, p, y):
        """Store ``p`` and ``y`` for ``backward`` and return the mean loss."""
        self.p = p
        self.y = y
        p_of_y = p[np.arange(len(y)), y]
        return -np.log(p_of_y).mean()

    def backward(self, loss):
        """Return the gradient of the mean loss with respect to ``p``.

        ``loss`` is accepted for interface compatibility and is not used.
        Only the entry of the true class in each row is non-zero.
        """
        rows = np.arange(len(self.y))
        grad = np.zeros_like(self.p)
        # Divide only at the label positions. Dividing the whole (mostly zero)
        # array by p would evaluate 0/0 = NaN wherever a non-target class has
        # probability exactly 0, and that NaN would poison the whole gradient.
        grad[rows, self.y] = (-1.0 / len(self.y)) / self.p[rows, self.y]
        return grad

class Tanh:
    """Element-wise hyperbolic-tangent activation layer."""

    def forward(self, x):
        """Return ``tanh(x)`` and keep the result for ``backward``."""
        self.y = np.tanh(x)
        return self.y

    def backward(self, dy):
        """Scale the incoming gradient ``dy`` by the derivative ``1 - tanh(x)**2``."""
        local_slope = 1.0 - self.y * self.y
        return local_slope * dy


The framework is established where x (input) -> linear layer -> softmax layer -> loss

## Now for the net to allow for easier layering

In [7]:
class Net:
    """Sequential container that chains layers.

    Every layer must provide ``forward`` and ``backward``; layers that also
    provide ``update`` have it called by ``Net.update``.
    """

    def __init__(self):
        self.layers = []

    def add(self, l):
        """Append layer ``l`` to the end of the chain."""
        self.layers.append(l)

    def forward(self, x):
        """Pass ``x`` through every layer in insertion order and return the result."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, z):
        """Propagate gradient ``z`` through the layers in reverse order."""
        for layer in reversed(self.layers):
            z = layer.backward(z)
        return z

    def update(self, lr):
        """Call ``update(lr)`` on every layer that defines it."""
        for layer in self.layers:
            # Layers without an update method (e.g. activations) are skipped.
            if hasattr(layer, 'update'):
                layer.update(lr)

## Code for perceptrons

In [8]:
def _random_row(examples):
    """Return one uniformly chosen row of ``examples``."""
    # Index-based sampling avoids random.choice's truthiness test on the
    # sequence itself, which is ambiguous for a multi-element ndarray on some
    # Python versions. randrange(n) draws from the same RNG stream.
    if len(examples) == 0:
        raise IndexError('Cannot choose from an empty sequence')
    return examples[random.randrange(len(examples))]


def train(positive_examples, negative_examples, num_iterations = 100, report_frequency = 10):
    """Train a bias-free binary perceptron and return its weights.

    Each iteration draws one positive and one negative example at random.
    A positive example scored below 0 is added to the weights; a negative
    example scored at or above 0 is subtracted from them.

    Args:
        positive_examples: 2-D array with one positive sample per row.
        negative_examples: 2-D array with one negative sample per row.
        num_iterations: number of sampling/update iterations.
        report_frequency: print accuracy over both sets every this many
            iterations; a falsy value (0 or None) disables reporting.

    Returns:
        Weight array of shape ``(num_dims, 1)``.

    Raises:
        IndexError: if an example set is empty when a sample is drawn.
    """
    num_dims = positive_examples.shape[1]
    weights = np.zeros((num_dims, 1))  # start from the all-zero weight vector

    pos_count = positive_examples.shape[0]
    neg_count = negative_examples.shape[0]

    for i in range(num_iterations):
        pos = _random_row(positive_examples)
        neg = _random_row(negative_examples)

        # Misclassified positive: pull the weights towards it.
        if np.dot(pos, weights) < 0:
            weights = weights + pos.reshape(weights.shape)

        # Misclassified negative (a score of exactly 0 counts as positive):
        # push the weights away from it.
        if np.dot(neg, weights) >= 0:
            weights = weights - neg.reshape(weights.shape)

        if report_frequency and i % report_frequency == 0:
            pos_out = np.dot(positive_examples, weights)
            neg_out = np.dot(negative_examples, weights)
            pos_correct = (pos_out >= 0).sum() / float(pos_count)
            neg_correct = (neg_out < 0).sum() / float(neg_count)
            print("Iteration={}, pos correct={}, neg correct={}".format(i,pos_correct,neg_correct))

    return weights

def set_mnist_pos_neg(positive_label, negative_label, split=None):
    """Select the images of two labels from one dataset split.

    Args:
        positive_label: label whose images form the positive set.
        negative_label: label whose images form the negative set.
        split: optional ``(images, labels)`` pair to select from. Defaults to
            ``MNIST[0]`` (Train = 0; Features = 0 || Labels = 1).

    Returns:
        ``(positive_images, negative_images)``, each holding the matching rows
        of ``images`` in their original order.
    """
    # KALEB CODE
    if split is None:
        split = MNIST[0]
    images = split[0]
    labels = np.asarray(split[1])

    # A vectorized comparison replaces a Python-level scan over every label.
    positive_images = images[labels == positive_label]
    negative_images = images[labels == negative_label]

    return positive_images, negative_images


# All vs one function that compares the one_label (one) against all other_labels (all)
def set_mnist_all_vs_one(one_label, count = 10, split=None):
    """Build the positive/negative image sets for a one-vs-all classifier.

    Args:
        one_label: label whose images form the positive set.
        count: labels ``0 .. count-1`` other than ``one_label`` supply the
            negative set.
        split: optional ``(images, labels)`` pair to select from. Defaults to
            ``MNIST[0]``.

    Returns:
        ``(pos_imgs, neg_imgs)`` as arrays. The positive images appear exactly
        once (they used to be repeated once per other label); the negative
        images are grouped by label in increasing label order.
    """
    if split is None:
        split = MNIST[0]
    images = split[0]
    labels = np.asarray(split[1])

    # Positives are selected once instead of being re-appended for every
    # other label, which multiplied their memory footprint by count - 1.
    pos_imgs = images[labels == one_label]

    other_labels = [x for x in range(count) if x != one_label]
    if other_labels:
        neg_imgs = np.concatenate([images[labels == x] for x in other_labels])
    else:
        # No other label requested: an empty array with the same row shape.
        neg_imgs = images[:0]

    return pos_imgs, neg_imgs


# Train one one-vs-all perceptron per digit and collect the resulting weight
# vectors in `perceptrons`, indexed by digit.
# NOTE(review): range(3) trains digits 0, 1 and 2 only, so classify() called
# with this list can only ever predict one of those three digits.
perceptrons = []
for i in range(3):
    # print("*************************************************************************************")
    # print(f"Gathering images for perceptron {i}")
    positive_images, negative_images = set_mnist_all_vs_one(i)
    # print(f"Training perceptron {i}")
    perceptrons.append(train(positive_images, negative_images))

def classify(digit, perceptron_weights):
    """Pick the perceptron that scores ``digit`` highest.

    Args:
        digit: feature vector of the image to classify.
        perceptron_weights: sequence of weight arrays, one per candidate
            class; the position in the sequence is the predicted label.

    Returns:
        ``(predicted_index, best_score)``. With no perceptrons the result is
        ``(-1, -inf)``. When scores tie, the later perceptron wins.
    """
    best_index = -1              # -1 marks "nothing selected yet"
    best_score = float('-inf')   # lower than any real score, so the first one always wins
    for index, weights in enumerate(perceptron_weights):
        score = np.dot(digit, weights)  # confidence of this perceptron
        if score >= best_score:
            best_index, best_score = index, score

    return best_index, best_score

class Perceptron:
    """Dense layer of ``nout`` units over ``nin`` inputs.

    Performs the same computation as the ``Linear`` layer defined earlier in
    this notebook: ``forward`` returns ``dot(x, W.T) + b`` and ``backward``
    records the parameter gradients in ``dW`` / ``db``.
    """

    def __init__(self, nin, nout):
        weight_shape = (nout, nin)
        # Zero-mean normal initialisation with standard deviation 1/sqrt(nin).
        self.W = np.random.normal(0, 1.0 / np.sqrt(nin), weight_shape)
        self.b = np.zeros((1, nout))
        # Gradient buffers start at zero, with the same shapes as the parameters.
        self.dW, self.db = np.zeros_like(self.W), np.zeros_like(self.b)

    def forward(self, x):
        """Cache ``x`` for the backward pass and return the layer output."""
        self.x = x
        return self.b + np.dot(x, self.W.T)

    def backward(self, dz):
        """Record ``dW`` and ``db`` for output gradient ``dz``; return the input gradient."""
        grad_input = np.dot(dz, self.W)
        self.dW, self.db = np.dot(dz.T, self.x), dz.sum(axis=0)
        return grad_input

    def update(self, lr):
        """Take one in-place gradient-descent step of size ``lr``."""
        self.W -= lr * self.dW
        self.b -= lr * self.db

Iteration=0, pos correct=0.0, neg correct=1.0
Iteration=10, pos correct=0.7530413625304136, neg correct=0.9193219135528534
Iteration=20, pos correct=0.9939172749391727, neg correct=0.4278645602201118
Iteration=30, pos correct=0.6672749391727494, neg correct=0.965873790716251
Iteration=40, pos correct=0.9912814274128142, neg correct=0.7125676755125588
Iteration=50, pos correct=0.9860097323600974, neg correct=0.7829502085737108
Iteration=60, pos correct=0.8169099756690997, neg correct=0.9752596077039141
Iteration=70, pos correct=0.9845904298459043, neg correct=0.8475193041625987
Iteration=80, pos correct=0.9738442822384428, neg correct=0.904300168634064
Iteration=90, pos correct=0.9738442822384428, neg correct=0.904300168634064
Iteration=0, pos correct=0.0, neg correct=1.0
Iteration=10, pos correct=0.5901725959845016, neg correct=0.9953747574567935
Iteration=20, pos correct=0.9711165903487143, neg correct=0.9244618925138758
Iteration=30, pos correct=0.9711165903487143, neg correct=0.9244

# Implementation
### Starting with just 1 layer

In [16]:
# Single-layer model: one Linear layer maps the 784 input features to 10
# class scores, and Softmax turns those scores into probabilities.
net = Net()
net.add(Linear(784,10))
net.add(Softmax())
# The loss is kept outside the Net and applied to the network's output.
loss = CrossEntropyLoss()


In [17]:
def train_epoch(net, train_x, train_labels, loss=None, batch_size=4, lr=0.1):
    """Run one pass of mini-batch gradient descent over the training data.

    Args:
        net: model providing ``forward``, ``backward`` and ``update``.
        train_x: training samples, sliced along the first axis into batches.
        train_labels: labels aligned with ``train_x``.
        loss: loss object providing ``forward(p, y)`` and ``backward(l)``.
            A fresh ``CrossEntropyLoss`` is created when omitted, so no
            stateful default instance is shared between calls.
        batch_size: number of samples per update step; the final batch may
            be smaller.
        lr: learning rate passed to ``net.update``.
    """
    if loss is None:
        loss = CrossEntropyLoss()

    for start in range(0, len(train_x), batch_size):
        xb = train_x[start:start + batch_size]
        yb = train_labels[start:start + batch_size]

        # Forward pass, loss gradient, backward pass, then the parameter step.
        p = net.forward(xb)
        l = loss.forward(p, yb)
        dp = loss.backward(l)
        net.backward(dp)
        net.update(lr)

# Train the current `net` for one epoch with the default batch size and learning rate.
train_epoch(net,features_train,labels_train, loss)



In [18]:
def get_loss_acc(x, y, loss=None, model=None):
    """Return ``(loss, accuracy)`` of a model on samples ``x`` with labels ``y``.

    Args:
        x: input samples, one per row.
        y: integer class labels aligned with ``x``.
        loss: loss object providing ``forward(p, y)``. A fresh
            ``CrossEntropyLoss`` is created when omitted, so no stateful
            default instance is shared between calls.
        model: object providing ``forward``. Defaults to the module-level
            ``net``, looked up at call time.

    Returns:
        Tuple of the loss value and the fraction of samples whose
        highest-scoring class equals the label.
    """
    if loss is None:
        loss = CrossEntropyLoss()
    if model is None:
        model = net

    p = model.forward(x)
    l = loss.forward(p, y)
    pred = np.argmax(p, axis=1)  # predicted class = column with the largest output
    acc = (pred == y).mean()
    return l, acc
# Evaluate the current `net` on the training split first, then on the held-out test split.
print("Final loss={}, accuracy={}: ".format(*get_loss_acc(features_train,labels_train)))
print("Test loss={}, accuracy={}: ".format(*get_loss_acc(features_test,labels_test)))

Final loss=0.3484966662336011, accuracy=0.899725: 
Test loss=0.3834930576854378, accuracy=0.8942: 


## Two layers

In [19]:
# Two-layer model: 784 inputs -> 100 hidden units with a Tanh activation ->
# 10 class scores -> Softmax.
# Rebinding `net` also changes the model that get_loss_acc evaluates, since
# it reads the module-level `net`.
net = Net()
net.add(Linear(784,100))
net.add(Tanh())
net.add(Linear(100,10))
net.add(Softmax())
loss = CrossEntropyLoss()

In [20]:
# Train the two-layer network for one epoch, then report loss/accuracy on the
# training split and on the held-out test split.
train_epoch(net,features_train,labels_train, loss)
print("Final loss={}, accuracy={}: ".format(*get_loss_acc(features_train,labels_train)))
print("Test loss={}, accuracy={}: ".format(*get_loss_acc(features_test,labels_test)))

Final loss=0.18393319869763414, accuracy=0.9414: 
Test loss=0.2228993248726967, accuracy=0.9308: 


## Three layers

In [21]:
# Three-layer model: 784 inputs -> 100 hidden units -> 100 hidden units ->
# 10 class scores, with a Tanh activation after each hidden layer and a
# Softmax on the output.
net = Net()
net.add(Linear(784,100))
net.add(Tanh())
net.add(Linear(100,100))
net.add(Tanh())
net.add(Linear(100,10))
net.add(Softmax())
loss = CrossEntropyLoss()

In [22]:
# Train the three-layer network for one epoch, then report loss/accuracy on
# the training split and on the held-out test split.
train_epoch(net,features_train,labels_train, loss)
print("Final loss={}, accuracy={}: ".format(*get_loss_acc(features_train,labels_train)))
print("Test loss={}, accuracy={}: ".format(*get_loss_acc(features_test,labels_test)))

Final loss=0.22471452550598606, accuracy=0.933: 
Test loss=0.2721654854951008, accuracy=0.9266: 
