# Digits Recognition

## CS640

### U59844514 Xiankang Wu

### U37992938 Rongyu Wang

<p>      This notebook applies our neural network (NN) to digit recognition.

<p>      It's capable of recognizing 10 digits (0 to 9).
    
<p>      Check out the discussion on the efficiency of our NN (problem 6) at the end.

In [12]:
# Import Python libraries
import numpy as np    # numpy is the fundamental package for scientific computing with Python, such linear algebra, array...
import matplotlib.pyplot as plt      # matplotlib is a Python 2D plotting library which produces publication quality figures.
from sklearn.model_selection import KFold    #k-fold cross-validation

In [13]:
class LogisticRegression:
    """
    Classifier with one tanh hidden layer and a softmax output layer.

    The class keeps the name of the logistic-regression lab it grew out of,
    but the model is a small feed-forward neural network trained with
    full-batch gradient descent on the cross-entropy loss.
    """

    def __init__(self, input_dim, output_dim, hidden_dim):
        """
        Initializes the parameters of the network to random values.

        args:
            input_dim: Number of dimensions of the input data
            output_dim: Number of classes
            hidden_dim: Number of nodes in the hidden layer
        """
        # Input -> hidden weights, scaled by 1/sqrt(fan-in); biases start at 0.
        self.theta1 = np.random.randn(input_dim, hidden_dim) / np.sqrt(input_dim)
        self.bias1 = np.zeros((1, hidden_dim))
        # Hidden -> output weights, scaled the same way.
        self.theta2 = np.random.randn(hidden_dim, output_dim) / np.sqrt(hidden_dim)
        self.bias2 = np.zeros((1, output_dim))

    #--------------------------------------------------------------------------

    def _forward(self, X):
        """Returns (hidden activations, output logits) for the data array X."""
        hidden = np.tanh(np.dot(X, self.theta1) + self.bias1)
        logits = np.dot(hidden, self.theta2) + self.bias2
        return hidden, logits

    @staticmethod
    def _log_softmax(logits):
        """
        Returns the row-wise log of the softmax of `logits`.

        The row maximum is subtracted before exponentiating so that large
        logits cannot overflow np.exp; the result is mathematically unchanged.
        """
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        return shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    #--------------------------------------------------------------------------

    def compute_cost(self, X, y):
        """
        Computes the average cross-entropy cost on the dataset.

        args:
            X: Data array
            y: Integer class labels corresponding to input data

        returns:
            cost: average cost per data sample
        """
        num_examples = np.shape(X)[0]
        _, logits = self._forward(X)
        log_probs = self._log_softmax(logits)
        # Select the log-probability of the true class of every sample. The
        # number of classes comes from the output layer, not from max(y), so
        # the cost is well defined even when y lacks the highest class.
        labels = np.asarray(y)
        data_loss = -np.sum(log_probs[np.arange(num_examples), labels])
        return 1. / num_examples * data_loss

    #--------------------------------------------------------------------------

    def predict(self, X):
        """
        Makes a prediction based on current model parameters.

        args:
            X: Data array

        returns:
            predictions: array of predicted labels
        """
        _, logits = self._forward(X)
        # Softmax is monotonic in each row, so the arg-max of the logits is
        # the arg-max of the softmax scores.
        return np.argmax(logits, axis=1)

    #--------------------------------------------------------------------------
    # train the network using full-batch gradient descent
    # default learning rate = 0.01
    #--------------------------------------------------------------------------
    def fit(self, X, y, num_epochs, alpha=0.01):
        """
        Learns model parameters to fit the data.

        args:
            X: Data array
            y: Integer class labels corresponding to input data
            num_epochs: Number of full passes of gradient descent
            alpha: Learning rate

        returns:
            0 (kept for backward compatibility with existing callers)
        """
        labels = np.asarray(y)
        sample_idx = np.arange(X.shape[0])

        for _ in range(num_epochs):
            # Forward propagation
            hidden, logits = self._forward(X)

            # Backpropagation: output error is (softmax scores - one-hot y).
            beta2 = np.exp(self._log_softmax(logits))
            beta2[sample_idx, labels] -= 1.0

            # Gradients are summed (not averaged) over the samples.
            dtheta2 = np.dot(hidden.T, beta2)
            dbias2 = np.sum(beta2, axis=0, keepdims=True)
            # d/dz tanh(z) = 1 - tanh(z)^2
            beta1 = np.dot(beta2, self.theta2.T) * (1 - np.power(hidden, 2))
            dtheta1 = np.dot(X.T, beta1)
            dbias1 = np.sum(beta1, axis=0, keepdims=True)

            # Gradient descent parameter update
            self.theta2 -= alpha * dtheta2
            self.bias2 -= alpha * dbias2
            self.theta1 -= alpha * dtheta1
            self.bias1 -= alpha * dbias1
        return 0

In [14]:
def plot_decision_boundary(model, X, y):
    """
    Draw the decision regions of a model together with the data points.

    The model is evaluated on a regular grid covering [-10, 10) in both
    coordinates with a step of 0.01, so it must accept two-feature inputs.

    args:
        model: object whose predict() labels the grid points.
        X: input data; its first two columns are used as point coordinates
        y: input labels, used to colour the points
    """
    axis_ticks = np.arange(-10, 10, 0.01)
    xx, yy = np.meshgrid(axis_ticks, axis_ticks)

    # One row per grid point, then fold the predicted labels back to the grid.
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    region_labels = model.predict(grid_points).reshape(xx.shape)

    plt.contourf(xx, yy, region_labels, cmap=plt.cm.bwr)
    # Nearly transparent markers keep dense clusters from hiding the regions.
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.bwr, alpha=0.05)
    plt.show()

In [15]:
def Confusion_matrix(X, y, model=None, num_classes=None):
    """
    Computes the accuracy and confusion matrix of a model on a dataset.

    args:
        X: Data array passed to the model's predict()
        y: True integer labels corresponding to X
        model: object with a predict(X) method; defaults to the notebook's
            global `logreg` when omitted
        num_classes: number of classes (side length of the matrix); defaults
            to the notebook's global `output_dim` when omitted

    returns:
        acc: fraction of samples whose prediction equals the true label
            (0.0 when there are no samples)
        con_mat: float array of shape (num_classes, num_classes) where
            con_mat[p, t] counts samples predicted as p whose true label is t
    """
    # Fall back to the notebook globals so existing two-argument calls work.
    if model is None:
        model = logreg
    if num_classes is None:
        num_classes = output_dim

    y_pred = np.asarray(model.predict(X))
    y_true = np.asarray(y)
    con_mat = np.zeros((num_classes, num_classes))
    if len(y_pred) == 0:
        return 0.0, con_mat

    # np.add.at accumulates correctly when the same (row, col) pair repeats,
    # which plain fancy-index assignment would not.
    np.add.at(con_mat, (y_pred, y_true), 1)
    acc = float(np.count_nonzero(y_pred == y_true)) / len(y_pred)

    return acc, con_mat

In [16]:
#1. Load data
# Each CSV is read as a comma-separated numeric array. The label arrays are
# cast to int64 because the labels are later used as array indices (one-hot
# encoding in the classifier and cell lookup in the confusion matrix).
X_train = np.genfromtxt('DATA/Digit_X_train.csv', delimiter=',') #https://docs.scipy.org/doc/numpy/reference/generated/numpy.genfromtxt.html
y_train = np.genfromtxt('DATA/Digit_y_train.csv', delimiter=',').astype(np.int64)
X_test = np.genfromtxt('DATA/Digit_X_test.csv', delimiter=',')
y_test = np.genfromtxt('DATA/Digit_y_test.csv', delimiter=',').astype(np.int64)

In [17]:
#2. Derive the network's input and output sizes from the training data
input_dim = X_train.shape[1]      # one input node per feature column
output_dim = y_train.max() + 1    # labels are 0-based, so classes = largest label + 1

In [18]:
#3. Data analysis with 10 nodes in the hidden layer
# The weights are initialised randomly and no seed is set, so the accuracy
# varies from run to run; the run recorded below reached about 0.935.

# hidden_dim is the third constructor argument.
logreg = LogisticRegression(input_dim, output_dim, 10)
print ("Test with 10 hidden nodes: \n")
# Train for 1000 epochs of gradient descent with learning rate 0.001.
logreg.fit(X_train, y_train, 1000, alpha=0.001) 
# Confusion_matrix evaluates the global `logreg`; in the printed matrix,
# rows are predicted labels and columns are true labels.
acc, con_mat = Confusion_matrix(X_test, y_test)
print ("Confusion Matrix: \n", con_mat)
print ("Accuracy:", acc)
print ("Cost:", logreg.compute_cost(X_test,y_test), "\n")

Test with 10 hidden nodes: 

Confusion Matrix: 
 [[85.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0. 80.  0.  0.  1.  0.  1.  0.  3.  0.]
 [ 0.  0. 83.  1.  0.  0.  0.  1.  0.  0.]
 [ 0.  1.  3. 82.  0.  0.  0.  0.  1.  1.]
 [ 1.  0.  0.  0. 86.  0.  0.  1.  0.  0.]
 [ 1.  0.  0.  3.  0. 88.  0.  5.  4.  2.]
 [ 1.  0.  0.  0.  1.  1. 90.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0. 80.  0.  0.]
 [ 0.  2.  0.  3.  0.  0.  0.  0. 79.  0.]
 [ 0.  8.  0.  2.  4.  2.  0.  2.  1. 88.]]
Accuracy: 0.9354838709677419
Cost: 0.27503690413780246 



In [19]:
#3. Data analysis with 20 nodes in the hidden layer
# The weights are initialised randomly and no seed is set, so the accuracy
# varies from run to run; the run recorded below reached about 0.937.

# hidden_dim is the third constructor argument.
logreg = LogisticRegression(input_dim, output_dim, 20)
print ("Test with 20 hidden nodes: \n")

# Train for 1000 epochs of gradient descent with learning rate 0.001.
logreg.fit(X_train, y_train, 1000, alpha=0.001) 
# Confusion_matrix evaluates the global `logreg`; in the printed matrix,
# rows are predicted labels and columns are true labels.
acc, con_mat = Confusion_matrix(X_test, y_test)
print ("Confusion Matrix: \n", con_mat)
print ("Accuracy:", acc)
print ("Cost:", logreg.compute_cost(X_test,y_test), "\n")

Test with 20 hidden nodes: 

Confusion Matrix: 
 [[86.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0. 81.  0.  0.  1.  1.  1.  0.  3.  0.]
 [ 0.  0. 83.  1.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  3. 77.  0.  0.  0.  0.  0.  1.]
 [ 1.  0.  0.  0. 87.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  6.  0. 85.  0.  0.  4.  2.]
 [ 1.  0.  0.  0.  0.  2. 90.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0. 85.  1.  1.]
 [ 0.  2.  0.  5.  0.  0.  0.  0. 80.  0.]
 [ 0.  7.  0.  1.  3.  3.  0.  3.  0. 88.]]
Accuracy: 0.9365962180200222
Cost: 0.28285079773210087 



In [20]:
#3. Data analysis with 30 nodes in the hidden layer
# The weights are initialised randomly and no seed is set, so the accuracy
# varies from run to run; the run recorded below reached about 0.945.


# hidden_dim is the third constructor argument.
logreg = LogisticRegression(input_dim, output_dim, 30)
print ("Test with 30 hidden nodes: \n")

# Train for 1000 epochs of gradient descent with learning rate 0.001.
logreg.fit(X_train, y_train, 1000, alpha=0.001) 
# Confusion_matrix evaluates the global `logreg`; in the printed matrix,
# rows are predicted labels and columns are true labels.
acc, con_mat = Confusion_matrix(X_test, y_test)
print ("Confusion Matrix: \n", con_mat)
print ("Accuracy:", acc)
print ("Cost:", logreg.compute_cost(X_test,y_test), "\n")

Test with 30 hidden nodes: 

Confusion Matrix: 
 [[86.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0. 83.  0.  0.  1.  0.  1.  1.  3.  0.]
 [ 0.  0. 84.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  2. 79.  0.  0.  0.  0.  1.  1.]
 [ 1.  0.  0.  0. 86.  0.  0.  1.  0.  0.]
 [ 0.  1.  0.  5.  0. 87.  0.  0.  3.  2.]
 [ 1.  0.  0.  0.  1.  1. 90.  0.  0.  0.]
 [ 0.  0.  0.  2.  0.  0.  0. 86.  0.  0.]
 [ 0.  1.  0.  5.  0.  0.  0.  0. 81.  1.]
 [ 0.  5.  0.  0.  3.  3.  0.  1.  0. 88.]]
Accuracy: 0.9454949944382648
Cost: 0.253904081505883 



## Problem 6: Performance discussion of our neural net

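<p> In this section, we trained our neural net on the digits training set and applied it to recognizing the test set.
    In the runs recorded above, the network reached 0.935 accuracy with 10 nodes in the hidden layer, 0.937 with 20 nodes, and 0.945 with 30 nodes. Because the weights are initialized randomly, these figures vary between runs (we have also observed 0.927 with 10 nodes and 0.945 with 20 nodes).
    We found that using more than 20 hidden nodes may cause overfitting, which can decrease the performance of our NN, although the 30-node run recorded above does not show such a drop.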