In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
import itertools
from scipy.io import loadmat
from scipy.linalg import block_diag
from tqdm import tqdm
from sklearn.metrics import confusion_matrix

#### Path to different files containing useful data

In [None]:
# Text file with one "<name> <label>" entry per line (the label is the last space-separated field)
name_path = "./ascii_names.txt"
# Text file whose first column holds the class index and whose second column holds the class name
category_labels_path = "category_labels.txt"
# .npy cache of the one-hot encoded names matrix, so the encoding is only computed once
input_save_path = "./saved_names_matrix.npy"
# Text file with the column indices of the samples held out as validation set
val_ind_path = "./Validation_Inds.txt"

### ConvNet class definition

In [None]:
class ConvNet():
    """
    A simple ConvNet implementation with 2 convolutional layers 
    followed by a fully connected layer and a softmax output 
    layer.
    
    Attributes:
        n1 (int): Number of filters applied at the first layer.
        n2 (int): Number of filters applied at the second layer.
        k1 (int): Width of the filters applied at the first layer.
        k2 (int): Width of the filters applied at the second layer.
        eta (float): Learning rate used for training.
        rho (float): Momentum term.
        F (list<np.array>): List of weights for each of the convolutional layers.
        W (np.array): Weights for the last fully connected layer.
        d (int): Input dimensionality.
        K (int): Output dimensionality.
        nlen_list (list): List with the number of columns of the input when in its
            original form before being vectorized, for each layer.     
    """
    
    def he_initialized_array(self, shape, fin):
        """
            Returns an array initialized with He initialization.
            
        Args:
            shape (tuple): The shape of the weight matrix.
            fin (int): Number of inputs from the previous layer.
            
        Returns:
            A He initialized np.array with shape dimensions.
        """
        sig = np.sqrt(2.0 / fin)
        return np.random.normal(0, sig, shape)
    
    def __init__(self, n1, n2, k1, k2, d, K, nlen):
        """ Constructor"""
        # Set default values for training hyperparameters
        self.eta = 0.001
        self.rho = 0.9
        
        # Set rest of hyperparams
        self.d = d
        self.K = K
        self.nlen_list = [nlen]   
    
        # Initialize layer weights
        self.F = []
        
        fin = k1 # Since the input matrix is sparse the number of inputs is effectively k1 everytime the filter is applied
        self.nlen_list.append(self.nlen_list[-1] - k1 + 1) # Number of input columns to next layer
        self.F.append(self.he_initialized_array((d, k1, n1), fin))
        
        fin = n1 * k2
        self.nlen_list.append(self.nlen_list[-1] - k2 + 1) 
        self.F.append(self.he_initialized_array((n1, k2, n2), fin))
        
        fin = self.nlen_list[-1] * n2
        self.nlen_list.append(self.nlen_list[-1] * n2)
        self.W = self.he_initialized_array((K, fin), fin) 
        
    def make_mf_matrix(self, F, nlen):
        """
        Constructs the matrix of the filters of a layer used to
        perform the convolution by matrix multiplication.
        
        Args:
            F (np.array): A d x k x nf matrix containing the convolutional
                filters of a certain layer where d is the height of the convo-
                lutional filter, k is its width and nf is the number of filters
                in the layer.
            nlen (int): Number of columns in the input of that layer. 
        
        Returns:
            An (nlen - k + 1) * nf x nlen * d matrix that can be used to
            perform the convolution when multiplied by the
            vectorized input.
        """
        n, k, nf = F.shape
        vectorized_filters = F.T.reshape((nf, n * k)) 
        MF_matrix = np.zeros(((nlen - k + 1) * nf, nlen * n))
        for i in range(nlen - k + 1):
            MF_matrix[i * nf : i * nf + nf, i * n: i * n + n * k] = vectorized_filters
        return MF_matrix
    
    def make_mx_matrix(self, x_input, d, k, nf, nlen):
        """
        Computes the input matrix used for the convolutions during the 
        back-propagation.
        
        Args:
            vec_input: Vectorized version of the input to the convolutional
                layer.
            d: corresponding height of the filter
            k: corresponding width of the filter
            nf: number of filters to be applied
            nlen: Number of columns in the input of that layer. 
        Returns:
            A (nlen - k + 1) * filter_no x k * filter_no * height with the
            results of the convolutions.
        """
        if len(x_input.shape) > 1:
            x_input = x_input.flatten()
            
        MX_Matrix = np.zeros(((nlen - k + 1) * nf, k * nf * d))
        
        for i in range(nlen - k + 1):
            MX_Matrix[i * nf: i * nf + nf, :] = block_diag(*[x_input[d * i: d * i + k * d] for j in range(nf)]) 
        return MX_Matrix        
    
    def softmax(self, s):
        """
        Implementation of the softmax activation function

        Args:
            s: an 1xd vector of a classifier's outputs

        Returns:
            An 1xd vector with the results of softmax given the input
            vector s.
        """
        exponents = np.exp(s - np.max(s, axis = 0)) # Max subtraction for numerical stability
        output_exp_sum = np.sum(exponents, axis = 0)
        p = exponents / output_exp_sum
        return p
    
    def cross_entropy_loss(self, X, Y, MFs, p = None):
        """
        Calculates the cross entropy loss
        """
        if p is None:
            log_X = np.multiply(Y , self.forwardPass(X, MFs)[0]).sum(axis=0)
            log_X[log_X == 0] = np.finfo(float).eps
            return -np.log(log_X)
        else:
            y = np.argmax(Y, axis = 0) 
            py = np.array([p[i] for i in y])
            log_X = py * np.multiply(Y , self.forwardPass(X, MFs)[0]).sum(axis=0)
            log_X[log_X == 0] = np.finfo(float).eps
            return -np.log(log_X)

    
    def computeLoss(self, X_batch, Y_batch, MFs, p = None):
        """
        Computes the loss of the network given a batch of data.
        
        Args:
            X_batch: NxD matrix with N data sample inputs
            Y_batch: NxM matrix with N data sample outputs
        
        Returns:
            A scalar float value corresponding to the loss.
        """        
        return np.mean(self.cross_entropy_loss(X_batch, Y_batch, MFs, p))# + lamda * np.sum(self.W ** 2)

    def computeAccuracy(self, X, y, MFs):
        """
        Computes the accuracy of the network.

        Args:
            X: Input matrix
            y: Output labels

        Returns:
            The accuracy of the network (i.e. the percentage of
            correctly classified inputs in X).

        """
        softmax_outputs = self.forwardPass(X, MFs)[0] # Get probability distribution of outputs
        # Reduce to a vector of the labels with the highest probability
        predictions = np.argmax(softmax_outputs, axis = 0)
        accuracy = (predictions == y).mean()
        return accuracy
   

    def forwardPass(self, X_batch, MFs):
        """
        Performs a forward pass and returns the result:
        
        Args:
            X_batch: NxD matrix with N data sample inputs
            MFs: Matrices needed to perform convolution as 
                matrix multiplication.
            
        Returns:
            A matrix with the predicted one-hot representations along with the outputs
            of the first and second layer as well as the MF matrices calculated.
        """
        # Apply first convolutional layer to input data followed by a ReLU activation
        X_batch1 = MFs[0].dot(X_batch)
        X_batch1[X_batch1 < 0.0] = 0.0

        # Apply second convolutional layer to input data followed by a ReLU activation
        X_batch2 = MFs[1].dot(X_batch1)
        X_batch2[X_batch2 < 0.0] = 0.0

        # Apply the fully connected layer
        output = self.W.dot(X_batch2)
        # Apply softmax
        P_batch = self.softmax(output)
        return P_batch, X_batch1, X_batch2
    

    
    def backwardPass(self, Y_batch, P_batch, X_batch, X_batch1, X_batch2, MFs):
        """
        Performs a backward pass and returns the gradients:
        
        Args:
            Y_batch: NxM matrix with N data sample outputs
            P_batch: Output after the softmax activation layer
            X_batch2: Output of the second convolutional layer after the ReLU.
            X_batch1: Output of the first convolutional layer after the ReLU.
            X_batch: Original batch with the inputs.
            MFs: Matrices needed to perform convolution as 
                matrix multiplication.
            
        Returns:
            The gradients of the weights of each layer (i.e. grad_F1, grad_F2, grad_W).
        """
        # Initialize all gradients to zero
        grad_W = np.zeros(self.W.shape)
        grad_F1 = np.zeros(self.F[0].shape)
        grad_F2 = np.zeros(self.F[1].shape)
        
        # Compute gradient of W
        n = Y_batch.shape[1]
        G_batch = -(Y_batch - P_batch)
        grad_W = G_batch.dot(X_batch2.T) / n
        
        
        # Propagate gradient through fully connected layer and ReLU of 2nd layer
        G_batch = self.W.T.dot(G_batch)
        G_batch *= np.where(X_batch2 > 0, 1, 0)
        
        # Compute gradient of the second layer's filters
        n = X_batch1.shape[1]
        for j in range(n):
            g_j = G_batch[:,j]
            x_j = X_batch1[:,j]

            MX_matrix = self.make_mx_matrix(x_j, *self.F[1].shape, self.nlen_list[1])
            v = g_j.T.dot(MX_matrix)
            grad_F2 += v.reshape(grad_F2.shape, order='F')
        grad_F2 /= n
        
        # Propagate gradient through second convolutional layer and ReLU of 1st layer
        G_batch = MFs[1].T.dot(G_batch)
        G_batch *= np.where(X_batch1 > 0, 1, 0)
        
        # Compute gradient of the first layer's filters
        n = X_batch.shape[1]
        for j in range(n):
            g_j = G_batch[:,j]
            x_j = X_batch[:,j]
            MX_matrix = self.make_mx_matrix(x_j, *self.F[0].shape, self.nlen_list[0])
            v = g_j.T.dot(MX_matrix)
            grad_F1 += v.reshape(grad_F1.shape, order='F')
        grad_F1 /= n       
        
        return grad_F1, grad_F2, grad_W

    def compute_grad_num_slow(self, X_batch, Y_batch, h = 1e-5, p = None):
        '''Centered difference gradient'''
        # Initialize all gradients to zero
        grad_W = np.zeros(self.W.shape) 
        grad_F1 = np.zeros(self.F[0].shape)
        grad_F2 = np.zeros(self.F[1].shape)

        MFs = [self.make_mf_matrix(self.F[0], self.nlen_list[0])]
        MFs.append(self.make_mf_matrix(self.F[1], self.nlen_list[1]))
        
        for j in tqdm(range(self.W.shape[0])):
            for k in range(self.W.shape[1]):
                self.W[j, k] -= h
                c1 = self.computeLoss(X_batch, Y_batch, MFs, p);
                self.W[j, k] += 2 * h
                c2 = self.computeLoss(X_batch, Y_batch, MFs, p);
                self.W[j, k] -= h
                grad_W[j, k] = (c2-c1) / (2 * h)
        
        
        for j in tqdm(range(self.F[1].shape[0])):
            for k in range(self.F[1].shape[1]):
                for i in range(self.F[1].shape[2]):
                    self.F[1][j, k, i] -= h
                    MFs = [self.make_mf_matrix(self.F[0], self.nlen_list[0])]
                    MFs.append(self.make_mf_matrix(self.F[1], self.nlen_list[1])) 
                    c1 = self.computeLoss(X_batch, Y_batch, MFs, p);

                    self.F[1][j, k, i]  += 2 * h
                    MFs = [self.make_mf_matrix(self.F[0], self.nlen_list[0])]
                    MFs.append(self.make_mf_matrix(self.F[1], self.nlen_list[1])) 
                    c2= self.computeLoss(X_batch, Y_batch, MFs, p);

                    self.F[1][j, k, i]  -= h
                    grad_F2[j, k, i]  = (c2-c1) / (2 * h)

        
        for j in tqdm(range(self.F[0].shape[0])):
            for k in range(self.F[0].shape[1]):
                for i in range(self.F[0].shape[2]):
                    self.F[0][j, k, i]  -= h
                    MFs = [self.make_mf_matrix(self.F[0], self.nlen_list[0])]
                    MFs.append(self.make_mf_matrix(self.F[1], self.nlen_list[1])) 
                    c1 = self.computeLoss(X_batch, Y_batch, MFs, p);

                    self.F[0][j, k, i]  += 2 * h
                    MFs = [self.make_mf_matrix(self.F[0], self.nlen_list[0])]
                    MFs.append(self.make_mf_matrix(self.F[1], self.nlen_list[1])) 
                    c2= self.computeLoss(X_batch, Y_batch, MFs, p);

                    self.F[0][j, k, i]  -= h
                    grad_F1[j, k, i] = (c2-c1) / (2 * h)

                
        return grad_F1, grad_F2, grad_W
    
    def getClassBins(self, y):
        K = len(np.unique(y))
        bins = []
        for i in range(K):
            bins.append([j for j in range(len(y)) if y[j] == i])
        return bins


    def miniBatchGD(self, X, Y, GDparams, verbose = False, X_val = None, Y_val = None, tol = 1e-10, n_update = 1, patience = 5,
                   imbalanced_set = False):
        """
        Implementation of mini-batch gradient descent.

         Args:
            X: Training input matrix
            Y: Training set desired output matrix
            GDparams: Object of the class Params with the hyperparameters
                used for learning.
            verbose: Prints info in each iteration about the progress of
                training when equal to True.
            X_val: Validation set input matrix
            Y_val: Validation set desired output matrix
            n_update: After each <n_update> updates the validation and training
                accuracy and loss are computed.

        Returns:
            The following tuple is returned where the validation lists
            are empty if no validation set is given: (training_loss_list,
            validation_loss_list, training_acc_list, validation_acc_list).
        """
        results = ([],[],[],[])
        mini_batch_count = X.shape[1] // GDparams.n_batch
        y = np.argmax(Y, axis = 0)
        
        
        if imbalanced_set:
            bins = self.getClassBins(y)
            class_samples = [len(bin) for bin in bins]
            min_samples = min(class_samples)
            
        
        MFs = [self.make_mf_matrix(self.F[0], self.nlen_list[0])]
        MFs.append(self.make_mf_matrix(self.F[1], self.nlen_list[1]))
        
        if(X_val is not None and Y_val is not None):
            y_val = np.argmax(Y_val, axis = 0)
        results[0].append(self.computeLoss(X, Y, MFs))
        results[2].append(self.computeAccuracy(X, y, MFs))
        
        if(X_val is not None and Y_val is not None):
            results[1].append(self.computeLoss(X_val, Y_val, MFs))
            results[3].append(self.computeAccuracy(X_val, y_val, MFs))
            best_acc = results[3][-1]
            best_F = list(self.F)
            best_W = np.copy(self.W)
            early_stop_counter = patience
            
        if(verbose):
                print("Starting state ")
                print("    Training cost: " + str(results[0][-1]))
                print("    Training accuracy: " + str(results[2][-1]))
                if(X_val is not None and Y_val is not None):
                    print("    Validation cost: " + str(results[1][-1]))
                    print("    Validation accuracy: " + str(results[3][-1]))
                    
        # If momentum is used
        if GDparams.rho != 0.0:
            # Create zero matrix for each parameter
            V_W = np.zeros(self.W.shape)
            V_F2 = np.zeros(self.F[1].shape)
            V_F1 = np.zeros(self.F[0].shape)
                    
        learning_rate = GDparams.eta
        steps = 0
        
        if imbalanced_set:
            X_ = np.copy(X)
            Y_ = np.copy(Y)
            y_ = np.copy(y)
        for i in tqdm(range(GDparams.n_epochs)):
            if imbalanced_set:
                indices = []                
                for k in range(self.K):
                    indices.append(np.random.choice(bins[k], size = min_samples, replace=False))
                indices = np.array(indices).flatten()
                np.random.shuffle(indices)
                X = np.copy(X_[:,indices])
                Y = np.copy(Y_[:,indices])
                y = np.argmax(Y, axis = 0)
                mini_batch_count = X.shape[1] // GDparams.n_batch
                

            for j in range(mini_batch_count):
                steps += 1                    
                if(j < mini_batch_count - 1):
                    start = j * GDparams.n_batch
                    end = start + GDparams.n_batch
                    mini_batch_input = X[:,start:end]
                    mini_batch_output = Y[:,start:end]
                else:
                    # Take the remaining samples in the last mini batch
                    mini_batch_input = X[:,j * GDparams.n_batch:]
                    mini_batch_output = Y[:,j * GDparams.n_batch:]
            
                # Construct MF Matrices
                MFs = [self.make_mf_matrix(self.F[0], self.nlen_list[0])]
                MFs.append(self.make_mf_matrix(self.F[1], self.nlen_list[1]))
                P_batch, X_batch1, X_batch2 = self.forwardPass(mini_batch_input, MFs)
                grad_F1, grad_F2, grad_W = self.backwardPass(mini_batch_output, P_batch, mini_batch_input,\
                                                             X_batch1, X_batch2, MFs)
                
                # Converge if all gradients are zero
                if np.all(grad_W < tol) == 0 and np.all(grad_F1 < tol) and np.all(grad_F2 < tol):
                    print("Learning converged at epoch " + str(i))
                    break              
                
                if GDparams.rho == 0.0:
                    self.W -= learning_rate * grad_W
                    self.F[1] -= learning_rate * grad_F2
                    self.F[0] -= learning_rate * grad_F1
                else:
                    V_W = GDparams.rho * V_W + learning_rate * grad_W
                    V_F2 = GDparams.rho * V_F2 + learning_rate * grad_F2
                    V_F1 = GDparams.rho * V_F1 + learning_rate * grad_F1
                    self.W -= V_W
                    self.F[1] -= V_F2
                    self.F[0] -= V_F1
                
                if steps % n_update == 0:
                    if imbalanced_set:
                        X = np.copy(X_)
                        Y = np.copy(Y_)
                        y = np.copy(y_)
                    results[0].append(self.computeLoss(X, Y, MFs))
                    results[2].append(self.computeAccuracy(X, y, MFs))
                    if(X_val is not None and Y_val is not None):
                        results[1].append(self.computeLoss(X_val, Y_val, MFs))
                        results[3].append(self.computeAccuracy(X_val, y_val, MFs))
                        if results[3][-1] > best_acc:
                            early_stop_counter = patience
                            best_acc = results[3][-1]
                            best_F = list(self.F)
                            best_W = np.copy(self.W)
                        else:
                            early_stop_counter -= 1
                            if early_stop_counter == 0:
                                break
                                
                    if(verbose):
                        print("Iteration " + str(i * mini_batch_count + j))
                        print("    Training cost: " + str(results[0][-1]))
                        print("    Training accuracy: " + str(results[2][-1]))
                        if(X_val is not None and Y_val is not None):
                            print("    Validation cost: " + str(results[1][-1]))
                            print("    Validation accuracy: " + str(results[3][-1]))
            # Decay the learning rate
            learning_rate *= GDparams.decay_rate
            if early_stop_counter == 0:
                break
            
    
        self.F = best_F
        self.W = best_W
        
        return results
    

In [None]:
class Params:
    """
    Bundle of the hyperparameters that drive gradient descent learning.

    Attributes:
        n_batch: Number of samples in each mini-batch.
        eta: Learning rate.
        n_epochs: Maximum number of learning epochs.
        decay_rate: Factor (normally below 1) by which the learning rate is
            multiplied after each epoch.
        rho: Momentum coefficient, i.e. the weight given to the accumulated
            updates of previous steps; 0 disables momentum.
    """
    def __init__(self, n_batch, eta, n_epochs, decay_rate = 1.0, rho = 0.0):
        """ Stores the given hyperparameters on the instance. """
        self.n_batch, self.eta, self.n_epochs = n_batch, eta, n_epochs
        self.decay_rate, self.rho = decay_rate, rho

In [None]:
def read_names_and_labels(file_path):
    """
    Reads the names and labels from the given file and
    returns a list for each.

    Every non-blank line is expected to look like "<name> <label>": the label
    is the text after the last space and the name is everything before it
    (names may themselves contain spaces).

    Args:
        file_path: Path to the file containing the names
            and each of their labels.

    Returns:
        A tuple with two lists (<names_list>, <labels_list>). Labels are
        returned as strings without surrounding whitespace.
    """
    names = []
    labels = []
    # Read from the path that was passed in, not from a notebook-level global
    with open(file_path, "r") as f:
        for line in f:
            if not line.strip():
                continue  # skip blank lines instead of producing an empty entry
            name, _, label = line.rpartition(" ")
            names.append(name)
            labels.append(label.strip())  # drop the trailing newline
    return (names, labels)

In [None]:
def encode_string(string, character_dictionary, max_length):
    """
    One-hot encodes a character string: every character becomes a one-hot
    column and the columns are stacked from left to right.

    Args:
        string: The string to be encoded.
        character_dictionary: A dictionary which has a unique
            index for each character in the alphabet used by
            the string.
        max_length: Maximum length of the string. If the string
            is shorter than max_length, zero columns are added as
            padding after the encoded character columns.

    Returns:
        A C x max_length matrix with the one-hot encoded characters
        of the string and possibly zero padding in the last columns,
        where C is the number of different characters in the alphabet
        used.
    """
    alphabet_size = len(character_dictionary)
    encoded = np.zeros((alphabet_size, max_length))
    for column, character in enumerate(string):
        encoded[character_dictionary[character], column] = 1
    return encoded

In [None]:
def one_hot_encoding(label_id, label_no):
    """
    Builds the one-hot representation of a label.

    Args:
        label_id: Index of label.
        label_no: Number of total labels.

    Returns:
        A numpy vector with label_no elements that is 1 at position
        label_id and 0 everywhere else.
    """
    encoded = np.zeros(label_no)
    encoded[label_id] = 1
    return encoded

In [None]:
def getRelativeErrors(grad1, grad2):
    """
    Computes the element-wise relative error between two gradients:
    |grad1 - grad2| / max(eps, |grad1| + |grad2|), where eps is the machine
    epsilon of float and prevents a division by zero.
    """
    machine_eps = np.finfo(float).eps
    numerator = np.absolute(grad1 - grad2)
    denominator = np.maximum(np.absolute(grad1) + np.absolute(grad2), machine_eps)
    return numerator / denominator

In [None]:
def makePlots(tr_losses, val_losses, tr_accuracies, val_accuracies, n_updates):
    """
    Plots the recorded cost (left panel) and accuracy (right panel) curves of
    the training and validation data against the number of update steps.

    Args:
        tr_losses: Training cost at every evaluation.
        val_losses: Validation cost at every evaluation.
        tr_accuracies: Training accuracy at every evaluation.
        val_accuracies: Validation accuracy at every evaluation.
        n_updates: Number of update steps between two consecutive evaluations.
    """
    def draw_panel(position, train_curve, val_curve, train_label, val_label, title, y_label):
        plt.subplot(1, 2, position)
        for curve, line_format, label in ((train_curve, 'r-', train_label),
                                          (val_curve, 'b-', val_label)):
            update_steps = [idx * n_updates for idx in range(len(curve))]
            plt.plot(update_steps, curve, line_format, label=label)
        plt.title(title)
        plt.xlabel('Update steps')
        plt.ylabel(y_label)
        plt.legend()

    plt.rcParams["figure.figsize"] = (10,5)
    draw_panel(1, tr_losses, val_losses, 'Train', 'Validation', 'Cost function', 'Cost value')
    draw_panel(2, tr_accuracies, val_accuracies, 'Training data', 'Validation data', 'Accuracy', 'Accuracy')
    plt.show()

In [None]:
def plot_confusion_matrix(convNet, inv_class_dictionary, Y, X,
                      title='Confusion matrix', normalize = False,
                      cmap=plt.cm.Blues):
    """
    Plots the confusion matrix of the network's predictions on X against the
    true classes encoded in Y, annotating every cell with its value.
    Normalization (per true class) can be applied by setting `normalize=True`.

    Args:
        convNet: Network providing make_mf_matrix, forwardPass, F and nlen_list.
        inv_class_dictionary: Maps the class indices 1..K to class names.
        Y: One-hot target matrix with one sample per column.
        X: Input matrix with one sample per column.
        title: Title of the plot.
        normalize: Whether each row is divided by its sum.
        cmap: Colour map used for the image.
    """
    n_classes = len(inv_class_dictionary)
    class_labels = [inv_class_dictionary[class_id] for class_id in range(1, n_classes + 1)]

    conv_matrices = [convNet.make_mf_matrix(convNet.F[layer], convNet.nlen_list[layer])
                     for layer in (0, 1)]
    probabilities = convNet.forwardPass(X, conv_matrices)[0]
    true_classes = np.argmax(Y, axis = 0)
    predicted_classes = np.argmax(probabilities, axis = 0)
    cm = confusion_matrix(true_classes, predicted_classes, labels = list(range(n_classes)))

    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_positions = np.arange(len(class_labels))
    plt.xticks(tick_positions, class_labels, rotation=45)
    plt.yticks(tick_positions, class_labels)

    value_format = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so they stay readable
    half_max = cm.max() / 2.
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        text_color = "white" if cm[row, col] > half_max else "black"
        plt.text(col, row, format(cm[row, col], value_format),
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

## Preparing data

##### Read the data from the text file

In [None]:
try:
    names, labels = read_names_and_labels(name_path)
except OSError:
    # Only file-access problems match the message below; anything else is a
    # real bug and must not be hidden. Re-raise either way: every later cell
    # needs `names` and `labels`, so continuing would only fail further down.
    print("Requested file " + name_path + " does not exist or cannot be accessed.")
    raise

##### Create a character dictionary, get the size of the vocabulary and the maximum name length

In [None]:
# Give every character an index in order of first appearance
character_dict = {}
for name in names:
    for character in name:
        # Indices are 0, 1, 2, ... so the current size is the next free index
        character_dict.setdefault(character, len(character_dict))

# Maximum name length (-1 if there are no names at all)
n_len = max((len(name) for name in names), default = -1)

labels = np.array(labels, dtype = int)
d = len(character_dict) # number of unique characters
K = len(np.unique(labels)) # number of unique classes

In [None]:
# Build inverse dictionary mapping (index -> character)
inv_character_dict = {v: k for k, v in character_dict.items()}
# Check for correctness: print the index assigned to 'o' and the character at index 2.
# Raises KeyError if 'o' never occurs or fewer than three distinct characters exist.
print(character_dict['o'])
print(inv_character_dict[2])

In [None]:
# Read the class indices (column 0) and class names (column 1) and build
# lookup tables in both directions
if not os.path.exists(category_labels_path):
    print("Requested file " + category_labels_path + " does not exist.")
else:
    class_indices = np.loadtxt(category_labels_path, usecols = 0, dtype = int)
    class_names = np.loadtxt(category_labels_path, usecols = 1, dtype = str)
    K = len(class_names)
    class_dictionary = dict(zip(class_names, class_indices))
    inv_class_dictionary = {index: class_name for class_name, index in class_dictionary.items()}
    # Check for correctness
    print(class_dictionary['Arabic'])
    print(inv_class_dictionary[1])

In [None]:
# Summary of the dataset dimensions that determine the network's input and output sizes
print("DIFFERENT UNIQUE CHARACTERS: " + str(d))
print("MAXIMUM NAME LENGTH: " + str(n_len))
print("NUMBER OF UNIQUE CLASSES: " + str(K))

#### Encode names and labels using one-hot-encoding

In [None]:
# Encode the names into a matrix where each column corresponds to a different
# name, re-using the matrix cached on disk when it matches the current data
vectorized_input_size = d * n_len
expected_shape = (vectorized_input_size, len(names))

X = np.load(input_save_path) if os.path.exists(input_save_path) else None
if X is not None and X.shape != expected_shape:
    # The cache was built from different data (other names, alphabet or
    # maximum length); using it would misalign inputs and labels.
    print("Cached input matrix has shape " + str(X.shape) + " instead of " + str(expected_shape) + "; rebuilding it.")
    X = None

if X is None:
    X = np.zeros(expected_shape)
    for idx, name in enumerate(names):
        # Column-major flattening stacks the one-hot character columns one after another
        X[:, idx] = encode_string(name, character_dict, n_len).flatten(order = 'F')
    np.save(input_save_path, X)

In [None]:
# One-hot encoding for output: one column per name.
# Labels are shifted by -1 so that label 1 ends up in row 0.
Y = np.array([one_hot_encoding(label - 1, K) for label in labels]).T

In [None]:
# Get the indices of the inputs that are going to be used in the validation set
if not os.path.exists(val_ind_path):
    # The split below cannot be made without these indices, so fail here with
    # a clear message instead of with a NameError in the next cell.
    raise FileNotFoundError("Requested file " + val_ind_path + " does not exist.")
validation_indices = np.loadtxt(val_ind_path, dtype=int)

#### Discard indices that do not correspond to any input and split into training and validation data

In [None]:
# Keep only the indices that address an existing column of X
# NOTE(review): if the index file is 1-based, the indices would need a -1 shift — confirm
validation_indices = validation_indices[validation_indices < X.shape[1]]

# Every column that is not a validation column belongs to the training set
is_validation = np.zeros(X.shape[1], dtype = bool)
is_validation[validation_indices] = True

X_tr, X_val = X[:, ~is_validation], X[:, validation_indices]
Y_tr, Y_val = Y[:, ~is_validation], Y[:, validation_indices]

#### Create a ConvNet object instance

In [None]:
# Seed before the weights are drawn so the checks below are reproducible
np.random.seed(400)
n1 = n2 = 20  # number of filters in the first / second convolutional layer
k1 = 5        # filter width of the first convolutional layer
k2 = 3        # filter width of the second convolutional layer
# Pass the variables (not repeated literals) so the later cells that use
# n1 / k1 always agree with the network that was actually built
conv_net = ConvNet(n1 = n1, n2 = n2, k1 = k1, k2 = k2, d = d, K = K, nlen = n_len)

In [None]:
# Both formulations of the convolution must give the same result for the
# SAME input sample: MX(x) * vec(F) == MF(F) * x
x_sample = X_tr[:, 0]
MF_Matrix = conv_net.make_mf_matrix(conv_net.F[0], nlen=n_len)
MX_Matrix = conv_net.make_mx_matrix(x_sample, d, k1, n1, n_len)
s1 = MX_Matrix.dot(conv_net.F[0].flatten('F').reshape(-1, 1))
s2 = MF_Matrix.dot(x_sample.reshape(-1, 1))
# The two products sum identical terms, so the default tolerances suffice
print(np.allclose(s1, s2))

In [None]:
# Heat map of MX_Matrix; the y-axis is inverted with the intent of drawing the first row at the top
ax = plt.gca()
ax.invert_yaxis()
plt.pcolormesh(MX_Matrix)
plt.show()

In [None]:
# Heat map of MF_Matrix; the y-axis is inverted with the intent of drawing the first row at the top
ax = plt.gca()
ax.invert_yaxis()
plt.pcolormesh(MF_Matrix)
plt.show()

#### Use precomputed filters and convolution outputs to verify that MF_Matrix is correct
#### The check above showed that MX_Matrix and MF_Matrix give the same result, so if the MF_Matrix computation is correct, the MX_Matrix computation must be correct as well

In [None]:
# Load the precomputed reference values used to validate make_mf_matrix
dictionary = loadmat("./DebugInfo")
x_input = dictionary['x_input']
X_input = dictionary['X_input']
F = dictionary['F']
vecF = dictionary['vecF']
vecS = dictionary['vecS']
S = dictionary['S']
# NOTE(review): X_input, vecF and S are loaded but not used in this cell

# Apply the reference filters to the reference input and compare with the stored result
# NOTE(review): assumes the reference input has n_len columns — confirm against DebugInfo
MF_Matrix = conv_net.make_mf_matrix(F, n_len)
s = MF_Matrix.dot(x_input)
print(np.allclose(s, vecS))

#### Debug gradient calculation through backward pass by comparing to the gradient values computed with numerical approximation

In [None]:
# Forward pass on the first 100 samples; its outputs feed the analytic gradient computation below
MFs = [conv_net.make_mf_matrix(conv_net.F[0], conv_net.nlen_list[0])]
MFs.append(conv_net.make_mf_matrix(conv_net.F[1], conv_net.nlen_list[1]))
P_batch, X_batch1, X_batch2 = conv_net.forwardPass(X[:,:100], MFs)

In [None]:
# NOTE(review): conv_net's weights were already drawn when it was constructed, and neither
# backwardPass nor compute_grad_num_slow draws random numbers, so this seed does not
# affect the values compared below.
np.random.seed(400)
# Analytic gradients (back-propagation) vs. numerical gradients (centered differences)
# on the same 100 samples
grad_F1, grad_F2, grad_W = conv_net.backwardPass(Y[:,:100], P_batch, X[:,:100], X_batch1, X_batch2, MFs)
grad_F1_approx, grad_F2_approx, grad_W_approx = conv_net.compute_grad_num_slow(X[:,:100], Y[:,:100])

# Element-wise relative errors for the first-layer filters, second-layer filters and W
errors1 = getRelativeErrors(grad_F1, grad_F1_approx)
errors2 = getRelativeErrors(grad_F2, grad_F2_approx)
errors3 = getRelativeErrors(grad_W, grad_W_approx)
# Worst-case relative error per parameter group
print(np.max(errors1))
print(np.max(errors2))
print(np.max(errors3))

# Average relative error per parameter group
print(np.mean(errors1))
print(np.mean(errors2))
print(np.mean(errors3))

#### Train network with mini-batch gradient descent with momentum and learning rate decay

In [None]:
# Tune the learning rate hyperparameter: train a freshly initialized network for
# every candidate learning rate and plot its learning curves and confusion matrices
etas = [0.001, 0.005, 0.01, 0.05]
for eta in etas:
    rho = 0.9
    epochs = 25 #125
    mini_batch_size = 100
    decay_rate = 0.95
    n_update = 100
    
    # Same seed for every learning rate so all runs start from identical weights
    np.random.seed(400)
    filter_width_constants = [5, 3]
    filter_numbers = [20, 20]
    d = len(character_dict)
    K = Y_tr.shape[0]
    conv_net = ConvNet(n1 = filter_numbers[0], n2 = filter_numbers[1] , k1 = filter_width_constants[0],\
                       k2 = filter_width_constants[1], d = d, K = K,\
                       nlen = n_len)

    GDparams = Params(mini_batch_size, eta, epochs, decay_rate, rho)
    # patience = 1e+10 effectively disables early stopping
    results = conv_net.miniBatchGD(X_tr, Y_tr, GDparams, verbose = True, X_val = X_val, Y_val = Y_val, tol = 1e-10, n_update = n_update, patience = 1e+10)
    
    makePlots(results[0], results[1], results[2], results[3], n_update)
    # The figure size must be set BEFORE the figure is created: makePlots
    # leaves it at (10,5), and changing it after plotting has no effect
    plt.rcParams["figure.figsize"] = (28,15)
    plot_confusion_matrix(conv_net, inv_class_dictionary, Y_tr, X_tr)
    plt.show()
    plt.rcParams["figure.figsize"] = (28,15)
    plot_confusion_matrix(conv_net, inv_class_dictionary, Y_val, X_val)
    plt.show()

#### Train network with mini-batch gradient descent with momentum while compensating for the imbalanced dataset

In [None]:
# Train with the selected learning rate while sampling a class-balanced
# training subset in every epoch (imbalanced_set = True)
etas = [0.05]
for eta in etas:
    rho = 0.9
    epochs = 2000
    mini_batch_size = 100
    decay_rate = 0.95
    # Metrics are only computed every 2500 update steps
    n_update = 2500
    
    # Fixed seed so the weight initialization and the balanced sampling are reproducible
    np.random.seed(400)
    filter_width_constants = [5, 3]
    filter_numbers = [20, 20]
    d = len(character_dict)
    K = Y_tr.shape[0]
    conv_net = ConvNet(n1 = filter_numbers[0], n2 = filter_numbers[1] , k1 = filter_width_constants[0],\
                       k2 = filter_width_constants[1], d = d, K = K,\
                       nlen = n_len)

    GDparams = Params(mini_batch_size, eta, epochs, decay_rate, rho)
    # patience = 1e+10 effectively disables early stopping
    results = conv_net.miniBatchGD(X_tr, Y_tr, GDparams, verbose = True, X_val = X_val, Y_val = Y_val, tol = 1e-10, n_update = n_update, patience = 1e+10, imbalanced_set = True)
    makePlots(results[0], results[1], results[2], results[3], n_update)
    # Figure size is set before each confusion matrix figure is created
    plt.rcParams["figure.figsize"] = (28,15)
    plot_confusion_matrix(conv_net, inv_class_dictionary, Y_tr, X_tr)
    plt.show()
    plt.rcParams["figure.figsize"] = (28,15)
    plot_confusion_matrix(conv_net, inv_class_dictionary, Y_val, X_val)
    plt.show()

In [None]:
# Hand-picked surnames to classify with the trained network
surnames = ["Stylianidis", "Rodriguez", "Siniuokov", "Gaddy", "Chizzali", "Shi"]
vectorized_input_size = d * n_len
X_tst = np.zeros((vectorized_input_size, len(surnames)))
# Encode exactly like the training inputs (one column per name, column-major flattening).
# encode_string raises KeyError for characters missing from character_dict and
# IndexError for names longer than n_len.
for idx, name in enumerate(surnames):
    X_tst[:, idx] = encode_string(name, character_dict, n_len).flatten(order = 'F')

In [None]:
# Class probabilities of the test surnames under the most recently trained network
MFs = [conv_net.make_mf_matrix(conv_net.F[0], conv_net.nlen_list[0])]
MFs.append(conv_net.make_mf_matrix(conv_net.F[1], conv_net.nlen_list[1]))
P_batch, X_batch1, X_batch2 = conv_net.forwardPass(X_tst, MFs)

In [None]:
# Most probable class per surname; +1 converts the 0-based row index into the
# key used by inv_class_dictionary
predictions = np.argmax(P_batch, axis = 0)
print([inv_class_dictionary[prediction + 1] for prediction in predictions])

In [None]:
# Heat map of the predicted probabilities: one row per class, one column per surname
plt.rcParams['figure.figsize'] = (15, 10)
plt.pcolormesh(P_batch)
plt.show()