# In this assignment, a simple ConvNet is implemented and trained to predict the language of a surname from its spelling in English.

In [17]:
import os
import numpy as np
from tqdm import tqdm

## 1. ConvNet class implementation

In [2]:
class ConvNet:
    """
    Placeholder for the convolutional network used in this notebook.

    The class currently defines no layers, parameters or methods other
    than an empty constructor.
    """

    def __init__(self):
        """
        Initializes the convolutional neural network.

        No attributes are created yet; the constructor is a no-op.
        """

## 2. Preparing data

In [3]:
def encodeString(string, character_dictionary, max_length):
    """
    One-hot encodes a character string.

    Every character of the string is converted to a one-hot column
    and the columns are stacked from left to right, in the order the
    characters appear in the string.

    Args:
        string: The string to be encoded.
        character_dictionary: A dictionary which has a unique
            row index for each character in the alphabet used by
            the string.
        max_length: Maximum length of the string, i.e. the number of
            columns of the result. If the string is shorter than
            max_length, zero columns are left as padding after the
            encoded character columns.

    Returns:
        A C x max_length array with the one-hot encoded characters
        of the string and possibly zero padding in the last columns,
        where C = len(character_dictionary) is the number of different
        characters in the alphabet used.

    Raises:
        IndexError: If the string is longer than max_length.
        KeyError: If the string contains a character that is not a
            key of character_dictionary.
    """
    d = len(character_dictionary)
    encoded_string = np.zeros((d, max_length))
    # Fail early with an explicit message instead of NumPy's generic
    # out-of-bounds error raised in the middle of the encoding loop.
    if len(string) > max_length:
        raise IndexError(
            "string of length " + str(len(string)) +
            " does not fit in max_length = " + str(max_length) + " columns"
        )
    for column, character in enumerate(string):
        encoded_string[character_dictionary[character], column] = 1
    return encoded_string

In [52]:
# Input files: the surnames with their labels, and the list of category labels
category_labels_path = "category_labels.txt"
name_path = "ascii_names.txt"
# Input file with the indices of the inputs that are going to be used in the validation set
val_ind_path = "Validation_Inds.txt"
# Output file used to save the inputs after their encoding
save_input_path = "onehot_encoded_inputs.npy"

In [53]:
# Read the data: on every line the last whitespace-separated token is the
# integer class label, and everything before it (re-joined with single
# spaces) is the name.
names = []
labels = []
if(os.path.exists(name_path)):
    # The with-statement closes the file on exit; no explicit close() is needed.
    with open(name_path,"r") as f:
        for line in tqdm(f):
            entry = line.split()
            if not entry:
                # Skip blank lines (e.g. a trailing newline at the end of
                # the file) instead of failing on entry[-1].
                continue
            names.append(' '.join(entry[:-1]))
            labels.append(entry[-1])
    names = np.array(names)
    labels = np.array(labels, dtype = int)
else:
    # NOTE: in this branch names and labels stay empty Python lists, so
    # later cells that rely on array attributes such as names.shape will fail.
    print("Requested file " + name_path + " does not exist.")

20050it [00:00, 1002644.39it/s]


In [54]:
# Build the class name -> class index dictionary and its inverse from the
# category labels file (column 0: class index, column 1: class name)
if os.path.exists(category_labels_path):
    class_indices = np.loadtxt(category_labels_path, usecols = 0, dtype = int)
    class_names = np.loadtxt(category_labels_path, usecols = 1, dtype = str)
    # K is the number of classes
    K = len(class_names)
    class_dictionary = dict(zip(class_names, class_indices))
    inv_class_dictionary = dict(zip(class_dictionary.values(), class_dictionary.keys()))
    # Sanity check: look one entry up in each direction
    print(class_dictionary['Arabic'])
    print(inv_class_dictionary[1])
else:
    print("Requested file " + category_labels_path + " does not exist.")

1
Arabic


### Determine number of unique characters and set up dictionary / Determine maximum length of name in dataset

In [7]:
# Single pass over the names: assign every new character the next free index
# (in order of first appearance) and track the length of the longest name.
character_dictionary = {}
unique_idx = 0
max_length = -1
for name in tqdm(names):
    max_length = max(max_length, len(name))
    for character in name:
        if character in character_dictionary:
            continue
        character_dictionary[character] = unique_idx
        unique_idx += 1
unique_character_no = len(character_dictionary)

100%|████████████████████████████████| 20050/20050 [00:00<00:00, 488934.73it/s]


In [8]:
# Report the alphabet size and the length of the longest name
print("DIFFERENT UNIQUE CHARACTERS: ", unique_character_no, sep = "")
print("MAXIMUM NAME LENGTH: ", max_length, sep = "")

DIFFERENT UNIQUE CHARACTERS: 55
MAXIMUM NAME LENGTH: 19


In [9]:
# Invert the character -> index mapping into an index -> character mapping
inv_character_dictionary = dict(
    zip(character_dictionary.values(), character_dictionary.keys())
)
# Sanity check: look one entry up in each direction
print(character_dictionary['o'])
print(inv_character_dictionary[2])

2
o


### One-hot encoding and vectorization of the input names

In [55]:
# Encode the inputs into a matrix where each column corresponds to a different name
vectorized_input_size = unique_character_no * max_length
X = np.zeros((vectorized_input_size, names.shape[0]))
for idx, name in enumerate(tqdm(names)):
    encoded_name = encodeString(name, character_dictionary, max_length)
    # order = 'F' flattens column by column, so the one-hot vectors of
    # consecutive characters end up one after another in the column of X
    X[:, idx] = encoded_name.flatten(order = 'F')

100%|█████████████████████████████████| 20050/20050 [00:00<00:00, 30879.26it/s]


In [56]:
# Save inputs in a file if they are not already saved
if(not os.path.exists(save_input_path)):
    np.save(save_input_path, X)

In [57]:
# Get the indices of the inputs that are going to be used in the validation set
if(os.path.exists(val_ind_path)):
    validation_indices = np.loadtxt(val_ind_path)