# <center>English names Gender Classification<center/>

In this notebook english names gender classification is implemented. The classification models are : <br>
1. Recurrent Neural Network - LSTM 
2. Multilayer Perceptron - MLP

The implemetation is in tensorflow and all necessary methods are implemeted in this notebook. Therefore it can easily be run on [colab](https://colab.research.google.com/notebooks/welcome.ipynb#recent=true) platform to test run and see the results  

## Import necessary libraries

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import sys, os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

from matplotlib import pyplot as plt
%matplotlib inline

## Load data 

In [4]:
data_path = "Data/English/train_eng.csv"
test_data_path = "Data/English/test_eng.csv"

# Load data
p_train_data = pd.read_csv(data_path)
p_test_data = pd.read_csv(test_data_path)

# Convert data to numpy arrays
train = p_train_data.values
test = p_test_data.values

train = np.stack(sorted(list(train), key=lambda x: len(x[0])))

## Set training & models parameters

In [5]:
letter_embedding_size = 5
lstm_hidden_size = 5
epochs = 50
minibatch_size = 256

## Define helper methods 

In [6]:
def transform_data(data, max_len):
    unique = list(set("".join(data[:,0])))
    unique.sort()
    vocab = dict(zip(unique, range(1,len(unique)+1))) # start from 1 for zero padding

    classes = list(set(data[:,1]))
    classes.sort()
    class_map = dict(zip(classes, range(len(unique))))

    names = list(data[:,0])
    labels = list(data[:,1])

    def transform_name(name):
        point = np.zeros((1, max_len), dtype=int)
        name_mapped = np.array(list(map(lambda l: vocab[l], name)))
        point[0,0: len(name_mapped)] = name_mapped
        return point

    transform_label = lambda lbl: np.array([[class_map[lbl]]])

    names = list(map(transform_name, names))
    labels = list(map(transform_label, labels))

    names = np.concatenate(names, axis=0)
    labels = np.concatenate(labels, axis=0)

    return names, labels, vocab

def get_minibatches(names, labels, mb_size):
    batches = []
    position = 0
    
    while position + mb_size < len(labels):
        batches.append((names[position: position + mb_size], labels[position: position + mb_size]))
        position += mb_size

    batches.append((names[position:], labels[position:]))

    return batches

## Define LSTM model

In [7]:
def create_LSTM(emb_size, vocab_size, lstm_hidden_size, T, learning_rate=0.001):
    pad_vector = tf.zeros(shape=(1, emb_size), dtype=tf.float32, name="zero_padding")
    symbol_embedding = tf.get_variable('symbol_embeddings', shape=(vocab_size, emb_size), dtype=tf.float32)

    symbol_embedding = tf.concat([pad_vector, symbol_embedding], axis=0)

    input_ = tf.placeholder(shape=[None, T], dtype=tf.int32)
    labels_ = tf.placeholder(shape=[None, 1], dtype=tf.float32)

    embedded = tf.nn.embedding_lookup(symbol_embedding, input_)

    lstm = tf.nn.rnn_cell.LSTMCell(lstm_hidden_size)
    outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=embedded, dtype=tf.float32)
    output = outputs[:, -1, :]
    logits = tf.keras.layers.Dense(1)(output)

    classify = tf.nn.sigmoid(logits)

    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels_), axis=0)

    train = tf.contrib.opt.LazyAdamOptimizer(learning_rate).minimize(loss)

    print("trainable parameters:", np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()]))

    return {
        'train': train,
        'input': input_,
        'labels': labels_,
        'loss': loss,
        'classify': classify
    }


In [8]:
def evaluate_LSTM(tf_session, tf_loss, tf_classify, data, labels):
    """
    Evaluate loss and accuracy on a single minibatch
    :param tf_session: current opened session
    :param tf_loss: tensor for calculating loss
    :param tf_classify: tensor for calculating sigmoid activations
    :param data: data from the current batch
    :param labels: labels from the current batch
    :return: loss_value, accuracy_value
    """

    loss_val, predict = tf_session.run([tf_loss, tf_classify], {
        input_: data,
        labels_: labels
    })
    acc_val = accuracy_score(labels, np.where(predict > 0.5, 1, 0))

    return loss_val, acc_val

## Specify LSTM Specific training parameters

In [26]:
tf.reset_default_graph()

max_len = p_train_data['Name'].str.len().max()
train_data, train_labels, voc = transform_data(train, max_len)
test_data, test_labels, _ = transform_data(test, max_len)
batches = get_minibatches(train_data, train_labels, minibatch_size)
terminals = create_LSTM(letter_embedding_size, len(voc), lstm_hidden_size, max_len)

train_ = terminals['train']
input_ = terminals['input']
labels_ = terminals['labels']
loss_ = terminals['loss']
classify_ = terminals['classify']

pl_loss = np.zeros((epochs,2))
pl_acc = np.zeros((epochs,2))

trainable parameters: 486


## Training & Evaluating LSTM 

In [27]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for e in range(epochs):
        for batch in batches:
            names, labels = batch

            sess.run([train_], {
                input_: names,
                labels_: labels})

        train_loss = 0
        train_acc = 0
        for mini_batch in batches:
            names, labels = mini_batch
            mini_loss, mini_acc = evaluate_LSTM(sess, loss_, classify_, names, labels)
            train_loss += mini_loss
            train_acc += mini_acc

        train_loss = train_loss/len(batches)
        train_acc = train_acc/len(batches)
        pl_loss[e,0] = train_loss
        pl_acc[e,0] = train_acc
        
        # Performance on the test set
        test_loss, test_acc = evaluate_LSTM(sess, loss_, classify_, test_data, test_labels)
        pl_loss[e,1] = test_loss
        pl_acc[e,1] = test_acc
        
        print("Epoch {:d}, train loss {:.4f}, train acc {:.4f}, test loss {:.4f}, test accuracy {:.4f}".format(e+1, train_loss[0], train_acc, test_loss[0], test_acc))


Epoch 1, train loss 0.5709, train acc 0.7132, test loss 0.5705, test accuracy 0.7134
Epoch 2, train loss 0.5325, train acc 0.7393, test loss 0.5320, test accuracy 0.7396
Epoch 3, train loss 0.5233, train acc 0.7433, test loss 0.5227, test accuracy 0.7436
Epoch 4, train loss 0.5211, train acc 0.7418, test loss 0.5207, test accuracy 0.7420
Epoch 5, train loss 0.5114, train acc 0.7482, test loss 0.5111, test accuracy 0.7484
Epoch 6, train loss 0.5032, train acc 0.7596, test loss 0.5030, test accuracy 0.7597
Epoch 7, train loss 0.4986, train acc 0.7642, test loss 0.4985, test accuracy 0.7643
Epoch 8, train loss 0.4979, train acc 0.7649, test loss 0.4979, test accuracy 0.7650
Epoch 9, train loss 0.4981, train acc 0.7645, test loss 0.4981, test accuracy 0.7646
Epoch 10, train loss 0.4959, train acc 0.7655, test loss 0.4959, test accuracy 0.7656
Epoch 11, train loss 0.4927, train acc 0.7675, test loss 0.4927, test accuracy 0.7676
Epoch 12, train loss 0.4892, train acc 0.7695, test loss 0.4892

## Define Multi-layer perceptron model

In [28]:
def create_NN_model(emb_size, vocab_size, T, learning_rate=0.001):

    pad_vector = tf.zeros(shape=(1, emb_size), dtype=tf.float32, name="zero_padding")
    symbol_embedding = tf.get_variable('symbol_embeddings', shape=(vocab_size, emb_size), dtype=tf.float32)

    symbol_embedding = tf.concat([pad_vector, symbol_embedding], axis=0)

    input_ = tf.placeholder(shape=[None, T], dtype=tf.int32)
    labels_ = tf.placeholder(shape=[None, 1], dtype=tf.float32)

    embedded = tf.nn.embedding_lookup(symbol_embedding, input_)

    layer_1 = tf.keras.layers.Dense(13,activation=tf.nn.leaky_relu)(embedded)
    layer_2 = tf.keras.layers.Dense(7,activation=tf.nn.relu)(layer_1)

    output = tf.keras.layers.Flatten()(layer_2)
    logits = tf.keras.layers.Dense(1)(output)

    classify = tf.nn.sigmoid(logits)
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels_), axis=0)

    train = tf.contrib.opt.LazyAdamOptimizer(learning_rate).minimize(loss)

    return {
        'train': train,
        'input': input_,
        'labels': labels_,
        'loss': loss,
        'classify': classify
    }

def evaluate_NN(tf_session, tf_loss, tf_classify, data, labels):

    loss_val, predict = tf_session.run([tf_loss, tf_classify], {
        input_: data,
        labels_: labels
    })
    acc_val = accuracy_score(labels, np.where(predict > 0.5, 1, 0))

    return loss_val, acc_val

## Specify MLP Specific training parameters

In [29]:
tf.reset_default_graph()
max_len = p_train_data['Name'].str.len().max()
train_data, train_labels, voc = transform_data(train, max_len)
test_data, test_labels, _ = transform_data(test, max_len)
batches = get_minibatches(train_data, train_labels, minibatch_size)
terminals = create_NN_model(letter_embedding_size, len(voc),max_len)

train_ = terminals['train']
input_ = terminals['input']
labels_ = terminals['labels']
loss_ = terminals['loss']
classify_ = terminals['classify']

pl_loss = np.zeros((epochs,2))
pl_acc = np.zeros((epochs,2))

## Training & Evaluating MLP

In [30]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for e in range(epochs):
        for batch in batches:
            names, labels = batch

            sess.run([train_], {
                input_: names,
                labels_: labels
            })
            
        train_loss = 0
        train_acc = 0
        for mini_batch in batches:
            names, labels = mini_batch
            mini_loss, mini_acc = evaluate_NN(sess, loss_, classify_, names, labels)
            train_loss += mini_loss
            train_acc += mini_acc

        train_loss = train_loss/len(batches)
        train_acc = train_acc/len(batches)
        pl_loss[e,0] = train_loss
        pl_acc[e,0] = train_acc

        # Performance on the test set
        test_loss, test_acc = evaluate_NN(sess, loss_, classify_, test_data, test_labels)
        pl_loss[e,1] = test_loss
        pl_acc[e,1] = test_acc
        
        print("Epoch {:d}, train loss {:.4f}, train acc {:.4f}, test loss {:.4f}, test accuracy {:.4f}".format(e, train_loss[0], train_acc, test_loss[0], test_acc))

Epoch 0, train loss 0.6146, train acc 0.6658, test loss 0.6139, test accuracy 0.6664
Epoch 1, train loss 0.6068, train acc 0.6719, test loss 0.6060, test accuracy 0.6722
Epoch 2, train loss 0.5876, train acc 0.6789, test loss 0.5869, test accuracy 0.6791
Epoch 3, train loss 0.5720, train acc 0.6879, test loss 0.5712, test accuracy 0.6880
Epoch 4, train loss 0.5634, train acc 0.6947, test loss 0.5627, test accuracy 0.6948
Epoch 5, train loss 0.5547, train acc 0.7019, test loss 0.5540, test accuracy 0.7019
Epoch 6, train loss 0.5499, train acc 0.7062, test loss 0.5493, test accuracy 0.7062
Epoch 7, train loss 0.5461, train acc 0.7106, test loss 0.5456, test accuracy 0.7107
Epoch 8, train loss 0.5411, train acc 0.7142, test loss 0.5406, test accuracy 0.7143
Epoch 9, train loss 0.5392, train acc 0.7167, test loss 0.5388, test accuracy 0.7168
Epoch 10, train loss 0.5376, train acc 0.7193, test loss 0.5372, test accuracy 0.7194
Epoch 11, train loss 0.5317, train acc 0.7246, test loss 0.5312,

## <center> Conclusion <center/>