# Recurrent Networks - AML | Innopolis University
# Josep de Cid Rodríguez

In [None]:
# Mount Google Drive under /gdrive (uses the google.colab package); the next
# cell copies the dataset CSVs from there into the working directory.
from google.colab import drive

drive.mount('/gdrive')

In [None]:
# Copy the English train/test CSV files from the mounted Drive into the current directory
! cp /gdrive/My\ Drive/IU/AML/Labs/Lab5/train_eng.csv .
! cp /gdrive/My\ Drive/IU/AML/Labs/Lab5/test_eng.csv .

# Copy the Russian train/test CSV files as well
! cp /gdrive/My\ Drive/IU/AML/Labs/Lab5/train_rus.csv .
! cp /gdrive/My\ Drive/IU/AML/Labs/Lab5/test_rus.csv .

## Data & Preprocessing

In [None]:
import abc
import time
import pickle
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from functools import reduce
from sklearn.model_selection import train_test_split

%matplotlib inline

Read both *CSV* files and split the data into features (**names** column) and classification targets (**gender** column).

In [None]:
def read_dataset(path):
    """Load a CSV file and split it into its first two columns.

    Args:
        path: Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        A ``(frame, features, targets)`` tuple: the full DataFrame, the values
        of its first column and the values of its second column.
    """
    frame = pd.read_csv(path)
    table = frame.values
    features, targets = table[:, 0], table[:, 1]
    return frame, features, targets

# English names: train and test splits, plus their union (used by the random search)
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train_eng.csv', 'test_eng.csv'))
all_df = pd.concat([train_df, test_df], axis=0)

# Russian names: train and test splits
train_ru_df, test_ru_df = (pd.read_csv(csv_name) for csv_name in ('train_rus.csv', 'test_rus.csv'))

# Raw value arrays handed to the preprocessing step
train, test = train_df.values, test_df.values
train_rus, test_rus = train_ru_df.values, test_ru_df.values

We will apply the following preprocessing to our data:
1. Map every letter of the vocabulary to a unique positive ID, e.g. `Elizabeth` → `[5 38 35 52 27 28 31 46 34]`
2. Pad every sample with `0` so that all of them have the same length → `[5 38 35 52 27 28 31 46 34 0 ... 0]`
3. Encode our labels as `0` if *Female* or `1` if *Male*.

In [None]:
def preprocess_data(values, train=False, male_label='M', max_seq=None, vocab=None):
    """Encode (name, gender) rows as fixed-length ID sequences and binary labels.

    Args:
        values: 2-D array whose first column holds the names and whose
            remaining column(s) hold the gender label.
        train: When True, the rows are first sorted by increasing name length.
        male_label: Label value encoded as ``1``; every other value becomes ``0``.
        max_seq: Length every encoded name is padded (with ``0``) or truncated
            to. When None, the length of the longest name in ``values`` is used.
        vocab: Optional letter -> ID mapping to reuse, typically the one built
            from the training set so that train and test share the same IDs.
            When None, a new mapping is built from the names in ``values``.

    Returns:
        ``(x_ids, y_bin, max_seq, vocab)``: a list with one ``max_seq``-long ID
        array per name, an ``(n, 1)`` array of 0/1 labels, the sequence length
        that was used and the letter -> ID mapping that was used.
    """
    if train:
        # Sort training names by length increasing
        values = np.stack(sorted(list(values), key=lambda row: len(row[0])))

    x, y = values[:, 0], values[:, 1:]

    # Create vocabulary mapping letter -> ID (IDs start at 1; 0 is reserved for padding)
    if vocab is None:
        unique = sorted(set("".join(x)))
        vocab = dict(zip(unique, range(1, len(unique) + 1)))

    # Max length of our sequences in the model
    if max_seq is None:
        max_seq = max(map(len, x))

    x_ids = []
    for name in x:
        # Truncate names longer than max_seq so every row has the same length.
        # Letters missing from a reused vocabulary fall back to the padding ID 0,
        # which always stays inside the embedding table.
        ids = [vocab.get(letter, 0) for letter in name[:max_seq]]
        x_ids.append(np.array(ids + [0] * (max_seq - len(ids))))

    # Encode targets in 0 (Female) and 1 (Male)
    y_bin = np.array([[int(target[0] == male_label)] for target in y])

    return x_ids, y_bin, max_seq, vocab


# The test sets reuse the sequence length AND the vocabulary of their training
# set; otherwise the same letter could be mapped to different IDs in each split.
x_train_ids, y_train_bin, max_seq, vocab = preprocess_data(train, train=True)
x_test_ids, y_test_bin, _, _ = preprocess_data(test, max_seq=max_seq, vocab=vocab)

x_train_rus_ids, y_train_rus_bin, max_seq_rus, vocab_rus = preprocess_data(train_rus, train=True, male_label='М')
x_test_rus_ids, y_test_rus_bin, _, _ = preprocess_data(test_rus, male_label='М', max_seq=max_seq_rus, vocab=vocab_rus)

## Models

We will implement two parametrizable models in order to tune the hyperparameters and choose the most appropriate one. We will start with some standard hyperparameters:

In [None]:
class NN(abc.ABC):
    """Shared training loop and graph helpers for the name -> gender classifiers.

    Subclasses implement ``create_graph`` and are expected to call
    ``_create_graph_input`` and ``_create_graph_output`` from it; those helpers
    define the ``names``/``genders`` placeholders and the ``loss``,
    ``optimize``, ``predictions`` and ``accuracy`` tensors used by ``train``.
    """

    def __init__(self, x_train, y_train, x_val, y_val, print_mode=True):
        """Store the data splits and start from an empty default graph.

        Args:
            x_train, y_train: Training features and targets.
            x_val, y_val: Held-out features and targets, evaluated after every epoch.
            print_mode: When True, ``train`` prints a progress bar and the
                accuracies of every epoch.
        """
        # Reset previous graphs
        tf.reset_default_graph()

        # Model Data
        self.x_train = x_train
        self.y_train = y_train
        self.x_val = x_val
        self.y_val = y_val

        # Verbose output on/off
        self.print_mode = print_mode

    @abc.abstractmethod
    def create_graph(self):
        """Build the model graph; every subclass defines its own signature."""
        return

    def print_trainable_parameters(self):
        """Print the total number of scalars in all trainable variables of the default graph."""
        params = np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])
        print('Trainable parameters {}'.format(params))

    def train(self, epochs=100, batch_size=256, patience=25):
        """Train the model with early stopping on the held-out accuracy.

        Args:
            epochs: Maximum number of passes over the training data.
            batch_size: Number of samples per optimisation step.
            patience: Training stops once the held-out accuracy has not
                improved for this many consecutive epochs.

        Returns:
            ``(best_train_accuracy, train_accuracies, best_test_accuracy,
            test_accuracies, elapsed_seconds)`` where the two lists hold one
            value per completed epoch.
        """
        training_start_time = time.time()

        train_accuracies = []
        test_accuracies = []

        worse_epoch_count = 0
        best_train_accuracy = 0
        best_test_accuracy = 0

        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            for epoch in range(1, epochs + 1):
                if self.print_mode:
                    progress = tf.keras.utils.Progbar(target=len(self.x_train),
                                                      stateful_metrics=['batch loss', 'time'],
                                                      width=30, interval=0.5)

                start_time = time.time()
                if self.print_mode:
                    print('> Epoch {}:'.format(epoch))

                for batch_idx, batch in enumerate(self._next_batch(self.x_train, self.y_train, batch_size, shuffle=True)):
                    features, targets = batch
                    d = {self.names: features, self.genders: targets}
                    loss, _ = sess.run([self.loss, self.optimize], feed_dict=d)

                    if self.print_mode:
                        elapsed_time = time.time() - start_time
                        progress.update(batch_idx * batch_size,
                                        values=[('time', elapsed_time), ('batch loss', loss), ('epoch loss', loss)])

                if self.print_mode:
                    progress.update(len(self.x_train), values=[('batch loss', loss), ('epoch loss', loss)])

                # Accuracies are computed on the full splits in a single forward pass
                train_accuracy = sess.run(self.accuracy, feed_dict={self.names: self.x_train, self.genders: self.y_train})
                test_accuracy = sess.run(self.accuracy, feed_dict={self.names: self.x_val, self.genders: self.y_val})

                if self.print_mode:
                    print('Epoch {:2} | Training set accuracy = {:.4f}, Test set accuracy = {:.4f}'
                          .format(epoch, train_accuracy, test_accuracy))

                train_accuracies.append(train_accuracy)
                test_accuracies.append(test_accuracy)

                best_train_accuracy = max(train_accuracy, best_train_accuracy)
                if test_accuracy > best_test_accuracy:
                    worse_epoch_count = 0
                    best_test_accuracy = test_accuracy
                else:
                    worse_epoch_count += 1
                    if worse_epoch_count == patience:
                        if self.print_mode:
                            print('Early stopping at epoch {}'.format(epoch))
                        break

        return best_train_accuracy, train_accuracies, best_test_accuracy, test_accuracies, time.time() - training_start_time

    def _next_batch(self, x, y, batch_size, shuffle=False):
        """Yield consecutive ``(x, y)`` mini-batches covering the whole data once.

        Args:
            x, y: Equally long sequences or arrays of features and targets.
            batch_size: Maximum number of samples per batch; the last batch
                may be smaller.
            shuffle: When True, samples are visited in a fresh random order
                (drawn from ``np.random``); x and y stay aligned.

        Yields:
            ``(x_batch, y_batch)`` pairs. For empty data one empty batch is yielded.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError('batch_size must be positive, got {}'.format(batch_size))

        total = len(x)
        if shuffle:
            order = np.random.permutation(total)

            def reorder(data):
                # Fancy indexing for arrays, element-wise picking for plain sequences
                if isinstance(data, np.ndarray):
                    return data[order]
                return [data[i] for i in order]

            x, y = reorder(x), reorder(y)

        # max(total, 1) keeps yielding a single (empty) batch when there is no data
        for start in range(0, max(total, 1), batch_size):
            stop = start + batch_size
            yield x[start:stop], y[start:stop]

    def _create_graph_input(self, input_dim, vocab_dim, emb_dim):
        """Create the input placeholders and return the embedded names.

        Args:
            input_dim: Length of every (padded) ID sequence.
            vocab_dim: Number of trainable embedding rows (IDs ``1..vocab_dim``).
            emb_dim: Size of every embedding vector.

        Returns:
            The result of looking up ``self.names`` in the embedding table.
        """
        # Placeholders for input and targets
        self.names = tf.placeholder(tf.int32, shape=[None, input_dim], name='Names')
        self.genders = tf.placeholder(tf.float32, shape=[None, 1], name='Genders')

        # Embedding Matrix (0-pad is not a variable, remains 0): a constant zero
        # row is placed in front of the trainable matrix, so ID 0 maps to zeros
        padding_vector = tf.zeros(shape=(1, emb_dim), dtype=tf.float32, name='ZeroPadding')
        symbol_embedding = tf.get_variable('W', shape=(vocab_dim, emb_dim), dtype=tf.float32)
        symbol_embedding = tf.concat([padding_vector, symbol_embedding], axis=0)

        # Word embeddings
        return tf.nn.embedding_lookup(symbol_embedding, self.names)

    def _create_graph_output(self, last_layer, lr):
        """Attach the output unit, the loss, the optimiser and the accuracy.

        Args:
            last_layer: Tensor fed into the final single-unit dense layer.
            lr: Learning rate passed to the optimiser.
        """
        # Dense layer with binary output
        logits = tf.layers.dense(last_layer, 1)

        # Loss & Optimization
        logits_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.genders)
        self.loss = tf.reduce_mean(logits_loss)
        optimizer = tf.contrib.opt.LazyAdamOptimizer(lr)
        self.optimize = optimizer.minimize(self.loss)

        # Prediction & Accuracy
        self.predictions = tf.round(tf.sigmoid(logits))
        self.accuracy = tf.reduce_mean(tf.cast(tf.equal(self.predictions, self.genders), dtype=tf.float32))

### RNN

In [None]:
class RNN(NN):
    """Recurrent classifier: stacked LSTM/GRU cells over the embedded letters."""

    def create_graph(self, input_dim, vocab_dim, emb_dim=5, lr=0.001,
                     unit_layers=[5], activations=['relu'], cell_type=[('LSTM', False)]):
        """Build the recurrent graph.

        Args:
            input_dim: Length of every (padded) ID sequence.
            vocab_dim: Number of distinct letter IDs (excluding padding).
            emb_dim: Size of the letter embeddings.
            lr: Learning rate.
            unit_layers: Number of units of every recurrent layer.
            activations: Activation of every recurrent layer.
            cell_type: One ``(kind, use_peepholes)`` pair per layer; ``kind``
                ``'LSTM'`` builds an LSTM cell, anything else a GRU cell
                (``use_peepholes`` is only used for LSTM cells).

        Note: the list defaults are only read, never mutated.
        """
        assert len(unit_layers) > 0
        assert len(unit_layers) == len(activations)
        assert len(cell_type) >= len(unit_layers), 'cell_type needs one entry per layer'

        # Word embeddings
        embedded_names = self._create_graph_input(input_dim, vocab_dim, emb_dim)

        # Add Layers
        cell_layers = []
        for idx, hidden_dim in enumerate(unit_layers):
            kind, use_peepholes = cell_type[idx][0], cell_type[idx][1]
            if kind == 'LSTM':
                cell_layers.append(tf.nn.rnn_cell.LSTMCell(hidden_dim, activation=activations[idx],
                                                           use_peepholes=use_peepholes))
            else:
                cell_layers.append(tf.nn.rnn_cell.GRUCell(hidden_dim, activation=activations[idx]))

        # Multilayer cell
        cell = tf.contrib.rnn.MultiRNNCell(cell_layers, state_is_tuple=True)

        # True (unpadded) length of every name: 1-based position of its last
        # non-zero ID, since 0 is the padding ID.
        positions = tf.range(1, input_dim + 1, dtype=tf.int32)
        is_letter = tf.cast(tf.not_equal(self.names, 0), tf.int32)
        lengths = tf.reduce_max(is_letter * positions, axis=1)

        # Dynamic RNN (Dynamic graph with a loop) using defined cell. With
        # sequence_length the state stops being updated after the last real
        # letter, so the final state summarises the name and not the padding.
        _, states = tf.nn.dynamic_rnn(cell=cell, inputs=embedded_names,
                                      sequence_length=lengths, dtype=tf.float32)

        # Output of the top layer at the last real timestep: an LSTM keeps it
        # in the ``h`` part of its state tuple, a GRU state is the output itself
        top_state = states[-1]
        if isinstance(top_state, tf.nn.rnn_cell.LSTMStateTuple):
            last_layer = top_state.h
        else:
            last_layer = top_state
        self._create_graph_output(last_layer, lr)

In [None]:
# English names: RNN with two stacked peephole-LSTM layers (20 and 10 units)
model = RNN(x_train_ids, y_train_bin, x_test_ids, y_test_bin)
model.create_graph(
    input_dim=max_seq,
    vocab_dim=len(vocab),
    emb_dim=10,
    unit_layers=[20, 10],
    activations=['relu', 'relu'],
    cell_type=[('LSTM', True), ('LSTM', True)],
)
model.print_trainable_parameters()

# The returned training history is not used in this cell
_ = model.train(epochs=500)

### FNN

In [None]:
class FNN(NN):
    """Feed-forward classifier: dense layers over a reduced view of the embedded letters."""

    def create_graph(self, input_dim, vocab_dim, emb_dim=5, lr=0.001, unit_layers=[5], activations=['relu'], mode=('flatten', 1)):
        """Build the feed-forward graph.

        Args:
            input_dim: Length of every (padded) ID sequence.
            vocab_dim: Number of distinct letter IDs (excluding padding).
            emb_dim: Size of the letter embeddings.
            lr: Learning rate.
            unit_layers: Number of units of every hidden dense layer.
            activations: Activation of every hidden dense layer.
            mode: ``(technique, axis)`` pair describing how the embedded
                sequence is reduced before the dense layers, see ``_flatten_input``.

        Note: the list defaults are only read, never mutated.
        """
        assert len(activations) >= len(unit_layers), 'activations needs one entry per layer'

        # Word embeddings
        embedded_names = self._create_graph_input(input_dim, vocab_dim, emb_dim)
        flatten_names = self._flatten_input(embedded_names, input_dim, emb_dim, mode)

        # Add Dense Layers
        last_layer = flatten_names
        for idx, units in enumerate(unit_layers):
            last_layer = tf.layers.dense(last_layer, units, activation=activations[idx])

        self._create_graph_output(last_layer, lr)

    def _flatten_input(self, tensor, input_dim, emb_dim, mode):
        """Reduce the embedded sequence tensor to one vector per sample.

        Args:
            tensor: Embedded names as returned by ``_create_graph_input``.
            input_dim: Length of every (padded) ID sequence.
            emb_dim: Size of the letter embeddings.
            mode: ``(technique, axis)``. ``'flatten'`` reshapes to
                ``input_dim * emb_dim`` values and ignores ``axis``;
                ``'max_pool'`` and ``'average'`` reduce over ``axis``;
                ``'w_average'`` multiplies by a trainable filter along
                ``axis`` before averaging over it.
        """
        assert len(mode) == 2
        assert mode[0] in ['flatten', 'max_pool', 'average', 'w_average']

        technique, axis = mode[0], mode[1]

        if technique == 'max_pool':
            return tf.reduce_max(tensor, axis=axis)
        if technique == 'average':
            return tf.reduce_mean(tensor, axis=axis)
        if technique == 'w_average':
            # One trainable weight per position (axis 1) or per embedding component (otherwise)
            filt_shape = (1, input_dim, 1) if axis == 1 else (1, 1, emb_dim)
            filt = tf.get_variable('filter', shape=filt_shape)
            return tf.reduce_mean(tensor * filt, axis=axis)
        return tf.reshape(tensor, shape=(-1, input_dim * emb_dim))

In [None]:
# English names: feed-forward network with a single 102-unit sigmoid layer
model = FNN(x_train_ids, y_train_bin, x_test_ids, y_test_bin)
model.create_graph(
    input_dim=max_seq,
    vocab_dim=len(vocab),
    unit_layers=[102],
    activations=['sigmoid'],
)
model.print_trainable_parameters()

# The returned training history is not used in this cell
_ = model.train(epochs=500)

## Hyperparameter Search

In this homework, instead of the classic Grid Search for hyperparameter optimization, we will use Random Search. [Why?](https://analyticsindiamag.com/wp-content/uploads/2018/06/both.png) Because with only 60 iterations random search has a 95% probability of finding a combination of parameters within the best 5% of the search space ([much faster](https://www.cnblogs.com/yymn/p/4536740.html)). Also, compared to other methods, it does not get stuck in local optima. To understand it visually, take a look at the following example:

![Grid Search vs Random Search](https://i.stack.imgur.com/cIDuR.png)

So let's start by defining a function that generates random parameter configurations:

In [None]:
def generate_random_hyperparameters(rnn=True):
    '''Draw one random hyperparameter configuration.

    Args:
        rnn: True to use the RNN ranges (1-3 layers of 5-20 units), False to
            use the MLP ranges (1-4 layers of 50-200 units).

    Returns:
        Dict with the keys
        'LR' (learning rate), 'LN' (list with the units of every layer),
        'AC' (activation name of every layer), 'BS' (batch size),
        'CT' (list of (cell kind, use peepholes) pairs, one per layer) and
        'FM' ((flatten technique, axis) pair).
        'CT' and 'FM' are always drawn, whichever model they apply to.
    '''
    # Exponent drawn from a normal distribution, so the rate is centred around 0.001
    learning_rate = 10 ** -np.random.normal(3, 0.5)

    # From 1 to 3-4 layers
    max_lay = 3 if rnn else 4
    layers = np.random.randint(1, max_lay + 1)

    # 5..20 units/layer in RNN, 50..200 in MLP (randint excludes its upper bound, hence + 1)
    low, high = (5, 20) if rnn else (50, 200)
    layers_neurons = np.random.randint(low, high + 1, layers).tolist()

    # Batch size of 128, 256, 512 or 1024
    batch_size = np.random.choice([2 ** p for p in range(7, 11)])

    # Activation Tanh, Sigmoid or ReLU
    activation = np.random.choice(['tanh', 'sigmoid', 'relu'], layers)

    # RNN ONLY -> LSTM or GRU
    cell_types = np.random.choice(['LSTM', 'GRU'], layers)
    cell_peepholes = np.random.choice([True, False], layers)

    # MLP ONLY -> Flatten technique
    flatten_mode = np.random.choice(['flatten', 'max_pool', 'average', 'w_average'])
    flatten_axis = np.random.randint(1, 3)

    return {
        'LR': learning_rate,
        'LN': layers_neurons,
        'AC': activation,
        'BS': batch_size,
        'CT': list(zip(cell_types, cell_peepholes)),
        'FM': (flatten_mode, flatten_axis)
    }

def configuration_to_label(c, rnn=True):
    """Render a hyperparameter configuration as a short, human-readable label.

    Args:
        c: Configuration dict with the keys 'LR', 'BS', 'LN', 'AC' and,
            depending on ``rnn``, 'CT' (RNN) or 'FM' (MLP).
        rnn: True to format the configuration as an RNN, False as an MLP.

    Returns:
        The label string, prefixed with the model family.
    """
    def activation_tag(name):
        # T = tanh, S = sigmoid, R = anything else
        if name == 'tanh':
            return 'T'
        if name == 'sigmoid':
            return 'S'
        return 'R'

    def cell_tag(cell):
        # G = GRU, LP = LSTM with peepholes, L = LSTM without
        if cell[0] == 'GRU':
            return 'G'
        return 'LP' if cell[1] else 'L'

    tags = [activation_tag(name) for name in c['AC']]

    if rnn:
        parts = ['{}{}{}'.format(cell_tag(cell), units, tag)
                 for units, tag, cell in zip(c['LN'], tags, c['CT'])]
    else:
        parts = ['{}{}'.format(units, tag) for units, tag in zip(c['LN'], tags)]

    label = ' LR={:.5f}; BatchSize={}; Layers={}'.format(c['LR'], c['BS'], ', '.join(parts))

    if rnn:
        return '(RNN) {}'.format(label)
    if c['FM'][0] == 'flatten':
        return '(MLP - {}) {}'.format(c['FM'][0], label)
    return '(MLP - {}({})) {}'.format(c['FM'][0], c['FM'][1], label)

In [None]:
def plot_hyperparam_tunning(accuracies):
    """Plot the accuracy curves of the (up to) five best configurations.

    Configurations are ranked by their best test accuracy. Two stacked charts
    are drawn (train and test accuracy per epoch) and a summary table is built.

    Args:
        accuracies: List of result dicts; each one needs 'accuracies' and
            'best_accuracy' (both with 'train' and 'test' entries) and 'label'.
            The per-epoch curves may be masked arrays; masked entries count as
            epochs that were never run.

    Returns:
        DataFrame with one row per plotted configuration and the columns
        'Train data', 'Test data' and 'Epochs'.

    Raises:
        ValueError: If ``accuracies`` is empty.
    """
    if not accuracies:
        raise ValueError('No configurations to plot')

    ranked = sorted(accuracies, key=lambda m: m['best_accuracy']['test'], reverse=True)
    top = ranked[:5]

    colors = ['#198181', '#F70022', '#F7904C', '#00ADC9', '#6E50C8']

    # Number of epochs with recorded (non-masked) values for every configuration
    epochs_run = [int(np.ma.count(m['accuracies']['train'])) for m in top]
    max_epochs = max(epochs_run)

    max_train_accuracy = max(m['best_accuracy']['train'] for m in top)
    max_test_accuracy = max(m['best_accuracy']['test'] for m in top)
    # Lower axis limits come from the lowest point of ALL plotted curves so none is clipped
    min_train_accuracy = min(np.ma.min(m['accuracies']['train']) for m in top)
    min_test_accuracy = min(np.ma.min(m['accuracies']['test']) for m in top)

    _, (ax_train, ax_test) = plt.subplots(2, 1, figsize=(12, 10))
    plt.subplots_adjust(hspace=0.25)

    for m, color, last_epoch in zip(top, colors, epochs_run):
        x = list(range(1, len(m['accuracies']['train']) + 1))

        ax_train.plot(x, m['accuracies']['train'], label=m['label'], color=color)
        ax_test.plot(x, m['accuracies']['test'], label=m['label'], color=color)

        # Dashed vertical line at the last epoch that was actually run
        for axe in (ax_train, ax_test):
            axe.axvline(last_epoch, color=color, linestyle='--', linewidth=0.5)

    ax_train.set_title('Train data accuracy')
    ax_train.set_ylim(min_train_accuracy - 0.01, max_train_accuracy + 0.01)

    ax_test.set_title('Test data accuracy')
    ax_test.set_ylim(min_test_accuracy - 0.01, max_test_accuracy + 0.01)

    for axe in (ax_train, ax_test):
        axe.set_xlabel('Epoch')
        axe.set_ylabel('Accuracy')
        axe.set_xlim(0, max_epochs + 2)
        axe.legend(loc=4, frameon=True, shadow=True, edgecolor='black')

    plt.show()

    accuracies_df = pd.DataFrame(
        {
            'Train data': [m['best_accuracy']['train'] for m in top],
            'Test data': [m['best_accuracy']['test'] for m in top],
            'Epochs': epochs_run,
        },
        index=range(1, len(top) + 1),
        columns=['Train data', 'Test data', 'Epochs'],
    )
    accuracies_df.Epochs = accuracies_df.Epochs.astype('int32')
    return accuracies_df

In [None]:
def random_search(n=60, k=5, rnn=True, epochs=100):
    """Evaluate ``n`` random hyperparameter configurations.

    Every configuration is trained ``k`` times, each time on a fresh random
    80/20 split of the full English dataset, and the results are averaged.

    Args:
        n: Number of random configurations to try.
        k: Number of random train/test splits per configuration.
        rnn: True to search RNN configurations, False for MLP ones.
        epochs: Maximum number of training epochs per run; also the length of
            the stored accuracy curves.

    Returns:
        List with one dict per configuration holding the mean per-epoch
        'accuracies', the mean 'best_accuracy' (both with 'train' and 'test'
        entries), the mean training 'time' and the configuration 'label'.
    """
    best_accuracies_configs = []

    xk_ids, yk_bin, max_seq, vocab = preprocess_data(all_df.values, train=True)

    # Random iterations
    for idx in range(n):
        c = generate_random_hyperparameters(rnn=rnn)
        c_label = configuration_to_label(c, rnn=rnn)
        print('{}.- C = {}'.format(idx + 1, c_label))

        # Per-epoch accuracies of every run; entries stay masked for epochs
        # that a run never reached because of early stopping
        k_train_accs = np.ma.empty((epochs, k))
        k_train_accs.mask = True

        k_test_accs = np.ma.empty((epochs, k))
        k_test_accs.mask = True

        k_train_acc, k_test_acc, k_train_time = [], [], []

        # Repeated random hold-out validation (k independent 80/20 splits)
        for k_idx in range(k):
            x_train, x_test, y_train, y_test = train_test_split(xk_ids, yk_bin, test_size=0.2)

            if rnn:
                model = RNN(x_train, y_train, x_test, y_test, print_mode=False)
                model.create_graph(input_dim=max_seq, vocab_dim=len(vocab), lr=c['LR'], unit_layers=c['LN'], activations=c['AC'], cell_type=c['CT'])
            else:
                model = FNN(x_train, y_train, x_test, y_test, print_mode=False)
                model.create_graph(input_dim=max_seq, vocab_dim=len(vocab), lr=c['LR'], unit_layers=c['LN'], activations=c['AC'], mode=c['FM'])

            if k_idx == 0:
                model.print_trainable_parameters()

            best_train_acc, train_accs, best_test_acc, test_accs, training_time = model.train(epochs=epochs, batch_size=c['BS'])

            k_train_accs[:len(train_accs), k_idx] = train_accs
            k_train_acc.append(best_train_acc)

            k_test_accs[:len(test_accs), k_idx] = test_accs
            k_test_acc.append(best_test_acc)

            k_train_time.append(training_time)

            msg = '\t{}) Training time: {:.4f}, Train accuracy: {:.6f}, Test accuracy: {:.6f}'
            print(msg.format(k_idx + 1, training_time, best_train_acc, best_test_acc))

        # Averages over the k runs of this configuration
        mean_training_time = np.mean(k_train_time)
        mean_train_acc = np.mean(k_train_acc)
        mean_test_acc = np.mean(k_test_acc)

        best_accuracies_configs.append({
            'accuracies': {'train': k_train_accs.mean(axis=1), 'test': k_test_accs.mean(axis=1)},
            'best_accuracy': {'train': mean_train_acc, 'test': mean_test_acc},
            'time': mean_training_time,
            'label': c_label
        })

        msg = 'Training time: {:.4f}, Train accuracy: {:.6f}, Test accuracy: {:.6f}'
        print(msg.format(mean_training_time, mean_train_acc, mean_test_acc))

        print(50*'-')

    return best_accuracies_configs

### RNN - Random Search

In [None]:
best_accuracies_rnn_configs = random_search(rnn=True)

# Persist the results of the expensive search first, so that an error while
# plotting cannot lose them
with open('hyper_rnn.pkl', mode='wb') as f:
    pickle.dump(best_accuracies_rnn_configs, f)

# Last expression of the cell: anything the plotting helper returns is displayed
plot_hyperparam_tunning(best_accuracies_rnn_configs)

### MLP - Random Search

In [None]:
best_accuracies_mlp_configs = random_search(rnn=False)

# Persist the results of the expensive search first, so that an error while
# plotting cannot lose them
with open('hyper_mlp.pkl', mode='wb') as f:
    pickle.dump(best_accuracies_mlp_configs, f)

# Last expression of the cell: anything the plotting helper returns is displayed
plot_hyperparam_tunning(best_accuracies_mlp_configs)

# Russian names

In [None]:
# Russian names: RNN with a GRU layer (20 units) followed by a peephole LSTM layer (10 units)
model = RNN(x_train_rus_ids, y_train_rus_bin, x_test_rus_ids, y_test_rus_bin)
model.create_graph(
    input_dim=max_seq_rus,
    vocab_dim=len(vocab_rus),
    emb_dim=10,
    unit_layers=[20, 10],
    activations=['elu', 'elu'],
    cell_type=[('GRU', None), ('LSTM', True)],
)
model.print_trainable_parameters()

# The returned training history is not used in this cell
_ = model.train(epochs=500)

In [None]:
# Russian names: feed-forward network with a single 102-unit sigmoid layer
model = FNN(x_train_rus_ids, y_train_rus_bin, x_test_rus_ids, y_test_rus_bin)
model.create_graph(
    input_dim=max_seq_rus,
    vocab_dim=len(vocab_rus),
    unit_layers=[102],
    activations=['sigmoid'],
)
model.print_trainable_parameters()

# The returned training history is not used in this cell
_ = model.train(epochs=500)