# Autoregressive models

Today we will implement two very simple autoregressive models, namely Elman and GRUs. We will apply them to the surname dataset for both classification of the nationality. Tomorrow, this same code will be used for geenrating new surnames.

## Dataset

We start by exploring the dataset.

In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch

In [2]:
class SurnameDataset(Dataset):
    def __init__(self, sunrmane_df, vectorizer):
        self.surname_df = sunrmane_df
        self.vectorizer = vectorizer
        
        # I *really* liked this syntax solution of the original implementation
        self._max_sequence_len = max(map(len, self.surname_df)) + 2

        self.train_df = self.surname_df[self.surname_df['split'] == 'train']
        self.len_train = len(self.train_df)

        self.val_df = self.surname_df[self.surname_df['split'] == 'val']
        self.len_val = len(self.val_df)

        self.test_df = self.surname_df[self.surname_df['split'] == 'test']
        self.len_test = len(self.test_df)

        self._lookup_table = {'train': (self.train_df, self.len_train),
                              'val': (self.val_df, self.len_val),
                              'test': (self.test_df, self.len_test)}

        self.set_split('train')

        # Get class weights, we sort them by the index and only get the values
        value_counts = self.train_df['nationality_index'].value_counts().sort_index().values
        self.class_weights = 1.0 / torch.tensor(value_counts, dtype = torch.float32)

    def set_split(self, split = 'train'):
        self._target_split, self._len_split = self._lookup_table[split]
    
    @classmethod
    def load_dataset(cls, path):
        surnames = pd.read_csv(path)
        return cls(surnames, SunermaneVectorizer.from_dataframe(surnames))
    
    def __len__(self):
        return self._len_split

    def __getitem__(self, row_index):
        row = self._target_split.iloc[row_index]
        surname_vector, vec_len = self.vectorizer.vectorize(row['surname'], self._max_sequence_len)
        return {
            'x_data': surname_vector,
            'y_data': row['nationality_index'],
            'x_len': vec_len}

    def get_num_batches(self, batch_size):
        return len(self)//batch_size

Because we are working at the character level, a TF-IDF approach chould make sense for the classication task, but, for the generation task, it would be unecessarily complicated. Therefore, we shall use the same one-hot encoding for both tasks. This time, we also want to use a sequence vocabulary class in which we will add the special tokens (UNK, MASK, BEGIN, END) to the tokenization.

In [3]:
class Vocabulary:
    # The class is very similar to the vocabulary class of Day 2,
    # except that we don't have an unknown option, which will be 
    # added to its subclass only
    def __init__(self, token_to_idx = None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx =  token_to_idx
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}
    
    def to_dict(self):
        return {'token_to_idx': self._token_to_idx, 
                'idx_to_token': self._idx_to_token}
    
    def add_token(self, token):
        if token in self._token_to_idx:
            # If already in the dictionary, do nothing
            return self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
            return index
    
    def add_many(self, tokens):
        return list(map(self.add_token, tokens))
    
    def lookup_token(self, token):
        return self._token_to_idx[token]
    
    def lookup_index(self, index):
        return self._idx_to_token[index]
    
    def __len__(self):
        return len(self._token_to_idx)
    
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

In [4]:
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx = None, unk_token = '<UNK>',
                 maks_token = '<MASK>', begin_seq_token = '<BEGIN>', end_seq_token = '<END>'):
        super().__init__(token_to_idx)

        self._unk_token = unk_token
        self._mask_token = maks_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        # The inverse of ordering below is on purpose, we want <MASK> to be the first id
        # so that we can apply the embedding layer without calling an optional argument.
        # It is possible not to invert this order, however.
        self.mask_idx = self.add_token(self._mask_token)
        self.unk_idx = self.add_token(self._unk_token)
        self.begin_seq_idx = self.add_token(self._begin_seq_token)
        self.end_seq_idx = self.add_token(self._end_seq_token)

    def to_dict(self):
        contents = super().to_dict()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents
    
    def lookup_token(self, token):
        if self.unk_idx >= 0:
            return self._token_to_idx.get(token, self.unk_idx)
        else:
            return self._token_to_idx[token]

Now we define the vectorizer. It is basically the same as last time, but only now working at the character and not at the word level.

In [5]:
import numpy as np
class SunermaneVectorizer:
    def __init__(self, vocab):
        self.vocab = vocab
    
    def vectorize(self, surname, vector_len = -1):
        indices = [self.vocab.begin_seq_idx]
        for token in surname:
            indices.append(self.vocab.lookup_token(token))
        indices.append(self.vocab.end_seq_idx)

        # We do not truncate the indices if this is -1
        if vector_len < 0:
            vector_len = len(indices)
        
        # Now we do the one-hot encoding
        out_vector = np.zeros(vector_len, dtype = np.int64)
        # We assign the index to the tokens that do show up in the surname and
        # mask the part of the vector longer than the input sequence
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.vocab.mask_idx

        return out_vector, len(indices)

    @classmethod
    def from_dataframe(cls, data):
        voc = SequenceVocabulary()

        for _, row in data.iterrows():
            for char in row['surname']:
                voc.add_token(char)
        
        return cls(voc)


In [8]:
surnames =  SurnameDataset.load_dataset('surnames_with_splits.csv')
vectorizer = surnames.vectorizer
# Let us check the first surname
"".join(list(map(surnames.vectorizer.vocab.lookup_index,surnames[0]['x_data']))[:surnames[0]['x_len']])

'<BEGIN>Totah<END>'

Finally, we implement the batch generator function.

In [9]:
def generate_batches(dataset, batch_size, shuffle = True,
                     drop_last = True, device = 'cuda'): 
    # drop_last decides whether to drop the last batch if len(daset) % batchsize != 0
    dataloader = DataLoader(dataset = dataset, batch_size = batch_size,
                            shuffle = shuffle, drop_last = drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, _ in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

## Elman

We will first implement the Elman RNN model.

In [10]:
import torch.nn as nn
import torch.nn.functional as F

class ElmanRNN(nn.Module):
    '''
    This will correspond to only a layer of our architecture.
    '''
    def __init__(self, input_size, hidden_size, batch_first = False):
        # If batch_first is True, the first dimension is the batch
        super().__init__()

        self.rnn_cell =  nn.RNNCell(input_size, hidden_size)

        self.batch_first = batch_first
        self.hidden_size = hidden_size
    
    def _initial_hidden(self, batch_size):
        # The initial hidden state is only the zero tensor
        # Notice that this is the hidden state of only *one* token
        return torch.zeros((batch_size, self.hidden_size))
    
    def forward(self, x_in, initial_hidden = None):
        '''
        The otuput will be of rank (batch, seq_size, hidden_state) if
        batch_first, otherwise it will be (seq_size, batch, hidden_state).
        '''
        if self.batch_first:
            batch_size, seq_size, feat_size = x_in.size()
            # In RNNs, we assume that the batch is the second dimension
            x_in =  x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = x_in.size()

        hiddens = []

        if initial_hidden is None:
            initial_hidden = self._initial_hidden(batch_size)
            # Makes the same device as the x_in
            initial_hidden = initial_hidden.to(x_in.device)

        # Hidden at time -1, this is not added to the sequence
        hidden_t = initial_hidden
                    
        for t in range(seq_size):
            # Notice that here we unroll the RNN cell
            hidden_t = self.rnn_cell(x_in[t], hidden_t)
            hiddens.append(hidden_t)

        # Stack the whole list of tensors along dimension 0, this case,
        # sequence dimension (recall we permuted if batch first)
        hiddens = torch.stack(hiddens)

        if self.batch_first:
            # Repermute to have batch first
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens


One of the nice things about this example is that the RNN part is only but a single cell, which can be thought of as a layer. We can add it to a more complicated architecture. The first step in thgat direction is to make a function that gives us the last hidden state of a sequence.

In [11]:

def column_gather(y_out, x_lengths):
    '''
    This function is for getting the last hidden state of each input in a 
    batch.

    Args:
        y_out: (batch, sequence, feature)
        x_lengths: (batch,)

    Returns:
        (batch, feature)
    '''
    # detatch is used to have the tensor not part of the computational graph,
    # so no gradients are take. Same as no_grad, but instead of holding
    # within an environment, holds for the full tensor.
    # The -1 is because, as usual, we start counting from 0
    x_lengths = x_lengths.long().detach().cpu().numpy() - 1

    out = []
    for batch_index, column_index in enumerate(x_lengths):
        out.append(y_out[batch_index, column_index])

    return torch.stack(out)

Now we build the whole model.

In [12]:

class SurnameClassifier(nn.Module):
    def __init__(self, embediing_size, num_embeddings, num_classes,
                 rnn_hidden_size, batch_first = True, padding_idx = 0):
        '''
        num_embeddings: Number of characters to embedding (i.e., vocab size).
        padding_idx: This index will be ignored by the later layers of the model
        as it only indicates padding of the vectors. In our case, we chose the
        padding token, <MASK>, to have index 0
        '''
        super().__init__()
        self.emb = nn.Embedding(num_embeddings = num_embeddings, 
                                embedding_dim = embediing_size,
                                padding_idx = padding_idx)
        self.rnn = ElmanRNN(input_size = embediing_size,
                             hidden_size = rnn_hidden_size,
                             batch_first = batch_first)
        self.fc1 = nn.Linear(in_features = rnn_hidden_size,
                         out_features = rnn_hidden_size)
        self.fc2 = nn.Linear(in_features = rnn_hidden_size,
                          out_features = num_classes)
        
    def forward(self, x_in, x_lens =  None, apply_softmax = False):
        '''
        x_in: should be of shape (batch, input_dim)
        x_lens: the length of each sequence of the batch, used to 
            find the last hidden state
        Returns:
            tensor of shape (batch, output_dim) 
        '''
        x_embedded = self.emb(x_in)
        y_out = self.rnn(x_embedded)

        if x_lens is not None:
            y_out = column_gather(y_out, x_lens)
        else:
            # If no lens are proviced, we do not truncate
            # the sequence, we take for last hidden state
            # the actual last component of the vector
            y_out = y_out[:, -1, :]

        # We use a 2-layer MLP for classification
        y_out = F.relu(self.fc1(F.dropout(y_out, 0.5)))
        y_out = self.fc2(F.dropout(y_out, 0.5))

        # Again, apply softmax only at eval, so to avoid numeric error
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)

        return y_out

## Training

Now we do training. We start implementing early stopping.

In [13]:
class EarlyStopping:
    def __init__(self, patience = 5, min_delta = 0, save = None):
        self.patience = patience
        self.min_delta = min_delta
        self.save = save
        self.best_loss = np.inf
        self.patience_counter = 0
        self.flag = False
    
    def __call__(self, val_loss, model = None):
        # If the validation loss improved, we basically 
        # update the new best lost and save the model
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.patience_counter = 0
        
            if (self.save) and (model is not None):
                torch.save(model.state_dict(), self.save)
        
        else:
            self.patience_counter += 1
            if self.patience_counter > self.patience:
                self.flag = True

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) 

The training loop will change very little.

In [15]:
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from sklearn.metrics import accuracy_score, f1_score
# Allows for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lr = 1e-3
hidden_dimension = 5
n_epochs = 50
loss_func = nn.CrossEntropyLoss(surnames.class_weights)
loss_func.to(device)
batch_size = 64

embediing_size = 100
num_embeddings = len(vectorizer.vocab)
num_classes = surnames.class_weights.size(0)
rnn_hidden_size = 64

classifier = SurnameClassifier(embediing_size, num_embeddings, num_classes,
                 rnn_hidden_size, batch_first = True, padding_idx = 0)
classifier.to(device)
optimizer = optim.Adam(classifier.parameters(), lr = lr)
# Multiplies the learning rate by `gamma` every `step_size` epoch
scheduler = StepLR(optimizer, step_size = 10, gamma = 0.1)

early_stopping = EarlyStopping(min_delta = 0, save = 'best_rnn_model.pt')

for epoch in range(n_epochs): 
    surnames.set_split('train')
    batch_generator = generate_batches(surnames, batch_size = batch_size, device = device)
    running_loss = 0.
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # 1. Zero the gradient
        classifier.zero_grad()

        # 2. Prediction
        y_pred = classifier(x_in = batch_dict['x_data'],
                            x_lens = batch_dict['x_len'])

        # 3. Compute loss
        loss = loss_func(y_pred, batch_dict['y_data'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)


        # 4. Backpropagate
        loss.backward()
        
        # 5. Optimize
        optimizer.step()

    # Evaluation part, we don't want paramereres to change
    classifier.eval()
    surnames.set_split('val')

    batch_generator = generate_batches(surnames, batch_size = batch_size, device = device)
    
    val_loss = 0.
    val_acc = 0.
    classifier.eval()

    

    for batch_index, batch_dict in enumerate(batch_generator):
        # compute the output
        y_pred = classifier(x_in = batch_dict['x_data'], 
                            x_lens = batch_dict['x_len'])

        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['y_data'])
        val_loss += (loss.item() - val_loss) / (batch_index + 1)

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict['y_data'])
        val_acc += (acc_t - val_acc) / (batch_index + 1)

    # Updates early stop
    early_stopping(val_loss, classifier)
    if early_stopping.flag:
        print('Early stopped at epoch:', epoch)
        break

    # Updates scheduler
    scheduler.step()

    print('Epoch: ', epoch)
    print('Training loss', running_loss)
    print('Validation loss', val_loss)
    print('Validation accuracy: ', val_acc)

Epoch:  0
Training loss 2.847928847869237
Validation loss 2.7432575607299805
Validation accuracy:  0.185625
Epoch:  1
Training loss 2.581378722190856
Validation loss 2.547506198883057
Validation accuracy:  0.25375000000000003
Epoch:  2
Training loss 2.3645131756861995
Validation loss 2.2878582715988163
Validation accuracy:  0.3056249999999999
Epoch:  3
Training loss 2.1555502235889445
Validation loss 2.1567963743209844
Validation accuracy:  0.32499999999999996
Epoch:  4
Training loss 2.050681791702905
Validation loss 2.100511260032654
Validation accuracy:  0.34
Epoch:  5
Training loss 1.9421749045451484
Validation loss 2.0514539575576785
Validation accuracy:  0.37312499999999993
Epoch:  6
Training loss 1.8799122194449105
Validation loss 2.0695172739028926
Validation accuracy:  0.38062499999999994
Epoch:  7
Training loss 1.811729171872139
Validation loss 2.0161529874801634
Validation accuracy:  0.36687500000000006
Epoch:  8
Training loss 1.7731204847494761
Validation loss 1.948538117408

Let us check the test accuracy.

In [20]:
classifier.load_state_dict(torch.load("best_rnn_model.pt"))
classifier.eval()

# Evaluation part, we don't want paramereres to change
classifier.eval()
surnames.set_split('test')

batch_generator = generate_batches(surnames, batch_size = batch_size, device = device)

val_loss = 0.
val_acc = 0.

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x_in = batch_dict['x_data'], 
                        x_lens = batch_dict['x_len'])

    # step 3. compute the loss
    loss = loss_func(y_pred, batch_dict['y_data'])
    val_loss += (loss.item() - val_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_data'])
    val_acc += (acc_t - val_acc) / (batch_index + 1)

print('Validation loss', val_loss)
print('Validation accuracy: ', val_acc)

Validation loss 1.8549499082565306
Validation accuracy:  0.38875


Not too bad; the book got a test accuracy of 0.41, so we are not really far off.

## GRU
We can easily substitute the elam RNN in our model for a GRU cell.

In [21]:
class SurnameClassifierGRU(nn.Module):
    def __init__(self, embediing_size, num_embeddings, num_classes,
                 rnn_hidden_size, batch_first = True, padding_idx = 0):
        '''
        num_embeddings: Number of characters to embedding (i.e., vocab size).
        padding_idx: This index will be ignored by the later layers of the model
        as it only indicates padding of the vectors. In our case, we chose the
        padding token, <MASK>, to have index 0
        '''
        super().__init__()
        self.emb = nn.Embedding(num_embeddings = num_embeddings, 
                                embedding_dim = embediing_size,
                                padding_idx = padding_idx)
        # Only thing we changed
        self.rnn = nn.GRU(input_size = embediing_size,
                             hidden_size = rnn_hidden_size,
                             batch_first = batch_first)
        self.fc1 = nn.Linear(in_features = rnn_hidden_size,
                         out_features = rnn_hidden_size)
        self.fc2 = nn.Linear(in_features = rnn_hidden_size,
                          out_features = num_classes)
        
    def forward(self, x_in, x_lens =  None, apply_softmax = False):
        '''
        x_in: should be of shape (batch, input_dim)
        x_lens: the length of each sequence of the batch, used to 
            find the last hidden state
        Returns:
            tensor of shape (batch, output_dim) 
        '''
        x_embedded = self.emb(x_in)
        # GRU returns two tensors instead of just one
        # In fact, the second is exactly the last hidden state
        # and we could use it directly, but that would mean changing the code
        y_out, _ = self.rnn(x_embedded)

        if x_lens is not None:
            y_out = column_gather(y_out, x_lens)
        else:
            # If no lens are proviced, we do not truncate
            # the sequence, we take for last hidden state
            # the actual last component of the vector
            y_out = y_out[:, -1, :]

        # We use a 2-layer MLP for classification
        y_out = F.relu(self.fc1(F.dropout(y_out, 0.5)))
        y_out = self.fc2(F.dropout(y_out, 0.5))

        # Again, apply softmax only at eval, so to avoid numeric error
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)

        return y_out

In [22]:
lr = 1e-3
hidden_dimension = 5
n_epochs = 50
loss_func = nn.CrossEntropyLoss(surnames.class_weights)
loss_func.to(device)
batch_size = 64
# Allows for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embediing_size = 100
num_embeddings = len(vectorizer.vocab)
num_classes = surnames.class_weights.size(0)
rnn_hidden_size = 64

classifier = SurnameClassifierGRU(embediing_size, num_embeddings, num_classes,
                 rnn_hidden_size, batch_first = True, padding_idx = 0)
classifier.to(device)
optimizer = optim.Adam(classifier.parameters(), lr = lr)
# Multiplies the learning rate by `gamma` every `step_size` epoch
scheduler = StepLR(optimizer, step_size = 10, gamma = 0.1)

early_stopping = EarlyStopping(min_delta = 0, save = 'best_rnn_model.pt')

for epoch in range(n_epochs): 
    surnames.set_split('train')
    batch_generator = generate_batches(surnames, batch_size = batch_size, device = device)
    running_loss = 0.
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # 1. Zero the gradient
        classifier.zero_grad()

        # 2. Prediction
        y_pred = classifier(x_in = batch_dict['x_data'],
                            x_lens = batch_dict['x_len'])

        # 3. Compute loss
        loss = loss_func(y_pred, batch_dict['y_data'])
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)


        # 4. Backpropagate
        loss.backward()
        
        # 5. Optimize
        optimizer.step()

    # Evaluation part, we don't want paramereres to change
    classifier.eval()
    surnames.set_split('val')

    batch_generator = generate_batches(surnames, batch_size = batch_size, device = device)
    
    val_loss = 0.
    val_acc = 0.
    classifier.eval()

    

    for batch_index, batch_dict in enumerate(batch_generator):
        # compute the output
        y_pred = classifier(x_in = batch_dict['x_data'], 
                            x_lens = batch_dict['x_len'])

        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['y_data'])
        val_loss += (loss.item() - val_loss) / (batch_index + 1)

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict['y_data'])
        val_acc += (acc_t - val_acc) / (batch_index + 1)

    # Updates early stop
    early_stopping(val_loss, classifier)
    if early_stopping.flag:
        print('Early stopped at epoch:', epoch)
        break

    # Updates scheduler
    scheduler.step()

    print('Epoch: ', epoch)
    print('Training loss', running_loss)
    print('Validation loss', val_loss)
    print('Validation accuracy: ', val_acc)

Epoch:  0
Training loss 2.8308652798334757
Validation loss 2.6919045448303223
Validation accuracy:  0.238125
Epoch:  1
Training loss 2.497430376211802
Validation loss 2.377619743347168
Validation accuracy:  0.315625
Epoch:  2
Training loss 2.223309781153996
Validation loss 2.150444746017456
Validation accuracy:  0.343125
Epoch:  3
Training loss 2.013066142797469
Validation loss 2.049569473266602
Validation accuracy:  0.374375
Epoch:  4
Training loss 1.920120473702748
Validation loss 1.9476311683654783
Validation accuracy:  0.38
Epoch:  5
Training loss 1.824985920389493
Validation loss 2.0319922304153444
Validation accuracy:  0.376875
Epoch:  6
Training loss 1.7292488873004912
Validation loss 1.9460203790664674
Validation accuracy:  0.395
Epoch:  7
Training loss 1.6501203606526067
Validation loss 1.8549621295928955
Validation accuracy:  0.4025
Epoch:  8
Training loss 1.5949995815753941
Validation loss 1.764444160461426
Validation accuracy:  0.40249999999999997
Epoch:  9
Training loss 1.

In [23]:
classifier.load_state_dict(torch.load("best_rnn_model.pt"))
classifier.eval()

# Evaluation part, we don't want paramereres to change
classifier.eval()
surnames.set_split('test')

batch_generator = generate_batches(surnames, batch_size = batch_size, device = device)

val_loss = 0.
val_acc = 0.

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x_in = batch_dict['x_data'], 
                        x_lens = batch_dict['x_len'])

    # step 3. compute the loss
    loss = loss_func(y_pred, batch_dict['y_data'])
    val_loss += (loss.item() - val_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_data'])
    val_acc += (acc_t - val_acc) / (batch_index + 1)

print('Validation loss', val_loss)
print('Validation accuracy: ', val_acc)

Validation loss 1.7643994569778443
Validation accuracy:  0.42000000000000004


So GRU does improve the model!