# Generation with autoregessive models

We will again use a GRU model on the `surname` dataste, but this time, the task will be generating new surnames (conditioned on the nationalities). 

## Dataset

In [7]:
import pandas as pd

The dataset loading classes will be basically the same as yesterday's, except that now, when vectorizing, we will be encoding both input and output. For vocabulary, we will basically copy the classes from yesterday.

In [8]:
class Vocabulary:
    # The class is very similar to the vocabulary class of Day 2,
    # except that we don't have an unknown option, which will be 
    # added to its subclass only
    def __init__(self, token_to_idx = None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx =  token_to_idx
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}
    
    def to_dict(self):
        return {'token_to_idx': self._token_to_idx, 
                'idx_to_token': self._idx_to_token}
    
    def add_token(self, token):
        if token in self._token_to_idx:
            # If already in the dictionary, do nothing
            return self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
            return index
    
    def add_many(self, tokens):
        return list(map(self.add_token, tokens))
    
    def lookup_token(self, token):
        return self._token_to_idx[token]
    
    def lookup_index(self, index):
        return self._idx_to_token[index]
    
    def __len__(self):
        return len(self._token_to_idx)
    
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx = None, unk_token = '<UNK>',
                 maks_token = '<MASK>', begin_seq_token = '<BEGIN>', end_seq_token = '<END>'):
        super().__init__(token_to_idx)

        self._unk_token = unk_token
        self._mask_token = maks_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        # The inverse of ordering below is on purpose, we want <MASK> to be the first id
        # so that we can apply the embedding layer without calling an optional argument.
        # It is possible not to invert this order, however.
        self.mask_idx = self.add_token(self._mask_token)
        self.unk_idx = self.add_token(self._unk_token)
        self.begin_seq_idx = self.add_token(self._begin_seq_token)
        self.end_seq_idx = self.add_token(self._end_seq_token)

    def to_dict(self):
        contents = super().to_dict()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents
    
    def lookup_token(self, token):
        if self.unk_idx >= 0:
            return self._token_to_idx.get(token, self.unk_idx)
        else:
            return self._token_to_idx[token]

Now, let us give a look at the vectorizer. The main idea here is that we will create two vectors representing the sequence, from_vector and to_vector. The difference is that, at time t, from_vector[t] is lagging one token with respect to to_vector[t]. In particular, from_vector starts at the sequence 0th input and ends at input -2, whereas to_vector starts at 1st and ends at -1.

In [9]:
import numpy as np
class SunermaneVectorizer:
    def __init__(self, vocab):
        self.vocab = vocab
    
    def vectorize(self, surname, vector_len = -1):
        indices = [self.vocab.begin_seq_idx]
        for token in surname:
            indices.append(self.vocab.lookup_token(token))
        indices.append(self.vocab.end_seq_idx)

        if vector_len < 0:
            vector_len = len(indices)
        
        # Input vector
        from_vector = np.zeros(vector_len, dtype = np.int64)
        from_indices = indices[:-1]
        from_vector[:len(from_indices)] = from_indices
        from_vector[len(from_indices):] = self.vocab.mask_idx

        # Output vector
        to_vector = np.zeros(vector_len, dtype = np.int64)
        to_indices = indices[1:]
        to_vector[:len(to_indices)] = to_indices
        to_vector[len(to_indices):] = self.vocab.mask_idx

        return from_vector, to_vector
    
    @classmethod
    def from_dataframe(cls, data):
        voc = SequenceVocabulary()

        for _, row in data.iterrows():
            for char in row['surname']:
                voc.add_token(char)
        
        return cls(voc)

Finally, we change the SurnameDataset to allow the correct outputs, that is, intead of classes, vectors. Only the getitem method will therefore change significantly.

In [10]:
from torch.utils.data import Dataset, DataLoader

class SurnameDataset(Dataset):
    def __init__(self, sunrmane_df, vectorizer):
        self.surname_df = sunrmane_df
        self.vectorizer = vectorizer
        
        self._max_sequence_len = max(map(len, self.surname_df)) + 2

        self.train_df = self.surname_df[self.surname_df['split'] == 'train']
        self.len_train = len(self.train_df)

        self.val_df = self.surname_df[self.surname_df['split'] == 'val']
        self.len_val = len(self.val_df)

        self.test_df = self.surname_df[self.surname_df['split'] == 'test']
        self.len_test = len(self.test_df)

        self._lookup_table = {'train': (self.train_df, self.len_train),
                              'val': (self.val_df, self.len_val),
                              'test': (self.test_df, self.len_test)}

        self.set_split('train')

    def set_split(self, split = 'train'):
        self._target_split, self._len_split = self._lookup_table[split]
    
    @classmethod
    def load_dataset(cls, path):
        surnames = pd.read_csv(path)
        return cls(surnames, SunermaneVectorizer.from_dataframe(surnames))
    
    def __len__(self):
        return self._len_split

    def __getitem__(self, row_index):
        row = self._target_split.iloc[row_index]
        from_vector, to_vector  = self.vectorizer.vectorize(row['surname'], self._max_sequence_len)
        return {
            'x_data': from_vector,
            'y_data': to_vector,
            'class_index': row['nationality_index']}

    def get_num_batches(self, batch_size):
        return len(self)//batch_size

def generate_batches(dataset, batch_size, shuffle = True,
                     drop_last = True, device = 'cuda'): 
    # drop_last decides whether to drop the last batch if len(daset) % batchsize != 0
    dataloader = DataLoader(dataset = dataset, batch_size = batch_size,
                            shuffle = shuffle, drop_last = drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, _ in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

Let us check if everything is fine.

In [12]:
surnames = SurnameDataset.load_dataset('surnames_with_splits.csv')
print(surnames[0]['x_data'])
print(surnames[0]['y_data'])

[2 4 5 6 7 8 0 0 0 0 0 0 0 0 0 0 0 0 0]
[4 5 6 7 8 3 0 0 0 0 0 0 0 0 0 0 0 0 0]


We note that, indeed, `x_data` is 1-lag behind `y_data`.

## Model

The model will be only slightly different from last GRU model, only changing the head (no classification needed), include an embedding of the nationality, and add some dropout.

In [26]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim


class SurnameGenerationModel(nn.Module):
    def __init__(self, char_embedding_size, char_vocab_size, num_nationalities,
                 rnn_hidden_size, batch_first = True, padding_idx = 0, dropout_p = 0.5):
        super().__init__()
        
        self.char_emb = nn.Embedding(num_embeddings = char_vocab_size,
                                     embedding_dim = char_embedding_size,
                                     padding_idx = padding_idx)

        self.nation_emb = nn.Embedding(num_embeddings = num_nationalities,
                                       embedding_dim = rnn_hidden_size)

        self.rnn = nn.GRU(input_size = char_embedding_size, 
                          hidden_size = rnn_hidden_size,
                          batch_first = batch_first)
        
        self.fc = nn.Linear(in_features = rnn_hidden_size, 
                            out_features = char_vocab_size)
        
        self._dropout_p = dropout_p

    def forward(self, x_in, nationality_index, apply_softmax = False):
        x_embedded = self.char_emb(x_in)
        
        nationality_embedded = self.nation_emb(nationality_index).unsqueeze(0)

        # We use the nationality embedding as a first hidden state
        # Recall that y_out has all hidden states, not only the last one
        y_out, _ = self.rnn(x_embedded, nationality_embedded)

        batch_size, seq_size, feat_size = y_out.shape
        y_out = y_out.contiguous().view(batch_size * seq_size, feat_size)

        y_out = self.fc(F.dropout(y_out, p = self._dropout_p))
                         
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)

        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size, seq_size, new_feat_size)
            
        return y_out


## Training

Because we modularized the code (always modularize code!), training basically does not change.

In [27]:
class EarlyStopping:
    def __init__(self, patience = 5, min_delta = 0, save = None):
        self.patience = patience
        self.min_delta = min_delta
        self.save = save
        self.best_loss = np.inf
        self.patience_counter = 0
        self.flag = False
    
    def __call__(self, val_loss, model = None):
        # If the validation loss improved, we basically 
        # update the new best lost and save the model
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.patience_counter = 0
        
            if (self.save) and (model is not None):
                torch.save(model.state_dict(), self.save)
        
        else:
            self.patience_counter += 1
            if self.patience_counter > self.patience:
                self.flag = True

The loss will necessairly change a little.

In [28]:
def normalize_sizes(y_pred, y_true):
    """
    Normalize tensor sizes to meet what is expected by the loss function.
    """
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true



def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    # Only pick the indices that have been not masked to compute the
    # error, otherwise there would be a sequence length bias
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid * 100

def sequence_loss(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)

In [38]:
from torch.optim.lr_scheduler import StepLR

# Dataset
vectorizer = surnames.vectorizer
mask_index = vectorizer.vocab.mask_idx


# Model hyperparameters
char_embedding_size = 32
rnn_hidden_size = 32

# Training hyperparameters
lr = 1e-3
n_epochs = 100
batch_size = 128


# Allows for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model
classifier = SurnameGenerationModel(char_embedding_size = char_embedding_size,
                               char_vocab_size = len(vectorizer.vocab),
                               num_nationalities = len(surnames.train_df['nationality_index'].unique()),
                               rnn_hidden_size= rnn_hidden_size,
                               padding_idx = mask_index,
                               dropout_p = 0.5)
classifier.to(device)


# Optimization
optimizer = optim.Adam(classifier.parameters(), lr = lr)
scheduler = StepLR(optimizer, step_size = 10, gamma = 0.1)
early_stopping = EarlyStopping(min_delta = 0, save = 'best_gru_generation_model.pt')

for epoch in range(n_epochs): 
    surnames.set_split('train')
    batch_generator = generate_batches(surnames, batch_size = batch_size, device = device)
    classifier.train()

    running_loss = 0.
    running_acc = 0.

    for batch_index, batch_dict in enumerate(batch_generator):
        # 1. Zero the gradient
        classifier.zero_grad()

        # 2. Prediction
        y_pred = classifier(x_in = batch_dict['x_data'],
                            nationality_index = batch_dict['class_index'])

        # 3. Compute loss
        loss = sequence_loss(y_pred, batch_dict['y_data'], mask_index)


        # 4. Backpropagate
        loss.backward()
        
        # 5. Optimize
        optimizer.step()

        running_loss += (loss.item() - running_loss) / (batch_index + 1)
        acc_t = compute_accuracy(y_pred, batch_dict['y_data'], mask_index)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

    # Evaluation part, we don't want paramereres to change
    classifier.eval()
    surnames.set_split('val')

    batch_generator = generate_batches(surnames, batch_size = batch_size, device = device)
    
    running_loss = 0.
    running_acc = 0.
    classifier.eval()



    for batch_index, batch_dict in enumerate(batch_generator):
        y_pred = classifier(x_in = batch_dict['x_data'], 
                            nationality_index = batch_dict['class_index'])

        loss = sequence_loss(y_pred, batch_dict['y_data'], mask_index)

        running_loss += (loss.item() - running_loss) / (batch_index + 1)
        acc_t = compute_accuracy(y_pred, batch_dict['y_data'], mask_index)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

    # Updates early stop
    early_stopping(running_loss, classifier)
    if early_stopping.flag:
        print('Early stopped at epoch:', epoch)
        break

    # Updates scheduler
    scheduler.step()

    print('Epoch: ', epoch)
    print('Validation loss', running_loss)
    print('Validation accuracy: ', running_acc)

Epoch:  0
Validation loss 3.4284615119298296
Validation accuracy:  11.703848558619095
Epoch:  1
Validation loss 3.1108711759249372
Validation accuracy:  16.954573372335798
Epoch:  2
Validation loss 2.938374082247416
Validation accuracy:  18.887165055162278
Epoch:  3
Validation loss 2.822935819625855
Validation accuracy:  20.920857264460672
Epoch:  4
Validation loss 2.749722341696421
Validation accuracy:  22.260679767018736
Epoch:  5
Validation loss 2.699275175730387
Validation accuracy:  22.932706815493596
Epoch:  6
Validation loss 2.6611728469530744
Validation accuracy:  23.667307312633064
Epoch:  7
Validation loss 2.6381132801373797
Validation accuracy:  24.035927241641417
Epoch:  8
Validation loss 2.6024572451909385
Validation accuracy:  25.134773236507908
Epoch:  9
Validation loss 2.5872876246770224
Validation accuracy:  25.13582065617907
Epoch:  10
Validation loss 2.58618156115214
Validation accuracy:  25.47710355463245
Epoch:  11
Validation loss 2.5856249729792276
Validation accu

Let us check the test accuracy.

In [40]:
classifier.load_state_dict(torch.load("best_gru_generation_model.pt"))
classifier.eval()

# Evaluation part, we don't want paramereres to change
classifier.eval()
surnames.set_split('test')

batch_generator = generate_batches(surnames, batch_size = batch_size, device = device)

running_loss = 0.
running_acc = 0.

for batch_index, batch_dict in enumerate(batch_generator):
        y_pred = classifier(x_in = batch_dict['x_data'], 
                            nationality_index = batch_dict['class_index'])

        loss = sequence_loss(y_pred, batch_dict['y_data'], mask_index)

        running_loss += (loss.item() - running_loss) / (batch_index + 1)
        acc_t = compute_accuracy(y_pred, batch_dict['y_data'], mask_index)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

print('Validation loss', running_loss)
print('Validation accuracy: ', running_acc)

Validation loss 2.5740342140197754
Validation accuracy:  25.85422707320446


  classifier.load_state_dict(torch.load("best_gru_generation_model.pt"))


Very bad, but the one obtained by the book is not much better (27%).

## Decoding

We will now implement here sample decoding to use in the model. We will be using the temperature transformation to control the degree of hallucination of the generated text.

In [143]:
def sample_from_model(model, vectorizer, nationalities, sample_size = 20,
                      temperature = 1.0):
    # We can sample more than a sample each time
    num_samples = len(nationalities)
    # We will have a batch of num_samples, each starting 
    # with begin_seq_idx
    begin_seq_idx = [vectorizer.vocab.begin_seq_idx
                     for _ in range(num_samples)]
    begin_seq_idx = torch.tensor(begin_seq_idx, 
                                 dtype = torch.int64).unsqueeze(dim = 1)
    # This will be appended at each new time step
    indices = [begin_seq_idx]
    nationality_idx = torch.tensor(nationalities, 
                                   dtype = torch.int64)
    # We embed the nationality as the 0th hidden state of the model
    h_t = model.nation_emb(nationality_idx).unsqueeze(0)
    


    for time_step in range(sample_size):
        # We will basically reconstruct the model here
        # so that we can allow predictions of one time step
        # to serve as input of the next
        x_t = indices[time_step]
        x_embd_t = model.char_emb(x_t)
        rnn_out , h_t = model.rnn(x_embd_t, h_t)
        prediction_vector = model.fc(rnn_out.squeeze(dim = 1))
        probability_vector = F.softmax(prediction_vector / temperature, dim=1)
        # This samples according to the monomial given by the probability vector
        indices.append(torch.multinomial(probability_vector, num_samples=1))
    
    # We permute here because, as of now, it is of form [seq, batch]
    
    indices = torch.stack(indices).squeeze()
    indices = indices.view(indices.size(0),-1).permute(1, 0)
    return indices

def decode_samples(sampled_indices, vectorizer):
    decoded_surnames = []
    vocab = vectorizer.vocab
    
    for sample_idx in range(sampled_indices.shape[0]):
        surname = ''
        for time_step in range(sampled_indices.shape[1]):
            sampled_item = sampled_indices[sample_idx, time_step].item()
            if sampled_item == vocab.begin_seq_idx:
                continue
            elif sampled_item == vocab.end_seq_idx:
                break
            else:
                surname += vocab.lookup_index(sampled_item)
        decoded_surnames.append(surname)
    return decoded_surnames


In [157]:
classifier = classifier.cpu()
# This makes a dictionary where nationality is written as an index
nationality_to_idx = surnames.train_df.set_index("nationality")["nationality_index"].to_dict()
decode_samples(sample_from_model(classifier, vectorizer, [nationality_to_idx['German']]*20,  temperature = 0.4), vectorizer)

['Tons',
 'Nater',
 'Solun',
 'Ran',
 'Sarger',
 'Harir',
 'Parna',
 'Belso',
 'Narla',
 'Hashan',
 'Bertel',
 'Care',
 'Mare',
 'Selhi',
 'Samer',
 'Mannes',
 'Wetto',
 'Sher',
 'Nadel',
 'Serer']

Not very good, but well, expect for such a low accuracy model.