## RNN Language Model

An autoregressive RNN model which can generate people’s names.

In [1]:
import torch
import torch.nn as nn
import numpy as np
import string
import json
import os

### Create a vocabulary
- vocabulary: special tokens + letters (lower- and uppercase) + digits + the space character
    - "<bos>": Beginning of sequence token,
    - "." : End of sequence token, 
    - "": Empty string used to denote elements for the RNN to ignore
- id_to_char: maps each integer index to its character/token in the vocabulary
- char_to_id: the opposite of id_to_char, which maps each character/token in the vocabulary to a unique integer.

In [2]:
def get_vocab():
    """Build the character vocabulary together with its two lookup tables.

    Returns:
        tuple: ``(vocab, id_to_char, char_to_id)`` where ``vocab`` is the list
        of tokens, ``id_to_char`` maps an index to its token and
        ``char_to_id`` maps a token back to its index.
    """
    # Special tokens come first, so padding ("") is id 0, "<bos>" is id 1
    # and the end-of-sequence marker "." is id 2.
    special_tokens = ["", "<bos>", "."]
    characters = string.ascii_lowercase + string.ascii_uppercase + string.digits + " "
    vocab = [*special_tokens, *characters]

    id_to_char = dict(enumerate(vocab))
    char_to_id = {token: index for index, token in id_to_char.items()}
    return vocab, id_to_char, char_to_id

### Load data
- sequence modeling

In [3]:
def load_data(filename):
    """Load a JSON list of names and append the end-of-sequence token to each.

    Args:
        filename: Path to a JSON file whose top-level value is a list of
            strings.

    Returns:
        list[str]: The names from the file, each with "." appended.
    """
    # Use a context manager so the file handle is closed deterministically,
    # and read as UTF-8 (the standard JSON encoding) rather than relying on
    # the platform's default locale encoding.
    with open(filename, "r", encoding="utf-8") as f:
        names = json.load(f)
    return [name + "." for name in names]

### Convert Sequence to ID
- convert a list of sequences to a 2D numpy array of token IDs by char_to_id
- fix every sequence to max_len IDs: truncate longer sequences and pad shorter ones with 0

In [4]:
def seqs_to_ids(seqs, char_to_id, max_len=20):
    """Convert strings to a fixed-width 2-D array of token ids.

    Args:
        seqs: Iterable of strings. Falsy entries (``None`` or ``""``) are
            skipped and produce no row.
        char_to_id: Mapping from character to integer id. Characters that
            are not in the mapping are dropped from the sequence.
        max_len: Number of ids per row. Longer sequences are truncated,
            shorter ones are right-padded with 0.

    Returns:
        numpy.ndarray: Array of shape ``(num_kept_sequences, max_len)``.
        When no sequence is kept the result is an empty integer array of
        shape ``(0, max_len)`` so callers can still slice along axis 1.
    """
    rows = []
    for name in seqs:
        if not name:  # skip None / empty strings
            continue

        # name -> ids, keeping at most max_len of them.
        # NOTE: truncation can cut off a trailing end-of-sequence token.
        ids = [char_to_id[char] for char in name if char in char_to_id][:max_len]

        # Right-pad with the padding id (0) up to max_len.
        ids.extend([0] * (max_len - len(ids)))
        rows.append(ids)

    if not rows:
        # np.array([]) would be a 1-D float array of shape (0,); keep the
        # result 2-D and integer-typed instead.
        return np.zeros((0, max_len), dtype=int)
    return np.array(rows)

### RNN Class
- Embedding layer: convert character IDs to vectors
- GRU
- Linear out layer

In [5]:
class RNNLM(nn.Module):
    """Character-level language model: embedding -> GRU -> linear logits."""

    def __init__(self, vocab_size, emb_size=32, gru_size=32):
        """Create the three layers.

        Args:
            vocab_size: Number of distinct token ids (size of the output
                logits and of the embedding table).
            emb_size: Dimension of each token embedding.
            gru_size: Dimension of the GRU hidden state.
        """
        super().__init__()

        # Remember the layer sizes; callers read gru_size to build an
        # initial hidden state.
        self.emb_size = emb_size
        self.gru_size = gru_size

        # Token id -> vector. Index 0 is the padding id, so its embedding
        # is fixed at zero and receives no gradient.
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        # Recurrent layer; inputs are laid out as (batch, seq, feature).
        self.gru = nn.GRU(emb_size, gru_size, batch_first=True)
        # Projects every GRU output back onto the vocabulary.
        self.linear = nn.Linear(gru_size, vocab_size)

    def forward(self, x, h_last=None):
        """Compute next-token logits for every position of ``x``.

        Args:
            x: Integer tensor of token ids, fed to the embedding layer.
            h_last: Optional initial hidden state for the GRU. ``None``
                lets the GRU start from its default (zero) state.

        Returns:
            tuple: ``(logits, h)`` where ``logits`` is the linear layer
            applied to every GRU output and ``h`` is the final hidden state.
        """
        # discrete ids -> continuous vectors
        vectors = self.emb(x)

        # nn.GRU accepts None for the hidden state, which is equivalent to
        # calling it without one.
        states, h = self.gru(vectors, h_last)

        return self.linear(states), h

### Model Training

In [6]:
def train_model(model, Xtrain, Ytrain, Xval, Yval, id_to_char, max_epoch,
                batch_size=32, lr=0.0001):
    """Train ``model`` with Adam and print the validation loss after each epoch.

    Args:
        model: Module whose call returns ``(logits, hidden)`` with logits
            indexed as (batch, position, vocabulary).
        Xtrain: Tensor of training input ids, one row per sequence.
        Ytrain: Tensor of training target ids, same shape as ``Xtrain``.
            Positions with target id 0 (padding) are ignored by the loss.
        Xval: Tensor of validation input ids.
        Yval: Tensor of validation target ids.
        id_to_char: Unused; kept so existing callers keep working.
        max_epoch: Number of passes over the training data.
        batch_size: Number of sequences per optimisation step.
        lr: Learning rate passed to Adam.

    Returns:
        None. The model is updated in place.
    """
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    lossfn = nn.CrossEntropyLoss(ignore_index=0)

    num_examples = Xtrain.shape[0]

    # run the main training loop over many epochs
    for e in range(max_epoch):
        model.train()

        # shuffle the training data
        permutation = torch.randperm(num_examples)
        Xtrain_shuffled = Xtrain[permutation]
        Ytrain_shuffled = Ytrain[permutation]

        # iterate over the dataset in batches (the last one may be smaller)
        for i in range(0, num_examples, batch_size):
            Xbatch = Xtrain_shuffled[i:i + batch_size]
            Ybatch = Ytrain_shuffled[i:i + batch_size]

            # one optimisation step
            optim.zero_grad()
            logits, _ = model(Xbatch)
            # flatten (batch, position) so the loss sees one row per token
            loss = lossfn(logits.reshape(-1, logits.shape[-1]), Ybatch.reshape(-1))
            loss.backward()
            optim.step()

        # after each epoch, compute the validation loss in eval mode and
        # without building an autograd graph
        model.eval()
        with torch.no_grad():
            logits_val, _ = model(Xval)
            val_loss = lossfn(logits_val.reshape(-1, logits_val.shape[-1]), Yval.reshape(-1))
            print(f"Epoch {e+1}/{max_epoch} - Validation Loss: {val_loss.item()}")
        # leave the model in training mode, ready for the next epoch/caller
        model.train()


### Generate sequence of tokens

In [7]:
def gen_string(model, id_to_char, max_len=20, sample=True):
    """Generate one string from the model, one character at a time.

    Generation starts from the ``<bos>`` token (id 1) with an all-zero
    hidden state and stops when the end token "." is produced or after
    ``max_len`` steps.

    Args:
        model: Module with a ``gru_size`` attribute whose call
            ``model(x, h)`` returns ``(logits, h)``.
        id_to_char: Mapping from token id to its string.
        max_len: Maximum number of generation steps.
        sample: If True, draw each token from the softmax distribution;
            otherwise take the most likely token (argmax decoding).

    Returns:
        str: The generated characters, without the end token.
    """
    # Remember the caller's mode so it can be restored afterwards.
    was_training = model.training
    model.eval()

    # The padding token and <bos> are never valid outputs; exclude them so
    # the literal "<bos>" cannot leak into the generated string.
    blocked_ids = [i for i, tok in id_to_char.items() if tok in ("", "<bos>")]

    # Create the initial tensors on the same device as the model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # we will use a batch size of one for generation
    x = torch.ones((1, 1), dtype=torch.long, device=device)  # <bos> token id = 1
    h = torch.zeros((1, 1, model.gru_size), dtype=torch.float, device=device)  # h0 is all zeros
    pieces = []

    try:
        # eval() alone does not disable autograd; no_grad() avoids building
        # a graph through every generation step.
        with torch.no_grad():
            for _ in range(max_len):
                logits, h = model(x, h)

                # logits for the single sequence at its last position
                step_logits = logits[0, -1].clone()
                if blocked_ids:
                    step_logits[blocked_ids] = float("-inf")
                probs = torch.nn.functional.softmax(step_logits, dim=-1)

                if sample:
                    # sample from the distribution
                    next_token_id = torch.multinomial(probs, 1)
                else:
                    # use argmax decoding
                    next_token_id = torch.argmax(probs, dim=-1)

                next_char = id_to_char[next_token_id.item()]

                # stop once the end token is reached
                if next_char == '.':
                    break

                pieces.append(next_char)

                # the chosen token becomes the input of the next step
                x = next_token_id.view(1, 1)
    finally:
        # put the model back into whichever mode the caller had it in
        model.train(was_training)

    return "".join(pieces)

### Val loss

In [8]:
def calc_val_loss(model, Xval, Yval, batch_size=32):
    """Compute the average cross-entropy per non-padding target character.

    Args:
        model: Module whose call returns ``(logits, hidden)`` with logits
            indexed as (batch, position, vocabulary).
        Xval: Tensor of validation input ids, one row per sequence.
        Yval: Tensor of validation target ids, same shape as ``Xval``.
            Targets equal to 0 (padding) are ignored.
        batch_size: Number of sequences evaluated per forward pass.

    Returns:
        numpy.float64: Summed loss divided by the number of non-zero
        targets, or NaN when there are no non-zero targets (including an
        empty validation set).
    """
    # sum (not mean) per batch so batches of different size combine exactly
    lossfn = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

    # Remember the caller's mode so it can be restored afterwards.
    was_training = model.training
    model.eval()

    total_loss = 0.0
    total_chars = 0
    num_examples = Xval.shape[0]

    try:
        # eval() alone does not disable autograd; no_grad() does.
        with torch.no_grad():
            # stepping by batch_size also covers a final, smaller batch
            for s in range(0, num_examples, batch_size):
                Xbatch = Xval[s:s + batch_size]
                Ybatch = Yval[s:s + batch_size]

                out, _ = model(Xbatch)

                # CrossEntropyLoss expects the class dimension second:
                # (batch, vocabulary, position)
                total_loss += lossfn(out.permute(0, 2, 1), Ybatch).item()
                total_chars += torch.count_nonzero(Ybatch).item()
    finally:
        # put the model back into whichever mode the caller had it in
        model.train(was_training)

    if total_chars == 0:
        # nothing to average over; avoid a ZeroDivisionError
        return np.float64("nan")

    # average loss per character
    return np.float64(total_loss / total_chars)


### Main function

In [9]:
def main():
    """Train the name model on data/names.json and print generated names."""
    # character vocabulary and its lookup tables
    vocab, id_to_char, char_to_id = get_vocab()

    # names from disk, each already terminated with the end token
    names = load_data(os.path.join("data", "names.json"))

    # Targets are the id sequences themselves. Inputs are the same
    # sequences shifted right by one with the <bos> id (1) in front, dropping
    # the last column so both arrays keep the same width.
    targets = seqs_to_ids(names, char_to_id)
    bos_column = np.ones((targets.shape[0], 1))
    inputs = np.concatenate([bos_column, targets[:, :-1]], axis=1)

    # first 90% of the rows train the model, the remainder validates it
    split = int(inputs.shape[0] * 0.9)

    def to_ids(array):
        return torch.tensor(array, dtype=torch.long)

    Xtrain, Xval = to_ids(inputs[:split]), to_ids(inputs[split:])
    Ytrain, Yval = to_ids(targets[:split]), to_ids(targets[split:])

    # build and train the model
    model = RNNLM(len(vocab))
    train_model(model, Xtrain, Ytrain, Xval, Yval, id_to_char, max_epoch=10)

    # one greedy sample followed by ten random ones
    print("Argmax: ", gen_string(model, id_to_char, sample=False))
    print("Random:")
    for _ in range(10):
        print(gen_string(model, id_to_char))

In [10]:
# Run the train-and-generate pipeline only when this code is the program's
# entry point, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Epoch 1/10 - Validation Loss: 2.7959718704223633
Epoch 2/10 - Validation Loss: 2.560406446456909
Epoch 3/10 - Validation Loss: 2.456402540206909
Epoch 4/10 - Validation Loss: 2.4059159755706787
Epoch 5/10 - Validation Loss: 2.3775088787078857
Epoch 6/10 - Validation Loss: 2.3552451133728027
Epoch 7/10 - Validation Loss: 2.3368332386016846
Epoch 8/10 - Validation Loss: 2.32159686088562
Epoch 9/10 - Validation Loss: 2.3085272312164307
Epoch 10/10 - Validation Loss: 2.2972211837768555
Argmax:  John Marton
Random:
Mithake Lehid
Al Erjerd Benettton
Chrentrin Gdeter
Neizari Repergum
Div Yocott
Carasher Farbenininb
Ravid Vavaten
Tmatsaid Bantharlin
Liomailear Try
Tonrii Walaed
