In [1]:
from argparse import Namespace
from functools import partial
from pathlib import Path
from collections import (
    OrderedDict,Counter
)

# torch
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import default_collate
## other
from torchdata import datapipes as dp
from torchtext import vocab
from torchdata.dataloader2 import DataLoader2
from torchmetrics import Accuracy

# manipulation
import numpy as np

# visualization
import matplotlib.pyplot as plt

# others
from tqdm import tqdm

In [2]:
args = Namespace(
    # data
    data_base_path = "../../data/surnames/",
    dataset = ["train","test","val"],
    
    # vocab
    mask_tkn = "<MASK>",
    ukn_tkn = "<UKN>",
    beg_tkn = "<B>",
    end_tkn = "<E>",
    
    # model
    embedding_size = 10,
    rnn_hidden_state = 9,
    model_base_path = "../../models/seq_model/seq_pred",
    model_filename = "model.pth",
    
    # Training
    num_epochs = 100,
    batch_size = 2,
    learning_rate = 1e-3,
    seed = 1423,
    
    # runtime
    cuda = torch.cuda.is_available(),
    device = "cuda" if torch.cuda.is_available() else "cpu",
)

for k,v in args._get_kwargs():
    if "base" in k:
        Path(v).mkdir(parents=True,exist_ok=True)

# Datapipe

## build pipe dict

In [3]:
def build_pipe_dict(args=args):
    pipe_dict = {}
    for fname in args.dataset:
        pipe_dict[fname] = dp.iter.FileOpener([args.data_base_path+f"{fname}.csv"]).parse_csv()
    return pipe_dict

In [4]:
pipe_dict = build_pipe_dict()
pipe_dict

{'train': CSVParserIterDataPipe,
 'test': CSVParserIterDataPipe,
 'val': CSVParserIterDataPipe}

In [5]:
{k:len(list(pipe)) for k,pipe in pipe_dict.items()}

{'train': 7684, 'test': 1648, 'val': 1648}

In [6]:
{k:next(iter(pipe)) for k,pipe in pipe_dict.items()}

{'train': ['Woodford', 'English'],
 'test': ['Kore', 'English'],
 'val': ['Winship', 'English']}

# build vocab dict

In [7]:
def build_vocab_dict(train_pipe,args=args):
    name_counter = Counter()
    nation_counter = Counter()
    max_seq_length = -1
    for name,nation in train_pipe:
        max_seq_length = max(len(name),max_seq_length)
        name_counter.update(name)
        nation_counter.update([nation])
        
    sort_fn = lambda kf : (-kf[1],kf[0])
    name_sort_tuples = sorted(name_counter.items(),key=sort_fn)
    nation_sort_tuples = sorted(nation_counter.items(),key=sort_fn)
    
    name_vocab = vocab.vocab(ordered_dict=OrderedDict(name_sort_tuples),
                             specials=[args.mask_tkn,
                                       args.ukn_tkn,
                                       args.beg_tkn,
                                       args.end_tkn])
    name_vocab.set_default_index(name_vocab[args.ukn_tkn])
    name_vocab.max_seq_length = max_seq_length + 2
    
    nation_vocab = vocab.vocab(ordered_dict=OrderedDict(nation_sort_tuples))
    freq = [count for _,count in nation_sort_tuples]
    nation_vocab.class_weights = 1.0 / torch.tensor(freq)
    
    return {"char":name_vocab,
            "nation":nation_vocab}

In [8]:
vocab_dict = build_vocab_dict(pipe_dict["train"])
vocab_dict

{'char': Vocab(), 'nation': Vocab()}

In [9]:
{k:len(vocab) for k,vocab in vocab_dict.items()}

{'char': 83, 'nation': 18}

In [10]:
vocab_dict["char"].max_seq_length

19

In [11]:
vocab_dict["nation"].class_weights

tensor([0.0005, 0.0006, 0.0009, 0.0018, 0.0024, 0.0025, 0.0034, 0.0055, 0.0060,
        0.0063, 0.0065, 0.0078, 0.0092, 0.0119, 0.0189, 0.0192, 0.0250, 0.0263])

## build dataset dict

In [46]:
def vectorize(name,char_vocab,args=args):
    indices = [char_vocab[args.beg_tkn]]
    indices.extend(char_vocab.lookup_indices(list(name)))
    indices.append(char_vocab[args.end_tkn])
    
    from_vector = np.full(shape=char_vocab.max_seq_length,
                          fill_value=char_vocab[args.mask_tkn],
                          dtype=np.int64)
    from_indices = indices[:-1]
    from_vector[:len(from_indices)] = from_indices
    
    to_vector = np.full(shape=char_vocab.max_seq_length,
                        fill_value=char_vocab[args.mask_tkn],
                        dtype=np.int64)
    to_indices = indices[1:]
    to_vector[:len(to_indices)] = to_indices
    
    return from_vector,to_vector

In [47]:
vectorize("lak",vocab_dict["char"])

(array([ 2, 11,  4, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=int64),
 array([11,  4, 15,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=int64))

In [14]:
def create_dataset(vocab_dict,args,row):
    from_vector , to_vector  = vectorize(row[0],char_vocab=vocab_dict["char"],args=args)
    nation_index = vocab_dict["nation"][row[1]]
    
    return {"x":from_vector,
            "y":to_vector,
            "class_index":nation_index}
    
def collate_fn(args,batch):
    return {k:v.to(args.device)
            for k,v in default_collate(batch).items()}

In [15]:
def build_dataset_dict(pipe_dict,vocab_dict,args=args):
    dataset_dict = {}
    fn = partial(create_dataset,vocab_dict,args)
    for dataset,pipe in pipe_dict.items():
        if dataset == "train":
            pipe = pipe.shuffle()
        
        pipe = pipe.map(fn)
        pipe = pipe.batch(args.batch_size,drop_last=True)
        pipe = pipe.collate(partial(collate_fn,args))
        dataset_dict[dataset] = pipe
    
    return dataset_dict

In [16]:
dataset_dict = build_dataset_dict(pipe_dict,vocab_dict)
dataset_dict

{'train': CollatorIterDataPipe,
 'test': CollatorIterDataPipe,
 'val': CollatorIterDataPipe}

In [17]:
{k:len(list(pipe)) for k,pipe in dataset_dict.items()}

{'train': 3842, 'test': 824, 'val': 824}

# Model

In [48]:
class SurnameGenerativeModel(nn.Module):
    def __init__(self,char_embedding_size,char_vocab_size,
                 rnn_hidden_size,batch_first=True,padding_idx = 0,
                 dropout_rate = 0.5) -> None:
        super().__init__()
        self.char_emb = nn.Embedding(embedding_dim=char_embedding_size,
                                     num_embeddings=char_vocab_size,
                                     padding_idx=padding_idx)
        self.rnn = nn.GRU(input_size = char_embedding_size,
                          hidden_size = rnn_hidden_size,
                          batch_first = batch_first)
        self.fc = nn.Linear(in_features=rnn_hidden_size,
                            out_features=char_vocab_size)
        self.dropout_rate = dropout_rate
        
    def forward(self,input,apply_softmax=False):
        # shape [batch,seq_length]
        x_emb = self.char_emb(input)
        # shape [batch,seq,emb]
        y_out,_ = self.rnn(x_emb)
        # shape [batch,seq,hidden]
        batch_size,seq_size,feat_size = y_out.size()
        
        y_out = y_out.contiguous().view(batch_size*seq_size,feat_size)
        
        y_out = self.fc(F.dropout(y_out,p=self.dropout_rate))
        
        if apply_softmax:
            y_out = F.softmax(y_out,dim=1)
        
        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size,seq_size,new_feat_size)
        
        return y_out

In [49]:
model = SurnameGenerativeModel(char_embedding_size=args.embedding_size,
                               char_vocab_size=len(vocab_dict["char"]),
                               rnn_hidden_size=args.rnn_hidden_state,
                               padding_idx=vocab_dict["char"][args.mask_tkn])

In [50]:
pipe_dict = build_pipe_dict()
vocab_dict = build_vocab_dict(pipe_dict["train"])
dataset_dict = build_dataset_dict(pipe_dict,vocab_dict,args)

In [51]:
sample = next(iter(dataset_dict["train"]))
sample

{'x': tensor([[ 2, 23,  4, 19,  5, 21, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 2, 36,  6,  8, 13,  5, 20,  6,  9, 17,  6,  0,  0,  0,  0,  0,  0,  0,
           0]]),
 'y': tensor([[23,  4, 19,  5, 21, 12,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0],
         [36,  6,  8, 13,  5, 20,  6,  9, 17,  6,  3,  0,  0,  0,  0,  0,  0,  0,
           0]]),
 'class_index': tensor([2, 4])}

In [52]:
vocab_dict["char"].max_seq_length

19

In [53]:
sample_embed = model.char_emb(sample["x"])
sample_embed.shape

torch.Size([2, 19, 10])

In [54]:
sample_rnn,_ = model.rnn(sample_embed)
sample_rnn.shape

torch.Size([2, 19, 9])

In [55]:
batch_size,seq_size,feat_size = sample_rnn.size()
sample_out = sample_rnn.contiguous().view(batch_size*seq_size,feat_size)
sample_out.shape

torch.Size([38, 9])

In [56]:
len(vocab_dict["char"])

83

In [57]:
sample_fc = model.fc(sample_out)
sample_fc.shape

torch.Size([38, 83])

In [58]:
sample_y_pred = sample_fc.view(batch_size,seq_size,-1)
sample_y_pred.shape

torch.Size([2, 19, 83])

In [59]:
sample_y_pred

tensor([[[ 0.1896, -0.2189,  0.0741,  ...,  0.2338, -0.0344, -0.2021],
         [ 0.2890, -0.5058,  0.3368,  ...,  0.2131, -0.0575, -0.4025],
         [ 0.2100, -0.3422,  0.2796,  ...,  0.1858, -0.2470, -0.3566],
         ...,
         [ 0.2238, -0.3094,  0.1804,  ...,  0.0646, -0.1630, -0.3456],
         [ 0.2238, -0.3094,  0.1806,  ...,  0.0649, -0.1632, -0.3451],
         [ 0.2237, -0.3093,  0.1807,  ...,  0.0650, -0.1634, -0.3448]],

        [[ 0.1896, -0.2189,  0.0741,  ...,  0.2338, -0.0344, -0.2021],
         [ 0.2557, -0.4169,  0.0987,  ..., -0.0110, -0.1196, -0.3164],
         [ 0.1859, -0.2784,  0.1836,  ...,  0.0848, -0.2731, -0.2868],
         ...,
         [ 0.2282, -0.2930,  0.1882,  ...,  0.0735, -0.1640, -0.3412],
         [ 0.2262, -0.2988,  0.1854,  ...,  0.0708, -0.1632, -0.3417],
         [ 0.2250, -0.3025,  0.1837,  ...,  0.0688, -0.1630, -0.3423]]],
       grad_fn=<ViewBackward0>)

In [60]:
sample_y_pred.view(-1,sample_y_pred.size(2))

tensor([[ 0.1896, -0.2189,  0.0741,  ...,  0.2338, -0.0344, -0.2021],
        [ 0.2890, -0.5058,  0.3368,  ...,  0.2131, -0.0575, -0.4025],
        [ 0.2100, -0.3422,  0.2796,  ...,  0.1858, -0.2470, -0.3566],
        ...,
        [ 0.2282, -0.2930,  0.1882,  ...,  0.0735, -0.1640, -0.3412],
        [ 0.2262, -0.2988,  0.1854,  ...,  0.0708, -0.1632, -0.3417],
        [ 0.2250, -0.3025,  0.1837,  ...,  0.0688, -0.1630, -0.3423]],
       grad_fn=<ViewBackward0>)

In [61]:
sample["y"]

tensor([[23,  4, 19,  5, 21, 12,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0],
        [36,  6,  8, 13,  5, 20,  6,  9, 17,  6,  3,  0,  0,  0,  0,  0,  0,  0,
          0]])

# Loss fn

In [62]:
def normalize_size(y_pred,y_true):
    if len(y_pred.size()) == 3:
        # shape [batch,seq,hidden]
        y_pred = y_pred.contiguous().view(-1,y_pred.size(dim=2))
        # shape [batch*seq,hidden]
    if len(y_true.size()) == 2:
        # shape [batch,seq]
        y_true = y_true.contiguous().view(-1)
        # shape [batch*seq]
    return y_pred,y_true
    
    


def sequence_loss(y_pred,y_true,mask_index):
    y_pred , y_true = normalize_size(y_pred,y_true)
    return F.cross_entropy(input=y_pred, # shape [batch*seq,hidden]
                           target=y_true, # shape [batch*seq]
                           ignore_index=mask_index # this remove value in the target and wont compute loss 
                           )

In [63]:
y_pred = torch.rand(size=(2,7,9))
y_true = torch.randint(low=0,high=7,size=(2,7))

In [64]:
y_true

tensor([[4, 3, 0, 3, 4, 4, 3],
        [6, 2, 6, 4, 1, 5, 6]])

In [65]:
sequence_loss(y_pred,y_true,0)

tensor(2.1802)

In [66]:
def compute_acc(y_pred,y_true,mask_indices):
    y_pred ,y_true = normalize_size(y_pred,y_true)
    
    _,y_pred_indices =y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices,y_true).float()
    valid_indices = torch.ne(y_true,mask_indices).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()
    
    return (n_correct / n_valid)  * 100    

In [67]:
compute_acc(y_pred,y_true,0)

15.384615384615385

# Training

## helper function

In [68]:
def make_train_state(args=args):
    return {"stop_early":False,
            "early_stopping_step":0,
            "early_stopping_val":1e5,
            "epoch_index":0,
            "model_filepath":args.model_base_path+args.model_filename,
            "train_loss":[],
            "train_acc":[],
            "val_loss":[],
            "val_acc":[],
            "test_loss":-1,
            "test_acc":-1}

def update_train_state(train_state,model,args=args):
    if train_state["epoch_index"] == 0:
        torch.save(model.state_dict(),train_state["model_filepath"])
        train_state["stop_early"] = False
    
    elif train_state["epoch_index"] >= 1:
        loss_tm1 , loss_t = train_state["val_loss"][-2:]
        if loss_t >= train_state["early_stopping_val"]:
            train_state["early_stopping_step"] +=1
        else:
            torch.save(model.state_dict(),train_state["model_filepath"])
            train_state["early_stopping_step"] = 0
            
        train_state["stop_early"] = train_state["early_stopping_step"] >= args.early_stopping_criteria
    
    return train_state

## Initialize

In [69]:
pipe_dict = build_pipe_dict()
vocab_dict = build_vocab_dict(pipe_dict["train"])
dataset_dict = build_dataset_dict(pipe_dict,vocab_dict,args)

In [70]:
model = SurnameGenerativeModel(char_embedding_size=args.embedding_size,
                               char_vocab_size=len(vocab_dict["char"]),
                               rnn_hidden_size=args.rnn_hidden_state)
optimizer = optim.Adam(params=model.parameters(),
                       lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode="min",factor=0.5,
                                                 patience=1)
train_state = make_train_state(args)
mask_index = vocab_dict["char"][args.mask_tkn]

## Loop

In [72]:
for epoch_index in tqdm(range(args.num_epochs)):
    train_state["epoch_index"] = epoch_index
    
    # get the data
    batch_generator = DataLoader2(datapipe=dataset_dict["train"])
    model.train()
    running_loss = 0.0
    running_acc = 0.0
    
    for batch_idx,batch_dict in enumerate(batch_generator):
        model.zero_grad()
        logits = model(batch_dict["x"])
        
        # loss
        loss = sequence_loss(logits,batch_dict["y"],mask_index)
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) /(batch_idx+1)
        
        # acc
        acc_t = compute_acc(logits,batch_dict["y"],mask_index)
        running_acc += (acc_t - running_acc) / (batch_idx+1)
        
        loss.backward()
        optimizer.step()
        
    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)
    
    # iterate over the val 
    batch_generator = DataLoader2(dataset_dict["val"])
    running_loss = 0.0
    running_acc = 0.0
    model.eval()
    
    for batch_idx,batch_dict in enumerate(batch_generator):
        with torch.inference_mode():
            logits = model(batch_dict["x"])
            
            # loss
            loss = sequence_loss(logits,batch_dict["y"],mask_index)
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) /(batch_idx+1)
            
            # acc
            acc_t = compute_acc(logits,batch_dict["y"],mask_index)
            running_acc += (acc_t - running_acc) / (batch_idx+1)   
            
    train_state["val_loss"].append(running_loss)         
    train_state["val_acc"].append(running_acc)
    
    scheduler.step(train_state["val_loss"][-1])
    
    if train_state["stop_early"]:
        break          

  1%|          | 1/100 [00:35<59:15, 35.92s/it]


KeyboardInterrupt: 

In [None]:
plt.plot(train_state["train_loss"],label="train_loss")
plt.plot(train_state["train_acc"],label="train_acc")
plt.plot(train_state["val_loss"],label="val_loss")
plt.plot(train_state["val_acc"],label="val_acc")
plt.legend();

In [73]:
batch_generator = DataLoader2(dataset_dict["test"])
running_loss = 0.0
running_acc = 0.0
model.eval()

for batch_idx,batch_dict in enumerate(batch_generator):
    with torch.inference_mode():
        logits = model(batch_dict["x"])
        
        # loss
        loss = sequence_loss(logits,batch_dict["y"],mask_index)
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) /(batch_idx+1)
        
        # acc
        acc_t = compute_acc(logits,batch_dict["y"],mask_index)
        running_acc += (acc_t - running_acc) / (batch_idx+1)   
        
train_state["test_loss"] = running_loss
train_state["test_acc"] = running_acc

AttributeError: 'int' object has no attribute 'append'

In [None]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

In [84]:
def sample_from_model(model,char_vocab,num_samples,sample_size=20,args=args):
    # creating the begin index for all samples
    begin_seq_index = [char_vocab[args.beg_tkn]
                       for _ in range(num_samples)]
    # converting and add dim
    begin_seq_index = torch.tensor(begin_seq_index,dtype=torch.int64).unsqueeze(dim=1)
    
    indices = [begin_seq_index]
    h_t = None
    
    for time_step in range(sample_size):
        x_t = indices[time_step]
        print(x_t.shape)
        # shape [num_sample,1]
        x_emd_t = model.char_emb(x_t)
        print(x_emd_t.shape)
        # shape [num_sample,1,emb]
        rnn_t,h_t = model.rnn(x_emd_t,h_t)
        print(rnn_t.shape)
        print(h_t.shape)
        # shape rnn_t [num_samples,1,hidden]
        # shape h_t [1,hidden]
        pred_vector = model.fc(
            rnn_t.squeeze(dim=1)  # shape [num_sample,hidden]
            )
        print(pred_vector.shape)
        # shape [num_sample,char_vocab_size]
        prob_vector = F.softmax(pred_vector,dim=1)
        # shape [num_sample,char_vocab_size]
        indices.append(torch.multinomial(prob_vector,num_samples=1))
    indices = torch.stack(indices)
    print(indices.shape)
    indices = indices.squeeze().permute(1,0)
    return indices     

In [85]:
def decode_samples(sampled_indices,char_vocab,args=args):
    decoded_surnames = []
    
    for sample_index in range(sampled_indices.shape[0]):
        surname = ""
        for time_step in range(sampled_indices.shape[1]):
            sample_item = sampled_indices[sample_index,time_step].item()
            if sample_item == char_vocab[args.beg_tkn]:
                continue
            elif sample_item == char_vocab[args.end_tkn]:
                break
            else:
                surname += char_vocab.get_itos()[sample_index]
        decoded_surnames.append(surname)
    return decoded_surnames

In [86]:
num_names = 10
model = model.cpu()
sampled_surnames = decode_samples(sample_from_model(model,vocab_dict["char"],num_samples=num_names),
                                  char_vocab=vocab_dict["char"])
sampled_surnames

torch.Size([10, 1])
torch.Size([10, 1, 10])
torch.Size([10, 1, 9])
torch.Size([1, 10, 9])
torch.Size([10, 83])
torch.Size([10, 1])
torch.Size([10, 1, 10])
torch.Size([10, 1, 9])
torch.Size([1, 10, 9])
torch.Size([10, 83])
torch.Size([10, 1])
torch.Size([10, 1, 10])
torch.Size([10, 1, 9])
torch.Size([1, 10, 9])
torch.Size([10, 83])
torch.Size([10, 1])
torch.Size([10, 1, 10])
torch.Size([10, 1, 9])
torch.Size([1, 10, 9])
torch.Size([10, 83])
torch.Size([10, 1])
torch.Size([10, 1, 10])
torch.Size([10, 1, 9])
torch.Size([1, 10, 9])
torch.Size([10, 83])
torch.Size([10, 1])
torch.Size([10, 1, 10])
torch.Size([10, 1, 9])
torch.Size([1, 10, 9])
torch.Size([10, 83])
torch.Size([10, 1])
torch.Size([10, 1, 10])
torch.Size([10, 1, 9])
torch.Size([1, 10, 9])
torch.Size([10, 83])
torch.Size([10, 1])
torch.Size([10, 1, 10])
torch.Size([10, 1, 9])
torch.Size([1, 10, 9])
torch.Size([10, 83])
torch.Size([10, 1])
torch.Size([10, 1, 10])
torch.Size([10, 1, 9])
torch.Size([1, 10, 9])
torch.Size([10, 83])
t

['<MASK><MASK><MASK><MASK><MASK>',
 '<UKN><UKN><UKN><UKN><UKN><UKN><UKN><UKN><UKN><UKN><UKN><UKN><UKN>',
 '<B>',
 '<E><E><E><E>',
 'aaaaa',
 'eee',
 'oooooo',
 'ii',
 'nnnnnnn',
 'rrrrrrr']