In [1]:
import torch
import torchtext
import json
import os 
import re
import string

import pandas as pd
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn

from torchtext.data import (Dataset, Field, LabelField, 
                            BucketIterator, Iterator, Example) 
from tqdm.notebook import tqdm as tqdm_notebook
from argparse import Namespace

# Args Configurations

In [2]:
args = Namespace(
    # Data Path and Information
    surnames_csv="data/surnames/surnames_with_splits.csv",
    model_state_file="model.pth",
    save_dir="model_storage",
    # Model hyper parameters
    hidden_dim=300,
    # Training  hyper parameters
    seed=1337,
    num_epochs=100,
    early_stopping_criteria=5,
    learning_rate=0.001,
    batch_size=64,
    # Runtime options
    catch_keyboard_interrupt=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
    cuda=False
)

## Global Settings

In [3]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

if args.expand_filepaths_to_save_dir:
    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

print("Using CUDA: {}".format(args.cuda))

args.device = torch.device("cuda" if args.cuda else "cpu")

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/model.pth
Using CUDA: False


# Load Dataset

In [4]:
surnames = pd.read_csv(args.surnames_csv)

In [5]:
surnames.head()

Unnamed: 0,surname,nationality,split
0,Piccoli,Italian,train
1,Matano,Japanese,train
2,Batten,English,train
3,Yarnold,English,train
4,Tutton,English,train


# TorchText Fields

In [6]:
def tokenizer(text):
    text = [char for char in text]
    return text

In [7]:
SURNAME = Field(sequential=True, pad_token=None, tokenize=tokenizer, lower=False)
NATIONALITY = LabelField(dtype=torch.float32, lower=True)

# TorchText Dataset

In [8]:
class SurnamesDataset(Dataset):
    def __init__(self, surnames_df, fields):
        examples = []
        for idx, row in tqdm_notebook(surnames_df.iterrows(), 
                                      total=surnames_df.shape[0]):
            surname = row.surname
            nationality = row.nationality
            examples.append(Example.fromlist([surname, nationality], fields))
        super().__init__(examples, fields)
            
    @staticmethod
    def sort_key(data):
        return len(data.surname)
    
    @classmethod
    def splits(cls, fields, surnames_df):
        train_data = cls(surnames_df[surnames_df.split == "train"], fields)
        val_data = cls(surnames_df[surnames_df.split == "val"], fields)
        test_data = cls(surnames_df[surnames_df.split == "test"], fields)
        return train_data, val_data, test_data        

In [9]:
fields = [('surname', SURNAME), ('nationality', NATIONALITY)]
train_data, val_data, test_data = SurnamesDataset.splits(fields, surnames)

HBox(children=(FloatProgress(value=0.0, max=7685.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1647.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1648.0), HTML(value='')))




In [10]:
vars(train_data[0])

{'surname': ['P', 'i', 'c', 'c', 'o', 'l', 'i'], 'nationality': 'italian'}

## Build Vocab

In [11]:
SURNAME.build_vocab(train_data, min_freq=0, specials=['<unk>'])
NATIONALITY.build_vocab(train_data)

In [12]:
vars(SURNAME.vocab).keys()

dict_keys(['freqs', 'itos', 'unk_index', 'stoi', 'vectors'])

In [13]:
len(SURNAME.vocab.itos)

82

In [14]:
vars(NATIONALITY.vocab)

{'freqs': Counter({'italian': 420,
          'japanese': 542,
          'english': 2080,
          'russian': 1661,
          'arabic': 1122,
          'dutch': 165,
          'french': 160,
          'german': 403,
          'polish': 84,
          'irish': 128,
          'czech': 290,
          'greek': 109,
          'spanish': 181,
          'scottish': 52,
          'portuguese': 39,
          'chinese': 154,
          'vietnamese': 41,
          'korean': 54}),
 'itos': ['english',
  'russian',
  'arabic',
  'japanese',
  'italian',
  'german',
  'czech',
  'spanish',
  'dutch',
  'french',
  'chinese',
  'irish',
  'greek',
  'polish',
  'korean',
  'scottish',
  'vietnamese',
  'portuguese'],
 'unk_index': None,
 'stoi': defaultdict(None,
             {'english': 0,
              'russian': 1,
              'arabic': 2,
              'japanese': 3,
              'italian': 4,
              'german': 5,
              'czech': 6,
              'spanish': 7,
              'dutch':

# One Hot Vectorizer

In [15]:
class Vectorizer(object):
    def __init__(self, surname_field):
        self._surname_field = surname_field
    
    def vectorize(self, batch_matrix):
        batch_size = batch_matrix.shape[1]
        one_hot = torch.zeros((batch_size, len(self._surname_field.vocab.itos)), dtype=torch.long)
        indices = batch_matrix.T
        source = (indices!=0).long()
        one_hot.scatter_(1, indices, source)
        return one_hot[:, 1:]

In [16]:
oh_vectorizer = Vectorizer(SURNAME)

# Define Iterators

In [17]:
train_iterator, val_iterator = BucketIterator.splits(
    (train_data, val_data), 
    batch_size = args.batch_size,
    sort_within_batch = False,
    device = args.device,
)

test_iterator = Iterator(test_data, batch_size=args.batch_size,
                        device=args.device, sort=False, sort_within_batch=False)

# Model

In [18]:
class SurnameClassifier(nn.Module):
    """ A 2-layer Multilayer Perceptron for classifying surnames """
    def __init__(self, input_dim, hidden_dim, output_dim):
        """
        Args:
            input_dim (int): the size of the input vectors
            hidden_dim (int): the output size of the first Linear layer
            output_dim (int): the output size of the second Linear layer
        """
        super(SurnameClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x_in, apply_softmax=False):
        """The forward pass of the classifier
        
        Args:
            x_in (torch.Tensor): an input data tensor. 
                x_in.shape should be (batch, input_dim)
            apply_softmax (bool): a flag for the softmax activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch, output_dim)
        """
        intermediate_vector = F.relu(self.fc1(x_in))
        prediction_vector = self.fc2(intermediate_vector)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector

# Helper utilities

In [19]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    """Handle the training state updates.

    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct/len(y_pred_indices) * 100

## Initialization

In [20]:
classifier = SurnameClassifier(input_dim=len(SURNAME.vocab.itos)-1, 
                               hidden_dim=args.hidden_dim, 
                               output_dim=len(NATIONALITY.vocab.itos))
print(classifier)

data_loaders = {"train": train_iterator, "val": val_iterator, "test": test_iterator}
datasets = {"train": train_data, "val": val_data, "test": test_data}

classifier = classifier.to(args.device)
weights = 1/torch.tensor([NATIONALITY.vocab.freqs[c] for c in NATIONALITY.vocab.itos], dtype=torch.float)
loss_func = nn.CrossEntropyLoss(weight=weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm_notebook(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

train_bar = tqdm_notebook(desc='split=train',
                          total=len(data_loaders["train"]), 
                          position=1, 
                          leave=True)

val_bar = tqdm_notebook(desc='split=val',
                        total=len(data_loaders["val"]), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(data_loaders["train"]):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()
            
            # step 2. compute the output
            
            x_in = oh_vectorizer.vectorize(batch_dict.surname)
            y_pred = classifier(x_in=x_in.float())

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict.nationality.long())
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict.nationality)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, 
                                  acc=running_acc, 
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset
        running_loss = 0.
        running_acc = 0.
        classifier.eval()
        
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(data_loaders["val"]):

                # compute the output
                x_in = oh_vectorizer.vectorize(batch_dict.surname)
                y_pred = classifier(x_in=x_in.float())

                # step 3. compute the loss
                loss = loss_func(y_pred, batch_dict.nationality.long())
                loss_t = loss.item()
                running_loss += (loss_t - running_loss) / (batch_index + 1)

                # compute the accuracy
                acc_t = compute_accuracy(y_pred, batch_dict.nationality)
                running_acc += (acc_t - running_acc) / (batch_index + 1)

                val_bar.set_postfix(loss=running_loss, 
                                    acc=running_acc, 
                                    epoch=epoch_index)
                val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
except KeyboardInterrupt:
    print("Exiting loop")

SurnameClassifier(
  (fc1): Linear(in_features=81, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=18, bias=True)
)


HBox(children=(FloatProgress(value=0.0, description='training routine', style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='split=train', max=121.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='split=val', max=26.0, style=ProgressStyle(description_wid…

In [21]:
# compute the loss & accuracy on the test set using the best available model

# classifier.load_state_dict(torch.load(train_state['model_filename']))
classifier = classifier.to(args.device)


running_loss = 0.
running_acc = 0.
classifier.eval()

with torch.no_grad():
    for batch_index, batch_dict in tqdm_notebook(enumerate(data_loaders["test"]), 
                                                total = len(data_loaders["test"])):
        # compute the output
        x_in = oh_vectorizer.vectorize(batch_dict.surname)
        y_pred = classifier(x_in=x_in.float())

        # compute the loss
        loss = loss_func(y_pred, batch_dict.nationality.long())
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict.nationality)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




In [22]:
print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))


Test loss: 1.716
Test Accuracy: 47.16


# Testing

In [23]:
def predict_nationality(surname, classifier, surname_field, nationality_field, oh_vectorizer):
    """Predict the nationality from a new surname
    
    Args:
        surname (str): the surname to classifier
        classifier (SurnameClassifer): an instance of the classifier
        vectorizer (SurnameVectorizer): the corresponding vectorizer
    Returns:
        a dictionary with the most likely nationality and its probability
    """
    surname = [char.lower() for char in surname]
    surname = surname_field.numericalize([surname])
    vectorized_surname = oh_vectorizer.vectorize(surname)
    result = classifier(vectorized_surname.float(), apply_softmax=True)

    probability_values, indices = result.max(dim=1)
    index = indices.item()
    

    predicted_nationality = nationality_field.vocab.itos[index]
    probability_value = probability_values.item()

    return {'nationality': predicted_nationality, 'probability': probability_value}

In [24]:
new_surname = "McMahan"
classifier = classifier.to("cpu")
prediction = predict_nationality(new_surname, classifier, SURNAME, NATIONALITY, oh_vectorizer)
print("{} -> {} (p={:0.2f})".format(new_surname,
                                    prediction['nationality'],
                                    prediction['probability']))

McMahan -> scottish (p=0.37)


# TopK Inference

In [25]:
def predict_topk_nationalities(surname, classifier, surname_field, nationality_field, oh_vectorizer, k):
    """Predict the nationality from a new surname
    
    Args:
        surname (str): the surname to classifier
        classifier (SurnameClassifer): an instance of the classifier
        vectorizer (SurnameVectorizer): the corresponding vectorizer
    Returns:
        a dictionary with the most likely nationality and its probability
    """
    surname = [char.lower() for char in surname]
    surname = surname_field.numericalize([surname])
    vectorized_surname = oh_vectorizer.vectorize(surname)
    result = classifier(vectorized_surname.float(), apply_softmax=True)
    probability_values, indices = result.topk(dim=1, k=k)
    probability_values = probability_values.view(-1)
    index = indices.view(-1)
    
    results = []
    for idx_, idx in enumerate(index):
        predicted_nationality = nationality_field.vocab.itos[idx.item()]
        probability_value = probability_values[idx_].item()
        results.append({'nationality': predicted_nationality, 'probability': probability_value})
    return results

In [27]:
new_surname = "McMahan"
k = 5
classifier = classifier.to("cpu")
predictions = predict_topk_nationalities(new_surname, classifier, SURNAME, NATIONALITY, oh_vectorizer, k)

print("Top {} predictions:".format(k))
print("===================")
for prediction in predictions:
    print("{} -> {} (p={:0.2f})".format(new_surname,
                                        prediction['nationality'],
                                        prediction['probability']))

Top 5 predictions:
McMahan -> scottish (p=0.37)
McMahan -> irish (p=0.17)
McMahan -> german (p=0.11)
McMahan -> czech (p=0.10)
McMahan -> english (p=0.08)
