In [2]:
import torch
import torchtext
import json
import os 
import re
import string

import pandas as pd
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn

from torchtext.data import (Dataset, Field, LabelField, 
                            BucketIterator, Iterator, Example) 
from tqdm.notebook import tqdm as tqdm_notebook
from argparse import Namespace
from torch.utils.tensorboard import SummaryWriter

os.environ['KMP_DUPLICATE_LIB_OK']='True'
writer = SummaryWriter()
%load_ext tensorboard

# Args Configurations

In [3]:
args = Namespace(
    # Data Path and Information
    surnames_csv="data/surnames/surnames_with_splits.csv",
    model_state_file="model.pth",
    save_dir="model_storage",
    # Model hyper parameters
    # Model hyper parameters
    hidden_dim=100,
    num_channels=256,
    # Training hyper parameters
    seed=1337,
    learning_rate=0.001,
    batch_size=128,
    num_epochs=100,
    early_stopping_criteria=5,
    dropout_p=0.1,
    # Runtime options
    catch_keyboard_interrupt=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
    cuda=False
)

## Global Settings

In [4]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

if args.expand_filepaths_to_save_dir:
    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

print("Using CUDA: {}".format(args.cuda))

args.device = torch.device("cuda" if args.cuda else "cpu")

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/model.pth
Using CUDA: False


# Load Dataset

In [5]:
surnames = pd.read_csv(args.surnames_csv)

In [6]:
surnames.head()

Unnamed: 0,surname,nationality,split
0,Lovell,English,train
1,Nader,Arabic,train
2,Avertchenko,Russian,train
3,Yim,Korean,train
4,Balanev,Russian,train


# TorchText Fields

In [8]:
def tokenizer(text):
    text = [char for char in text]
    return text

In [9]:
SURNAME = Field(sequential=True, tokenize=tokenizer, lower=False)
NATIONALITY = LabelField(dtype=torch.float32, lower=True)

# TorchText Dataset

In [10]:
class SurnamesDataset(Dataset):
    def __init__(self, surnames_df, fields):
        examples = []
        for idx, row in tqdm_notebook(surnames_df.iterrows(), 
                                      total=surnames_df.shape[0]):
            surname = row.surname
            nationality = row.nationality
            examples.append(Example.fromlist([surname, nationality], fields))
        super().__init__(examples, fields)
            
    @staticmethod
    def sort_key(data):
        return len(data.surname)
    
    @classmethod
    def splits(cls, fields, surnames_df):
        train_data = cls(surnames_df[surnames_df.split == "train"], fields)
        val_data = cls(surnames_df[surnames_df.split == "val"], fields)
        test_data = cls(surnames_df[surnames_df.split == "test"], fields)
        return train_data, val_data, test_data        

In [11]:
fields = [('surname', SURNAME), ('nationality', NATIONALITY)]
train_data, val_data, test_data = SurnamesDataset.splits(fields, surnames)

HBox(children=(FloatProgress(value=0.0, max=7685.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1647.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1648.0), HTML(value='')))




In [11]:
vars(train_data[0])

{'surname': ['L', 'o', 'v', 'e', 'l', 'l'], 'nationality': 'english'}

## Build Vocab

In [12]:
SURNAME.build_vocab(train_data, min_freq=0)
NATIONALITY.build_vocab(train_data)

In [13]:
vars(SURNAME.vocab).keys()

dict_keys(['freqs', 'itos', 'unk_index', 'stoi', 'vectors'])

In [14]:
SURNAME.vocab.itos[:2]

['<unk>', '<pad>']

In [15]:
len(SURNAME.vocab.itos)

84

In [16]:
vars(NATIONALITY.vocab)

{'freqs': Counter({'english': 2080,
          'arabic': 1122,
          'russian': 1661,
          'korean': 54,
          'japanese': 542,
          'spanish': 181,
          'german': 403,
          'italian': 420,
          'czech': 290,
          'irish': 128,
          'french': 160,
          'greek': 109,
          'dutch': 165,
          'chinese': 154,
          'polish': 84,
          'scottish': 52,
          'portuguese': 39,
          'vietnamese': 41}),
 'itos': ['english',
  'russian',
  'arabic',
  'japanese',
  'italian',
  'german',
  'czech',
  'spanish',
  'dutch',
  'french',
  'chinese',
  'irish',
  'greek',
  'polish',
  'korean',
  'scottish',
  'vietnamese',
  'portuguese'],
 'unk_index': None,
 'stoi': defaultdict(None,
             {'english': 0,
              'russian': 1,
              'arabic': 2,
              'japanese': 3,
              'italian': 4,
              'german': 5,
              'czech': 6,
              'spanish': 7,
              'dutch':

# One Hot Vectorizer

In [141]:
class Vectorizer(object):
    def __init__(self, surname_field):
        self._surname_field = surname_field
        self.slicing = torch.tensor([i for i in range(len(self._surname_field.vocab.itos)) if i!=1])
        self.num_characters = len(self._surname_field.vocab.itos)
    
    def vectorize(self, batch_matrix):
        one_hot_sequence = F.one_hot(batch_matrix.T, self.num_characters)
        one_hot_sequence = one_hot_sequence[:, :, self.slicing]
        # return torch.transpose(one_hot_sequence,2,1)
        return (one_hot_sequence.sum(dim=1) > 0).long()

In [142]:
oh_vectorizer = Vectorizer(SURNAME)

In [167]:
for batch in test_iterator:
    a = oh_vectorizer.vectorize(batch.surname)[0]
    print(oh_vectorizer.vectorize(batch.surname))
    print(batch.surname[:,0])
    print(''.join([SURNAME.vocab.itos[i] for i, h in enumerate(a, 1) if h != 0]))
    print(''.join([SURNAME.vocab.itos[i] for i in batch.surname[:,0] if i!= 1]))
    d

tensor([[0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([33,  3, 11,  3,  7,  8,  1,  1])
erstP
Peters


NameError: name 'd' is not defined

# Define Iterators

In [98]:
train_iterator, val_iterator = BucketIterator.splits(
    (train_data, val_data), 
    batch_size = args.batch_size,
    sort_within_batch = True,
    device = args.device,
)

test_iterator = Iterator(test_data, batch_size=2,
                        device=args.device, sort=False, sort_within_batch=False)

# Model

In [22]:
class SurnameClassifier(nn.Module):
    def __init__(self, initial_num_channels, num_classes, num_channels):
        """
        Args:
            initial_num_channels (int): size of the incoming feature vector
            num_classes (int): size of the output prediction vector
            num_channels (int): constant channel size to use throughout network
        """
        super(SurnameClassifier, self).__init__()
        
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=initial_num_channels, 
                      out_channels=num_channels, kernel_size=3),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                      kernel_size=3, stride=2),
            nn.ELU(),
#             nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
#                       kernel_size=3, stride=2),
#             nn.ELU(),
#             nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
#                       kernel_size=3),
#             nn.ELU()
        )
        self.fc = nn.Linear(num_channels, num_classes)

    def forward(self, x_surname, apply_softmax=False):
        """The forward pass of the classifier
        
        Args:
            x_surname (torch.Tensor): an input data tensor. 
                x_surname.shape should be (batch, initial_num_channels, max_surname_length)
            apply_softmax (bool): a flag for the softmax activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch, num_classes)
        """
        features = self.convnet(x_surname).squeeze(dim=2)
        print(features.shape)
        prediction_vector = self.fc(features)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector


# Helper utilities

In [23]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    """Handle the training state updates.

    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct/len(y_pred_indices) * 100

## Initialization

In [37]:
classifier = SurnameClassifier(initial_num_channels=len(SURNAME.vocab.itos)-1, 
                               num_classes=len(NATIONALITY.vocab.itos),
                               num_channels=args.num_channels)
print(classifier)

SurnameClassifier(
  (convnet): Sequential(
    (0): Conv1d(83, 256, kernel_size=(3,), stride=(1,))
    (1): ELU(alpha=1.0)
    (2): Conv1d(256, 256, kernel_size=(3,), stride=(2,))
    (3): ELU(alpha=1.0)
  )
  (fc): Linear(in_features=256, out_features=18, bias=True)
)


In [38]:
def train(data_loaders, classifier, 
          optimizer, oh_vectorizer, loss_func, train_bar):
    """Train method
    """
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    
    for batch_index, batch_dict in enumerate(data_loaders["train"]):
        # the training routine is these 5 steps:

        # --------------------------------------
        # step 1. zero the gradients
        optimizer.zero_grad()

        # step 2. compute the output

        x_in = oh_vectorizer.vectorize(batch_dict.surname)
        y_pred = classifier(x_surname=x_in.float())

        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict.nationality.long())
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()
        # -----------------------------------------
        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict.nationality)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

        # update bar
        train_bar.set_postfix(loss=running_loss, 
                              acc=running_acc, 
                              epoch=epoch_index)
        train_bar.update()
        
    return running_loss, running_acc

In [39]:
def val(data_loaders, classifier, oh_vectorizer, loss_func, val_bar):
    """Validation Method
    """
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    with torch.no_grad():
        for batch_index, batch_dict in enumerate(data_loaders["val"]):

            # compute the output
            x_in = oh_vectorizer.vectorize(batch_dict.surname)
            y_pred = classifier(x_in=x_in.float())

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict.nationality.long())
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict.nationality)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            val_bar.set_postfix(loss=running_loss, 
                                acc=running_acc, 
                                epoch=epoch_index)
            val_bar.update()
        
    return running_loss, running_acc

In [40]:
data_loaders = {"train": train_iterator, "val": val_iterator, "test": test_iterator}
datasets = {"train": train_data, "val": val_data, "test": test_data}

classifier = classifier.to(args.device)
weights = 1/torch.tensor([NATIONALITY.vocab.freqs[c] for c in NATIONALITY.vocab.itos], 
                         dtype=torch.float)
loss_func = nn.CrossEntropyLoss(weight=weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm_notebook(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

train_bar = tqdm_notebook(desc='split=train',
                          total=len(data_loaders["train"]), 
                          position=1, 
                          leave=True)

val_bar = tqdm_notebook(desc='split=val',
                        total=len(data_loaders["val"]), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset        
        running_loss, running_acc = train(data_loaders, classifier, 
                                          optimizer, oh_vectorizer, 
                                          loss_func, train_bar)
        
        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset
        running_loss, running_acc = val(data_loaders, classifier, 
                                        oh_vectorizer, loss_func, 
                                        val_bar)

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

        if train_state['stop_early']:
            break

except KeyboardInterrupt:
    print("Exiting loop")

HBox(children=(FloatProgress(value=0.0, description='training routine', style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='split=train', max=61.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='split=val', max=13.0, style=ProgressStyle(description_wid…

torch.Size([128, 256, 2])


RuntimeError: size mismatch, m1: [32768 x 2], m2: [256 x 18] at ../aten/src/TH/generic/THTensorMath.cpp:41

In [98]:
# compute the loss & accuracy on the test set using the best available model
# classifier.load_state_dict(torch.load(train_state['model_filename']))
classifier = classifier.to(args.device)


running_loss = 0.
running_acc = 0.
classifier.eval()

with torch.no_grad():
    for batch_index, batch_dict in tqdm_notebook(enumerate(data_loaders["test"]), 
                                                total = len(data_loaders["test"])):
        # compute the output
        x_in = oh_vectorizer.vectorize(batch_dict.surname)
        y_pred = classifier(x_in=x_in.float())

        # compute the loss
        loss = loss_func(y_pred, batch_dict.nationality.long())
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict.nationality)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




TypeError: forward() got an unexpected keyword argument 'x_in'

In [None]:
print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))


# Testing

In [None]:
def predict_nationality(surname, classifier, surname_field, nationality_field, oh_vectorizer):
    """Predict the nationality from a new surname
    
    Args:
        surname (str): the surname to classifier
        classifier (SurnameClassifer): an instance of the classifier
        vectorizer (SurnameVectorizer): the corresponding vectorizer
    Returns:
        a dictionary with the most likely nationality and its probability
    """
    surname = [char.lower() for char in surname]
    surname = surname_field.numericalize([surname])
    vectorized_surname = oh_vectorizer.vectorize(surname)
    result = classifier(vectorized_surname.float(), apply_softmax=True)

    probability_values, indices = result.max(dim=1)
    index = indices.item()
    

    predicted_nationality = nationality_field.vocab.itos[index]
    probability_value = probability_values.item()

    return {'nationality': predicted_nationality, 'probability': probability_value}

In [None]:
new_surname = "McMahan"
classifier = classifier.to("cpu")
prediction = predict_nationality(new_surname, classifier, SURNAME, NATIONALITY, oh_vectorizer)
print("{} -> {} (p={:0.2f})".format(new_surname,
                                    prediction['nationality'],
                                    prediction['probability']))

# TopK Inference

In [None]:
def predict_topk_nationalities(surname, classifier, surname_field, nationality_field, oh_vectorizer, k):
    """Predict the nationality from a new surname
    
    Args:
        surname (str): the surname to classifier
        classifier (SurnameClassifer): an instance of the classifier
        vectorizer (SurnameVectorizer): the corresponding vectorizer
    Returns:
        a dictionary with the most likely nationality and its probability
    """
    surname = [char.lower() for char in surname]
    surname = surname_field.numericalize([surname])
    vectorized_surname = oh_vectorizer.vectorize(surname)
    result = classifier(vectorized_surname.float(), apply_softmax=True)
    probability_values, indices = result.topk(dim=1, k=k)
    probability_values = probability_values.view(-1)
    index = indices.view(-1)
    
    results = []
    for idx_, idx in enumerate(index):
        predicted_nationality = nationality_field.vocab.itos[idx.item()]
        probability_value = probability_values[idx_].item()
        results.append({'nationality': predicted_nationality, 'probability': probability_value})
    return results

In [None]:
new_surname = "McMahan"
k = 5
classifier = classifier.to("cpu")
predictions = predict_topk_nationalities(new_surname, classifier, SURNAME, NATIONALITY, oh_vectorizer, k)

print("Top {} predictions:".format(k))
print("===================")
for prediction in predictions:
    print("{} -> {} (p={:0.2f})".format(new_surname,
                                        prediction['nationality'],
                                        prediction['probability']))