In [1]:
from argparse import Namespace
from collections import Counter
import json
import os
import string

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

## Dataset class

In [2]:
class SurnameDataset(Dataset):
    """Map-style dataset over a surname table partitioned into splits.

    The table is sliced once by its ``split`` column into 'train', 'valid'
    and 'test' frames. One split at a time is "active" (see ``set_split``);
    ``len()`` and indexing operate on the active split only.
    """

    def __init__(self,surname_df,surname_voectorizer):
        """
        Args:
            surname_df (pandas.DataFrame): full table; must have a ``split``
                column holding 'train', 'valid' or 'test'.
            surname_voectorizer: vectorizer object used by ``__getitem__``
                and ``save_vectorizer``.
        """
        self.df = surname_df
        self.vectorizer = surname_voectorizer

        self.train_df = self.df[self.df.split == 'train']
        self.train_size = len(self.train_df)

        self.valid_df = self.df[self.df.split == 'valid']
        self.valid_size = len(self.valid_df)

        self.test_df = self.df[self.df.split == 'test']
        self.test_size = len(self.test_df)

        self.lookup_dict = {'train': (self.train_df, self.train_size),
                             'valid': (self.valid_df, self.valid_size),
                             'test': (self.test_df, self.test_size)}

        # Select a default split so len() and indexing work on a fresh instance.
        self.set_split('train')

    @classmethod
    def load_data_build_vectorizer(cls,surname_path,vectorizer_path):
        """Read the CSV at ``surname_path`` and load a saved vectorizer.

        Args:
            surname_path (str): path of the CSV file passed to ``pd.read_csv``.
            vectorizer_path (str): path of the JSON file holding the
                serialized vectorizer.

        Returns:
            SurnameDataset: a new dataset instance.
        """
        surname_df = pd.read_csv(surname_path)
        vectorizer = cls.load_vectorizer_only(vectorizer_path)
        return cls(surname_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_path):
        """Deserialize a ``SurnameVectorizer`` from the JSON file at ``vectorizer_path``."""
        with open(vectorizer_path) as fp:
            return SurnameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_path):
        """Write the vectorizer's ``to_serializable()`` output as JSON to ``vectorizer_path``."""
        with open(vectorizer_path, "w") as fp:
            json.dump(self.vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        """Return the vectorizer supplied at construction time."""
        return self.vectorizer

    def set_split(self, split="train"):
        """Make ``split`` the active partition.

        Args:
            split (str): one of 'train', 'valid', 'test'.

        Raises:
            KeyError: if ``split`` is not one of the known partitions.
        """
        self._target_split = split
        self._target_df, self._target_size = self.lookup_dict[split]

    def __len__(self):
        """Number of rows in the active split."""
        return self._target_size

    def __getitem__(self, index):
        """Return the vectorized example at position ``index`` of the active split.

        Returns:
            dict: ``{'x_surname': <vector>, 'y_nationality': <index>}``.
        """
        row = self._target_df.iloc[index]

        # NOTE(review): the column names follow SurnameVectorizer.from_dataframe
        # in this notebook ('surname' / 'nationality'); confirm against the CSV.
        # The method/attribute names below are the ones SurnameVectorizer and
        # Vocabulary define in this notebook.
        surname_vector = self.vectorizer.vetorize(row.surname)
        nationality_index = self.vectorizer.countryvocab.look_up_token(row.nationality)
        return {'x_surname': surname_vector,
                'y_nationality': nationality_index}

    def get_num_batches(self, batch_size):
        """Number of full batches of size ``batch_size`` in the active split."""
        return len(self) // batch_size
# The Dataset class is now complete.

In [3]:

# using the dataloader to get the batch data.
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """Yield batches from ``dataset`` with every tensor moved to ``device``.

    A ``DataLoader`` is built over ``dataset`` with the given ``batch_size``,
    ``shuffle`` and ``drop_last`` settings. Each batch it produces is expected
    to be a dict; a new dict with the same keys is yielded whose values are
    the result of calling ``.to(device)`` on the originals.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {field: values.to(device) for field, values in batch.items()}

# Vocabulary

In [4]:
# apparently this is the most important one.
class Vocabulary(object): # vocabulary should include all data we have both testing and training
    """Bidirectional mapping between tokens and integer indices.

    New tokens receive the next free index. When ``add_UNK`` is true, an
    unknown-token entry is registered at construction and its index is
    returned by ``look_up_token`` for any token that was never added.
    """

    def __init__(self,token2idx = None, add_UNK = True, UNK = '<UNK>'):
        """
        Args:
            token2idx (dict or None): existing token -> index mapping; an
                empty mapping is created when None.
            add_UNK (bool): whether to register the ``UNK`` token.
            UNK (str): the token that stands in for unknown tokens.
        """
        if token2idx is None:
            token2idx = {}

        self.token2idx = token2idx
        self.idx2token = {idx: token for token, idx in self.token2idx.items()}

        self.add_UNK = add_UNK
        self.UNK = UNK

        # -1 marks "no UNK token registered"; real indices are >= 0.
        self.UNK_index = -1

        if add_UNK:
            self.UNK_index = self.add_token(UNK)

    def serialized(self):
        """Return a dict from which ``from_serialied`` can rebuild this vocabulary."""
        return {'token2idx' : self.token2idx,
               'add_UNK': self.add_UNK,
               'UNK':self.UNK}

    @classmethod
    def from_serialied(cls,content):
        """Build a vocabulary from the dict produced by ``serialized``."""
        return cls(**content)

    def add_token(self,token):
        """Register ``token`` if it is new and return its index.

        Returns:
            int: the index of ``token`` (existing or newly assigned).
        """
        try:
            idx = self.token2idx[token]
        except KeyError:
            idx = len(self.idx2token)
            self.token2idx[token] = idx
            self.idx2token[idx] = token
        return idx

    def add_manytoken(self,tokens):
        """Add every token in ``tokens``; return the list of their indices."""
        return [self.add_token(t) for t in tokens]

    def look_up_token(self,token):
        """Return the index of ``token``.

        Falls back to the UNK index when a UNK token is registered;
        otherwise an unseen token raises ``KeyError``.
        """
        if self.UNK_index >= 0: # a UNK token is registered: unseen tokens map to it
            return self.token2idx.get(token,self.UNK_index)
        else: # no UNK token: look the token up directly
            return self.token2idx[token]

    def look_up_index(self,idx):
        """Return the token stored at ``idx``.

        Raises:
            KeyError: if ``idx`` is not in the vocabulary.
        """
        if idx not in self.idx2token:
            raise KeyError('There is no such (%s) in the Vocabulary' % (idx,))
        return self.idx2token[idx]

    # Alternate spellings used by other classes in this notebook.
    lookup_token = look_up_token
    lookup_index = look_up_index

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self.token2idx)

# Vectorizer

In [5]:
class SurnameVectorizer(object):
    """Pairs a character vocabulary with a nationality vocabulary and turns
    surnames into fixed-length vectors."""

    def __init__(self,names_vocab,namescountry_vocab):
        """
        Args:
            names_vocab: vocabulary of surname characters.
            namescountry_vocab: vocabulary of nationality labels.
        """
        self.namesvocab = names_vocab
        self.countryvocab = namescountry_vocab

    @property
    def surname_vocab(self):
        """Alias of ``namesvocab`` (the spelling used by the serialized keys)."""
        return self.namesvocab

    @property
    def nationality_vocab(self):
        """Alias of ``countryvocab`` (the spelling used by the serialized keys)."""
        return self.countryvocab

    def vetorize(self,name):
        """Encode ``name`` as a 1-D float32 vector of vocabulary length.

        The position of every character present in ``name`` is set to 1;
        repeated characters and character order are not represented.

        Args:
            name (str): the surname to encode.

        Returns:
            numpy.ndarray: vector of shape ``(len(self.namesvocab),)``.
        """
        vocab = self.namesvocab
        one_hot = np.zeros(len(vocab), dtype=np.float32) # 1 D vector
        for token in name:
            one_hot[vocab.look_up_token(token)] = 1

        return one_hot

    # Conventional spelling; same method as ``vetorize``.
    vectorize = vetorize

    @classmethod
    def from_dataframe(cls, surname_df): # getting the vocab
        """Build both vocabularies from the rows of ``surname_df``.

        Reads the ``surname`` column character by character and the
        ``nationality`` column as whole labels. The character vocabulary
        uses "@" as its unknown token; the nationality vocabulary has none.
        """
        surname_vocab = Vocabulary(UNK="@")
        nationality_vocab = Vocabulary(add_UNK=False)

        for index, row in surname_df.iterrows():
            for letter in row.surname:
                surname_vocab.add_token(letter)
            nationality_vocab.add_token(row.nationality)

        return cls(surname_vocab, nationality_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vectorizer from the dict produced by ``to_serializable``."""
        surname_vocab = Vocabulary.from_serialied(contents['surname_vocab'])
        nationality_vocab =  Vocabulary.from_serialied(contents['nationality_vocab'])
        # Passed positionally: __init__ does not accept these as keyword names.
        return cls(surname_vocab, nationality_vocab)

    def to_serializable(self):
        """Return a dict holding the serialized form of both vocabularies."""
        return {'surname_vocab': self.namesvocab.serialized(),
                'nationality_vocab': self.countryvocab.serialized()}

# Modeling

In [6]:
class SurnameClassifier(nn.Module):
    """Two-layer perceptron: linear -> tanh -> linear, with optional softmax."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """
        Args:
            input_dim (int): size of each input vector.
            hidden_dim (int): output size of the first linear layer.
            output_dim (int): output size of the second linear layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x, apply_softmax=False):
        """Run the network on ``x``.

        Args:
            x (torch.Tensor): input batch fed to the first linear layer.
            apply_softmax (bool): when true, softmax over dim 1 is applied
                to the second layer's output before returning.

        Returns:
            torch.Tensor: output of the second layer, optionally softmaxed.
        """
        hidden = F.tanh(self.fc1(x))
        scores = self.fc2(hidden)

        if not apply_softmax:
            return scores
        return F.softmax(scores, dim=1)

In [7]:
def make_train_state(args):
    """Create the initial training-state dictionary.

    Reads ``args.learning_rate`` and ``args.model_state_file``; every other
    entry starts at a fixed initial value (empty history lists, -1 for the
    test metrics, 1e8 as the best validation value so far).
    """
    return dict(
        stop_early=False,
        early_stopping_step=0,
        early_stopping_best_val=1e8,
        learning_rate=args.learning_rate,
        epoch_index=0,
        train_loss=[],
        train_acc=[],
        val_loss=[],
        val_acc=[],
        test_loss=-1,
        test_acc=-1,
        model_filename=args.model_state_file,
    )

def update_train_state(args,model,train_state):
    """Checkpoint the model and update the early-stopping bookkeeping.

    On epoch 0 the model is always saved. On later epochs the latest
    validation loss is compared with the best one recorded so far: a worse
    loss advances the patience counter; otherwise the model is saved, the
    best value is updated and the counter is reset.

    Args:
        args: object providing ``early_stopping_criteria`` (the number of
            consecutive non-improving epochs that triggers a stop).
        model (torch.nn.Module): model whose ``state_dict`` is saved.
        train_state (dict): state dict as built by ``make_train_state``;
            modified in place.

    Returns:
        dict: the same ``train_state`` object.
    """
    epoch_index = train_state['epoch_index']

    if epoch_index == 0:
        torch.save(model.state_dict(),train_state['model_filename'])
        train_state['stop_early'] = False
        # Seed the best value from the first epoch when a loss is available,
        # so later epochs are compared with a real measurement.
        if train_state['val_loss']:
            train_state['early_stopping_best_val'] = train_state['val_loss'][-1]

    elif epoch_index >= 1:
        val_loss_t = train_state['val_loss'][-1]

        if val_loss_t > train_state['early_stopping_best_val']:
            train_state['early_stopping_step'] += 1

        else:
            torch.save(model.state_dict(), train_state['model_filename'])
            # Record the new best; without this the comparison above would
            # always be against the initial sentinel.
            train_state['early_stopping_best_val'] = val_loss_t
            train_state['early_stopping_step'] = 0

        train_state['stop_early'] = train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    """Return the percentage of rows whose highest-scoring column equals the target.

    Args:
        y_pred (torch.Tensor): scores; the maximum is taken along dim 1.
        y_target (torch.Tensor): expected class index per row.

    Returns:
        float: accuracy on a 0-100 scale.
    """
    predicted_classes = y_pred.max(dim=1)[1]
    hits = predicted_classes.eq(y_target).sum().item()
    total = len(predicted_classes)
    return hits / total * 100
    

In [None]:
## Natural Language Processing with PyTorch is not worth buying!