In [1]:
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division

import time
import math
import numpy as np

import torch as t
import torch.nn as nn
from torch.autograd import Variable as V
import torch.utils.data as Data
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from pyfile.name_dataset import NameDataset


#### 1. load dataset

In [2]:
BATCH_SIZE = 256
train_dataset = NameDataset(is_train_set=True)
train_loader = Data.DataLoader(dataset=train_dataset,
                               batch_size=BATCH_SIZE,
                               shuffle=True)

test_dataset = NameDataset(is_train_set=False)
test_loader = Data.DataLoader(dataset=test_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)

N_COUNTRIES = len(train_dataset.get_countries())
print("Countries: ", N_COUNTRIES)


Countries:  18


In [3]:
print("train dataset lengths: ", len(train_dataset))
print("test dataset lengths: ", len(test_dataset))

train dataset lengths:  13374
test dataset lengths:  6700


In [17]:
# Some utility functions
def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 6
    return ("%dm %ds" % (m, s))


def create_variable(tensor):
    # Do cuda() before wraping with variable
    if t.cuda.is_available():
        return V(tensor.cuda())
    else:
        return V(tensor)

    
# pad sequences and sort the tensor
def pad_sequences(vectorized_seqs, seq_lengths, countries):
    seq_tensor = t.zeros((len(vectorized_seqs), seq_lengths.max())).long()
    
    for idx, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths)):
        seq_tensor[idx, :seq_len] = t.LongTensor(seq)
    
    # Sort tensors by their length
    seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
    seq_tensor = seq_tensor[perm_idx]
    
    # Also sort the target (countries) in the same order
    target = countries2tensor(countries)
    if len(target):
        target = target[perm_idx]
    
    # Return variables
    # DataParallel requires everything to be a Variable
    seq_tensor = create_variable(seq_tensor)
    seq_lengths = create_variable(seq_lengths)
    target = create_variable(target)
    return seq_tensor, seq_lengths, target

# Create necessary variables, lengths, and target
def make_variables(names, countries):
    sequence_and_length = [str2ascii_arr(name) for name in names]
    vectorized_seqs = [sl[0] for sl in sequence_and_length]
    seq_lengths = t.LongTensor([sl[1] for sl in sequence_and_length])
    return pad_sequences(vectorized_seqs, seq_lengths, countries)


def str2ascii_arr(msg):
    arr = [ord(c) for c in msg]
    return arr, len(arr)


def countries2tensor(countries):
    country_ids = [train_dataset.get_country_id(country) for country in countries]
    return t.LongTensor(country_ids)


#### 2. difine model

In [44]:
# Parameters and DataLoaders
HIDDEN_SIZE = 100
N_LAYERS = 2
N_EPOCHS = 200
N_CHARS = 128   # ASCII

# learning_rate
BASE_LR = 0.01

In [45]:
# model
class RNNClassifier(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size, num_layers=1, bidirectional=True):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.n_directions = int(bidirectional) + 1
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, 
                          hidden_size, 
                          num_layers, 
                          bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size, output_size)
    
    
    def forward(self, inputs, seq_lengths):
        # Note: we run this all at once(over the whole input sequence)
        # input shape: B * S(input_size)
        # transpose to make S(sequence) * B(batch)
        inputs = inputs.t()
        batch_size = inputs.size(1)
        
        # Make a hidden
        hidden = self._init_hidden(batch_size)
        
        # Embedding S * B -> S * B * I(embedding size)
        embedded = self.embedding(inputs)
        
        # Pack them up nicely
        gru_input = pack_padded_sequence(embedded, seq_lengths.data.cpu().numpy())
        
        # To compact weights again call flatten_parameters().
        self.gru.flatten_parameters()
        output, hidden = self.gru(gru_input, hidden)
        
        # Use the last layer output as FC's input
        # No need to unpack, since we are going to use hidden
        fc_output = self.fc(hidden[-1])
        return fc_output
    
    def _init_hidden(self, batch_size):
        hidden = t.zeros(self.num_layers * self.n_directions, batch_size, self.hidden_size)
        return create_variable(hidden)

#### 3. Train model

In [46]:
# classifier = RNNClassifier(input_size=N_CHARS, hidden_size=HIDDEN_SIZE, output_size=N_COUNTRIES)

# Train cycle
def train():
    total_loss = 0
    
    # [0, 29]: 0, [30, 59]: 1, [60, 89]: 2, [90, 100]: 3
    lr_decay_epoch = epoch // 40
    
    for group in optimizer.param_groups:
        group['lr'] = BASE_LR * (0.1 ** lr_decay_epoch)
    
    
    
    for i, (names, countries) in enumerate(train_loader, 1):
        # print("names: ", names)
        # print("countries: ", countries)
        
        inputs, seq_lengths, target = make_variables(names, countries)
        output = classifier(inputs, seq_lengths)
        
        loss = criterion(output, target)
        total_loss += loss.data[0]
        
        classifier.zero_grad()
        loss.backward()
        optimizer.step()
        
        if i % 10 == 0:
            print('[{}] Train Epoch: {} [{}/{}({:.0f})%]\tLoss: {:.2f}'.format(
                   time_since(start), 
                   epoch, i * len(names), len(train_loader.dataset),
                   100. * i * len(names) / len(train_loader.dataset),
                   total_loss / i * len(names)
                   ))
    return total_loss


In [47]:
# Testing cycle
def test(name=None):
    # Predict for a given name
    if name:
        inputs, seq_lengths, target = make_variables([name], [])
        output = classifier(inputs)
        
        pred = output.data.max(1, keepdim=True)[1]
        country_id = pred.cpu().numpy()[0][0]
        print(name, "is", train_dataset.get_country(country_id))
        return 
    
    print("evaluating trained model...")
    correct = 0
    train_data_size = len(test_loader.dataset)
    
    for names, countries in test_loader:
        inputs, seq_lengths, target = make_variables(names, countries)
        output = classifier(inputs, seq_lengths)
        
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()
        
    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
          correct, train_data_size, 100. * correct / train_data_size))

In [48]:
# optimizer
classifier = RNNClassifier(N_CHARS, HIDDEN_SIZE, N_COUNTRIES, N_LAYERS)
if t.cuda.device_count() > 1:
    print("Let's use", t.cuda.device_count(), "GPUs!")
    # dim = 0 [33, xxx] -> [11, ...], [11, ...], [11, ...] on 3 GPUs
    classifier = nn.DataParallel(classifier)

if t.cuda.is_available():
    classifier.cuda()

optimizer = t.optim.Adam(classifier.parameters(), lr=BASE_LR)
criterion = nn.CrossEntropyLoss()

start = time.time()
print("Training for %d epochs.." % N_EPOCHS)
for epoch in range(1, N_EPOCHS + 1):
    # Train cycle
    train()
    
    # Testing
    test()

Training for 200 epochs..
[0m 0s] Train Epoch: 1 [2560/13374(19)%]	Loss: 480.83
[0m 0s] Train Epoch: 1 [5120/13374(38)%]	Loss: 403.78
[0m 0s] Train Epoch: 1 [7680/13374(57)%]	Loss: 364.12
[0m 0s] Train Epoch: 1 [10240/13374(77)%]	Loss: 332.31
[0m 0s] Train Epoch: 1 [12800/13374(96)%]	Loss: 308.68
evaluating trained model...

Test set: Accuracy: 5090/6700 (76%)

[0m 1s] Train Epoch: 2 [2560/13374(19)%]	Loss: 180.37
[0m 1s] Train Epoch: 2 [5120/13374(38)%]	Loss: 180.90
[0m 1s] Train Epoch: 2 [7680/13374(57)%]	Loss: 176.99
[0m 1s] Train Epoch: 2 [10240/13374(77)%]	Loss: 171.58
[0m 1s] Train Epoch: 2 [12800/13374(96)%]	Loss: 169.21
evaluating trained model...

Test set: Accuracy: 5411/6700 (81%)

[0m 2s] Train Epoch: 3 [2560/13374(19)%]	Loss: 144.53
[0m 2s] Train Epoch: 3 [5120/13374(38)%]	Loss: 134.73
[0m 2s] Train Epoch: 3 [7680/13374(57)%]	Loss: 134.01
[0m 2s] Train Epoch: 3 [10240/13374(77)%]	Loss: 134.77
[0m 3s] Train Epoch: 3 [12800/13374(96)%]	Loss: 134.47
evaluating trained model..