In [37]:
from __future__ import unicode_literals, print_function, division
# from io import open

import unicodedata
from pathlib import Path
from random import randint
from string import ascii_letters

import torch
print(f"PyTorch version: {torch.__version__}")

PyTorch version: 1.2.0


In [27]:
# Alphabet for the character encoding: every ASCII letter followed by a space
# and a few punctuation characters.
# NOTE(review): the upstream PyTorch tutorial uses " .,;'" here (comma, no
# colon) — confirm that ':' instead of ',' is intentional.
all_letters: str = "".join((ascii_letters, " .;:'"))
n_letters: int = len(all_letters)  # number of distinct characters in the alphabet
print(n_letters)

57


In [3]:
def find_files(glob_pattern):
    """Return the paths under the current directory that match ``glob_pattern``.

    Args:
        glob_pattern: A ``pathlib`` glob pattern, resolved relative to the
            current working directory (``Path()``).

    Returns:
        A list of ``Path`` objects in sorted order. Sorting makes the result
        independent of the order in which the filesystem lists entries, so
        anything derived from it (e.g. category indices) is stable between
        runs and machines.
    """
    return sorted(Path().glob(glob_pattern))


# List every data file next to its name without the extension. ``Path.stem``
# removes only the final suffix, whereas slicing at the first occurrence of the
# suffix text can cut a name short (e.g. "a.txt.txt").
for p in find_files("**/data/names/*.txt"):
    print(p.name, p.stem)

Arabic.txt Arabic
Chinese.txt Chinese
Czech.txt Czech
Dutch.txt Dutch
English.txt English
French.txt French
German.txt German
Greek.txt Greek
Irish.txt Irish
Italian.txt Italian
Japanese.txt Japanese
Korean.txt Korean
Polish.txt Polish
Portuguese.txt Portuguese
Russian.txt Russian
Scottish.txt Scottish
Spanish.txt Spanish
Vietnamese.txt Vietnamese


In [4]:
def unicode_to_ascii(s):
    """Strip accents from ``s`` and drop every character outside ``all_letters``.

    The string is NFD-normalized, which splits an accented letter into its
    base letter followed by combining marks. Combining marks (Unicode category
    ``Mn``) are discarded, as is any remaining character that does not occur
    in ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize("NFD", s):
        if unicodedata.category(ch) == "Mn":
            continue  # combining accent: the base letter was already handled
        if ch in all_letters:
            kept.append(ch)
    return "".join(kept)


print(unicode_to_ascii('Ślusàrski'))

Slusarski


In [15]:
# Map each category (a data file's name without its extension) to the list of
# ASCII-normalized names read from that file.
cat_lines = {}

fp_gen = find_files("**/data/names/*.txt")
# Iterate in sorted order so that category indices do not depend on the order
# in which the filesystem happens to list the files.
for p in sorted(fp_gen):
    with p.open(encoding="utf-8") as f:
        lines = f.read().strip().split("\n")
    # Path.stem removes only the final suffix; slicing the name at the first
    # occurrence of the suffix text would truncate names such as "a.txt.txt".
    _cat = p.stem
    cat_lines[_cat] = [unicode_to_ascii(raw_name) for raw_name in lines]

# Fail with an explicit message when no data was found, rather than with a
# KeyError on the sample lookup further down.
if not cat_lines:
    raise FileNotFoundError(
        "No files matched '**/data/names/*.txt' below the current working directory"
    )

# Categories to list
all_cats = list(cat_lines.keys())
n_cats = len(all_cats)

print(f"Category count: {len(cat_lines)}")
print(cat_lines["Italian"][:5])

Category count: 18
['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']


### Turn names into Tensors

In [29]:
def letter_to_index(Char: str) -> int:
    """Return the position of the single character ``Char`` in ``all_letters``.

    Returns:
        The zero-based index, or -1 when ``Char`` is not exactly one character
        or does not occur in ``all_letters``. (A bare ``str.find`` would map
        the empty string and multi-character substrings to a real index.)
    """
    if len(Char) != 1:
        return -1
    return all_letters.find(Char)


def letter_to_tensor(Char: str) -> torch.Tensor:
    """One-hot encode one character as a ``(1, len(all_letters))`` tensor.

    A ``Char`` that ``letter_to_index`` does not recognise yields an all-zero
    row. Writing to index -1 instead would silently mark the *last* letter of
    the alphabet, because negative indices wrap around.
    """
    encoded = torch.zeros(1, len(all_letters))
    index = letter_to_index(Char)
    if index >= 0:
        encoded[0][index] = 1
    return encoded


def line_to_tensor(Line: str) -> torch.Tensor:
    """One-hot encode ``Line`` as a ``(len(Line), 1, len(all_letters))`` tensor.

    Slice ``i`` along the first axis encodes ``Line[i]``. Characters that
    ``letter_to_index`` does not recognise are left as all-zero rows instead
    of being aliased to the last letter through index -1.
    """
    encoded = torch.zeros(len(Line), 1, len(all_letters))
    for position, char in enumerate(Line):
        index = letter_to_index(char)
        if index >= 0:
            encoded[position][0][index] = 1
    return encoded


# Tests
print(letter_to_tensor("J"))
print(line_to_tensor("Jones").size())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
torch.Size([5, 1, 57])


### Create Recurrent Neural Network (RNN)

In [30]:
nn = torch.nn


class RNN(nn.Module):
    """Simple recurrent neural network (RNN).

    Model notes:
        * Two linear layers, each fed the concatenation of the current input
          and the previous hidden state: ``i2h`` produces the next hidden
          state and ``i2b`` produces the output scores.
        * Two states: input and hidden.
        * LogSoftmax is applied to the output scores.

    Attributes:
        input_size: Width of one input row.
        hidden_size: Width of the hidden state.
        output_size: Number of output scores.
    """

    # No ``__slots__`` here: ``nn.Module`` instances already carry a
    # ``__dict__``, so declaring slots does not restrict attributes; the sizes
    # are stored as ordinary attributes instead.

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers and the output LogSoftmax."""
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        # Submodule names are kept as-is because they become state_dict keys.
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2b = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step.

        Args:
            input: Tensor concatenated with ``hidden`` along dimension 1.
            hidden: Hidden state from the previous step.

        Returns:
            ``(output, hidden)``: the LogSoftmax of the ``i2b`` scores and the
            new hidden state produced by ``i2h``.
        """
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.softmax(self.i2b(combined))
        return output, hidden

    def initHidden(self):
        """Return a zero hidden state of shape ``(1, hidden_size)``.

        The tensor is created on the same device as the layer weights so that
        it can be passed to ``forward`` after the module has been moved.
        """
        return torch.zeros(1, self.hidden_size, device=self.i2h.weight.device)
    


In [33]:
# Init RNN
n_hidden: int = 128
rnn: RNN = RNN(n_letters, n_hidden, n_cats)

# Encode the whole name with line_to_tensor. letter_to_tensor expects a single
# character: given "Albert" its index lookup returns -1, which one-hot encodes
# the last letter of the alphabet instead of anything from the name.
input_rnn = line_to_tensor("Albert")
hidden_rnn = torch.zeros(1, n_hidden)

# Feed only the first letter. Each slice of input_rnn already has shape
# (1, n_letters), i.e. a batch of one, so no unsqueeze(0) is needed.
output, next_hidden = rnn(input_rnn[0], hidden_rnn)
print(output)


tensor([[-2.8838, -2.8746, -2.9002, -2.9509, -2.8458, -2.8187, -2.8447, -3.0177,
         -2.8923, -2.8417, -2.9362, -2.8376, -2.9969, -2.8647, -2.8154, -2.8135,
         -2.9203, -3.0089]], grad_fn=<LogSoftmaxBackward>)


## Training the Model

### Preparation

In [36]:
def category_from_output(output: torch.Tensor, categories=None) -> tuple:
    """Return the best-scoring category for a network output.

    Args:
        output: Score tensor. ``topk(1)`` is taken along its last dimension
            and the first entry of the result is used, so this works for a
            1-D score vector or a batch containing a single row.
        categories: Optional sequence of labels indexed by score position.
            Defaults to the notebook-level ``all_cats``.

    Returns:
        ``(label, index)`` for the highest score.
    """
    if categories is None:
        categories = all_cats
    # Only the index of the top score is needed; its value is discarded.
    _, top_i = output.topk(1)
    cat_i = top_i[0].item()
    return categories[cat_i], cat_i

# Test: decode the `output` tensor produced by the RNN step above into a
# (category name, category index) pair. No training has happened yet.
print(category_from_output(output))

('Scottish', 15)


In [38]:
# Number of random training examples printed by the preview loop in this cell.
N_SAMPLES: int = 10

def random_choice(L: list):
    """Return one element of ``L``, selected by a random valid index.

    Raises:
        ValueError: If ``L`` is empty (raised by ``randint`` for an empty range).
    """
    last_index = len(L) - 1
    picked_index = randint(0, last_index)
    return L[picked_index]

def random_training_example() -> tuple:
    """Draw a random (category, name) pair and encode both as tensors.

    A category is drawn from ``all_cats`` first, then a name from that
    category's entry in ``cat_lines``.

    Returns:
        ``(category, name, category_tensor, name_tensor)`` where
        ``category_tensor`` is a one-element ``torch.long`` tensor holding the
        category's index in ``all_cats`` and ``name_tensor`` is the result of
        ``line_to_tensor(name)``.
    """
    category = random_choice(all_cats)
    name = random_choice(cat_lines[category])
    category_index = all_cats.index(category)
    target = torch.tensor([category_index], dtype=torch.long)
    return category, name, target, line_to_tensor(name)

# Preview: draw N_SAMPLES random examples and print the category and name of each.
rng = range(N_SAMPLES)
for i in rng:
    # The two tensors are unpacked but not printed; after the loop these
    # names hold the values of the last example drawn.
    category, line, cat_tensor, line_tensor = random_training_example()
    print(f"Category: {category}, Line: {line}")

Category: Czech, Line: Hrdy
Category: Irish, Line: Gerald
Category: Irish, Line: Dalach
Category: Russian, Line: Valters
Category: Scottish, Line: Munro
Category: Vietnamese, Line: Kim
Category: Vietnamese, Line: Banh
Category: Scottish, Line: Gray
Category: English, Line: Warner
Category: Korean, Line: Kang


## Training the Model

### Training Network

In [None]:
# Negative log-likelihood loss. The RNN's forward pass ends in LogSoftmax, so
# its output is already in log space.
criterion = nn.NLLLoss()
