In [19]:
from typing import Iterator, List, Dict
import numpy as np
import torch
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data import Instance
from allennlp.data.fields import TextField, LabelField, SequenceLabelField
from allennlp.data.iterators import BucketIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import Token
from allennlp.modules.token_embedders.token_characters_encoder import TokenCharactersEncoder
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training.trainer import Trainer
from allennlp.predictors import Predictor
from allennlp.common.util import JsonDict
import glob
import unicodedata
import string
import random
from sklearn.model_selection import train_test_split

In [2]:
#model.py
class RnnLang(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                token_characters: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> torch.Tensor:
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings({**tokens, **token_characters})

        encoder_out = self.encoder(embeddings, mask)

        logits = self.hidden2tag(encoder_out)
        output = {"logits": logits}
        if labels is not None:
            self.accuracy(logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(logits, labels, mask)

        return output

In [41]:
class Reader(DatasetReader):
    
    def __init__(self , token_indexers: Dict[str, SingleIdTokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or{"tokens": SingleIdTokenIndexer()}
        self.token_character_indexers = {"token_characters": TokenCharactersIndexer()}
        self.all_letters = string.ascii_letters + " .,;'"
        self.category_lines = {}
        self.all_categories = []

    def text_to_instance(self, names: List[str], categories: List[str] = None) -> Instance:
        tokens = [Token(name) for name in names]
        token_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": token_field}
        token_character_field = TextField(tokens, self.token_character_indexers)
        fields["token_characters"] = token_character_field
        if categories != None:
            label_field = SequenceLabelField(labels=categories, sequence_field = token_field)
            fields["labels"] = label_field

        return Instance(fields)
    def unicode_to_ascii(self, s: str) -> str:
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
            and c in self.all_letters
        )
    def readLines(self, filename:str) -> list:
        lines = open(filename).read().strip().split('\n')
        return [self.unicode_to_ascii(line) for line in lines]
    def _read(self, file_path: str) -> Iterator[Instance]:
        all_filenames = glob.glob(file_path)
        name_cats = []
        for filename in all_filenames:
            category = filename.split('/')[-1].split('.')[0]
            lines = self.readLines(filename)
            self.category_lines[category] = lines
            name_cats.extend([(word, category) for word in lines])
        random.shuffle(name_cats)
        for i in range(0, len(name_cats), 10):
            chunk = name_cats[i:i + 10]
            yield self.text_to_instance([pair[0] for pair in chunk], [pair[1] for pair in chunk])

In [42]:
reader = Reader()

In [5]:
data = reader.read('../data/names/*.txt')

2008it [00:00, 16074.30it/s]


In [6]:
training_set, validation_set = train_test_split(data)

In [7]:
vocab = Vocabulary.from_instances(data)

03/30/2019 16:23:29 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.
100%|██████████| 2008/2008 [00:00<00:00, 6382.49it/s]


In [8]:
vocab

Vocabulary with namespaces:  tokens, Size: 17424 || token_characters, Size: 57 || labels, Size: 18 || Non Padded Namespaces: {'*labels', '*tags'}

In [9]:
WORD_EMBEDDING_DIM = 3
CHAR_EMBEDDING_DIM = 3
HIDDEN_DIM =6
EMBEDDING_DIM = WORD_EMBEDDING_DIM + CHAR_EMBEDDING_DIM

In [11]:
char_encoder = PytorchSeq2VecWrapper(torch.nn.RNN(CHAR_EMBEDDING_DIM, CHAR_EMBEDDING_DIM, batch_first=True))
token_char_embedding = Embedding(num_embeddings=vocab.get_vocab_size('token_characters'),
                            embedding_dim=WORD_EMBEDDING_DIM)
char_embeddings = TokenCharactersEncoder(token_char_embedding, char_encoder)

In [12]:
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=WORD_EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding, "token_characters": char_embeddings})

In [13]:
encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

In [14]:
model = RnnLang(word_embeddings, encoder, vocab)

In [73]:
optimizer = optim.SGD(model.parameters(), lr=0.1)
iterator = BucketIterator(batch_size=2, sorting_keys=[("tokens", "num_tokens"), ("token_characters", "num_token_characters")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=training_set,
                  validation_dataset=validation_set,
                  patience=10,
                  num_epochs=15, cuda_device=-1)
trainer.train()

03/30/2019 16:39:50 - INFO - allennlp.training.trainer -   Beginning training.
03/30/2019 16:39:50 - INFO - allennlp.training.trainer -   Epoch 0/14
03/30/2019 16:39:50 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 309.576
03/30/2019 16:39:50 - INFO - allennlp.training.trainer -   Training
loss: 1.7559 ||: 100%|██████████| 753/753 [00:05<00:00, 140.19it/s]
03/30/2019 16:39:55 - INFO - allennlp.training.trainer -   Validating
loss: 1.5997 ||: 100%|██████████| 251/251 [00:00<00:00, 408.21it/s]
03/30/2019 16:39:56 - INFO - allennlp.training.trainer -                     Training |  Validation
03/30/2019 16:39:56 - INFO - allennlp.training.trainer -   loss          |     1.756  |     1.600
03/30/2019 16:39:56 - INFO - allennlp.training.trainer -   cpu_memory_MB |   309.576  |       N/A
03/30/2019 16:39:56 - INFO - allennlp.training.trainer -   Epoch duration: 00:00:06
03/30/2019 16:39:56 - INFO - allennlp.training.trainer -   Estimated training time remaining: 0:01:24
03

03/30/2019 16:40:48 - INFO - allennlp.training.trainer -   Estimated training time remaining: 0:00:39
03/30/2019 16:40:48 - INFO - allennlp.training.trainer -   Epoch 9/14
03/30/2019 16:40:48 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 309.772
03/30/2019 16:40:48 - INFO - allennlp.training.trainer -   Training
loss: 1.0782 ||: 100%|██████████| 753/753 [00:05<00:00, 136.60it/s]
03/30/2019 16:40:54 - INFO - allennlp.training.trainer -   Validating
loss: 1.1071 ||: 100%|██████████| 251/251 [00:00<00:00, 346.59it/s]
03/30/2019 16:40:55 - INFO - allennlp.training.trainer -                     Training |  Validation
03/30/2019 16:40:55 - INFO - allennlp.training.trainer -   loss          |     1.078  |     1.107
03/30/2019 16:40:55 - INFO - allennlp.training.trainer -   cpu_memory_MB |   309.772  |       N/A
03/30/2019 16:40:55 - INFO - allennlp.training.trainer -   Epoch duration: 00:00:06
03/30/2019 16:40:55 - INFO - allennlp.training.trainer -   Estimated training tim

{'peak_cpu_memory_MB': 309.776,
 'training_duration': '00:01:39',
 'training_start_epoch': 0,
 'training_epochs': 14,
 'epoch': 14,
 'training_loss': 0.7357016288703815,
 'training_cpu_memory_MB': 309.776,
 'validation_loss': 1.1203631451167908,
 'best_epoch': 10,
 'best_validation_loss': 1.0736781919145013}

In [69]:
class LngPredictor(Predictor):
    
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        instance = self._dataset_reader.text_to_instance(inputs)
        output_dict = self.predict_instance(instance)
        tag_ids = np.argmax(output_dict['logits'], axis=-1)
        return [self._model.vocab.get_token_from_index(i, 'labels') for i in tag_ids]

In [74]:
predictor = LngPredictor(model, reader)

In [75]:
prediction = predictor.predict_json(["Ivanov", "Smith"])

In [76]:
prediction

['Russian', 'German']