In [185]:
from typing import Iterator, List, Dict
import numpy as np
import torch
import torch.optim as optim
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data import Instance
from allennlp.data.fields import TextField, LabelField, SequenceLabelField
from allennlp.data.iterators import BucketIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import Token
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training.trainer import Trainer
import glob
import unicodedata
import string
import random
from sklearn.model_selection import train_test_split

In [489]:
#model.py
class RnnLang(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self,
                token_characters: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> torch.Tensor:
        mask = get_text_field_mask(token_characters)
        # Forward pass
        embeddings = self.word_embeddings(token_characters)
        batch, a, b, emb = embeddings.shape
        embeddings = embeddings.view(batch, a*b, emb)
        print(embeddings.shape)
        encoder_out = self.encoder(embeddings, mask)

        logits = self.hidden2tag(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        print('l', labels)
        print(logits)
        if labels is not None:
            self.accuracy(logits, labels)
            output["loss"] = self.loss_function(logits, labels)

        return output

In [450]:
class Reader(DatasetReader):
    
    def __init__(self , token_indexers: Dict[str, TokenCharactersIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or{"token_characters": TokenCharactersIndexer()}
        self.all_letters = string.ascii_letters + " .,;'"
        self.category_lines = {}
        self.all_categories = []

    def text_to_instance(self, tokens: List[Token], categories: List[str]) -> Instance:
        token_field = TextField(tokens, self.token_indexers)
        fields = {"token_characters": token_field}
        label_field = SequenceLabelField(labels=categories, sequence_field = token_field)
        fields["labels"] = label_field

        return Instance(fields)
    def unicode_to_ascii(self, s: str) -> str:
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
            and c in self.all_letters
        )
    def readLines(self, filename:str) -> list:
        lines = open(filename).read().strip().split('\n')
        return [self.unicode_to_ascii(line) for line in lines]
    def _read(self, file_path: str) -> Iterator[Instance]:
        all_filenames = glob.glob(file_path)
        name_cats = []
        for filename in all_filenames:
            category = filename.split('/')[-1].split('.')[0]
            lines = self.readLines(filename)
            self.category_lines[category] = lines
            name_cats.extend([(word, category) for word in lines])
        random.shuffle(name_cats)
        for i in range(0, len(name_cats), 10):
            chunk = name_cats[i:i + 10]
            yield self.text_to_instance([Token(pair[0]) for pair in chunk], [pair[1] for pair in chunk])

In [451]:
reader = Reader()

In [452]:
data = reader.read('../data/names/*.txt')



0it [00:00, ?it/s][A[A

47it [00:00, 469.94it/s][A[A

2008it [00:00, 12726.32it/s][A[A

In [453]:
training_set, validation_set = train_test_split(data)

In [454]:
vocab = Vocabulary.from_instances(data)

03/30/2019 15:03:19 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.



  0%|          | 0/2008 [00:00<?, ?it/s][A[A[A


 46%|████▌     | 925/2008 [00:00<00:00, 9245.42it/s][A[A[A


 98%|█████████▊| 1964/2008 [00:00<00:00, 9560.30it/s][A[A[A


100%|██████████| 2008/2008 [00:00<00:00, 9762.37it/s][A[A[A

In [455]:
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('token_characters'),
                            embedding_dim=3)
word_embeddings = BasicTextFieldEmbedder({"token_characters": token_embedding})

In [456]:
encoder = PytorchSeq2VecWrapper(torch.nn.RNN(3, 3, batch_first=True))

In [490]:
model = RnnLang(word_embeddings, encoder, vocab)

In [491]:
optimizer = optim.SGD(model.parameters(), lr=0.1)
iterator = BucketIterator(batch_size=2, sorting_keys=[("token_characters", "num_tokens")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=training_set,
                  validation_dataset=validation_set,
                  patience=10,
                  num_epochs=10, cuda_device=-1)
trainer.train()

03/30/2019 15:16:00 - INFO - allennlp.training.trainer -   Beginning training.
03/30/2019 15:16:00 - INFO - allennlp.training.trainer -   Epoch 0/9
03/30/2019 15:16:00 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 403.084
03/30/2019 15:16:00 - INFO - allennlp.training.trainer -   Training














  0%|          | 0/753 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A

torch.Size([2, 110, 3])
l tensor([[11,  0,  5,  2,  1,  3,  2,  1,  0, 10],
        [ 0,  3,  0,  1,  0,  5,  4,  5,  5,  2]])
tensor([[ 0.0823, -0.5347, -0.6219, -0.5650,  0.8452, -0.0363, -0.3801, -0.0315,
         -0.3240, -0.4200,  0.1876, -0.3078,  0.0930, -0.2659, -1.0331,  0.2120,
         -0.1832,  0.0153],
        [ 0.0503, -0.6052, -0.5426, -0.4576,  0.8204, -0.0660, -0.3833, -0.1063,
         -0.2593, -0.4059,  0.2308, -0.2768,  0.0596, -0.2202, -0.9736,  0.1510,
         -0.2464,  0.0137]], grad_fn=<AddmmBackward>)


ConfigurationError: 'gold_labels must have dimension == predictions.size() - 1 but found tensor of shape: torch.Size([2, 18])'