In [5]:
from __future__ import unicode_literals, print_function, division
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField,LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy , Average
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor
from allennlp.data.iterators import BucketIterator, BasicIterator
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder,PytorchSeq2VecWrapper
from torch.nn import LogSoftmax
from torch.nn.modules import NLLLoss
from io import open
import glob
import os
torch.manual_seed(1)

<torch._C.Generator at 0x193d85df6b0>

In [6]:
class PosDatasetReader(DatasetReader):
    """
    DatasetReader for PoS tagging data, one word per line and its label 
    Doveyski , Russian.txt
    """
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    def text_to_instance(self, tokens: List[Token], label: str =None) -> Instance:
        word_field = TextField(tokens, self.token_indexers)
        fields = {"word": word_field}
 
        if label is None:
            return Instance(fields)
        
        
        label_field = LabelField(label=label)
        fields["label"] = label_field
            
        return Instance(fields)
       
            
    
        
    
    
    def findFiles(self,path): 
        return glob.glob(path)
    
    def _read(self, file_path: str) -> Iterator[Instance]:
            
        for filename in self.findFiles(file_path):
            
            with open(filename,encoding='utf-8') as f:
                for line in f:
                    word= line.strip().split('\n')
                    word=str(word[0])
                    yield self.text_to_instance([Token(ch) for ch in word], filename.split('\\')[1])    

In [7]:
reader = PosDatasetReader()
train_dataset = reader.read('names/*.txt')


0it [00:00, ?it/s]
3755it [00:00, 37276.39it/s]
20074it [00:00, 25549.29it/s]


In [8]:
train_dataset[3728].fields['word'].tokens , train_dataset[3728].fields['label'].label

([D, e, a, n, s], 'English.txt')

In [9]:
class WordClassifier(Model):
    def __init__(self,
                 char_embeddings: TextFieldEmbedder,
                 encoder:Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.char_embeddings = char_embeddings
        self.encoder = encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = Average()
        self.m = LogSoftmax()#I used that in order to escape from loss to blow up
        self.loss = NLLLoss()#I used this from tutorial RNN classifier
    def forward(self,
                word: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        mask = get_text_field_mask(word)
        embeddings = self.char_embeddings(word)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}
        if label is not None:
            output["loss"] = self.loss(self.m(tag_logits), label)
            pred = tag_logits.max(1)[1]#it is giving the maximum elements for each tensor inside pred and their indexes so we are taking indexes
            self.accuracy((torch.eq(pred,label).sum()).double()/len(label))
        return output
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

In [10]:
vocab = Vocabulary.from_instances(train_dataset)

100%|██████████| 20074/20074 [00:00<00:00, 134153.89it/s]


In [12]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
char_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = WordClassifier(char_embeddings, lstm, vocab)
if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1
optimizer = optim.SGD(model.parameters(), lr=0.1)
iterator = BucketIterator(batch_size=2, sorting_keys=[("word", "num_tokens")])
iterator.index_with(vocab)

In [13]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  patience=10,
                  num_epochs=5,
                  cuda_device=cuda_device)
trainer.train()

accuracy: 0.5952, loss: 1.3568 ||: 100%|██████████| 10037/10037 [01:08<00:00, 146.01it/s]
accuracy: 0.6800, loss: 1.0753 ||: 100%|██████████| 10037/10037 [01:06<00:00, 151.00it/s]
accuracy: 0.6992, loss: 1.0142 ||: 100%|██████████| 10037/10037 [01:04<00:00, 156.26it/s]
accuracy: 0.7043, loss: 0.9903 ||: 100%|██████████| 10037/10037 [01:06<00:00, 172.71it/s]
accuracy: 0.7104, loss: 0.9707 ||: 100%|██████████| 10037/10037 [01:04<00:00, 154.75it/s]


{'best_epoch': 4,
 'peak_cpu_memory_MB': 0,
 'training_duration': '00:05:31',
 'training_start_epoch': 0,
 'training_epochs': 4,
 'epoch': 4,
 'training_accuracy': tensor(0.7104, dtype=torch.float64),
 'training_loss': 0.9706900416290403,
 'training_cpu_memory_MB': 0.0}