In [1]:
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import torch.nn as nn
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers.token import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor

torch.manual_seed(1)

class LinguoDatasetReader(DatasetReader):
    """Dataset reader for preprocessed sentences (tokens separated by spaces) """
    GRAMMATICALITY_labels = ["ungrammatical","grammatical"]
    UG_TYPE_labels = ["WS","VA","AA","RV","G"]
    
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    
    def text_to_instance(self, line):
        elements = line.strip().split()
        label = self.GRAMMATICALITY_labels[int(elements[0])]
        ugType = elements[1]
        sentence = elements [2:]
        sentence_field = TextField(tokens,self.token_indexers)
        fields = {"sentence":sentence_field}
        if glabel:
            glabel_field = LabelField(label=glabel,label_namespace = "grammaticality_labels")
            fields["g_label"] = glabel_field
        if ugType:
            ugType_field = LabelField(label=ugType, label_namespace = "ugtype_labels")
            fields["ug_type"] = ugType_field
        return Instance(fields)
    
    def _read(self, file_path:str, label:str=None, ugType:str=None) -> Iterator[Instance]:
        with open(file_path) as infile:
            for line in infile:
                ield self.text_to_instance([Token(word) for word in sentence],label,ugType)
                
class AllenLinguo(Model):
    
    def __init__(self,word_embeddings : TextFieldEmbedder,
                encoder : Seq2VecEncoder,
                vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        
        self.hidden2decision = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                              out_features=vocab.get_vocab_size("grammaticality_labels"))
        self.loss_function = nn.CrossEntropyLoss()
        self.accuracy = CategoricalAccuracy()
        
    def forward(self,
               sentence: Dict[str, torch.Tensor],
               g_label: torch.Tensor = None,
               ug_type: torch.Tensor = None) -> torch.Tensor:
        
        mask = get_text_field_mask(sentence)
        
        embeddings = self.word_embeddings(sentence)
        
        encoder_out = self.encoder(embeddings, mask)
        
        tag_logits = self.hidden2decision(encoder_out)
        
        output = {"tag_logits": tag_logits}
        
        if g_label is not None:
            self.accuracy(tag_logits, g_label)
            #print(tag_logits)
            output["loss"] = self.loss_function(tag_logits, g_label)
        
        return output
    
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}    
            
        
                

In [6]:
training_fn = "/Users/pablo/Dropbox/workspace/darth_linguo/Data/toy_corpus/toy_training-GvsWS"
testing_fn = "/Users/pablo/Dropbox/workspace/darth_linguo/Data/toy_corpus/toy_testing-GvsWS"

reader = LinguoDatasetReader()

train_dataset = reader.read(training_fn)
validation_dataset = reader.read(testing_fn)

vocab = Vocabulary.from_instances(train_dataset,min_count={'tokens': 1})


1483it [00:00, 7805.67it/s]
371it [00:00, 14350.41it/s]
100%|██████████| 1483/1483 [00:00<00:00, 46955.54it/s]




----Vocabulary Statistics----


Top 10 most frequent tokens in namespace 'tokens':
	Token: <unk>		Frequency: 3238
	Token: de		Frequency: 2976
	Token: ,		Frequency: 2401
	Token: la		Frequency: 1996
	Token: que		Frequency: 1577
	Token: .		Frequency: 1534
	Token: <eos>		Frequency: 1483
	Token: en		Frequency: 1284
	Token: el		Frequency: 1136
	Token: y		Frequency: 1074

Top 10 longest tokens in namespace 'tokens':
	Token: extraordinariamente		length: 19	Frequency: 5
	Token: reestructuraciones		length: 18	Frequency: 8
	Token: telecomunicaciones		length: 18	Frequency: 2
	Token: Schleswig-Holstein		length: 18	Frequency: 1
	Token: autodiscriminación		length: 18	Frequency: 1
	Token: concentracionarios		length: 18	Frequency: 1
	Token: responsabilizarnos		length: 18	Frequency: 1
	Token: responsabilidades		length: 17	Frequency: 7
	Token: renacionalización		length: 17	Frequency: 1
	Token: contraproducentes		length: 17	Frequency: 1

Top 10 shortest tokens in namespace 'tokens':
	Token: !		length: 1

In [4]:
EMBEDDING_DIM = 32
HIDDEN_DIM = 32

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)

word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

model = AllenLinguo(word_embeddings, lstm, vocab)

optimizer = optim.SGD(model.parameters(), lr=0.1)

iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])

iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=25)

trainer.train()

predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

# tag_logits = predictor.predict("La proxima vez no habrá otra opción")['tag_logits']

#tag_ids = np.argmax(tag_logits, axis=-1)

# print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

accuracy: 0.4909, loss: 0.7008 ||: 100%|██████████| 742/742 [00:05<00:00, 125.01it/s]
accuracy: 0.4987, loss: 0.6931 ||: 100%|██████████| 186/186 [00:00<00:00, 389.90it/s]
accuracy: 0.4680, loss: 0.7032 ||: 100%|██████████| 742/742 [00:05<00:00, 129.34it/s]
accuracy: 0.4987, loss: 0.7008 ||: 100%|██████████| 186/186 [00:00<00:00, 394.13it/s]
accuracy: 0.4909, loss: 0.7015 ||: 100%|██████████| 742/742 [00:05<00:00, 132.34it/s]
accuracy: 0.4987, loss: 0.6971 ||: 100%|██████████| 186/186 [00:00<00:00, 397.00it/s]
accuracy: 0.5253, loss: 0.6982 ||: 100%|██████████| 742/742 [00:05<00:00, 129.51it/s]
accuracy: 0.4987, loss: 0.6904 ||: 100%|██████████| 186/186 [00:00<00:00, 393.10it/s]
accuracy: 0.6096, loss: 0.6383 ||: 100%|██████████| 742/742 [00:05<00:00, 129.84it/s]
accuracy: 0.5013, loss: 1.6901 ||: 100%|██████████| 186/186 [00:00<00:00, 392.06it/s]
accuracy: 0.8901, loss: 0.2705 ||: 100%|██████████| 742/742 [00:05<00:00, 126.32it/s]
accuracy: 0.9677, loss: 0.1229 ||: 100%|██████████| 18


[93m    Linking successful[0m
    /anaconda3/envs/allen/lib/python3.6/site-packages/en_core_web_sm -->
    /anaconda3/envs/allen/lib/python3.6/site-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



In [None]:

tag_logits = predictor.predict("La proxima vez no habrá otra opción")['tag_logits']

tag_ids = np.argmax(tag_logits, axis=-1)

In [16]:
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import torch.nn as nn
import numpy as np
from allennlp.data.vocabulary import Vocabulary
from allennlp.training.metrics import CategoricalAccuracy


In [36]:
vocab = Vocabulary.from_files("/Users/pablo/Dropbox/workspace/darth_linguo/results/TEST2/vocabulary")
tag_logits = torch.tensor([[ 2, 1],[ 1, 2],[ 2, 1],[ 2, 1],[0, 1],[1,0],[1,0],[1,0],[1,0],[0,1],[0,1]])
g_label = torch.tensor([1,1,0,0,0,0,0,0,0,0,0])
ug_type = torch.tensor([0,0,1,1,1,2,2,3,3,3,3])

In [37]:
specific_gold = {n:[] for n in range(vocab.get_vocab_size(namespace="ugtype_labels"))}
specific_pred = {n:[] for n in range(vocab.get_vocab_size(namespace="ugtype_labels"))}
specificAccuracies = {n: CategoricalAccuracy() for n in range(vocab.get_vocab_size(namespace="ugtype_labels"))}
for ind in range(len(g_label)):
    g_lab = g_label[ind].item()
    logit = [ tag_logits[ind][0].item(), tag_logits[ind][1].item() ]
    spec_label = ug_type[ind].item()
    specific_gold[spec_label].append(g_lab)
    specific_pred[spec_label].append(logit)
    
for ind in specificAccuracies:
    if specific_pred[ind]:
        preds = torch.tensor(specific_pred[ind])
        labels = torch.tensor(specific_gold[ind])
        specificAccuracies[ind](preds,labels)
    name = vocab.get_token_from_index(ind, namespace="ugtype_labels")
    accuracy = specificAccuracies[ind].get_metric()
    print("{}:{}".format(name,accuracy))


G:0.5
WS:0.6666666666666666
RV:1.0
AA:0.5
VA:0.0


In [23]:
ws_accuracy(torch.tensor(specific_pred[2]), torch.tensor(specific_gold[2]))

In [30]:
ws_accuracy.get_metric()

0.5