In [9]:
import random
from typing import Iterator, List, Dict

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from allennlp.data.vocabulary import Vocabulary
from allennlp.data import Instance
from allennlp.data.fields import TextField, LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers.token import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor

import allen_linguo

In [10]:
# Directory that holds the toy_ws corpus files. This is the only
# machine-specific value in the cell: point it elsewhere when the data moves.
DATA_DIR = "/home/lab/Pablo/darth_linguo/Data/toy_ws"

# Train / validation files. The resulting strings are the same absolute paths
# as before; they are now derived from DATA_DIR instead of repeating it.
# Note that `testing_fn` points at the "-val" file and feeds the validation set.
training_fn = f"{DATA_DIR}/toy_ws-GvsMixgram-train"
testing_fn = f"{DATA_DIR}/toy_ws-GvsMixgram-val"

# Project-specific dataset reader from the local `allen_linguo` module.
reader = allen_linguo.LinguoDatasetReader()

train_dataset = reader.read(training_fn)
validation_dataset = reader.read(testing_fn)

# The vocabulary is fitted on the training instances only; the validation set
# is deliberately not used here. min_count=1 for the 'tokens' namespace keeps
# every token that occurs at least once.
vocab = Vocabulary.from_instances(train_dataset, min_count={'tokens': 1})

281it [00:00, 37585.29it/s]
70it [00:00, 37391.91it/s]
01/07/2019 14:21:13 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.
100%|██████████| 281/281 [00:00<00:00, 94017.18it/s]


In [15]:
# --- Reproducibility ---------------------------------------------------------
# Seed every RNG *before* any parameters are created so that Restart & Run All
# yields the same weight initialisation. Python's `random` is seeded as well
# because data iterators may shuffle batches with it.
SEED = 13
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Hyper-parameters --------------------------------------------------------
# Model sizes.
EMBEDDING_DIM = 32
HIDDEN_DIM = 32

# Optimisation / training-loop settings (same values as before, now named so
# they can be tuned in one place).
LEARNING_RATE = 0.1
BATCH_SIZE = 10
PATIENCE = 10      # epochs to wait for a validation improvement before stopping
NUM_EPOCHS = 100   # upper bound; early stopping usually ends the run sooner

# --- Model -------------------------------------------------------------------
# One embedding row per entry of the 'tokens' vocabulary namespace.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)

# The key "tokens" must match the token-indexer name used by the dataset
# reader -- NOTE(review): confirm against allen_linguo.LinguoDatasetReader.
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

# LSTM wrapped as a Seq2Vec encoder; batch_first=True is what the wrapper expects.
lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

model = allen_linguo.AllenLinguo(word_embeddings, lstm, vocab)

# --- Training ----------------------------------------------------------------
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# Batches are bucketed by the token count of the "sentence" field --
# NOTE(review): assumes the reader names its text field "sentence".
iterator = BucketIterator(batch_size=BATCH_SIZE, sorting_keys=[("sentence", "num_tokens")])

# The iterator needs the vocabulary to turn tokens into ids.
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=PATIENCE,
                  num_epochs=NUM_EPOCHS)

# Left as the last expression so the returned metrics dict is shown as the
# cell output.
trainer.train()



01/07/2019 14:23:07 - INFO - allennlp.training.trainer -   Beginning training.
01/07/2019 14:23:07 - INFO - allennlp.training.trainer -   Epoch 0/99
01/07/2019 14:23:07 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 294.224
01/07/2019 14:23:07 - INFO - allennlp.training.trainer -   GPU 0 memory usage MB: 641
01/07/2019 14:23:07 - INFO - allennlp.training.trainer -   Training
accuracy: 0.4947, loss: 0.6960 ||: 100%|██████████| 29/29 [00:00<00:00, 174.30it/s]
01/07/2019 14:23:07 - INFO - allennlp.training.trainer -   Validating
accuracy: 0.5000, loss: 0.6937 ||: 100%|██████████| 7/7 [00:00<00:00, 503.98it/s]
01/07/2019 14:23:07 - INFO - allennlp.training.trainer -                       Training |  Validation
01/07/2019 14:23:07 - INFO - allennlp.training.trainer -   gpu_0_memory_MB |   641.000  |       N/A
01/07/2019 14:23:07 - INFO - allennlp.training.trainer -   loss            |     0.696  |     0.694
01/07/2019 14:23:07 - INFO - allennlp.training.trainer -   cpu_mem

01/07/2019 14:23:08 - INFO - allennlp.training.trainer -   loss            |     0.696  |     0.693
01/07/2019 14:23:08 - INFO - allennlp.training.trainer -   cpu_memory_MB   |   294.232  |       N/A
01/07/2019 14:23:08 - INFO - allennlp.training.trainer -   accuracy        |     0.502  |     0.500
01/07/2019 14:23:08 - INFO - allennlp.training.trainer -   Epoch duration: 00:00:00
01/07/2019 14:23:08 - INFO - allennlp.training.trainer -   Estimated training time remaining: 0:00:18
01/07/2019 14:23:08 - INFO - allennlp.training.trainer -   Epoch 7/99
01/07/2019 14:23:08 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 294.232
01/07/2019 14:23:08 - INFO - allennlp.training.trainer -   GPU 0 memory usage MB: 641
01/07/2019 14:23:08 - INFO - allennlp.training.trainer -   Training
accuracy: 0.4875, loss: 0.6958 ||: 100%|██████████| 29/29 [00:00<00:00, 188.34it/s]
01/07/2019 14:23:09 - INFO - allennlp.training.trainer -   Validating
accuracy: 0.5000, loss: 0.6940 ||: 100%|███

accuracy: 0.4662, loss: 0.6963 ||: 100%|██████████| 29/29 [00:00<00:00, 182.44it/s]
01/07/2019 14:23:10 - INFO - allennlp.training.trainer -   Validating
accuracy: 0.5000, loss: 0.6935 ||: 100%|██████████| 7/7 [00:00<00:00, 515.06it/s]
01/07/2019 14:23:10 - INFO - allennlp.training.trainer -                       Training |  Validation
01/07/2019 14:23:10 - INFO - allennlp.training.trainer -   gpu_0_memory_MB |   641.000  |       N/A
01/07/2019 14:23:10 - INFO - allennlp.training.trainer -   loss            |     0.696  |     0.693
01/07/2019 14:23:10 - INFO - allennlp.training.trainer -   cpu_memory_MB   |   294.256  |       N/A
01/07/2019 14:23:10 - INFO - allennlp.training.trainer -   accuracy        |     0.466  |     0.500
01/07/2019 14:23:10 - INFO - allennlp.training.trainer -   Epoch duration: 00:00:00
01/07/2019 14:23:10 - INFO - allennlp.training.trainer -   Estimated training time remaining: 0:00:16
01/07/2019 14:23:10 - INFO - allennlp.training.trainer -   Epoch 14/99
01/07

01/07/2019 14:23:11 - INFO - allennlp.training.trainer -   Estimated training time remaining: 0:00:15
01/07/2019 14:23:11 - INFO - allennlp.training.trainer -   Epoch 20/99
01/07/2019 14:23:11 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 294.256
01/07/2019 14:23:11 - INFO - allennlp.training.trainer -   GPU 0 memory usage MB: 641
01/07/2019 14:23:11 - INFO - allennlp.training.trainer -   Training
accuracy: 0.4804, loss: 0.6970 ||: 100%|██████████| 29/29 [00:00<00:00, 180.46it/s]
01/07/2019 14:23:11 - INFO - allennlp.training.trainer -   Validating
accuracy: 0.5000, loss: 0.6938 ||: 100%|██████████| 7/7 [00:00<00:00, 514.12it/s]
01/07/2019 14:23:11 - INFO - allennlp.training.trainer -                       Training |  Validation
01/07/2019 14:23:11 - INFO - allennlp.training.trainer -   gpu_0_memory_MB |   641.000  |       N/A
01/07/2019 14:23:11 - INFO - allennlp.training.trainer -   loss            |     0.697  |     0.694
01/07/2019 14:23:11 - INFO - allennlp.trai

01/07/2019 14:23:12 - INFO - allennlp.training.trainer -   gpu_0_memory_MB |   641.000  |       N/A
01/07/2019 14:23:12 - INFO - allennlp.training.trainer -   loss            |     0.697  |     0.693
01/07/2019 14:23:12 - INFO - allennlp.training.trainer -   cpu_memory_MB   |   294.256  |       N/A
01/07/2019 14:23:12 - INFO - allennlp.training.trainer -   accuracy        |     0.459  |     0.500
01/07/2019 14:23:12 - INFO - allennlp.training.trainer -   Epoch duration: 00:00:00
01/07/2019 14:23:12 - INFO - allennlp.training.trainer -   Estimated training time remaining: 0:00:14
01/07/2019 14:23:12 - INFO - allennlp.training.trainer -   Epoch 27/99
01/07/2019 14:23:12 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 294.256
01/07/2019 14:23:12 - INFO - allennlp.training.trainer -   GPU 0 memory usage MB: 641
01/07/2019 14:23:12 - INFO - allennlp.training.trainer -   Training
accuracy: 0.4804, loss: 0.6960 ||: 100%|██████████| 29/29 [00:00<00:00, 177.83it/s]
01/07/2019 1

01/07/2019 14:23:13 - INFO - allennlp.training.trainer -   GPU 0 memory usage MB: 641
01/07/2019 14:23:13 - INFO - allennlp.training.trainer -   Training
accuracy: 0.4840, loss: 0.6963 ||: 100%|██████████| 29/29 [00:00<00:00, 182.61it/s]
01/07/2019 14:23:14 - INFO - allennlp.training.trainer -   Validating
accuracy: 0.5000, loss: 0.6944 ||: 100%|██████████| 7/7 [00:00<00:00, 523.06it/s]
01/07/2019 14:23:14 - INFO - allennlp.training.trainer -                       Training |  Validation
01/07/2019 14:23:14 - INFO - allennlp.training.trainer -   gpu_0_memory_MB |   641.000  |       N/A
01/07/2019 14:23:14 - INFO - allennlp.training.trainer -   loss            |     0.696  |     0.694
01/07/2019 14:23:14 - INFO - allennlp.training.trainer -   cpu_memory_MB   |   294.268  |       N/A
01/07/2019 14:23:14 - INFO - allennlp.training.trainer -   accuracy        |     0.484  |     0.500
01/07/2019 14:23:14 - INFO - allennlp.training.trainer -   Epoch duration: 00:00:00
01/07/2019 14:23:14 - IN

{'peak_cpu_memory_MB': 294.268,
 'peak_gpu_0_memory_MB': 641,
 'training_duration': '00:00:06',
 'training_start_epoch': 0,
 'training_epochs': 33,
 'epoch': 33,
 'training_accuracy': 0.48398576512455516,
 'training_loss': 0.6962747224446001,
 'training_cpu_memory_MB': 294.268,
 'training_gpu_0_memory_MB': 641,
 'validation_accuracy': 0.5,
 'validation_loss': 0.6944102559770856,
 'best_epoch': 24,
 'best_validation_accuracy': 0.5,
 'best_validation_loss': 0.6930497629301888}

In [5]:
# Display the trigram table. Per the repr below, it maps a two-token context
# string (e.g. 'I am') to a defaultdict(int) keyed by the following token
# (presumably occurrence counts), and was built inside
# generateWS.extract_ngram_freq.
# NOTE(review): `tri_freqs` is not defined in any cell shown in this notebook,
# and this cell's execution count (5) is lower than that of the cells above it,
# so it relies on state from an earlier or deleted cell and will raise
# NameError on a fresh Restart & Run All.
tri_freqs

defaultdict(<function generateWS.extract_ngram_freq.<locals>.<lambda>()>,
            {'# #': defaultdict(int, {'I': 1, 'Sam': 1}),
             '# I': defaultdict(int, {'am': 1}),
             'I am': defaultdict(int, {'Sam': 1, '.': 1}),
             'am Sam': defaultdict(int, {'.': 1}),
             'Sam .': defaultdict(int, {'<eos>': 1}),
             '# Sam': defaultdict(int, {'I': 1}),
             'Sam I': defaultdict(int, {'am': 1}),
             'am .': defaultdict(int, {'<eos>': 1})})

In [6]:
import argparse

In [7]:
# Command-line parser for this experiment; its description text is "testing".
parser = argparse.ArgumentParser(
    description="testing",
)

In [None]:
# Register the optional --orders flag with argparse's defaults: no type, help
# text or default value is given, so it stores one string (None when absent).
parser.add_argument("--orders")