In [1]:
import warnings
from typing import Dict

import torch
from allennlp.data import Token, Vocabulary, TokenIndexer, Tokenizer
from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import (
    SingleIdTokenIndexer,
    TokenCharactersIndexer,
    ELMoTokenCharactersIndexer,
    PretrainedTransformerIndexer,
    PretrainedTransformerMismatchedIndexer,
)
from allennlp.data.tokenizers import (
    CharacterTokenizer,
    PretrainedTransformerTokenizer,
    SpacyTokenizer,
    WhitespaceTokenizer,
)
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import (
    Embedding,
    TokenCharactersEncoder,
    ElmoTokenEmbedder,
    PretrainedTransformerEmbedder,
    PretrainedTransformerMismatchedEmbedder,
)
from allennlp.nn import util as nn_util

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer: Tokenizer = WhitespaceTokenizer()
token_indexer: TokenIndexer = ELMoTokenCharactersIndexer()
vocab = Vocabulary()
text = "This is some text ."
tokens = tokenizer.tokenize(text)
print("ELMo tokens:", tokens)
text_field = TextField(tokens, {"elmo_tokens": token_indexer})
text_field.index(vocab)
padding_lengths = text_field.get_padding_lengths()

tensor_dict = text_field.as_tensor(padding_lengths)
print("ELMo tensors:", tensor_dict)

ELMo tokens: [This, is, some, text, .]
ELMo tensors: {'elmo_tokens': {'elmo_tokens': tensor([[259,  85, 105, 106, 116, 260, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 106, 116, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 116, 112, 110, 102, 260, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 117, 102, 121, 117, 260, 261, 261, 261, 

In [3]:
transformer_model = "bert-base-cased"
tokenizer = PretrainedTransformerTokenizer(model_name=transformer_model)
token_indexer = PretrainedTransformerIndexer(model_name=transformer_model)
text = "Some text with an extraordinarily long identifier."
tokens = tokenizer.tokenize(text)
print("BERT tokens:", tokens)
text_field = TextField(tokens, {"bert_tokens": token_indexer})
text_field.index(vocab)

tensor_dict = text_field.as_tensor(text_field.get_padding_lengths())
print("BERT tensors:", tensor_dict)

Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29.0/29.0 [00:00<00:00, 50.1kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 570/570 [00:00<00:00, 696kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 208k/208k [00:01<00:00, 142kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 426k/426k [00:04<00:00, 89.6kB/s]


BERT tokens: [[CLS], Some, text, with, an, extra, ##ord, ##ina, ##rily, long, id, ##ent, ##ifier, ., [SEP]]
BERT tensors: {'bert_tokens': {'token_ids': tensor([  101,  1789,  3087,  1114,  1126,  3908,  6944,  2983, 11486,  1263,
        25021,  3452, 17792,   119,   102]), 'mask': tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True]), 'type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}}


In [4]:
tokenizer = PretrainedTransformerTokenizer(
    model_name=transformer_model,
    add_special_tokens=False,
)

context_text = "This context is frandibulous."
question_text = "What is the context like?"
context_tokens = tokenizer.tokenize(context_text)
question_tokens = tokenizer.tokenize(question_text)
print("Context tokens:", context_tokens)
print("Question tokens:", question_tokens)

combined_tokens = tokenizer.add_special_tokens(context_tokens, question_tokens)
print("Combined tokens:", combined_tokens)

text_field = TextField(combined_tokens, {"bert_tokens": token_indexer})
text_field.index(vocab)

tensor_dict = text_field.as_tensor(text_field.get_padding_lengths())
print("Combined BERT tensors:", tensor_dict)

Context tokens: [This, context, is, f, ##rand, ##ib, ##ulous, .]
Question tokens: [What, is, the, context, like, ?]
Combined tokens: [[CLS], This, context, is, f, ##rand, ##ib, ##ulous, ., [SEP], What, is, the, context, like, ?, [SEP]]
Combined BERT tensors: {'bert_tokens': {'token_ids': tensor([  101,  1188,  5618,  1110,   175, 13141, 13292, 14762,   119,   102,
         1327,  1110,  1103,  5618,  1176,   136,   102]), 'mask': tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True]), 'type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])}}


In [None]:
# Splits text into words (instead of wordpieces or characters).  For ELMo, you can
# just use any word-level tokenizer that you like, though for best results you
# should use the same tokenizer that was used with ELMo, which is an older version
# of spacy.  We're using a whitespace tokenizer here for ease of demonstration
# with binder.
tokenizer: Tokenizer = WhitespaceTokenizer()

# Represents each token with an array of characters in a way that ELMo expects.
token_indexer: TokenIndexer = ELMoTokenCharactersIndexer()

# Both ELMo and BERT do their own thing with vocabularies, so we don't need to add
# anything, but we do need to construct the vocab object so we can use it below.
# (And if you have any labels in your data that need indexing, you'll still need
# this.)
vocab = Vocabulary()

text = "This is some text ."
tokens = tokenizer.tokenize(text)
print("ELMo tokens:", tokens)

text_field = TextField(tokens, {"elmo_tokens": token_indexer})
text_field.index(vocab)

# We typically batch things together when making tensors, which requires some
# padding computation.  Don't worry too much about the padding for now.
padding_lengths = text_field.get_padding_lengths()

tensor_dict = text_field.as_tensor(padding_lengths)
print("ELMo tensors:", tensor_dict)

# Any transformer model name that huggingface's transformers library supports will
# work here.  Under the hood, we're grabbing pieces from huggingface for this
# part.
transformer_model = "bert-base-cased"

# To do modeling with BERT correctly, we can't use just any tokenizer; we need to
# use BERT's tokenizer.
tokenizer = PretrainedTransformerTokenizer(model_name=transformer_model)

# Represents each wordpiece with an id from BERT's vocabulary.
token_indexer = PretrainedTransformerIndexer(model_name=transformer_model)

text = "Some text with an extraordinarily long identifier."
tokens = tokenizer.tokenize(text)
print("BERT tokens:", tokens)

text_field = TextField(tokens, {"bert_tokens": token_indexer})
text_field.index(vocab)

tensor_dict = text_field.as_tensor(text_field.get_padding_lengths())
print("BERT tensors:", tensor_dict)

# Now we'll do an example with paired text, to show the right way to handle [SEP]
# tokens in AllenNLP.  We have built-in ways of handling this for two text pieces.
# If you have more than two text pieces, you'll have to manually add the special
# tokens.  The way we're doing this requires that you use a
# PretrainedTransformerTokenizer, not the abstract Tokenizer class.

# Splits text into wordpieces, but without adding special tokens.
tokenizer = PretrainedTransformerTokenizer(
    model_name=transformer_model,
    add_special_tokens=False,
)

context_text = "This context is frandibulous."
question_text = "What is the context like?"
context_tokens = tokenizer.tokenize(context_text)
question_tokens = tokenizer.tokenize(question_text)
print("Context tokens:", context_tokens)
print("Question tokens:", question_tokens)

combined_tokens = tokenizer.add_special_tokens(context_tokens, question_tokens)
print("Combined tokens:", combined_tokens)

text_field = TextField(combined_tokens, {"bert_tokens": token_indexer})
text_field.index(vocab)

tensor_dict = text_field.as_tensor(text_field.get_padding_lengths())
print("Combined BERT tensors:", tensor_dict)

In [6]:
tokenizer: Tokenizer = WhitespaceTokenizer()
token_indexer: TokenIndexer = ELMoTokenCharactersIndexer()
vocab = Vocabulary()
text = "This is some text."
tokens = tokenizer.tokenize(text)
print("ELMo tokens:", tokens)
text_field = TextField(tokens, {"elmo_tokens": token_indexer})
text_field.index(vocab)
token_tensor = text_field.as_tensor(text_field.get_padding_lengths())
print("ELMo tensors:", token_tensor)

ELMo tokens: [This, is, some, text.]
ELMo tensors: {'elmo_tokens': {'elmo_tokens': tensor([[259,  85, 105, 106, 116, 260, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 106, 116, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 116, 112, 110, 102, 260, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261],
        [259, 117, 102, 121, 117,  47, 260, 261, 261, 26

In [7]:
# We're using a tiny, toy version of ELMo to demonstrate this.
elmo_options_file = (
    "https://allennlp.s3.amazonaws.com/models/elmo/test_fixture/options.json"
)
elmo_weight_file = (
    "https://allennlp.s3.amazonaws.com/models/elmo/test_fixture/lm_weights.hdf5"
)
elmo_embedding = ElmoTokenEmbedder(
    options_file=elmo_options_file, weight_file=elmo_weight_file
)

embedder = BasicTextFieldEmbedder(token_embedders={"elmo_tokens": elmo_embedding})

tensor_dict = text_field.batch_tensors([token_tensor])
embedded_tokens = embedder(tensor_dict)
print("ELMo embedded tokens:", embedded_tokens)

ELMo embedded tokens: tensor([[[ 0.7915,  0.0000, -0.2392, -0.0000, -0.8512, -0.0000, -0.4313,
          -1.4576,  0.5932, -0.5559,  0.0271,  0.0000, -0.0000,  0.7962,
          -0.4822, -0.0000, -0.0000,  0.9739, -0.0000, -0.0000, -1.5000,
          -1.4474, -0.9234, -0.0000,  0.0000, -0.1619, -0.2561, -0.0000,
          -0.0000,  0.0000, -0.0000, -0.3873],
         [ 0.8214,  0.0000, -0.4139, -0.8952, -0.8725,  0.3791, -0.0000,
          -0.0000,  0.6740, -0.8773,  0.0000, -0.1073, -0.4150,  0.2156,
           0.0000,  0.3758,  0.0000,  0.5719, -1.3625, -0.6818, -0.0000,
          -0.0000, -1.4208,  0.3838,  0.0000, -0.5148, -0.7528,  0.0000,
          -0.0000, -0.0000, -0.4018,  0.6609],
         [ 0.7935,  1.2918, -0.0000, -0.0000, -0.5219, -0.2793, -0.8381,
          -0.0000,  0.0000, -0.6763,  0.0150,  0.2454, -0.0000,  0.9078,
          -0.1125,  0.4098, -0.0000,  0.0487, -0.0000,  0.0000, -0.6707,
          -0.0000, -0.0000,  0.2709,  1.7239, -0.6564,  0.0000,  0.0000,
        

In [13]:
token_tensor

{'elmo_tokens': {'elmo_tokens': tensor([[259,  85, 105, 106, 116, 260, 261, 261, 261, 261, 261, 261, 261, 261,
           261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
           261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
           261, 261, 261, 261, 261, 261, 261, 261],
          [259, 106, 116, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
           261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
           261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
           261, 261, 261, 261, 261, 261, 261, 261],
          [259, 116, 112, 110, 102, 260, 261, 261, 261, 261, 261, 261, 261, 261,
           261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
           261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
           261, 261, 261, 261, 261, 261, 261, 261],
          [259, 117, 102, 121, 117,  47, 260, 261, 261, 261, 261, 261, 261, 261,
    

In [16]:
token = token_tensor['elmo_tokens']['elmo_tokens']

In [24]:
# embedder

In [22]:
embedder(text_field.batch_tensors([token_tensor])).shape

torch.Size([1, 4, 32])

In [11]:
tensor_dict['elmo_tokens']['elmo_tokens'].shape

torch.Size([1, 4, 50])

In [8]:
embedded_tokens.shape

torch.Size([1, 4, 32])

In [None]:
# It's easiest to get ELMo input by just running the data code.  See the
# exercise above for an explanation of this code.
tokenizer: Tokenizer = WhitespaceTokenizer()
token_indexer: TokenIndexer = ELMoTokenCharactersIndexer()
vocab = Vocabulary()
text = "This is some text."
tokens = tokenizer.tokenize(text)
print("ELMo tokens:", tokens)
text_field = TextField(tokens, {"elmo_tokens": token_indexer})
text_field.index(vocab)
token_tensor = text_field.as_tensor(text_field.get_padding_lengths())
print("ELMo tensors:", token_tensor)

# We're using a tiny, toy version of ELMo to demonstrate this.
elmo_options_file = (
    "https://allennlp.s3.amazonaws.com/models/elmo/test_fixture/options.json"
)
elmo_weight_file = (
    "https://allennlp.s3.amazonaws.com/models/elmo/test_fixture/lm_weights.hdf5"
)
elmo_embedding = ElmoTokenEmbedder(
    options_file=elmo_options_file, weight_file=elmo_weight_file
)

embedder = BasicTextFieldEmbedder(token_embedders={"elmo_tokens": elmo_embedding})

tensor_dict = text_field.batch_tensors([token_tensor])
embedded_tokens = embedder(tensor_dict)
print("ELMo embedded tokens:", embedded_tokens)


# Again, it's easier to just run the data code to get the right output.

# We're using the smallest transformer model we can here, so that it runs on
# binder.
transformer_model = "google/reformer-crime-and-punishment"
tokenizer = PretrainedTransformerTokenizer(model_name=transformer_model)
token_indexer = PretrainedTransformerIndexer(model_name=transformer_model)
text = "Some text with an extraordinarily long identifier."
tokens = tokenizer.tokenize(text)
print("Transformer tokens:", tokens)
text_field = TextField(tokens, {"bert_tokens": token_indexer})
text_field.index(vocab)
token_tensor = text_field.as_tensor(text_field.get_padding_lengths())
print("Transformer tensors:", token_tensor)

embedding = PretrainedTransformerEmbedder(model_name=transformer_model)

embedder = BasicTextFieldEmbedder(token_embedders={"bert_tokens": embedding})

tensor_dict = text_field.batch_tensors([token_tensor])
embedded_tokens = embedder(tensor_dict)
print("Transformer embedded tokens:", embedded_tokens)

In [None]:
from allennlp.data import Vocabulary
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
import torch

# This is what gets created by TextField.as_tensor with a SingleIdTokenIndexer;
# see the exercises above.
token_tensor = {"tokens": {"tokens": torch.LongTensor([1, 3, 2, 1, 4, 3])}}

vocab = Vocabulary()
vocab.add_tokens_to_namespace(
    ["This", "is", "some", "text", "."], namespace="token_vocab"
)

glove_file = "https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz"

# This is for embedding each token.
embedding = Embedding(
    vocab=vocab,
    vocab_namespace="token_vocab",
    embedding_dim=50,
    pretrained_file=glove_file,
)

embedder = BasicTextFieldEmbedder(token_embedders={"tokens": embedding})

embedded_tokens = embedder(token_tensor)
print(embedded_tokens.size())