<a href="https://colab.research.google.com/github/JPA-BERT/jpa-bert.github.io/blob/master/notebooks/2020_0726torch_word_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# coding: utf-8
import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx

#import data
#import model

In [3]:
# Set the random seed manually for reproducibility.
seed = 20200726
cuda = True  # request the GPU; falls back to the CPU when none is present
torch.manual_seed(seed)

cuda_available = torch.cuda.is_available()
if cuda_available and not cuda:
    print("WARNING: Set cuda=True")
elif cuda and not cuda_available:
    # Selecting "cuda" here would only fail later, at the first .to(device) call.
    print("WARNING: CUDA is not available, falling back to CPU")

# Device that every tensor and model below is moved to.
device = torch.device("cuda" if (cuda and cuda_available) else "cpu")
device

device(type='cuda')

In [26]:
# This cell is the content of data.py
import os
from io import open
#import torch

class Dictionary(object):
    """Two-way vocabulary mapping words to integer ids and back.

    Attributes are plain containers:
      word2idx -- dict from word to its id
      idx2word -- list whose position ``i`` holds the word with id ``i``
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Register ``word`` if it is unseen and return its id."""
        idx = self.word2idx.get(word)
        if idx is None:
            # New word: its id is the next free position in idx2word.
            idx = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = idx
        return idx

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Tokenized train/validation/test splits sharing one vocabulary.

    Args:
        path: directory containing ``train.csv`` and ``test.csv``.

    Attributes:
        dictionary: the ``Dictionary`` filled while tokenizing the files.
        train, valid, test: 1-D ``torch.int64`` tensors of word ids.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        # The original data.py read train.txt / valid.txt / test.txt; this
        # notebook reads the .csv files instead.
        self.train = self.tokenize(os.path.join(path, 'train.csv'))
        # NOTE(review): validation and test are both built from test.csv, so the
        # validation loss used for model selection is measured on the test set.
        self.valid = self.tokenize(os.path.join(path, 'test.csv'))
        self.test = self.tokenize(os.path.join(path, 'test.csv'))

    def tokenize(self, path):
        """Tokenize a text file into a flat tensor of word ids.

        Each line is split on whitespace and terminated with ``'<eos>'``.
        Unseen words are added to ``self.dictionary`` in order of appearance.

        Args:
            path: file to read (UTF-8).

        Returns:
            1-D ``torch.int64`` tensor with one id per token; empty for an
            empty file.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not os.path.exists(path):
            raise FileNotFoundError('Corpus file not found: {}'.format(path))

        # One pass is enough: add_word returns the id of new and known words alike.
        ids = []
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                for word in line.split() + ['<eos>']:
                    ids.append(self.dictionary.add_word(word))

        return torch.tensor(ids, dtype=torch.int64)

In [21]:
#download wikitext-2 dataset and GloVe embeddings
!wget https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz -P /data
!tar xzf /data/wikitext-2.tgz -C /data
!mv /data/wikitext-2/ /data/testwikitext2/

--2020-07-26 03:50:13--  https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.30.134
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.30.134|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4070055 (3.9M) [application/x-tar]
Saving to: ‘/data/wikitext-2.tgz.1’


2020-07-26 03:50:14 (7.86 MB/s) - ‘/data/wikitext-2.tgz.1’ saved [4070055/4070055]



In [27]:
# List the extracted dataset directory to confirm train.csv / test.csv are present.
!ls -l /data/testwikitext2/
# Directory that Corpus reads train.csv and test.csv from.
data_path = '/data/testwikitext2'

total 11680
-rw-rw-r-- 1 1000 1000  1124390 Jan 18  2018 test.csv
-rw-rw-r-- 1 1000 1000 10827302 Jan 18  2018 train.csv
drwxrwxr-x 2 1000 1000     4096 Jan 18  2018 wikitext-2


In [28]:
###############################################################################
# Load data
###############################################################################

# Tokenize the train/valid/test files under data_path and build the shared vocabulary.
corpus = Corpus(data_path)

# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    """Lay a token stream out as ``bsz`` side-by-side columns on ``device``.

    Args:
        data: tensor whose first dimension is the token stream.
        bsz: number of columns (batch size).

    Returns:
        Contiguous tensor in which column ``j`` holds the ``j``-th consecutive
        slice of ``data``; trailing tokens that do not fill a full row are
        dropped. The result is moved to the module-level ``device``.
    """
    # Rows available once the stream is split into bsz equal parts.
    rows_per_column = data.size(0) // bsz
    # Discard the remainder that would not fit evenly.
    usable = data.narrow(0, 0, rows_per_column * bsz)
    # Split into bsz rows, then flip so each stream runs down a column.
    columns = usable.view(bsz, -1).transpose(0, 1).contiguous()
    return columns.to(device)

# Number of parallel token streams (columns) used for evaluation and for training.
eval_batch_size = 10
batch_size = 20
# Arrange each split into columns via batchify, which also moves it to `device`.
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)


In [29]:
# model.py

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder.

    Args:
        rnn_type: one of 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
        ntoken: vocabulary size.
        ninp: embedding dimension.
        nhid: hidden units per recurrent layer.
        nlayers: number of recurrent layers.
        dropout: dropout probability applied to the embeddings, between
            recurrent layers and to the recurrent output.
        tie_weights: share the embedding matrix with the decoder weight
            (requires ``nhid == ninp``).

    Raises:
        ValueError: for an unknown ``rnn_type``, or when ``tie_weights`` is set
            and ``nhid != ninp``.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):

        super(RNNModel, self).__init__()
        self.ntoken = ntoken
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        """Uniformly initialise encoder/decoder weights and zero the decoder bias."""
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        # Zero the bias. Zeroing decoder.weight here would be pointless because
        # the next line overwrites it.
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input, hidden):
        """Return per-token log-probabilities and the updated hidden state.

        The decoder output is flattened to ``[-1, ntoken]`` before the
        log-softmax, so rows line up with targets flattened the same way.
        """
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output)
        decoded = decoded.view(-1, self.ntoken)
        return F.log_softmax(decoded, dim=1), hidden

    def init_hidden(self, bsz):
        """Create a zero hidden state for batch size ``bsz``.

        Returns a ``(h, c)`` tuple for LSTM and a single tensor otherwise, each
        of shape ``[nlayers, bsz, nhid]`` with the dtype/device of the model
        parameters.
        """
        # Any parameter works as a template for dtype and device.
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)

        
# Temporarily leave PositionalEncoding module here. Will be moved somewhere else.
class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens
        in the sequence. The positional encodings have the same dimension as
        the embeddings, so that the two can be summed. Here, we use sine and cosine
        functions of different frequencies.
    .. math::
        \text{PosEncoder}(pos, 2i) = \sin(pos / 10000^{2i / d_{model}})
        \text{PosEncoder}(pos, 2i+1) = \cos(pos / 10000^{2i / d_{model}})
        \text{where pos is the word position and i is the embed idx}
    Args:
        d_model: the embed dim (required). Odd values are supported.
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd (cosine) column than even
        # (sine) column, so drop the last frequency for the cosine half.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        # Reshape to [max_len, 1, d_model] so it broadcasts over the batch dimension.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # Buffer: moves with the module and is saved in the state dict, but is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        # Add the encodings for the first x.size(0) positions, then apply dropout.
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Container module with an embedding encoder, a transformer encoder stack, and a decoder.

    Args:
        ntoken: vocabulary size.
        ninp: embedding dimension.
        nhead: number of attention heads.
        nhid: feed-forward dimension inside each encoder layer.
        nlayers: number of encoder layers.
        dropout: dropout probability for positional encoding and encoder layers.

    Raises:
        ImportError: when ``torch.nn`` does not provide the transformer modules.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except ImportError:
            # Catch only the import failure; a bare `except:` would also mask
            # unrelated errors such as KeyboardInterrupt.
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        # Marker attribute that lets callers tell this model apart from RNNModel.
        self.model_type = 'Transformer'
        # Cached causal mask; rebuilt in forward() when the sequence length changes.
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return an ``sz x sz`` float mask: 0 on/below the diagonal, ``-inf`` above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def init_weights(self):
        """Uniformly initialise encoder/decoder weights and zero the decoder bias."""
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        # Zero the bias. Zeroing decoder.weight here would be pointless because
        # the next line overwrites it.
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, has_mask=True):
        """Return log-probabilities over the vocabulary for every position.

        Args:
            src: token ids; ``len(src)`` is used as the sequence length.
            has_mask: when True, apply a causal mask so a position cannot
                attend to later positions.
        """
        if has_mask:
            device = src.device
            # Reuse the cached mask unless the sequence length changed.
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        # Scale embeddings by sqrt(ninp) before adding the positional encodings.
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)

        

In [30]:
###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
# Architecture to build: 'Transformer', 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
# Kept under its own name so that `model` only ever refers to the nn.Module.
model_type = 'LSTM'

emsize = 200  # size of word embeddings
nhid = 200  # number of hidden units per layer
nlayers = 2  # number of layers
lr = 20. #  # initial learning rate
clip = 0.25  # gradient clipping
epochs = 40 # upper epoch limit
batch_size = 20 # batch size
bptt = 35  # sequence length
dropout = 0.2  # dropout applied to layers (0 = no dropout)
tied = True # tie the word embedding and softmax weights
# NOTE(review): `seed` and `cuda` are reassigned here, after the earlier cell
# already called torch.manual_seed and chose `device`; these values have no
# effect unless that cell is run again.
seed = 1111  # random seed
cuda  = False  # use CUDA
log_interval = 200  # report interval
save = 'model.pt'  # path to save the final model
onnx_export = ''  # path to export the final model in onnx format
nhead = 2   # the number of heads in the encoder/decoder of the transformer model
dry_run = False  # verify the code and the model

if model_type == 'Transformer':
    model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
else:
    model = RNNModel(rnn_type=model_type,
                     ntoken=ntokens,
                     ninp=emsize,
                     nhid=nhid,
                     nlayers=nlayers,
                     dropout=dropout,
                     tie_weights=tied).to(device)

# Both models return log-probabilities, so negative log-likelihood is the matching loss.
criterion = nn.NLLLoss()

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Detach hidden state(s) from the graph that produced them.

    A tensor is returned detached; any other iterable is rebuilt as a tuple
    with every element processed recursively.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()



In [31]:
# get_batch subdivides the source data into chunks of length bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two tensors for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivision of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    """Slice one input chunk and its next-token targets starting at row ``i``.

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i+seq_len]`` and
        ``target`` is the same window shifted down by one row and flattened.
        ``seq_len`` is the module-level ``bptt``, shortened near the end so the
        shifted window still fits.
    """
    rows_left = len(source) - 1 - i
    seq_len = bptt if bptt < rows_left else rows_left
    stop = i + seq_len
    data = source[i:stop]
    target = source[i + 1:stop + 1].view(-1)
    return data, target


def evaluate(data_source):
    """Return the average per-row loss of the global ``model`` on ``data_source``.

    Args:
        data_source: batchified tensor (as produced by ``batchify`` with
            ``eval_batch_size`` columns).

    Returns:
        Sum of ``len(chunk) * chunk_loss`` over all chunks divided by
        ``len(data_source) - 1``.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    # `model` is an nn.Module, so comparing it with the string 'Transformer'
    # never matches; use the marker attribute set by TransformerModel.
    is_transformer = getattr(model, 'model_type', None) == 'Transformer'
    if not is_transformer:
        hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            if is_transformer:
                output = model(data)
                output = output.view(-1, ntokens)
            else:
                output, hidden = model(data, hidden)
                hidden = repackage_hidden(hidden)
            # Weight each chunk by its length: the last chunk may be shorter than bptt.
            total_loss += len(data) * criterion(output, targets).item()
    return total_loss / (len(data_source) - 1)


def train():
    """Run one pass over ``train_data`` with truncated BPTT and manual SGD.

    Reads the module-level ``model``, ``criterion``, ``train_data``, ``bptt``,
    ``batch_size``, ``clip``, ``lr``, ``log_interval``, ``dry_run`` and, for
    the progress line, ``epoch`` (set by the calling training loop).
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    # `model` is an nn.Module, so comparing it with the string 'Transformer'
    # never matches; use the marker attribute set by TransformerModel.
    is_transformer = getattr(model, 'model_type', None) == 'Transformer'
    if not is_transformer:
        hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        model.zero_grad()
        if is_transformer:
            output = model(data)
            output = output.view(-1, ntokens)
        else:
            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Plain SGD step: p <- p - lr * grad.
        for p in model.parameters():
            # Parameters that did not take part in the loss have no gradient.
            if p.grad is not None:
                p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        if dry_run:
            break

def export_onnx(path, batch_size, seq_len):
    """Export the global ``model`` to ONNX at ``path``.

    Args:
        path: destination file for the ONNX graph.
        batch_size: batch dimension of the dummy input used for tracing.
        seq_len: sequence dimension of the dummy input used for tracing.
    """
    # Report the destination actually written to (the `path` argument), not the
    # module-level `onnx_export` setting.
    print('The model is also exported in ONNX format at {}'.
          format(os.path.realpath(path)))
    model.eval()
    # All-zero token ids of shape [seq_len, batch_size].
    dummy_input = torch.zeros(seq_len, batch_size, dtype=torch.long).to(device)
    # NOTE(review): init_hidden is only defined on RNNModel in this notebook, so
    # this presumably supports RNN models only -- confirm before exporting a Transformer.
    hidden = model.init_hidden(batch_size)
    torch.onnx.export(model, (dummy_input, hidden), path)


In [None]:
# Loop over epochs.
# lr = args.lr
# Best validation loss seen so far; None until the first epoch has finished.
best_val_loss = None

# Overrides the `epochs` value set in the configuration cell.
epochs = 20
# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly: `not best_val_loss` would also be
        # true for a legitimate best loss of 0.0.
        if best_val_loss is None or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')


| epoch   1 |   200/ 2986 batches | lr 20.00 | ms/batch 19.76 | loss  5.16 | ppl   174.23
| epoch   1 |   400/ 2986 batches | lr 20.00 | ms/batch 19.71 | loss  5.08 | ppl   161.06
| epoch   1 |   600/ 2986 batches | lr 20.00 | ms/batch 20.00 | loss  5.01 | ppl   150.60
| epoch   1 |   800/ 2986 batches | lr 20.00 | ms/batch 20.20 | loss  5.08 | ppl   160.94
| epoch   1 |  1000/ 2986 batches | lr 20.00 | ms/batch 20.19 | loss  5.10 | ppl   164.30
| epoch   1 |  1200/ 2986 batches | lr 20.00 | ms/batch 20.11 | loss  5.04 | ppl   154.37
| epoch   1 |  1400/ 2986 batches | lr 20.00 | ms/batch 20.19 | loss  4.99 | ppl   147.46
| epoch   1 |  1600/ 2986 batches | lr 20.00 | ms/batch 20.22 | loss  4.86 | ppl   128.52
| epoch   1 |  1800/ 2986 batches | lr 20.00 | ms/batch 20.22 | loss  4.96 | ppl   142.13
| epoch   1 |  2000/ 2986 batches | lr 20.00 | ms/batch 20.23 | loss  4.93 | ppl   138.32
| epoch   1 |  2200/ 2986 batches | lr 20.00 | ms/batch 20.23 | loss  4.97 | ppl   144.22
| epoch   

  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |   200/ 2986 batches | lr 20.00 | ms/batch 20.27 | loss  4.98 | ppl   146.15
| epoch   2 |   400/ 2986 batches | lr 20.00 | ms/batch 20.21 | loss  4.92 | ppl   136.75
| epoch   2 |   600/ 2986 batches | lr 20.00 | ms/batch 20.21 | loss  4.86 | ppl   128.69
| epoch   2 |   800/ 2986 batches | lr 20.00 | ms/batch 20.22 | loss  4.93 | ppl   138.81
| epoch   2 |  1000/ 2986 batches | lr 20.00 | ms/batch 20.21 | loss  4.95 | ppl   141.45
| epoch   2 |  1200/ 2986 batches | lr 20.00 | ms/batch 20.20 | loss  4.90 | ppl   134.55
| epoch   2 |  1400/ 2986 batches | lr 20.00 | ms/batch 20.23 | loss  4.85 | ppl   127.98
| epoch   2 |  1600/ 2986 batches | lr 20.00 | ms/batch 20.22 | loss  4.72 | ppl   112.24
| epoch   2 |  1800/ 2986 batches | lr 20.00 | ms/batch 20.19 | loss  4.83 | ppl   125.11
| epoch   2 |  2000/ 2986 batches | lr 20.00 | ms/batch 20.22 | loss  4.80 | ppl   121.85
| epoch   2 |  2200/ 2986 batches | lr 20.00 | ms/batch 20.23 | loss  4.85 | ppl   127.79
| epoch   

In [36]:
 b# generate.py
outf = 'generated.txt'
words = 1000
temperature  = 1.0

is_transformer_model = hasattr(model, 'model_type') and model.model_type == 'Transformer'
if not is_transformer_model:
    hidden = model.init_hidden(1)
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

with open(outf, 'w') as outf:
    with torch.no_grad():  # no tracking history
        for i in range(words):
            if is_transformer_model:
                output = model(input, False)
                word_weights = output[-1].squeeze().div(args.temperature).exp().cpu()
                word_idx = torch.multinomial(word_weights, 1)[0]
                word_tensor = torch.Tensor([[word_idx]]).long().to(device)
                input = torch.cat([input, word_tensor], 0)
            else:
                output, hidden = model(input, hidden)
                word_weights = output.squeeze().div(temperature).exp().cpu()
                word_idx = torch.multinomial(word_weights, 1)[0]
                input.fill_(word_idx)

            word = corpus.dictionary.idx2word[word_idx]

            outf.write(word + ('\n' if i % 20 == 19 else ' '))

            if i % log_interval == 0:
                print('| Generated {}/{} words'.format(i, words))


| Generated 0/1000 words
| Generated 200/1000 words
| Generated 400/1000 words
| Generated 600/1000 words
| Generated 800/1000 words


In [39]:
!head generated.txt

, but the Symmetrical Number @-@ Is Game Pieter Fantasy Goffman was used for the Performed – Pat Societies ,
it was later found in Slate . The book was renamed Empty and 2010 , a death customs of England
, Nick Archangel , Giao <unk> , canyon , <unk> , art , <unk> , deploys millionaire , Allāh ""
<unk> <unk> "" , son of <unk> , <unk> Wellington , a consistent Indeed <unk> <unk> , on <unk> ,
occasional terribly , and extra standards of popularity Renaissance . The name design praised the lawyer with Frederick arched editor
with Pitchfork Roosevelt <unk> , and the <unk> <unk> <unk> based on the sylvatic novelization from the fashion further parked
on the site . The years europium <unk> $ the restructuring produces the key body to a noble <unk> <unk>
<unk> 853D and in a <unk> cluster . Some of the ornithologists , which is often escalates in head and
turned him . "" <unk> "" is twenty @-@ lived , with a impression of its annual reporter a ""
breadth "" Steel in one involving a black Jacob

In [38]:
# Load the best saved model.
with open(save, 'rb') as f:
    # NOTE(review): torch.load unpickles arbitrary Python objects; only load
    # checkpoints produced by this notebook or another trusted source.
    # map_location keeps the load working when the checkpoint was saved on a
    # different device than the current `device`.
    model = torch.load(f, map_location=device)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    # Currently, only rnn model supports flatten_parameters function.
    # `model` is an nn.Module, so test the rnn_type recorded by RNNModel rather
    # than comparing the module itself with the type names.
    if getattr(model, 'rnn_type', None) in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
        model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

if len(onnx_export) > 0:
    # Export the model in ONNX format.
    # The settings are plain notebook variables here; there is no `args` namespace.
    export_onnx(onnx_export, batch_size=1, seq_len=bptt)

| End of training | test loss  5.15 | test ppl   172.79


class Field(RawField):
    """Defines a datatype together with instructions for converting to Tensor.

    Field class models common text processing datatypes that can be represented
    by tensors.  It holds a Vocab object that defines the set of possible values
    for elements of the field and their corresponding numerical representations.
    The Field object also holds other parameters relating to how a datatype
    should be numericalized, such as a tokenization method and the kind of
    Tensor that should be produced.

    If a Field is shared between two columns in a dataset (e.g., question and
    answer in a QA dataset), then they will have a shared vocabulary.

    Attributes:
        sequential: Whether the datatype represents sequential data. If False,
            no tokenization is applied. Default: True.
        use_vocab: Whether to use a Vocab object. If False, the data in this
            field should already be numerical. Default: True.
        init_token: A token that will be prepended to every example using this
            field, or None for no initial token. Default: None.
        eos_token: A token that will be appended to every example using this
            field, or None for no end-of-sentence token. Default: None.
        fix_length: A fixed length that all examples using this field will be
            padded to, or None for flexible sequence lengths. Default: None.
        dtype: The torch.dtype class that represents a batch of examples
            of this kind of data. Default: torch.long.
        preprocessing: The Pipeline that will be applied to examples
            using this field after tokenizing but before numericalizing. Many
            Datasets replace this attribute with a custom preprocessor.
            Default: None.
        postprocessing: A Pipeline that will be applied to examples using
            this field after numericalizing but before the numbers are turned
            into a Tensor. The pipeline function takes the batch as a list, and
            the field's Vocab.
            Default: None.
        lower: Whether to lowercase the text in this field. Default: False.
        tokenize: The function used to tokenize strings using this field into
            sequential examples. If "spacy", the SpaCy tokenizer is
            used. If a non-serializable function is passed as an argument,
            the field will not be able to be serialized. Default: string.split.
        tokenizer_language: The language of the tokenizer to be constructed.
            Various languages currently supported only in SpaCy.
        include_lengths: Whether to return a tuple of a padded minibatch and
            a list containing the lengths of each examples, or just a padded
            minibatch. Default: False.
        batch_first: Whether to produce tensors with the batch dimension first.
            Default: False.
        pad_token: The string token used as padding. Default: "<pad>".
        unk_token: The string token used to represent OOV words. Default: "<unk>".
        pad_first: Do the padding of the sequence at the beginning. Default: False.
        truncate_first: Do the truncating of the sequence at the beginning. Default: False
        stop_words: Tokens to discard during the preprocessing step. Default: None
        is_target: Whether this field is a target variable.
            Affects iteration over batches. Default: False
    """


"""
クラス Field は，テンソルで表現できる一般的なテキスト処理のデータ型をモデル化する。
このクラスは，Field の要素値のセットとそれに対応する数値表現を定義する Vocab オブジェクトを保持する。
Field オブジェクトは トークン化方法や生成されるテンソルの種類など，データ型をどのように数値化するかに関連する他のパラメータも保持する。

フィールドがデータセット内の2つのカラム間で共有されている場合（例えば、QAデータセット内の質問と回答），それらは共有語彙を持つ。

# 属性
- sequential: データ型が逐次データを表すかどうか。Falseの場合，トークン化は適用されない。既定値:True
- use_vocab: Vocab オブジェクトを使用するかどうか。False にすると，このフィールドのデータはすでに数値でなければならない。既定値: True
- init_token: このフィールドを使用するすべての例の前に付加されるトークン，または初期トークンがない場合は None。既定値: None
- eos_token: このフィールドを使用するすべての事例に追加されるトークン。eosトークンがない場合は None。既定値: None
- fix_length: このフィールドを使っているすべての事例がパッドされる固定長さ、または柔軟なシーケンス長の場合は None。既定値:None
- dtype: このデータの torch.dtype クラス。既定値: torch.long
- preprocessing: このフィールドを使用している例に、トークン化の後に数値化の前に適用されるパイプライン。
多くの Datasets は，この属性をカスタムのプリプロセッサに置き換えている。既定値:None
- postprocessing: 数値化の後で数値がテンソルに変換される前に，このフィールドを使用している事例に適用されるパイプライン。
パイプライン関数は，バッチをリストとして受け取り，フィールドの Vocab を受け取る。既定値:None
- lower: このフィールドのテキストを小文字にするかどうか。既定値:False
- tokenize: このフィールドを使用して文字列を連続した事例にトークン化するために使用する関数。spacy の場合は SpaCy トークン化が使用される。
シリアライズできない関数を引数に渡すと、このフィールドはシリアライズできなくなる。既定値: string.split.
- tokenizer_language: 構築するトークナイザーの言語。現在は SpaCy でのみサポートされている様々な言語がある。
- include_lengths: パッド付きミニバッチと各例の長さを含むリストのタプルを返すか、単にパッド付きミニバッチを返すか。既定値:False
- batch_first: バッチ次元でテンソルを最初に生成するかどうか。既定値: False
- pad_token: パディングとして使用する文字列トークン。既定値: "<pad>"
- unk_token: OOV 表現に使用される文字列トークン。既定値: "<unk>"
- pad_first: シーケンスの最初にパディングを行う。既定値:False
- truncate_first: シーケンスの先頭で切り詰めを行う。既定値: False
- stop_words: 前処理段階で破棄するトークン。既定値:None
- is_target: このフィールドがターゲット変数であるかどうか。バッチに対する反復処理に影響を与える。既定値:False
"""