In [1]:
import torch
from torch.utils.data import DataLoader

import torchtext
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import tokenizers
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

# print(torchtext.__version__)

cuda = False
device = torch.device("cuda" if cuda else "cpu")

# Raw text iter
train_iter = WikiText2(split='train')
print(type(train_iter))

<class 'torchtext.data.datasets_utils._RawTextIterableDataset'>


In [22]:
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab["<unk>"])

def data_process(raw_text_iter):
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

In [32]:
count = 0
for item in train_iter:
    print(item)
    print(type(item))
    count += 1
    if count >= 2:
        break

 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 

<class 'str'>
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more <unk> for series newcomers . Character designer <unk

In [34]:
train_loader = DataLoader(train_iter, batch_size = 8, shuffle = False)

print(len(train_loader))
print(isinstance(train_loader, Iterable))

In [44]:
count = 0
for batch in train_loader:
    print(len(batch))
    print(type(batch))
    print(batch)
    count += 1
    if count >= 1:
        break

8
<class 'list'>
[" PlayStation Official Magazine - UK praised the story 's <unk> of Gallia 's moral standing , art style , and most points about its gameplay , positively noting the latter for both its continued quality and the tweaks to balance and content . Its one major criticism were multiple difficulty spikes , something that had affected the previous games . Heath Hindman of gaming website PlayStation <unk> praised the addition of non @-@ linear elements and improvements or removal of mechanics from Valkyria Chronicles II in addition to praising the returning gameplay style of previous games . He also positively noted the story 's serious tone . Points criticized in the review were recycled elements , awkward cutscenes that seemed to include all characters in a scene for no good reason , pacing issues , and occasional problems with the game 's AI . \n", " In a preview of the TGS demo , Ryan Geddes of IGN was left excited as to where the game would go after completing the demo , 

In [47]:
text_pipeline = lambda x: vocab(tokenizer(x))

In [55]:
def collate_batch(batch):
    text_list = []
    text_list = torch.tensor(text_list, dtype=torch.int64)
    return text_list.to(device)

train_loader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [57]:
count = 0
for batch in train_loader:
    print(len(batch))
    print(type(batch))
    # print(batch)
    text_data = batch
    print(text_data.size())
    count += 1
    if count >= 1:
        break

268
<class 'torch.Tensor'>
torch.Size([268])


In [6]:
# Use tokenizer
import tokenizers
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()


trainer = BpeTrainer(special_tokens=["<unk>"])
files = [f".data/WikiText2/wikitext-2/wiki.{split}.tokens" for split in ["test", "train", "valid"]]
tokenizer.train(files, trainer)

tokenizer.save("tokenizer-wiki2.json")

In [8]:
out = tokenizer.encode("Hello, y'all!")
print(out.tokens)

['Hel', 'lo', ',', 'y', "'", 'all', '!']


In [2]:
# Train a word level tokenizer
tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

trainer = WordLevelTrainer(special_tokens=["<unk>"])
files = [f".data/WikiText2/wikitext-2/wiki.{split}.tokens" for split in ["test", "train", "valid"]]
tokenizer.train(files, trainer)

# tokenizer.save("word-level-tokenizer-wiki2.json")

In [11]:
tokenizer = Tokenizer.from_file('word-level-tokenizer-wiki2.json')

# tokenizer.get_vocab()
print(tokenizer.token_to_id('<unk>'))
print(tokenizer.id_to_token(1))


print(f'before enable_padding(), vocab size = {len(tokenizer.get_vocab())}')

# out = tokenizer.encode("Hello, y'all!")
# print(out)
# print(out.tokens)

tokenizer.enable_padding(pad_id=1, pad_token="[PAD]")
outputs = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
print(outputs[0].tokens)
print(outputs[1].tokens)

print(f'after enable_padding(), vocab size = {tokenizer.get_vocab_size()}')


0
the
before enable_padding(), vocab size = 30000
['Hello', ',', 'y', "'", 'all', '!']
['How', 'are', 'you', '😁', '?', '[PAD]']
after enable_padding(), vocab size = 30000


In [42]:
def collate_batch(batch):
    # directly use the encode_batch from tokenizer
    encoded_results = tokenizer.encode_batch(batch)

    ids_list, attn_mask_list = [], []
    for res in encoded_results:
        ids_list.append(res.ids)
        attn_mask_list.append(res.attention_mask)
    ids = torch.tensor(ids_list, dtype=torch.int64)
    attention_mask = torch.tensor(attn_mask_list, dtype=torch.int64)
    return ids, attention_mask

train_loader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [43]:
count = 0
for batch in train_loader:
    print(len(batch))
    print(type(batch))
    print(batch[1])
    # print(batch.size())

    count += 1
    if count >= 1:
        break

2
<class 'tuple'>
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0