In [1]:
import random

from torchtext import datasets
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe
from torchtext.models import T5_SMALL_ENCODER, T5_SMALL_GENERATION, T5_SMALL
from torchtext.prototype.generate import GenerationUtils
from torch.utils.data import DataLoader

In [2]:
def print_first_item_dataset(dataset_):
    """Print one randomly chosen example from the beginning of ``dataset_``.

    Despite the name, this does not print the first item: it draws a random
    1-based position in ``[1, 1000]`` and prints the example found there, so
    repeated calls show different samples. Only a single forward pass is made
    and iteration stops as soon as the chosen example is reached.

    Args:
        dataset_: Any iterable of examples. It is only iterated, never indexed.

    Prints:
        The selected example. If ``dataset_`` holds fewer examples than the
        drawn position, the last example is printed instead. Nothing is
        printed for an empty iterable.
    """
    # ``randint`` is inclusive at both ends. The lower bound has to be 1:
    # a draw of 0 can never equal a 1-based position, which would walk the
    # entire dataset and print nothing.
    target_position = random.randint(1, 1000)

    # Sentinel distinguishes "no example seen" from an example that is None.
    nothing_seen = object()
    selected = nothing_seen
    for position, selected in enumerate(dataset_, start=1):
        if position == target_position:
            break

    if selected is not nothing_seen:
        print(selected)

In [3]:
# Text Classification
train_dataset, test_dataset = datasets.IMDB()
print_first_item_dataset(train_dataset)

(1, 'This is just a case of a previously worthless island changed into something worthwhile. Jesus Christ people lets throw a big fit over 2000 islanders big deal.This is just a case of a previously worthless island changed into something worthwhile. Jesus Christ people lets throw a big fit over 2000 islanders big deal.This is just a case of a previously worthless island changed into something worthwhile. Jesus Christ people lets throw a big fit over 2000 islanders big deal.This is just a case of a previously worthless island changed into something worthwhile. Jesus Christ people lets throw a big fit over 2000 islanders big deal.')


In [4]:
# Sequence Tagging
train_dataset, test_dataset = datasets.CoNLL2000Chunking()
print_first_item_dataset(train_dataset)

[['Basf', 'AG', 'said', 'it', 'moved', 'its', 'headquarters', 'for', 'Latin', 'America', 'to', 'Mexico', 'and', 'the', 'headquarters', 'for', 'the', 'Asia\\/Australia', 'regional', 'division', 'to', 'Singapore', ',', 'effective', 'Oct', '.'], ['NNP', 'NNP', 'VBD', 'PRP', 'VBD', 'PRP$', 'NN', 'IN', 'NNP', 'NNP', 'TO', 'NNP', 'CC', 'DT', 'NN', 'IN', 'DT', 'NNP', 'JJ', 'NN', 'TO', 'NNP', ',', 'JJ', 'NNP', '.'], ['B-NP', 'I-NP', 'B-VP', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'O', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'I-NP', 'B-PP', 'B-NP', 'O', 'B-ADJP', 'B-NP', 'O']]


In [5]:
# Question Answer
train_dataset, test_dataset = datasets.SQuAD2()
print_first_item_dataset(train_dataset)

('She has received co-writing credits for most of the songs recorded with Destiny\'s Child and her solo efforts. Her early songs were personally driven and female-empowerment themed compositions like "Independent Women" and "Survivor", but after the start of her relationship with Jay Z she transitioned to more man-tending anthems such as "Cater 2 U". Beyoncé has also received co-producing credits for most of the records in which she has been involved, especially during her solo efforts. However, she does not formulate beats herself, but typically comes up with melodies and ideas during production, sharing them with producers.', 'What part of production does she do?', ['melodies'], [564])


In [6]:
# Language Modeling
train_dataset, valid_dataset, test_dataset = datasets.PennTreebank()
print_first_item_dataset(train_dataset)

if they approach it with a <unk> <unk> attitude there will be a net gain for everyone


In [7]:
# # Machine Translation
# train_dataset, valid_dataset, test_dataset = datasets.IWSLT2016()
# print_first_item_dataset(train_dataset)

In [8]:
def get_tokens(tokenizer_, text):
    """Tokenize ``text`` with ``tokenizer_`` and print the result.

    Args:
        tokenizer_: Callable that takes the text and returns its tokens.
        text: The string to tokenize.

    Returns:
        None. The tokens are only printed, not returned.
    """
    tokens = tokenizer_(text)
    print(tokens)

In [9]:
# "basic_english" tokenizer: the output below shows the text lower-cased and
# the trailing "!" split into its own token.
get_tokens(tokenizer_=get_tokenizer("basic_english"), text="You can now install TorchText using pip!")

['you', 'can', 'now', 'install', 'torchtext', 'using', 'pip', '!']


In [10]:
# Prerequisites for the "spacy" tokenizer (run once in the project environment):
# uv add spacy
# uv add "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"

# "spacy" tokenizer: the output below shows the original casing preserved.
get_tokens(tokenizer_=get_tokenizer("spacy"), text="You can now install TorchText using pip!")



['You', 'can', 'now', 'install', 'TorchText', 'using', 'pip', '!']


In [11]:
# Prerequisite for the "toktok" tokenizer (run once in the project environment):
# uv add nltk

# "toktok" tokenizer: for this sentence the output below matches the spaCy result.
get_tokens(tokenizer_=get_tokenizer("toktok"), text="You can now install TorchText using pip!")

['You', 'can', 'now', 'install', 'TorchText', 'using', 'pip', '!']


In [12]:
def yield_tokens(data_iter, tokenizer_=None):
    """Lazily yield the tokenized text of every example in ``data_iter``.

    Args:
        data_iter: Iterable of 2-item examples. The first element is ignored;
            the second is the raw text to tokenize.
        tokenizer_: Optional callable mapping a text to its tokens. When
            omitted, the notebook-level ``tokenizer`` is used. That name is
            resolved when iteration starts, not when this function is called,
            so it must be defined by then.

    Yields:
        Whatever the tokenizer returns for each example's text.
    """
    # Passing the tokenizer explicitly removes the dependency on a global that
    # is only assigned in a later cell. The fallback keeps existing calls working.
    tokenize = tokenizer if tokenizer_ is None else tokenizer_
    for _, text in data_iter:
        yield tokenize(text)

In [13]:
# Reload IMDB: train_dataset / test_dataset were rebound to other datasets by
# the earlier demo cells.
train_dataset, test_dataset = datasets.IMDB()

# Notebook-level tokenizer that yield_tokens() reads as a global. It has to be
# assigned before the generator returned by yield_tokens() is consumed.
tokenizer = get_tokenizer("basic_english")

In [14]:
# Build the vocabulary from the tokenized IMDB training reviews. "<unk>" is
# registered as a special token (it maps to index 0, as shown further down).
vocab = build_vocab_from_iterator(yield_tokens(train_dataset), specials=["<unk>"])
# Without a default index, looking up an out-of-vocabulary token raises a
# RuntimeError. Route unknown tokens to "<unk>" instead.
vocab.set_default_index(vocab["<unk>"])

In [15]:
# Display the Vocab object; its repr (see output) carries no size or content detail.
vocab

Vocab()

In [16]:
# Number of entries in the vocabulary, including the "<unk>" special.
len(vocab)

68811

In [17]:
# Index assigned to the "<unk>" special token (0 in the recorded output).
vocab["<unk>"]

0

In [18]:
# Calling the vocab on a list of tokens returns their indices, in order.
vocab(['here', 'is', 'an', 'example'])

[128, 9, 45, 433]

In [19]:
# Load GloVe "6B" vectors with dim=100. On first run this downloads
# glove.6B.zip (about 862MB per the recorded progress bar) into .vector_cache/,
# so expect this cell to be slow on a fresh machine.
glove = GloVe(name='6B', dim=100)

.vector_cache/glove.6B.zip: 862MB [17:20, 829kB/s]                                                                                                    
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉| 399999/400000 [00:07<00:00, 56530.00it/s]


In [20]:
# Tokens to look up; 'Beautiful' is deliberately capitalized (see the
# lower_case_backup lookup in the next cell).
examples = ['chip', 'baby', 'Beautiful']

In [21]:
# Fetch the embedding for each token. Per the torchtext Vectors docs,
# lower_case_backup=True retries a token in lower case when its exact form is
# not in the vocabulary (relevant for 'Beautiful').
vectors = glove.get_vecs_by_tokens(examples, lower_case_backup=True)

In [22]:
# Display the embedding tensor — expected to hold one row per token in
# `examples`; the recorded output is truncated in this export.
vectors

tensor([[-3.5029e-02,  3.5055e-01, -3.7747e-01, -2.7484e-01, -4.6507e-01,
         -1.0469e+00,  2.6530e-01, -1.0274e+00, -1.5118e-01, -4.3337e-01,
          2.8249e-01,  3.0586e-02,  4.1676e-02, -8.0440e-01, -2.5877e-01,
          7.6451e-01, -1.1719e-01, -2.6887e-01,  3.1177e-01,  1.2424e-01,
         -8.2664e-02, -4.4980e-01,  3.4401e-01,  5.3207e-01,  7.7049e-01,
         -1.8074e-01, -7.9733e-02, -4.3636e-01, -6.3873e-01, -2.2136e-01,
         -3.6726e-01,  8.3501e-01, -1.8382e-02,  3.4585e-01,  8.1993e-01,
          1.1494e-01,  5.3179e-02, -3.2708e-01,  6.1461e-01, -5.5759e-01,
          1.6368e-01, -1.0126e+00,  7.3822e-02,  4.1195e-01, -1.1873e+00,
          9.6243e-02,  9.7531e-03,  3.5168e-01, -1.2006e-02, -8.6820e-01,
         -8.2064e-01,  2.1357e-01, -3.4402e-01,  3.0030e-01, -4.2657e-02,
         -1.0223e+00, -7.7814e-01,  2.5033e-01,  1.9737e+00, -7.9433e-02,
          6.2742e-01,  1.9720e-01, -1.5116e-01,  9.7084e-01, -5.7882e-01,
         -3.6337e-01,  4.6025e-01,  7.

In [23]:
# Encoder-only T5 bundle: text transform plus pretrained model.
# NOTE: the variables are prefixed `t5_base_`, but the bundle loaded here is
# T5_SMALL_ENCODER. The names are kept because later cells may reference them.
# The first run downloads the transform assets and model weights (see the log below).
t5_base_encoder_transform = T5_SMALL_ENCODER.transform()
t5_base_encoder_model = T5_SMALL_ENCODER.get_model()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 792k/792k [00:01<00:00, 788kB/s]
Downloading: "https://download.pytorch.org/models/text/t5.small.encoder.v2.pt" to /Users/minhhuunguyen/.cache/torch/hub/checkpoints/t5.small.encoder.v2.pt
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 135M/135M [00:15<00:00, 8.96MB/s]


In [24]:
# Generation T5 bundle: text transform plus pretrained model. Both are used by
# the sequence-generation cells further down.
# NOTE: the variables are prefixed `t5_base_`, but the bundle loaded here is
# T5_SMALL_GENERATION. Renaming would require updating every later cell.
# The first run downloads the model weights (see the log below).
t5_base_generation_transform = T5_SMALL_GENERATION.transform()
t5_base_generation_model = T5_SMALL_GENERATION.get_model()

Downloading: "https://download.pytorch.org/models/text/t5.small.generation.v2.pt" to /Users/minhhuunguyen/.cache/torch/hub/checkpoints/t5.small.generation.v2.pt
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 294M/294M [00:34<00:00, 9.00MB/s]


In [25]:
# T5_SMALL bundle: text transform plus pretrained model. Neither variable is
# used again in the visible part of this notebook.
# NOTE: the variables are prefixed `t5_base_`, but the bundle loaded here is T5_SMALL.
# The first run downloads the model weights (see the log below).
t5_base_transform = T5_SMALL.transform()
t5_base_model = T5_SMALL.get_model()

Downloading: "https://download.pytorch.org/models/text/t5.small.v2.pt" to /Users/minhhuunguyen/.cache/torch/hub/checkpoints/t5.small.v2.pt
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 231M/231M [00:29<00:00, 8.21MB/s]


In [26]:
# t5_base_generation_model

In [27]:
# Wrap the generation model in GenerationUtils, which provides generate().
# NOTE: GenerationUtils is imported from torchtext.prototype, so its API may
# be less stable than the rest of torchtext — TODO confirm for the pinned version.
sequence_generator = GenerationUtils(t5_base_generation_model)
sequence_generator

<torchtext.prototype.generate.GenerationUtils at 0x1469b72b0>

In [28]:
# Encode a one-sentence batch into token ids (a 1 x 13 tensor in the recorded output).
# NOTE(review): no task prefix (e.g. "translate English to German: ") is
# prepended, and the decoded result further down is a German/English mix —
# confirm whether a prefix was intended.
# NOTE(review): "starring" is presumably a typo for "staring"; left unchanged
# so the recorded outputs stay valid.
model_input = t5_base_generation_transform(['A man in an orange hat starring at something.'])
model_input

tensor([[   71,   388,    16,    46,  5470,     3,   547,     3, 22236,    44,
           424,     5,     1]])

In [29]:
# Generate output token ids for the encoded batch. No max_length is passed, so
# the library warns and falls back to 256 tokens (see the recorded output);
# pass max_length explicitly to silence the warning.
model_output = sequence_generator.generate(model_input)
model_output

`max_length` was not specified. Defaulting to 256 tokens.


tensor([[    0,   890,   388,    16,   665,  5470,     3,   547,   181,   665,
             3,  2719,    35,     3,   547,     6,     3, 22236,    44,   424,
             5,     1]])

In [30]:
# Convert the generated id tensor to nested Python lists and decode it back
# into text; the result is a list with one string per input sentence.
output_text = t5_base_generation_transform.decode(model_output.tolist())
output_text

['Ein man in einem orange hat mit einem roten hat, starring at something.']