Latest Version (0.17.0)


In [1]:
import random
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.data.functional import to_map_style_dataset
from torchtext.vocab import build_vocab_from_iterator

from collections import Counter

In [2]:
train_iter = IMDB(root="./.datatext", split="train")
test_iter = IMDB(root="./.datatext", split="test")

In [3]:
# for label, line in train_iter:
#     print(label)
#     print(line)
#     break

In [4]:
train_data = to_map_style_dataset(train_iter)
test_data = to_map_style_dataset(test_iter)

print(f"Training examples: {len(train_data)}")
print(f"Testing examples: {len(test_data)}")

Training examples: 25000
Testing examples: 25000


In [5]:
train_data1 = to_map_style_dataset(train_iter)
test_data1 = to_map_style_dataset(test_iter)

print(f"Training examples: {len(train_data1)}")
print(f"Testing examples: {len(test_data1)}")

Training examples: 25000
Testing examples: 25000


In [6]:
print(train_data[0])
print(train_data[-1])

(1, 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between

In [7]:
print(type(train_data[-1][0]))

<class 'int'>


In [8]:
vocab_size = 20000
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    for label, text in data_iter:
        yield tokenizer(text.lower())


vocab = build_vocab_from_iterator(
    yield_tokens(train_iter), specials=["<unk>", "<pad>"], max_tokens=vocab_size
)
vocab.set_default_index(vocab["<unk>"])

In [9]:
print(vocab.get_itos()[:10])

['<unk>', '<pad>', 'the', '.', ',', 'and', 'a', 'of', 'to', "'"]


In [10]:
print(sorted(vocab.get_itos()))



In [11]:
print(vocab.get_itos()[-10:])

['narrow-minded', 'nauseatingly', 'nay', 'nazism', 'necrophilia', 'neglecting', 'neglects', 'neo-realism', 'neuroses', 'newbern']


In [12]:
print(len(vocab))

20000


In [13]:
[tokenizer(text.lower()) for label, text in train_data]

[['i',
  'rented',
  'i',
  'am',
  'curious-yellow',
  'from',
  'my',
  'video',
  'store',
  'because',
  'of',
  'all',
  'the',
  'controversy',
  'that',
  'surrounded',
  'it',
  'when',
  'it',
  'was',
  'first',
  'released',
  'in',
  '1967',
  '.',
  'i',
  'also',
  'heard',
  'that',
  'at',
  'first',
  'it',
  'was',
  'seized',
  'by',
  'u',
  '.',
  's',
  '.',
  'customs',
  'if',
  'it',
  'ever',
  'tried',
  'to',
  'enter',
  'this',
  'country',
  ',',
  'therefore',
  'being',
  'a',
  'fan',
  'of',
  'films',
  'considered',
  'controversial',
  'i',
  'really',
  'had',
  'to',
  'see',
  'this',
  'for',
  'myself',
  '.',
  'the',
  'plot',
  'is',
  'centered',
  'around',
  'a',
  'young',
  'swedish',
  'drama',
  'student',
  'named',
  'lena',
  'who',
  'wants',
  'to',
  'learn',
  'everything',
  'she',
  'can',
  'about',
  'life',
  '.',
  'in',
  'particular',
  'she',
  'wants',
  'to',
  'focus',
  'her',
  'attentions',
  'to',
  'making',
 

In [14]:
sorted([len(tokenizer(text.lower())) for label, text in train_data])[-1]

2752

Data Loader


In [15]:
batch_size = 32

In [16]:
print(list(train_iter)[0])


Counter(list(train_iter)) == Counter(train_data)

(1, 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between

True

In [17]:
len(list(train_iter)) / batch_size

781.25

In [18]:
def text_transform(text):
    return vocab([token for token in tokenizer(text.lower())])


def label_transform(label):
    return torch.tensor(1.0 if label == 2 else 0.0, dtype=torch.float)

In [19]:
pad_idx = vocab["<pad>"]


def collate_batch(batch):
    label_list, text_list = [], []
    for label, text in batch:
        label_list.append(label_transform(label))
        processed_text = torch.tensor(text_transform(text))
        text_list.append(processed_text)
    return torch.tensor(label_list), pad_sequence(text_list, padding_value=pad_idx)

In [20]:
def batch_sampler():
    indices = [(i, len(tokenizer(s[1]))) for i, s in enumerate(train_data)]
    random.shuffle(indices)
    pooled_indices = []
    # create pool of indices with similar lengths
    for i in range(0, len(indices), batch_size * 100):
        pooled_indices.extend(
            sorted(indices[i : i + batch_size * 100], key=lambda x: x[1])
        )

    pooled_indices = [x[0] for x in pooled_indices]

    # yield indices for current batch
    for i in range(0, len(pooled_indices), batch_size):
        yield pooled_indices[i : i + batch_size]


train_loader = DataLoader(
    train_data, batch_sampler=batch_sampler(), collate_fn=collate_batch
)

test_loader = DataLoader(
    test_data, batch_sampler=batch_sampler(), collate_fn=collate_batch
)

In [21]:
from itertools import tee

train_loader, train_loader1 = tee(train_loader)

In [22]:
train_sample = next(iter(train_loader1))
print(train_sample[0])
print(train_sample[0].size())
print(train_sample[1])
print(train_sample[1].size())
print(train_sample[1].T.size())
print(train_sample[1].T)

tensor([0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1.,
        0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1.])
torch.Size([32])
tensor([[   14,    14,     6,  ...,  6247,  4057,    14],
        [   10,    23,    21,  ...,     0,  4857,    17],
        [  211,   100,   124,  ...,    17,    10,    35],
        ...,
        [    1,     1,     1,  ...,  2322, 11907,   382],
        [    1,     1,     1,  ...,     3,     3,    11],
        [    1,     1,     1,  ...,     1,     1,    36]])
torch.Size([50, 32])
torch.Size([32, 50])
tensor([[   14,    10,   211,  ...,     1,     1,     1],
        [   14,    23,   100,  ...,     1,     1,     1],
        [    6,    21,   124,  ...,     1,     1,     1],
        ...,
        [ 6247,     0,    17,  ...,  2322,     3,     1],
        [ 4057,  4857,    10,  ..., 11907,     3,     1],
        [   14,    17,    35,  ...,   382,    11,    36]])


In [23]:
example = train_sample[1].T

In [24]:
example1 = example[10].tolist()
print(vocab.lookup_tokens(example1))

['very', 'smart', ',', 'sometimes', 'shocking', ',', 'i', 'just', 'love', 'it', '.', 'it', 'shoved', 'one', 'more', 'side', 'of', 'david', "'", 's', 'brilliant', 'talent', '.', 'he', 'impressed', 'me', 'greatly', '!', 'david', 'is', 'the', 'best', '.', 'the', 'movie', '<unk>', 'your', 'attention', 'for', 'every', 'second', '.', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [25]:
print(len([sample[1].size() for sample in list(train_loader)]))
print(len([sample[1].size() for sample in list(test_loader)]))

782
782


In [26]:
# with open(r"./datatext.txt", "w") as fp:
#     for label, text in train_data:
#         # write each item on a new line
#         fp.write(f"{text}\n")
#     print("Done")

In [60]:
import os

os.getcwd()

'/Users/nicholaschoong/Documents/QuantumTransformer/test'

In [61]:
import sys

sys.path.append("/Users/nicholaschoong/Documents/QuantumTransformer")

In [62]:
from lib.data_loader import (
    yield_tokens,
    text_transform,
    label_transform,
    collate_batch,
    batch_sampler,
)

In [63]:
train_iter = IMDB(root="./.datatext", split="train")
test_iter = IMDB(root="./.datatext", split="test")

train_data = to_map_style_dataset(train_iter)
test_data = to_map_style_dataset(test_iter)

size = 100
train_data = np.array(train_data)[
    np.random.choice(len(train_data), size=size, replace=False)
].tolist()
test_data = np.array(test_data)[
    np.random.choice(len(test_data), size=size, replace=False)
].tolist()

train_data = [(int(label), text) for label, text in train_data]
test_data = [(int(label), text) for label, text in test_data]

batch_size = 5
vocab_size = 20000
tokenizer = get_tokenizer("basic_english")

vocab = build_vocab_from_iterator(
    yield_tokens(train_data, tokenizer),
    specials=["<unk>", "<pad>"],
    max_tokens=vocab_size,
)
vocab.set_default_index(vocab["<unk>"])


train_loader = DataLoader(
    train_data,
    batch_sampler=batch_sampler(train_data, batch_size, tokenizer),
    collate_fn=lambda batch: collate_batch(batch, vocab, tokenizer),
)

test_loader = DataLoader(
    test_data,
    batch_sampler=batch_sampler(test_data, batch_size, tokenizer),
    collate_fn=lambda batch: collate_batch(batch, vocab, tokenizer),
)

ValueError: DataLoader with IterableDataset: expected unspecified batch_sampler option, but got batch_sampler=<generator object batch_sampler at 0x309e316d0>

In [None]:
train_loader, train_loader1 = tee(train_loader)

In [None]:
len(test_loader.dataset)

100

In [None]:
train_sample = next(iter(train_loader1))
# print(train_sample[0])
print(train_sample[0].size())
# print(train_sample[1])
print(train_sample[1].size())
print(train_sample[1].T.size())
# print(train_sample[1].T)

torch.Size([5])
torch.Size([58, 5])
torch.Size([5, 58])


In [None]:
train_sample = next(iter(train_loader1))
print(train_sample[0])

StopIteration: 

In [None]:
example = train_sample[1].T

In [None]:
example1 = example[3].tolist()
print(vocab.lookup_tokens(example1))

['this', 'movie', 'has', 'made', 'me', 'want', 'to', 'become', 'a', 'director', ',', 'and', 'michelle', 'rodriguez', 'is', 'brilliant', '.', 'how', 'the', 'hell', 'wasn', "'", 't', 'she', 'on', 'mtv', "'", 's', 'top', '25', 'under', '25', ',', 'she', 'beats', 'them', 'all', '.', 'this', 'film', 'definitely', 'deserved', 'the', 'grand', 'jury', 'prize', 'at', 'sundance', ',', 'best', 'film', 'i', 'have', 'ever', 'seen', '.', '<pad>', '<pad>']


In [None]:
example = train_sample[1].T
example1 = example[4].tolist()
print(vocab.lookup_tokens(example1))

['dressed', 'to', 'kill', 'has', 'been', 'more', 'or', 'less', 'forgotten', 'in', 'critical', 'circles', 'in', 'the', 'past', '20', 'years', ',', 'but', 'it', 'is', 'a', 'true', 'american', 'classic', ',', 'a', 'film', 'which', 'is', 'much', 'more', 'than', 'just', 'a', 'glossy', 'thriller', '.', 'i', 'sincerely', 'hope', 'the', 'dvd', 'release', 'will', 'give', 'more', 'people', 'the', 'chance', 'to', 'hear', 'about', 'it', 'and', 'see', 'it', '.']


In [None]:
print(len([sample[1].size() for sample in list(train_loader)]))
print(len([sample[1].size() for sample in list(test_loader)]))

20
20


In [None]:
train_data

[(1,
 (1,
  "This Hitchcock movie bears little similarity to his later suspense films and seems much more like a very old fashioned morality tale. A young couple receives an inheritance that they believe will make them happy. They spend the money traveling about the world and living a very hedonistic existence. However, after a while the excitement begins to wane and the couple become dissipated and pointless in their existence. However, out of no where, when they are on a luxury cruise, the ship sinks and they lose everything--and end up much happier in the end because they now appreciate life! What an odd, silly and preachy film! Personally, I'd like to inherit all that money and find out if it makes me miserable!<br /><br />The production values are relatively poor compared to later productions--a rough film with poor sound quality and rather amateurish acting."),
 (2,
  'Being that I am a true product of the hip-hop and electronic dance music generation, this is without a doubt one