In [1]:
! pip install datasets transformers[sentencepiece]


[notice] A new release of pip available: 22.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the "mrpc" configuration of the "glue" dataset and the tokenizer for the checkpoint.
checkpoint = "bert-base-cased"
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(examples):
    """Tokenize a batch of sentence pairs using fixed-length padding.

    `examples` is indexed by the "sentence1" and "sentence2" columns. Every
    pair is padded or truncated to 128 tokens, so all rows share one length.
    """
    fixed_length_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(examples["sentence1"], examples["sentence2"], **fixed_length_options)


# batched=True passes a batch of rows to tokenize_function per call rather than one row at a time.
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["idx", "sentence1", "sentence2"])  # drop the index and raw-text columns
    .rename_column("label", "labels")
    .with_format("torch")  # indexing now yields torch tensors
)
print(tokenized_datasets)

# With fixed padding the training input_ids form one (num_rows, 128) tensor.
train_input_ids = tokenized_datasets["train"]["input_ids"]
print(train_input_ids.shape)
print(train_input_ids)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})
torch.Size([3668, 128])
tensor([[  101,  7277,  2180,  ...,     0,     0,     0],
        [  101, 10684,  2599,  ...,     0,     0,     0],
        [  101,  1220,  1125,  ...,     0,     0,     0],
        ...,
        [  101,   107,  1284,  ...,     0,     0,     0],
        [  101,  1109,  1136,  ...,     0,     0,     0],
        [  101,  1109,  1476,  ...,     0,     0,     0]])


In [3]:
from torch.utils.data import DataLoader # Pytorch Dataset Loader

# Serve the fixed-length training split in shuffled batches of 16 rows each.
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True)

# Look at the first 5 batches only. zip() stops as soon as range(5) is
# exhausted, so exactly 5 batches are drawn from the loader and printed.
for step, batch in zip(range(5), train_dataloader):
    print("Step: ", step)
    print(batch["input_ids"].shape)
    print(batch["input_ids"])

Step:  0
torch.Size([16, 128])
tensor([[  101,  1109,  2998,  ...,     0,     0,     0],
        [  101,  1409,  3890,  ...,     0,     0,     0],
        [  101,   107,  1188,  ...,     0,     0,     0],
        ...,
        [  101, 10178,   112,  ...,     0,     0,     0],
        [  101,  1370,  1103,  ...,     0,     0,     0],
        [  101,  1135,  1108,  ...,     0,     0,     0]])
Step:  1
torch.Size([16, 128])
tensor([[  101,  1109,  1419,  ...,     0,     0,     0],
        [  101,  1203,  1844,  ...,     0,     0,     0],
        [  101,  1507,   170,  ...,     0,     0,     0],
        ...,
        [  101, 10800,  8724,  ...,     0,     0,     0],
        [  101,  1109,  1937,  ...,     0,     0,     0],
        [  101,  1109,  7565,  ...,     0,     0,     0]])
Step:  2
torch.Size([16, 128])
tensor([[  101,  1135,  1110,  ...,     0,     0,     0],
        [  101,  8028,  3601,  ...,     0,     0,     0],
        [  101,  1124,  1163,  ...,     0,     0,     0],
        .

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Reload the dataset and tokenizer so this cell can run on its own.
checkpoint = "bert-base-cased"
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(examples):
    """Tokenize a batch of sentence pairs without padding them.

    Unlike the fixed-length version, neither `padding="max_length"` nor
    `max_length=128` is passed here; only `truncation=True` is kept, so
    padding is left to a later step.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Same post-processing as before: tokenize in batches, drop the index and
# raw-text columns, rename the target column and return torch tensors.
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
    .with_format("torch")
)

In [5]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding # Data collator that will dynamically pad the inputs received.

# Dynamic padding: the collator pads the rows of a batch when the batch is
# assembled, instead of padding every row to a fixed length at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer)

# collate_fn hands each group of 16 shuffled rows to the collator above.
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True, collate_fn=data_collator)

# Print the input_ids shape of the first 7 batches; the sequence dimension
# changes from batch to batch because padding is applied per batch.
for step, batch in zip(range(7), train_dataloader):
    print(batch["input_ids"].shape)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([16, 74])
torch.Size([16, 73])
torch.Size([16, 75])
torch.Size([16, 93])
torch.Size([16, 76])
torch.Size([16, 73])
torch.Size([16, 87])
