In [1]:
! pip install datasets transformers[sentencepiece]


[notice] A new release of pip available: 22.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer belonging to the pretrained checkpoint used throughout this notebook.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# MRPC configuration of the GLUE dataset (train / validation / test splits).
raw_datasets = load_dataset("glue", "mrpc")

def tokenize_function(examples):
    """Tokenize sentence pairs with fixed-length padding.

    Args:
        examples: Mapping with "sentence1" and "sentence2" entries (a batch of
            rows when called through ``Dataset.map(..., batched=True)``).

    Returns:
        The result of calling ``tokenizer`` on the sentence pairs.
    """
    first_sentences, second_sentences = examples["sentence1"], examples["sentence2"]
    # Fixed padding: every pair is padded or truncated to exactly 128 tokens.
    return tokenizer(
        first_sentences,
        second_sentences,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Apply tokenize_function to every split in batches, then tidy the columns:
# drop the index / raw-text columns, rename "label" to "labels", and have the
# dataset return torch tensors.
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
    .with_format("torch")
)

print(tokenized_datasets)

# Fetch the column once and reuse it for both the shape and the values.
train_input_ids = tokenized_datasets["train"]["input_ids"]
print(train_input_ids.shape)
print(train_input_ids)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})
torch.Size([3668, 128])
tensor([[  101,  7277,  2180,  ...,     0,     0,     0],
        [  101, 10684,  2599,  ...,     0,     0,     0],
        [  101,  1220,  1125,  ...,     0,     0,     0],
        ...,
        [  101,   107,  1284,  ...,     0,     0,     0],
        [  101,  1109,  1136,  ...,     0,     0,     0],
        [  101,  1109,  1476,  ...,     0,     0,     0]])


In [13]:
from torch.utils.data import DataLoader # Pytorch Dataset Loader

# Number of batches printed by the preview loop below.
NUM_PREVIEW_BATCHES = 5

# batch_size=16 groups 16 tokenized rows into each batch; shuffle=True
# reorders the rows each time the dataloader is iterated.
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True)

# Look at the first NUM_PREVIEW_BATCHES batches of "train_dataloader".
# A trailing `if step > 5: break` check lets steps 0..6 through (seven
# batches), so the loop is bounded with zip/range instead. Because range comes
# first, zip stops without pulling an extra batch from the dataloader.
for step, batch in zip(range(NUM_PREVIEW_BATCHES), train_dataloader):
    print("Step: ", step)
    print(batch["input_ids"].shape)
    print(batch["input_ids"])

Step:  0
torch.Size([16, 128])
tensor([[  101,  1130,  5337,  ...,     0,     0,     0],
        [  101,   139, 19671,  ...,     0,     0,     0],
        [  101,  1109,  1433,  ...,     0,     0,     0],
        ...,
        [  101,  1258,  1744,  ...,     0,     0,     0],
        [  101,   107,  1284,  ...,     0,     0,     0],
        [  101,  1109, 12594,  ...,     0,     0,     0]])
Step:  1
torch.Size([16, 128])
tensor([[  101,   107,  1109,  ...,     0,     0,     0],
        [  101, 28019,  2138,  ...,     0,     0,     0],
        [  101,  3570,   117,  ...,     0,     0,     0],
        ...,
        [  101,  1109,  1295,  ...,     0,     0,     0],
        [  101, 13719,  2105,  ...,     0,     0,     0],
        [  101,  1109,  2380,  ...,     0,     0,     0]])
Step:  2
torch.Size([16, 128])
tensor([[  101,  1646,  3912,  ...,     0,     0,     0],
        [  101,  1130,  1729,  ...,     0,     0,     0],
        [  101,  2677, 18399,  ...,     0,     0,     0],
        .

In [14]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Same checkpoint and dataset as the fixed-padding example, set up again so the
# tokenization below starts from the raw (untokenized) splits.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_datasets = load_dataset("glue", "mrpc")

def tokenize_function(examples):
    """Tokenize sentence pairs with truncation only.

    Unlike the fixed-padding variant, no ``padding`` or ``max_length`` argument
    is passed to the tokenizer here.

    Args:
        examples: Mapping with "sentence1" and "sentence2" entries (a batch of
            rows when called through ``Dataset.map(..., batched=True)``).

    Returns:
        The result of calling ``tokenizer`` on the sentence pairs.
    """
    first_sentences, second_sentences = examples["sentence1"], examples["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Tokenize every split in batches (no padding yet), then drop the index /
# raw-text columns, rename "label" to "labels", and return torch tensors.
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
    .with_format("torch")
)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [17]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding # Data collator that will dynamically pad the inputs received.

# The tokenization step no longer pads, so padding is delegated to the
# collator, which pads the samples of a batch when that batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer)

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,  # batches come out with differing sequence lengths
)

# Print the shape of the first seven batches (steps 0 through 6); the second
# dimension is the padded sequence length of that batch.
for step, batch in zip(range(7), train_dataloader):
    print(batch["input_ids"].shape)

torch.Size([16, 75])
torch.Size([16, 78])
torch.Size([16, 80])
torch.Size([16, 74])
torch.Size([16, 75])
torch.Size([16, 79])
torch.Size([16, 73])
