In this notebook we will perform data preprocessing with dynamic and fixed padding.

In [26]:
! pip install datasets transformers[sentencepiece]





### Fixed Padding
* We pad every sentence to the same length: either the length of the longest sentence in the whole dataset, or a `max_length` that we specify manually.
* Pros : All the batches will have the same shape.
* Cons : Lots of batches will have useless columns with pad tokens only.

In [27]:
from transformers import AutoTokenizer
from datasets import load_dataset

Given `sentence1` and `sentence2`, suppose we want to predict whether the two sentences are paraphrases of each other (label 1 = equivalent, label 0 = not equivalent). We have training, validation and test datasets, each of which contains `sentence1`, `sentence2`, the `label` (either 0 or 1) and `idx` (unique for each row). `raw_datasets` contains all 3 of these datasets in a dictionary-like `DatasetDict`.

In [28]:
# Load the "mrpc" configuration of the "glue" dataset; the result holds the
# train, validation and test splits.
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

Reusing dataset glue (C:\Users\loriz\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [29]:
# Inspect the training split on its own (features and number of rows).
raw_datasets['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [30]:
# Look at the `sentence1` column of the training split (this returns the whole column).
raw_datasets['train']['sentence1']

['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .',
 'Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .',
 'The Nasdaq had a weekly gain of 17.27 , or 1.2 percent , closing at 1,520.15 on Friday .',
 'The DVD-CCA then appealed to the state Supreme Court .',
 'That compared with $ 35.18 million , or 24 cents per share , in the year-ago period .',
 'Shares of Genentech , a much larger company with several products on the market , rose more than 2 percent .

In [31]:
# Look at the `label` column of the training split (one 0/1 value per row).
raw_datasets['train']['label']

[1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,


In [32]:
# Name of the pretrained checkpoint whose tokenizer we want to use.
checkpoint = "bert-base-cased"
# Load the tokenizer that matches this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples, max_length=128):
    """Tokenize sentence pairs with fixed padding.

    This function is meant to be passed to `map`. When `map` is called with
    `batched=True`, `examples` is a batch of rows (each column is a list of
    values), not a single row; the tokenizer accepts both forms.

    Args:
        examples: A row or a batch of rows providing the "sentence1" and
            "sentence2" columns.
        max_length: Length every tokenized pair is padded or truncated to.
            Defaults to 128.

    Returns:
        The tokenizer output; its keys become new columns of the dataset.
    """
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",  # always pad up to `max_length` (fixed padding)
        truncation=True,       # cut pairs that are longer than `max_length`
        max_length=max_length,
    )

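Set `batched=True` so that `map` passes a batch of rows (a dictionary of lists) to the function on each call instead of a single row, which makes tokenization much faster. Calling `map` on the `DatasetDict` applies the function to every dataset it contains (train, validation and test). The keys of the dictionary returned by the tokenizer become the new column names in each dataset.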


In [33]:
# Tokenize every split; with batched=True the function receives batches of rows.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)
tokenized_datasets

Loading cached processed dataset at C:\Users\loriz\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-f8d96d51737ef47f.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\loriz\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-16e2e5dce3effda4.arrow


DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],
        num_rows: 408
    })
    test: Dataset({
        features: ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'],
        num_rows: 1725
    })
})

In [34]:
# input_ids is a list of lists: one list of token ids per training row.
# Each inner list ends with a run of 0s because the tokenizer was called
# with padding="max_length".
tokenized_datasets['train']['input_ids']

[[101,
  7277,
  2180,
  5303,
  4806,
  1117,
  1711,
  117,
  2292,
  1119,
  1270,
  107,
  1103,
  7737,
  107,
  117,
  1104,
  9938,
  4267,
  12223,
  21811,
  1117,
  2554,
  119,
  102,
  11336,
  6732,
  3384,
  1106,
  1140,
  1112,
  1178,
  107,
  1103,
  7737,
  107,
  117,
  7277,
  2180,
  5303,
  4806,
  1117,
  1711,
  1104,
  9938,
  4267,
  12223,
  21811,
  1117,
  2554,
  119,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [101,
  10684,
  2599,
  9717,
  1161,
  2205,
  11288,
  1377,
  112,
  188,
  1196,
  4147,
  1103,
  4129,
  1106,
  19770,
  2787,
  1107,
  1772,
  1111,
  109,
  123,
  119,
  126,
  3775,
  119,
  102,


In [35]:
# Drop the index and raw-text columns, then rename "label" to "labels".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 408
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 1725
    })
})

In [36]:
# Have every column returned as PyTorch tensors instead of Python lists.
tokenized_datasets = tokenized_datasets.with_format(type="torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 408
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 1725
    })
})

In [37]:
# After with_format("torch"), the training input ids come back as tensors.
tokenized_datasets['train']['input_ids']

tensor([[  101,  7277,  2180,  ...,     0,     0,     0],
        [  101, 10684,  2599,  ...,     0,     0,     0],
        [  101,  1220,  1125,  ...,     0,     0,     0],
        ...,
        [  101,   107,  1284,  ...,     0,     0,     0],
        [  101,  1109,  1136,  ...,     0,     0,     0],
        [  101,  1109,  1476,  ...,     0,     0,     0]])

In [38]:
# Same check on the test split.
tokenized_datasets['test']['input_ids']

tensor([[  101,  7054,  1658,  ...,     0,     0,     0],
        [  101,  1109,  1362,  ...,     0,     0,     0],
        [  101,  1792,  1106,  ...,     0,     0,     0],
        ...,
        [  101, 17621,  4436,  ...,     0,     0,     0],
        [  101,  4254,  1989,  ...,     0,     0,     0],
        [  101,  1573,  5567,  ...,     0,     0,     0]])

In [39]:
# The renamed "labels" column of the test split, also returned as a tensor.
tokenized_datasets['test']['labels']

tensor([1, 1, 1,  ..., 0, 1, 1])

In [41]:
# Attention mask of the test split: rows start with 1s and end with 0s,
# matching the padded tail of input_ids.
tokenized_datasets['test']['attention_mask']

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

All sequences in the datasets have a length of 128, so every batch will also have a sequence length of 128.

In [42]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
)

# Print the shape of the first 7 batches; zip stops as soon as range(7) is
# exhausted, so no further batch is fetched.
for step, batch in zip(range(7), train_dataloader):
    print(batch["input_ids"].shape)

torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])


### Dynamic Padding 
* We pad the sentences when each batch is created, to the length of the longest sentence in that batch (i.e. we postpone the padding).
* Each batch has a different max_length.
* If a sequence is longer than the maximum length the model accepts (usually 512 tokens), we truncate it.
* Pros : It will run faster on CPUs and GPUs because the model has shorter sequences to process. On TPUs we need batches of fixed length (fixed padding).

In [44]:
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(examples):
    """Tokenize sentence pairs with truncation only.

    No `padding` argument is passed, so every tokenized pair keeps its own
    length.
    """
    first, second = examples["sentence1"], examples["sentence2"]
    return tokenizer(first, second, truncation=True)


# Tokenize, drop the index/raw-text columns, rename the label column and
# return PyTorch tensors, all in one pipeline.
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
    .with_format("torch")
)

Reusing dataset glue (C:\Users\loriz\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\loriz\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-8d19cf40aa6499c4.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\loriz\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-ded67097752a81bd.arrow


In [45]:
# Check the resulting columns and row counts of every split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 408
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 1725
    })
})

In [46]:
# Without padding, each row is its own 1-D tensor and the rows have
# different lengths, so the column comes back as a list of tensors.
tokenized_datasets['train']['input_ids']

[tensor([  101,  7277,  2180,  5303,  4806,  1117,  1711,   117,  2292,  1119,
          1270,   107,  1103,  7737,   107,   117,  1104,  9938,  4267, 12223,
         21811,  1117,  2554,   119,   102, 11336,  6732,  3384,  1106,  1140,
          1112,  1178,   107,  1103,  7737,   107,   117,  7277,  2180,  5303,
          4806,  1117,  1711,  1104,  9938,  4267, 12223, 21811,  1117,  2554,
           119,   102]),
 tensor([  101, 10684,  2599,  9717,  1161,  2205, 11288,  1377,   112,   188,
          1196,  4147,  1103,  4129,  1106, 19770,  2787,  1107,  1772,  1111,
           109,   123,   119,   126,  3775,   119,   102, 10684,  2599,  9717,
          1161,  3306, 11288,  1377,   112,   188,  1107,  1876,  1111,   109,
          5691,  1495,  1550,  1105,  1962,  1122,  1106, 19770,  2787,  1111,
           109,   122,   119,   129,  3775,  1107,  1772,   119,   102]),
 tensor([  101,  1220,  1125,  1502,  1126, 16355,  1113,  1103,  4639,  1113,
          1340,  1275,   117,  4

For Dynamic Padding we use the class DataCollatorWithPadding

In [47]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# The collator is plugged in as collate_fn, so padding is applied per batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Print the shape of the first 7 batches; the second dimension now varies.
for step, batch in zip(range(7), train_dataloader):
    print(batch["input_ids"].shape)

torch.Size([16, 72])
torch.Size([16, 66])
torch.Size([16, 81])
torch.Size([16, 76])
torch.Size([16, 111])
torch.Size([16, 82])
torch.Size([16, 82])


In [48]:
# Print the padded input ids of the first 7 batches.
for step, batch in zip(range(7), train_dataloader):
    print(batch["input_ids"])

tensor([[  101,   158,   119,  ...,     0,     0,     0],
        [  101,  5271,  2802,  ...,   119,   102,     0],
        [  101,  1109,  1207,  ...,     0,     0,     0],
        ...,
        [  101,   107,  2096,  ...,     0,     0,     0],
        [  101,  1109,  1148,  ...,     0,     0,     0],
        [  101, 16544,   117,  ...,     0,     0,     0]])
tensor([[  101,  9018,  1116,  ...,     0,     0,     0],
        [  101,  2545, 14812,  ...,     0,     0,     0],
        [  101,  1109,  1244,  ...,     0,     0,     0],
        ...,
        [  101,  1284,  1132,  ...,     0,     0,     0],
        [  101,  1291, 14099,  ...,     0,     0,     0],
        [  101,  6356,   117,  ...,     0,     0,     0]])
tensor([[  101, 11336, 20080,  ...,     0,     0,     0],
        [  101,  1960,  3878,  ...,     0,     0,     0],
        [  101,  1987, 15776,  ...,     0,     0,     0],
        ...,
        [  101,  1130,  1103,  ...,     0,     0,     0],
        [  101,  1109,  1160,  