# Load Hugging Face dataset

In [3]:
# Load data

from datasets import load_dataset

# Load the MRPC dataset, one of the 10 datasets composing the GLUE benchmark, which is used to measure model performance.
# NOTE: the recorded run of this cell hit a ReadTimeout against huggingface.co, so the call needs network access.
raw_datasets = load_dataset("glue", "mrpc")
print(raw_datasets)

#=============================================
# !pip install datasets  --> installs the Hugging Face `datasets` library
# The loaded dataset contains the training set, the validation set, and the test set.
# Each set contains 4 columns (sentence1, sentence2, label, and idx) and a variable number of rows.
#=============================================


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


ReadTimeout: (ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=100.0)"), '(Request ID: 104ca5fd-925f-4243-afa2-a8f0efb40e3a)')

# Explore dataset

In [None]:
# Explore data

# Check a single row of data
raw_train_dataset = raw_datasets["train"]
print (raw_train_dataset[0])

# Understand the columns in the data
raw_train_dataset.features  # returns a dictionary of columns and their corresponding column types

#==================================================================
# `features` returns a dictionary of columns and their corresponding column types: dict[column_name, column_type].
# For example: 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None)
#      Column 'label' has the ClassLabel column type, so the integers 0 and 1 each represent a class:
#      0 = 'not_equivalent', 1 = 'equivalent'
# The column type objects offer additional functions depending on the type. Taking the "ClassLabel" type above as an example:
#      raw_train_dataset.features['label'].int2str(0)                  --> convert a label integer to its string name
#      raw_train_dataset.features['label'].str2int('not_equivalent')   --> convert a label string name to its integer
#==================================================================

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}


{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

# Tokenization

In [None]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize the first training example as a sentence pair.
first_example = raw_train_dataset[0]
tokenizer(
    first_example["sentence1"],
    first_example["sentence2"],
    padding=True,
    truncation=True,
)

#==================================================================
# The two sentences are passed as two separate arguments (not wrapped together in a list),
# so the tokenizer treats them as a pair: it concatenates them into one sequence and adds a
# [SEP] token at the end of each sentence.
# 'input_ids'      : token ids, with the special tokens already added.
# 'token_type_ids' : which sentence each token comes from (0 = first sentence, 1 = second sentence).
# 'attention_mask' : used to filter out padding tokens (all 1s here, because a single pair needs no padding).
#==================================================================

{'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Tokenize all sentence pairs of the training split in one call.
# list(...) guarantees the tokenizer receives plain Python lists of strings even if column
# access returns a lazy column object instead of a list (this varies across `datasets`
# versions -- confirm for the installed one); on a plain list it is just a cheap copy.
tokenized_train_dataset = tokenizer(
    list(raw_train_dataset["sentence1"]),
    list(raw_train_dataset["sentence2"]),
    padding=True,
    truncation=True,
)

# Given two separate lists, tokenizer() matches them element by element and tokenizes each matched pair.
# With padding=True every pair is padded to the longest pair in the whole call, i.e. the whole training split here.

In [None]:
# Use the map() method to apply tokenization to every element of the dataset.

# Define a tokenization function that takes a dictionary of dataset columns and returns the tokenization result for the sentence pairs.
def tokenize_function(example):
    """Tokenize the sentence1/sentence2 pair(s) held in `example`, truncating but not padding."""
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    # Padding is deliberately left out here; it is applied later, per batch (dynamic padding).
    return tokenizer(first_sentences, second_sentences, truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets)

#==================================================================
# 1. batched=True in this call to map() means the function is applied to
#    multiple elements of the dataset at once, not to each element separately.
#    This allows for faster preprocessing.
# 2. The columns produced by tokenizer() (input_ids, attention_mask, and token_type_ids) are added to
#    the original dataset, alongside the original columns.
# 3. We could use multiprocessing by passing a num_proc argument to map() to speed up the processing.
#    We didn’t do it here because the Tokenizers library already uses multiple threads for faster processing.
#==================================================================

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


# Dynamic Padding

In [None]:
# Dynamic padding: pad each batch to the length of its own longest sequence,
# instead of to the longest sequence of the whole dataset.


## Baseline: tokenize (and pad) the whole set in one call
raw_text1 = "This movie is scary!"
raw_text2 = "This movie is scary! Great!"
raw_text3 = "This movie is scary! I love it!"

whole_set = [raw_text1, raw_text2, raw_text3]
output = tokenizer(whole_set, padding=True, truncation=True)

# Show the tokens of the first (shortest) text after padding against the whole set.
whole_set_first_tokens = tokenizer.convert_ids_to_tokens(output["input_ids"][0])
print("whole_datas 1st tokens", whole_set_first_tokens, sep=":")


# Apply DataCollatorWithPadding to a batch of tokenization outputs.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
#==================================================================
# 1. DataCollatorWithPadding is applied on top of a batch of tokenization outputs, padding each item to the maximum length in the batch.
# 2. At instantiation, it takes a tokenizer as input, to know which padding token to use
#    and whether the model expects padding to be on the left or on the right of the inputs.
# 3. When we use it, the input must be the result of tokenization, i.e. a dictionary that includes an "input_ids" key.
#    (It searches for the key name "input_ids" in the input dictionary.)
#==================================================================

# Batch 1: tokenize without padding, then let the collator pad to this batch's longest item.
mini_batch1 = tokenizer([raw_text1, raw_text2])
output_batch1 = data_collator(mini_batch1)
batch1_first_tokens = tokenizer.convert_ids_to_tokens(output_batch1["input_ids"][0])

# Batch 2: same first text, but paired with a longer second text.
mini_batch2 = tokenizer([raw_text1, raw_text3])
output_batch2 = data_collator(mini_batch2)
batch2_first_tokens = tokenizer.convert_ids_to_tokens(output_batch2["input_ids"][0])

# The same text ends up with a different amount of padding depending on its batch.
print("mini_batch1 1st tokens", batch1_first_tokens, sep=":")
print("mini_batch2 1st tokens", batch2_first_tokens, sep=":")

whole_datas 1st tokens:['[CLS]', 'this', 'movie', 'is', 'scary', '!', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
mini_batch1 1st tokens:['[CLS]', 'this', 'movie', 'is', 'scary', '!', '[SEP]', '[PAD]', '[PAD]']
mini_batch2 1st tokens:['[CLS]', 'this', 'movie', 'is', 'scary', '!', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [None]:
# Apply dynamic padding to a small batch taken from the "mrpc" training split

# Take the first 8 tokenized training rows as a dict of column name -> values.
first_eight = tokenized_datasets["train"][:8]

# Keep the tokenizer outputs and the label; drop the raw sentences and the row index.
dropped_columns = ("idx", "sentence1", "sentence2")
samples = {
    column: values
    for column, values in first_eight.items()
    if column not in dropped_columns
}

batch = data_collator(samples)

# Display the shape of every collated entry.
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}