## Playing with the `datasets` library

#### Processing the data


In [None]:
# load the "mrpc" configuration of the "glue" dataset with the Hugging Face `datasets` library
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc") # sentence-pair dataset (columns sentence1 / sentence2)

print(raw_datasets)

In [None]:
# pull the train / validation / test splits out of the dataset dictionary
train_datasets, validation_datasets, test_datasets = (
    raw_datasets[split_name] for split_name in ("train", "validation", "test")
)

# one summary per line, in the same order as above
print(train_datasets, validation_datasets, test_datasets, sep="\n")

In [None]:
# show the features, i.e. the details of every column
# this also tells us which integer corresponds to which label name
print(train_datasets.features)

# for example, here 'not_equivalent' corresponds to 0 and 'equivalent' to 1

In [None]:
# access the first element of the train set (select the first row, index 0)
train_datasets[0]

In [None]:
# access the first 5 elements of the train set (select rows 0 to 4)
train_datasets[:5]

In [None]:
# access the element at index 87 of the validation set
# (indices are zero-based, so this is the 88th row)
validation_datasets[87]

In [None]:
# access the element at index 14 of the test set
# (indices are zero-based, so this is the 15th row)
test_datasets[14]

#### Preprocessing a dataset


In [None]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"

# load the tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# tokenize each column separately:
# a list of sentences goes in (batching), and every sentence is encoded on its own
# list(...) turns the column into a plain Python list first; depending on the
# installed `datasets` release, indexing a split by column name may return a
# lazy column object that the tokenizer does not accept as text input
train_split = raw_datasets["train"]
tokenized_sentences_1 = tokenizer(list(train_split["sentence1"])) # every sentence1 of the train split
tokenized_sentences_2 = tokenizer(list(train_split["sentence2"])) # every sentence2 of the train split

# number of rows in the train split, i.e. how many sentences each call above encoded
print(train_split.num_rows)

In [None]:
# we can also tokenize a pair of sentences, for example:
tokenizer("This is the first sentence.", "This is the second one.")

# note: this is different from tokenizing a list of sentences (here the two sentences are concatenated into one list).

In [None]:
# another example: row 15 of the train split, tokenized in two ways
pair = raw_datasets["train"][15]
first_sentence, second_sentence = pair["sentence1"], pair["sentence2"]

# way 1: each sentence on its own
tokenized_sentences_1 = tokenizer(first_sentence)
tokenized_sentences_2 = tokenizer(second_sentence)
print(tokenized_sentences_1, tokenized_sentences_2, sep="\n")

# way 2: both sentences together, as one pair
pair_tokenized_sentences = tokenizer(first_sentence, second_sentence)
print(pair_tokenized_sentences)

In [None]:
# let's convert the input ids of the pair back to their tokens
print(tokenizer.convert_ids_to_tokens(pair_tokenized_sentences["input_ids"]))

In [None]:
# let's decode the input ids of the pair back to text
print(tokenizer.decode(pair_tokenized_sentences["input_ids"]))

In [None]:
# we can also tokenize a pair of lists (sentence1[i] is paired with sentence2[i])
# slice the rows first: only the two needed rows are read, and the columns of a
# row slice come back as plain Python lists, which is what the tokenizer expects
first_two_rows = raw_datasets["train"][:2]
tokenizer(first_two_rows["sentence1"], first_two_rows["sentence2"])

# really wow!

In [None]:
# now let's tokenize the whole train dataset
# the inefficient way: everything is encoded in one call, with padding=True and
# return_tensors="pt", so the full padded result is held in memory as tensors
# list(...) makes sure the tokenizer receives plain Python lists, whatever
# object the installed `datasets` release returns for a column
tokenized_train_dataset = tokenizer(list(raw_datasets["train"]["sentence1"]),
                                    list(raw_datasets["train"]["sentence2"]),
                                    padding=True,
                                    truncation=True,
                                    return_tensors="pt")

In [None]:
# the efficient way, using map():
def tokenizing(example):
    """Tokenize the sentence1 / sentence2 pair(s) held in ``example``.

    ``example`` is indexed with the keys "sentence1" and "sentence2"; both
    values are passed together to the notebook-level ``tokenizer`` with
    truncation enabled, and the tokenizer's result is returned as is.
    """
    first, second = example["sentence1"], example["sentence2"]
    # note: padding=True is deliberately not passed here yet
    encoded = tokenizer(first, second, truncation=True)
    return encoded

# apply tokenizing() to the raw datasets with map()
# batched=True makes the tokenization process very fast
tokenized_datasets = raw_datasets.map(function=tokenizing, batched=True)
tokenized_datasets

In [None]:
# after using map(), new columns are added to the dataset
# here are the features before using map()
raw_datasets["train"].features

In [None]:
# and here are the features after using map()
tokenized_datasets["train"].features

# as we can see, the new columns are: input_ids, token_type_ids, attention_mask (these columns come from the tokenizer)

#### Dynamic Padding

In [None]:
# we need to pad according to the longest sample in the current batch of samples,
# not according to the longest sample in the whole dataset
# this approach keeps memory usage safe
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# pick the first 8 samples of the tokenized train split
samples = tokenized_datasets["train"][:8]

# keep every column except idx, sentence1 and sentence2: we don't need them
# anymore, and some of them are strings while tensors only accept numbers
samples = {
    column: values
    for column, values in samples.items()
    if column not in ("idx", "sentence1", "sentence2")
}

# length of each sample's input ids (before padding)
list(map(len, samples["input_ids"]))

In [None]:
# collate function (responsible for batching samples together and padding them)
batch = data_collator(samples)

# print the shapes explicitly: a notebook cell only displays its last expression,
# so a bare expression in the middle of the cell would show nothing
print({k: v.shape for k, v in batch.items()})

# first two rows of every field; each value is sliced on its own because
# slicing the batch object itself is not supported by every transformers release
{k: v[:2] for k, v in batch.items()}

In [None]:
# so, putting the previous steps together, we can say:
# (tokenizing() is still the function defined in an earlier cell)
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

checkpoint = "bert-base-uncased"

# tokenizer and model are both loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

raw_datasets = load_dataset("glue", "mrpc")

# tokenize every split (no padding yet), then keep the train split
tokenized_datasets = raw_datasets.map(function=tokenizing, batched=True)
tokenized_train_dataset = tokenized_datasets["train"]

# padding is applied later, per batch, by the collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# drop idx, sentence1 and sentence2 before collating
# only the columns that are still present are removed, so re-running this cell
# does not fail once they are already gone
unused_columns = [
    name
    for name in ("idx", "sentence1", "sentence2")
    if name in tokenized_train_dataset.column_names
]
if unused_columns:
    tokenized_train_dataset = tokenized_train_dataset.remove_columns(unused_columns)

# NOTE: [:] selects every row, so the whole train split is collated as one batch
batch = data_collator(tokenized_train_dataset[:])
batch

In [None]:
# shapes of the first 8 rows of every field; each value is sliced on its own
# because slicing the batch object itself is not supported by every transformers release
{k: v[:8].shape for k, v in batch.items()}

In [None]:
# so we don't pad the sentences during tokenization; we pad afterwards, by applying dynamic padding with DataCollatorWithPadding.
# the reason is simple: it saves memory by avoiding unnecessary computation and unnecessary allocation.

In [None]:
# another example of a dataset
from datasets import load_dataset

new_dataset = load_dataset("stanfordnlp/sst2") # single-sentence dataset
new_dataset["train"].features

In [None]:
# we can also specify which split to load when loading the dataset:
train_dataset = load_dataset("rotten_tomatoes", split="train") # this loads only the train split
train_dataset