In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from datasets import load_dataset
import datasets
from transformers import AutoTokenizer

In [3]:
# Load the English-French book corpus; it only ships a "train" split
dataset = load_dataset("opus_books", "en-fr")["train"]
# Carve out 20% as "test". The split is seeded so that the train/test
# membership (and every output below) is the same after Restart & Run All.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
# Tokenizer matching the t5-small checkpoint
tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [4]:
# Batched tokenization callback used with `dataset.map`
def tokenize_function(dataset):
    """Tokenize a batch of English/French translation pairs.

    `dataset` is a batch whose "translation" entry is a list of
    {"en": ..., "fr": ...} dictionaries. The English text is the model
    input and the French text is passed as `text_target`, which makes the
    tokenizer also return a "labels" entry holding the target token ids.
    Every sequence is truncated/padded to exactly 128 tokens.
    """
    english_texts = []
    french_texts = []
    for pair in dataset["translation"]:
        english_texts.append(pair["en"])
        french_texts.append(pair["fr"])

    return tokenizer(
        english_texts,
        text_target=french_texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

# Tokenize both splits; batched=True hands the callback many rows at once
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [12]:
# Toy example: shows the 'input_ids', 'attention_mask' and 'labels' entries
# produced when a target text is supplied alongside the input text.
print(
    tokenizer(
        "b b b",
        text_target="banana bbb",
        truncation=True,
        max_length=128,
        padding="max_length",
    )
)

{'input_ids': [3, 115, 3, 115, 3, 115, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [13634, 3, 115, 115, 115, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [5]:
# Keep a fixed, shuffled 1000-example subset of each split for quick experiments
small_train_dataset, small_eval_dataset = (
    tokenized_dataset[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ("train", "test")
)

In [6]:
# Schema and size of the subset
print(small_train_dataset)

# Index by row first, then by field: `ds[0]["id"]` reads one row, whereas
# `ds["id"][0]` asks for the whole column just to look at one value.
first_example = small_train_dataset[0]
print(first_example["id"])
print(first_example["input_ids"])
print(first_example["translation"]["en"])

print(first_example["labels"])
print(first_example["translation"]["fr"])

# Attention masks of the first two examples (1 = real token, 0 = padding)
print(first_example["attention_mask"])
print(small_train_dataset[1]["attention_mask"])

Dataset({
    features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})
46721
[4163, 7377, 3, 7361, 4125, 6, 4644, 924, 11, 3550, 10812, 15, 26, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Felton remained standing, motionless and undecided.
[4163, 7377, 340, 1238, 9, 155, 20, 4076, 17, 6, 256, 14814, 3, 15, 17, 16, 3764, 75, 159, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Fe

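You have to convert `input_ids`, `labels`, and `attention_mask` to tensors before they reach the `DataLoader`.
Otherwise, PyTorch's default collate function treats each Python list as a sequence and collates it position by position, so a batch comes out as a list of 128 tensors of length `batch_size` instead of a single `(batch_size, 128)` tensor.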

In [140]:
class MyCustomDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over a table of tokenized translation pairs.

    The wrapped object must support ``len(table)`` and ``table[i]``, where
    ``table[i]`` returns row ``i`` as a dict with the keys ``id``,
    ``translation``, ``input_ids``, ``labels`` and ``attention_mask``
    (a Hugging Face ``Dataset`` or a plain list of dicts both qualify).
    """

    def __init__(self, your_data_here):
        """Keep a reference to the underlying table; nothing is copied."""
        self.data = your_data_here

    def __len__(self):
        """Return the number of examples in the wrapped table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return example ``idx`` with its token fields converted to tensors.

        ``id`` and ``translation`` are passed through unchanged; the three
        token lists become tensors so that the DataLoader's default
        collation stacks them into one tensor per batch.
        """
        # Fetch the row once. Going through the column first
        # (``self.data['id'][idx]``) requests an entire column for every
        # field of every item, which is needlessly expensive.
        row = self.data[idx]
        return {
            'id': row['id'],
            'translation': row['translation'],
            'input_ids': torch.tensor(row['input_ids']),
            'labels': torch.tensor(row['labels']),
            'attention_mask': torch.tensor(row['attention_mask']),
        }
    
# Wrap the small training subset and serve it in mini-batches of 32 examples
train_dataset = MyCustomDataset(small_train_dataset)
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=32,
)

In [2]:
# Peek at the first mini-batch only: list its keys, then show the first
# example of every field.
for i, data_batch in enumerate(train_dataloader):
    print(data_batch.keys())

    # After collation "translation" is a dict with one list per language
    translation_batch = data_batch["translation"]
    index = data_batch["id"]
    english, french = translation_batch["en"], translation_batch["fr"]
    source_token, target_token = data_batch["input_ids"], data_batch["labels"]
    masks = data_batch["attention_mask"]

    for field in (index, english, french, source_token, target_token, masks):
        print(field[0])

    break  # one batch is enough for inspection

NameError: name 'train_dataloader' is not defined

All together:

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from datasets import load_dataset
import datasets
from transformers import AutoTokenizer

# Load the English-French book corpus; it only ships a "train" split
dataset = load_dataset("opus_books", "en-fr")["train"]
# Carve out 20% as "test". The split is seeded so that the train/test
# membership is the same on every run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
# Tokenizer matching the t5-small checkpoint
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Batched tokenization callback used with `dataset.map`
def tokenize_function(dataset):
    """Tokenize a batch of English/French translation pairs.

    `dataset` is a batch whose "translation" entry is a list of
    {"en": ..., "fr": ...} dictionaries. English is the input text and
    French is passed as `text_target`. Every sequence is truncated/padded
    to exactly 128 tokens.
    """
    english_texts = []
    french_texts = []
    for pair in dataset["translation"]:
        english_texts.append(pair["en"])
        french_texts.append(pair["fr"])

    return tokenizer(
        english_texts,
        text_target=french_texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

# Tokenize both splits; batched=True hands the callback many rows at once
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

# Keep a fixed, shuffled 1000-example subset of each split for quick experiments
small_train_dataset, small_eval_dataset = (
    tokenized_dataset[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ("train", "test")
)


class MyCustomDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over a table of tokenized translation pairs.

    The wrapped object must support ``len(table)`` and ``table[i]``, where
    ``table[i]`` returns row ``i`` as a dict with the keys ``id``,
    ``translation``, ``input_ids``, ``labels`` and ``attention_mask``
    (a Hugging Face ``Dataset`` or a plain list of dicts both qualify).
    """

    def __init__(self, your_data_here):
        """Keep a reference to the underlying table; nothing is copied."""
        self.data = your_data_here

    def __len__(self):
        """Return the number of examples in the wrapped table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return example ``idx`` with its token fields converted to tensors.

        ``id`` and ``translation`` are passed through unchanged; the three
        token lists become tensors so that the DataLoader's default
        collation stacks them into one tensor per batch.
        """
        # Fetch the row once. Going through the column first
        # (``self.data['id'][idx]``) requests an entire column for every
        # field of every item, which is needlessly expensive.
        row = self.data[idx]
        return {
            'id': row['id'],
            'translation': row['translation'],
            'input_ids': torch.tensor(row['input_ids']),
            'labels': torch.tensor(row['labels']),
            'attention_mask': torch.tensor(row['attention_mask']),
        }
    
# Wrap the small training subset and serve it in mini-batches of 32 examples
train_dataset = MyCustomDataset(small_train_dataset)
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=32,
)


# Walk over every mini-batch and unpack it into named fields
for i, data_batch in enumerate(train_dataloader):
    # After collation "translation" is a dict with one list per language
    translation_batch = data_batch["translation"]
    index = data_batch["id"]
    english, french = translation_batch["en"], translation_batch["fr"]
    source_token, target_token = data_batch["input_ids"], data_batch["labels"]
    masks = data_batch["attention_mask"]