In [107]:
# Use HuggingFace libraries
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the a hugging face dataset
dataset = load_dataset('ag_news') # test dataset
ag_news_dict = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}
print(dataset)

# Load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') #wordpiece


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [106]:
# Study vocabulary

# Get the size of the tokenizer's vocabulary
vocab_size = len(tokenizer.vocab)
print(f"Total number of vocabulary in the tokenizer: {vocab_size}")

# Tokenize all text data and count unique tokens
unique_tokens = set()

for example in dataset['test']:
    tokens = tokenizer.tokenize(example['text'])
    unique_tokens.update(tokens)

print(f"Total number of unique tokens in the test dataset: {len(unique_tokens)}")


Total number of vocabulary in the tokenizer: 30522
Total number of unique tokens in the test dataset: 16837


In [108]:
# silly function to explore the dataset
class Explore:
    def __init__(self, dataset, tokenizer, label_dict):
        self.iterator = iter(dataset)
        self.tokenizer = tokenizer
        self.label_dict = label_dict
        
    def __call__(self):
        d = next(self.iterator)
        # Assuming 'text' and 'label' are keys in the dataset dictionary
        print(d['text'])
        print('\n Label:', d['label'], "(", self.label_dict[d['label']],")")

        print('\n Tokenized text:', self.tokenizer.tokenize(d['text']))
    
        print('\n Tokenize encoding:', self.tokenizer.encode(d['text']))

        print('\n Tokenize call:', self.tokenizer(d['text']))


explore_train = Explore(dataset['train'], tokenizer, ag_news_dict)

In [109]:
# run iteratively to explore the dataset
explore_train()


Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.

 Label: 2 ( Sports )

 Tokenized text: ['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short', '-', 'sellers', ',', 'wall', 'street', "'", 's', 'd', '##wind', '##ling', '\\', 'band', 'of', 'ultra', '-', 'cy', '##nic', '##s', ',', 'are', 'seeing', 'green', 'again', '.']

 Tokenize encoding: [101, 2813, 2358, 1012, 6468, 15020, 2067, 2046, 1996, 2304, 1006, 26665, 1007, 26665, 1011, 2460, 1011, 19041, 1010, 2813, 2395, 1005, 1055, 1040, 11101, 2989, 1032, 2316, 1997, 11087, 1011, 22330, 8713, 2015, 1010, 2024, 3773, 2665, 2153, 1012, 102]

 Tokenize call: {'input_ids': [101, 2813, 2358, 1012, 6468, 15020, 2067, 2046, 1996, 2304, 1006, 26665, 1007, 26665, 1011, 2460, 1011, 19041, 1010, 2813, 2395, 1005, 1055, 1040, 11101, 2989, 1032, 2316, 1997, 11087, 1011, 22330, 8713, 2015, 1010, 2024

In [85]:
def encode_data(examples):
    # Tokenize the text and include labels in the output
    # Truncation and padding are applied to keep text lengths uniform
    # tokenizer() 
    encoding = tokenizer(examples['text'], truncation=True, padding='max_length')
    return encoding

# Apply the function to tokenize the dataset and include labels
tokenized_dataset = dataset.map(encode_data, batched=True)

#tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
from torch.utils.data import DataLoader

# Define the DataLoader for training data
train_loader = DataLoader(tokenized_dataset['train'], batch_size=16, shuffle=True)
it_train = iter(train_loader)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [100]:
print(next(it_train))

{'text': ['PARK #39;S STRONG FINISH Korean Grace Park shot five under par over her final five holes to set a course record and snatch the third-round lead away from Jill McGill at the Wachovia LPGA Classic.', 'IBM announces eServer i5 550 The eServer i5 550 comes with a feature called the  quot;Solution Edition, quot; which is apparently available with certain Independent Software Vendors. The eServer i5, allegedly, has new ways to handle and optimise multiple operating system sthrough the ...', 'Sunspot Grows to 20 Times Size of Earth (SPACE.com) SPACE.com - A sunspot group aimed squarely \\  at Earth has grown to 20 times the size of our planet and has the potential \\  to unleash a major solar storm.', "Message loud and clear A funny thing happened as Orlando Cabrera triumphantly rounded the bases after he knocked in Johnny Damon with his walkoff double off the Green Monster in Tuesday's 5-4 thriller over the Blue Jays. In the stands, Cabrera's wife, Eliana, noticed during the celeb