In [7]:
import os
import getpass

from datasets import load_dataset
from huggingface_hub import login
from transformers import AutoTokenizer
from dotenv import load_dotenv
import matplotlib.pyplot as plt

import datasets


## Create Chunked Dataset

In [8]:

# Read the local .envrc file first so a token stored there is visible below.
load_dotenv('.envrc')

# Authenticate with the Hugging Face Hub: prompt for a token only when
# HF_TOKEN is absent from the environment, otherwise use the one found there.
if 'HF_TOKEN' not in os.environ:
    login(token=getpass.getpass('Huggingface token: '))
else:
    login(token=os.environ['HF_TOKEN'])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [9]:
# Size of a chunk in the source dataset (presumably in tokens, matching the
# "-128" suffix of its name) and the size of the pieces each chunk is cut into.
CHUNK_SIZE = 128
SUBCHUNK_SIZE = 64

# A chunk has to split into a whole number of subchunks.
assert not CHUNK_SIZE % SUBCHUNK_SIZE
SUBCHUNK_RATIO = CHUNK_SIZE // SUBCHUNK_SIZE

# Name of the dataset to load, and the derived name for the subchunked
# variant: "chunked" becomes "subchunked" and the chunk size becomes
# "<subchunk size>x<subchunks per chunk>".
LARGE_CHUNKED_DATASET = 'MikiV/SimpleStories-SimpleStories-chunked-128'
SUBCHUNK_DATASET_NAME = (
    LARGE_CHUNKED_DATASET
    .replace('chunked', 'subchunked')
    .replace(str(CHUNK_SIZE), f'{SUBCHUNK_SIZE}x{SUBCHUNK_RATIO}')
)

In [10]:
# Load the chunked dataset named in the config cell, then display its summary
# (splits, features and row counts) as the cell output.
chunked_dataset = load_dataset(path=LARGE_CHUNKED_DATASET)
chunked_dataset


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3653829
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 152243
    })
})

In [None]:
subchunked_dataset = datasets.DatasetDict()
# The test split is carried over unchanged: it keeps its original chunk size
# and all of its original columns.
subchunked_dataset['test'] = chunked_dataset['test']


def split_into_subchunks(batch):
    """Split every `input_ids` sequence of a batch into SUBCHUNK_RATIO pieces.

    Args:
        batch: Mapping whose 'input_ids' entry is a list of sequences, i.e.
            the batched form that `Dataset.map(..., batched=True)` passes in.

    Returns:
        A dict with a single 'input_ids' list holding, for each input
        sequence in order, its SUBCHUNK_RATIO consecutive slices of
        SUBCHUNK_SIZE items each.
    """
    return {
        'input_ids': [
            ids[start:start + SUBCHUNK_SIZE]
            for ids in batch['input_ids']
            for start in range(0, SUBCHUNK_RATIO * SUBCHUNK_SIZE, SUBCHUNK_SIZE)
        ]
    }


# Create the subchunked training set by splitting each item in the original
# dataset into SUBCHUNK_RATIO items. A batched map may return more rows than
# it receives; the original columns are removed so that only the new
# 'input_ids' column (with the new row count) remains.
train_split = chunked_dataset['train']
subchunked_dataset['train'] = train_split.map(
    split_into_subchunks,
    batched=True,
    remove_columns=train_split.column_names,
)

# Every original row must have produced exactly SUBCHUNK_RATIO rows.
assert len(subchunked_dataset['train']) == SUBCHUNK_RATIO * len(train_split)
subchunked_dataset

## Dataset Exploration

In [None]:
# For each candidate length, report the percentage of entries in `lengths`
# that are at least that long.
n_stories = len(lengths)
for thresh in (64, 128, 256, 512):
    n_long_enough = sum(n_tokens >= thresh for n_tokens in lengths)
    x = n_long_enough / n_stories * 100
    print(f"{x}% of stories are at least {thresh} tokens")

99.9950370941761% of stories are at least 64 tokens
94.04508020055812% of stories are at least 128 tokens
52.60907049027838% of stories are at least 256 tokens
7.031728565918733% of stories are at least 512 tokens


In [10]:
# Show the tokenizer's vocabulary (token string -> integer id) as the cell output.
vocab = tokenizer.get_vocab()
vocab

{'gripped': 3351,
 'keep': 882,
 '##ide': 220,
 'warri': 3404,
 '##lls': 3480,
 'snowflakes': 3058,
 'lush': 3951,
 'face': 829,
 '##its': 3954,
 'exc': 398,
 'flo': 907,
 'wish': 845,
 'wrapped': 2296,
 'send': 2736,
 'planets': 2589,
 'war': 435,
 '##ilies': 3053,
 'treat': 2531,
 'rough': 3527,
 'writ': 2546,
 'burden': 3668,
 '##ugg': 1120,
 'branches': 1916,
 'snowball': 2847,
 'have': 589,
 '##eless': 3645,
 'cand': 1351,
 '##tle': 456,
 'climb': 824,
 '##uly': 2003,
 'shaking': 2332,
 'vic': 2575,
 'jean': 745,
 'happening': 2586,
 'working': 1823,
 'civ': 3128,
 'not': 191,
 'who': 427,
 'p': 47,
 'faces': 1507,
 'cle': 1642,
 'wr': 615,
 'simple': 2075,
 'loudly': 2241,
 'job': 2876,
 'wisely': 3113,
 'longing': 2561,
 'statue': 3312,
 'just': 300,
 'being': 905,
 'wat': 487,
 'empty': 1297,
 '##iny': 873,
 '##her': 148,
 'puddle': 3981,
 'hesit': 1752,
 'rec': 1512,
 'leader': 1384,
 'early': 3455,
 '##ms': 1413,
 'blooms': 3587,
 '##ieved': 4093,
 'surface': 2121,
 'phot': 2

In [11]:
# Over the train split of the chunked dataset, find the most common tokens
# and what percentage of all tokens each of them accounts for.
from collections import Counter

# `chunked_dataset` is a DatasetDict keyed by split name, so a split has to be
# selected before rows can be read. Counting row by row also avoids building
# one flat list that holds every token of the dataset at once.
token_counts = Counter()
for row in chunked_dataset['train']:
    token_counts.update(row['input_ids'])
total_tokens = sum(token_counts.values())
most_common = token_counts.most_common(20)
for token, count in most_common:
    print(f"Token: {token} ({tokenizer.decode([token])}), Count: {count}, Percentage: {count/total_tokens*100:.2f}%")

NameError: name 'chunked_dataset' is not defined