In [48]:
import os
import requests
import tiktoken
import numpy as np
from tqdm import tqdm
from datasets import load_dataset # huggingface datasets

# Download the tiny shakespeare dataset, or reuse the local copy if one exists.
# os.getcwd() returns the current working directory, so input.txt is looked up
# (and saved) in the directory the kernel is running from.
input_file_path = os.path.join(os.getcwd(), 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    # Fetch BEFORE opening the output file: if the request fails, no empty
    # input.txt is left behind for the exists() guard above to pick up later.
    response = requests.get(data_url, timeout=30)
    # Fail loudly on an HTTP error instead of caching the error page as data.
    response.raise_for_status()
    # Explicit UTF-8 so the write does not depend on the platform's default encoding.
    with open(input_file_path, 'w', encoding='utf-8') as f:
        f.write(response.text)
    
# Read the corpus through the generic "text" builder of Hugging Face datasets.
dataset = load_dataset("text", data_files="input.txt")

# A single input.txt only yields a 'train' split, so hold out 10% of its rows
# (shuffled with a fixed seed) to serve as evaluation data.
split_dataset = dataset["train"].train_test_split(
    shuffle=True,
    seed=2357,
    test_size=0.1,
)
# train_test_split names the held-out part 'test'; store it under 'val' instead.
split_dataset['val'] = split_dataset.pop('test')

# Show every split together with its summary (features and row count).
print(split_dataset.items(), '\n')

# Peek at a single training row; each row is a dict holding one 'text' string.
print(split_dataset['train'][10]['text'])
# Valid row indices are 0..35999 for split_dataset['train'] and 0..3999 for
# split_dataset['val'] (36000 and 4000 rows respectively).
# e.g. split_dataset['train'][10]['text'] is "often give us ..."; the original
# note also lists "thee, and ..." as a sample 'val' row (row index not recorded).

dict_items([('train', Dataset({
    features: ['text'],
    num_rows: 36000
})), ('val', Dataset({
    features: ['text'],
    num_rows: 4000
}))]) 

often give us soldiers the lie: but we pay them for


In [51]:
# GPT-2 byte-pair-encoding tokenizer from tiktoken, used by process() below.
enc = tiktoken.get_encoding("gpt2")
# Number of worker processes handed to the dataset map() call for tokenization.
num_proc = 2

def process(example):
    """Tokenize one dataset row with the module-level encoder ``enc``.

    Args:
        example: mapping with a 'text' entry holding the string to encode.

    Returns:
        dict with 'ids' (the token ids of the text followed by the encoder's
        end-of-text token) and 'len' (the number of ids, end-of-text included).
    """
    # encode_ordinary tokenizes without treating any substring as a special
    # token; the end-of-text id is appended explicitly after the text tokens.
    token_ids = [*enc.encode_ordinary(example['text']), enc.eot_token]
    return {'ids': token_ids, 'len': len(token_ids)}

# Run process() over every row of each split, dropping the raw 'text' column so
# that only the 'ids' and 'len' fields returned by process() remain.
tokenized = split_dataset.map(
    process,
    num_proc=num_proc,
    desc="tokenizing the splits",
    remove_columns=['text'],
)
# Print the splits before and after tokenization to compare their features.
for _splits in (split_dataset, tokenized):
    print(_splits.items(), '\n')



dict_items([('train', Dataset({
    features: ['text'],
    num_rows: 36000
})), ('val', Dataset({
    features: ['text'],
    num_rows: 4000
}))]) 

dict_items([('train', Dataset({
    features: ['ids', 'len'],
    num_rows: 36000
})), ('val', Dataset({
    features: ['ids', 'len'],
    num_rows: 4000
}))]) 



In [None]:
# Show each tokenized split with its summary (features and row count).
print(tokenized.items())