### Importing Libraries

In [None]:
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
from tokenizers import Tokenizer

### Importing Dataset

In [None]:
# Open the pre-training dataset with streaming=True; the tokenizing loop
# further down consumes it by iterating over it and reading item["text"].
# NOTE(review): "YourAccountName/PreTraining" and split="" look like template
# placeholders — presumably the real repo id and a split name (e.g. "train")
# must be filled in before this cell can run; confirm.
dataset = load_dataset("YourAccountName/PreTraining", split="", streaming=True)

### Load Tokenizer


In [None]:
# Load the tokenizer from the serialized "LumenTokenizer.json" file
# (relative path, so it is resolved against the current working directory).
tokenizer = Tokenizer.from_file("LumenTokenizer.json")

### Tokenizing Dataset

#### Tokenizing in chunks

In [None]:
# Tokenize the dataset and write the token ids to disk in numbered chunk
# files ("tokens_chunk_<n>.npy"), so the full token stream never has to be
# held in memory at once.

# Number of dataset samples (documents) stored in each chunk file.
chunk_size_samples = 2_000_000
chunk_counter = 0

# Look the end-of-text token up directly in the vocabulary. Encoding the
# literal string and taking ids[0] would silently return the id of an
# unrelated sub-token whenever "<|endoftext|>" is not a single token in this
# tokenizer, corrupting every document boundary in the output.
eos_id = tokenizer.token_to_id("<|endoftext|>")
if eos_id is None:
    raise ValueError(
        'Token "<|endoftext|>" was not found in the tokenizer vocabulary'
    )

# Token ids accumulated for the chunk currently being built, and the number
# of samples that contributed to it.
current_chunk = []
current_samples = 0

for item in tqdm(dataset, desc="Tokenizing"):
    # Special tokens are not inserted automatically; the document separator
    # is appended explicitly right after each sample's tokens.
    tokens = tokenizer.encode(item["text"], add_special_tokens=False).ids
    current_chunk.extend(tokens)
    current_chunk.append(eos_id)

    current_samples += 1
    if current_samples >= chunk_size_samples:
        # Chunk is full: flush it to disk as int32 and start a new one.
        np.save(f"tokens_chunk_{chunk_counter}.npy", np.array(current_chunk, dtype=np.int32))
        print(f"Saved chunk {chunk_counter} with {len(current_chunk)} tokens ({current_samples} samples)")
        chunk_counter += 1
        current_chunk = []
        current_samples = 0

# Flush whatever is left over after the dataset is exhausted (a partial chunk).
if current_chunk:
    np.save(f"tokens_chunk_{chunk_counter}.npy", np.array(current_chunk, dtype=np.int32))
    print(f"Saved final chunk {chunk_counter} with {len(current_chunk)} tokens ({current_samples} samples)")


#### Merging Chunks

In [None]:
# Chunk files to combine, in the order their tokens should appear.
chunks_to_merge = ["tokens_chunk_0.npy"]

# Read every listed chunk from disk into its own array.
all_tokens = [np.load(chunk_file) for chunk_file in chunks_to_merge]

# Join the per-chunk arrays end to end into a single token array.
merged_tokens = np.concatenate(all_tokens)

# Persist the combined array and report how many tokens it holds.
np.save("TokenizedDataSet.npy", merged_tokens)
print(f"Merged chunk saved with {len(merged_tokens)} tokens")

### Train and Validation Splitting


In [None]:
# Load the merged token array that the merging step saved as
# "TokenizedDataSet.npy". Note this rebinds all_tokens, which the merging
# step used for a list of per-chunk arrays, to a single array.
all_tokens = np.load("TokenizedDataSet.npy")

In [None]:
# Despite its name, split_ratio is an index: the position of the first
# validation token. The leading 90% of the token array goes to training and
# the remaining tail to validation.
split_ratio = int(len(all_tokens) * 0.9)
train_split, val_split = np.split(all_tokens, [split_ratio])

In [None]:
print("Saving Training and Validation Data")
# Write each split to its own .npy file, training split first.
for out_path, split_tokens in (
    ("train_split.npy", train_split),
    ("val_split.npy", val_split),
):
    np.save(out_path, split_tokens)
print("Training and Validation Data Saved!")