In [1]:
import os
from tqdm import tqdm
import pandas as pd
import argparse
from tokenizers import SentencePieceBPETokenizer
from transformers import PreTrainedTokenizerFast
import argparse
import datetime
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast

def train_byte_level_tokenizer(data_list, vocab_size=32768, model_name="./urdu_byte_tokenizer_v2"):
    ## Set special tokens
    bos_tok = "<sos>"
    eos_tok = "<end_of_sen>"
    
    # Special characters, add more as needed
    special_char = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    
    # Initialize ByteLevelBPETokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Train the tokenizer
    tokenizer.train_from_iterator(
        data_list,
        vocab_size=vocab_size,
        min_frequency=5,
        special_tokens=["<pad>", "<unk>", bos_tok, eos_tok, "<user>", "<assistant>"] + special_char
    )

    # Wrap with transformers PreTrainedTokenizerFast
    transformer_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token=bos_tok,
        eos_token=eos_tok,
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        padding_side="left",
        truncation_side="right",
        additional_special_tokens=["<user>", "<assistant>"],
        clean_up_tokenization_spaces=False,
    )

    # Save the tokenizer
    transformer_tokenizer.save_pretrained(model_name)
    return transformer_tokenizer


In [1]:
import os
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast
import pandas as pd

# Define the paths and target size for data sampling
data_path = "/Users/pranav/Desktop/nlp-tokeniser/scraped_data"
target_size = 500 * 1024 * 1024  # 500 MB in bytes

# Step 1: Load 500 MB of data
def load_data_subset(data_path, target_size):
    data = []
    current_size = 0
    for filename in os.listdir(data_path):
        file_path = os.path.join(data_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            for line in lines:
                if current_size >= target_size:
                    return data
                data.append(line.strip())
                current_size += len(line.encode('utf-8'))
    return data

# Load 500 MB of data for training
data_sample = load_data_subset(data_path, target_size)

# Step 2: Train the ByteLevelBPETokenizer
def train_tokenizer(data, vocab_size=32768, model_name="./urdu_tokenizer_v3"):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(data, vocab_size=vocab_size, min_frequency=5, special_tokens=["<pad>", "<unk>", "<sos>", "<end_of_sen>"])
    return PreTrainedTokenizerFast(tokenizer_object=tokenizer)

# Train the tokenizer on the sampled data
tokenizer = train_tokenizer(data_sample)

# Step 3: Calculate fertility score on the entire dataset
def calculate_fertility_score_dataset(tokenizer, data):
    total_words = 0
    total_tokens = 0
    for line in data:
        words = line.split()
        tokens = tokenizer.tokenize(line)
        total_words += len(words)
        total_tokens += len(tokens)
    return total_tokens / total_words if total_words > 0 else 0

# Calculate fertility score for the entire dataset
fertility_score = calculate_fertility_score_dataset(tokenizer, data_sample)

# Step 4: Display results in a matrix
results = pd.DataFrame({
    "Tokenizer": ["ByteLevelBPETokenizer"],
    "Fertility Score": [fertility_score],
    "Dataset Size (MB)": [target_size / (1024 * 1024)]
})

print("Fertility Score Matrix:")
print(results)


  from .autonotebook import tqdm as notebook_tqdm





Fertility Score Matrix:
               Tokenizer  Fertility Score  Dataset Size (MB)
0  ByteLevelBPETokenizer         1.126726              500.0
