In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

from huggingface_hub import login

# Read the Hugging Face token from the process environment; load_dotenv()
# above has already merged any variables from a local .env file into it.
hf_token = os.getenv("HF_TOKEN")

# `not hf_token` also rejects an empty value (e.g. `HF_TOKEN=` in .env),
# which `is None` would let through to login() as if it were a real token.
if not hf_token:
    raise ValueError(
        "HF_TOKEN not found. Set it as an environment variable or in a .env file."
    )

# Authenticate this session against the Hugging Face Hub.
login(token=hf_token)

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
import re
import unicodedata

# Typographic punctuation that NFKC leaves untouched, mapped to its ASCII
# equivalent so it is not discarded by the non-ASCII filter in normalize_text.
_ASCII_PUNCTUATION = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / smart apostrophe
    "\u201C": '"',   # left double quotation mark
    "\u201D": '"',   # right double quotation mark
    "\u2010": "-",   # hyphen
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
    "\u2212": "-",   # minus sign
})


def normalize_text(text: str) -> str:
    """
    Reduces a string to whitespace-collapsed ASCII.

    Steps, in order:
      1. NFKC Unicode normalization (folds compatibility characters, e.g. the
         single-character ellipsis becomes "...").
      2. Curly quotes, dashes and the minus sign are replaced by their ASCII
         counterparts. Both the opening and the closing single quote are
         handled, so text wrapped in curly single quotes keeps both quotes.
      3. Every remaining run of non-ASCII characters becomes a single space.
      4. Runs of whitespace are collapsed to one space and the ends trimmed.

    Args:
        text (str): Raw input text.

    Returns:
        str: ASCII-only text with single spaces and no leading/trailing
        whitespace (may be empty).
    """
    text = unicodedata.normalize("NFKC", text)
    text = text.translate(_ASCII_PUNCTUATION)
    # Anything still outside 7-bit ASCII has no mapping above; blank it out.
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    return re.sub(r"\s+", " ", text).strip()

In [5]:
import pandas as pd

def load_corpus(file_path: str, text_column: str = "v2", encoding: str = "latin-1") -> list:
    """
    Reads one text column from a CSV file and normalizes every entry.

    Args:
        file_path (str): Path to the CSV file.
        text_column (str): Name of the column holding the text. Defaults to
            "v2", the column this notebook reads from data/spam.csv.
        encoding (str): Encoding passed to pandas.read_csv. Defaults to
            "latin-1".

    Returns:
        list: One normalized string per non-null row of `text_column`, in
        file order.

    Raises:
        KeyError: If `text_column` is not a column of the CSV.
    """
    df = pd.read_csv(file_path, encoding=encoding)
    # Drop missing rows first so NaN is not stringified to "nan".
    messages = df[text_column].dropna().astype(str)
    return [normalize_text(message) for message in messages]

# Load the normalized message corpus from the CSV, then print its size and the
# first sample as a quick sanity check. `corpus` is reused when training the tokenizer.
corpus = load_corpus("data/spam.csv")
print(f"✅ Loaded {len(corpus)} samples")
print(f"Example: {corpus[0]}")

✅ Loaded 5572 samples
Example: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [7]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
import os

def train_tokenizer(
    corpus: list,
    output_dir: str,
    vocab_size: int = 10000,
    min_frequency: int = 2,
    special_tokens: tuple = ("<s>", "<pad>", "</s>", "<unk>", "<mask>"),
) -> ByteLevelBPETokenizer:
    """
    Trains a Byte-Level BPE tokenizer on a given corpus and saves it to disk.

    Args:
        corpus (list): List of text strings.
        output_dir (str): Directory to save tokenizer files; created if missing.
        vocab_size (int): Vocabulary size to train on.
        min_frequency (int): Value forwarded to the trainer's `min_frequency`
            argument. Defaults to 2.
        special_tokens (tuple): Special tokens passed to the trainer, in this
            order. The defaults match the tokens load_fast_tokenizer declares.

    Returns:
        ByteLevelBPETokenizer: Trained tokenizer.
    """
    os.makedirs(output_dir, exist_ok=True)

    tokenizer = ByteLevelBPETokenizer()
    # Replace the pre-tokenizer installed by the constructor with a
    # default-configured ByteLevel().
    # NOTE(review): this reaches into the private `_tokenizer` attribute and may
    # change the prefix-space setting - confirm it is intended.
    tokenizer._tokenizer.pre_tokenizer = ByteLevel()

    tokenizer.train_from_iterator(
        corpus,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        special_tokens=list(special_tokens),  # the trainer is given a fresh list
    )

    # Save the model files, plus the single-file tokenizer.json that
    # load_fast_tokenizer passes to PreTrainedTokenizerFast(tokenizer_file=...).
    tokenizer.save_model(output_dir)
    tokenizer.save(os.path.join(output_dir, "tokenizer.json"))  # ✅ Required for Hugging Face compatibility
    return tokenizer

# Train on the loaded corpus with the default vocab size; files are written to ./my_tokenizer.
tokenizer = train_tokenizer(corpus, output_dir="my_tokenizer")






In [8]:
from transformers import PreTrainedTokenizerFast

def load_fast_tokenizer(tokenizer_dir: str) -> PreTrainedTokenizerFast:
    """
    Wraps a saved ``tokenizer.json`` in a Hugging Face fast tokenizer.

    Args:
        tokenizer_dir (str): Directory containing ``tokenizer.json``.

    Returns:
        PreTrainedTokenizerFast: Tokenizer built from that file, with its
        unk/pad/bos/eos/mask special tokens declared.
    """
    tokenizer_path = f"{tokenizer_dir}/tokenizer.json"

    # Role -> token string, forwarded as keyword arguments below.
    special_tokens = {
        "unk_token": "<unk>",
        "pad_token": "<pad>",
        "bos_token": "<s>",
        "eos_token": "</s>",
        "mask_token": "<mask>",
    }

    return PreTrainedTokenizerFast(tokenizer_file=tokenizer_path, **special_tokens)

# Build the Hugging Face wrapper from the tokenizer.json saved in ./my_tokenizer.
fast_tokenizer = load_fast_tokenizer("my_tokenizer")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
from huggingface_hub import HfApi

# Look up the account name for the active token; it is used to build the repo id printed below.
HF_USERNAME = HfApi().whoami()["name"]
REPO_NAME = "spam-tokenizer-bpe"
TOKENIZER_REPO = f"{HF_USERNAME}/{REPO_NAME}"

# Write the tokenizer (plus its config files) locally, then upload it.
# NOTE(review): push_to_hub receives the bare REPO_NAME, not TOKENIZER_REPO;
# presumably it resolves under the logged-in account's namespace - confirm.
fast_tokenizer.save_pretrained("my_tokenizer")  # includes config files
fast_tokenizer.push_to_hub(REPO_NAME)

print(f"✅ Tokenizer pushed: https://huggingface.co/{TOKENIZER_REPO}")

✅ Tokenizer pushed: https://huggingface.co/mushfiqurrobin/spam-tokenizer-bpe


In [9]:
# Smoke test: tokenize one sample sentence and map the tokens to vocabulary ids.
# The sentence contains a curly apostrophe and is NOT passed through
# normalize_text() (which the training corpus was), so that character shows up
# as separate byte-level pieces in the output below.
# NOTE(review): consider normalizing inference text the same way as the corpus.
text = "Congratulations! You’ve won a free ticket to Bahamas."
tokens = fast_tokenizer.tokenize(text)
ids = fast_tokenizer.convert_tokens_to_ids(tokens)

print("Text:", text)
print("Tokens:", tokens)
print("Token IDs:", ids)

Text: Congratulations! You’ve won a free ticket to Bahamas.
Tokens: ['C', 'ongrat', 'ulations', '!', 'ĠYou', 'â', 'Ģ', 'Ļ', 've', 'Ġwon', 'Ġa', 'Ġfree', 'Ġticket', 'Ġto', 'ĠBahamas', '.']
Token IDs: [39, 1322, 2254, 5, 410, 163, 227, 252, 294, 689, 262, 645, 4175, 276, 9676, 18]


In [10]:
from transformers import PreTrainedTokenizerFast

# Round-trip check: reload the tokenizer from the local "my_tokenizer" directory
# (where earlier cells saved the tokenizer files). This rebinds `fast_tokenizer`.
fast_tokenizer = PreTrainedTokenizerFast.from_pretrained("my_tokenizer")

# A second, plain-ASCII sentence to tokenize (rebinds `text`).
text = "Hello! This is a different sentence to tokenize."

# Split into tokens, then look up each token's vocabulary id.
tokens = fast_tokenizer.tokenize(text)
token_ids = fast_tokenizer.convert_tokens_to_ids(tokens)

print("Tokens:", tokens)
print("Token IDs:", token_ids)

Tokens: ['H', 'ello', '!', 'ĠThis', 'Ġis', 'Ġa', 'Ġdifferent', 'Ġsentence', 'Ġto', 'Ġtok', 'en', 'ize', '.']
Token IDs: [44, 7284, 5, 839, 327, 262, 2933, 6447, 276, 5917, 307, 2300, 18]
