In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

from huggingface_hub import login

# Read the Hugging Face token from the process environment. load_dotenv() above
# has already merged any values from a local .env file into os.environ.
hf_token = os.getenv("HF_TOKEN")

# Treat an empty value like a missing one so login() is never called with a
# blank token.
if not hf_token:
    raise ValueError(
        "HF_TOKEN not found. Set it as an environment variable or in a .env file."
    )

login(token=hf_token)

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
import re
import unicodedata

# Typographic punctuation and its plain-ASCII replacement. Anything that is
# still non-ASCII after these substitutions is blanked out by normalize_text.
_ASCII_PUNCTUATION = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u2010": "-",   # hyphen
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
    "\u2212": "-",   # minus sign
})


def normalize_text(text: str) -> str:
    """Reduce ``text`` to single-spaced ASCII.

    Steps, in order:
      1. NFKC-normalize (folds compatibility forms, e.g. the ellipsis
         character becomes three dots).
      2. Map curly quotes and dash variants to ``'``, ``"`` and ``-``. Both the
         left and right single quote are handled, so ``‘word’`` keeps both
         of its quotes instead of losing the opening one.
      3. Replace every remaining run of non-ASCII characters with one space.
      4. Collapse whitespace runs to a single space and strip the ends.

    Args:
        text (str): Raw input string.

    Returns:
        str: The cleaned string (possibly empty).
    """
    text = unicodedata.normalize("NFKC", text)
    text = text.translate(_ASCII_PUNCTUATION)
    text = re.sub(r"[^\x00-\x7F]+", " ", text)   # drop whatever is still non-ASCII
    return re.sub(r"\s+", " ", text).strip()

In [3]:
import pandas as pd

def load_corpus(file_path: str) -> list:
    """Read the CSV at ``file_path`` and return its ``v2`` column as cleaned text.

    The file is decoded as latin-1. Rows with a missing ``v2`` value are
    dropped, the remaining values are cast to ``str`` and each one is passed
    through ``normalize_text``.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        list: One normalized string per non-missing ``v2`` entry.
    """
    frame = pd.read_csv(file_path, encoding="latin-1")
    messages = frame["v2"].dropna().astype(str)
    return list(map(normalize_text, messages.tolist()))

# Build the training corpus from the spam CSV. `corpus` is a notebook-level
# name that the tokenizer-training cell further down consumes.
corpus = load_corpus("data/spam.csv")
# Sanity check: report how many texts were loaded and show the first one.
print(f"✅ Loaded {len(corpus)} samples")
print(f"Example: {corpus[0]}")

✅ Loaded 5572 samples
Example: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [4]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
import os

def train_tokenizer(corpus: list, output_dir: str, vocab_size: int = 10000) -> ByteLevelBPETokenizer:
    """
    Fit a byte-level BPE tokenizer on ``corpus`` and persist it to ``output_dir``.

    Args:
        corpus (list): Text strings to learn the vocabulary and merges from.
        output_dir (str): Directory that receives the tokenizer files; it is
            created if it does not exist yet.
        vocab_size (int): Vocabulary size handed to the trainer.

    Returns:
        ByteLevelBPETokenizer: The trained tokenizer instance.
    """
    reserved_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
    combined_file = f"{output_dir}/tokenizer.json"

    os.makedirs(output_dir, exist_ok=True)

    bpe = ByteLevelBPETokenizer()
    # Swap in a ByteLevel pre-tokenizer built with its default options.
    bpe._tokenizer.pre_tokenizer = ByteLevel()

    training_options = dict(
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=reserved_tokens,
    )
    bpe.train_from_iterator(corpus, **training_options)

    bpe.save_model(output_dir)  # model files written into the directory
    bpe.save(combined_file)     # ✅ Required for Hugging Face compatibility
    return bpe

# Train on the spam corpus with the default vocab_size and write the tokenizer
# files into the relative directory my_tokenizer/.
tokenizer = train_tokenizer(corpus, output_dir="my_tokenizer")







---

### 🧠 Step 1: Initialize the tokenizer

```python
tokenizer = ByteLevelBPETokenizer()
tokenizer._tokenizer.pre_tokenizer = ByteLevel()
```

* `ByteLevelBPETokenizer()` initializes a tokenizer that:

  * Operates at the byte level (good for handling rare or multilingual text).
  * Uses Byte Pair Encoding (BPE) to learn subword units.

* `tokenizer._tokenizer.pre_tokenizer = ByteLevel()` sets the **pre-tokenizer** to `ByteLevel`, which:

  * Encodes the input text into bytes first.
  * Handles spaces explicitly by marking them with a special character like `Ġ` (important for preserving word boundaries).

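⚠️ Note: `ByteLevelBPETokenizer` is a convenience wrapper that keeps the underlying `Tokenizer` object in its private `_tokenizer` attribute, which is why the code writes `tokenizer._tokenizer.pre_tokenizer`. The wrapper already installs a byte-level pre-tokenizer when it is constructed, so this line mainly makes that choice explicit (the new `ByteLevel()` is created with its own default options, which replace whatever the wrapper configured). On a plain `tokenizers.Tokenizer` the equivalent is simply `tokenizer.pre_tokenizer = ByteLevel()`.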

---

### 🧪 Step 2: Train the tokenizer

```python
tokenizer.train_from_iterator(
    corpus,
    vocab_size=vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)
```

This tells the tokenizer to:

* Train using the given `corpus` (an iterator of texts).
* Limit the vocab size to `vocab_size`.
* Skip candidate merges whose pair occurs fewer than `min_frequency` times (here, 2).
* Show progress during training.
* Add 5 **special tokens** often used in NLP:

  * `<s>` = start of sentence
  * `</s>` = end of sentence
  * `<pad>` = padding
  * `<unk>` = unknown token
  * `<mask>` = for masked language modeling

---

### 💾 Step 3: Save the tokenizer model

```python
tokenizer.save_model(output_dir)
tokenizer.save(f"{output_dir}/tokenizer.json")
```

* `save_model()` saves the `vocab.json` and `merges.txt` files — required for legacy loading.
* `save("tokenizer.json")` saves a **Hugging Face compatible** single-file JSON format, usable by `PreTrainedTokenizerFast`.

---

### ✅ Final Output

```python
return tokenizer
```

Returns the trained `ByteLevelBPETokenizer` object so it can be reused if needed.

---

### 🔄 Summary of Output Files in `output_dir`

| File             | Purpose                                |
| ---------------- | -------------------------------------- |
| `vocab.json`     | Maps tokens to IDs                     |
| `merges.txt`     | Contains merge rules learned by BPE    |
| `tokenizer.json` | Hugging Face-compatible tokenizer file |

---

In [5]:
from transformers import PreTrainedTokenizerFast

def load_fast_tokenizer(tokenizer_dir: str) -> PreTrainedTokenizerFast:
    """
    Build a PreTrainedTokenizerFast from the ``tokenizer.json`` in a directory.

    Args:
        tokenizer_dir (str): Directory that contains ``tokenizer.json``.

    Returns:
        PreTrainedTokenizerFast: Tokenizer with the unk/pad/bos/eos/mask
        special tokens registered.
    """
    # Role -> literal token string; these match the special tokens used at training time.
    special_token_roles = {
        "unk_token": "<unk>",
        "pad_token": "<pad>",
        "bos_token": "<s>",
        "eos_token": "</s>",
        "mask_token": "<mask>",
    }
    serialized_path = f"{tokenizer_dir}/tokenizer.json"
    return PreTrainedTokenizerFast(tokenizer_file=serialized_path, **special_token_roles)

# Wrap the tokenizer.json saved under my_tokenizer/ for use with transformers.
fast_tokenizer = load_fast_tokenizer("my_tokenizer")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


The `fast_tokenizer` here is a **Hugging Face-compatible tokenizer** created by loading the trained `tokenizer.json` (this call does not read `vocab.json` or `merges.txt`; everything it needs is inside `tokenizer.json`) using the class:

```python
from transformers import PreTrainedTokenizerFast
```

---

### 💡 What is `PreTrainedTokenizerFast`?

`PreTrainedTokenizerFast` is a **fast implementation of Hugging Face tokenizers** using the [🤗 `tokenizers` Rust-based library](https://github.com/huggingface/tokenizers), which is:

* ⚡ **Much faster** than Python-based tokenizers.
* ✅ Fully compatible with `transformers` models.
* 🧠 Capable of handling everything: tokenization, detokenization, ID mapping, padding, truncation, etc.
* 🧩 Works with any tokenizer trained and saved as a `tokenizer.json`.

---

### 🔄 What the code does

```python
fast_tokenizer = load_fast_tokenizer("my_tokenizer")
```

This loads the custom tokenizer you trained earlier with `ByteLevelBPETokenizer` and wraps it into a Hugging Face-compatible format.

The important part is here:

```python
PreTrainedTokenizerFast(
    tokenizer_file=f"{tokenizer_dir}/tokenizer.json",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    mask_token="<mask>"
)
```

This:

* Loads `tokenizer.json` (which contains vocab, merges, config).
* Defines all special tokens to work seamlessly with Transformer models.

Now, you can use `fast_tokenizer` just like any pretrained tokenizer (e.g., `BertTokenizerFast`, `GPT2TokenizerFast`, etc.):

```python
text = "Hello world!"
tokens = fast_tokenizer.tokenize(text)
ids = fast_tokenizer.convert_tokens_to_ids(tokens)
decoded = fast_tokenizer.decode(ids)

print(tokens)  # ['H', 'ello', 'Ġworld', '!']
print(ids)     # [id1, id2, id3, id4]
print(decoded) # 'Hello world!'
```

---

### ✅ Summary

| Concept                   | What it does                                                               |
| ------------------------- | -------------------------------------------------------------------------- |
| `PreTrainedTokenizerFast` | A wrapper to load and use a tokenizer trained with `tokenizers`            |
| `tokenizer.json`          | The tokenizer model (vocab, merges, config)                                |
| `fast_tokenizer`          | The fully functional tokenizer you can now use in your pipeline            |
| Benefit                   | Fast, efficient, and works with Hugging Face `transformers` out-of-the-box |

In [6]:
from huggingface_hub import HfApi

# Look up the account name behind the active token; it is used as the
# namespace in the URL printed at the end of this cell.
HF_USERNAME = HfApi().whoami()["name"]
REPO_NAME = "spam-tokenizer-bpe"
TOKENIZER_REPO = f"{HF_USERNAME}/{REPO_NAME}"

fast_tokenizer.save_pretrained("my_tokenizer")  # includes config files
# NOTE(review): the push uses the bare REPO_NAME while the printed URL uses
# TOKENIZER_REPO. Presumably the Hub resolves a bare name to the logged-in
# user's namespace, so both refer to the same repo — confirm.
fast_tokenizer.push_to_hub(REPO_NAME)

print(f"✅ Tokenizer pushed: https://huggingface.co/{TOKENIZER_REPO}")

No files have been modified since last commit. Skipping to prevent empty commit.


✅ Tokenizer pushed: https://huggingface.co/mushfiqurrobin/spam-tokenizer-bpe


In [7]:
# NOTE(review): this sample is tokenized as-is, without normalize_text(), so
# the curly apostrophe in "You’ve" is not mapped to "'" the way the training
# corpus was. It shows up as three separate byte-level tokens in the output.
text = "Congratulations! You’ve won a free ticket to Bahamas."
# Split into subword tokens, then look up each token's vocabulary id.
tokens = fast_tokenizer.tokenize(text)
ids = fast_tokenizer.convert_tokens_to_ids(tokens)

print("Text:", text)
print("Tokens:", tokens)
print("Token IDs:", ids)

Text: Congratulations! You’ve won a free ticket to Bahamas.
Tokens: ['C', 'ongrat', 'ulations', '!', 'ĠYou', 'â', 'Ģ', 'Ļ', 've', 'Ġwon', 'Ġa', 'Ġfree', 'Ġticket', 'Ġto', 'ĠBahamas', '.']
Token IDs: [39, 1322, 2254, 5, 410, 163, 227, 252, 294, 689, 262, 645, 4175, 276, 9676, 18]


In [8]:
from transformers import PreTrainedTokenizerFast

# Reload the tokenizer from the directory written by save_pretrained(). This
# rebinds `fast_tokenizer`, replacing the object built earlier in the notebook.
fast_tokenizer = PreTrainedTokenizerFast.from_pretrained("my_tokenizer")

# New text to tokenize
text = "Hello! This is a different sentence to tokenize."

# Split into subword tokens, then look up each token's vocabulary id
tokens = fast_tokenizer.tokenize(text)
# Display-only alternative that hides the leading-space marker:
# print([t.replace("Ġ", "") for t in tokens])
token_ids = fast_tokenizer.convert_tokens_to_ids(tokens)

print("Tokens:", tokens)
print("Token IDs:", token_ids)

Tokens: ['H', 'ello', '!', 'ĠThis', 'Ġis', 'Ġa', 'Ġdifferent', 'Ġsentence', 'Ġto', 'Ġtok', 'en', 'ize', '.']
Token IDs: [44, 7284, 5, 839, 327, 262, 2933, 6447, 276, 5917, 307, 2300, 18]


The `Ġ` character you're seeing in the token strings (e.g., `'ĠThis'`, `'Ġdifferent'`, `'Ġsentence'`) is **not a character from your input text**. It is how Byte-Level BPE (GPT-2-style) tokenizers **display the space byte** inside a token, which is how they mark **word boundaries**; decoding turns it back into an ordinary space.

### 🔍 What `Ġ` Means

* `Ġ` (Unicode U+0120) is used by some tokenizers to indicate that the token starts with a **space**.
* It helps distinguish between:

  * `"Ġtoken"` → the word "token" preceded by a space.
  * `"token"` → the word "token" without a preceding space (e.g., part of a longer word like `"notoken"`).

### ✅ Why It's Useful

This distinction allows the tokenizer to:

* **Model spaces explicitly**, which is important for text generation and reconstruction.
* **Preserve the structure of the input text** more accurately during training.

### 🧼 If You Want to Remove `Ġ` from Display

Just strip it during display if it's confusing:

```python
print([t.replace("Ġ", "") for t in tokens])
```

### ⚠️ But: Don't Remove It Before Encoding

The tokenizer uses `Ġ` internally to tokenize and reconstruct text correctly. You should **not** remove or alter it in any preprocessing unless you're just cleaning up **display/output**.

---

# Creating a tokenizer on a Hugging Face dataset

In [9]:
from datasets import load_dataset
import os

# Fetch the Q&A dataset by its Hub identifier (needs network access on the
# first run; presumably served from the local `datasets` cache afterwards).
dataset = load_dataset("SoorajK1/questions_and_answers")
os.makedirs("data", exist_ok=True)

# Export the train split to CSV so the pandas-based loader below can read it.
# NOTE(review): reading this CSV back shows an 'Unnamed: 0' column even though
# index=False is passed, so that column appears to be part of the dataset itself.
dataset["train"].to_csv("data/questions_and_answers.csv", index=False)
print("Dataset saved to: data/questions_and_answers.csv")

Creating CSV from Arrow format: 100%|██████████| 30/30 [00:00<00:00, 47.29ba/s]

Dataset saved to: data/questions_and_answers.csv





In [10]:
import re
import unicodedata
import pandas as pd

def normalize_text(text: str) -> str:
    """Reduce ``text`` to single-spaced ASCII.

    NFKC-normalizes the input, maps curly quotes and dash variants to their
    ASCII equivalents, replaces any remaining run of non-ASCII characters with
    a space, then collapses whitespace and strips the ends.

    The left single quote (U+2018) is mapped as well as the right one, so
    ``‘word’`` keeps both quotes. Hyphen (U+2010) and minus sign (U+2212)
    are treated like the en/em dashes.

    Args:
        text (str): Raw input string.

    Returns:
        str: The cleaned string (possibly empty).
    """
    text = unicodedata.normalize("NFKC", text)
    text = text.replace("\u2018", "'").replace("\u2019", "'")   # single quotes
    text = text.replace("\u201c", '"').replace("\u201d", '"')   # double quotes
    for dash in ("\u2010", "\u2013", "\u2014", "\u2212"):       # hyphen, en, em, minus
        text = text.replace(dash, "-")
    text = re.sub(r"[^\x00-\x7F]+", " ", text)                  # anything still non-ASCII
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [11]:
# Inspect the column names to decide which fields to feed the tokenizer.
pd.read_csv("data/questions_and_answers.csv").columns

Index(['Unnamed: 0', 'question', 'answer', 'content_row'], dtype='object')

In [12]:
def load_corpus_from_csv(path: str) -> list:
    """Load the Q&A CSV and return one normalized text per row.

    Each row's question and answer are joined with a newline and passed
    through ``normalize_text`` (which collapses that newline to a space).

    Args:
        path (str): CSV file with ``question`` and ``answer`` columns.

    Returns:
        list: Normalized strings. Rows that are empty after normalization
        are left out.
    """
    df = pd.read_csv(path)
    # Replace missing cells with "" *before* casting: astype(str) on a missing
    # value would otherwise inject the literal word "nan" into the corpus.
    qa = df[["question", "answer"]].fillna("").astype(str)
    combined_text = (qa["question"] + "\n" + qa["answer"]).tolist()
    # Filter after normalization, so rows that collapse to nothing are dropped.
    normalized = (normalize_text(t) for t in combined_text)
    return [t for t in normalized if t]

# Rebinds `corpus` (previously the spam texts) to the Q&A texts.
corpus = load_corpus_from_csv("data/questions_and_answers.csv")
# Sanity check: report how many texts were loaded and show the first one.
print(f"Loaded {len(corpus)} samples from Q&A dataset")
print(f"Example: {corpus[0]}")

Loaded 29438 samples from Q&A dataset
Example: What is the purpose of the Android App mentioned? The Android App mentioned in the text is designed to collect data using smartphones. It serves as a tool that allows farmers to input and store information related to their farming activities.


In [13]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.pre_tokenizers import ByteLevel

def train_tokenizer(corpus: list, output_dir: str, vocab_size: int = 20000) -> ByteLevelBPETokenizer:
    """Train a byte-level BPE tokenizer and write its files to ``output_dir``.

    Args:
        corpus (list): Text strings used for training.
        output_dir (str): Target directory; created when missing.
        vocab_size (int): Vocabulary size handed to the trainer.

    Returns:
        ByteLevelBPETokenizer: The trained tokenizer.
    """
    os.makedirs(output_dir, exist_ok=True)

    trained = ByteLevelBPETokenizer()
    trained._tokenizer.pre_tokenizer = ByteLevel()
    trained.train_from_iterator(
        corpus,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
        show_progress=True,
        min_frequency=2,
        vocab_size=vocab_size,
    )

    # Persist twice: model files into the directory, plus the single-file JSON.
    trained.save_model(output_dir)
    single_file_path = f"{output_dir}/tokenizer.json"
    trained.save(single_file_path)
    return trained

# Train on the Q&A corpus with the default vocab_size and save under
# my_qa_tokenizer/. This rebinds `tokenizer` from the earlier spam run.
tokenizer = train_tokenizer(corpus, "my_qa_tokenizer")
print("✅ Tokenizer trained and saved to 'my_qa_tokenizer'")




✅ Tokenizer trained and saved to 'my_qa_tokenizer'


In [14]:
from transformers import PreTrainedTokenizerFast

# Wrap the saved tokenizer.json and declare which strings act as special tokens.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="my_qa_tokenizer/tokenizer.json",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    mask_token="<mask>"
)

# NOTE(review): the backslash continuations sit inside the string literal, so
# the leading indentation of the next two lines becomes part of sample_text.
# That is why the printed text has runs of spaces after each sentence and the
# token list below contains empty '' entries (space-only tokens with "Ġ" stripped).
sample_text = "Yes, the Android App can be used offline, without requiring an internet connection for data collection. \
    Farmers can input data into the app even in remote areas where internet access may be limited. \
    The data collected offline can be synced and uploaded to the app's server once an internet connection becomes available."
tokens = fast_tokenizer.tokenize(sample_text)
ids = fast_tokenizer.convert_tokens_to_ids(tokens)

print("Sample text:", sample_text)
# "Ġ" is stripped for display only; `ids` are computed from the unmodified tokens.
print("Tokens:", [t.replace("Ġ", "") for t in tokens])
print("Token IDs:", ids)

Sample text: Yes, the Android App can be used offline, without requiring an internet connection for data collection.     Farmers can input data into the app even in remote areas where internet access may be limited.     The data collected offline can be synced and uploaded to the app's server once an internet connection becomes available.
Tokens: ['Y', 'es', ',', 'the', 'Android', 'App', 'can', 'be', 'used', 'off', 'line', ',', 'without', 'requiring', 'an', 'internet', 'connection', 'for', 'data', 'collection', '.', '', '', '', '', 'Farmers', 'can', 'input', 'data', 'into', 'the', 'app', 'even', 'in', 'remote', 'areas', 'where', 'internet', 'access', 'may', 'be', 'limited', '.', '', '', '', '', 'The', 'data', 'collected', 'off', 'line', 'can', 'be', 'syn', 'ced', 'and', 'uploaded', 'to', 'the', 'app', "'s", 'ser', 'ver', 'once', 'an', 'internet', 'connection', 'becomes', 'available', '.']
Token IDs: [61, 275, 16, 266, 8822, 2619, 342, 312, 524, 1078, 3138, 16, 1365, 8573, 286, 13133, 9

In [15]:
from huggingface_hub import HfApi

# Look up the account name behind the active token; it is used as the
# namespace in the URL printed at the end of this cell.
HF_USERNAME = HfApi().whoami()["name"]
REPO_NAME = "qa-tokenizer-bpe"
TOKENIZER_REPO = f"{HF_USERNAME}/{REPO_NAME}"

fast_tokenizer.save_pretrained("my_qa_tokenizer")  # includes config files
# NOTE(review): the push uses the bare REPO_NAME while the printed URL uses
# TOKENIZER_REPO. Presumably the Hub resolves a bare name to the logged-in
# user's namespace, so both refer to the same repo — confirm.
fast_tokenizer.push_to_hub(REPO_NAME)

print(f"✅ Tokenizer pushed: https://huggingface.co/{TOKENIZER_REPO}")

No files have been modified since last commit. Skipping to prevent empty commit.


✅ Tokenizer pushed: https://huggingface.co/mushfiqurrobin/qa-tokenizer-bpe


In [16]:
from transformers import PreTrainedTokenizerFast

# Reload the Q&A tokenizer from the directory written by save_pretrained().
# This rebinds `fast_tokenizer` once more.
fast_tokenizer = PreTrainedTokenizerFast.from_pretrained("my_qa_tokenizer")

# Text used to try the tokenizer on an arbitrary sentence.
text = "Building a tokenizer from scratch might seem daunting at first, but with the right tools and a step-by-step approach, it's quite manageable."

# Split into subword tokens, then look up each token's vocabulary id.
tokens = fast_tokenizer.tokenize(text)
token_ids = fast_tokenizer.convert_tokens_to_ids(tokens)

# "Ġ" is stripped for display only; `token_ids` come from the unmodified tokens.
print("Tokens:", [t.replace("Ġ", "") for t in tokens])
print("Token IDs:", token_ids)

Tokens: ['B', 'uilding', 'a', 'to', 'ken', 'izer', 'from', 'scrat', 'ch', 'might', 'seem', 'd', 'a', 'unting', 'at', 'first', ',', 'but', 'with', 'the', 'right', 'tools', 'and', 'a', 'step', '-', 'by', '-', 'step', 'approach', ',', 'it', "'s", 'quite', 'manageable', '.']
Token IDs: [38, 10743, 264, 295, 3743, 7545, 453, 14266, 359, 5625, 15894, 294, 69, 12043, 477, 1273, 16, 1418, 417, 266, 2379, 5852, 292, 264, 2011, 17, 3723, 17, 13507, 1926, 16, 376, 821, 12520, 15040, 18]
