In [2]:
# ==============================================================
# TOKENIZATION TECHNIQUES COMPARISON — BPE vs DCT vs EBT
# ==============================================================

# ✅ Step 1. Install dependencies
!pip install datasets tokenizers sentence-transformers transformers tqdm matplotlib numpy scipy --quiet

# ==============================================================
# Step 2. Imports
# ==============================================================
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from tokenizers import Tokenizer
from tqdm import tqdm
import numpy as np
import math
from collections import Counter
import matplotlib.pyplot as plt
import time

# ==============================================================
# Step 3. Load dataset
# ==============================================================
# Load the first 2% of the AG News training split and keep only the "text"
# field of every record. The dataset is requested from the Hugging Face Hub
# (huggingface.co), so this step needs working HTTPS access to that host.
print("Loading dataset...")
dataset = load_dataset("ag_news", split="train[:2%]")
texts = [x["text"] for x in dataset]
print("Loaded", len(texts), "samples")

# ==============================================================
# Step 4. Dynamic Contextual Tokenization (DCT)
# ==============================================================
# Module-level sentence-embedding model. dynamic_contextual_tokenize() reads
# this global to embed the individual words of each sentence.
print("\nInitializing SentenceTransformer model for DCT...")
model = SentenceTransformer("all-MiniLM-L6-v2")

def dynamic_contextual_tokenize(sentence, sim_threshold=0.55):
    """Group adjacent, similar words of *sentence* into multi-word tokens.

    The sentence is split on whitespace and every word is embedded with the
    module-level ``model``. A word is appended to the current token when the
    dot product of its embedding with the previous word's embedding exceeds
    *sim_threshold*; otherwise it starts a new token. Embeddings are requested
    with ``normalize_embeddings=True``, so the dot product is presumably a
    cosine similarity.

    Args:
        sentence: Text to tokenize.
        sim_threshold: Similarity a neighbouring word pair must exceed to be
            merged into the same token.

    Returns:
        List of tokens, each one or more original words joined by a single
        space. Empty or whitespace-only input yields ``[]``.
    """
    words = sentence.split()
    # With zero or one word there is no neighbouring pair to compare, so skip
    # the encoder entirely (this also avoids handing it an empty batch).
    if len(words) <= 1:
        return words
    vectors = model.encode(words, normalize_embeddings=True)

    merged = []
    current = [words[0]]  # words collected for the token being built
    for idx in range(1, len(words)):
        if np.dot(vectors[idx - 1], vectors[idx]) > sim_threshold:
            current.append(words[idx])
        else:
            merged.append(" ".join(current))
            current = [words[idx]]
    merged.append(" ".join(current))
    return merged

# ==============================================================
# Step 5. Entropy-Based Tokenization (EBT)
# ==============================================================
def shannon_entropy(seq):
    """Return the Shannon entropy, in bits, of the items in *seq*.

    Each distinct item's relative frequency within *seq* is used as its
    probability. An empty sequence yields 0.
    """
    total = len(seq)
    probabilities = [count / total for count in Counter(seq).values()]
    return -sum(p * math.log2(p) for p in probabilities)

def entropy_based_tokenize(text, window=4, threshold=0.8):
    tokens, start = [], 0
    for i in range(window, len(text)):
        left = text[i-window:i]
        right = text[i-window+1:i+1]
        if abs(shannon_entropy(right) - shannon_entropy(left)) > threshold:
            tokens.append(text[start:i])
            start = i
    tokens.append(text[start:])
    tokens = [t.strip() for t in tokens if t.strip()]
    return tokens

# ==============================================================
# Step 6. Baseline GPT-2 Tokenizer (BPE)
# ==============================================================
# Module-level GPT-2 tokenizer used as the BPE baseline; bpe_tokenize() reads
# this global.
print("\nLoading GPT-2 tokenizer (BPE baseline)...")
bpe_tok = AutoTokenizer.from_pretrained("gpt2")

def bpe_tokenize(text):
    """Tokenize *text* with the module-level ``bpe_tok`` tokenizer.

    Returns whatever ``bpe_tok.tokenize`` produces for *text*.
    """
    pieces = bpe_tok.tokenize(text)
    return pieces

# ==============================================================
# Step 7. Test Sample Output
# ==============================================================
# Quick sanity check: run all three tokenizers on one fixed sentence and
# print the token list each of them returns.
sample_text = "Quantum physics explores atomic scale phenomena in the universe."
print("\nSample text:", sample_text)
print("DCT:", dynamic_contextual_tokenize(sample_text))
print("EBT:", entropy_based_tokenize(sample_text))
print("BPE:", bpe_tokenize(sample_text))

# ==============================================================
# Step 8. Evaluation Function
# ==============================================================
def evaluate_tokenizer(fn, name, texts, n=100):
    """Measure the average token count and runtime of a tokenizer.

    Args:
        fn: Callable that takes one text and returns a sized collection of
            tokens.
        name: Label shown in the progress bar.
        texts: Sequence of input texts; only the first *n* are used.
        n: Maximum number of texts to evaluate.

    Returns:
        Tuple ``(avg_tokens, elapsed)``: the mean number of tokens per text
        and the seconds spent tokenizing. ``avg_tokens`` is NaN when there
        are no texts to evaluate.
    """
    samples = texts[:n]
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval is unaffected by system clock adjustments.
    start = time.perf_counter()
    counts = [len(fn(t)) for t in tqdm(samples, desc=f"Testing {name}")]
    elapsed = time.perf_counter() - start
    # Averaging happens outside the timed region; an empty sample set gives
    # NaN directly instead of triggering NumPy's empty-mean warning.
    avg_tokens = np.mean(counts) if counts else float("nan")
    return avg_tokens, elapsed

# ==============================================================
# Step 9. Run Evaluations
# ==============================================================
print("\nEvaluating all tokenizers...")

# Tokenizers under test, in the order they are evaluated.
tokenizer_fns = [
    ("DCT", dynamic_contextual_tokenize),
    ("EBT", entropy_based_tokenize),
    ("BPE", bpe_tokenize),
]

# Maps tokenizer name -> (average token count, elapsed seconds).
results = {}
for name, fn in tokenizer_fns:
    avg_tokens, elapsed = evaluate_tokenizer(fn, name, texts)
    results[name] = (avg_tokens, elapsed)
    print(f"{name}: Avg Tokens = {avg_tokens:.2f}, Time = {elapsed:.2f}s")

# ==============================================================
# Step 10. Plot Comparison Graphs
# ==============================================================
# Unpack the collected results into parallel sequences for plotting.
methods = list(results)
token_counts = [avg for avg, _ in results.values()]
times = [secs for _, secs in results.values()]

bar_colors = ['orange', 'green', 'skyblue']
fig, (ax_tokens, ax_runtime) = plt.subplots(1, 2, figsize=(12, 5))

# Plot 1 — Token Counts
ax_tokens.bar(methods, token_counts, color=bar_colors)
ax_tokens.set_title("Average Tokens per Sentence")
ax_tokens.set_ylabel("Tokens (lower = more efficient)")

# Plot 2 — Runtime
ax_runtime.bar(methods, times, color=bar_colors)
ax_runtime.set_title("Runtime (seconds per 100 samples)")
ax_runtime.set_ylabel("Time (s)")

fig.suptitle("Comparison: BPE vs DCT vs EBT", fontsize=14, fontweight='bold')
plt.show()

# ==============================================================
# Step 11. Summary
# ==============================================================
# Print the average token count and runtime stored in `results` for every
# tokenizer, followed by a completion message.
print("\n===== SUMMARY =====")
for m, (toks, t) in results.items():
    print(f"{m}: Avg Tokens = {toks:.2f}, Runtime = {t:.2f}s")

print("\n✅ Done! You can now visually compare performance and efficiency of all tokenizers.")


'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /datasets/ag_news/resolve/main/README.md (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1000)')))"), '(Request ID: 83c6f5fc-d943-48e6-b3ac-1ce85e517c72)')' thrown while requesting HEAD https://huggingface.co/datasets/ag_news/resolve/main/README.md
Retrying in 1s [Retry 1/5].


Loading dataset...


'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /datasets/ag_news/resolve/main/README.md (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1000)')))"), '(Request ID: 37d2fec3-f6c0-4c35-8ca0-fc39187d3977)')' thrown while requesting HEAD https://huggingface.co/datasets/ag_news/resolve/main/README.md
Retrying in 2s [Retry 2/5].
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /datasets/ag_news/resolve/main/README.md (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1000)')))"), '(Request ID: e2635aef-706d-4da9-8871-d76631081303)')' thrown while requesting HEAD https://huggingface.co/datasets/ag_news/resolve/main/README.md
Retrying in 4s [Retry 3/5].
'(MaxRetryErro

ConnectionError: Couldn't reach 'ag_news' on the Hub (SSLError)