In [1]:
import os
import collections
import pandas as pd
from typing import List, Tuple, Dict
import re

In [2]:
MERGE_STEPS = 32000
VOCAB_SIZE = 32000
SAVE_DIR = "."

In [3]:
os.makedirs(SAVE_DIR, exist_ok=True)

In [4]:
import json
from collections import defaultdict, Counter
with open("tokenized.json", "r", encoding="utf-8") as f:
    data = json.load(f)
all_sentences = [sentence for sentence in data["sentences"]]
all_tokens = [token for sentence in data["tokens"] for token in sentence]
print(f"Successfully loaded {len(all_sentences)} sentences.")
print(f"Total tokens for training: {len(all_tokens)}")

Successfully loaded 1557 sentences.
Total tokens for training: 21856


In [None]:
import numpy as np

def extract_sentences_from_parquet(df):
    sentences = []
    for entry in df["sentences"]:
        # Case 1: numpy array of dicts
        if isinstance(entry, np.ndarray):
            for e in entry:
                if isinstance(e, dict) and 'text' in e:
                    text = e['text']
                    tokens = text.strip().split()
                    sentences.append(tokens)
        # Case 2: single dict
        elif isinstance(entry, dict) and 'text' in entry:
            text = entry['text']
            tokens = text.strip().split()
            sentences.append(tokens)
        # Case 3: list of dicts
        elif isinstance(entry, list):
            for e in entry:
                if isinstance(e, dict) and 'text' in e:
                    text = e['text']
                    tokens = text.strip().split()
                    sentences.append(tokens)
    return sentences

In [5]:
word_freq = collections.Counter()
for sent in all_sentences:
    for word in sent:
        word_freq[word] += 1

print(f"Unique words: {len(word_freq)}")
print("Most common:", word_freq.most_common(10))

Unique words: 161
Most common: [(' ', 20351), ('ા', 11646), ('ર', 7269), ('ે', 7076), ('ન', 5874), ('્', 5255), ('ી', 5074), ('ક', 4233), ('મ', 4210), ('વ', 3934)]


In [6]:
#prepare the initial vocabulary
vocab = {' '.join(list(word)) + ' </w>': freq for word, freq in word_freq.items()}
print("Initial vocab size:", len(vocab))

Initial vocab size: 161


In [7]:
def get_stats(vocab):
    """Count frequency of all adjacent symbol pairs in vocab."""
    pairs = collections.Counter()
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i+1])] += freq
    return pairs

In [8]:
def merge_vocab(pair, v_in):
    """Merge the most frequent pair in vocab."""
    v_out = {}
    bigram = re.escape(' '.join(pair))
    pattern = re.compile(r'(?<!\\S)' + bigram + r'(?!\\S)')
    for word in v_in:
        new_word = pattern.sub(''.join(pair), word)
        v_out[new_word] = v_in[word]
    return v_out

In [9]:
for i in range(MERGE_STEPS):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    if (i+1) % 1000 == 0:
        print(f"Step {i+1}: merged {best}")

In [10]:
bpe_vocab = set()
for word in vocab:
    bpe_vocab.update(word.split())

with open("bpe_vocab.txt", "w", encoding="utf-8") as f:
    for token in sorted(bpe_vocab):
        f.write(token + "\\n")

print("BPE vocabulary saved at bpe_vocab.txt")

BPE vocabulary saved at bpe_vocab.txt


In [11]:
def encode_bpe_word(word, merges):
    word = list(word) + ['</w>']
    pairs = [(word[i], word[i+1]) for i in range(len(word)-1)]
    merge_set = set(merges)
    while True:
        bigrams = [(a,b) for (a,b) in pairs if (a,b) in merge_set]
        if not bigrams:
            break
        a,b = bigrams[0]
        new_word = []
        i = 0
        while i < len(word):
            if i < len(word)-1 and word[i]==a and word[i+1]==b:
                new_word.append(a+b)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        word = new_word
        if len(word) == 1:
            break
        pairs = [(word[i], word[i+1]) for i in range(len(word)-1)]
    return word

print("Example BPE tokens:", encode_bpe_word("વીડિયો", [("વ","ી"),("ડ","િ"),("ય","ો")])
)

Example BPE tokens: ['વી', 'ડિ', 'યો', '</w>']


In [13]:
def train_wordpiece(corpus, vocab_size):
    vocab = collections.Counter()
    for sent in corpus:
        for word in sent:
            vocab[word] += 1

    # Start with all characters
    wp_vocab = set()
    for word in vocab:
        wp_vocab.update(word)

    wp_vocab = {ch: i for i, ch in enumerate(sorted(wp_vocab))}

    while len(wp_vocab) < vocab_size:
        pairs = collections.Counter()
        for word, freq in vocab.items():
            chars = list(word)
            for i in range(len(chars) - 1):
                pairs[(chars[i], chars[i+1])] += freq
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        new_token = best[0] + best[1]
        wp_vocab[new_token] = len(wp_vocab)

        new_vocab = collections.Counter()
        for word, freq in vocab.items():
            new_word = word.replace(new_token, new_token)
            new_vocab[new_word] += freq
        vocab = new_vocab

    return wp_vocab

wp_vocab = train_wordpiece(all_sentences, 32000)
print("WordPiece vocab size:", len(wp_vocab))

WordPiece vocab size: 161


In [14]:
with open("wp_vocab.txt", "w", encoding="utf-8") as f:
    for token in wp_vocab:
        f.write(token + "\\n")

print("WordPiece vocabulary saved at wp_vocab.txt")

WordPiece vocabulary saved at wp_vocab.txt


In [15]:
def encode_word_wordpiece(word, vocab):
    output_tokens = []
    start = 0
    while start < len(word):
        end = len(word)
        sub = None
        while start < end:
            substr = word[start:end]
            if start > 0:
                substr = '##' + substr
            if substr in vocab:
                sub = substr
                break
            end -= 1
        if sub is None:
            output_tokens.append('[UNK]')
            start += 1
        else:
            output_tokens.append(sub)
            start = end
    return output_tokens

print("Example WordPiece tokens:", encode_word_wordpiece("વીડિયો", wp_vocab))

Example WordPiece tokens: ['વ', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]']
