<a href="https://colab.research.google.com/github/KunjShah95/HINDITOKENIZER/blob/main/hinditokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentencepiece indic-nlp-library

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic-nlp-library)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Downloading sphinx_argparse-0.5.2-py3-none-any.whl (12 kB)
Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl (7.7 MB)
[2K   [90m━

In [2]:
import re
import sentencepiece as spm
from indicnlp.tokenize import indic_tokenize

class HindiTokenizer:
    """Tokenize Hindi text with a regex, the Indic NLP Library, or SentencePiece.

    The regex and Indic NLP tokenizers work without any model. The
    SentencePiece tokenizer needs a trained model file, whose path is passed
    to the constructor.
    """

    # Characters emitted as standalone one-character tokens: danda (U+0964),
    # double danda (U+0965), the Devanagari abbreviation sign (U+0970) and a
    # few ASCII marks. Visarga (U+0903) and nukta (U+093C) are deliberately
    # NOT listed: they are combining signs inside words, and treating them as
    # punctuation would split words apart at those signs.
    _PUNCTUATION = "\u0964\u0965,!?;\u0970"

    # Compiled once at class creation. Whitespace matches no alternative, so
    # findall() skips it and it only ever acts as a separator.
    _TOKEN_PATTERN = re.compile(
        rf"""
            [{_PUNCTUATION}]             # one punctuation mark per token
            | \d+                        # runs of digits
            | [^\s{_PUNCTUATION}\d]+     # words and any other characters
        """,
        flags=re.X,
    )

    def __init__(self, sp_model_path=None):
        """
        Initialize the tokenizer, optionally loading a SentencePiece model.

        Args:
            sp_model_path: Path to a trained SentencePiece model file. When
                omitted (or otherwise falsy) no model is loaded and only the
                regex and Indic NLP tokenizers are usable.
        """
        self.sp_model = None
        if sp_model_path:
            self.sp_model = spm.SentencePieceProcessor()
            self.sp_model.load(sp_model_path)

    def regex_tokenizer(self, text: str) -> list:
        """
        Basic regex-based word tokenizer.

        Splits on whitespace and emits each punctuation mark in
        ``_PUNCTUATION`` and each run of digits as its own token. Everything
        else between those boundaries is kept together as one token, so
        combining signs such as visarga and nukta stay attached to their word.

        Args:
            text: The string to tokenize.

        Returns:
            A list of non-empty token strings in order of appearance; an
            empty list for empty or whitespace-only input.
        """
        return self._TOKEN_PATTERN.findall(text)

    def indic_nlp_tokenizer(self, text):
        """
        Tokenize with the Indic NLP Library's ``trivial_tokenize`` for Hindi.

        Requires: pip install indic-nlp-library

        Args:
            text: The string to tokenize.

        Returns:
            Whatever ``indic_tokenize.trivial_tokenize(text, lang='hi')``
            returns.
        """
        return indic_tokenize.trivial_tokenize(text, lang='hi')

    def sentencepiece_tokenizer(self, text):
        """
        Subword tokenization using the loaded SentencePiece model.

        Args:
            text: The string to tokenize.

        Returns:
            The result of ``encode_as_pieces(text)`` on the loaded model.

        Raises:
            ValueError: If no SentencePiece model was loaded at construction.
        """
        # Explicit None check: "model missing" must not depend on the
        # truthiness of the processor object itself.
        if self.sp_model is None:
            raise ValueError("SentencePiece model not loaded!")
        return self.sp_model.encode_as_pieces(text)

    @staticmethod
    def train_sentencepiece_model(corpus_path, model_prefix='hindi_sp', vocab_size=8000):
        """
        Train a SentencePiece unigram model from a corpus.

        Args:
            corpus_path: Passed to the trainer as ``input``.
            model_prefix: Passed to the trainer as ``model_prefix``.
            vocab_size: Passed to the trainer as ``vocab_size``.
        """
        # NOTE(review): visarga ('ः') is registered as a user-defined symbol
        # below; this presumably forces it to be its own piece even inside
        # words - confirm that this is intended.
        spm.SentencePieceTrainer.train(
            input=corpus_path,
            model_prefix=model_prefix,
            vocab_size=vocab_size,
            character_coverage=0.9995,
            model_type='unigram',
            user_defined_symbols=['।', 'ः', '॰']  # Hindi-specific symbols
        )

if __name__ == "__main__":
    # Demo: run the tokenizers that need no trained model on one sample.
    sample_text = "हिन्दी भाषा विश्व की प्राचीनतम भाषाओं में से एक है। इसकी लिपि देवनागरी है।"

    # No model path is given, so only the regex and Indic NLP tokenizers
    # are usable on this instance.
    tokenizer = HindiTokenizer()  # Without SentencePiece
    # tokenizer = HindiTokenizer('hindi_sp.model')  # With SentencePiece

    print("Regex Tokenizer:")
    print(tokenizer.regex_tokenizer(sample_text))

    # indicnlp is imported at the top of this cell, so a missing package
    # already fails there; an ImportError handler around this call could
    # never run and is therefore omitted.
    print("\nIndic NLP Tokenizer:")
    print(tokenizer.indic_nlp_tokenizer(sample_text))

    # To use SentencePiece:
    # 1. First train a model using:
    # HindiTokenizer.train_sentencepiece_model('hindi_corpus.txt')
    # 2. Then load the model and use

    # Sample SentencePiece output (if model available):
    # print("\nSentencePiece Tokenizer:")
    # print(tokenizer.sentencepiece_tokenizer(sample_text))

Regex Tokenizer:
['हिन्दी', 'भाषा', 'विश्व', 'की', 'प्राचीनतम', 'भाषाओं', 'में', 'से', 'एक', 'है', '।', 'इसकी', 'लिपि', 'देवनागरी', 'है', '।']

Indic NLP Tokenizer:
['हिन्दी', 'भाषा', 'विश्व', 'की', 'प्राचीनतम', 'भाषाओं', 'में', 'से', 'एक', 'है', '।', 'इसकी', 'लिपि', 'देवनागरी', 'है', '।']
