## [Nepali Tokenizer](https://github.com/basnetsoyuj/nepali-tokenizers)

In [7]:
!pip install nepalitokenizers -q

In [8]:
from nepalitokenizers import SentencePiece
from tokenizers.processors import TemplateProcessing

# Sample Nepali sentences of increasing length.
texts = [
    "म स्कुल जान्छु।",
    "यदि तिमी आएनौ भने, हामीले खेल सुरु गर्नेछौं।",
    "समाजमा नैतिक मूल्यहरूको संरक्षण गर्नका लागि हामी सबैको योगदान आवश्यक छ।",
]

# Nepali SentencePiece tokenizer
tokenizer_sp = SentencePiece()
# Replace the default post-processor with an empty template.
# NOTE(review): presumably this stops encode() from adding special tokens
# automatically - confirm against the tokenizers documentation.
tokenizer_sp.post_processor = TemplateProcessing()

# Round-trip every sentence: encode it, show ids and pieces, then decode back.
for sentence in texts:
    encoding = tokenizer_sp.encode(sentence)
    piece_ids = encoding.ids
    print("\nText:", sentence)
    print("Token Ids:", piece_ids)
    print("Tokens:", encoding.tokens)
    print("Revert:", tokenizer_sp.decode(piece_ids))


Text: म स्कुल जान्छु।
Token Ids: [7, 59, 1445, 11871, 36]
Tokens: ['▁', 'म', '▁स्कुल', '▁जान्छु', '।']
Revert: म स्कुल जान्छु।

Text: यदि तिमी आएनौ भने, हामीले खेल सुरु गर्नेछौं।
Token Ids: [1467, 3325, 7338, 819, 143, 341, 222, 172, 12593, 36]
Tokens: ['▁यदि', '▁तिमी', '▁आएन', 'ौ', '▁भने,', '▁हामीले', '▁खेल', '▁सुरु', '▁गर्नेछौं', '।']
Revert: यदि तिमी आएनौ भने, हामीले खेल सुरु गर्नेछौं।

Text: समाजमा नैतिक मूल्यहरूको संरक्षण गर्नका लागि हामी सबैको योगदान आवश्यक छ।
Token Ids: [1695, 5676, 519, 393, 809, 484, 24, 197, 2122, 1231, 313, 47]
Tokens: ['▁समाजमा', '▁नैतिक', '▁मूल्य', 'हरूको', '▁संरक्षण', '▁गर्नका', '▁लागि', '▁हामी', '▁सबैको', '▁योगदान', '▁आवश्यक', '▁छ।']
Revert: समाजमा नैतिक मूल्यहरूको संरक्षण गर्नका लागि हामी सबैको योगदान आवश्यक छ।


## T5-Small Tokenizer

In [9]:
from transformers import AutoTokenizer

# T5-Small SentencePiece tokenizer, loaded from the "t5-small" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# English counterparts of the Nepali sample sentences.
texts = [
    "I go to school.",
    "If you don’t come, we will start the game.",
    "To preserve moral values in society, the contribution of all of us is necessary.",
]

# Round-trip every sentence: split into pieces, map to ids, then decode back.
for sentence in texts:
    pieces = tokenizer.tokenize(sentence)
    piece_ids = tokenizer.convert_tokens_to_ids(pieces)
    print("\nText:", sentence)
    print("Token Ids:", piece_ids)
    print("Tokens:", pieces)
    print("Revert:", tokenizer.decode(piece_ids))


Text: I go to school.
Token Ids: [27, 281, 12, 496, 5]
Tokens: ['▁I', '▁go', '▁to', '▁school', '.']
Revert: I go to school.

Text: If you don’t come, we will start the game.
Token Ids: [156, 25, 278, 22, 17, 369, 6, 62, 56, 456, 8, 467, 5]
Tokens: ['▁If', '▁you', '▁don', '’', 't', '▁come', ',', '▁we', '▁will', '▁start', '▁the', '▁game', '.']
Revert: If you don’t come, we will start the game.

Text: To preserve moral values in society, the contribution of all of us is necessary.
Token Ids: [304, 8996, 4854, 2620, 16, 2710, 6, 8, 6275, 13, 66, 13, 178, 19, 1316, 5]
Tokens: ['▁To', '▁preserve', '▁moral', '▁values', '▁in', '▁society', ',', '▁the', '▁contribution', '▁of', '▁all', '▁of', '▁us', '▁is', '▁necessary', '.']
Revert: To preserve moral values in society, the contribution of all of us is necessary.


## Adding preprocessing and special tokens

In [39]:
import re

def tokenize_english(text):
    """Normalize English text and split it into tokens.

    The text is lowercased and stripped of surrounding whitespace, the
    literal end-of-sequence marker ``</s>`` is appended, and the result is
    passed to the module-level ``tokenizer``.

    Args:
        text: English sentence to tokenize.

    Returns:
        The value returned by ``tokenizer.tokenize`` for the normalized text.
    """
    normalized = text.lower().strip()
    # EOS marker is appended as plain text before tokenizing.
    return tokenizer.tokenize(f"{normalized}</s>")

# Tokenization Function for Nepali
def tokenize_nepali(text):
    """Tokenize Nepali text with the SentencePiece tokenizer.

    Surrounding whitespace is stripped, each sentence punctuation mark
    (``.``, ``!``, ``।``, ``?``) is separated from the preceding text by
    exactly one space, and the literal end-of-sequence marker ``</s>`` is
    appended before encoding with the module-level ``tokenizer_sp``.

    Args:
        text: Nepali sentence to tokenize.

    Returns:
        The value returned by ``tokenizer_sp.encode`` for the normalized text.
    """
    text = text.strip()
    # Absorb any whitespace already in front of the punctuation mark so that
    # text which is already spaced (e.g. "छ ।") does not end up with a double
    # space before the mark.
    text = re.sub(r"\s*([.!।?])", r" \1", text)
    return tokenizer_sp.encode(text + "</s>")  # EOS: </s> token

In [40]:
# Tokenizing
def tokenize_text(english_sentence, nepali_sentence):
    """Tokenize a parallel English/Nepali sentence pair and print a report.

    For each language the report shows the original text, the tokens, the
    token ids, the text decoded back from the ids, and the token count.

    Args:
        english_sentence: English text, tokenized via ``tokenize_english``.
        nepali_sentence: Nepali text, tokenized via ``tokenize_nepali``.

    Returns:
        None. The report is written to standard output.
    """
    english_pieces = tokenize_english(english_sentence)
    nepali_encoding = tokenize_nepali(nepali_sentence)

    english_ids = tokenizer.convert_tokens_to_ids(english_pieces)
    nepali_ids = nepali_encoding.ids

    # Each row is (label, zero-argument callable). The callable is evaluated
    # only when its row is printed, so decoding happens in print order.
    report_rows = (
        ("English Text:", lambda: english_sentence),
        ("English Tokens:", lambda: english_pieces),
        ("English Tokens ID:", lambda: english_ids),
        ("Revert:", lambda: tokenizer.decode(english_ids)),
        ("Token len:", lambda: len(english_pieces)),
        # Leading newline separates the Nepali half from the English half.
        ("\nNepali Text:", lambda: nepali_sentence),
        ("Nepali Tokens:", lambda: nepali_encoding.tokens),
        ("Nepali Tokens ID:", lambda: nepali_ids),
        ("Revert:", lambda: tokenizer_sp.decode(nepali_ids)),
        ("Token len:", lambda: len(nepali_encoding)),
    )
    for label, compute in report_rows:
        print(label, compute())

In [41]:
# Parallel sentence pair: a conditional sentence.
english_sentence, nepali_sentence = (
    "If you don’t come, we will start the game.",
    "यदि तिमी आएनौ भने, हामीले खेल सुरु गर्नेछौं।",
)

tokenize_text(english_sentence=english_sentence, nepali_sentence=nepali_sentence)

English Text: If you don’t come, we will start the game.
English Tokens: ['▁', 'if', '▁you', '▁don', '’', 't', '▁come', ',', '▁we', '▁will', '▁start', '▁the', '▁game', '.', '</s>']
English Tokens ID: [3, 99, 25, 278, 22, 17, 369, 6, 62, 56, 456, 8, 467, 5, 1]
Revert: if you don’t come, we will start the game.</s>
Token len: 15

Nepali Text: यदि तिमी आएनौ भने, हामीले खेल सुरु गर्नेछौं।
Nepali Tokens: ['▁यदि', '▁तिमी', '▁आएन', 'ौ', '▁भने,', '▁हामीले', '▁खेल', '▁सुरु', '▁गर्नेछौं', '▁।', '</s>']
Nepali Tokens ID: [1467, 3325, 7338, 819, 143, 341, 222, 172, 12593, 8, 6]
Revert: यदि तिमी आएनौ भने, हामीले खेल सुरु गर्नेछौं ।
Token len: 11


In [44]:
# Parallel sentence pair: a longer, two-sentence example.
english_sentence, nepali_sentence = (
    "For the economic growth of our country, innovative thinking and investment are required. Unless we take the initiative for change, no improvement is possible.",
    "हाम्रो देशको आर्थिक वृद्धिका लागि नवप्रवर्तनशील सोच र लगानीको आवश्यकता पर्दछ। जबसम्म हामी परिवर्तनका लागि पहल गर्दैनौं, तबसम्म कुनै पनि सुधार सम्भव छैन।",
)

tokenize_text(english_sentence=english_sentence, nepali_sentence=nepali_sentence)

English Text: For the economic growth of our country, innovative thinking and investment are required. Unless we take the initiative for change, no improvement is possible.
English Tokens: ['▁for', '▁the', '▁economic', '▁growth', '▁of', '▁our', '▁country', ',', '▁innovative', '▁thinking', '▁and', '▁investment', '▁are', '▁required', '.', '▁', 'unless', '▁we', '▁take', '▁the', '▁initiative', '▁for', '▁change', ',', '▁no', '▁improvement', '▁is', '▁possible', '.', '</s>']
English Tokens ID: [21, 8, 1456, 1170, 13, 69, 684, 6, 3058, 1631, 11, 1729, 33, 831, 5, 3, 3227, 62, 240, 8, 6121, 21, 483, 6, 150, 4179, 19, 487, 5, 1]
Revert: for the economic growth of our country, innovative thinking and investment are required. unless we take the initiative for change, no improvement is possible.</s>
Token len: 30

Nepali Text: हाम्रो देशको आर्थिक वृद्धिका लागि नवप्रवर्तनशील सोच र लगानीको आवश्यकता पर्दछ। जबसम्म हामी परिवर्तनका लागि पहल गर्दैनौं, तबसम्म कुनै पनि सुधार सम्भव छैन।
Nepali Tokens: ['▁हाम

In [43]:
# Parallel sentence pair: a single long sentence.
english_sentence, nepali_sentence = (
    "To preserve moral values in society, the contribution of all of us is necessary.",
    "समाजमा नैतिक मूल्यहरूको संरक्षण गर्नका लागि हामी सबैको योगदान आवश्यक छ।",
)

tokenize_text(english_sentence=english_sentence, nepali_sentence=nepali_sentence)

English Text: To preserve moral values in society, the contribution of all of us is necessary.
English Tokens: ['▁to', '▁preserve', '▁moral', '▁values', '▁in', '▁society', ',', '▁the', '▁contribution', '▁of', '▁all', '▁of', '▁us', '▁is', '▁necessary', '.', '</s>']
English Tokens ID: [12, 8996, 4854, 2620, 16, 2710, 6, 8, 6275, 13, 66, 13, 178, 19, 1316, 5, 1]
Revert: to preserve moral values in society, the contribution of all of us is necessary.</s>
Token len: 17

Nepali Text: समाजमा नैतिक मूल्यहरूको संरक्षण गर्नका लागि हामी सबैको योगदान आवश्यक छ।
Nepali Tokens: ['▁समाजमा', '▁नैतिक', '▁मूल्य', 'हरूको', '▁संरक्षण', '▁गर्नका', '▁लागि', '▁हामी', '▁सबैको', '▁योगदान', '▁आवश्यक', '▁छ', '▁।', '</s>']
Nepali Tokens ID: [1695, 5676, 519, 393, 809, 484, 24, 197, 2122, 1231, 313, 19, 8, 6]
Revert: समाजमा नैतिक मूल्यहरूको संरक्षण गर्नका लागि हामी सबैको योगदान आवश्यक छ ।
Token len: 14
