# Gujarati Text Normalization Test Cases

This notebook tests the Gujarati text normalization implementation.

## Test Categories:
1. Cardinal Numbers
2. Decimal Numbers
3. Fractions
4. Dates
5. Time
6. Money
7. Measurements
8. Ordinal Numbers
9. Telephone Numbers
10. Whitelist/Abbreviations
11. Mixed Content
12. Batch Testing



In [None]:
import sys
import os

# Put the current working directory (as an absolute path) at the front of
# sys.path so the import below is resolved against it first.
# NOTE(review): this presumably expects the notebook to be launched from the
# NeMo-text-processing repository root — confirm.
sys.path.insert(0, os.path.abspath('.'))

from nemo_text_processing.text_normalization.normalize import Normalizer

# Reaching this line means the import above did not raise.
print("Imports successful!")


In [None]:
# Build the Gujarati normalizer from a single options mapping so the settings
# are easy to inspect and tweak in one place.
normalizer_options = {
    "input_case": "cased",
    "lang": "gu",
    "cache_dir": None,  # set to a directory path if you want to cache .far files
    "overwrite_cache": False,
    "post_process": True,
}
normalizer_gu = Normalizer(**normalizer_options)

# Confirm construction succeeded, then report the language stored on the instance.
print("Gujarati Text Normalizer initialized successfully!")
active_lang = normalizer_gu.lang
print(f"Language: {active_lang}")


## 1. Cardinal Numbers Test



In [None]:
# Cardinal inputs: Gujarati-digit values of increasing length, negative values
# in both digit sets, and round numbers from the Indian numbering system.
cardinal_tests = [
    "155=100",  # NOTE(review): an equation rather than a plain cardinal — confirm this input is intentional
    "૧૨૩૪",
    "૧૨૩૪૫",
    "૧૨૩૪૫૬",
    "૧૨૩૪૫૬૭",
    "૧૨૩૪૫૬૭૮",
    "-૧૨૩",
    "-120",
    "૧૦૦૦",
    "૧૦૦૦૦૦",  # 1 lakh (1,00,000)
    "૧૦૦૦૦૦૦૦",  # 1 crore (1,00,00,000)
    "૧૦૦૦૦૦૦૦૦",  # 10 crore (10,00,00,000)
]

print("=" * 60)
print("CARDINAL NUMBERS TEST")
print("=" * 60)
# Normalize each input on its own and print it next to the result; the input
# column is padded to 15 characters so the arrows line up.
for test in cardinal_tests:
    result = normalizer_gu.normalize(test)
    print(f"Input:  {test:15} -> Output: {result}")


## 2. Decimal Numbers Test



In [None]:
# Decimal inputs, written once with Gujarati digits and once with English digits.
decimal_tests = [
    # --- Gujarati digits ---
    "૧૨.૩૪",
    "૧૨૩.૪૫૬",
    "૦.૫",
    "-૧૨.૩૪",
    "૧૨.૩૪૫૬",
    # --- English digits ---
    "12.34",
    "123.456",
    "0.5",
    "-12.34",
    "12.3456",
]

# Section banner: title framed by two 60-character rules.
banner = "=" * 60
print(banner, "DECIMAL NUMBERS TEST (GUJARATI & ENGLISH DIGITS)", banner, sep="\n")

# One normalize() call per input, printed beside the original text.
for raw_text in decimal_tests:
    normalized = normalizer_gu.normalize(raw_text)
    print(f"Input:  {raw_text:15} -> Output: {normalized}")


## 3. Fractions Test



In [None]:
# Fraction inputs (simple, mixed, and negative) in both digit sets.
fraction_tests = [
    # --- Gujarati digits ---
    "૩/૪",
    "૧/૨",
    "૧/૪",
    "૧૨ ૩/૪",
    "-૧/૨",
    # --- English digits ---
    "3/4",
    "1/2",
    "1/4",
    "12 3/4",
    "-1/2",
]

# Print the framed section title line by line.
for header_line in ("=" * 60, "FRACTIONS TEST (GUJARATI & ENGLISH DIGITS)", "=" * 60):
    print(header_line)

# Show each input alongside its normalized form.
for fraction_text in fraction_tests:
    spoken = normalizer_gu.normalize(fraction_text)
    print(f"Input:  {fraction_text:15} -> Output: {spoken}")


## 4. Dates Test



In [None]:
# Date inputs using '-' and '/' separators, in both digit sets.
date_tests = [
    # --- Gujarati digits ---
    "૦૧-૦૪-૨૦૨૪",
    "૧૫-૦૬-૨૦૨૪",
    "૨૦૨૪-૦૧-૧૫",
    "૧૫/૦૬/૨૦૨૪",
    "૦૪-૦૧-૨૦૨૪",  # MM-DD format
    # --- English digits ---
    "01-04-2024",
    "15-06-2024",
    "2024-01-15",
    "15/06/2024",
    "04-01-2024",  # MM-DD format
]

# Section banner: title framed by two 60-character rules.
rule = "=" * 60
print(rule, "DATES TEST (GUJARATI & ENGLISH DIGITS)", rule, sep="\n")

# Normalize the dates one at a time and print input/output pairs.
for date_text in date_tests:
    normalized_date = normalizer_gu.normalize(date_text)
    print(f"Input:  {date_text:15} -> Output: {normalized_date}")


## 5. Time Test



In [None]:
# Clock-time inputs (HH:MM and HH:MM:SS) in both digit sets.
time_tests = [
    # --- Gujarati digits ---
    "૧૨:૩૦",
    "૧:૪૦",
    "૧૨:૦૦",
    "૧૨:૩૦:૪૫",
    "૦૯:૧૫",
    "૨૩:૫૯",
    # --- English digits ---
    "12:30",
    "1:40",
    "12:00",
    "12:30:45",
    "09:15",
    "23:59",
    "9:15",  # single digit hour
]

# Print the framed section title line by line.
for header_line in ("=" * 60, "TIME TEST (GUJARATI & ENGLISH DIGITS)", "=" * 60):
    print(header_line)

# Show each time string alongside its normalized form.
for clock_text in time_tests:
    normalized_time = normalizer_gu.normalize(clock_text)
    print(f"Input:  {clock_text:15} -> Output: {normalized_time}")


## 6. Money Test



In [None]:
# Currency inputs: whole and fractional amounts in both digit sets.
money_tests = [
    # --- Gujarati digits ---
    "₹૧૦૦",
    "₹૧૨૩૪",
    "₹૫૦.૫૦",
    "₹૦.૫૦",
    "રૂ ૧૦૦૦",
    # --- English digits ---
    "₹100",
    "₹1234",
    "₹50.50",
    "₹0.50",
    "₹1000",
    "₹500",
]

# Section banner: title framed by two 60-character rules.
banner = "=" * 60
print(banner, "MONEY TEST (GUJARATI & ENGLISH DIGITS)", banner, sep="\n")

# One normalize() call per amount, printed beside the original text.
for amount_text in money_tests:
    normalized_amount = normalizer_gu.normalize(amount_text)
    print(f"Input:  {amount_text:15} -> Output: {normalized_amount}")


## 7. Measurements Test



In [None]:
# Quantity-plus-unit inputs (kg, m, km, cm) in both digit sets.
measure_tests = [
    # --- Gujarati digits ---
    "૧૨ kg",
    "૧૨૫ kg",
    "૧૦૦ m",
    "૫ km",
    "૧૨.૩૪ cm",
    # --- English digits ---
    "12 kg",
    "125 kg",
    "100 m",
    "5 km",
    "12.34 cm",
]

# Print the framed section title line by line.
for header_line in ("=" * 60, "MEASUREMENTS TEST (GUJARATI & ENGLISH DIGITS)", "=" * 60):
    print(header_line)

# Show each measurement alongside its normalized form.
for measure_text in measure_tests:
    normalized_measure = normalizer_gu.normalize(measure_text)
    print(f"Input:  {measure_text:15} -> Output: {normalized_measure}")


## 8. Ordinal Numbers Test



In [None]:
# Ordinal inputs: a number followed by a Gujarati ordinal suffix. The second
# half repeats the first half with English digits so both digit sets are
# exercised on the same values.
ordinal_tests = [
    "૧મું",           # Gujarati digits
    "૨મી",           # Gujarati digits
    "૧૦મું",          # Gujarati digits
    "૨૧મી",          # Gujarati digits
    "૧૦૦મું",         # Gujarati digits
    "1મું",            # English digits with Gujarati suffix
    "2મી",            # English digits with Gujarati suffix
    "10મું",           # English digits with Gujarati suffix
    "21મી",           # English digits with Gujarati suffix
    "100મું",          # English digits with Gujarati suffix
]

print("=" * 60)
print("ORDINAL NUMBERS TEST (GUJARATI & ENGLISH DIGITS)")
print("=" * 60)
# Normalize each input on its own and print it next to the result; the input
# column is padded to 15 characters so the arrows line up.
for test in ordinal_tests:
    result = normalizer_gu.normalize(test)
    print(f"Input:  {test:15} -> Output: {result}")


## 9. Telephone Numbers Test



In [None]:
# Phone-number inputs: with and without a '+' prefix, with a space or hyphen
# separator, and one written in English digits.
telephone_tests = [
    "+૯૧૫૭૧૧૪૦૦૭",
    "+૯૧ ૯૨૧૦૫૧૫૬૦૬",
    "૧૩૭૪-૩૦૯૯૮૮",
    "9943206292",
]

# Section banner: title framed by two 60-character rules.
rule = "=" * 60
print(rule, "TELEPHONE NUMBERS TEST", rule, sep="\n")

# The input column is 20 wide here because phone numbers are longer.
for phone_text in telephone_tests:
    normalized_phone = normalizer_gu.normalize(phone_text)
    print(f"Input:  {phone_text:20} -> Output: {normalized_phone}")


## 10. Whitelist/Abbreviations Test



In [None]:
# Abbreviation inputs that are passed through the normalizer as-is.
whitelist_tests = [
    "ડૉક્.",
    "પ્રો.",
    "શ્રી",
    "કિ.મી.",
    "મી.",
]

# Print the framed section title line by line.
for header_line in ("=" * 60, "WHITELIST/ABBREVIATIONS TEST", "=" * 60):
    print(header_line)

# Show each abbreviation alongside its normalized form.
for abbreviation in whitelist_tests:
    expanded = normalizer_gu.normalize(abbreviation)
    print(f"Input:  {abbreviation:15} -> Output: {expanded}")


## 11. Mixed Content Test



In [None]:
# Full sentences that combine several token types (dates, times, money,
# quantities, ordinals) in one input.
mixed_tests = [
    "આજે ૧૫-૦૬-૨૦૨૪ ના રોજ ૧૨:૩૦ વાગ્યે સભા છે.",
    "₹૧૦૦૦ અને ₹૫૦૦ મળીને ₹૧૫૦૦ થાય છે.",
    "૧૨૩ કિલોગ્રામ વજન અને ૫૦ કિલોમીટર અંતર.",
    "૧મું સ્થાન અને ૨મી સ્થાન.",
]

# Section banner: title framed by two 60-character rules.
banner = "=" * 60
print(banner, "MIXED CONTENT TEST", banner, sep="\n")

# Sentences are long, so input and output go on separate lines, followed by a
# dashed divider between cases.
divider = "-" * 60
for sentence in mixed_tests:
    normalized_sentence = normalizer_gu.normalize(sentence)
    print(f"Input:  {sentence}", f"Output: {normalized_sentence}", divider, sep="\n")


## 12. Batch Testing



In [None]:
# Inputs of different kinds, normalized together in a single call.
batch_tests = [
    "123",
    "૧૨.૩૪",
    "૧૨:૩૦",
    "₹૧૦૦",
    "૧૫-૦૬-૨૦૨૪",
]

# Print the framed section title line by line.
for header_line in ("=" * 60, "BATCH TESTING", "=" * 60):
    print(header_line)

# normalize_list() takes the whole list at once; pair each returned item with
# the input at the same position for display.
results = normalizer_gu.normalize_list(batch_tests)
for source_text, normalized_text in zip(batch_tests, results):
    print(f"Input:  {source_text:15} -> Output: {normalized_text}")


## Summary

All test cases have been executed. The cells above only print the normalizer's output and do not assert expected values, so review the outputs manually to verify that Gujarati text normalization is working correctly for each category.

### Usage Tips:
- Run each cell sequentially (Shift+Enter)
- Modify test cases in any cell to test your own inputs
- Use `verbose=True` in normalize() to see detailed processing information
- Set `cache_dir` to a directory path to speed up subsequent runs

