# Test Khmer Word Segmentation
Testing the `khmercut` library for Khmer word segmentation.

In [1]:
# Install khmercut
!pip install khmercut

Collecting khmercut
  Downloading khmercut-0.1.0.tar.gz (5.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m364.5 kB/s[0m  [33m0:00:16[0mm0:00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting python-crfsuite (from khmercut)
  Downloading python_crfsuite-0.9.12-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.3 kB)
Downloading python_crfsuite-0.9.12-cp311-cp311-macosx_11_0_arm64.whl (319 kB)
Building wheels for collected packages: khmercut
  Building wheel for khmercut (pyproject.toml) ... [?25ldone
[?25h  Created wheel for khmercut: filename=khmercut-0.1.0-py3-none-any.whl size=5872139 sha256=28db1b585d5e8a43177b34a8f03055d91592ac27945f28f2e24384aa2e596f7d
  Stored in directory: /Users/macbookair/Library/Caches/pip/wheels/98/e8/2c/bf29d047ec29f274a676e0c0448bad56492c0de5c3ee21623c
Successful

In [2]:
# Test khmercut
from khmercut import tokenize

# Sample sentence written as one unbroken run (no spaces between words)
text_no_spaces = "លោកជំទាវបណ្ឌិតពេជចន្ទមុន្នីហ៊ុនម៉ាណែតបន្តប្រគេនភេសជ្ជៈ"

# Segment it, then show the input, the raw token list, and a space-joined view.
# Each print emits a heading and its value on consecutive lines via sep="\n".
tokens = tokenize(text_no_spaces)
print("Original text:", text_no_spaces, sep="\n")
print("\nTokenized:", tokens, sep="\n")
print("\nJoined with spaces:", ' '.join(tokens), sep="\n")

Original text:
លោកជំទាវបណ្ឌិតពេជចន្ទមុន្នីហ៊ុនម៉ាណែតបន្តប្រគេនភេសជ្ជៈ

Tokenized:
['លោកជំទាវ', 'បណ្ឌិត', 'ពេជចន្ទ', 'មុន្នី', 'ហ៊ុន', 'ម៉ាណែត', 'បន្ត', 'ប្រគេន', 'ភេសជ្ជៈ']

Joined with spaces:
លោកជំទាវ បណ្ឌិត ពេជចន្ទ មុន្នី ហ៊ុន ម៉ាណែត បន្ត ប្រគេន ភេសជ្ជៈ


In [3]:
# Test with more examples
from khmercut import tokenize

# Additional unsegmented inputs to run through the segmenter
test_sentences = [
    "លោកអឿនប៉ាវទៅលេងខេត្តកំពត",
    "ព្រះរាជាណាចក្រកម្ពុជា",
    "សម្តេចបវរធិបតីហ៊ុនម៉ាណែត",
]

# Horizontal rule printed after every example
divider = "-" * 80

for text in test_sentences:
    tokens = tokenize(text)
    # One call prints the input, the space-joined tokens, and the rule, each on its own line
    print(f"Input:  {text}", f"Output: {' '.join(tokens)}", divider, sep="\n")

Input:  លោកអឿនប៉ាវទៅលេងខេត្តកំពត
Output: លោក អឿន ប៉ាវ ទៅ លេង ខេត្ត កំពត
--------------------------------------------------------------------------------
Input:  ព្រះរាជាណាចក្រកម្ពុជា
Output: ព្រះរាជាណាចក្រ កម្ពុជា
--------------------------------------------------------------------------------
Input:  សម្តេចបវរធិបតីហ៊ុនម៉ាណែត
Output: សម្តេច បវរធិបតី ហ៊ុន ម៉ាណែត
--------------------------------------------------------------------------------


In [4]:
# Test handling text that already has spaces
from khmercut import tokenize

# Input whose words are already separated by single spaces
text_with_spaces = "ហ៊ុន ម៉ាណែត ទៅ ភ្នំពេញ"

tokens = tokenize(text_with_spaces)
# NOTE(review): the recorded output shows wider gaps than the input, so tokenize()
# presumably returns the spaces as tokens of their own — confirm against khmercut.
joined = ' '.join(tokens)

for line in ("Text with spaces:", text_with_spaces, "\nTokenized:", joined):
    print(line)

Text with spaces:
ហ៊ុន ម៉ាណែត ទៅ ភ្នំពេញ

Tokenized:
ហ៊ុន   ម៉ាណែត   ទៅ   ភ្នំពេញ


In [5]:
# Create a function that handles both cases
from khmercut import tokenize

def smart_tokenize(text: str) -> list[str]:
    """
    Tokenize Khmer text whether or not it is already whitespace-separated.

    - Text containing whitespace between words is treated as already
      tokenized and is split on that whitespace.
    - Text with no internal whitespace is passed to khmercut's ``tokenize``
      for word segmentation.

    Args:
        text: Input string; leading and trailing whitespace is ignored.

    Returns:
        A list of tokens. Empty or whitespace-only input yields ``[]``.
    """
    # str.split() with no argument ignores leading/trailing whitespace and
    # splits on any run of whitespace (spaces, tabs, newlines, ...). Branching
    # on its result keeps the "already tokenized" check consistent with the
    # split itself, instead of testing only for the ASCII space character.
    parts = text.split()

    if not parts:
        # Nothing to segment; do not hand an empty string to the segmenter.
        return []
    if len(parts) > 1:
        # Already whitespace-delimited: keep the existing token boundaries.
        return parts
    # A single unbroken run: apply word segmentation.
    return tokenize(parts[0])

# Exercise both paths of smart_tokenize: an unsegmented sentence and a pre-spaced one
result1 = smart_tokenize("លោកអឿនប៉ាវទៅលេងខេត្តកំពត")
result2 = smart_tokenize("ហ៊ុន ម៉ាណែត ទៅ ភ្នំពេញ")

# Heading and space-joined tokens on consecutive lines for each case
print("Test 1: No spaces", ' '.join(result1), sep="\n")
print("\nTest 2: With spaces", ' '.join(result2), sep="\n")

Test 1: No spaces
លោក អឿន ប៉ាវ ទៅ លេង ខេត្ត កំពត

Test 2: With spaces
ហ៊ុន ម៉ាណែត ទៅ ភ្នំពេញ
