## Normalize

In [6]:
import json
import regex as re
import shutil
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, List, Tuple

from datasets import Audio, Dataset, DatasetDict, load_from_disk
from khmercut import tokenize as khmer_tokenize

from khmerspeech import (
  normalize,
  datetime as km_datetime,
  phone_numbers,
  currency,
  cardinals,
  decimals,
  urls,
  ordinals,
  dict_verbalize,
  repeater,
  punctuations,
  parenthesis
)

def segment_text(text: str) -> str:
    """Run ``text`` through the khmerspeech processors and tidy its spacing.

    The stages listed in ``pipeline`` are applied strictly in order, each one
    receiving the previous stage's output. The final string is stripped and
    every run of whitespace is collapsed to a single space.

    Args:
        text: Raw input string.

    Returns:
        The processed string with single-space separation.
    """
    pipeline = (
        normalize.processor,
        # Phone numbers are the only stage called with an extra argument.
        lambda chunk: phone_numbers.processor(chunk, chunk_size=3),
        km_datetime.date_processor,
        km_datetime.time_processor,
        urls.processor,
        repeater.processor,
        currency.processor,
        cardinals.processor,
        decimals.processor,
        ordinals.processor,
        punctuations.processor,
        dict_verbalize,
    )

    result = text
    for stage in pipeline:
        result = stage(result)

    # Collapse any whitespace runs left behind by the stages above.
    return re.sub(r"\s+", " ", result.strip())

# Smoke test for segment_text: one mixed Khmer/English sentence containing
# repeated spaces, repeated "!", a phone number, a dollar amount, an ISO date,
# a 12-hour time and a URL. The cell's recorded output shows the result.
segment_text('ការសិក្សាត្រូវមាន       ការព្យាយាម this  is the function for word segmentation!!! 010123123 គិតថ្លៃ $100.25 នៅថ្ងៃទី 2024-01-02 វេលា 10:23AM ចូលតាម https://google.com.kh')
# Alternative minimal input (Khmer only, no spaces), kept for manual experiments:
# segment_text('ការសិក្សាត្រូវមានការព្យាយាម')

'ការសិក្សាត្រូវមាន ការព្យាយាម this is the function for word segmentation ! សូន្យ▁ដប់▁មួយរយ▁ម្ភៃបី▁មួយរយ▁ម្ភៃបី គិតថ្លៃ មួយរយដុល្លារ▁ម្ភៃប្រាំសេន នៅថ្ងៃទី ពីរពាន់▁ម្ភៃបួន មួយ ពីរ វេលា ដប់ ម្ភៃបី▁A▁M ចូលតាម google dot com dot k▁h'

## Phone Numbers

In [6]:
from khmerspeech import phone_numbers

# chunk_size=2: the recorded output for this cell is '0▁10▁12▁31▁23'.
phone_numbers.processor("010123123", chunk_size=2)

'0▁10▁12▁31▁23'

In [7]:
# chunk_size=3 (the value segment_text uses): the recorded output is '0▁10▁123▁123'.
phone_numbers.processor("010123123", chunk_size=3)

'0▁10▁123▁123'

## URLs and Emails

In [12]:
from khmerspeech import urls

# Multi-line sample mixing an email address, plain http URLs, a host:port URL
# and an https URL with a trailing slash.
# NOTE(review): `text` is defined here but never passed to urls.processor in
# this cell; only `text2` is printed, so the email case is not exercised.
text = """
Contact us at support@gmail.com or visit http://moe.gov.kh for more info.
Local test: http://localhost:8080.
Acleda : https://www.acledabank.com.kh/
"""

# Single-line sample: the same localhost URL on both sides of a bank URL,
# separated by differing amounts of whitespace.
text2 = "http://localhost:2000   https://www.acledabank.com.kh http://localhost:2000"

# NOTE(review): the recorded output for this cell contains "has url" / "no url"
# lines and the literal text "host.lower()". These look like debug prints and
# an un-interpolated expression inside khmerspeech.urls; confirm upstream.
print(urls.processor(text2))





has url
no url
host.lower() port 2000   www dot acledabank dot com dot k▁h host.lower() port 2000


## Hashtags

## Repeaters

## Datetimes

In [7]:

from khmerspeech import datetime

# (your regexes and functions here...)

# Three sample lines: an ISO-style date with a 12-hour time, a Khmer-digit
# date and time, and a day-first date with a 24-hour time.
text = (
    "Meet on 2025-10-20 at 7:05pm.\n"
    "ពិភាក្សា ២០២៥/១០/២០ ម៉ោង ០៨:៥៩ AM.\n"
    "Invoice date: 20/10/2025 and follow-up at 12:30."
)

# Each entry pairs a heading with the transformation whose result is printed
# under it. The transformation is only invoked after its heading is printed.
report_sections = (
    ("Original:", lambda sample: sample),
    ("\nDates normalized:", datetime.date_processor),
    ("\nTimes normalized:", datetime.time_processor),
    (
        "\nDates then Times (combined):",
        lambda sample: datetime.time_processor(datetime.date_processor(sample)),
    ),
)

for heading, transform in report_sections:
    print(heading)
    print(transform(text))


Original:
Meet on 2025-10-20 at 7:05pm.
ពិភាក្សា ២០២៥/១០/២០ ម៉ោង ០៨:៥៩ AM.
Invoice date: 20/10/2025 and follow-up at 12:30.

Dates normalized:
Meet on 2025 10 20 at 7:05pm.
ពិភាក្សា ២០២៥ ១០ ២០ ម៉ោង ០៨:៥៩ AM.
Invoice date: 20 10 2025 and follow-up at 12:30.

Times normalized:
Meet on 2025-10-20 at 7 05▁p▁m.
ពិភាក្សា ២០២៥/១០/២០ ម៉ោង ០៨ ៥៩▁A▁M.
Invoice date: 20/10/2025 and follow-up at 12 30.

Dates then Times (combined):
Meet on 2025 10 20 at 7 05▁p▁m.
ពិភាក្សា ២០២៥ ១០ ២០ ម៉ោង ០៨ ៥៩▁A▁M.
Invoice date: 20 10 2025 and follow-up at 12 30.


In [None]:
import regex as re

# Hashtag: "#" followed by one or more ASCII letters, where the "#" is not
# directly preceded by a word character. The whole tag is captured in group 1.
RE_HASHTAGS = re.compile(r"\B(\#[a-zA-Z]+\b)")
# Text enclosed in (...) (group 1) or [...] (group 2), matched non-greedily,
# together with any whitespace around the enclosing pair.
# NOTE(review): "BACKETS" looks like a typo for "BRACKETS"; the name is left
# unchanged here because other code may reference it.
RE_ENCLOSED_PARENTHESIS_BACKETS = re.compile(r"\s*\((.*?)\)\s*|\s*\[(.*?)\]\s*")
# One or more digits, an optional ".", then zero or more digits.
RE_GENERIC_NUMBER = re.compile(r"\d+\.?\d*")
# A comma sitting directly between two digit runs (both runs are captured).
RE_NUMBER_COMMAS = re.compile(r"(\d+)\,(\d+)")
# A comma plus any whitespace on either side of it.
RE_USELESS_COMMAS = re.compile(r"\s*\,\s*")
# At the very start of the string (no MULTILINE flag is set): a run of digits
# (\d or Khmer digits U+17E0-U+17E9) followed by "." and one whitespace char.
# NOTE(review): for str patterns \d normally already matches Unicode decimal
# digits, so the explicit Khmer range may be redundant; confirm before
# simplifying.
RE_ORDINAL_NUMBER = re.compile(r"^([\d\u17e0-\u17e9]+)\.\s")
# One or more commas whose left neighbour is not an ASCII/Khmer digit and whose
# right neighbour is either a non-digit or the end of the string. Both
# neighbours are captured.
RE_SPACING_COMMAS = re.compile(
  r"([^0-9\u17e0-\u17e9])[,]+([^0-9\u17e0-\u17e9]|$)"
)  # abc,abc

# A "." followed by a run of zeros (group 1) and then more digits (group 2).
# The class [\u17e00] is the escape \u17e0 (Khmer digit zero) followed by a
# literal ASCII "0", so it matches either kind of zero. It is not a five-digit
# code point.
RE_NUM_LEADING_ZEROS = re.compile(r"\.([\u17e00]+)([\d\u17e0-\u17e9]+)")

def leading_zeros_replacer(m):
  """Build the replacement text for a "dot, zeros, digits" match.

  ``m`` is indexed like a regex match object: ``m[1]`` is the run of zeros and
  ``m[2]`` is the digit string that follows it. Presumably this is the
  replacement callback for RE_NUM_LEADING_ZEROS; confirm at the call site.

  Returns:
    ".▁", then the characters of ``m[1]`` separated by "▁", then "▁" and
    ``m[2]``.
  """
  spaced_zeros = "▁".join(m[1])
  trailing_digits = m[2]
  return f".▁{spaced_zeros}▁{trailing_digits}"

