# Word segmentation using Maximal Matching
apply this concept from nmm_pythainl research in Thai language

## Import

In [9]:
import re
from collections import defaultdict
from heapq import heappush, heappop  # for priority queue

!pip install -q marisa_trie
from marisa_trie import Trie

## Convert data from excel to txt file

In [10]:
# !wget nv https://huggingface.co/datasets/metythorn/khmer-dictionary-dataset-44k-v1/blob/main/khmer_dictionary.xlsx

If u got error when loading excel let try download directly from huggingface url above

In [11]:
# load excel file using pandas
import pandas as pd
df = pd.read_excel('./khmer_dictionary.xlsx', engine='openpyxl')
df.head()

Unnamed: 0,t_main,t_pron,t_poly,t_pos,t_exp,t_exam
0,ក,[ក],,ន.,តួព្យញ្ជនៈទី១នៃអក្ខរក្រមព្យញ្ជនៈខ្មែរ និងជាតួព...,
1,ក,[ក],,កិ.,"តាំងផ្ដើម, តាំងធ្វើ :","កសាង, កកើត, កចេតិយ, កភូមិ។"
2,ក,[ក],១),ន.,អវយវៈដែលតពីក្បាលទៅស្មាឬទៅខ្លួននៃមនុស្សសត្វ :,"កមនុស្ស, កមាន់។"
3,ក,[ក],២),ន.,កន្លែងដែលតជាប់ពីមាត់ដបជាដើមចុះទៅទល់នឹងក្អេងក៏ហ...,"កដប, កក្អម។"
4,ក,[ក],៣),ន.,ផ្នែកនៃដៃឬជើងដែលតភ្ជាប់ពីប្រអប់ដៃទៅកំភួនដៃឬពីប...,"កដៃ, កជើង។"


In [12]:
df_word = df["t_main"]
# remove duplicate words
df_word = df_word.drop_duplicates()
# convert to text file by support khmer language
df_word.to_csv("khmer_dictionary.txt", index=False, header=False)


## Load Data
load khmer_dictionary.txt to create trie

In [13]:
wordlist = [li.strip() for li in open('./khmer_dictionary.txt', 'r', encoding='utf-8')]
trie = Trie(wordlist)


In [14]:
# check dataset 
print(trie.keys(u'កក់'))
print(trie.keys(u'កា'))
print(len(trie.keys(u'កា')))

print("====== dataset info ======")
print(type(trie))
print(len(trie))

['កក់', 'កក់ក្ដៅ']
['កា', 'ការ', 'ការិយាធិបតេយ្យ', 'ការិយាល័យ', 'ការិយកម្ម', 'ការិយបរិច្ឆេទ', 'ការិនី', 'ការុញ្ញចិត្ត', 'ការុញ្ញភាព', 'ការុញ្ញហេតុ', 'ការុង', 'ការុណិក', 'ការបង្កើតថ្មី', 'ការបន្តពូជ', 'ការប៉ះទង្គិច', 'ការប្រាក់', 'ការពារ', 'ការពារប្រទេស', 'ការពិត', 'ការកនាម', 'ការកេត', 'ការណនិយម', 'ការណ៍', 'ការី', 'ការីម៉ាឃិធីង', 'ការងារ', 'ការជួល', 'ការដ្ឋាន', 'ការទូត', 'ការន្ត', 'ការរញ្ជួយដី', 'ការាត់', 'ការ៉ាតេដូ', 'ការ៉ាស់', 'ការ៉ូ', 'ការ៉ូឡា', 'ការ៉ុត', 'ការ៉េ', 'ការ៉េម', 'ការ៉េមកី', 'ការ្យ', 'ការ្យសិល្បៈ', 'ការ៍ទូស', 'កាត', 'កាត់', 'កាត់កង', 'កាត់កាល់', 'កាត់ក្ដី', 'កាត់សក់', 'កាត់សាញ', 'កាត់ស្បែក', 'កាត់ទឹក', 'កាត់ទោស', 'កាត់ចុង', 'កាត់ដេរ', 'កាត់ថ្លៃ', 'កាតៅ', 'កាតំ', 'កាតាលីកម្ម', 'កាតាលីករ', 'កាតាក', 'កាតាប', 'កាតាឡុក', 'កាតឹប', 'កាតឹបស៊ង', 'កាតគ្រី', 'កាតព្វកិច្ច', 'កាតិក', 'កាតុង', 'កាតូដ', 'កាប', 'កាប៊ីន', 'កាប៊ីនភ្លើង', 'កាប៊ីណេត៍', 'កាប់', 'កាប់ឆៅ', 'កាប់ស្តាំង', 'កាប៉ូរ៉ាល់', 'កាប៉ៅ', 'កាបែន', 'កាប៌ាស', 'កាបៗ', 'កាបូន', 'កាបូនឌីអុកស៊ីដ', 'កាបូនម៉ូណូអុកស៊ីដ', 'កាបូនអ៊ីដ្រ

## One Cut
แทนที่ multicut

In [15]:
# help segmenet enlish word also
pat_eng = re.compile(r'''(?x)
[-a-zA-Z]+|   # english
\d[\d,\.]*|   # number
[ \t]+|       # space
\r?\n         # newline
''')

In [16]:
# text = 'สวัสดีครับ สบายดีไหมครับ'
# creae text variable with khmer sentence
text = "ផ្នែកនៃដៃឬជើងដែលតភ្ជាប់ពីប្រអប់ដៃទៅកំភួនដៃ"

In [17]:
def onecut(text):
  words_at = defaultdict(list)  # main data structure

  def serialize(p, p2):    # helper function
    for w in words_at[p]:
      p_ = p + len(w)
      if p_== p2:
        yield [w]
      elif p_ < p2:
        for path in serialize(p_, p2):
          yield [w]+path

  q = [0]       # min-heap queue
  last_p = 0    # last position for yield
  while q[0] < len(text):
      p = heappop(q)

      for w in trie.prefixes(text[p:]):
          words_at[p].append(w)
          if p+len(w) not in q:
            heappush(q, p+len(w))

      if len(q)==1:
          for w in min(serialize(last_p, q[0]), key=len):
            yield w
          last_p = q[0]

      # กรณี len(q) == 0  คือ ไม่มีใน dict
      if len(q)==0:
          m = pat_eng.match(text[p:])
          if m: # อังกฤษ, เลข, ว่าง
              i = p + m.span()[1]
          else: # skip น้อยที่สุด ที่เป็นไปได้
              for i in range(p, len(text)):
                  ww = trie.prefixes(text[i:])
                  m = pat_eng.match(text[i:])
                  if ww or m:
                      break
              else:
                  i = len(text)
          w = text[p:i]
          words_at[p].append(w)
          yield w
          last_p = i
          heappush(q, i)

### heapq แทน set

In [18]:
q = []   # min heap queue
for x in [4, 9, 2, 1, 5]:
  heappush(q, x)
q

[1, 2, 4, 9, 5]

In [19]:
print("min of queue is", q[0])

min of queue is 1


In [20]:
while q:
  print(heappop(q))

1
2
4
5
9


### Manual loop

In [21]:
words_at = defaultdict(list)
q = [0]
last_p = 0

In [22]:
# manual instead of while loop
print(q[0])
p = heappop(q)

for w in trie.prefixes(text[p:]):
  words_at[p].append(w)
  if p+len(w) not in q:
    heappush(q, p+len(w))
q

0


[1, 5]

In [23]:
# ส, สว
'ផ្នែក' in wordlist

True

In [24]:
print(q[0])
p = heappop(q)

for w in trie.prefixes(text[p:]):
  words_at[p].append(w)
  if p+len(w) not in q:
    heappush(q, p+len(w))
q

1


[5]

In [25]:
print(q[0])
p = heappop(q)

for w in trie.prefixes(text[p:]):
  words_at[p].append(w)
  if p+len(w) not in q:
    heappush(q, p+len(w))
q

5


[6, 7]

In [26]:
# if len(q) == 1:
#   q0 = q[0]
#   yield LatticeString(text[last_p:q0], serialize(last_p, q0))
#   last_p = q0
last_p, q[0]

(0, 6)

In [27]:
words_at

defaultdict(list, {0: ['ផ', 'ផ្នែក'], 5: ['ន', 'នៃ']})

### mm_path
ปรับจาก LatticeString ที่รวมทุกๆ path มาเป็น min แค่ path เดียว

In [28]:
def serialize(p, p2):    # helper function
  for w in words_at[p]:
    p_ = p + len(w)
    if p_== p2:
      yield [w]
    elif p_ < p2:
      for path in serialize(p_, p2):
        yield [w]+path

In [29]:
# maximal path ก็คือใช้ len เป็นตัวเลือก
min(serialize(0,6), key=len)

['ផ្នែក', 'ន']

ทดลองเสร็จแล้ว ก็ไปแก้ใน one cut ด้านบน

## ทดลอง

In [30]:
list(onecut('ផ្នែកនៃដៃឬជើងដែលតភ្ជាប់ពីប្រអប់ដៃទៅកំភួនដៃ'))

['ផ្នែក',
 'នៃ',
 'ដៃ',
 'ឬ',
 'ជើង',
 'ដែល',
 'តភ្ជាប់',
 'ពី',
 'ប្រអប់ដៃ',
 'ទៅ',
 'កំភួនដៃ']

# Summary Code

In [31]:
import re
from collections import defaultdict
from heapq import heappush, heappop  # for priority queue
from marisa_trie import Trie

wordlist = [li.strip() for li in open('./khmer_words.txt', 'r', encoding='utf-8')]
trie = Trie(wordlist)

# ช่วยตัดพวกภาษาอังกฤษ เป็นต้น
pat_eng = re.compile(r'''(?x)
[-a-zA-Z]+|   # english
\d[\d,\.]*|   # number
[ \t]+|       # space
\r?\n         # newline
''')

In [32]:
def onecut(text):
  words_at = defaultdict(list)  # main data structure

  def serialize(p, p2):    # helper function
    for w in words_at[p]:
      p_ = p + len(w)
      if p_== p2:
        yield [w]
      elif p_ < p2:
        for path in serialize(p_, p2):
          yield [w]+path

  q = [0]       # min-heap queue
  last_p = 0    # last position for yield
  while q[0] < len(text):
      p = heappop(q)

      for w in trie.prefixes(text[p:]):
          words_at[p].append(w)
          if p+len(w) not in q:
            heappush(q, p+len(w))

      if len(q)==1:
          for w in min(serialize(last_p, q[0]), key=len):
            yield w
          last_p = q[0]

      # กรณี len(q) == 0  คือ ไม่มีใน dict
      if len(q)==0:
          m = pat_eng.match(text[p:])
          if m: # อังกฤษ, เลข, ว่าง
              i = p + m.span()[1]
          else: # skip น้อยที่สุด ที่เป็นไปได้
              for i in range(p, len(text)):
                  ww = trie.prefixes(text[i:])
                  m = pat_eng.match(text[i:])
                  if ww or m:
                      break
              else:
                  i = len(text)
          w = text[p:i]
          words_at[p].append(w)
          yield w
          last_p = i
          heappush(q, i)

#### test 

In [33]:
list(onecut('សួស្តីឆ្នាំថ្មី។'))

['សួស', '្', 'តី', 'ឆ្នាំ', 'ថ្មី', '។']

In [34]:
khmer_vowels_string = ''.join([
    "\u17A5", "\u17A6", "\u17A7", "\u17A8", "\u17A9",
    "\u17AA", "\u17AB", "\u17AC", "\u17AD", "\u17AE",
    "\u17AF", "\u17B0", "\u17B1", "\u17B2", "\u17B3",
    "\u17B4", "\u17B5", "\u17B6", "\u17B7", "\u17B8",
    "\u17B9", "\u17BA", "\u17BB", "\u17BC", "\u17BD",
    "\u17BE", "\u17BF", "\u17C0", "\u17C1", "\u17C2",
    "\u17C3", "\u17C4", "\u17C5", "\u17C6", "\u17C7",
    "\u17C8",
    
])

print(khmer_vowels_string)


ឥឦឧឨឩឪឫឬឭឮឯឰឱឲឳ឴឵ាិីឹឺុូួើឿៀេែៃោៅំះៈ


## รวม TCC
หลักคือ คำนวณ tcc position ก่อน และสร้าง edge เฉพาะที่ลงพอดีตำแหน่งกัน

In [35]:
pat_tcc = """\
เc็c
เcctาะ
เccีtยะ
เccีtย(?=[เ-ไก-ฮ]|$)
เccอะ
เcc็c
เcิc์c
เcิtc
เcีtยะ?
เcืtอะ?
เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)
เctา?ะ?
cัtวะ
c[ัื]tc[ุิะ]?
c[ิุู]์
c[ะ-ู]t
c็
ct[ะาำ]?
แc็c
แcc์
แctะ
แcc็c
แccc์
โctะ
[เ-ไ]ct
""".replace('c','[ก-ฮ]').replace('t', '[่-๋]?').split()

def tcc(w):
    p = 0
    pat = re.compile("|".join(pat_tcc))
    while p<len(w):
        m = pat.match(w[p:])
        if m:
            n = m.span()[1]
        else:
            n = 1
        yield w[p:p+n]
        p += n

In [36]:
list(tcc(text))

['ផ',
 '្',
 'ន',
 'ែ',
 'ក',
 'ន',
 'ៃ',
 'ដ',
 'ៃ',
 'ឬ',
 'ជ',
 'ើ',
 'ង',
 'ដ',
 'ែ',
 'ល',
 'ត',
 'ភ',
 '្',
 'ជ',
 'ា',
 'ប',
 '់',
 'ព',
 'ី',
 'ប',
 '្',
 'រ',
 'អ',
 'ប',
 '់',
 'ដ',
 'ៃ',
 'ទ',
 'ៅ',
 'ក',
 'ំ',
 'ភ',
 'ួ',
 'ន',
 'ដ',
 'ៃ']

In [37]:
# ตำแหน่งที่อนุญาตให้ตัดได้
def tcc_pos(text):
  p_set = set()
  p = 0
  for w in tcc(text):
    p += len(w)
    p_set.add(p)
  return p_set

tcc_pos(text)

{1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42}

In [38]:
def mmcut(text):
  return list(onecut(text))

In [39]:
# ตัวอย่างปัญหา ไม่ควร add 'จุ' เพราะไม่ตรง tcc_pos
mmcut('ផ្នែកនៃដៃឬជើងដែលតភ្ជាប់ពីប្រអប់ដៃទៅកំភួនដៃ')

['ផ្នែក',
 'នៃ',
 'ដៃ',
 'ឬ',
 'ជើង',
 'ដែល',
 'តភ្ជាប់',
 'ពី',
 'ប្រអប់ដៃ',
 'ទៅ',
 'កំភួនដៃ']

In [40]:
# แยก serialize ออกมา จะได้อ่าน code ง่ายขึ้น
def serialize(words_at, p, p2):
  # find path แบบ depth first
  for w in words_at[p]:
    p_ = p + len(w)
    if p_== p2:
      yield [w]
    elif p_ < p2:
      for path in serialize(words_at, p_, p2):
        yield [w]+path

In [41]:
# ปรับ onecut ให้ใช้ tcc_pos
def onecut(text):
  words_at = defaultdict(list)  # main data structure
  allow_pos = tcc_pos(text)

  q = [0]       # min-heap queue
  last_p = 0    # last position for yield
  while q[0] < len(text):
      p = heappop(q)

      for w in trie.prefixes(text[p:]):
          p_ = p + len(w)
          if p_ in allow_pos:  # เลือกที่สอดคล้อง tcc
            words_at[p].append(w)
            if p_ not in q:
              heappush(q, p_)

      if len(q)==1:
          paths = serialize(words_at, last_p, q[0])
          for w in min(paths, key=len):
            yield w
          last_p = q[0]

      # กรณี len(q) == 0  คือ ไม่มีใน dict
      if len(q)==0:
          m = pat_eng.match(text[p:])
          if m: # อังกฤษ, เลข, ว่าง
              i = p + m.end()
          else: # skip น้อยที่สุด ที่เป็นไปได้
              for i in range(p, len(text)):
                  if i in allow_pos:   # ใช้ tcc ด้วย
                      ww = trie.prefixes(text[i:])
                      m = pat_eng.match(text[i:])
                      if ww or m:
                          break
              else:
                  i = len(text)
          w = text[p:i]
          words_at[p].append(w)
          yield w
          last_p = i
          heappush(q, i)

In [42]:
mmcut(text)

['ផ្នែក',
 'នៃ',
 'ដៃ',
 'ឬ',
 'ជើង',
 'ដែល',
 'តភ្ជាប់',
 'ពី',
 'ប្រអប់ដៃ',
 'ទៅ',
 'កំភួនដៃ']

In [43]:
mmcut('จุ๋ม')

['จุ๋ม']

In [44]:
mmcut('ไทยปน english ก็ได้นะ')

['ไทยปน', ' ', 'english', ' ', 'ก็ได้นะ']

# สรุปรวม
copy code จากข้างบน เอาเฉพาะที่ใช้จริง

In [45]:
import re
from collections import defaultdict
from heapq import heappush, heappop  # for priority queue
from marisa_trie import Trie

wordlist = [li.strip() for li in open('./khmer_dictionary.txt', 'r', encoding='utf-8')]
trie = Trie(wordlist)

# ช่วยตัดพวกภาษาอังกฤษ เป็นต้น
pat_eng = re.compile(r'''(?x)
[-a-zA-Z]+|   # english
\d[\d,\.]*|   # number
[ \t]+|       # space
\r?\n         # newline
''')

In [46]:
# list of khmer vowels in unicode
# vowels = '឴឵នែកនៃដៃឬជើងដែលតភ្ជាប់ពីប្រអប់ដៃទៅកំភួនដៃ'

In [47]:
# TCC
pat_tcc = """\
เc็c
เcctาะ
เccีtยะ
เccีtย(?=[เ-ไก-ฮ]|$)
เccอะ
เcc็c
เcิc์c
เcิtc
เcีtยะ?
เcืtอะ?
เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)
เctา?ะ?
cัtวะ
c[ัื]tc[ุิะ]?
c[ิุู]์
c[ะ-ู]t
c็
ct[ะาำ]?
แc็c
แcc์
แctะ
แcc็c
แccc์
โctะ
[เ-ไ]ct
""".replace('c','[ก-ฮ]').replace('t', '[่-๋]?').split()

khm_pat_tcc = """\
s[ក-ឯ]
s[ឰ-។]
s[០-៩]
s[[ក-ឯ]s
s[ឰ-។]s
s[៕-៚]s
s[៛-៞]s
s[០-៩]s
c[ក-ឯ]
c[ក-ឯ]c
""".replace('s', '[\u17b6 \u17b7 \u17b8 \u17b9 \u17ba \u17bb \u17bc \u17bd \u17be \u17bf \u17c0 \u17c1 \u17c2 \u17c3 \u17c4 \u17c5 \u17c6 \u17c7 \u17c8 ]').replace('c', '[ក-ឯ]').split()


def tcc(w):
    p = 0
    pat = re.compile("|".join(khm_pat_tcc))
    while p<len(w):
        m = pat.match(w[p:])
        if m:
            n = m.span()[1]
        else:
            n = 1
        yield w[p:p+n]
        p += n

def tcc_pos(text):
    p_set = set()
    p = 0
    for w in tcc(text):
        p += len(w)
        p_set.add(p)
    return p_set

In [48]:
def serialize(words_at, p, p2):
  # find path ทั้งหมด แบบ depth first
  if p in words_at:
    for w in words_at[p]:
      p_ = p + len(w)
      if p_== p2:
        yield [w]
      elif p_ < p2:
        for path in serialize(words_at, p_, p2):
          yield [w]+path

In [49]:
def onecut(text):
  words_at = defaultdict(list)  # main data structure
  allow_pos = tcc_pos(text)     # ตำแหน่งที่ตัด ต้องตรงกับ tcc

  q = [0]       # min-heap queue
  last_p = 0    # last position for yield
  while q[0] < len(text):
      p = heappop(q)

      for w in trie.prefixes(text[p:]):
          p_ = p + len(w)
          if p_ in allow_pos:  # เลือกที่สอดคล้อง tcc
            words_at[p].append(w)
            if p_ not in q:
              heappush(q, p_)

      # กรณี length 1 คือ ไม่กำกวมแล้ว ส่งผลลัพธ์ก่อนนี้คืนได้
      if len(q)==1:
          paths = serialize(words_at, last_p, q[0])
          for w in min(paths, key=len):
            yield w
          last_p = q[0]

      # กรณี length 0  คือ ไม่มีใน dict
      if len(q)==0:
          m = pat_eng.match(text[p:])
          if m: # อังกฤษ, เลข, ว่าง
              i = p + m.end()
          else: # skip น้อยที่สุด ที่เป็นไปได้
              for i in range(p+1, len(text)):
                  if i in allow_pos:   # ใช้ tcc ด้วย ทั้งจุดเริ่มและจบ
                      ww = [w for w in trie.prefixes(text[i:]) if (i+len(w) in allow_pos)]
                      m = pat_eng.match(text[i:])
                      if ww or m:
                          break
              else:
                  i = len(text)
          w = text[p:i]
          words_at[p].append(w)
          yield w
          last_p = i
          heappush(q, i)

# ช่วยให้ไม่ต้องพิมพ์ยาวๆ
def mmcut(text):
  return list(onecut(text))

In [50]:
mmcut('នែកនៃដៃឬជើងដែលតភ្ជាប់ពីប្រអប់ដៃទៅកំភួនដៃ')   # ทำงานได้ถูกต้อง

  pat = re.compile("|".join(khm_pat_tcc))


['នែក',
 'ន',
 'ៃដៃឬ',
 'ជើង',
 'ដែល',
 'តភ្ជាប់',
 'ព',
 'ីប្រអ',
 'ប',
 '់',
 'ដ',
 'ៃទៅកំភួន',
 'ដៃ']

In [51]:
mmcut('ជើង')

['ជើង']