In [1]:
import typing
import re

wordWild = r"[~～]"

def loadRawCSV(path : str):
  ret = []
  with open(path,encoding='utf-8',mode='r') as raw_file:
    for line in raw_file.readlines():
      ret.append(line.strip().split(","))
  return ret

def containWordWild(input : str):
  return re.search(wordWild,input) is not None
def locateWordLine(lines: typing.List[str]):
  ret = []
  for line in lines:
    dsc = line[2]
    if containWordWild(dsc):
      ret.append(line)
  return ret
  
def posWordInLine(line:str):
  return list(res.start() for res in re.finditer(wordWild,line))

# 先 define 啲 helper，然後寫一個提詞方法
def isCJK(char : str):
  utfcode = ord(char.encode("utf-8").decode())
  cpr = []
  # regular
  cpr.append((0x4E00,0x9FFF))
  # ExtA
  cpr.append((0x3400,0x4DBF))
  # ExtB
  cpr.append((0x20000,0x2A6DF))
  # ExtC
  cpr.append((0x2A700,0x2B73F))
  # ExtD
  cpr.append((0x2B740,0x2B81F))
  # ExtE
  cpr.append((0x2B820,0x2CEAF))
  # ExtF
  cpr.append((0x2CEB0,0x2EBEF))
  # ExtG
  cpr.append((0x30000,0x3134F))
  return any(cmp[0] <= utfcode <= cmp[1] for cmp in cpr)

# 寫切詞方法
def splitWord(line:str):
  def traverseUntilNotCJK(line:str,start_pos :int,direction : int, len_limit = 3):
    cur_pos = start_pos
    should_skip = lambda char : not containWordWild(char) and not isCJK(char)
    for i in (range(start_pos,len(line),1) if direction > 0 else range(start_pos,0,-1)):
      char = line[i]
      if(len_limit == 0):
        break
      len_limit -= 1  
      cur_pos = i
      if should_skip(char):
        cur_pos += 1
        break

    return cur_pos
  
  ret = []
  if(type(line) != str):
    return ret
  pos = posWordInLine(line)
  for eachPos in pos:
    left_bound = right_bound = eachPos
    left_bound = traverseUntilNotCJK(line,left_bound,-1)
    right_bound = traverseUntilNotCJK(line,right_bound,1)
    word = line[left_bound:right_bound]
    ret.append(word)
  return ret

试下 识別效果

In [None]:
print(isCJK("a"))
print(isCJK("（"))
print(isCJK("一"))
print(isCJK("|"))
print(isCJK("｜"))
print(isCJK("𡠀"))
print(isCJK("ㄦ"))

開始 試下 揾詞

In [2]:
raw_data = loadRawCSV("faanjyutExport.csv")
ele_with_words = locateWordLine(raw_data)
words = []
for ele in ele_with_words:
  line = ele[2]
  single_char = ele[0]
  pron = ele[1]
  words_in_line = splitWord(line)
  if(len(words_in_line) == 0):
    continue
  print(words_in_line)
  words.append(words_in_line)

['～粉）']
['面～｜', '面珠']
['缸～｜', '～缸｜', '缸～']
['～場｜', '大～場(']
['慳～）']
['思～）']
['百～）']
['重～', '重～']
['邊～？', '南～。']
['黑～～）', '黑～～）']
['腍～～｜', '腍～～｜', '溶～～｜', '溶～～｜', '腍～劣｜', '溶～膉。']
['～一']
['係有～。']
['～悠）']
['～悠）']
['～h']
['淡～～｜', '淡～～｜', '淡～劣(']
['～脷）', '真係～囉。']
['斬～蔗']
['正～）', '正～白']
['正～）', '正～白']
['正～）', '正～白']
['老～｜', '老～仔｜', '老～妹。']
['～～', '～～聲']
['你都～嘅｜', '知傻～～', '傻～～冇']
['冤～）']
['～身', '～轉', '拗～條']
['～擬）', '冇使～佢']
['指～）', '冇使～佢']
['～開']
['水～｜', '替死～；', '～佬｜', '～妹；', '麻～煩｜', '～咁', '咁～死', '窮～｜', '衰～。']
['～你']
['～～）', '～～）', '～～', '～～仔）']
['～周｜', '～㞗｜', '有冇～㗎？']
['賓～｜', '賓～仔｜', '有冇～㗎？']
['水就～起', '電池～成']
['力啲～先']
['一～鎖', '一～']
['～糟）']
['屙～屎；', '粥好～。']
['魚～）']
['起～｜', '蚊～｜', '起䊆～。']
['～亂）', '～～', '～～亂）']
['口～）', '口～～）', '口～～）']
['黑～～）', '黑～～）']
['～開。']
['～刺）']
['～糖', '～住']
['等我～下']
['條船～埋；', '～檔；', '～拖）']
['呢～我']
['～懵｜', '～一', '～一～，']
['日手～腳', '～腳～。']
['成～']
['～～）', '～～）', '脹～～）', '脹～～）']
['魚~）']
['～蓋）']
['槍打～嘹，']
['～']
['人都～齊｜', '～下', '～下～下(']
['人咁～，']
['咁～等']
['～鏡）']
['～開', '後便～矣']
['食～好