/
korean.py
61 lines (53 loc) · 2.03 KB
/
korean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from __future__ import annotations
import re
try:
import jamo
from mecab import MeCab
KO_AVAILABLE = True
except ImportError:
KO_AVAILABLE = False
MeCab = None
jamo = None
class KoreanTokenizer:
    """Morpheme-level tokenizer for Korean text backed by MeCab.

    Calling an instance parses ``text`` with MeCab, glues bracket and
    connector symbols onto neighbouring tokens, drops the remaining
    punctuation morphemes, and returns the space-joined tokens together with
    the ``jamo.h2j`` decomposition of the same tokens.
    """

    # Patterns are compiled once here instead of on every morpheme.
    # A closing bracket directly followed by an opening one, e.g. ")(".
    _CLOSE_THEN_OPEN = re.compile(r"[]})>][<({[]")
    _STARTS_WITH_OPEN = re.compile(r"^[<({\[].*")
    _ENDS_WITH_CONNECTOR = re.compile(r".*[-_~]$")
    _ENDS_WITH_CLOSE = re.compile(r".*[>)}\]]$")
    _STARTS_WITH_CONNECTOR = re.compile(r"^[-_~].*")

    _CLOSING_BRACKETS = (">", ")", "}", "]")
    _OPENING_BRACKETS = frozenset({"<", "(", "{", "["})
    # NOTE(review): presumably the punctuation/symbol tags of the mecab-ko
    # tag set -- confirm against the tag table.
    _DROPPED_POS = frozenset({"SF", "SY", "SC"})

    def __init__(self, ignore_case: bool = True):
        """Create the tokenizer.

        Args:
            ignore_case: When true, both returned strings are lower-cased.

        Raises:
            ImportError: If the optional ``mecab``/``jamo`` imports failed.
        """
        # The module-level import guard sets MeCab to None when the optional
        # dependencies are missing; fail with an actionable message instead
        # of "'NoneType' object is not callable".
        if MeCab is None:
            raise ImportError("Please install Korean support via `pip install python-mecab-ko jamo`")
        self.ignore_case = ignore_case
        self.tokenizer = MeCab()

    def __call__(self, text: str) -> tuple[str, str]:
        """Tokenize ``text``.

        Args:
            text: Input string handed to ``MeCab.parse``.

        Returns:
            A ``(tokens, pronunciations)`` pair of space-joined strings; the
            second holds ``jamo.h2j`` of each token. Both are lower-cased
            when ``ignore_case`` is set.
        """
        tokens: list[str] = []
        pronunciations: list[str] = []

        for morph in self.tokenizer.parse(text):
            surface = morph.surface
            join = False

            boundary = self._CLOSE_THEN_OPEN.search(surface)
            if tokens and boundary:
                # Split ")(": the closing part belongs to the previous token,
                # the opening part starts a new one.
                split_at = boundary.start() + 1
                closing, surface = surface[:split_at], surface[split_at:]
                tokens[-1] += closing
                # Keep the pronunciation in step with the token text; the
                # closing part was previously dropped from this list only.
                pronunciations[-1] += jamo.h2j(closing)
            elif tokens and (
                self._STARTS_WITH_OPEN.match(tokens[-1])
                or self._ENDS_WITH_CONNECTOR.match(tokens[-1])
                or self._ENDS_WITH_CLOSE.match(surface)
                or self._STARTS_WITH_CONNECTOR.match(surface)
            ):
                join = True

            # A token that already ends with a closing bracket is complete;
            # never glue anything onto it.
            if tokens and tokens[-1].endswith(self._CLOSING_BRACKETS):
                join = False

            if join:
                tokens[-1] += surface
                pronunciations[-1] += jamo.h2j(surface)
                continue

            # Drop stand-alone punctuation, but keep a lone opening bracket
            # so that following morphemes can be joined onto it.
            if morph.pos in self._DROPPED_POS and surface not in self._OPENING_BRACKETS:
                continue

            tokens.append(surface)
            pronunciations.append(jamo.h2j(surface))

        joined_text = " ".join(tokens)
        joined_pron = " ".join(pronunciations)
        if self.ignore_case:
            joined_text = joined_text.lower()
            joined_pron = joined_pron.lower()
        return joined_text, joined_pron
def ko_spacy(ignore_case: bool = True):
    """Return a ``KoreanTokenizer`` configured with ``ignore_case``.

    Raises:
        ImportError: If ``KO_AVAILABLE`` is false, i.e. the optional Korean
            dependencies could not be imported.
    """
    if KO_AVAILABLE:
        return KoreanTokenizer(ignore_case)
    raise ImportError("Please install Korean support via `pip install python-mecab-ko jamo`")