Some Python libraries for Korean text-processing and NLP:
* [Library listing](https://github.com/lqez/awesome-hangul#python)
* [hangul-utils](https://github.com/kaniblu/hangul-utils)
* [Korean morphology library](https://pythonhosted.org/korean/)
* [Jamo](https://github.com/JDongian/python-jamo), docs [here](https://python-jamo.readthedocs.io/en/latest/)
* [Python script for Hangul manipulations](https://raw.githubusercontent.com/sublee/hangulize/master/hangulize/hangul.py)
* [PyKoSpacing - spacing corrector](https://github.com/haven-jeon/PyKoSpacing)
* [Hanspell - spelling corrector](https://github.com/ssut/py-hanspell)


In [1]:
import json

In [4]:
# Load the KAIST 60K corpus. The file is read as JSON Lines: one JSON document
# per line, each parsed independently.
koGroundTruth = []
# Decode explicitly as UTF-8. Without `encoding`, open() uses the platform
# default (e.g. cp949 or cp1252 on Windows), which can mis-decode or reject
# Korean text.
with open('kaist.corpus-khaiii-v0.5.json', encoding='utf-8') as kcf:
    for line in kcf:
        data = json.loads(line)
        # Only 5-element records are kept; element 1 is collected as the
        # sentence (presumably the raw text -- confirm against the corpus schema).
        if len(data) == 5:
            koGroundTruth.append(data[1])
# Preview the first ten collected sentences as the cell output.
koGroundTruth[:10]

['끝.', '아!', '후퇴!', '번호!', '병가.', '전하.', '거총!', '빨리!', '꺼져!', '매체.']

In [7]:
# Try out the conversion helpers of the jamo package on a single syllable.
# NOTE(review): the recorded output of this cell is a SyntaxError, which the
# code as written here should not produce -- re-run the cell to refresh it.
from jamo import h2j, j2hcj, hangul_to_jamo, j2h

syllable = '친'
lead, vowel, tail = 'ᄎ', 'ᅵ', 'ᆫ'

# Each label spells out the call whose result is printed next to it.
print("h2j('친') => ", h2j(syllable))
print("list(hangul_to_jamo('친')) =>", list(hangul_to_jamo(syllable)))
print("j2hcj(h2j('친')) =>", j2hcj(h2j(syllable)))
print("j2h('ᄎ', 'ᅵ', 'ᆫ') =>", j2h(lead, vowel, tail))


SyntaxError: invalid syntax (215491863.py, line 1)

In [None]:
# Decompose one syllable with h2j; per the jamo docs the result is a str of jamo.
h2j('친')

In [None]:
# j2hcj applied directly to a composed syllable, without h2j first.
# Per the jamo docs, characters that are not jamo are left unchanged -- confirm
# by comparing with the j2hcj(h2j('친')) cell.
j2hcj('친')

In [None]:
# Decompose with h2j first, then map the resulting jamo to Hangul
# Compatibility Jamo with j2hcj.
j2hcj(h2j('친'))

In [None]:
# hangul_to_jamo is documented as returning a generator, so this cell is
# expected to display a generator object rather than the jamo themselves.
hangul_to_jamo('친')

In [None]:
# Wrap the result in list() so the individual jamo are shown.
list(hangul_to_jamo('친'))

In [1]:
from hanspell import spell_checker

# Run the checker on three variants of one sentence with spacing/particle
# errors. Only the value of the final expression is rendered as the cell
# output; the first two results are computed and then discarded.
for flawed_sentence in (
    '저 는 한국어을 쉽게 공부할수 있어요.',
    '저 는 한국어를쉽게 공부할수 있 어요.',
):
    spell_checker.check(flawed_sentence).as_dict()
spell_checker.check('저는한국어를쉽게공부할수있어요.').as_dict()

{'result': True,
 'original': '저는한국어를쉽게공부할수있어요.',
 'checked': '저는 한국어를 쉽게 공부할 수 있어요.',
 'errors': 1,
 'words': OrderedDict([('저는', 2),
              ('한국어를', 2),
              ('쉽게', 2),
              ('공부할', 2),
              ('수', 2),
              ('있어요.', 2)]),
 'time': 0.03572702407836914}

In [None]:
import hangul

In [None]:
# Collect corpus sentences in which fewer than a quarter of the letters are
# Hangul. "Letters" are characters accepted by hangul.ishangul plus any other
# character for which str.isalpha() is true; everything else (digits,
# punctuation, spaces) is ignored. Sentences with no letters are never flagged.
suspects = []
for sentence in koGroundTruth:
    hangulLetters = 0
    otherLetters = 0
    for ch in sentence:
        if hangul.ishangul(ch):
            hangulLetters += 1
        elif ch.isalpha():
            otherLetters += 1
    letterTotal = hangulLetters + otherLetters
    # Guard against division by zero for sentences without any letters.
    if letterTotal and hangulLetters / letterTotal < 0.25:
        suspects.append(sentence)



In [None]:
# Number of corpus sentences flagged as having a low share of Hangul letters.
len(suspects)

In [None]:
# Preview the first ten flagged sentences.
suspects[:10]

In [1]:
from pykospacing import Spacing

In [2]:
# Create the PyKoSpacing corrector; the instance is called like a function on
# a string and the corrected string is shown as the cell output.
spacing = Spacing()
# Input with no spaces at all.
# NOTE(review): the recorded output splits '구레나룻' into '구레나 룻', which
# looks like a model error rather than a usage error.
spacing('귀밑에서턱까지잇따라난수염을구레나룻이라고한다.')

'귀 밑에서 턱까지 잇따라 난 수염을 구레나 룻이라고 한다.'

In [3]:
# Input with both missing and spurious spaces. In the recorded output the
# missing spaces are inserted, but the spurious ones ('저 는', '있 어요') remain.
spacing("저 는 한국어를쉽게 공부할수 있 어요.")

'저 는 한국어를 쉽게 공부할 수 있 어요.'

In [6]:
# The same sentence with every space removed; the recorded output is the
# fully spaced sentence.
spacing("저는한국어를쉽게공부할수있어요.")

'저는 한국어를 쉽게 공부할 수 있어요.'