In [1]:
import sys
# Append the parent directory to the module search path before importing
# soynlp (presumably so the repository checkout one level up is used --
# confirm against where this notebook lives).
sys.path.append('../')
import soynlp

# Record the library version the outputs below were produced with.
print(soynlp.__version__)

0.0.41


In [2]:
from soynlp.pos.tagset import tagset
from pprint import pprint
# tagset maps English POS tag names to their Korean names.
pprint(tagset)

{'Adjective': '형용사',
 'Adverb': '부사',
 'Determiner': '관형사',
 'Exclamation': '감탄사',
 'Josa': '조사',
 'Noun': '명사',
 'Numeral': '수사',
 'Pronoun': '대명사',
 'Symbol': '기호',
 'Verb': '동사'}


In [3]:
from soynlp.pos import Dictionary
from soynlp.pos import LRTemplateMatcher
from soynlp.pos import LREvaluator
from soynlp.pos import SimpleTagger
from soynlp.pos import UnknowLRPostprocessor

# Toy lexicon: each POS tag name maps to the set of words carrying that tag.
# Note that '이' is listed under both 'Noun' and 'Josa'.
pos_dict = {
    'Adverb': {'너무', '매우'}, 
    'Noun': {'너무너무너무', '아이오아이', '아이', '노래', '오', '이', '고양'},
    'Josa': {'는', '의', '이다', '입니다', '이', '이는', '를', '라', '라는'},
    'Verb': {'하는', '하다', '하고'},
    'Adjective': {'예쁜', '예쁘다'},
    'Exclamation': {'우와'}    
}

# Wrap the lexicon in a soynlp Dictionary; the cells below query and edit it.
dictionary = Dictionary(pos_dict)

In [4]:
# Inspect the tag -> word-set mapping held by the dictionary.
dictionary.pos_dict

{'Adjective': {'예쁘다', '예쁜'},
 'Adverb': {'너무', '매우'},
 'Exclamation': {'우와'},
 'Josa': {'는', '라', '라는', '를', '의', '이', '이는', '이다', '입니다'},
 'Noun': {'고양', '너무너무너무', '노래', '아이', '아이오아이', '오', '이'},
 'Verb': {'하고', '하는', '하다'}}

In [5]:
# get_pos lists the tags a word is registered under; '이' appears in both the
# 'Noun' and 'Josa' word sets, so it comes back with two tags.
for word in ('아이오아이', '이'):
    print(dictionary.get_pos(word))

['Noun']
['Noun', 'Josa']


In [6]:
# Check one word against several tag names. The Korean name '명사' is not a
# tag in this dictionary, so the recorded result for it is False.
for tag in ('Noun', '명사', 'Josa'):
    print(dictionary.word_is_tag('아이오아이', tag))

True
False
False


In [7]:
# add_words is called first with a single word and then with a list of words;
# the dictionary contents are printed after each call.
additions = (
    ('# Add a word with a tag', '앙순이'),
    ('\n# Add words with a tag', ['워너원', '아이돌']),
)
for heading, new_words in additions:
    print(heading)
    dictionary.add_words('Noun', new_words)
    pprint(dictionary.pos_dict)

# Add a word with a tag
{'Adjective': {'예쁜', '예쁘다'},
 'Adverb': {'너무', '매우'},
 'Exclamation': {'우와'},
 'Josa': {'입니다', '는', '이', '라는', '라', '를', '의', '이다', '이는'},
 'Noun': {'아이', '고양', '이', '아이오아이', '너무너무너무', '오', '노래', '앙순이'},
 'Verb': {'하는', '하고', '하다'}}

# Add words with a tag
{'Adjective': {'예쁜', '예쁘다'},
 'Adverb': {'너무', '매우'},
 'Exclamation': {'우와'},
 'Josa': {'입니다', '는', '이', '라는', '라', '를', '의', '이다', '이는'},
 'Noun': {'아이', '아이돌', '고양', '이', '아이오아이', '너무너무너무', '워너원', '오', '노래', '앙순이'},
 'Verb': {'하는', '하고', '하다'}}


In [8]:
# Words cannot be added under an unregistered POS tag such as 'Name';
# this guards against typos in tag names.
# The rejection is what this cell demonstrates, so the ValueError is caught
# and printed instead of being left to abort a "Restart & Run All".
try:
    dictionary.add_words('Name', 'gzupark')
except ValueError as err:
    print('ValueError:', err)

ValueError: Check your tag or use add_words(tag, words, force=True)

In [9]:
# If the new POS tag is really needed, it can be added with force=True.
dictionary.add_words('Name', 'gzupark', force=True)
pprint(dictionary.pos_dict)

{'Adjective': {'예쁜', '예쁘다'},
 'Adverb': {'너무', '매우'},
 'Exclamation': {'우와'},
 'Josa': {'입니다', '는', '이', '라는', '라', '를', '의', '이다', '이는'},
 'Name': {'gzupark'},
 'Noun': {'아이', '아이돌', '고양', '이', '아이오아이', '너무너무너무', '워너원', '오', '노래', '앙순이'},
 'Verb': {'하는', '하고', '하다'}}


In [11]:
# force=True also accepts the Korean name '명사' as a tag. In the recorded
# output it is stored as its own entry, separate from 'Noun'.
dictionary.add_words('명사', '아이오아이', force=True)
pprint(dictionary.pos_dict)

{'Adjective': {'예쁜', '예쁘다'},
 'Adverb': {'너무', '매우'},
 'Exclamation': {'우와'},
 'Josa': {'입니다', '는', '이', '라는', '라', '를', '의', '이다', '이는'},
 'Name': {'gzupark'},
 'Noun': {'아이', '아이돌', '고양', '이', '아이오아이', '너무너무너무', '워너원', '오', '노래', '앙순이'},
 'Verb': {'하는', '하고', '하다'},
 '명사': {'아이오아이'}}


In [12]:
# Repeat the tag checks now that '명사' exists as a tag containing the word:
# the recorded result for '명사' is True this time.
for tag in ('Noun', '명사', 'Josa'):
    print(dictionary.word_is_tag('아이오아이', tag))

True
True
False


In [14]:
# '대통령' was never added to the dictionary, so the check reports False.
print(dictionary.word_is_tag('대통령', 'Noun'))

False


In [15]:
# Remove the given words from the 'Noun' tag; other nouns are kept.
dictionary.remove_words('Noun', {'앙순이', '워너원'} )
pprint(dictionary.pos_dict)

{'Adjective': {'예쁜', '예쁘다'},
 'Adverb': {'너무', '매우'},
 'Exclamation': {'우와'},
 'Josa': {'입니다', '는', '이', '라는', '라', '를', '의', '이다', '이는'},
 'Name': {'gzupark'},
 'Noun': {'아이', '아이돌', '고양', '이', '아이오아이', '너무너무너무', '오', '노래'},
 'Verb': {'하는', '하고', '하다'},
 '명사': {'아이오아이'}}


In [16]:
# Called with only a tag, remove_words drops the whole 'Noun' entry
# (it no longer appears in the printed dictionary).
dictionary.remove_words('Noun')
pprint(dictionary.pos_dict)

{'Adjective': {'예쁜', '예쁘다'},
 'Adverb': {'너무', '매우'},
 'Exclamation': {'우와'},
 'Josa': {'입니다', '는', '이', '라는', '라', '를', '의', '이다', '이는'},
 'Name': {'gzupark'},
 'Verb': {'하는', '하고', '하다'},
 '명사': {'아이오아이'}}


In [17]:
# Example sentence, written without spaces, that the tagger will segment.
sent = '너무너무너무는아이오아이의노래입니다!!'

# Rebuild the lexicon and dictionary from scratch: the previous dictionary
# instance had tags added and removed by the cells above.
pos_dict = {
    'Adverb': {'너무', '매우'}, 
    'Noun': {'너무너무너무', '아이오아이', '아이', '노래', '오', '이', '고양'},
    'Josa': {'는', '의', '이다', '입니다', '이', '이는', '를', '라', '라는'},
    'Verb': {'하는', '하다', '하고'},
    'Adjective': {'예쁜', '예쁘다'},
    'Exclamation': {'우와'}    
}

dictionary = Dictionary(pos_dict)
# Generate the candidate LR (left word + optional right word) matches for the
# sentence. NOTE(review): b/m/e in the output look like begin, L-R boundary
# and end character offsets -- confirm against the library.
generator = LRTemplateMatcher(dictionary)
pprint(generator.generate(sent))

[LR(l='너무너무너무', l_tag='Noun', r='는', r_tag='Josa', b=0, m=6, e=7),
 LR(l='너무', l_tag='Adverb', r='', r_tag=None, b=0, m=2, e=2),
 LR(l='너무너무너무', l_tag='Noun', r='', r_tag=None, b=0, m=6, e=6),
 LR(l='너무', l_tag='Adverb', r='', r_tag=None, b=2, m=4, e=4),
 LR(l='너무', l_tag='Adverb', r='', r_tag=None, b=4, m=6, e=6),
 LR(l='아이오아이', l_tag='Noun', r='의', r_tag='Josa', b=7, m=12, e=13),
 LR(l='아이오아이', l_tag='Noun', r='', r_tag=None, b=7, m=12, e=12),
 LR(l='노래', l_tag='Noun', r='입니다', r_tag='Josa', b=13, m=15, e=18),
 LR(l='노래', l_tag='Noun', r='', r_tag=None, b=13, m=15, e=15)]


In [18]:
# Look at a single candidate: generate() returns a list of LR records.
generator.generate(sent)[0]

LR(l='너무너무너무', l_tag='Noun', r='는', r_tag='Josa', b=0, m=6, e=7)

In [19]:
# Templates used by the matcher. NOTE(review): presumably each key is an
# L tag and the value lists the R tags allowed to follow it -- confirm.
generator.templates

{'Noun': ('Josa', 'Verb', 'Adjective')}

In [21]:
# Assemble the tagger from a candidate generator, an evaluator and a
# postprocessor. The class really is imported as `UnknowLRPostprocessor`
# (that spelling is the library's name, not a typo in this notebook).
evaluator = LREvaluator()
postprocessor = UnknowLRPostprocessor()

tagger = SimpleTagger(generator, evaluator, postprocessor)
# In the recorded output the trailing '!!', which matches no dictionary
# entry, is returned with tag None.
tagger.tag(sent)

[('너무너무너무', 'Noun'),
 ('는', 'Josa'),
 ('아이오아이', 'Noun'),
 ('의', 'Josa'),
 ('노래', 'Noun'),
 ('입니다', 'Josa'),
 ('!!', None)]

In [22]:
# Same generator and evaluator, but no postprocessor is supplied this time.
# In the recorded output the trailing '!!' is absent from the result.
plain_tagger = SimpleTagger(generator, evaluator)
plain_tagger.tag(sent)

[('너무너무너무', 'Noun'),
 ('는', 'Josa'),
 ('아이오아이', 'Noun'),
 ('의', 'Josa'),
 ('노래', 'Noun'),
 ('입니다', 'Josa')]

In [23]:
# With debug=True, tag() returns a pair: the tags and the debug information.
tags, debugs = tagger.tag(sent, debug=True)

In [24]:
# The (word, tag) pairs, same as the non-debug result above.
pprint(tags)

[('너무너무너무', 'Noun'),
 ('는', 'Josa'),
 ('아이오아이', 'Noun'),
 ('의', 'Josa'),
 ('노래', 'Noun'),
 ('입니다', 'Josa'),
 ('!!', None)]


In [25]:
# The debug information pairs every LR candidate with a numeric score.
pprint(debugs)

[[(LR(l='너무', l_tag='Adverb', r='', r_tag=None, b=0, m=2, e=2), 0.4),
  (LR(l='너무너무너무', l_tag='Noun', r='', r_tag=None, b=0, m=6, e=6),
   0.9000000000000001),
  (LR(l='너무너무너무', l_tag='Noun', r='는', r_tag='Josa', b=0, m=6, e=7),
   1.1500000000000001),
  (LR(l='너무', l_tag='Adverb', r='', r_tag=None, b=2, m=4, e=4), 0.4),
  (LR(l='너무', l_tag='Adverb', r='', r_tag=None, b=4, m=6, e=6), 0.4),
  (LR(l='아이오아이', l_tag='Noun', r='', r_tag=None, b=7, m=12, e=12), 0.8),
  (LR(l='아이오아이', l_tag='Noun', r='의', r_tag='Josa', b=7, m=12, e=13), 1.05),
  (LR(l='노래', l_tag='Noun', r='', r_tag=None, b=13, m=15, e=15), 0.5),
  (LR(l='노래', l_tag='Noun', r='입니다', r_tag='Josa', b=13, m=15, e=18), 0.95)]]


In [26]:
# Per-word preference values, grouped by tag. In the recorded output the
# scores of candidates for these two nouns rise by exactly these amounts
# compared with the run that used the default evaluator.
noun_preference = {'아이오아이': 10.0, '너무너무너무': 5}
preference = {'Noun': noun_preference}

# Re-create the evaluator and tagger so the preferences take effect.
evaluator = LREvaluator(preference=preference)
tagger = SimpleTagger(generator, evaluator, postprocessor)

tags, debugs = tagger.tag(sent, debug=True)
pprint(debugs)

[[(LR(l='너무', l_tag='Adverb', r='', r_tag=None, b=0, m=2, e=2), 0.4),
  (LR(l='너무너무너무', l_tag='Noun', r='', r_tag=None, b=0, m=6, e=6), 5.9),
  (LR(l='너무너무너무', l_tag='Noun', r='는', r_tag='Josa', b=0, m=6, e=7), 6.15),
  (LR(l='너무', l_tag='Adverb', r='', r_tag=None, b=2, m=4, e=4), 0.4),
  (LR(l='너무', l_tag='Adverb', r='', r_tag=None, b=4, m=6, e=6), 0.4),
  (LR(l='아이오아이', l_tag='Noun', r='', r_tag=None, b=7, m=12, e=12), 10.8),
  (LR(l='아이오아이', l_tag='Noun', r='의', r_tag='Josa', b=7, m=12, e=13), 11.05),
  (LR(l='노래', l_tag='Noun', r='', r_tag=None, b=13, m=15, e=15), 0.5),
  (LR(l='노래', l_tag='Noun', r='입니다', r_tag='Josa', b=13, m=15, e=18), 0.95)]]
