In [2]:
from nltk.tokenize import word_tokenize
import nltk


In [3]:
from konlpy.tag import Okt
# One shared tagger instance for the whole notebook; pos_tokenize() calls
# okt.pos() on it for every sentence.
okt = Okt()

In [4]:
def pos_tokenize(raw_sent):
    """Tag a sentence with the module-level ``okt`` tagger.

    Runs ``okt.pos(raw_sent, norm=True, stem=True)`` and renders each
    returned (word, tag) pair as the text ``word/tag``.

    Args:
        raw_sent: The sentence text to analyse.

    Returns:
        One string holding all ``word/tag`` tokens, in the order the tagger
        returned them, separated by single spaces.
    """
    tagged_pairs = okt.pos(raw_sent, norm=True, stem=True)
    return ' '.join(pair[0] + '/' + pair[1] for pair in tagged_pairs)

In [5]:
def load_data(file_path, skip_header=False):
    """Read a tab-separated ratings file into (document, label) pairs.

    Every non-blank line must hold exactly three tab-separated fields:
    id, document and label. The id is discarded. A label of '1' becomes
    'pos' and a label of '0' becomes 'neg'; any other label text is kept
    unchanged.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        skip_header: When True, the first non-blank line is dropped instead
            of being returned as a data row. Defaults to False, in which
            case a header line is returned like any other row.

    Returns:
        A list of (document, label) tuples in file order.

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            tab-separated fields.
    """
    label_names = {'1': 'pos', '0': 'neg'}
    rows = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the handle directly so the file is streamed line by line
        # rather than materialised in memory first.
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank / whitespace-only lines (e.g. a trailing newline at
                # the end of the file) carry no record.
                continue
            if skip_header:
                skip_header = False
                continue
            fields = line.split('\t')
            if len(fields) != 3:
                raise ValueError(
                    '%s: line %d has %d tab-separated fields, expected 3'
                    % (file_path, line_no, len(fields))
                )
            _, doc, label = fields
            rows.append((doc, label_names.get(label, label)))
    return rows

In [6]:
# Load the training reviews as (document, label) pairs. The file's header
# line is returned as the first pair, ('document', 'label').
# NOTE(review): the cell that downloads ratings_train.txt appears later in the
# notebook, so on a fresh run the file must already exist on disk.
train = load_data('ratings_train.txt')
print(train[:5])

[('document', 'label'), ('아 더빙.. 진짜 짜증나네요 목소리', 'neg'), ('흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 'pos'), ('너무재밓었다그래서보는것을추천한다', 'neg'), ('교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', 'neg')]


In [7]:
# Work with a small sample: drop the header pair and keep the first 9 reviews.
# The header is removed only while it is still present, so re-running this
# cell leaves `train` unchanged instead of discarding one more review each time.
if train and train[0] == ('document', 'label'):
    train = train[1:]
train = train[:9]
print(train[:5])

[('아 더빙.. 진짜 짜증나네요 목소리', 'neg'), ('흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 'pos'), ('너무재밓었다그래서보는것을추천한다', 'neg'), ('교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', 'neg'), ('사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', 'pos')]


In [8]:
import urllib.request

# Download the raw data file. The github.com/<repo>/blob/... address serves
# the HTML file-viewer page, not the tab-separated data, so the raw-content
# host is used instead.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt",
                  filename="ratings_train.txt")


('ratings_train.txt', <http.client.HTTPMessage at 0x15211c06ac0>)

In [9]:
# Vocabulary: every distinct token that appears in any training sentence
# after POS tagging and NLTK tokenisation.
all_words = {
    token
    for entry in train
    for token in word_tokenize(pos_tokenize(entry[0]))
}
all_words

{'!',
 '..',
 '...',
 '....',
 './Punctuation',
 '/Punctuation',
 '1/Number',
 '3/Number',
 '8/Number',
 'ㅋㅋㅋ/KoreanParticle',
 '가/Josa',
 '가볍다/Adjective',
 '가족/Noun',
 '감금/Noun',
 '걸음/Noun',
 '교도소/Noun',
 '구먼/Noun',
 '그/Determiner',
 '그것/Noun',
 '기/Modifier',
 '긴장감/Noun',
 '길용우/Noun',
 '나오다/Verb',
 '납치/Noun',
 '낫다/Verb',
 '너/Modifier',
 '너무나도/Adverb',
 '네/Noun',
 '년/Noun',
 '는/Josa',
 '늙다/Verb',
 '다/Adverb',
 '다그/Noun',
 '더빙/Noun',
 '던스트/Noun',
 '도/Josa',
 '돋보이다/Verb',
 '드라마/Noun',
 '떼다/Verb',
 '래서/Noun',
 '로/Josa',
 '마/Noun',
 '막/Noun',
 '만/Josa',
 '몇/Modifier',
 '몇/Noun',
 '모/Modifier',
 '목소리/Noun',
 '몬페/Noun',
 '못/VerbPrefix',
 '무재/Noun',
 '밓었/Noun',
 '반개/Noun',
 '반복/Noun',
 '발/Noun',
 '별/Modifier',
 '별/Noun',
 '보고/Noun',
 '보다/Verb',
 '보단/Josa',
 '보이다/Verb',
 '부터/Josa',
 '사람/Noun',
 '사이/Modifier',
 '살다/Verb',
 '살리다/Verb',
 '생인/Noun',
 '생활/Noun',
 '세/Noun',
 '솔직하다/Adjective',
 '스파이더맨/Noun',
 '아/Exclamation',
 '아깝다/Adjective',
 '안되다/Adjective',
 '않다/Verb',
 '액션/Noun',
 '없다/Adjective'

In [10]:
# A more concise rewrite of the code above (original note, translated).
# Builds one (features, label) pair per training sentence, where `features`
# maps every vocabulary word to whether the sentence contains it.

train_features = []

for entry in train:
    sent, label = entry[0], entry[1]
    # Use a set so each of the len(all_words) membership checks is O(1)
    # instead of scanning the token list every time.
    sent_words = set(word_tokenize(pos_tokenize(sent)))

    features = {set_word: (set_word in sent_words) for set_word in all_words}
    train_features.append((features, label))

for sent_features in train_features:
    print(sent_features)

({'밓었/Noun': False, '제대로/Noun': False, '목소리/Noun': True, '낫다/Verb': False, '...': False, '몇/Modifier': False, '모/Modifier': False, '원작/Noun': False, '사람/Noun': False, '액션/Noun': False, '다그/Noun': False, './Punctuation': False, '무재/Noun': False, '더빙/Noun': True, '을/Josa': False, '걸음/Noun': False, '막/Noun': False, '..': True, '부터/Josa': False, '가볍다/Adjective': False, '가/Josa': False, '안되다/Adjective': False, '못/VerbPrefix': False, '조정/Noun': False, '네/Noun': False, '발/Noun': False, '떼다/Verb': False, '학년/Noun': False, '긴장감/Noun': False, '사이/Modifier': False, '!': False, '오버/Noun': False, '8/Number': False, '보다/Verb': False, '몬페/Noun': False, '돋보이다/Verb': False, '재미/Noun': False, '몇/Noun': False, '감금/Noun': False, '짜증나다/Adjective': True, '이/Josa': False, '욕/Noun': False, '초딩/Noun': False, '교도소/Noun': False, '초등학교/Noun': False, '아깝다/Adjective': False, '....': False, '없다/Adjective': False, '가족/Noun': False, '는/Josa': False, '마/Noun': False, '만/Josa': False, '의/Josa': False, '줄/Noun': False, '

In [11]:
# Fit NLTK's Naive Bayes classifier on the (features, label) pairs.
classifier = nltk.NaiveBayesClassifier.train(train_features)
# Print the most informative features (at most 2000 of them).
classifier.show_most_informative_features(2000)

Most Informative Features
                 영화/Noun = True              pos : neg    =      4.1 : 1.0
                 연기/Noun = True              pos : neg    =      2.9 : 1.0
                 연기/Noun = False             neg : pos    =      2.1 : 1.0
                      .. = False             pos : neg    =      1.8 : 1.0
                     ... = True              pos : neg    =      1.8 : 1.0
           ./Punctuation = False             pos : neg    =      1.8 : 1.0
                  이/Josa = True              pos : neg    =      1.8 : 1.0
                 재미/Noun = True              pos : neg    =      1.8 : 1.0
                       ! = False             neg : pos    =      1.5 : 1.0
                    .... = False             neg : pos    =      1.5 : 1.0
                  가/Josa = False             neg : pos    =      1.5 : 1.0
           가볍다/Adjective = False             neg : pos    =      1.5 : 1.0
            그/Determiner = False             neg : pos    =      1.5 : 1.0

In [12]:
# Load the test reviews as (document, label) pairs.
# NOTE(review): `test_sent` is rebound to a single hand-written string in a
# later cell, so the data loaded here is not used by the cells that follow.
test_sent = load_data('ratings_test.txt')

In [13]:
# test_sent[1][0]

In [14]:
# POS-tag one hand-written review, then split the tagged text into tokens.
# After this cell `test_sent` holds the tagged "word/tag" string.
test_sent = pos_tokenize('재밌다')
test_words = word_tokenize(test_sent)



In [15]:
# Build the feature dict for the test sentence over the training vocabulary.
# `test_sent` was already POS-tagged and split into `test_words` in the
# previous cell; tagging it again would analyse the "word/tag" text itself
# instead of the review, and would change the result on every re-run.
words = test_words
test_feature = {set_word: (set_word in words) for set_word in all_words}

print(test_feature)

{'밓었/Noun': False, '제대로/Noun': False, '목소리/Noun': False, '낫다/Verb': False, '...': False, '몇/Modifier': False, '모/Modifier': False, '원작/Noun': False, '사람/Noun': False, '액션/Noun': False, '다그/Noun': False, './Punctuation': False, '무재/Noun': False, '더빙/Noun': False, '을/Josa': False, '걸음/Noun': False, '막/Noun': False, '..': False, '부터/Josa': False, '가볍다/Adjective': False, '가/Josa': False, '안되다/Adjective': False, '못/VerbPrefix': False, '조정/Noun': False, '네/Noun': False, '발/Noun': False, '떼다/Verb': False, '학년/Noun': False, '긴장감/Noun': False, '사이/Modifier': False, '!': False, '오버/Noun': False, '8/Number': False, '보다/Verb': False, '몬페/Noun': False, '돋보이다/Verb': False, '재미/Noun': False, '몇/Noun': False, '감금/Noun': False, '짜증나다/Adjective': False, '이/Josa': False, '욕/Noun': False, '초딩/Noun': False, '교도소/Noun': False, '초등학교/Noun': False, '아깝다/Adjective': False, '....': False, '없다/Adjective': False, '가족/Noun': False, '는/Josa': False, '마/Noun': False, '만/Josa': False, '의/Josa': False, '줄/Noun': False

In [16]:
# Predict the label for the test sentence from its feature dict.
classifier.classify(test_feature)

'neg'