In [1]:
import urllib.request

urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x29a852c5a00>)

In [2]:
from nltk.tokenize import word_tokenize
import nltk


In [3]:
from konlpy.tag import Okt
okt = Okt()

In [4]:
def pos_tokenize(raw_sent):
    pos_sent = []
    
    sent = okt.pos(raw_sent, norm=True, stem=True)
    
    for tup in sent:
        word, tag = tup[0], tup[1]
        word_tag = word + '/' + tag
        pos_sent.append(word_tag)
    return ' '.join(pos_sent)

In [5]:
def make_word_dict(train, use_morph=False):
    all_words = set()
    
    for tup in train:
        sent, label = tup[0], tup[1]
        if use_morph: sent = pos_tokenize(sent)
        words = word_tokenize(sent)
        for word in words:
            all_words.add(word)
            
    return all_words

In [6]:
def make_train_feats(train, all_words, use_morph=False):
    train_features = []
    
    for tup in train:
        sent, label = tup[0], tup[1]
        if use_morph: sent = pos_tokenize(sent)
        words = word_tokenize(sent)
        tmp = {set_word: (set_word in words) for set_word in all_words}
        sent_tup = (tmp, label)
        train_features.append(sent_tup)
        
    return train_features

In [7]:
def load_data(file_path):
    train = []
    count = 0
    
    with open(file_path,'r', encoding='utf-8') as f:
        for line in f.readlines():
            if count == 500: break
            line = line.strip()
            id, doc, label = line.split('\t')
            if label == '1' : label = 'pos'
            elif label == '0': label = 'neg'
            train.append((doc, label))
            count += 1
    return train

In [8]:
train = load_data('ratings_train.txt')
train[2]

('흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', 'pos')

In [13]:
all_words = make_word_dict(train, use_morph=False)

In [18]:
train_features = make_train_feats(train, all_words, use_morph=False)
print(train_features[:5])

[({'건지': False, '개연성도': False, "'-": False, '같다는': False, '일으킨': False, '왜그랬으까': False, '보게되었습니다': False, '잔잔하지만': False, '가르엔': False, '크리스토퍼왈츠와': False, '없어도': False, '오시길^^': False, '에릭': False, '코미디와': False, '아이스': False, '청춘': False, '영화수준': False, '상업': False, '5년전꺼다': False, '지네': False, '이기면': False, '동화다운': False, '놀아나는': False, '몸짓으로': False, '이해하는데': False, '괜찮다았던': False, '주인공이': False, '영활': False, '노력하는지를': False, '...': False, '이하늘은': False, '대놓고': False, '이해가': False, '잘봤습니다': False, '몬초가': False, '쌤': False, '들어간다던가': False, '있었다': False, '남을': False, '멋진': False, '보는데': False, '혀짧은': False, '9점': False, '남주인공하는게': False, '아픔을': False, '오늘': False, '하나도': False, '열려있다던지': False, '더대박이였다': False, '쌓고': False, '복수': False, '드라마속에선': False, '내가보기엔': False, '난다': False, '쩔게': False, '별루예요': False, '합시다': False, '지쳐있었는데': False, '극치를': False, '아니다.범죄': False, '모습만': False, '네이버': False, '였다': False, '감독님들': False, '영화뮤지컬영화': False, '봤음에도': False, '나머진다들답답하네요': False, '안나':

In [15]:
classifier = nltk.NaiveBayesClassifier.train(train_features)

In [20]:
classifier.show_most_informative_features(200)

Most Informative Features
                       ; = True              neg : pos    =      8.2 : 1.0
                       그 = True              pos : neg    =      4.7 : 1.0
                       수 = True              pos : neg    =      4.7 : 1.0
                      ㅡㅡ = True              neg : pos    =      4.6 : 1.0
                       ! = True              pos : neg    =      4.0 : 1.0
                    스토리도 = True              neg : pos    =      3.9 : 1.0
                      이거 = True              neg : pos    =      3.9 : 1.0
                     하나도 = True              neg : pos    =      3.9 : 1.0
                      없다 = True              neg : pos    =      3.6 : 1.0
                       ? = True              neg : pos    =      3.6 : 1.0
                     그리고 = True              neg : pos    =      3.2 : 1.0
                      이게 = True              neg : pos    =      3.2 : 1.0
                    .... = True              neg : pos    =      3.0 : 1.0

----------------------