In [1]:
import urllib.request

urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x22974044cd0>)

In [2]:
from nltk.tokenize import word_tokenize
import nltk


In [3]:
from konlpy.tag import Okt
okt = Okt()

In [4]:
def pos_tokenize(raw_sent):
    pos_sent = []
    
    sent = okt.pos(raw_sent, norm=True, stem=True)
    
    for tup in sent:
        word, tag = tup[0], tup[1]
        word_tag = word + '/' + tag
        pos_sent.append(word_tag)
    return ' '.join(pos_sent)

In [5]:
def make_word_dict(train, use_morph=False):
    all_words = set()
    
    for tup in train:
        sent, label = tup[0], tup[1]
        if use_morph: sent = pos_tokenize(sent)
        words = word_tokenize(sent)
        for word in words:
            all_words.add(word)
            
    return all_words

In [6]:
def make_train_feats(train, all_words, use_morph=False):
    train_features = []
    
    for tup in train:
        sent, label = tup[0], tup[1]
        if use_morph: sent = pos_tokenize(sent)
        words = word_tokenize(sent)
        tmp = {set_word: (set_word in words) for set_word in all_words}
        sent_tup = (tmp, label)
        train_features.append(sent_tup)
        
    return train_features

In [31]:
def load_data(file_path):
    train = []
    count = 0
    
    with open(file_path,'r', encoding='utf-8') as f:
        for line in f.readlines():
            if count == 1000: break
            line = line.strip()
            id, doc, label = line.split('\t')
            if label == '1' : label = 'pos'
            elif label == '0': label = 'neg'
            train.append((doc, label))
            count += 1
    return train

In [32]:
train = load_data('ratings_train.txt')

In [33]:
all_words = make_word_dict(train, use_morph=True)

In [34]:
train_features = make_train_feats(train, all_words, use_morph=True)

In [35]:
classifier = nltk.NaiveBayesClassifier.train(train_features)

----------------------

# count == 500

In [27]:
test = load_data('ratings_test.txt')

In [28]:
test_features = make_train_feats(test, all_words, use_morph=True)

In [29]:
test_result = nltk.classify.accuracy(classifier,test_features)

In [30]:
test_result

0.714

---
# count == 1000

In [36]:
test = load_data('ratings_test.txt')

In [37]:
test_features = make_train_feats(test, all_words, use_morph=True)

In [38]:
test_result = nltk.classify.accuracy(classifier,test_features)
test_result

0.774

아무래도 수집하는 데이터가 적으면 전체적인 부분에서 정확도가 떨어질 수 밖에 없다.
개수를 500으로 제한뒀을 때 보다 개수를 1000개로 제한뒀을 때 데이터의 개수가 많아져 1000일 때가 500일 때보다 정확도가 높은 것을 볼 수 있다.

데이터가 많아지면 많아질수록 정확도가 증가한다. 
