## 2022년 11월 30일 (수) 실습 1

`인천대학교 경제학과 201900740 박혜인`

In [1]:
import urllib

urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x20d774e0880>)

In [2]:
from nltk.tokenize import word_tokenize
import nltk

from konlpy.tag import Okt
okt = Okt()

In [140]:
def load_data(filename):
    train = []
    with open(filename, 'r', encoding='utf-8') as f:
        count = 0
        for line in f.readlines():
            if count == 1000: break
            count += 1

            line = line.strip()
            id, document, label = line.split('\t')
            if label == '1': label = 'pos'
            else: label = 'neg'    
            train_tup = (document, label)
            train.append(train_tup)
    return train

In [4]:
def pos_tokenize(raw_sent):
    pos_sent = []
    
    sent = okt.pos(raw_sent, norm=True, stem=True)
    for tup in sent:
        word, tag = tup
        word_tag = word + '/' + tag
        pos_sent.append(word_tag)
       
    return ' '.join(pos_sent) 

In [5]:
def make_word_dict(train, use_morph=False):
    all_words = set()
    for tup in train:
        sent, label = tup
        if use_morph: sent = pos_tokenize(sent)
        word_list = word_tokenize(sent)
        for word in word_list:
            all_words.add(word)
    
    return all_words

In [6]:
def make_train_feats(train, all_words, use_morph=False):
    train_features = []
    for tup in train:
        sent, label = tup # sent:'I like you' label: 'pos'
        if use_morph: sent = pos_tokenize(sent)
        word_list = word_tokenize(sent) # ['I', like, you]
        tmp = {set_word: set_word in word_list for set_word in all_words}
        train_feature = (tmp, label)
        train_features.append(train_feature)
    
    return train_features

## Training

In [141]:
train = load_data('ratings_train.txt')[1:]

In [149]:
all_words = make_word_dict(train, use_morph=True)

In [150]:
train_features = make_train_feats(train, all_words, use_morph=True)

In [151]:
classifier = nltk.NaiveBayesClassifier.train(train_features)

In [152]:
classifier.show_most_informative_features(n=10)

Most Informative Features
                쓰레기/Noun = True              neg : pos    =     11.9 : 1.0
                 인생/Noun = True              pos : neg    =     10.0 : 1.0
                 최고/Noun = True              pos : neg    =      9.5 : 1.0
           괜찮다/Adjective = True              pos : neg    =      8.6 : 1.0
                       ; = True              neg : pos    =      8.3 : 1.0
          재미없다/Adjective = True              neg : pos    =      8.1 : 1.0
       ㅡㅡ/KoreanParticle = True              neg : pos    =      8.1 : 1.0
           아깝다/Adjective = True              neg : pos    =      7.6 : 1.0
          지루하다/Adjective = True              neg : pos    =      7.5 : 1.0
           재밌다/Adjective = True              pos : neg    =      7.5 : 1.0


## Test

In [12]:
test = load_data('ratings_test.txt')[1:]

In [13]:
test_features = make_train_feats(test, all_words, use_morph=False)

In [14]:
classifier.classify(test_features[0][0])

'pos'

---

## 실습 1

* count는 500

### (1) use_morph가 True인 경우

In [124]:
test = load_data('ratings_test.txt')

In [125]:
test_features = make_train_feats(test, all_words, use_morph=True)

In [126]:
nltk.classify.accuracy(classifier, test_features)

0.714

`use_morph가 True일 때, 분류기의 정확도는 0.714로, 71.4%가 나왔다`

### (2) use_morph가 False인 경우

In [134]:
test = load_data('ratings_test.txt')

In [135]:
test_features = make_train_feats(test, all_words, use_morph=False)

In [136]:
nltk.classify.accuracy(classifier, test_features)

0.642

`use_morph가 False일 때, 분류기의 정확도는 0.642로, 64.2%가 나왔다`

## 실습 2

### (1) use_morph가 True인 경우

In [153]:
test = load_data('ratings_test.txt')

In [154]:
test_features = make_train_feats(test, all_words, use_morph=True)

In [129]:
# count 값 500
nltk.classify.accuracy(classifier, test_features)

0.714

In [155]:
# count 값 1000
nltk.classify.accuracy(classifier, test_features)

0.774

### 결과

- count값이 500일 때, 정확도는 0.714
- count값이 1000일 때, 정확도는 0.774

### (2) use_morph가 False인 경우

In [146]:
test = load_data('ratings_test.txt')

In [147]:
test_features = make_train_feats(test, all_words, use_morph=False)

In [139]:
# count 값 500
nltk.classify.accuracy(classifier, test_features)

0.642

In [148]:
# count 값 1000
nltk.classify.accuracy(classifier, test_features)

0.666

### 결과

- count값이 500일 때, 정확도는 0.642
- count값이 1000일 때, 정확도는 0.666