## 2022년 11월 30일 (수) 실습 2

`인천대학교 경제학과 201900740 박혜인`

In [51]:
import math

class MyNaiveBayesClassifier:
    
    def __init__(self, k=0.5, use_morph=False):
        self.k = k
        self.word_probs = []
        self.use_morph = use_morph
        
        if self.use_morph:
            from konlpy.tag import Okt
            self.okt = Okt()
            
            
    def load_data(self, file_path):
        # file_path에 있는 데이터를 읽어옴
        
        docs = []
        labels = []
        count = 0
        
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                if count == 500: break
                line = line.strip()
                id, doc, label = line.split('\t')
                docs.append(doc)
                if label == '1': label = 'pos'
                elif label == '0': label = 'neg'
                labels.append(label)
                count += 1
            
        return docs[1:], labels[1:]
        
        
    def tokenize(self, sentence):
        if self.use_morph:
            pos_sent = []
            
            # raw_sent: 사과가 좋아
            sent = self.okt.pos(sentence, norm=True, stem=True)
            # sent: [('사과', 'Noun'), ('가', 'Josa'), ('좋다', 'Adjective')]
            
            for tup in sent:
                word, tag = tup[0], tup[1]      # tup: ('사과', 'Noun')
                word_tag = word + '/' + tag     # word_tag: '사과/Noun'
                pos_sent.append(word_tag)
            
            sentence = ' '.join(pos_sent)
        
        return sentence.split()

            
    def count_words(self, docs, labels):
        # 단어 사전(dictionary)를 만들고, 각 단어의 긍정/부정 문서 등장 횟수 세기
        
        count_dict = dict()
        for doc, label in zip(docs, labels):
            for word in self.tokenize(doc):
                if word not in count_dict:
                    count_dict[word] = {'pos': 0, 'neg': 0}
                count_dict[word][label] += 1
        
        print('num of words...', len(count_dict))
        return count_dict
        
        
    def word_prob(self, count_dict, pos_class_num, neg_class_num, k):
        # (단어, p(단어|긍정), p(단어|부정))의 튜플 형태로 만들어주어 리스트에 추가
        
        word_prob_list = []
        
        for key in count_dict:
            pos_word_num = count_dict[key]['pos']
            neg_word_num = count_dict[key]['neg']
            
            pos_class_prob = (pos_word_num + k) / (pos_class_num + 2*k)
            neg_class_prob = (neg_word_num + k) / (neg_class_num + 2*k)
            
            tup = (key, pos_class_prob, neg_class_prob)
            word_prob_list.append(tup)
            
        return word_prob_list
    
    
    def class_prob(self, word_prob_list, test_sentence, use_unseen=False):
        # p(긍정|문서), p(부정|문서) 계산
    
        test_words = self.tokenize(test_sentence)
        
        sent_log_pos_class_prob, sent_log_neg_class_prob = 0.0, 0.0
        
        for word, word_pos_class_prob, word_neg_class_prob in word_prob_list:
            if word in test_words:
                sent_log_pos_class_prob = sent_log_pos_class_prob + math.log(word_pos_class_prob)
                sent_log_neg_class_prob = sent_log_neg_class_prob + math.log(word_neg_class_prob)
            else:
                if use_unseen:
                    sent_log_pos_class_prob = sent_log_pos_class_prob + math.log(1.0 - word_pos_class_prob)
                    sent_log_neg_class_prob = sent_log_neg_class_prob + math.log(1.0 - word_neg_class_prob)
        
        sent_pos_class_prob = math.exp(sent_log_pos_class_prob)
        sent_neg_class_prob = math.exp(sent_log_neg_class_prob)
        
        pos_class_prob = sent_pos_class_prob / (sent_pos_class_prob + sent_neg_class_prob)
        neg_class_prob = sent_neg_class_prob / (sent_pos_class_prob + sent_neg_class_prob)
        
        return pos_class_prob, neg_class_prob
    
    
    def train(self, train_file_path):
        # load_data, count_words, word_prob 계산
        
        train_docs, train_labels = self.load_data(train_file_path)
        
        word_count_dict = self.count_words(train_docs, train_labels)
        
        pos_class_num = len([label for label in train_labels if label == 'pos'])
        neg_class_num = len([label for label in train_labels if label == 'neg'])
        
        self.word_probs = self.word_prob(word_count_dict, pos_class_num, neg_class_num, self.k)
        
        
    def classify(self, doc, use_unseen=False):
        # class_prob 계산
        
        pos_class_prob, neg_class_prob = self.class_prob(self.word_probs, doc, use_unseen)
        
        if pos_class_prob > neg_class_prob:
            print('pos', pos_class_prob)
        else:
            print('neg', neg_class_prob)
            
    # accuracy 구하는 메소드 추가
    def accuracy(self, file_path, use_unseen):
        
        # 파일 로드 (내용, 라벨)
        docs, labels = self.load_data(file_path)

        # 확률 구하기
        count = 0
        i = 0
        
        for doc in docs:
            pos, neg = self.class_prob(self.word_probs, doc, use_unseen)
            if pos > neg:
                if labels[i] == 'pos':
                    count += 1
            else:
                if labels[i] == 'neg':
                    count += 1
            i += 1
            
        prob = count / len(docs)
        return prob
                   

In [52]:
classifier = MyNaiveBayesClassifier(use_morph=False)
classifier.train('ratings_train.txt')

num of words... 3055


In [53]:
classifier.accuracy('ratings_test.txt', use_unseen=False)

0.625250501002004

In [54]:
classifier.accuracy('ratings_test.txt', use_unseen=True)

0.6212424849699398

In [55]:
classifier = MyNaiveBayesClassifier(use_morph=True)
classifier.train('ratings_train.txt')

num of words... 2344


In [56]:
classifier.accuracy('ratings_test.txt', use_unseen=False)

0.7354709418837675

In [57]:
classifier.accuracy('ratings_test.txt', use_unseen=True)

0.7154308617234469