In [1]:
import math

class MyNaiveBayesClassifier:
    
    def __init__(self, k = 0.5, use_morph=False):
        self.k = k
        self.word_probs = []
        self.use_morph = use_morph
        
        if self.use_morph:
            from konlpy.tag import Okt
            self.okt = Okt()
            
    def load_data(self, file_path):
        docs = []
        labels = []
        count = 0
        with open(file_path,'r', encoding='utf-8') as f:
            for line in f.readlines():
                if count == 500: break
                line = line.strip()
                id, doc, label = line.split('\t')
                docs.append(doc)
                if label == '1' : label = 'pos'
                elif label == '0': label = 'neg'
                labels.append(label)
                count += 1
                
        return docs[1:], labels[1:]
    
    def tokenize(self, sentence):
        if self.use_morph:
            pos_sent = []
    
            sent = okt.pos(sentence, norm=True, stem=True)

            for tup in sent:
                word, tag = tup[0], tup[1]
                word_tag = word + '/' + tag
                pos_sent.append(word_tag)
            sentence = ' '.join(pos_sent)
            
        return sentence.split()
    
    def count_words(self, docs, labels):
        
        count_dict = dict()
        for doc, label in zip(docs, labels):
            for word in self.tokenize(doc):
                if word not in count_dict:
                    count_dict[word] = {'pos' : 0, 'neg' : 0}
                count_dict[word][label] += 1
        print('num of words...', len(count_dict))
        return count_dict
    
    def word_prob(self, count_dict, pos_class_num, neg_class_num, k):
        
        word_prob_list = []
        
        for key in count_dict:
            pos_word_num = count_dict[key]['pos']
            neg_word_num = count_dict[key]['neg']
            
            pos_class_prob = (pos_word_num + k) / (pos_class_num + 2*k)
            neg_class_prob = (neg_word_num + k) / (neg_class_num + 2*k)
            
            tup = (key, pos_class_prob, neg_class_prob)
            word_prob_list.append(tup)
            
        return word_prob_list
        
    def class_prob(self, word_prob_list, test_sentence, use_unseen=False):
        
        test_words = self.tokenize(test_sentence)
        
        sent_log_pos_class_prob, sent_log_neg_class_prob = 0.0, 0.0
        
        for word, word_pos_class_prob, word_neg_class_prob in word_prob_list:
            if word in test_words:
                sent_log_pos_class_prob = sent_log_pos_class_prob + math.log(word_pos_class_prob)
                sent_log_neg_class_prob = sent_log_neg_class_prob + math.log(word_neg_class_prob)
            else:
                if use_unseen:
                    sent_log_pos_class_prob = sent_log_pos_class_prob + math.log(1.0 - word_pos_class_prob)
                    sent_log_neg_class_prob = sent_log_neg_class_prob + math.log(1.0 - word_neg_class_prob)
        
        sent_pos_class_prob = math.exp(sent_log_pos_class_prob)
        sent_neg_class_prob = math.exp(sent_log_neg_class_prob)
        
        pos_class_prob = sent_pos_class_prob/(sent_pos_class_prob+sent_neg_class_prob)
        neg_class_prob = sent_neg_class_prob/(sent_pos_class_prob+sent_neg_class_prob)
        
        return pos_class_prob, neg_class_prob
        
    
    def train(self, train_file_path):
        
        train_docs, train_labels = self.load_data(train_file_path)
        
        word_count_dict = self.count_words(train_docs, train_labels)
        
        pos_class_num = len([label for label in train_labels if label == 'pos'])
        neg_class_num = len([label for label in train_labels if label == 'neg'])
        
        self.word_probs = self.word_prob(word_count_dict, pos_class_num, neg_class_num, self.k)
        
    def classify(self, doc, use_unseen=False):
        
        pos_class_prob, neg_class_prob = self.class_prob(self.word_probs, doc, use_unseen)
        
        if pos_class_prob > neg_class_prob:
            print('pos', pos_class_prob)
        else:
            print('neg', neg_class_prob)

In [2]:
classifier = MyNaiveBayesClassifier()
classifier.train('ratings_train.txt')

num of words... 3055


In [3]:
classifier.classify('꼭 보시길 최고의 영화', use_unseen=False)

pos 0.9392181142000583


In [11]:
classifier.classify('인생 영화입니다', use_unseen=False)

pos 0.9290554296429239


In [13]:
classifier.classify('보다가 중간에 나왔습니다.', use_unseen=False)                                        

neg 0.6562452919895833


In [10]:
use_unseen = True
classifier.classify('꼭 보시길 최고의 영화', use_unseen)
classifier.classify('인생 영화입니다', use_unseen)
classifier.classify('보다가 중간에 나왔습니다', use_unseen)                                        

pos 0.9705386585557745
pos 0.8506813724201091
pos 0.5135404361197469


과연 내 프로젝트에도 적용하면 보다 정확한결과가 나올까