In [17]:
import sys
from konlpy.tag import Okt
import math

In [18]:
import sys
from konlpy.tag import Okt
import math

class BayesianFilter:
    """Naive Bayes text classifier over morphemes produced by the Okt tagger.

    Training data is accumulated with :meth:`fit`; :meth:`predict` scores a
    text against every known category using log-probabilities with add-one
    smoothing and returns the highest-scoring category.

    Attributes:
        words: Set of every distinct token seen during training (vocabulary).
        word_dict: ``{category: {token: count}}`` token frequencies.
        category_dict: ``{category: number of texts fitted}``.
    """

    # Part-of-speech tags that are dropped during tokenization.
    EXCLUDED_TAGS = frozenset({'Josa', 'Eomi', 'Punctuation'})

    def __init__(self):
        self.words = set()
        self.word_dict = {}
        self.category_dict = {}
        # The tagger is created lazily on the first split() call and reused,
        # instead of being rebuilt for every text.
        self._tagger = None

    def split(self, text):
        """Tokenize ``text`` and return its tokens, minus the excluded tags.

        Tokens are normalized and stemmed by the tagger (``norm=True``,
        ``stem=True``); tokens tagged with one of ``EXCLUDED_TAGS`` are
        skipped.
        """
        # getattr keeps instances created without __init__ working.
        tagger = getattr(self, '_tagger', None)
        if tagger is None:
            tagger = self._tagger = Okt()
        return [
            token
            for token, tag in tagger.pos(text, norm=True, stem=True)
            if tag not in self.EXCLUDED_TAGS
        ]

    def inc_word(self, word, category):
        """Count one occurrence of ``word`` in ``category``."""
        counts = self.word_dict.setdefault(category, {})
        counts[word] = counts.get(word, 0) + 1
        self.words.add(word)

    def inc_category(self, category):
        """Count one more training text for ``category``."""
        self.category_dict[category] = self.category_dict.get(category, 0) + 1

    def fit(self, text, category):
        """Learn from one ``text`` that belongs to ``category``."""
        for word in self.split(text):
            self.inc_word(word, category)
        self.inc_category(category)

    def score(self, words, category):
        """Return the log-probability score of ``words`` under ``category``.

        The score is ``log P(category)`` plus the sum of
        ``log P(word | category)`` over ``words``. Logs are summed rather
        than multiplying the raw probabilities.
        """
        return math.log(self.category_prob(category)) + sum(
            math.log(self.word_prob(word, category)) for word in words
        )

    def predict(self, text):
        """Classify ``text``.

        Returns:
            A tuple ``(best_category, score_list)`` where ``score_list`` is a
            list of ``(category, score)`` pairs in category insertion order.
            ``best_category`` is ``None`` when nothing has been fitted yet;
            on a tie the category that was fitted first wins.
        """
        words = self.split(text)
        score_list = [
            (category, self.score(words, category))
            for category in self.category_dict
        ]
        best_category = None
        max_score = float('-inf')
        for category, score in score_list:
            if score > max_score:
                max_score = score
                best_category = category
        return best_category, score_list

    def get_word_count(self, word, category):
        """Return how often ``word`` was seen in ``category`` (0 if never).

        A category that has no recorded words (for example one fitted only
        with text that produced no tokens) yields 0 instead of raising
        ``KeyError``.
        """
        return self.word_dict.get(category, {}).get(word, 0)

    def category_prob(self, category):
        """Return the fraction of fitted texts that belong to ``category``."""
        return self.category_dict[category] / sum(self.category_dict.values())

    def word_prob(self, word, category):
        """Return the add-one smoothed probability of ``word`` in ``category``.

        Computed as ``(count(word, category) + 1) /
        (total word count in category + vocabulary size)``, so an unseen
        word never produces a zero probability.
        """
        n = self.get_word_count(word, category) + 1
        # .get() so a category without any recorded words contributes 0.
        d = sum(self.word_dict.get(category, {}).values()) + len(self.words)
        return n / d

In [14]:
# Create an empty classifier; it knows no words or categories until fit() is called.
bf = BayesianFilter()

In [15]:
# Training corpus as (message text, category label) pairs, fed to the
# classifier in the order listed.
training_samples = [
    ('파격 세일 - 오늘까지만 30% 할인', '광고'),
    ('쿠폰 선물 & 무료 배송', '광고'),
    ('봄과 함께 찾아온 따듯한 신제품 소식', '광고'),
    ('신세계 백화점 세일', '광고'),
    ('인기 제품 기간 한정 세일', '광고'),
    ('오늘 일정 확인', '중요'),
    ('프로젝트 진행 상황 보고', '중요'),
    ('계약 잘 부탁드립니다.', '중요'),
    ('회의 일정이 등록되었습니다.', '중요'),
    ('오늘 일정이 없습니다.', '중요'),
]
for sample_text, sample_label in training_samples:
    bf.fit(sample_text, sample_label)

In [20]:
# Classify an unseen message: predict() returns the best-scoring category
# together with the list of (category, score) pairs for every category.
pre, scorelist = bf.predict('재고 정리 할인, 무료 배송')
print('결과 : ', pre)
print(scorelist)

결과 :  광고
[('광고', -19.329379270837773), ('중요', -20.544606748320554)]


In [23]:
# Display the number of training texts recorded for each category.
bf.category_dict

{'광고': 5, '중요': 5}

In [24]:
# Display the per-category token counts accumulated during training.
bf.word_dict

{'광고': {'파격': 1,
  '세': 3,
  '일': 3,
  '오늘': 1,
  '30%': 1,
  '할인': 1,
  '쿠폰': 1,
  '선물': 1,
  '무료': 1,
  '배송': 1,
  '봄': 1,
  '함께': 1,
  '찾아오다': 1,
  '따다': 1,
  '한': 1,
  '신제품': 1,
  '소식': 1,
  '신세계': 1,
  '백화점': 1,
  '인기': 1,
  '제품': 1,
  '기간': 1,
  '한정': 1},
 '중요': {'오늘': 2,
  '일정': 3,
  '확인': 1,
  '프로젝트': 1,
  '진행': 1,
  '상황': 1,
  '보고': 1,
  '계약': 1,
  '자다': 1,
  '부탁드리다': 1,
  '회의': 1,
  '등록': 1,
  '되어다': 1,
  '없다': 1}}