### Naive Bayes Classifier
나이브 베이즈 분류기(Naive Bayes Classifier)는 베이즈 정리를 기반으로 하는 확률적 분류 알고리즘입니다. 

주어진 데이터를 특성들의 조건부 확률을 계산하여 각 클래스에 속할 확률을 추정하고, 추정된 확률을 기반으로 데이터를 분류합니다. 

"나이브"라는 이름은 특성들이 독립적이라는 가정을 하기 때문에 붙여진 것입니다.


### 감성분석 - 영어

In [1]:
from nltk.tokenize import word_tokenize
import nltk

In [2]:
train = [
    ('i like you', 'pos'),
    ('i hate you', 'neg'),
    ('you like me', 'neg'),
    ('i like him','pos')
]

In [4]:
all_words = set(
    word.lower() for sentence in train for word in word_tokenize(sentence[0])
)

all_words

{'hate', 'him', 'i', 'like', 'me', 'you'}

In [8]:
t = [({word : (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train]
t

[({'him': False,
   'you': True,
   'i': True,
   'hate': False,
   'like': True,
   'me': False},
  'pos'),
 ({'him': False,
   'you': True,
   'i': True,
   'hate': True,
   'like': False,
   'me': False},
  'neg'),
 ({'him': False,
   'you': True,
   'i': False,
   'hate': False,
   'like': True,
   'me': True},
  'neg'),
 ({'him': True,
   'you': False,
   'i': True,
   'hate': False,
   'like': True,
   'me': False},
  'pos')]

In [10]:
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()

Most Informative Features
                    hate = False             pos : neg    =      1.7 : 1.0
                     him = False             neg : pos    =      1.7 : 1.0
                       i = True              pos : neg    =      1.7 : 1.0
                    like = True              pos : neg    =      1.7 : 1.0
                      me = False             pos : neg    =      1.7 : 1.0
                     you = True              neg : pos    =      1.7 : 1.0


In [11]:
test_sentence = 'i like pet'
test_sentence_features = {
    word.lower() :( word in word_tokenize(test_sentence.lower())) for word in all_words
}

test_sentence_features

{'him': False,
 'you': False,
 'i': True,
 'hate': False,
 'like': True,
 'me': False}

In [12]:
classifier.classify(test_sentence_features)

'pos'

### 감성분석 - 한글 

In [13]:
from konlpy.tag import Okt

post_tagger = Okt()

In [35]:
train = [
    ("메리가 좋아",'pos'),
    ("고양이가 좋아",'pos'),
    ("머신러닝은 어려워",'neg'),
    ("메리는 귀염둥이 고양이야",'pos'),
    ("나는 공부 끝내고 메리랑 놀래",'pos'),

]

In [36]:
all_words = set(
    word for sentence in train for word in word_tokenize(sentence[0])
)

all_words

{'고양이가',
 '고양이야',
 '공부',
 '귀염둥이',
 '끝내고',
 '나는',
 '놀래',
 '머신러닝은',
 '메리가',
 '메리는',
 '메리랑',
 '어려워',
 '좋아'}

In [37]:
t = [({word : (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train]
t

[({'놀래': False,
   '메리가': True,
   '좋아': True,
   '어려워': False,
   '메리는': False,
   '고양이야': False,
   '메리랑': False,
   '머신러닝은': False,
   '공부': False,
   '고양이가': False,
   '나는': False,
   '끝내고': False,
   '귀염둥이': False},
  'pos'),
 ({'놀래': False,
   '메리가': False,
   '좋아': True,
   '어려워': False,
   '메리는': False,
   '고양이야': False,
   '메리랑': False,
   '머신러닝은': False,
   '공부': False,
   '고양이가': True,
   '나는': False,
   '끝내고': False,
   '귀염둥이': False},
  'pos'),
 ({'놀래': False,
   '메리가': False,
   '좋아': False,
   '어려워': True,
   '메리는': False,
   '고양이야': False,
   '메리랑': False,
   '머신러닝은': True,
   '공부': False,
   '고양이가': False,
   '나는': False,
   '끝내고': False,
   '귀염둥이': False},
  'neg'),
 ({'놀래': False,
   '메리가': False,
   '좋아': False,
   '어려워': False,
   '메리는': True,
   '고양이야': True,
   '메리랑': False,
   '머신러닝은': False,
   '공부': False,
   '고양이가': False,
   '나는': False,
   '끝내고': False,
   '귀염둥이': True},
  'pos'),
 ({'놀래': True,
   '메리가': False,
   '좋아': False,
   '어려워': False,
   '메리는': Fa

In [38]:
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()

Most Informative Features
                      좋아 = False             neg : pos    =      1.5 : 1.0
                    고양이가 = False             neg : pos    =      1.1 : 1.0
                    고양이야 = False             neg : pos    =      1.1 : 1.0
                      공부 = False             neg : pos    =      1.1 : 1.0
                    귀염둥이 = False             neg : pos    =      1.1 : 1.0
                     끝내고 = False             neg : pos    =      1.1 : 1.0
                      나는 = False             neg : pos    =      1.1 : 1.0
                      놀래 = False             neg : pos    =      1.1 : 1.0
                     메리가 = False             neg : pos    =      1.1 : 1.0
                     메리는 = False             neg : pos    =      1.1 : 1.0


In [39]:
test_sentence = '난 머신러닝 마치면 메리랑 놀거야'
test_sentence_features = {
    word.lower() :( word in word_tokenize(test_sentence.lower())) for word in all_words
}

test_sentence_features

{'놀래': False,
 '메리가': False,
 '좋아': False,
 '어려워': False,
 '메리는': False,
 '고양이야': False,
 '메리랑': True,
 '머신러닝은': False,
 '공부': False,
 '고양이가': False,
 '나는': False,
 '끝내고': False,
 '귀염둥이': False}

In [40]:
classifier.classify(test_sentence_features)

'pos'

In [41]:
# 형태소 나누기 
def tokenize(doc):
    return ['/'.join(t) for t in post_tagger.pos(doc, norm=True, stem=True)] #stem : 어간추출 여부 -> ran, runs,running => run으로 통일 

In [42]:
train_docs = [(tokenize(row[0]), row[1]) for row in train]
train_docs

[(['메리/Noun', '가/Josa', '좋다/Adjective'], 'pos'),
 (['고양이/Noun', '가/Josa', '좋다/Adjective'], 'pos'),
 (['머신/Noun', '러닝/Noun', '은/Josa', '어렵다/Adjective'], 'neg'),
 (['메리/Noun', '는/Josa', '귀염둥이/Noun', '고양이/Noun', '야/Josa'], 'pos'),
 (['나/Noun',
   '는/Josa',
   '공부/Noun',
   '끝내다/Verb',
   '메리/Noun',
   '랑/Josa',
   '놀래다/Adjective'],
  'pos')]

In [43]:
tokens = [t for d in train_docs for t in d[0]]
tokens

['메리/Noun',
 '가/Josa',
 '좋다/Adjective',
 '고양이/Noun',
 '가/Josa',
 '좋다/Adjective',
 '머신/Noun',
 '러닝/Noun',
 '은/Josa',
 '어렵다/Adjective',
 '메리/Noun',
 '는/Josa',
 '귀염둥이/Noun',
 '고양이/Noun',
 '야/Josa',
 '나/Noun',
 '는/Josa',
 '공부/Noun',
 '끝내다/Verb',
 '메리/Noun',
 '랑/Josa',
 '놀래다/Adjective']

In [44]:
def term_exists(doc):
    return {word: (word in set(doc)) for word in tokens}

In [45]:
train_xy = [(term_exists(d),c) for d, c in train_docs]
train_xy

[({'메리/Noun': True,
   '가/Josa': True,
   '좋다/Adjective': True,
   '고양이/Noun': False,
   '머신/Noun': False,
   '러닝/Noun': False,
   '은/Josa': False,
   '어렵다/Adjective': False,
   '는/Josa': False,
   '귀염둥이/Noun': False,
   '야/Josa': False,
   '나/Noun': False,
   '공부/Noun': False,
   '끝내다/Verb': False,
   '랑/Josa': False,
   '놀래다/Adjective': False},
  'pos'),
 ({'메리/Noun': False,
   '가/Josa': True,
   '좋다/Adjective': True,
   '고양이/Noun': True,
   '머신/Noun': False,
   '러닝/Noun': False,
   '은/Josa': False,
   '어렵다/Adjective': False,
   '는/Josa': False,
   '귀염둥이/Noun': False,
   '야/Josa': False,
   '나/Noun': False,
   '공부/Noun': False,
   '끝내다/Verb': False,
   '랑/Josa': False,
   '놀래다/Adjective': False},
  'pos'),
 ({'메리/Noun': False,
   '가/Josa': False,
   '좋다/Adjective': False,
   '고양이/Noun': False,
   '머신/Noun': True,
   '러닝/Noun': True,
   '은/Josa': True,
   '어렵다/Adjective': True,
   '는/Josa': False,
   '귀염둥이/Noun': False,
   '야/Josa': False,
   '나/Noun': False,
   '공부/Noun': False,
   '

In [46]:
classifier = nltk.NaiveBayesClassifier.train(train_xy)
classifier.show_most_informative_features()

Most Informative Features
                 메리/Noun = False             neg : pos    =      2.5 : 1.0
                  가/Josa = False             neg : pos    =      1.5 : 1.0
                고양이/Noun = False             neg : pos    =      1.5 : 1.0
                  는/Josa = False             neg : pos    =      1.5 : 1.0
            좋다/Adjective = False             neg : pos    =      1.5 : 1.0
                 공부/Noun = False             neg : pos    =      1.1 : 1.0
               귀염둥이/Noun = False             neg : pos    =      1.1 : 1.0
                끝내다/Verb = False             neg : pos    =      1.1 : 1.0
                  나/Noun = False             neg : pos    =      1.1 : 1.0
           놀래다/Adjective = False             neg : pos    =      1.1 : 1.0


In [47]:
test_sentence = [('난 머신러닝 마치면 메리랑 놀거야')]

test_docs = post_tagger.pos(test_sentence[0])
test_docs

[('난', 'Noun'),
 ('머신', 'Noun'),
 ('러닝', 'Noun'),
 ('마치', 'Noun'),
 ('면', 'Josa'),
 ('메리', 'Noun'),
 ('랑', 'Josa'),
 ('놀거야', 'Verb')]

In [48]:
test_sentence_features = {word: (word in tokens) for word in test_docs}
test_sentence_features

{('난', 'Noun'): False,
 ('머신', 'Noun'): False,
 ('러닝', 'Noun'): False,
 ('마치', 'Noun'): False,
 ('면', 'Josa'): False,
 ('메리', 'Noun'): False,
 ('랑', 'Josa'): False,
 ('놀거야', 'Verb'): False}

In [49]:
classifier.classify(test_sentence_features)

'pos'